Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| c04e75f1eb | |||
| 55d7b04a42 | |||
| b23e733a93 | |||
| 4c0cd6b705 | |||
| af7afc6112 | |||
| dc858ad164 | |||
| 2ffd44c694 | |||
| 4f5d683f4b | |||
| df4a0e3f9d | |||
| c3cfbea750 | |||
| a01d1d8f86 |
@@ -23,7 +23,6 @@ import dataclasses
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
@@ -45,9 +44,15 @@ REQUIRED_CONTEXTS_RAW = _env(
|
||||
"REQUIRED_CONTEXTS",
|
||||
default=(
|
||||
"CI / all-required (pull_request),"
|
||||
"sop-checklist / all-items-acked (pull_request)"
|
||||
"sop-checklist / all-items-acked (pull_request),"
|
||||
"E2E Chat / E2E Chat (pull_request)"
|
||||
),
|
||||
)
|
||||
# E2E Chat is not in branch protection's status_check_contexts, but Gitea's
|
||||
# merge gate evaluates the full combined status including it. Adding it here
|
||||
# prevents the queue from attempting a merge that will be 405'd by Gitea when
|
||||
# E2E Chat is failing (e.g. runner-stall Quirk #9 on a flaky test).
|
||||
# See: mc#420 / molecule-core runbooks/gitea-operational-quirks.md Quirk #9.
|
||||
# Required contexts for push (main/staging) runs. The push CI uses the same
|
||||
# aggregator names with " (push)" suffix. Checking these explicitly instead of
|
||||
# the combined state avoids false-pause when non-blocking jobs (e.g. Platform
|
||||
@@ -66,6 +71,11 @@ class ApiError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
class MergePermissionError(ApiError):
|
||||
"""Merge failed with a permanent permission error (403/404/405).
|
||||
The queue should skip this PR and move to the next one."""
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
class MergeDecision:
|
||||
ready: bool
|
||||
@@ -315,6 +325,31 @@ def post_comment(pr_number: int, body: str, *, dry_run: bool) -> None:
|
||||
api("POST", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/comments", body={"body": body})
|
||||
|
||||
|
||||
def add_hold_label(pr_number: int, *, dry_run: bool) -> None:
|
||||
"""Add HOLD_LABEL to a PR if not already present."""
|
||||
if not HOLD_LABEL:
|
||||
return
|
||||
# Check current labels first to avoid a no-op API call in dry-run.
|
||||
_, current = api("GET", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/labels")
|
||||
current_names = {
|
||||
l["name"] for l in (current if isinstance(current, list) else [])
|
||||
}
|
||||
if HOLD_LABEL in current_names:
|
||||
print(f"::notice::PR #{pr_number} already has hold label; skipping add")
|
||||
return
|
||||
print(f"::notice::PR #{pr_number} adding hold label `{HOLD_LABEL}`")
|
||||
if dry_run:
|
||||
return
|
||||
# Gitea accepts {"labels": ["label1", "label2"]} to append labels.
|
||||
new_labels = list(current_names) + [HOLD_LABEL]
|
||||
api(
|
||||
"PATCH",
|
||||
f"/repos/{OWNER}/{NAME}/issues/{pr_number}",
|
||||
body={"labels": new_labels},
|
||||
expect_json=False,
|
||||
)
|
||||
|
||||
|
||||
def update_pull(pr_number: int, *, dry_run: bool) -> None:
|
||||
print(f"::notice::updating PR #{pr_number} with base branch via style={UPDATE_STYLE}")
|
||||
if dry_run:
|
||||
@@ -327,43 +362,6 @@ def update_pull(pr_number: int, *, dry_run: bool) -> None:
|
||||
)
|
||||
|
||||
|
||||
def wait_for_ci(
|
||||
head_sha: str,
|
||||
contexts: list[str],
|
||||
*,
|
||||
max_wait_seconds: int = 300,
|
||||
poll_interval: int = 15,
|
||||
) -> bool:
|
||||
"""Poll CI statuses for head_sha until all required contexts are terminal.
|
||||
|
||||
Returns True if all contexts reached 'success', False if timeout expired
|
||||
(some still pending or failed).
|
||||
|
||||
Background: after a queue-triggered PR update, CI re-runs on the new head.
|
||||
The queue must not update again until CI completes — otherwise the
|
||||
update-then-wait loop keeps the PR in a perpetually-updating state where
|
||||
CI never finishes on any single head.
|
||||
"""
|
||||
deadline = time.time() + max_wait_seconds
|
||||
while time.time() < deadline:
|
||||
time.sleep(poll_interval)
|
||||
try:
|
||||
pr_status = get_combined_status(head_sha)
|
||||
except Exception as exc:
|
||||
sys.stderr.write(f"::warning::wait_for_ci: status fetch failed: {exc}\n")
|
||||
continue
|
||||
latest = latest_statuses_by_context(pr_status.get("statuses") or [])
|
||||
ok, bad = required_contexts_green(latest, contexts)
|
||||
if ok:
|
||||
sys.stderr.write(f"::notice::wait_for_ci: all contexts green after {int(time.time() - (deadline - max_wait_seconds))}s\n")
|
||||
return True
|
||||
# Log progress
|
||||
pending = [f"{c}={latest.get(c, {}).get('status', 'missing')}" for c in contexts if latest.get(c, {}).get('status') != 'success']
|
||||
sys.stderr.write(f"::notice::wait_for_ci: still waiting ({int(deadline - time.time())}s left): {', '.join(pending[:3])}\n")
|
||||
sys.stderr.write(f"::warning::wait_for_ci: timeout after {max_wait_seconds}s; proceeding with merge check\n")
|
||||
return False
|
||||
|
||||
|
||||
def merge_pull(pr_number: int, *, dry_run: bool) -> None:
|
||||
payload = {
|
||||
"Do": "merge",
|
||||
@@ -376,24 +374,16 @@ def merge_pull(pr_number: int, *, dry_run: bool) -> None:
|
||||
print(f"::notice::merging PR #{pr_number}")
|
||||
if dry_run:
|
||||
return
|
||||
# Gitea's merge endpoint returns HTTP 200 with an empty body on success.
|
||||
# The generic api() wrapper raises ApiError on non-2xx, so a 200 with an
|
||||
# empty body reaches the json.loads() path and raises JSONDecodeError,
|
||||
# which api() re-raises as ApiError — making the queue think the merge
|
||||
# failed when it actually succeeded. Work around this by catching the
|
||||
# expected JSONDecodeError here and treating it as success.
|
||||
try:
|
||||
api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
|
||||
except ApiError as exc:
|
||||
# Surface non-merge errors (5xx server errors, 403 forbidden, etc.)
|
||||
if "merge" in str(exc).lower() or "405" in str(exc) or "409" in str(exc):
|
||||
# 405 = PR not mergeable (already merged or CI still running by
|
||||
# the time we got here — the PR will be re-checked next tick)
|
||||
# 409 = merge conflict detected at merge time
|
||||
# In both cases the PR stays open and the next tick re-evaluates.
|
||||
sys.stderr.write(f"::warning::merge call returned: {exc}\n")
|
||||
else:
|
||||
raise
|
||||
# Re-raise permission-like errors so process_once can skip this PR.
|
||||
# 403 = no push access, 404 = repo/pr not found, 405 = not allowed.
|
||||
msg = str(exc)
|
||||
for code in ("403", "404", "405"):
|
||||
if code in msg:
|
||||
raise MergePermissionError(msg) from exc
|
||||
raise # re-raise other ApiErrors unchanged
|
||||
|
||||
|
||||
def process_once(*, dry_run: bool = False) -> int:
|
||||
@@ -445,32 +435,6 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}")
|
||||
if decision.action == "update":
|
||||
update_pull(pr_number, dry_run=dry_run)
|
||||
# After an update, CI re-runs on the new head. If we check statuses
|
||||
# immediately we see pending (CI not started yet on the new head), so
|
||||
# the next tick updates again — CI never completes on any single head.
|
||||
# Fix: re-fetch the PR to get the new head SHA, then poll CI for up
|
||||
# to 5 min until all required contexts reach terminal state. If CI
|
||||
# finishes in time, proceed to merge on the same tick.
|
||||
if not dry_run:
|
||||
updated_pr = get_pull(pr_number)
|
||||
new_head = updated_pr.get("head", {}).get("sha", "")
|
||||
if new_head and new_head != head_sha:
|
||||
sys.stderr.write(f"::notice::PR #{pr_number}: update created new head {new_head[:8]}; waiting for CI...\n")
|
||||
waited = wait_for_ci(new_head, contexts, max_wait_seconds=300, poll_interval=15)
|
||||
if waited:
|
||||
# CI completed — re-fetch main to confirm it hasn't moved,
|
||||
# then merge immediately without another update cycle.
|
||||
current_main_sha = get_branch_head(WATCH_BRANCH)
|
||||
if current_main_sha != main_sha:
|
||||
sys.stderr.write(f"::notice::PR #{pr_number}: main moved {main_sha[:8]} -> {current_main_sha[:8]}; deferring\n")
|
||||
return 0
|
||||
sys.stderr.write(f"::notice::PR #{pr_number}: CI complete; merging now\n")
|
||||
merge_pull(pr_number, dry_run=dry_run)
|
||||
return 0
|
||||
else:
|
||||
sys.stderr.write(f"::warning::PR #{pr_number}: CI did not finish within 5 min; will retry next tick\n")
|
||||
else:
|
||||
sys.stderr.write(f"::notice::PR #{pr_number}: update did not change head SHA; will retry\n")
|
||||
post_comment(
|
||||
pr_number,
|
||||
(
|
||||
@@ -481,13 +445,6 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
)
|
||||
return 0
|
||||
if decision.ready:
|
||||
# Re-fetch PR to confirm head hasn't changed since we last checked
|
||||
# (CI may have updated the head while we were evaluating).
|
||||
current_pr = get_pull(pr_number)
|
||||
current_head = current_pr.get("head", {}).get("sha", "")
|
||||
if current_head != head_sha:
|
||||
print(f"::notice::PR #{pr_number} head changed {head_sha[:8]} -> {current_head[:8]}; re-evaluating")
|
||||
return 0
|
||||
latest_main_sha = get_branch_head(WATCH_BRANCH)
|
||||
if latest_main_sha != main_sha:
|
||||
print(
|
||||
@@ -495,7 +452,45 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
"deferring to next tick"
|
||||
)
|
||||
return 0
|
||||
merge_pull(pr_number, dry_run=dry_run)
|
||||
try:
|
||||
merge_pull(pr_number, dry_run=dry_run)
|
||||
except MergePermissionError as exc:
|
||||
msg = str(exc)
|
||||
is_status_check_failure = "not all required status checks successful" in msg
|
||||
if is_status_check_failure:
|
||||
# Gitea's merge gate failed due to a status check that passed our
|
||||
# pre-flight but is failing at Gitea's side (e.g. runner-stall Quirk
|
||||
# #9, or a context not in REQUIRED_CONTEXTS). Auto-add hold so the
|
||||
# queue skips this PR and processes the next one. The hold can be
|
||||
# removed once CI is green again.
|
||||
add_hold_label(pr_number, dry_run=dry_run)
|
||||
post_comment(
|
||||
pr_number,
|
||||
(
|
||||
"merge-queue: merge blocked by Gitea's status-check gate "
|
||||
"(E2E Chat or other non-required context failing). "
|
||||
"Auto-held via `merge-queue-hold`. "
|
||||
"Remove the hold label to requeue once CI is green. "
|
||||
"If E2E Chat is stuck (runner stall / Quirk #9), CI will "
|
||||
"self-recover after ~90 min and the hold can then be removed."
|
||||
),
|
||||
dry_run=dry_run,
|
||||
)
|
||||
return 0
|
||||
else:
|
||||
# Genuine permission error — token lacks Can-merge.
|
||||
sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n")
|
||||
post_comment(
|
||||
pr_number,
|
||||
(
|
||||
"merge-queue: merge failed with HTTP 405 'User not allowed to merge PR'. "
|
||||
"No available token has Can-merge permission on this repo. "
|
||||
"Fix: grant Can-merge to a token, or add a maintain/admin collaborator. "
|
||||
"Skipping to next queued PR on next tick."
|
||||
),
|
||||
dry_run=dry_run,
|
||||
)
|
||||
return 0
|
||||
return 0
|
||||
return 0
|
||||
|
||||
|
||||
@@ -118,3 +118,13 @@ def test_merge_decision_updates_stale_pr_before_merge():
|
||||
|
||||
assert decision.ready is False
|
||||
assert decision.action == "update"
|
||||
|
||||
|
||||
def test_MergePermissionError_inherits_from_ApiError():
|
||||
assert issubclass(mq.MergePermissionError, mq.ApiError)
|
||||
|
||||
|
||||
def test_MergePermissionError_message_preserved():
|
||||
exc = mq.MergePermissionError("POST /merge -> HTTP 405: User not allowed")
|
||||
assert "405" in str(exc)
|
||||
assert "User not allowed" in str(exc)
|
||||
|
||||
@@ -57,7 +57,7 @@ permissions:
|
||||
# can produce duplicate comments before the title-search dedup wins.
|
||||
concurrency:
|
||||
group: ci-required-drift
|
||||
cancel-in-progress: false
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
drift:
|
||||
|
||||
@@ -32,12 +32,6 @@ on:
|
||||
# iterating all open PRs when PR_NUMBER is empty.
|
||||
workflow_dispatch:
|
||||
|
||||
# Cancel stale runs so the 8-runner pool stays available for PR jobs.
|
||||
# Per-SHA group ensures push and cron runs at different SHAs don't cancel each other.
|
||||
concurrency:
|
||||
group: gate-check-v3-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
# read: contents — for checkout (base ref, not PR head for security)
|
||||
# read: pull-requests — for reading PR info via API
|
||||
|
||||
@@ -22,7 +22,7 @@ permissions:
|
||||
|
||||
concurrency:
|
||||
group: gitea-merge-queue-${{ github.repository }}
|
||||
cancel-in-progress: false
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
queue:
|
||||
|
||||
@@ -56,9 +56,13 @@ permissions:
|
||||
# Workflow-scoped serialisation — two simultaneous runs would race on the
|
||||
# `[main-red] {SHA}` open/PATCH path. Idempotent by title, but parallel
|
||||
# POSTs can produce duplicates before the title search dedup wins.
|
||||
# NOTE: cancel-in-progress: true is safe here — the idempotent design means
|
||||
# a cancelled run produces identical output to a completed one. This also
|
||||
# prevents the Gitea scheduler freeze that occurs when a cron tick fires
|
||||
# while a previous run is still executing (Quirk #8).
|
||||
concurrency:
|
||||
group: main-red-watchdog
|
||||
cancel-in-progress: false
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
watchdog:
|
||||
|
||||
@@ -162,6 +162,7 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
python -m twine upload \
|
||||
--verbose \
|
||||
--repository pypi \
|
||||
--username __token__ \
|
||||
--password "$PYPI_TOKEN" \
|
||||
|
||||
@@ -44,12 +44,6 @@ on:
|
||||
- ".github/scripts/lint_secret_pattern_drift.py"
|
||||
- ".githooks/pre-commit"
|
||||
|
||||
# Cancel stale runs to keep the 8-runner pool available for PR jobs.
|
||||
# Per-SHA group ensures push and scheduled runs at different SHAs don't cancel each other.
|
||||
concurrency:
|
||||
group: secret-pattern-drift-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
||||
|
||||
|
||||
@@ -22,11 +22,6 @@ on:
|
||||
- cron: '17 4 * * 1' # Mondays at 04:17 UTC
|
||||
workflow_dispatch:
|
||||
|
||||
# Cancel stale runs to keep the 8-runner pool available for PR jobs.
|
||||
concurrency:
|
||||
group: weekly-platform-go-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
statuses: write
|
||||
|
||||
@@ -77,6 +77,31 @@ does not replace the queue. The queue still performs its own current-main
|
||||
check immediately before merge because branch protection alone cannot
|
||||
serialize two already-green PRs.
|
||||
|
||||
### Correct API field names (Gitea 1.22.6)
|
||||
|
||||
When setting branch protection via API, use these exact field names — several
|
||||
intuitively-correct names are silently ignored (see `gitea-operational-quirks.md`
|
||||
Quirk #7):
|
||||
|
||||
```json
|
||||
{
|
||||
"branch_name": "main",
|
||||
"enable_merge_whitelist": true,
|
||||
"merge_whitelist_usernames": ["devops-engineer", "hongming", "core-devops"],
|
||||
"enable_status_check": true,
|
||||
"status_check_contexts": ["CI / all-required"],
|
||||
"required_approvals": 1,
|
||||
"block_on_rejected_reviews": true
|
||||
}
|
||||
```
|
||||
|
||||
After any `POST /branch_protections`, immediately GET and verify the values
|
||||
persisted — the API returns 201 even when fields are silently dropped.
|
||||
|
||||
If the queue returns HTTP 405 ("User not allowed to merge"), the first
|
||||
diagnostic step is to `GET /branch_protections/main` and check whether
|
||||
`merge_whitelist_usernames` still contains `devops-engineer`.
|
||||
|
||||
## Failure Handling
|
||||
|
||||
If `main` is not green, the queue pauses and does not merge anything.
|
||||
|
||||
@@ -196,69 +196,134 @@ primary consumer of combined status and is affected.
|
||||
|
||||
---
|
||||
|
||||
## Quirk #7 — TBD
|
||||
|
||||
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
|
||||
## Quirk #7 — Gitea branch protection API silently ignores some field names
|
||||
|
||||
### Finding
|
||||
|
||||
*[What Gitea Actions does differently from GitHub Actions.]*
|
||||
The Gitea 1.22.6 `POST /repos/{org}/{repo}/branch_protections` API accepts a
|
||||
non-obvious set of field names. Several intuitively-correct names are silently
|
||||
ignored — the call returns 201 but the field is dropped:
|
||||
|
||||
| Intended field | Correct API name | Silently ignored aliases |
|
||||
|---|---|---|
|
||||
| Enable merge whitelist | `enable_merge_whitelist` | `user_can_merge`, `merge_whitelist_enabled` |
|
||||
| Users who can merge | `merge_whitelist_usernames` | `merge_whitelist_users`, `whitelisted_users` |
|
||||
| Enable status check | `enable_status_check` | `enable_status_checks`, `require_status_checks` |
|
||||
| Required status contexts | `status_check_contexts` | `required_status_checks.contexts` |
|
||||
| Block on rejected reviews | `block_on_rejected_reviews` | n/a — the intuitive name is the correct one |
|
||||
| Required approvals | `required_approvals` | `required_reviewers` |
|
||||
|
||||
The GET response after a POST shows the actual stored values. A naive
|
||||
GET → modify → POST cycle (without using the exact GET field names) will
|
||||
silently reset the merge whitelist on every call.
|
||||
|
||||
### Impact
|
||||
|
||||
*[Which workflows or operations are affected.]*
|
||||
- Branch protection merge whitelist resets to empty after any API mis-invocation
|
||||
- Queue AUTO_SYNC_TOKEN (`devops-engineer`) loses Can-merge permission → HTTP 405
|
||||
- All queued PRs blocked until whitelist is restored
|
||||
- Confirmed reset on Gitea server restart/upgrade (Gitea uses default values)
|
||||
|
||||
### Workaround
|
||||
|
||||
*[How to work around this quirk.]*
|
||||
1. Always GET the current protection first and use **exact** field names from the
|
||||
GET response when modifying
|
||||
2. After any `POST /branch_protections`, immediately GET and verify
|
||||
`enable_merge_whitelist: true` and `merge_whitelist_usernames` contains
|
||||
`["devops-engineer", "hongming", "core-devops"]`
|
||||
3. The queue bot should verify branch protection before each merge tick
|
||||
4. For queue to work: `enable_merge_whitelist: true` +
|
||||
`merge_whitelist_usernames: ["devops-engineer", "hongming", "core-devops"]` +
|
||||
`enable_status_check: true` + `status_check_contexts: ["CI / all-required"]`
|
||||
|
||||
### References
|
||||
|
||||
- internal#[N]: first observation
|
||||
- SEV-1 2026-05-17: 3x branch protection resets caused 405 on all queue merges
|
||||
- `feedback_gitea_branch_protection_api_field_names`
|
||||
|
||||
---
|
||||
|
||||
## Quirk #8 — TBD
|
||||
|
||||
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
|
||||
## Quirk #8 — Scheduled workflow with `cancel-in-progress: false` causes scheduler freeze
|
||||
|
||||
### Finding
|
||||
|
||||
*[What Gitea Actions does differently from GitHub Actions.]*
|
||||
When a `schedule:` workflow has `concurrency.cancel-in-progress: false`, and a
|
||||
new cron tick fires while the previous run is still executing, the Gitea Actions
|
||||
scheduler stops dispatching the workflow entirely. Pending entries accumulate
|
||||
indefinitely — the scheduler shows the workflow as "scheduled" but never dispatches.
|
||||
|
||||
This is dangerous for workflows with variable execution time (e.g., workflows that
|
||||
wait for downstream CI, or workflows that run on slow/degraded runners).
|
||||
|
||||
### Impact
|
||||
|
||||
*[Which workflows or operations are affected.]*
|
||||
- `gitea-merge-queue.yml` with `cancel-in-progress: false` froze on 2026-05-17
|
||||
starting ~16:44Z — pending runs accumulated, no new runs dispatched
|
||||
- Queue appeared stalled; all 22 queued PRs blocked
|
||||
- The `gitea-merge-queue` workflow itself becomes invisible to operators
|
||||
|
||||
### Workaround
|
||||
|
||||
*[How to work around this quirk.]*
|
||||
**Always set `cancel-in-progress: true` on `schedule:` workflows:**
|
||||
|
||||
```yaml
|
||||
concurrency:
|
||||
group: workflow-name
|
||||
cancel-in-progress: true # ← always true for schedule: workflows
|
||||
```
|
||||
|
||||
If the freeze has already occurred: the scheduler recovers automatically after the
|
||||
currently-running instance completes (Gitea dispatches the next queued tick).
|
||||
|
||||
### References
|
||||
|
||||
- internal#[N]: first observation
|
||||
- SEV-1 2026-05-17: queue frozen since 16:44Z; fixed by setting `cancel-in-progress: true`
|
||||
- PR #1358: `fix(scheduled-workflows): enable cancel-in-progress` (pending merge)
|
||||
|
||||
---
|
||||
|
||||
## Quirk #9 — TBD
|
||||
|
||||
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
|
||||
## Quirk #9 — Gitea Actions runner accepts runs but stalls (jobs never start)
|
||||
|
||||
### Finding
|
||||
|
||||
*[What Gitea Actions does differently from GitHub Actions.]*
|
||||
The Gitea Actions runner on host `5.78.80.188` can enter a degraded state where:
|
||||
1. It accepts new workflow runs (shows "in_progress" in the UI)
|
||||
2. It never starts any jobs — pending count grows indefinitely
|
||||
3. The runner shows as "online" and accepting runs
|
||||
4. After ~60–90 minutes, the runner self-recovers and all pending jobs start
|
||||
|
||||
This is distinct from a true runner crash (which would show as offline).
|
||||
|
||||
### Impact
|
||||
|
||||
*[Which workflows or operations are affected.]*
|
||||
- All CI jobs for all PRs stall — no status updates posted
|
||||
- Queue waits indefinitely for CI (which never posts success)
|
||||
- `sop-checklist` and other workflows time out on affected PRs
|
||||
- Looks like the runner is working (green in UI) but nothing executes
|
||||
|
||||
### How to diagnose
|
||||
|
||||
Add a debug step to a known-failing workflow:
|
||||
|
||||
```bash
|
||||
# In a stalled job:
|
||||
curl -s http://localhost:8088/debug/pprof/trace?seconds=5 | head
|
||||
# Check runner process CPU — if near 0% while jobs are pending, runner is stalled
|
||||
```
|
||||
|
||||
Check runner logs on the host (`/var/log/actrunner.log` or similar).
|
||||
|
||||
### Workaround
|
||||
|
||||
*[How to work around this quirk.]*
|
||||
No operator action is required while stalled — the runner self-recovers. Options:
|
||||
1. **Wait** — runner typically recovers within 90 minutes
|
||||
2. **Restart the runner service** — `systemctl restart act_runner` (requires host access)
|
||||
3. **Move to a second runner** — if registered, re-route dispatch
|
||||
|
||||
### References
|
||||
|
||||
- internal#[N]: first observation
|
||||
- SEV-1 2026-05-17: runner stalled; self-recovered ~21:33Z after ~90 min
|
||||
- `feedback_gitea_runner_stall_accepted_jobs_no_execution`
|
||||
|
||||
---
|
||||
|
||||
|
||||
Reference in New Issue
Block a user