Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f27097a5c8 | |||
| 4980982aea | |||
| 07ed95fd14 | |||
| 1c9255125e | |||
| 33e0f8e24b | |||
| f9214391fb | |||
| 2f51a6176d | |||
| fae62ac8c1 | |||
| 8c343e3ac4 | |||
| b915f1bc2d | |||
| df821c8258 | |||
| 0bc1381ffe | |||
| 7d011828e8 |
@@ -49,11 +49,11 @@ if [ "$MERGED" != "true" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
MERGE_SHA=$(echo "$PR" | jq -r '.merge_commit_sha // empty')
|
||||
MERGED_BY=$(echo "$PR" | jq -r '.merged_by.login // "unknown"')
|
||||
TITLE=$(echo "$PR" | jq -r '.title // ""')
|
||||
BASE_BRANCH=$(echo "$PR" | jq -r '.base.ref // "main"')
|
||||
HEAD_SHA=$(echo "$PR" | jq -r '.head.sha // empty')
|
||||
MERGE_SHA=$(echo "$PR" | jq -r '.merge_commit_sha // empty') || true
|
||||
MERGED_BY=$(echo "$PR" | jq -r '.merged_by.login // "unknown"') || true
|
||||
TITLE=$(echo "$PR" | jq -r '.title // ""') || true
|
||||
BASE_BRANCH=$(echo "$PR" | jq -r '.base.ref // "main"') || true
|
||||
HEAD_SHA=$(echo "$PR" | jq -r '.head.sha // empty') || true
|
||||
|
||||
if [ -z "$MERGE_SHA" ]; then
|
||||
echo "::warning::PR #${PR_NUMBER} merged=true but no merge_commit_sha — cannot evaluate force-merge."
|
||||
@@ -75,7 +75,7 @@ STATUS=$(curl -sS -H "$AUTH" \
|
||||
declare -A CHECK_STATE
|
||||
while IFS=$'\t' read -r ctx state; do
|
||||
[ -n "$ctx" ] && CHECK_STATE[$ctx]="$state"
|
||||
done < <(echo "$STATUS" | jq -r '.statuses // [] | .[] | "\(.context)\t\(.status)"')
|
||||
done < <(echo "$STATUS" | jq -r '.statuses // [] | .[] | "\(.context)\t\(.status)"') || true
|
||||
|
||||
# 4. For each required check, was it green at merge? YAML block scalars
|
||||
# (`|`) leave a trailing newline; skip blank/whitespace-only lines.
|
||||
@@ -97,7 +97,7 @@ fi
|
||||
|
||||
# 5. Emit structured audit event.
|
||||
NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||
FAILED_JSON=$(printf '%s\n' "${FAILED_CHECKS[@]}" | jq -R . | jq -s .)
|
||||
FAILED_JSON=$(printf '%s\n' "${FAILED_CHECKS[@]}" | jq -R . | jq -s .) || true
|
||||
|
||||
# Print as a single-line JSON so Vector's parse_json transform can pick
|
||||
# it up cleanly from docker_logs.
|
||||
|
||||
@@ -301,7 +301,19 @@ def expected_context(job_key: str, workflow_name: str = "ci") -> str:
|
||||
# Drift detection
|
||||
# --------------------------------------------------------------------------
|
||||
def detect_drift(branch: str) -> tuple[list[str], dict]:
|
||||
"""Returns (findings, debug). Empty findings == no drift."""
|
||||
"""Returns (findings, debug). Empty findings == no drift.
|
||||
|
||||
Raises:
|
||||
ApiError: propagated from the protection fetch only when the
|
||||
failure is likely a transient Gitea outage (5xx).
|
||||
403/404 from the protection endpoint is treated as
|
||||
"cannot determine drift for this branch" — a token-
|
||||
scope issue (missing repo-admin on DRIFT_BOT_TOKEN) or
|
||||
a repo with no protection set should not turn the
|
||||
hourly cron red. The workflow continues to the next
|
||||
branch; no [ci-drift] issue is filed for a branch
|
||||
whose protection cannot be read.
|
||||
"""
|
||||
findings: list[str] = []
|
||||
|
||||
ci_doc = load_yaml(CI_WORKFLOW_PATH)
|
||||
@@ -313,9 +325,50 @@ def detect_drift(branch: str) -> tuple[list[str], dict]:
|
||||
env_set = required_checks_env(audit_doc)
|
||||
|
||||
# Protection
|
||||
# api() raises ApiError on non-2xx; let it propagate so a transient
|
||||
# 500 fails the run loudly rather than producing a "no drift" lie.
|
||||
_, protection = api("GET", f"/repos/{OWNER}/{NAME}/branch_protections/{branch}")
|
||||
# api() raises ApiError on non-2xx. Transient 5xx should fail loud.
|
||||
# 403/404 means the token lacks repo-admin scope (Gitea 1.22.6's
|
||||
# branch_protections endpoint requires it — see DRIFT_BOT_TOKEN
|
||||
# provisioning trail in ci-required-drift.yml). Treat as
|
||||
# "cannot determine drift for this branch" — skip without turning
|
||||
# the workflow red. Surface a clear diagnostic so the operator
|
||||
# knows what to fix.
|
||||
contexts: set[str] = set()
|
||||
protection_path = f"/repos/{OWNER}/{NAME}/branch_protections/{branch}"
|
||||
try:
|
||||
_, protection = api("GET", protection_path)
|
||||
except ApiError as e:
|
||||
# Isolate the HTTP status from the error message.
|
||||
http_status: int | None = None
|
||||
msg = str(e)
|
||||
# ApiError message format: "{method} {path} → HTTP {status}: {body}"
|
||||
import re as _re
|
||||
|
||||
m = _re.search(r"HTTP (\d{3})", msg)
|
||||
if m:
|
||||
http_status = int(m.group(1))
|
||||
if http_status in (403, 404):
|
||||
# Token lacks scope OR branch has no protection. Cannot
|
||||
# determine drift — skip this branch. Do NOT exit non-zero;
|
||||
# the issue IS the alarm, not a red workflow.
|
||||
sys.stderr.write(
|
||||
f"::error::GET {protection_path} returned HTTP {http_status} — "
|
||||
f"DRIFT_BOT_TOKEN lacks repo-admin scope (Gitea 1.22.6 "
|
||||
f"requires it for this endpoint) OR branch has no protection "
|
||||
f"configured. Cannot determine drift for {branch}; "
|
||||
f"skipping. Fix: grant repo-admin to mc-drift-bot or "
|
||||
f"configure protection on {branch}.\n"
|
||||
)
|
||||
debug = {
|
||||
"branch": branch,
|
||||
"ci_jobs": sorted(jobs),
|
||||
"sentinel_needs": sorted(needs),
|
||||
"protection_contexts_skipped": True,
|
||||
"protection_http_status": http_status,
|
||||
"audit_env_checks": sorted(env_set),
|
||||
}
|
||||
return [], debug
|
||||
# 5xx — propagate (transient outage, fail loud per design).
|
||||
raise
|
||||
if not isinstance(protection, dict):
|
||||
sys.stderr.write(
|
||||
f"::error::protection response for {branch} not a JSON object\n"
|
||||
|
||||
@@ -96,16 +96,27 @@ API="https://${GITEA_HOST}/api/v1"
|
||||
AUTH="Authorization: token ${GITEA_TOKEN}"
|
||||
echo "::notice::tier-check start: repo=$OWNER/$NAME pr=$PR_NUMBER author=$PR_AUTHOR"
|
||||
|
||||
# Sanity: token resolves to a user
|
||||
WHOAMI=$(curl -sS -H "$AUTH" "${API}/user" | jq -r '.login // ""')
|
||||
# Sanity: token resolves to a user.
|
||||
# Use || true on the jq pipeline so that set -euo pipefail (line 45) does not
|
||||
# cause the script to exit prematurely when the token is empty/invalid — the
|
||||
# if check below handles that case gracefully. Without || true, a 401 from an
|
||||
# empty/invalid token causes jq to exit 1, triggering set -e and exiting the
|
||||
# entire script before SOP_FAIL_OPEN can be evaluated (the check is in the jq-
|
||||
# install block; if jq is already on PATH, that block is skipped entirely).
|
||||
WHOAMI=$(curl -sS -H "$AUTH" "${API}/user" | jq -r '.login // ""') || true
|
||||
if [ -z "$WHOAMI" ]; then
|
||||
echo "::error::GITEA_TOKEN cannot resolve a user via /api/v1/user — check the token scope and that the secret is wired correctly."
|
||||
if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
|
||||
echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
|
||||
exit 0
|
||||
fi
|
||||
exit 1
|
||||
fi
|
||||
echo "::notice::token resolves to user: $WHOAMI"
|
||||
|
||||
# 1. Read tier label
|
||||
LABELS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/issues/${PR_NUMBER}/labels" | jq -r '.[].name')
|
||||
# 1. Read tier label. || true ensures set -euo pipefail does not abort the
|
||||
# script if curl or jq fails (e.g. 401 from empty token).
|
||||
LABELS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/issues/${PR_NUMBER}/labels" | jq -r '.[].name') || true
|
||||
TIER=""
|
||||
for L in $LABELS; do
|
||||
case "$L" in
|
||||
@@ -176,17 +187,25 @@ fi
|
||||
# 4. Resolve all team names → IDs
|
||||
# /orgs/{org}/teams/{slug}/... endpoints don't exist on Gitea 1.22;
|
||||
# we use /teams/{id}.
|
||||
# set +e prevents set -e from aborting the script if curl fails (e.g. empty token).
|
||||
ORG_TEAMS_FILE=$(mktemp)
|
||||
trap 'rm -f "$ORG_TEAMS_FILE"' EXIT
|
||||
set +e
|
||||
HTTP_CODE=$(curl -sS -o "$ORG_TEAMS_FILE" -w '%{http_code}' -H "$AUTH" \
|
||||
"${API}/orgs/${OWNER}/teams")
|
||||
debug "teams-list HTTP=$HTTP_CODE size=$(wc -c <"$ORG_TEAMS_FILE")"
|
||||
_HTTP_EXIT=$?
|
||||
set -e
|
||||
debug "teams-list HTTP=$HTTP_CODE (curl exit=$_HTTP_EXIT) size=$(wc -c <"$ORG_TEAMS_FILE")"
|
||||
if [ "${SOP_DEBUG:-}" = "1" ]; then
|
||||
echo " [debug] teams-list body (first 300 chars):" >&2
|
||||
head -c 300 "$ORG_TEAMS_FILE" >&2; echo >&2
|
||||
fi
|
||||
if [ "$HTTP_CODE" != "200" ]; then
|
||||
echo "::error::GET /orgs/${OWNER}/teams returned HTTP $HTTP_CODE — token likely lacks read:org scope."
|
||||
if [ "$_HTTP_EXIT" -ne 0 ] || [ "$HTTP_CODE" != "200" ]; then
|
||||
echo "::error::GET /orgs/${OWNER}/teams failed (curl exit=$_HTTP_EXIT HTTP=$HTTP_CODE) — token may lack read:org scope or be invalid."
|
||||
if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
|
||||
echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
|
||||
exit 0
|
||||
fi
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -231,9 +250,22 @@ for _t in $_all_teams; do
|
||||
debug "team-id: $_t → $_id"
|
||||
done
|
||||
|
||||
# 5. Read approving reviewers
|
||||
# 5. Read approving reviewers. set +e disables set -e temporarily so that curl
|
||||
# failures (e.g. empty/invalid token → HTTP 401) do not abort the script before
|
||||
# SOP_FAIL_OPEN is evaluated. set -e is restored immediately after.
|
||||
set +e
|
||||
REVIEWS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}/reviews")
|
||||
APPROVERS=$(echo "$REVIEWS" | jq -r '[.[] | select(.state=="APPROVED") | .user.login] | unique | .[]')
|
||||
_REVIEWS_EXIT=$?
|
||||
set -e
|
||||
if [ $_REVIEWS_EXIT -ne 0 ] || [ -z "$REVIEWS" ]; then
|
||||
echo "::error::Failed to fetch reviews (curl exit=$_REVIEWS_EXIT) — token may be invalid or unreachable."
|
||||
if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
|
||||
echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
|
||||
exit 0
|
||||
fi
|
||||
exit 1
|
||||
fi
|
||||
APPROVERS=$(echo "$REVIEWS" | jq -r '[.[] | select(.state=="APPROVED") | .user.login] | unique | .[]') || true
|
||||
if [ -z "$APPROVERS" ]; then
|
||||
echo "::error::No approving reviews on this PR. Set SOP_DEBUG=1 and re-run for diagnostics."
|
||||
exit 1
|
||||
|
||||
@@ -19,13 +19,18 @@ What this script does, per `.gitea/workflows/status-reaper.yml` invocation:
|
||||
downstream — Gitea uses ` / ` as the workflow/job separator).
|
||||
Classify each by whether `on:` contains a `push:` trigger.
|
||||
|
||||
2. List the last N (=10) commits on WATCH_BRANCH via
|
||||
GET /repos/{o}/{r}/commits?sha={branch}&limit={N}. rev2 sweeps
|
||||
N commits per tick instead of HEAD only — schedule workflows
|
||||
post `failure` to whatever SHA was HEAD when they COMPLETED, so
|
||||
by the next */5 tick main has often moved forward and the red
|
||||
gets stranded on a stale commit (Phase 1+2 evidence: rev1 saw
|
||||
`compensated:0` every tick across ~6 cycles).
|
||||
2. List the last N (=30, rev3 — widened from 10) commits on
|
||||
WATCH_BRANCH via GET /repos/{o}/{r}/commits?sha={branch}&limit={N}.
|
||||
rev2 sweeps N commits per tick instead of HEAD only — schedule
|
||||
workflows post `failure` to whatever SHA was HEAD when they
|
||||
COMPLETED, so by the next */5 tick main has often moved forward
|
||||
and the red gets stranded on a stale commit. rev3 widens the
|
||||
window from 10 → 30 because schedule workflows post `failure`
|
||||
RETROACTIVELY (5-15 min after their merge); a 10-commit window
|
||||
is narrower than the merge-cadence during a burst, so reds land
|
||||
OUTSIDE the window before reaper sees them (Phase 1+2 evidence:
|
||||
rev2 run 17057 at 02:46Z saw 185/0 contexts on 10 SHAs; direct
|
||||
probe ~30min later showed ~25 fails on those same 10 SHAs).
|
||||
|
||||
3. For EACH SHA in the list:
|
||||
- GET combined commit status. Per-SHA error isolation
|
||||
@@ -502,7 +507,17 @@ def reap(
|
||||
# already stale enough that the schedule-run that posted them has long
|
||||
# since been overwritten by a real push trigger. See `reference_post_
|
||||
# suspension_pipeline` for the merge-cadence baseline.
|
||||
DEFAULT_SWEEP_LIMIT = 10
|
||||
#
|
||||
# rev3 (2026-05-12, hongming-pc2 GO 03:25Z): widened from 10 → 30.
|
||||
# rev2 (limit=10) shipped 01:48Z and ran 6/6 ticks post-merge with
|
||||
# `compensated:0` despite ~25 stranded reds visible on those same 10
|
||||
# SHAs ~30min later. Root cause: schedule workflows post `failure`
|
||||
# RETROACTIVELY 5-15 min after their merge, so by the time reaper's
|
||||
# next */5 tick lands, the stranded red is on a SHA that has already
|
||||
# fallen out of a 10-commit window during a burst-merge period.
|
||||
# Trades window-width-cheap for cadence-loady (per hongming-pc2):
|
||||
# kept `*/5` cron unchanged; only the window-N is widened.
|
||||
DEFAULT_SWEEP_LIMIT = 30
|
||||
|
||||
|
||||
def list_recent_commit_shas(branch: str, limit: int) -> list[str]:
|
||||
|
||||
@@ -23,11 +23,11 @@
|
||||
# `feedback_behavior_based_ast_gates` — NOT grep-by-name. That way
|
||||
# job renames or matrix-expansion-induced churn produce honest signal.
|
||||
#
|
||||
# IMPORTANT — TRANSITIONAL STATE: molecule-core's ci.yml does NOT yet
|
||||
# contain the `all-required` sentinel job (RFC §4 Phase 4 adds it).
|
||||
# Until Phase 4 lands the detector will hard-fail with exit 3 on the
|
||||
# missing sentinel. That's intentional: a red workflow on a 5-min cron
|
||||
# is louder than a silent issue and forces Phase 4 to land soon.
|
||||
# NOTE on protection endpoint scope: `GET /repos/.../branch_protections/{branch}`
|
||||
# requires repo-admin role in Gitea 1.22.6. If DRIFT_BOT_TOKEN lacks it,
|
||||
# the script skips that branch with a clear ::error:: diagnostic and exits 0
|
||||
# (the issue IS the alarm, not a red workflow). See provisioning trail in
|
||||
# the run step's GITEA_TOKEN env comment.
|
||||
|
||||
name: ci-required-drift
|
||||
|
||||
|
||||
@@ -37,13 +37,15 @@ name: main-red-watchdog
|
||||
# "unknown on type" when `workflow_dispatch.inputs.X` is present. Revisit
|
||||
# when Gitea ≥ 1.23 is fleet-wide.
|
||||
on:
|
||||
# SCHEDULE DISABLED 2026-05-12 — interim per RFC#420 Option-C machinery-down emergency
|
||||
# Watchdog timing out behind runner saturation; rev3+dedicated-runner-label in flight
|
||||
# Re-enable after rev3 lands + runner saturation root resolved
|
||||
# schedule:
|
||||
# # Hourly at :05 — task spec calls for "off-zero" (`5 * * * *`),
|
||||
# # offset from :17 (ci-required-drift) and :00 (peak cron load).
|
||||
# - cron: '5 * * * *'
|
||||
# SCHEDULE RE-ENABLED 2026-05-12 rev3 — interim disable (mc#645) reverted alongside
|
||||
# status-reaper rev3 (widen-window). Job-level timeout-minutes raised 5 → 15 below
|
||||
# to absorb runner-saturation latency without spurious cancels (the original cascade
|
||||
# cause). If runner-saturation root persists, the dedicated-runner-label split
|
||||
# remains the structural next step (tracked separately).
|
||||
schedule:
|
||||
# Hourly at :05 — task spec calls for "off-zero" (`5 * * * *`),
|
||||
# offset from :17 (ci-required-drift) and :00 (peak cron load).
|
||||
- cron: '5 * * * *'
|
||||
workflow_dispatch:
|
||||
|
||||
# Read commit status + branch ref + issues; write issues (open/PATCH/close).
|
||||
@@ -61,7 +63,12 @@ concurrency:
|
||||
jobs:
|
||||
watchdog:
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 5
|
||||
# rev3 (2026-05-12, mc#645 revert): raised 5 → 15 to absorb runner-saturation
|
||||
# latency. Original 5min cap was producing 124-style cancels under load,
|
||||
# which fed the very `[main-red]` issues this workflow files (self-poisoning).
|
||||
# 15min is still well below Gitea-default 6h job ceiling; if a real hang
|
||||
# occurs the issue-file path is still the alarm surface.
|
||||
timeout-minutes: 15
|
||||
steps:
|
||||
- name: Check out repo (script lives at .gitea/scripts/)
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
|
||||
@@ -53,16 +53,19 @@ name: status-reaper
|
||||
# `inputs:` block here. Gitea 1.22.6 rejects the whole workflow as
|
||||
# "unknown on type" when `workflow_dispatch.inputs.X` is present.
|
||||
on:
|
||||
# SCHEDULE DISABLED 2026-05-12 — interim per RFC#420 Option-C machinery-down emergency
|
||||
# Reaper rev2 not compensating + watchdog timeout-cascade; rev3 in flight
|
||||
# Re-enable after rev3 lands + runner saturation root resolved
|
||||
# schedule:
|
||||
# # Every 5 minutes. Off-zero alignment with sibling cron workflows:
|
||||
# # ci-required-drift (`:17`), main-red-watchdog (`:05`),
|
||||
# # railway-pin-audit (`:23`). 5-min cadence gives a tight enough
|
||||
# # close on schedule-triggered false-reds that main-red-watchdog
|
||||
# # (hourly :05) almost never files an issue on the false case.
|
||||
# - cron: '*/5 * * * *'
|
||||
# SCHEDULE RE-ENABLED 2026-05-12 rev3 — interim disable (mc#645) reverted now that
|
||||
# rev3 widens DEFAULT_SWEEP_LIMIT 10 → 30 (covers retroactive-failure timing window).
|
||||
# Sibling watchdog re-enabled in the same PR with timeout-minutes raised 5 → 15.
|
||||
schedule:
|
||||
# Every 5 minutes. Off-zero alignment with sibling cron workflows:
|
||||
# ci-required-drift (`:17`), main-red-watchdog (`:05`),
|
||||
# railway-pin-audit (`:23`). 5-min cadence gives a tight enough
|
||||
# close on schedule-triggered false-reds that main-red-watchdog
|
||||
# (hourly :05) almost never files an issue on the false case.
|
||||
# rev3 keeps `*/5` unchanged per hongming-pc2 03:25Z review:
|
||||
# "trades window-width-cheap for cadence-loady" — N=30 widens
|
||||
# the lookback cheaply without doubling runner load via `*/2`.
|
||||
- cron: '*/5 * * * *'
|
||||
workflow_dispatch:
|
||||
|
||||
# Compensating-status POST needs write on repo statuses; no other
|
||||
|
||||
@@ -53,9 +53,20 @@ jobs:
|
||||
- name: Build
|
||||
run: go build ./cmd/server
|
||||
|
||||
# `go vet` is NOT `|| true`-guarded: surfacing latent vet errors on main is
|
||||
# the whole point of this workflow (issue #567 — the motivating case was a
|
||||
# `go vet` error in org_external.go that sat undetected on main for weeks).
|
||||
# A vet error here fails the step → fails the job → shows red on the weekly
|
||||
# commit. Per Gitea quirk #10 (job-level continue-on-error is ignored), that
|
||||
# red surfaces on main — which is the intended signal, not a regression.
|
||||
- name: go vet
|
||||
run: go vet ./... || true
|
||||
run: go vet ./...
|
||||
|
||||
# golangci-lint stays `|| true`-guarded: lint is noisier (more false-
|
||||
# positives than vet) and golangci-lint may not be pre-installed on every
|
||||
# runner image — a `|| true` here keeps a missing-binary or lint-noise case
|
||||
# from masking the vet/test signal above. Tighten to match ci.yml's lint
|
||||
# gate if/when ci.yml's lint step becomes hard-failing.
|
||||
- name: golangci-lint
|
||||
run: golangci-lint run --timeout 3m ./... || true
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -4,11 +4,11 @@ Documents persistent operational findings about Gitea Actions runner behaviour
|
||||
that differ from GitHub Actions and require workarounds in workflow YAML or
|
||||
runbooks.
|
||||
|
||||
> Last updated: 2026-05-11 (core-devops-agent)
|
||||
> Last updated: 2026-05-12 (infra-runtime-be-agent)
|
||||
|
||||
---
|
||||
|
||||
## Large repo causes fetch timeout on Gitea Actions runner
|
||||
## Quirk #1 — Large repo causes fetch timeout on Gitea Actions runner
|
||||
|
||||
### Finding
|
||||
|
||||
@@ -68,7 +68,7 @@ confirming this is a repo-size constraint, not network isolation.
|
||||
|
||||
---
|
||||
|
||||
## `continue-on-error` only works at step level, not job level
|
||||
## Quirk #2 — `continue-on-error` only works at step level, not job level
|
||||
|
||||
### Finding
|
||||
|
||||
@@ -112,12 +112,12 @@ jobs:
|
||||
|
||||
### References
|
||||
|
||||
- Gitea Actions quirk #10 (from migration checklist)
|
||||
- Quirk #10 (this document): Gitea does NOT auto-populate `secrets.GITHUB_TOKEN`
|
||||
- PR #441: fix applied to `harness-replays.yml`
|
||||
|
||||
---
|
||||
|
||||
## `workflow_dispatch.inputs` not supported
|
||||
## Quirk #3 — `workflow_dispatch.inputs` not supported
|
||||
|
||||
Gitea 1.22.6 parser rejects `workflow_dispatch.inputs`. Drop from all workflow
|
||||
YAML files ported from GitHub Actions. Manual triggers should use
|
||||
@@ -127,21 +127,21 @@ YAML files ported from GitHub Actions. Manual triggers should use
|
||||
|
||||
---
|
||||
|
||||
## `merge_group` not supported
|
||||
## Quirk #4 — `merge_group` not supported
|
||||
|
||||
Gitea has no merge queue concept. Drop `merge_group:` triggers from all
|
||||
workflow YAML files.
|
||||
|
||||
---
|
||||
|
||||
## `environment:` blocks not supported
|
||||
## Quirk #5 — `environment:` blocks not supported
|
||||
|
||||
Gitea has no environments concept. Drop `environment:` from all workflow YAML
|
||||
files. Secrets and variables are repo-level.
|
||||
|
||||
---
|
||||
|
||||
## Gitea combined status reports `failure` when all contexts are `null`
|
||||
## Quirk #6 — Gitea combined status reports `failure` when all contexts are `null`
|
||||
|
||||
### Finding
|
||||
|
||||
@@ -189,3 +189,215 @@ primary consumer of combined status and is affected.
|
||||
|
||||
- Issue #481: first real-world case of this bug (2026-05-11)
|
||||
- `feedback_no_such_thing_as_flakes`: watchdog directive
|
||||
|
||||
---
|
||||
|
||||
## Quirk #7 — TBD
|
||||
|
||||
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
|
||||
|
||||
### Finding
|
||||
|
||||
*[What Gitea Actions does differently from GitHub Actions.]*
|
||||
|
||||
### Impact
|
||||
|
||||
*[Which workflows or operations are affected.]*
|
||||
|
||||
### Workaround
|
||||
|
||||
*[How to work around this quirk.]*
|
||||
|
||||
### References
|
||||
|
||||
- internal#[N]: first observation
|
||||
|
||||
---
|
||||
|
||||
## Quirk #8 — TBD
|
||||
|
||||
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
|
||||
|
||||
### Finding
|
||||
|
||||
*[What Gitea Actions does differently from GitHub Actions.]*
|
||||
|
||||
### Impact
|
||||
|
||||
*[Which workflows or operations are affected.]*
|
||||
|
||||
### Workaround
|
||||
|
||||
*[How to work around this quirk.]*
|
||||
|
||||
### References
|
||||
|
||||
- internal#[N]: first observation
|
||||
|
||||
---
|
||||
|
||||
## Quirk #9 — TBD
|
||||
|
||||
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
|
||||
|
||||
### Finding
|
||||
|
||||
*[What Gitea Actions does differently from GitHub Actions.]*
|
||||
|
||||
### Impact
|
||||
|
||||
*[Which workflows or operations are affected.]*
|
||||
|
||||
### Workaround
|
||||
|
||||
*[How to work around this quirk.]*
|
||||
|
||||
### References
|
||||
|
||||
- internal#[N]: first observation
|
||||
|
||||
---
|
||||
|
||||
## Quirk #10 — Gitea does NOT auto-populate `secrets.GITHUB_TOKEN`
|
||||
|
||||
### Finding
|
||||
|
||||
Gitea Actions (1.22.6) does **not** auto-populate `secrets.GITHUB_TOKEN`
|
||||
the way GitHub Actions does. A workflow that references `secrets.GITHUB_TOKEN`
|
||||
without explicitly provisioning a named secret gets an empty string — not a
|
||||
read-only token scoped to the repo.
|
||||
|
||||
### Impact
|
||||
|
||||
Workflows that call the Gitea REST API using `secrets.GITHUB_TOKEN` as auth
|
||||
receive **HTTP 401** on every API call. Affected workflows in molecule-core:
|
||||
|
||||
| Workflow | Symptom | Workaround |
|
||||
|---|---|---|
|
||||
| `gate-check-v3.yml` | Reports BLOCKED on every PR | Provision `SOP_TIER_CHECK_TOKEN`; update workflow to use it |
|
||||
| `qa-review.yml` | Fails immediately on PR open | Same — needs named secret |
|
||||
| `security-review.yml` | Fails immediately on PR open | Same — needs named secret |
|
||||
|
||||
### How to diagnose
|
||||
|
||||
Add a debug step to the failing workflow:
|
||||
|
||||
```yaml
|
||||
- name: Diagnose token
|
||||
run: |
|
||||
echo "Token present: ${{ secrets.GITHUB_TOKEN != '' }}"
|
||||
curl -sS --fail -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
|
||||
"$GITHUB_SERVER_URL/api/v1/user" | jq -r '.login'
|
||||
# Expected (GitHub): prints your username.
|
||||
# Actual (Gitea): HTTP 401 or empty string.
|
||||
```
|
||||
|
||||
### References
|
||||
|
||||
- internal#325: root-cause analysis and token provisioning
|
||||
- `feedback_gitea_no_auto_supplied_github_token`
|
||||
|
||||
---
|
||||
|
||||
## Quirk #11 — PR-create event dispatcher races — only 1 of N workflows fires on `pull_request opened`
|
||||
|
||||
### Finding
|
||||
|
||||
When a PR is created via the Gitea web UI or API, the Gitea Actions event
|
||||
dispatcher may fire **only 1 of N eligible workflows** on the initial
|
||||
`pull_request opened` event. All other eligible workflows are silently dropped.
|
||||
|
||||
This was observed on molecule-core PR #558 (created 2026-05-11T19:54:10Z):
|
||||
12+ workflows had no `paths:` filter and should have fired, but only
|
||||
`sop-tier-check.yml` dispatched.
|
||||
|
||||
Concurrent PRs created within the same minute received 12–30 dispatches each,
|
||||
confirming this is specific to the PR-create event dispatch, not a general
|
||||
runner capacity issue.
|
||||
|
||||
### Impact
|
||||
|
||||
- PRs may not run the full CI suite on first open.
|
||||
- `gate-check-v3`, `secret-scan`, `qa-review`, and `security-review` can be
|
||||
silently absent from the PR's status checks.
|
||||
- Branch protection may block merge even though CI is effectively green.
|
||||
|
||||
### How to diagnose
|
||||
|
||||
```bash
|
||||
# List workflow runs for the PR:
|
||||
gh run list --event pull_request --repo molecule-ai/molecule-core \
|
||||
| grep "$(gh pr view $PR --json number --jq '.number')"
|
||||
|
||||
# Expected: 12+ runs on PR open.
|
||||
# Actual (when race fires): only 1 run.
|
||||
```
|
||||
|
||||
### Workaround
|
||||
|
||||
Force a second dispatch by pushing a no-op synchronize commit:
|
||||
|
||||
```bash
|
||||
git commit --allow-empty -m "chore: trigger workflows [skip ci]"
|
||||
git push
|
||||
```
|
||||
|
||||
The synchronize event fires a second `pull_request` event, which reliably
|
||||
triggers all eligible workflows.
|
||||
|
||||
### References
|
||||
|
||||
- internal#329: first observation on PR #558
|
||||
- `feedback_gitea_pr_create_dispatcher_race`
|
||||
|
||||
---
|
||||
|
||||
## When you find a new quirk
|
||||
|
||||
Copy the template below, increment the quirk number, and fill in the finding,
|
||||
impact, workaround, and references. Place the new section in the **correct
|
||||
numerical position** (before the next higher-numbered quirk). Update this
|
||||
section's final paragraph to remove the next slot's number.
|
||||
|
||||
### Template
|
||||
|
||||
```markdown
|
||||
## Quirk #N — <short title>
|
||||
|
||||
### Finding
|
||||
|
||||
<What Gitea Actions does differently from GitHub Actions.>
|
||||
|
||||
### Impact
|
||||
|
||||
<Which workflows or operations are affected. Include an affected workflows
|
||||
table if more than one is affected.>
|
||||
|
||||
### How to diagnose
|
||||
|
||||
<Shell commands or API calls that confirm this is the quirk, not a real failure.>
|
||||
|
||||
### Workaround
|
||||
|
||||
<How to work around this quirk in workflow YAML or operations.>
|
||||
|
||||
### References
|
||||
|
||||
- internal#[N]: first observation
|
||||
- <Any Gitea issue, feedback label, or upstream bug tracker reference>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Open questions for Gitea 1.23
|
||||
|
||||
- [ ] **act_runner concurrent-job cap**: issue #305 — runner saturation under
|
||||
merge burst; needs `max_concurrent_jobs` cap configured on act_runner
|
||||
- [ ] **Infisical→Gitea secret-sync**: issue #307 — eliminate manual secret
|
||||
PUTs by wiring an Infisical cron to the Gitea API
|
||||
- [ ] **PR-create dispatcher race resolution**: internal #329 — is there a
|
||||
Gitea fix or config knob to disable the race? File upstream bug if not
|
||||
- [ ] **GITHUB_TOKEN auto-population**: internal #325 — is this on the
|
||||
Gitea 1.23 roadmap? If not, the workaround (named secret) is the permanent
|
||||
answer
|
||||
|
||||
|
||||
@@ -713,6 +713,92 @@ def test_reap_skips_combined_success_shas(sr_module, monkeypatch):
|
||||
assert posts[0][0] == f"/repos/owner/repo/statuses/{SHA_B}"
|
||||
|
||||
|
||||
def test_default_sweep_limit_is_30(sr_module):
|
||||
"""rev3 contract: `DEFAULT_SWEEP_LIMIT = 30` (widened from rev2's 10).
|
||||
|
||||
Root cause of the widening: schedule workflows post `failure`
|
||||
RETROACTIVELY 5-15 min after their merge. A 10-commit window is
|
||||
narrower than the merge-cadence during a burst, so reds land
|
||||
OUTSIDE the window before reaper's next tick sees them.
|
||||
|
||||
Evidence: rev2 run 17057 (02:46Z 2026-05-12) saw 185 contexts / 0
|
||||
fails on its 10 SHAs; direct probe ~30min later showed ~25 fails
|
||||
on those same 10 SHAs.
|
||||
|
||||
If this default is ever lowered back, that change MUST cite
|
||||
re-measured cadence data — a smaller window than the
|
||||
retroactive-failure-post lag re-introduces compensated:0.
|
||||
"""
|
||||
assert sr_module.DEFAULT_SWEEP_LIMIT == 30
|
||||
|
||||
|
||||
def test_reap_widened_window_catches_retroactive_failure(sr_module, monkeypatch):
|
||||
"""rev3 regression: with limit=30, a stranded red on a SHA at depth=20
|
||||
(which the rev2 limit=10 window would have missed) IS swept + compensated.
|
||||
|
||||
Why this matters: rev2 ran with limit=10 and saw `compensated:0` for
|
||||
6 consecutive ticks despite ~25 known-stranded reds across the last
|
||||
30 main commits. Widening to 30 must demonstrably catch a SHA past
|
||||
the old window. We mock 30 SHAs, plant the failure on SHA[20], and
|
||||
verify exactly one compensation lands on that SHA.
|
||||
"""
|
||||
shas = [f"{c:02x}" * 20 for c in range(30)] # 30 deterministic SHAs
|
||||
failing_sha = shas[20] # depth 20 — outside rev2's window=10, inside rev3's =30
|
||||
|
||||
posts: list[tuple[str, dict]] = []
|
||||
|
||||
def fake_api(method, path, *, body=None, query=None, expect_json=True):
|
||||
if method == "GET" and path.endswith("/commits"):
|
||||
# /commits listing — return all 30 fake commit objects
|
||||
assert query.get("limit") == "30", (
|
||||
f"expected limit=30 in query, got {query}"
|
||||
)
|
||||
return (200, [{"sha": s} for s in shas])
|
||||
if method == "GET" and "/commits/" in path and path.endswith("/status"):
|
||||
sha = path.split("/commits/")[1].split("/status")[0]
|
||||
if sha == failing_sha:
|
||||
return (
|
||||
200,
|
||||
{
|
||||
"state": "failure",
|
||||
"statuses": [
|
||||
{
|
||||
"context": "retroactive-drift / drift (push)",
|
||||
"state": "failure",
|
||||
"target_url": "https://example.test/run/9001",
|
||||
}
|
||||
],
|
||||
},
|
||||
)
|
||||
# All others combined=success (cost-opt short-circuit).
|
||||
return (200, {"state": "success", "statuses": []})
|
||||
if method == "POST":
|
||||
posts.append((path, body))
|
||||
return (201, {})
|
||||
raise AssertionError(f"unexpected api call: {method} {path}")
|
||||
|
||||
monkeypatch.setattr(sr_module, "api", fake_api)
|
||||
|
||||
workflow_map = {"retroactive-drift": False} # schedule-only → class-O
|
||||
counters = sr_module.reap_branch(
|
||||
workflow_map, "main", limit=sr_module.DEFAULT_SWEEP_LIMIT, dry_run=False
|
||||
)
|
||||
|
||||
# All 30 SHAs walked; exactly one compensated.
|
||||
assert counters["scanned_shas"] == 30
|
||||
assert counters["compensated"] == 1
|
||||
assert failing_sha in counters["compensated_per_sha"]
|
||||
assert counters["compensated_per_sha"][failing_sha] == [
|
||||
"retroactive-drift / drift (push)"
|
||||
]
|
||||
assert len(posts) == 1
|
||||
assert posts[0][0] == f"/repos/owner/repo/statuses/{failing_sha}"
|
||||
# Sanity: with rev2's window=10, depth=20 would NOT have been reached.
|
||||
# This assertion documents the rev3 widening as the structural fix:
|
||||
# the failing_sha index (20) is strictly greater than rev2's old limit (10).
|
||||
assert shas.index(failing_sha) >= 10
|
||||
|
||||
|
||||
def test_reap_continues_on_per_sha_apierror(sr_module, monkeypatch, capsys):
|
||||
"""rev2 refinement #7 (MOST CRITICAL): a transient ApiError or HTTP-5xx
|
||||
on get_combined_status(SHA_X) must NOT fail the whole tick. Log + skip
|
||||
|
||||
Reference in New Issue
Block a user