diff --git a/.gitea/scripts/sop-tier-check.sh b/.gitea/scripts/sop-tier-check.sh index 3ca882cd..afd13e48 100755 --- a/.gitea/scripts/sop-tier-check.sh +++ b/.gitea/scripts/sop-tier-check.sh @@ -96,16 +96,27 @@ API="https://${GITEA_HOST}/api/v1" AUTH="Authorization: token ${GITEA_TOKEN}" echo "::notice::tier-check start: repo=$OWNER/$NAME pr=$PR_NUMBER author=$PR_AUTHOR" -# Sanity: token resolves to a user -WHOAMI=$(curl -sS -H "$AUTH" "${API}/user" | jq -r '.login // ""') +# Sanity: token resolves to a user. +# Use || true on the jq pipeline so that set -euo pipefail (line 45) does not +# cause the script to exit prematurely when the token is empty/invalid — the +# if check below handles that case gracefully. Without || true, a 401 from an +# empty/invalid token causes jq to exit 1, triggering set -e and exiting the +# entire script before SOP_FAIL_OPEN can be evaluated (the check is in the jq- +# install block; if jq is already on PATH, that block is skipped entirely). +WHOAMI=$(curl -sS -H "$AUTH" "${API}/user" | jq -r '.login // ""') || true if [ -z "$WHOAMI" ]; then echo "::error::GITEA_TOKEN cannot resolve a user via /api/v1/user — check the token scope and that the secret is wired correctly." + if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then + echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block." + exit 0 + fi exit 1 fi echo "::notice::token resolves to user: $WHOAMI" -# 1. Read tier label -LABELS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/issues/${PR_NUMBER}/labels" | jq -r '.[].name') +# 1. Read tier label. || true ensures set -euo pipefail does not abort the +# script if curl or jq fails (e.g. 401 from empty token). +LABELS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/issues/${PR_NUMBER}/labels" | jq -r '.[].name') || true TIER="" for L in $LABELS; do case "$L" in @@ -176,17 +187,25 @@ fi # 4. Resolve all team names → IDs # /orgs/{org}/teams/{slug}/... endpoints don't exist on Gitea 1.22; # we use /teams/{id}. 
+# set +e prevents set -e from aborting the script if curl fails (e.g. empty token). ORG_TEAMS_FILE=$(mktemp) trap 'rm -f "$ORG_TEAMS_FILE"' EXIT +set +e HTTP_CODE=$(curl -sS -o "$ORG_TEAMS_FILE" -w '%{http_code}' -H "$AUTH" \ "${API}/orgs/${OWNER}/teams") -debug "teams-list HTTP=$HTTP_CODE size=$(wc -c <"$ORG_TEAMS_FILE")" +_HTTP_EXIT=$? +set -e +debug "teams-list HTTP=$HTTP_CODE (curl exit=$_HTTP_EXIT) size=$(wc -c <"$ORG_TEAMS_FILE")" if [ "${SOP_DEBUG:-}" = "1" ]; then echo " [debug] teams-list body (first 300 chars):" >&2 head -c 300 "$ORG_TEAMS_FILE" >&2; echo >&2 fi -if [ "$HTTP_CODE" != "200" ]; then - echo "::error::GET /orgs/${OWNER}/teams returned HTTP $HTTP_CODE — token likely lacks read:org scope." +if [ "$_HTTP_EXIT" -ne 0 ] || [ "$HTTP_CODE" != "200" ]; then + echo "::error::GET /orgs/${OWNER}/teams failed (curl exit=$_HTTP_EXIT HTTP=$HTTP_CODE) — token may lack read:org scope or be invalid." + if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then + echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block." + exit 0 + fi exit 1 fi @@ -231,9 +250,22 @@ for _t in $_all_teams; do debug "team-id: $_t → $_id" done -# 5. Read approving reviewers +# 5. Read approving reviewers. set +e disables set -e temporarily so that curl +# failures (e.g. empty/invalid token → HTTP 401) do not abort the script before +# SOP_FAIL_OPEN is evaluated. set -e is restored immediately after. +set +e REVIEWS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}/reviews") -APPROVERS=$(echo "$REVIEWS" | jq -r '[.[] | select(.state=="APPROVED") | .user.login] | unique | .[]') +_REVIEWS_EXIT=$? +set -e +if [ $_REVIEWS_EXIT -ne 0 ] || [ -z "$REVIEWS" ]; then + echo "::error::Failed to fetch reviews (curl exit=$_REVIEWS_EXIT) — token may be invalid or unreachable." + if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then + echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block." 
+ exit 0 + fi + exit 1 +fi +APPROVERS=$(echo "$REVIEWS" | jq -r '[.[] | select(.state=="APPROVED") | .user.login] | unique | .[]') || true if [ -z "$APPROVERS" ]; then echo "::error::No approving reviews on this PR. Set SOP_DEBUG=1 and re-run for diagnostics." exit 1 diff --git a/runbooks/gitea-operational-quirks.md b/runbooks/gitea-operational-quirks.md index 3bc1cd94..59fc94c3 100644 --- a/runbooks/gitea-operational-quirks.md +++ b/runbooks/gitea-operational-quirks.md @@ -4,11 +4,11 @@ Documents persistent operational findings about Gitea Actions runner behaviour that differ from GitHub Actions and require workarounds in workflow YAML or runbooks. -> Last updated: 2026-05-11 (core-devops-agent) +> Last updated: 2026-05-12 (infra-runtime-be-agent) --- -## Large repo causes fetch timeout on Gitea Actions runner +## Quirk #1 — Large repo causes fetch timeout on Gitea Actions runner ### Finding @@ -68,7 +68,7 @@ confirming this is a repo-size constraint, not network isolation. --- -## `continue-on-error` only works at step level, not job level +## Quirk #2 — `continue-on-error` only works at step level, not job level ### Finding @@ -112,12 +112,12 @@ jobs: ### References -- Gitea Actions quirk #10 (from migration checklist) +- Quirk #10 (this document): Gitea does NOT auto-populate `secrets.GITHUB_TOKEN` - PR #441: fix applied to `harness-replays.yml` --- -## `workflow_dispatch.inputs` not supported +## Quirk #3 — `workflow_dispatch.inputs` not supported Gitea 1.22.6 parser rejects `workflow_dispatch.inputs`. Drop from all workflow YAML files ported from GitHub Actions. Manual triggers should use @@ -127,21 +127,21 @@ YAML files ported from GitHub Actions. Manual triggers should use --- -## `merge_group` not supported +## Quirk #4 — `merge_group` not supported Gitea has no merge queue concept. Drop `merge_group:` triggers from all workflow YAML files. 
--- -## `environment:` blocks not supported +## Quirk #5 — `environment:` blocks not supported Gitea has no environments concept. Drop `environment:` from all workflow YAML files. Secrets and variables are repo-level. --- -## Gitea combined status reports `failure` when all contexts are `null` +## Quirk #6 — Gitea combined status reports `failure` when all contexts are `null` ### Finding @@ -189,3 +189,215 @@ primary consumer of combined status and is affected. - Issue #481: first real-world case of this bug (2026-05-11) - `feedback_no_such_thing_as_flakes`: watchdog directive + +--- + +## Quirk #7 — TBD + +*[Placeholder — document here when a new Gitea Actions quirk is discovered.]* + +### Finding + +*[What Gitea Actions does differently from GitHub Actions.]* + +### Impact + +*[Which workflows or operations are affected.]* + +### Workaround + +*[How to work around this quirk.]* + +### References + +- internal#[N]: first observation + +--- + +## Quirk #8 — TBD + +*[Placeholder — document here when a new Gitea Actions quirk is discovered.]* + +### Finding + +*[What Gitea Actions does differently from GitHub Actions.]* + +### Impact + +*[Which workflows or operations are affected.]* + +### Workaround + +*[How to work around this quirk.]* + +### References + +- internal#[N]: first observation + +--- + +## Quirk #9 — TBD + +*[Placeholder — document here when a new Gitea Actions quirk is discovered.]* + +### Finding + +*[What Gitea Actions does differently from GitHub Actions.]* + +### Impact + +*[Which workflows or operations are affected.]* + +### Workaround + +*[How to work around this quirk.]* + +### References + +- internal#[N]: first observation + +--- + +## Quirk #10 — Gitea does NOT auto-populate `secrets.GITHUB_TOKEN` + +### Finding + +Gitea Actions (1.22.6) does **not** auto-populate `secrets.GITHUB_TOKEN` +the way GitHub Actions does. 
A workflow that references `secrets.GITHUB_TOKEN` +without explicitly provisioning a named secret gets an empty string — not a +read-only token scoped to the repo. + +### Impact + +Workflows that call the Gitea REST API using `secrets.GITHUB_TOKEN` as auth +receive **HTTP 401** on every API call. Affected workflows in molecule-core: + +| Workflow | Symptom | Workaround | +|---|---|---| +| `gate-check-v3.yml` | Reports BLOCKED on every PR | Provision `SOP_TIER_CHECK_TOKEN`; update workflow to use it | +| `qa-review.yml` | Fails immediately on PR open | Same — needs named secret | +| `security-review.yml` | Fails immediately on PR open | Same — needs named secret | + +### How to diagnose + +Add a debug step to the failing workflow: + +```yaml +- name: Diagnose token + run: | + echo "Token present: ${{ secrets.GITHUB_TOKEN != '' }}" + curl -sS --fail -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + "$GITHUB_SERVER_URL/api/v1/user" | jq -r '.login' + # Expected (GitHub): prints your username. + # Actual (Gitea): HTTP 401 or empty string. +``` + +### References + +- internal#325: root-cause analysis and token provisioning +- `feedback_gitea_no_auto_supplied_github_token` + +--- + +## Quirk #11 — PR-create event dispatcher races — only 1 of N workflows fires on `pull_request opened` + +### Finding + +When a PR is created via the Gitea web UI or API, the Gitea Actions event +dispatcher may fire **only 1 of N eligible workflows** on the initial +`pull_request opened` event. All other eligible workflows are silently dropped. + +This was observed on molecule-core PR #558 (created 2026-05-11T19:54:10Z): +12+ workflows had no `paths:` filter and should have fired, but only +`sop-tier-check.yml` dispatched. + +Concurrent PRs created within the same minute received 12–30 dispatches each, +confirming this is specific to the PR-create event dispatch, not a general +runner capacity issue. + +### Impact + +- PRs may not run the full CI suite on first open. 
+- `gate-check-v3`, `secret-scan`, `qa-review`, and `security-review` can be + silently absent from the PR's status checks. +- Branch protection may block merge even though CI is effectively green. + +### How to diagnose + +```bash +# List workflow runs for the PR: +gh run list --event pull_request --repo molecule-ai/molecule-core \ + | grep "$(gh pr view $PR --json number --jq '.number')" + +# Expected: 12+ runs on PR open. +# Actual (when race fires): only 1 run. +``` + +### Workaround + +Force a second dispatch by pushing a no-op synchronize commit: + +```bash +git commit --allow-empty -m "chore: trigger workflows" +git push +``` + +The synchronize event fires a second `pull_request` event, which reliably +triggers all eligible workflows. + +### References + +- internal#329: first observation on PR #558 +- `feedback_gitea_pr_create_dispatcher_race` + +--- + +## When you find a new quirk + +Copy the template below, increment the quirk number, and fill in the finding, +impact, workaround, and references. Place the new section in the **correct +numerical position** (before the next higher-numbered quirk). Update this +section's final paragraph to remove the next slot's number. + +### Template + +```markdown +## Quirk #N — + +### Finding + + + +### Impact + + + +### How to diagnose + + + +### Workaround + + + +### References + +- internal#[N]: first observation +- +``` + +--- + +## Open questions for Gitea 1.23 + +- [ ] **act_runner concurrent-job cap**: issue #305 — runner saturation under + merge burst; needs `max_concurrent_jobs` cap configured on act_runner +- [ ] **Infisical→Gitea secret-sync**: issue #307 — eliminate manual secret + PUTs by wiring an Infisical cron to the Gitea API +- [ ] **PR-create dispatcher race resolution**: internal #329 — is there a + Gitea fix or config knob to disable the race? File upstream bug if not +- [ ] **GITHUB_TOKEN auto-population**: internal #325 — is this on the + Gitea 1.23 roadmap?
If not, the workaround (named secret) is the permanent + answer +