test(canvas): add MemoryTab tests (42 cases)

Cover awareness dashboard expand/collapse, iframe with workspaceId in URL, status grid, KV memory list, expand/collapse entries, add/edit/delete memory entries, JSON parsing, TTL support, 409 conflict retry hint, error states, and refresh. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
chore: retrigger CI after rebase to main
2026-05-12 03:46:50 +00:00 · 2026-05-12 03:46:50 +00:00 · 2026-05-12 03:38:40 +00:00 · 2026-05-12 03:37:52 +00:00 · 2026-05-12 03:34:57 +00:00 · 2026-05-12 03:34:13 +00:00
11 changed files with 992 additions and 715 deletions
@@ -49,11 +49,11 @@ if [ "$MERGED" != "true" ]; then
  exit 0
 fi

-MERGE_SHA=$(echo "$PR" | jq -r '.merge_commit_sha // empty')
-MERGED_BY=$(echo "$PR" | jq -r '.merged_by.login // "unknown"')
-TITLE=$(echo "$PR" | jq -r '.title // ""')
-BASE_BRANCH=$(echo "$PR" | jq -r '.base.ref // "main"')
-HEAD_SHA=$(echo "$PR" | jq -r '.head.sha // empty')
+MERGE_SHA=$(echo "$PR" | jq -r '.merge_commit_sha // empty') || true
+MERGED_BY=$(echo "$PR" | jq -r '.merged_by.login // "unknown"') || true
+TITLE=$(echo "$PR" | jq -r '.title // ""') || true
+BASE_BRANCH=$(echo "$PR" | jq -r '.base.ref // "main"') || true
+HEAD_SHA=$(echo "$PR" | jq -r '.head.sha // empty') || true

 if [ -z "$MERGE_SHA" ]; then
  echo "::warning::PR #${PR_NUMBER} merged=true but no merge_commit_sha — cannot evaluate force-merge."
@@ -75,7 +75,7 @@ STATUS=$(curl -sS -H "$AUTH" \
 declare -A CHECK_STATE
 while IFS=$'\t' read -r ctx state; do
  [ -n "$ctx" ] && CHECK_STATE[$ctx]="$state"
-done < <(echo "$STATUS" | jq -r '.statuses // [] | .[] | "\(.context)\t\(.status)"')
+done < <(echo "$STATUS" | jq -r '.statuses // [] | .[] | "\(.context)\t\(.status)"') || true

 # 4. For each required check, was it green at merge? YAML block scalars
 #    (`|`) leave a trailing newline; skip blank/whitespace-only lines.
@@ -97,7 +97,7 @@ fi

 # 5. Emit structured audit event.
 NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
-FAILED_JSON=$(printf '%s\n' "${FAILED_CHECKS[@]}" | jq -R . | jq -s .)
+FAILED_JSON=$(printf '%s\n' "${FAILED_CHECKS[@]}" | jq -R . | jq -s .) || true

 # Print as a single-line JSON so Vector's parse_json transform can pick
 # it up cleanly from docker_logs.
@@ -301,7 +301,19 @@ def expected_context(job_key: str, workflow_name: str = "ci") -> str:
 # Drift detection
 # --------------------------------------------------------------------------
 def detect_drift(branch: str) -> tuple[list[str], dict]:
-    """Returns (findings, debug). Empty findings == no drift."""
+    """Returns (findings, debug). Empty findings == no drift.
+
+    Raises:
+        ApiError: propagated from the protection fetch only when the
+                  failure is likely a transient Gitea outage (5xx).
+                  403/404 from the protection endpoint is treated as
+                  "cannot determine drift for this branch" — a token-
+                  scope issue (missing repo-admin on DRIFT_BOT_TOKEN) or
+                  a repo with no protection set should not turn the
+                  hourly cron red. The workflow continues to the next
+                  branch; no [ci-drift] issue is filed for a branch
+                  whose protection cannot be read.
+    """
    findings: list[str] = []

    ci_doc = load_yaml(CI_WORKFLOW_PATH)
@@ -313,9 +325,50 @@ def detect_drift(branch: str) -> tuple[list[str], dict]:
    env_set = required_checks_env(audit_doc)

    # Protection
-    # api() raises ApiError on non-2xx; let it propagate so a transient
-    # 500 fails the run loudly rather than producing a "no drift" lie.
-    _, protection = api("GET", f"/repos/{OWNER}/{NAME}/branch_protections/{branch}")
+    # api() raises ApiError on non-2xx. Transient 5xx should fail loud.
+    # 403/404 means the token lacks repo-admin scope (Gitea 1.22.6's
+    # branch_protections endpoint requires it — see DRIFT_BOT_TOKEN
+    # provisioning trail in ci-required-drift.yml). Treat as
+    # "cannot determine drift for this branch" — skip without turning
+    # the workflow red. Surface a clear diagnostic so the operator
+    # knows what to fix.
+    contexts: set[str] = set()
+    protection_path = f"/repos/{OWNER}/{NAME}/branch_protections/{branch}"
+    try:
+        _, protection = api("GET", protection_path)
+    except ApiError as e:
+        # Isolate the HTTP status from the error message.
+        http_status: int | None = None
+        msg = str(e)
+        # ApiError message format: "{method} {path} → HTTP {status}: {body}"
+        import re as _re
+
+        m = _re.search(r"HTTP (\d{3})", msg)
+        if m:
+            http_status = int(m.group(1))
+        if http_status in (403, 404):
+            # Token lacks scope OR branch has no protection. Cannot
+            # determine drift — skip this branch. Do NOT exit non-zero;
+            # the issue IS the alarm, not a red workflow.
+            sys.stderr.write(
+                f"::error::GET {protection_path} returned HTTP {http_status} — "
+                f"DRIFT_BOT_TOKEN lacks repo-admin scope (Gitea 1.22.6 "
+                f"requires it for this endpoint) OR branch has no protection "
+                f"configured. Cannot determine drift for {branch}; "
+                f"skipping. Fix: grant repo-admin to mc-drift-bot or "
+                f"configure protection on {branch}.\n"
+            )
+            debug = {
+                "branch": branch,
+                "ci_jobs": sorted(jobs),
+                "sentinel_needs": sorted(needs),
+                "protection_contexts_skipped": True,
+                "protection_http_status": http_status,
+                "audit_env_checks": sorted(env_set),
+            }
+            return [], debug
+        # 5xx — propagate (transient outage, fail loud per design).
+        raise
    if not isinstance(protection, dict):
        sys.stderr.write(
            f"::error::protection response for {branch} not a JSON object\n"
@@ -96,16 +96,27 @@ API="https://${GITEA_HOST}/api/v1"
 AUTH="Authorization: token ${GITEA_TOKEN}"
 echo "::notice::tier-check start: repo=$OWNER/$NAME pr=$PR_NUMBER author=$PR_AUTHOR"

-# Sanity: token resolves to a user
-WHOAMI=$(curl -sS -H "$AUTH" "${API}/user" | jq -r '.login // ""')
+# Sanity: token resolves to a user.
+# Use || true on the jq pipeline so that set -euo pipefail (line 45) does not
+# cause the script to exit prematurely when the token is empty/invalid — the
+# if check below handles that case gracefully. Without || true, a 401 from an
+# empty/invalid token causes jq to exit 1, triggering set -e and exiting the
+# entire script before SOP_FAIL_OPEN can be evaluated (the check is in the jq-
+# install block; if jq is already on PATH, that block is skipped entirely).
+WHOAMI=$(curl -sS -H "$AUTH" "${API}/user" | jq -r '.login // ""') || true
 if [ -z "$WHOAMI" ]; then
  echo "::error::GITEA_TOKEN cannot resolve a user via /api/v1/user — check the token scope and that the secret is wired correctly."
+  if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
+    echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
+    exit 0
+  fi
  exit 1
 fi
 echo "::notice::token resolves to user: $WHOAMI"

-# 1. Read tier label
-LABELS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/issues/${PR_NUMBER}/labels" | jq -r '.[].name')
+# 1. Read tier label. || true ensures set -euo pipefail does not abort the
+# script if curl or jq fails (e.g. 401 from empty token).
+LABELS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/issues/${PR_NUMBER}/labels" | jq -r '.[].name') || true
 TIER=""
 for L in $LABELS; do
  case "$L" in
@@ -176,17 +187,25 @@ fi
 # 4. Resolve all team names → IDs
 # /orgs/{org}/teams/{slug}/... endpoints don't exist on Gitea 1.22;
 # we use /teams/{id}.
+# set +e prevents set -e from aborting the script if curl fails (e.g. empty token).
 ORG_TEAMS_FILE=$(mktemp)
 trap 'rm -f "$ORG_TEAMS_FILE"' EXIT
+set +e
 HTTP_CODE=$(curl -sS -o "$ORG_TEAMS_FILE" -w '%{http_code}' -H "$AUTH" \
  "${API}/orgs/${OWNER}/teams")
-debug "teams-list HTTP=$HTTP_CODE size=$(wc -c <"$ORG_TEAMS_FILE")"
+_HTTP_EXIT=$?
+set -e
+debug "teams-list HTTP=$HTTP_CODE (curl exit=$_HTTP_EXIT) size=$(wc -c <"$ORG_TEAMS_FILE")"
 if [ "${SOP_DEBUG:-}" = "1" ]; then
  echo "  [debug] teams-list body (first 300 chars):" >&2
  head -c 300 "$ORG_TEAMS_FILE" >&2; echo >&2
 fi
-if [ "$HTTP_CODE" != "200" ]; then
-  echo "::error::GET /orgs/${OWNER}/teams returned HTTP $HTTP_CODE — token likely lacks read:org scope."
+if [ "$_HTTP_EXIT" -ne 0 ] || [ "$HTTP_CODE" != "200" ]; then
+  echo "::error::GET /orgs/${OWNER}/teams failed (curl exit=$_HTTP_EXIT HTTP=$HTTP_CODE) — token may lack read:org scope or be invalid."
+  if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
+    echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
+    exit 0
+  fi
  exit 1
 fi

@@ -231,9 +250,22 @@ for _t in $_all_teams; do
  debug "team-id: $_t → $_id"
 done

-# 5. Read approving reviewers
+# 5. Read approving reviewers. set +e disables set -e temporarily so that curl
+# failures (e.g. empty/invalid token → HTTP 401) do not abort the script before
+# SOP_FAIL_OPEN is evaluated. set -e is restored immediately after.
+set +e
 REVIEWS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}/reviews")
-APPROVERS=$(echo "$REVIEWS" | jq -r '[.[] | select(.state=="APPROVED") | .user.login] | unique | .[]')
+_REVIEWS_EXIT=$?
+set -e
+if [ $_REVIEWS_EXIT -ne 0 ] || [ -z "$REVIEWS" ]; then
+  echo "::error::Failed to fetch reviews (curl exit=$_REVIEWS_EXIT) — token may be invalid or unreachable."
+  if [ "${SOP_FAIL_OPEN:-}" = "1" ]; then
+    echo "::warning::SOP_FAIL_OPEN=1 — exiting 0 so CI does not block."
+    exit 0
+  fi
+  exit 1
+fi
+APPROVERS=$(echo "$REVIEWS" | jq -r '[.[] | select(.state=="APPROVED") | .user.login] | unique | .[]') || true
 if [ -z "$APPROVERS" ]; then
  echo "::error::No approving reviews on this PR. Set SOP_DEBUG=1 and re-run for diagnostics."
  exit 1
@@ -19,13 +19,18 @@ What this script does, per `.gitea/workflows/status-reaper.yml` invocation:
         downstream — Gitea uses ` / ` as the workflow/job separator).
     Classify each by whether `on:` contains a `push:` trigger.

-  2. List the last N (=10) commits on WATCH_BRANCH via
-     GET /repos/{o}/{r}/commits?sha={branch}&limit={N}. rev2 sweeps
-     N commits per tick instead of HEAD only — schedule workflows
-     post `failure` to whatever SHA was HEAD when they COMPLETED, so
-     by the next */5 tick main has often moved forward and the red
-     gets stranded on a stale commit (Phase 1+2 evidence: rev1 saw
-     `compensated:0` every tick across ~6 cycles).
+  2. List the last N (=30, rev3 — widened from 10) commits on
+     WATCH_BRANCH via GET /repos/{o}/{r}/commits?sha={branch}&limit={N}.
+     rev2 sweeps N commits per tick instead of HEAD only — schedule
+     workflows post `failure` to whatever SHA was HEAD when they
+     COMPLETED, so by the next */5 tick main has often moved forward
+     and the red gets stranded on a stale commit. rev3 widens the
+     window from 10 → 30 because schedule workflows post `failure`
+     RETROACTIVELY (5-15 min after their merge); a 10-commit window
+     is narrower than the merge-cadence during a burst, so reds land
+     OUTSIDE the window before reaper sees them (Phase 1+2 evidence:
+     rev2 run 17057 at 02:46Z saw 185/0 contexts on 10 SHAs; direct
+     probe ~30min later showed ~25 fails on those same 10 SHAs).

  3. For EACH SHA in the list:
       - GET combined commit status. Per-SHA error isolation
@@ -502,7 +507,17 @@ def reap(
 # already stale enough that the schedule-run that posted them has long
 # since been overwritten by a real push trigger. See `reference_post_
 # suspension_pipeline` for the merge-cadence baseline.
-DEFAULT_SWEEP_LIMIT = 10
+#
+# rev3 (2026-05-12, hongming-pc2 GO 03:25Z): widened from 10 → 30.
+# rev2 (limit=10) shipped 01:48Z and ran 6/6 ticks post-merge with
+# `compensated:0` despite ~25 stranded reds visible on those same 10
+# SHAs ~30min later. Root cause: schedule workflows post `failure`
+# RETROACTIVELY 5-15 min after their merge, so by the time reaper's
+# next */5 tick lands, the stranded red is on a SHA that has already
+# fallen out of a 10-commit window during a burst-merge period.
+# Trades window-width-cheap for cadence-loady (per hongming-pc2):
+# kept `*/5` cron unchanged; only the window-N is widened.
+DEFAULT_SWEEP_LIMIT = 30


 def list_recent_commit_shas(branch: str, limit: int) -> list[str]:
@@ -23,11 +23,11 @@
 # `feedback_behavior_based_ast_gates` — NOT grep-by-name. That way
 # job renames or matrix-expansion-induced churn produce honest signal.
 #
-# IMPORTANT — TRANSITIONAL STATE: molecule-core's ci.yml does NOT yet
-# contain the `all-required` sentinel job (RFC §4 Phase 4 adds it).
-# Until Phase 4 lands the detector will hard-fail with exit 3 on the
-# missing sentinel. That's intentional: a red workflow on a 5-min cron
-# is louder than a silent issue and forces Phase 4 to land soon.
+# NOTE on protection endpoint scope: `GET /repos/.../branch_protections/{branch}`
+# requires repo-admin role in Gitea 1.22.6. If DRIFT_BOT_TOKEN lacks it,
+# the script skips that branch with a clear ::error:: diagnostic and exits 0
+# (the issue IS the alarm, not a red workflow). See provisioning trail in
+# the run step's GITEA_TOKEN env comment.

 name: ci-required-drift

@@ -37,13 +37,15 @@ name: main-red-watchdog
 # "unknown on type" when `workflow_dispatch.inputs.X` is present. Revisit
 # when Gitea ≥ 1.23 is fleet-wide.
 on:
-  # SCHEDULE DISABLED 2026-05-12 — interim per RFC#420 Option-C machinery-down emergency
-  # Watchdog timing out behind runner saturation; rev3+dedicated-runner-label in flight
-  # Re-enable after rev3 lands + runner saturation root resolved
-  #   schedule:
-  #     # Hourly at :05 — task spec calls for "off-zero" (`5 * * * *`),
-  #     # offset from :17 (ci-required-drift) and :00 (peak cron load).
-  #     - cron: '5 * * * *'
+  # SCHEDULE RE-ENABLED 2026-05-12 rev3 — interim disable (mc#645) reverted alongside
+  # status-reaper rev3 (widen-window). Job-level timeout-minutes raised 5 → 15 below
+  # to absorb runner-saturation latency without spurious cancels (the original cascade
+  # cause). If runner-saturation root persists, the dedicated-runner-label split
+  # remains the structural next step (tracked separately).
+  schedule:
+    # Hourly at :05 — task spec calls for "off-zero" (`5 * * * *`),
+    # offset from :17 (ci-required-drift) and :00 (peak cron load).
+    - cron: '5 * * * *'
  workflow_dispatch:

 # Read commit status + branch ref + issues; write issues (open/PATCH/close).
@@ -61,7 +63,12 @@ concurrency:
 jobs:
  watchdog:
    runs-on: ubuntu-latest
-    timeout-minutes: 5
+    # rev3 (2026-05-12, mc#645 revert): raised 5 → 15 to absorb runner-saturation
+    # latency. Original 5min cap was producing 124-style cancels under load,
+    # which fed the very `[main-red]` issues this workflow files (self-poisoning).
+    # 15min is still well below Gitea-default 6h job ceiling; if a real hang
+    # occurs the issue-file path is still the alarm surface.
+    timeout-minutes: 15
    steps:
      - name: Check out repo (script lives at .gitea/scripts/)
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
@@ -53,16 +53,19 @@ name: status-reaper
 # `inputs:` block here. Gitea 1.22.6 rejects the whole workflow as
 # "unknown on type" when `workflow_dispatch.inputs.X` is present.
 on:
-  # SCHEDULE DISABLED 2026-05-12 — interim per RFC#420 Option-C machinery-down emergency
-  # Reaper rev2 not compensating + watchdog timeout-cascade; rev3 in flight
-  # Re-enable after rev3 lands + runner saturation root resolved
-  #   schedule:
-  #     # Every 5 minutes. Off-zero alignment with sibling cron workflows:
-  #     # ci-required-drift (`:17`), main-red-watchdog (`:05`),
-  #     # railway-pin-audit (`:23`). 5-min cadence gives a tight enough
-  #     # close on schedule-triggered false-reds that main-red-watchdog
-  #     # (hourly :05) almost never files an issue on the false case.
-  #     - cron: '*/5 * * * *'
+  # SCHEDULE RE-ENABLED 2026-05-12 rev3 — interim disable (mc#645) reverted now that
+  # rev3 widens DEFAULT_SWEEP_LIMIT 10 → 30 (covers retroactive-failure timing window).
+  # Sibling watchdog re-enabled in the same PR with timeout-minutes raised 5 → 15.
+  schedule:
+    # Every 5 minutes. Off-zero alignment with sibling cron workflows:
+    # ci-required-drift (`:17`), main-red-watchdog (`:05`),
+    # railway-pin-audit (`:23`). 5-min cadence gives a tight enough
+    # close on schedule-triggered false-reds that main-red-watchdog
+    # (hourly :05) almost never files an issue on the false case.
+    # rev3 keeps `*/5` unchanged per hongming-pc2 03:25Z review:
+    # "trades window-width-cheap for cadence-loady" — N=30 widens
+    # the lookback cheaply without doubling runner load via `*/2`.
+    - cron: '*/5 * * * *'
  workflow_dispatch:

 # Compensating-status POST needs write on repo statuses; no other
@@ -53,9 +53,20 @@ jobs:
      - name: Build
        run: go build ./cmd/server

+      # `go vet` is NOT `|| true`-guarded: surfacing latent vet errors on main is
+      # the whole point of this workflow (issue #567 — the motivating case was a
+      # `go vet` error in org_external.go that sat undetected on main for weeks).
+      # A vet error here fails the step → fails the job → shows red on the weekly
+      # commit. Per Gitea quirk #10 (job-level continue-on-error is ignored), that
+      # red surfaces on main — which is the intended signal, not a regression.
      - name: go vet
-        run: go vet ./... || true
+        run: go vet ./...

+      # golangci-lint stays `|| true`-guarded: lint is noisier (more false-
+      # positives than vet) and golangci-lint may not be pre-installed on every
+      # runner image — a `|| true` here keeps a missing-binary or lint-noise case
+      # from masking the vet/test signal above. Tighten to match ci.yml's lint
+      # gate if/when ci.yml's lint step becomes hard-failing.
      - name: golangci-lint
        run: golangci-lint run --timeout 3m ./... || true

@@ -4,11 +4,11 @@ Documents persistent operational findings about Gitea Actions runner behaviour
 that differ from GitHub Actions and require workarounds in workflow YAML or
 runbooks.

-> Last updated: 2026-05-11 (core-devops-agent)
+> Last updated: 2026-05-12 (infra-runtime-be-agent)

 ---

-## Large repo causes fetch timeout on Gitea Actions runner
+## Quirk #1 — Large repo causes fetch timeout on Gitea Actions runner

 ### Finding

@@ -68,7 +68,7 @@ confirming this is a repo-size constraint, not network isolation.

 ---

-## `continue-on-error` only works at step level, not job level
+## Quirk #2 — `continue-on-error` only works at step level, not job level

 ### Finding

@@ -112,12 +112,12 @@ jobs:

 ### References

- Gitea Actions quirk #10 (from migration checklist)
+- Quirk #10 (this document): Gitea does NOT auto-populate `secrets.GITHUB_TOKEN`
 - PR #441: fix applied to `harness-replays.yml`

 ---

-## `workflow_dispatch.inputs` not supported
+## Quirk #3 — `workflow_dispatch.inputs` not supported

 Gitea 1.22.6 parser rejects `workflow_dispatch.inputs`. Drop from all workflow
 YAML files ported from GitHub Actions. Manual triggers should use
@@ -127,21 +127,21 @@ YAML files ported from GitHub Actions. Manual triggers should use

 ---

-## `merge_group` not supported
+## Quirk #4 — `merge_group` not supported

 Gitea has no merge queue concept. Drop `merge_group:` triggers from all
 workflow YAML files.

 ---

-## `environment:` blocks not supported
+## Quirk #5 — `environment:` blocks not supported

 Gitea has no environments concept. Drop `environment:` from all workflow YAML
 files. Secrets and variables are repo-level.

 ---

-## Gitea combined status reports `failure` when all contexts are `null`
+## Quirk #6 — Gitea combined status reports `failure` when all contexts are `null`

 ### Finding

@@ -189,3 +189,215 @@ primary consumer of combined status and is affected.

 - Issue #481: first real-world case of this bug (2026-05-11)
 - `feedback_no_such_thing_as_flakes`: watchdog directive
+
+---
+
+## Quirk #7 — TBD
+
+*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
+
+### Finding
+
+*[What Gitea Actions does differently from GitHub Actions.]*
+
+### Impact
+
+*[Which workflows or operations are affected.]*
+
+### Workaround
+
+*[How to work around this quirk.]*
+
+### References
+
+- internal#[N]: first observation
+
+---
+
+## Quirk #8 — TBD
+
+*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
+
+### Finding
+
+*[What Gitea Actions does differently from GitHub Actions.]*
+
+### Impact
+
+*[Which workflows or operations are affected.]*
+
+### Workaround
+
+*[How to work around this quirk.]*
+
+### References
+
+- internal#[N]: first observation
+
+---
+
+## Quirk #9 — TBD
+
+*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
+
+### Finding
+
+*[What Gitea Actions does differently from GitHub Actions.]*
+
+### Impact
+
+*[Which workflows or operations are affected.]*
+
+### Workaround
+
+*[How to work around this quirk.]*
+
+### References
+
+- internal#[N]: first observation
+
+---
+
+## Quirk #10 — Gitea does NOT auto-populate `secrets.GITHUB_TOKEN`
+
+### Finding
+
+Gitea Actions (1.22.6) does **not** auto-populate `secrets.GITHUB_TOKEN`
+the way GitHub Actions does. A workflow that references `secrets.GITHUB_TOKEN`
+without explicitly provisioning a named secret gets an empty string — not a
+read-only token scoped to the repo.
+
+### Impact
+
+Workflows that call the Gitea REST API using `secrets.GITHUB_TOKEN` as auth
+receive **HTTP 401** on every API call. Affected workflows in molecule-core:
+
+| Workflow | Symptom | Workaround |
+|---|---|---|
+| `gate-check-v3.yml` | Reports BLOCKED on every PR | Provision `SOP_TIER_CHECK_TOKEN`; update workflow to use it |
+| `qa-review.yml` | Fails immediately on PR open | Same — needs named secret |
+| `security-review.yml` | Fails immediately on PR open | Same — needs named secret |
+
+### How to diagnose
+
+Add a debug step to the failing workflow:
+
+```yaml
+- name: Diagnose token
+  run: |
+    echo "Token present: ${{ secrets.GITHUB_TOKEN != '' }}"
+    curl -sS --fail -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+      "$GITHUB_SERVER_URL/api/v1/user" | jq -r '.login'
+    # Expected (GitHub): prints your username.
+    # Actual (Gitea): HTTP 401 or empty string.
+```
+
+### References
+
+- internal#325: root-cause analysis and token provisioning
+- `feedback_gitea_no_auto_supplied_github_token`
+
+---
+
+## Quirk #11 — PR-create event dispatcher races — only 1 of N workflows fires on `pull_request opened`
+
+### Finding
+
+When a PR is created via the Gitea web UI or API, the Gitea Actions event
+dispatcher may fire **only 1 of N eligible workflows** on the initial
+`pull_request opened` event. All other eligible workflows are silently dropped.
+
+This was observed on molecule-core PR #558 (created 2026-05-11T19:54:10Z):
+12+ workflows had no `paths:` filter and should have fired, but only
+`sop-tier-check.yml` dispatched.
+
+Concurrent PRs created within the same minute received 12–30 dispatches each,
+confirming this is specific to the PR-create event dispatch, not a general
+runner capacity issue.
+
+### Impact
+
+- PRs may not run the full CI suite on first open.
+- `gate-check-v3`, `secret-scan`, `qa-review`, and `security-review` can be
+  silently absent from the PR's status checks.
+- Branch protection may block merge even though CI is effectively green.
+
+### How to diagnose
+
+```bash
+# List workflow runs for the PR:
+gh run list --event pull_request --repo molecule-ai/molecule-core \
+  | grep "$(gh pr view $PR --json number --jq '.number')"
+
+# Expected: 12+ runs on PR open.
+# Actual (when race fires): only 1 run.
+```
+
+### Workaround
+
+Force a second dispatch by pushing a no-op synchronize commit:
+
+```bash
+git commit --allow-empty -m "chore: trigger workflows [skip ci]"
+git push
+```
+
+The synchronize event fires a second `pull_request` event, which reliably
+triggers all eligible workflows.
+
+### References
+
+- internal#329: first observation on PR #558
+- `feedback_gitea_pr_create_dispatcher_race`
+
+---
+
+## When you find a new quirk
+
+Copy the template below, increment the quirk number, and fill in the finding,
+impact, workaround, and references. Place the new section in the **correct
+numerical position** (before the next higher-numbered quirk). Update this
+section's final paragraph to remove the next slot's number.
+
+### Template
+
+```markdown
+## Quirk #N — <short title>
+
+### Finding
+
+<What Gitea Actions does differently from GitHub Actions.>
+
+### Impact
+
+<Which workflows or operations are affected. Include an affected workflows
+table if more than one is affected.>
+
+### How to diagnose
+
+<Shell commands or API calls that confirm this is the quirk, not a real failure.>
+
+### Workaround
+
+<How to work around this quirk in workflow YAML or operations.>
+
+### References
+
+- internal#[N]: first observation
+- <Any Gitea issue, feedback label, or upstream bug tracker reference>
+```
+
+---
+
+## Open questions for Gitea 1.23
+
+- [ ] **act_runner concurrent-job cap**: issue #305 — runner saturation under
+  merge burst; needs `max_concurrent_jobs` cap configured on act_runner
+- [ ] **Infisical→Gitea secret-sync**: issue #307 — eliminate manual secret
+  PUTs by wiring an Infisical cron to the Gitea API
+- [ ] **PR-create dispatcher race resolution**: internal #329 — is there a
+  Gitea fix or config knob to disable the race? File upstream bug if not
+- [ ] **GITHUB_TOKEN auto-population**: internal #325 — is this on the
+  Gitea 1.23 roadmap? If not, the workaround (named secret) is the permanent
+  answer
+
@@ -713,6 +713,92 @@ def test_reap_skips_combined_success_shas(sr_module, monkeypatch):
    assert posts[0][0] == f"/repos/owner/repo/statuses/{SHA_B}"


+def test_default_sweep_limit_is_30(sr_module):
+    """rev3 contract: `DEFAULT_SWEEP_LIMIT = 30` (widened from rev2's 10).
+
+    Root cause of the widening: schedule workflows post `failure`
+    RETROACTIVELY 5-15 min after their merge. A 10-commit window is
+    narrower than the merge-cadence during a burst, so reds land
+    OUTSIDE the window before reaper's next tick sees them.
+
+    Evidence: rev2 run 17057 (02:46Z 2026-05-12) saw 185 contexts / 0
+    fails on its 10 SHAs; direct probe ~30min later showed ~25 fails
+    on those same 10 SHAs.
+
+    If this default is ever lowered back, that change MUST cite
+    re-measured cadence data — a smaller window than the
+    retroactive-failure-post lag re-introduces compensated:0.
+    """
+    assert sr_module.DEFAULT_SWEEP_LIMIT == 30
+
+
+def test_reap_widened_window_catches_retroactive_failure(sr_module, monkeypatch):
+    """rev3 regression: with limit=30, a stranded red on a SHA at depth=20
+    (which the rev2 limit=10 window would have missed) IS swept + compensated.
+
+    Why this matters: rev2 ran with limit=10 and saw `compensated:0` for
+    6 consecutive ticks despite ~25 known-stranded reds across the last
+    30 main commits. Widening to 30 must demonstrably catch a SHA past
+    the old window. We mock 30 SHAs, plant the failure on SHA[20], and
+    verify exactly one compensation lands on that SHA.
+    """
+    shas = [f"{c:02x}" * 20 for c in range(30)]  # 30 deterministic SHAs
+    failing_sha = shas[20]  # depth 20 — outside rev2's window=10, inside rev3's =30
+
+    posts: list[tuple[str, dict]] = []
+
+    def fake_api(method, path, *, body=None, query=None, expect_json=True):
+        if method == "GET" and path.endswith("/commits"):
+            # /commits listing — return all 30 fake commit objects
+            assert query.get("limit") == "30", (
+                f"expected limit=30 in query, got {query}"
+            )
+            return (200, [{"sha": s} for s in shas])
+        if method == "GET" and "/commits/" in path and path.endswith("/status"):
+            sha = path.split("/commits/")[1].split("/status")[0]
+            if sha == failing_sha:
+                return (
+                    200,
+                    {
+                        "state": "failure",
+                        "statuses": [
+                            {
+                                "context": "retroactive-drift / drift (push)",
+                                "state": "failure",
+                                "target_url": "https://example.test/run/9001",
+                            }
+                        ],
+                    },
+                )
+            # All others combined=success (cost-opt short-circuit).
+            return (200, {"state": "success", "statuses": []})
+        if method == "POST":
+            posts.append((path, body))
+            return (201, {})
+        raise AssertionError(f"unexpected api call: {method} {path}")
+
+    monkeypatch.setattr(sr_module, "api", fake_api)
+
+    workflow_map = {"retroactive-drift": False}  # schedule-only → class-O
+    counters = sr_module.reap_branch(
+        workflow_map, "main", limit=sr_module.DEFAULT_SWEEP_LIMIT, dry_run=False
+    )
+
+    # All 30 SHAs walked; exactly one compensated.
+    assert counters["scanned_shas"] == 30
+    assert counters["compensated"] == 1
+    assert failing_sha in counters["compensated_per_sha"]
+    assert counters["compensated_per_sha"][failing_sha] == [
+        "retroactive-drift / drift (push)"
+    ]
+    assert len(posts) == 1
+    assert posts[0][0] == f"/repos/owner/repo/statuses/{failing_sha}"
+    # Sanity: with rev2's window=10, depth=20 would NOT have been reached.
+    # This assertion documents the rev3 widening as the structural fix:
+    # the failing_sha index (20) is strictly greater than rev2's old limit (10).
+    assert shas.index(failing_sha) >= 10
+
+
 def test_reap_continues_on_per_sha_apierror(sr_module, monkeypatch, capsys):
    """rev2 refinement #7 (MOST CRITICAL): a transient ApiError or HTTP-5xx
    on get_combined_status(SHA_X) must NOT fail the whole tick. Log + skip