diff --git a/.github/workflows/auto-promote-on-e2e.yml b/.github/workflows/auto-promote-on-e2e.yml
index 82d771a6..6184a80d 100644
--- a/.github/workflows/auto-promote-on-e2e.yml
+++ b/.github/workflows/auto-promote-on-e2e.yml
@@ -271,23 +271,46 @@ jobs:
             echo " ok: $tag exists"
           done
 
-      - name: Ancestry check — refuse to promote :latest backwards
-        # #2244: workflow_run completions arrive in arbitrary order. If
-        # SHA-A and SHA-B both reach main within ~10 min and SHA-B's E2E
-        # completes before SHA-A's, this workflow can fire for SHA-A
-        # AFTER it already promoted SHA-B → :latest goes backwards. The
-        # orphan-reconciler "next run corrects it" doesn't apply: there's
-        # no auto-corrective re-promote, :latest stays wrong until the
-        # next main push lands.
+      - name: Checkout for local ancestry compute
+        # #2244: workflow_run completions arrive in arbitrary order. The
+        # ancestry check below uses `git merge-base --is-ancestor` to
+        # compare CURRENT_REVISION (read off the live :latest image) and
+        # TARGET_SHA (this run's SHA) — both full commit SHAs.
         #
+        # Why a local checkout, not the Gitea compare API:
+        #
+        # Gitea v1.22's `/api/v1/repos/.../compare/A...B` does NOT accept
+        # full commit SHAs as either side — it returns
+        #   {"total_commits":null, "message":"BaseNotExist"}
+        # for any non-branch / non-tag ref (verified 2026-05-07, issue
+        # #75). Branch + tag refs work, but ancestry between two
+        # arbitrary historical commits does not. The previous version
+        # called GitHub's `gh api repos/.../compare/A...B` which DOES
+        # accept SHAs and returns `.status: ahead|behind|identical|
+        # diverged` — that surface simply doesn't exist on Gitea.
+        #
+        # Local git is exact, fast (depth=200 covers any realistic
+        # divergence between :latest and a candidate retag — promote
+        # cycles are minutes, not hundreds of commits), and removes the
+        # cross-host API dependency entirely.
+        if: steps.gate.outputs.proceed == 'true' && github.event_name != 'workflow_dispatch'
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          # Need enough history to resolve both CURRENT_REVISION and
+          # TARGET_SHA + their merge-base. 200 covers ~a week of main
+          # activity at the current commit cadence. Bump if a future
+          # cron pause lets :latest fall further behind.
+          fetch-depth: 200
+
+      - name: Ancestry check — refuse to promote :latest backwards
         # Detection: read current :latest's `org.opencontainers.image.revision`
         # label (set by publish-workspace-server-image.yml at build time)
-        # and ask the GitHub compare API whether the candidate SHA is
-        # ahead-of / identical-to / behind / diverged-from current.
-        # Hard-fail on `behind` and `diverged` per the approved design —
-        # silent-bypass is the class we're moving away from. Workflow
-        # goes red, oncall sees it, operator decides how to recover
-        # (manual dispatch with the right SHA, force-promote, etc.).
+        # and ask local git whether the candidate SHA is ahead-of /
+        # identical-to / behind / diverged-from current. Hard-fail on
+        # `behind` and `diverged` per the approved design — silent-
+        # bypass is the class we're moving away from. Workflow goes red,
+        # oncall sees it, operator decides how to recover (manual
+        # dispatch with the right SHA, force-promote, etc.).
         #
         # Manual dispatch skips this check — operator override semantics
         # match the gate-check step above.
@@ -299,7 +322,6 @@ jobs:
         if: steps.gate.outputs.proceed == 'true' && github.event_name != 'workflow_dispatch'
         id: ancestry
         env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           REPO: ${{ github.repository }}
           TARGET_SHA: ${{ steps.sha.outputs.full }}
         run: |
@@ -331,13 +353,71 @@ jobs:
             exit 0
           fi
 
-          # Ask GitHub which side of the merge graph TARGET_SHA sits on
-          # relative to CURRENT_REVISION. Returns one of: ahead | identical
-          # | behind | diverged. Network or auth errors collapse to "error"
-          # via the explicit fallback so the case below always matches.
-          STATUS=$(gh api \
-            "repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}" \
-            --jq '.status' 2>/dev/null || echo "error")
+          # Compute ancestry locally with git. The four buckets match
+          # GitHub's compare API status semantics exactly:
+          #
+          #   ahead     — target reaches current via parent pointers
+          #               AND current does not reach target. I.e. target
+          #               is a descendant of current → :latest moves
+          #               forward, allow.
+          #   identical — caught above by SHA-equality early-return.
+          #   behind    — current reaches target via parent pointers
+          #               AND target does not reach current. I.e. target
+          #               is an ancestor of current → moving :latest
+          #               backwards (the #2244 race), block.
+          #   diverged  — neither reaches the other. Force-push or
+          #               history rewrite, block.
+          #
+          # `git merge-base --is-ancestor X Y` exits 0 if X is an
+          # ancestor of Y, 1 if it is not, and some other non-zero
+          # status on a git error. Only 0 and 1 are ancestry answers;
+          # any other status maps to "error" instead of being read as
+          # "not an ancestor". Both calls are cheap (constant-ish in
+          # depth, which we bounded at 200 above).
+          #
+          # Both SHAs MUST be reachable in the runner's clone. If
+          # either rev-parse fails (e.g. the depth=200 we fetched isn't
+          # deep enough for an unusually old :latest revision), fall
+          # back to "error" — the `error` branch of the case below
+          # exits 1 and surfaces an explicit failure for operator
+          # action, same as a network blip in the old GitHub version.
+          #
+          # A shallow clone can confirm "is an ancestor" (every parent
+          # edge it holds is real) but cannot confirm "is not": a path
+          # that crosses the depth cut-off is simply absent. When
+          # neither direction succeeds in a shallow clone the answer is
+          # inconclusive, so report "error" rather than "diverged".
+          ERROR_REASON=""
+          FWD_RC=0
+          REV_RC=0
+          if ! git rev-parse --verify --quiet "${CURRENT_REVISION}^{commit}" >/dev/null \
+             || ! git rev-parse --verify --quiet "${TARGET_SHA}^{commit}" >/dev/null; then
+            STATUS="error"
+            ERROR_REASON="one or both commits are missing from the runner clone"
+          else
+            git merge-base --is-ancestor "$CURRENT_REVISION" "$TARGET_SHA" 2>/dev/null || FWD_RC=$?
+            if [ "$FWD_RC" -eq 1 ]; then
+              git merge-base --is-ancestor "$TARGET_SHA" "$CURRENT_REVISION" 2>/dev/null || REV_RC=$?
+            fi
+            if [ "$FWD_RC" -eq 0 ]; then
+              # CURRENT is ancestor of TARGET → TARGET is ahead.
+              STATUS="ahead"
+            elif [ "$FWD_RC" -ne 1 ] || [ "$REV_RC" -gt 1 ]; then
+              # git itself failed; this is not an ancestry answer.
+              STATUS="error"
+              ERROR_REASON="git merge-base failed (exit codes ${FWD_RC}/${REV_RC})"
+            elif [ "$REV_RC" -eq 0 ]; then
+              # TARGET is ancestor of CURRENT → TARGET is behind.
+              STATUS="behind"
+            elif [ "$(git rev-parse --is-shallow-repository 2>/dev/null)" = "true" ]; then
+              # Neither direction proven, but history is truncated.
+              STATUS="error"
+              ERROR_REASON="shallow clone cannot rule out ancestry across the depth cut-off"
+            else
+              # Neither reaches the other → divergent.
+              STATUS="diverged"
+            fi
+          fi
 
           echo "ancestry compare ${CURRENT_REVISION:0:7} → ${TARGET_SHA:0:7}: $STATUS"
 
@@ -346,10 +426,6 @@ jobs:
               echo "decision=ahead" >> "$GITHUB_OUTPUT"
               echo "::notice::Target ${TARGET_SHA:0:7} is ahead of current :latest (${CURRENT_REVISION:0:7}) — proceeding with retag"
               ;;
-            identical)
-              echo "decision=identical" >> "$GITHUB_OUTPUT"
-              echo "::notice::Target identical to :latest — retag will be a no-op"
-              ;;
             behind)
               echo "decision=behind" >> "$GITHUB_OUTPUT"
               {
@@ -359,7 +435,7 @@ jobs:
                 echo "|---|---|"
                 echo "| Target SHA | \`$TARGET_SHA\` |"
                 echo "| Current :latest revision | \`$CURRENT_REVISION\` |"
-                echo "| GitHub compare status | \`behind\` |"
+                echo "| Ancestry compute | \`behind\` (target is an ancestor of :latest) |"
                 echo
                 echo "This guard catches the workflow_run-completion-order race (#2244):"
                 echo "two rapid main pushes whose E2Es complete out-of-order can otherwise"
@@ -380,7 +456,7 @@ jobs:
                 echo "|---|---|"
                 echo "| Target SHA | \`$TARGET_SHA\` |"
                 echo "| Current :latest revision | \`$CURRENT_REVISION\` |"
-                echo "| GitHub compare status | \`diverged\` |"
+                echo "| Ancestry compute | \`diverged\` (neither commit reaches the other) |"
                 echo
                 echo "Likely cause: force-push rewrote main's history, leaving the previous"
                 echo "\`:latest\` revision orphaned. Needs human review before \`:latest\` advances."
@@ -390,9 +466,10 @@ jobs:
             error|*)
               echo "decision=error" >> "$GITHUB_OUTPUT"
               {
-                echo "## ❌ Auto-promote aborted — ancestry-check API error"
+                echo "## ❌ Auto-promote aborted — ancestry-check error"
                 echo
-                echo "\`gh api repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}\` returned unexpected status: \`$STATUS\`"
+                echo "Could not determine ancestry between \`$CURRENT_REVISION\` and \`$TARGET_SHA\` in the runner clone (status=\`$STATUS\`, reason: ${ERROR_REASON:-unknown})."
+                echo "If the reason is missing or truncated history, \`fetch-depth: 200\` did not reach \`$CURRENT_REVISION\` — increase the fetch depth in this workflow."
                 echo
                 echo "Manual dispatch with the target sha bypasses this check."
} >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/check-merge-group-trigger.yml b/.github/workflows/check-merge-group-trigger.yml index 49ca669a..e09870dc 100644 --- a/.github/workflows/check-merge-group-trigger.yml +++ b/.github/workflows/check-merge-group-trigger.yml @@ -1,19 +1,64 @@ name: Check merge_group trigger on required workflows -# Pre-merge guard against the deadlock pattern where a workflow whose -# check is in `required_status_checks` lacks a `merge_group:` trigger. -# Without it, GitHub merge queue stalls forever in AWAITING_CHECKS -# because the required check can't fire on `gh-readonly-queue/...` refs. +# Stub workflow — Gitea Actions has no merge queue (post-2026-05-06 +# SCM migration off GitHub). # -# This workflow: -# 1. Lists required status checks on the branch protection rule for `staging` -# 2. For each required check, finds the workflow that produces it (by job -# name match) -# 3. Fails if any such workflow lacks `merge_group:` in its triggers +# Why this is a stub, not a real linter: # -# Reasoning for staging-only: main has its own CI gating model (PR review), -# but staging is what the merge queue runs on, so it's the trigger that -# matters. +# 1. The original workflow existed exclusively to lint that any +# workflow producing a required status check on the staging branch +# also declares a `merge_group:` trigger. Without that trigger, +# GitHub's merge queue dead-locks at AWAITING_CHECKS forever +# because the required check can't fire on `gh-readonly-queue/...` +# refs (a GitHub-merge-queue-specific ref shape). +# +# 2. Gitea has no merge queue. There is no `gh-readonly-queue/...` +# ref shape on Gitea. There is no `merge_group` event type in the +# Gitea Actions trigger surface. The deadlock pattern this linter +# catches CANNOT occur on the new SCM, by construction. +# +# 3. 
The original linter additionally called +# `gh api repos/.../branches/staging/protection/required_status_checks`, +# which on Gitea returns 404 — Gitea's branch-protection API lives +# at `/repos/{owner}/{repo}/branch_protections/{name}` (different +# path), and the response shape uses `status_check_contexts` +# instead of `contexts`. So the linter would have failed to fetch +# the required-check list even if Gitea had a merge queue. +# +# What this stub preserves: +# +# - Workflow name `Check merge_group trigger on required workflows` +# is unchanged so any external surface that referenced it (none +# confirmed via branch-protection audit 2026-05-07) keeps resolving. +# - Trigger surface (pull_request, push, merge_group). The +# merge_group: trigger is a no-op on Gitea (the event never fires) +# but kept declared so the workflow file itself stays a faithful +# subscriber to anything that DID fire it on the GitHub side, in +# case we ever migrate back. +# +# Re-enabling real linting (future work): +# +# - If/when Gitea grows a merge queue (none on the roadmap as of +# 2026-05-07), reinstate the linter using +# `/api/v1/repos/.../branch_protections/{branch}` to read the +# required-check list (.status_check_contexts), then walk +# workflow files for the appropriate Gitea-shape trigger. +# - Alternative invariant worth checking on Gitea today: that any +# workflow producing a status_check_context declared in branch +# protection has at least one non-skipped path through to a +# terminal step (the "required check name needs a job that always +# runs" invariant, feedback_branch_protection_check_name_parity). +# That is covered separately by `branch-protection-drift.yml` and +# the check-name parity gate (PR #56, issue #144), so this stub +# doesn't need to also cover it. 
+# +# Until merge-queue semantics return to this fleet, the stub keeps +# the workflow surface visible in Gitea's Actions UI so the next +# operator notices it's a stub instead of a missing surface, and +# emits success so the surrounding chain isn't artificially red. +# +# Issue tracking: post-#66 sweep tracked in #75 (class D — `gh api` +# REST passthroughs that have no Gitea v1 equivalent). on: pull_request: @@ -25,7 +70,8 @@ on: paths: - '.github/workflows/**.yml' - '.github/workflows/**.yaml' - # Self-listen on merge_group so the linter passes its own queue run. + # Self-listen on merge_group is structurally a no-op on Gitea; kept + # declared for parity with the GitHub-era contract. merge_group: types: [checks_requested] @@ -36,88 +82,25 @@ jobs: permissions: contents: read steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Verify merge_group trigger on required-check workflows - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO: ${{ github.repository }} + - name: Stub — Gitea has no merge queue shell: bash run: | set -euo pipefail + cat <<'EOF' + merge_group-trigger linter is a no-op on Gitea Actions. - # Branch we care about — the one merge queue runs on. - BRANCH=staging + Reason: Gitea has no merge queue. The dead-lock pattern this + linter catches (required check missing merge_group: trigger + → AWAITING_CHECKS forever on gh-readonly-queue refs) cannot + occur on this SCM by construction. - # Pull the list of required status check contexts. If the branch - # has no protection or no required checks, exit clean — nothing - # to lint. - REQUIRED=$(gh api "repos/${REPO}/branches/${BRANCH}/protection/required_status_checks" \ - --jq '.contexts[]' 2>/dev/null || true) - if [ -z "$REQUIRED" ]; then - echo "No required status checks on ${BRANCH} — nothing to verify." 
- exit 0 - fi + Adjacent live linters that DO catch real Gitea-shape failure + modes: + - branch-protection-drift.yml — guards the protection rules + - check-name parity gate (PR #56, issue #144) — guards + that path-filtered workflows still emit required checks - echo "Required checks on ${BRANCH}:" - echo "${REQUIRED}" | sed 's/^/ - /' - echo - - # Build a map: workflow file -> set of job names declared in it. - # We use yq if available, otherwise grep the `name:` lines under - # `jobs:`. Stick with grep for portability — runner image always - # has it; yq isn't in the default image as of 2026-04. - declare -A workflow_jobs - shopt -s nullglob - for wf in .github/workflows/*.yml .github/workflows/*.yaml; do - [ -f "$wf" ] || continue - # Extract the workflow name (the `name:` at file root). - wf_name=$(awk '/^name:[[:space:]]/ {sub(/^name:[[:space:]]+/,""); gsub(/^"|"$/,""); print; exit}' "$wf") - # Extract job step names from the `jobs:` block. A job step is: - # - id under `jobs:` (key with 2-space indent followed by colon) - # - the `name:` field inside that job (4-space indent) - # We collect both because required_status_checks contexts can - # match either, depending on how the workflow was authored. - jobs_block=$(awk '/^jobs:/{flag=1; next} flag' "$wf") - job_names=$(echo "$jobs_block" | awk '/^[[:space:]]{4}name:[[:space:]]/ {sub(/^[[:space:]]+name:[[:space:]]+/,""); gsub(/^["'"'"']|["'"'"']$/,""); print}') - workflow_jobs["$wf"]="${wf_name}"$'\n'"${job_names}" - done - - # For each required check, find the workflow that produces it. - # Then verify that workflow lists merge_group as a trigger. - FAILED=0 - while IFS= read -r check; do - [ -z "$check" ] && continue - owning_wf="" - for wf in "${!workflow_jobs[@]}"; do - if echo "${workflow_jobs[$wf]}" | grep -Fxq "$check"; then - owning_wf="$wf" - break - fi - done - - if [ -z "$owning_wf" ]; then - echo "::warning::Required check '${check}' has no matching workflow in this repo. 
Skipping (may be from an external app)." - continue - fi - - # Does the workflow's trigger list include merge_group? - # Match either bare `merge_group:` line or merge_group with - # subsequent indented config (types: [checks_requested]). - if grep -qE '^[[:space:]]*merge_group:' "$owning_wf"; then - echo "OK: '${check}' (in $owning_wf) — has merge_group trigger" - else - echo "::error file=${owning_wf}::Required check '${check}' is produced by ${owning_wf}, but the workflow does not declare a 'merge_group:' trigger. With merge queue enabled on ${BRANCH}, this will deadlock the queue (every PR sits AWAITING_CHECKS forever). Add this to the workflow's 'on:' block:" - echo "::error file=${owning_wf}:: merge_group:" - echo "::error file=${owning_wf}:: types: [checks_requested]" - FAILED=1 - fi - done <<< "$REQUIRED" - - if [ "$FAILED" -ne 0 ]; then - echo - echo "::error::Block. See errors above. Reference: $(grep -l 'reference_merge_queue' /dev/null 2>/dev/null || echo 'memory: reference_merge_queue_enablement.md')." - exit 1 - fi - - echo - echo "All required workflows on ${BRANCH} declare merge_group triggers." + Issue: #75 (post-#66 gh-CLI sweep, class D — gh api REST + calls that have no Gitea v1 equivalent). + EOF + echo "::notice::merge_group-trigger linter is a stub on Gitea — see workflow file header for context." diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6b447291..e69ae005 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -299,42 +299,50 @@ jobs: # Only fires on direct pushes to main (i.e. after staging→main promotion). if: needs.changes.outputs.canvas == 'true' && github.event_name == 'push' && github.ref == 'refs/heads/main' permissions: - # Required to post commit comments via the GitHub API. 
- contents: write + contents: read steps: - - name: Post deploy reminder as commit comment + - name: Write deploy reminder to step summary + # Post-2026-05-06 (Gitea migration, issue #75): the previous + # version called `gh api -X POST repos/.../commits/SHA/comments` + # to drop a commit comment for the operator. Gitea's REST API + # does NOT expose `/repos/{owner}/{repo}/commits/{sha}/comments` + # at all (verified 2026-05-07 — only commit *statuses* exist on + # the commit endpoint; comments don't), so the call 404'd on + # every main push since the migration. + # + # The reminder content is entirely operator-facing and just as + # discoverable in the run summary, so write it there instead. + # Operators land on the workflow run page anyway when they need + # to action a deploy; commit comments were a stale UI artefact + # of the GitHub era. env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} COMMIT_SHA: ${{ github.sha }} RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} run: | - # Write body to a temp file — avoids backtick escaping in shell. - cat > /tmp/deploy-reminder.md << 'BODY' - ## Canvas build passed ✅ — deploy required - - The `publish-canvas-image` workflow is now building a fresh Docker image - (`ghcr.io/molecule-ai/canvas:latest`) in the background. - - Once it completes (~3–5 min), apply on the host machine with: - ```bash - cd - git pull origin main - docker compose pull canvas && docker compose up -d canvas - ``` - - If you need to rebuild from local source instead (e.g. 
testing unreleased - changes or a new `NEXT_PUBLIC_*` URL), use: - ```bash - docker compose build canvas && docker compose up -d canvas - ``` - BODY - printf '\n> Posted automatically by CI · commit `%s` · [build log](%s)\n' \ - "$COMMIT_SHA" "$RUN_URL" >> /tmp/deploy-reminder.md - - gh api \ - --method POST \ - "repos/${{ github.repository }}/commits/${{ github.sha }}/comments" \ - --field "body=@/tmp/deploy-reminder.md" + { + echo "## Canvas build passed — deploy required" + echo + echo "The \`publish-canvas-image\` workflow is now building a fresh Docker image" + echo "(\`ghcr.io/molecule-ai/canvas:latest\`) in the background." + echo + echo "Once it completes (~3–5 min), apply on the host machine with:" + echo + echo '```bash' + echo "cd " + echo "git pull origin main" + echo "docker compose pull canvas && docker compose up -d canvas" + echo '```' + echo + echo "If you need to rebuild from local source instead (e.g. testing unreleased" + echo "changes or a new \`NEXT_PUBLIC_*\` URL), use:" + echo + echo '```bash' + echo "docker compose build canvas && docker compose up -d canvas" + echo '```' + echo + printf '> Posted automatically by CI · commit `%s` · [build log](%s)\n' \ + "$COMMIT_SHA" "$RUN_URL" + } >> "$GITHUB_STEP_SUMMARY" # Python Lint & Test — required check, always runs. See platform-build # for the rationale.