From 08e8d325e2998fe49bd0c56d344762bb10f8da23 Mon Sep 17 00:00:00 2001 From: claude-ceo-assistant Date: Fri, 8 May 2026 14:18:35 +0000 Subject: [PATCH] chore(workflows): delete obsolete promote/sync workflows (Phase 3C of internal#81) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trunk-based migration final cleanup for molecule-core. The 6 workflows deleted here all existed to manage the staging↔main branch dance that trunk-based makes obsolete: - auto-promote-staging.yml fast-forward staging→main on green - auto-promote-on-e2e.yml alt promote path on E2E green - auto-promote-stale-alarm.yml alarm if staging promotion stalls - auto-sync-main-to-staging.yml sync main→staging after UI merges - auto-sync-canary.yml dry-run probe of the auto-sync token+push path - retarget-main-to-staging.yml rebase open PRs onto staging After Phase 3A (PR #108 promoted 5 staging-only feature PRs to main) and Phase 3B (PR #109 dropped staging-branch triggers from the 4 e2e workflows), main is the only branch the CI cares about. None of the above workflows have anything to do; they're 1977 lines of dead Go-time- no-Gitea-time-yes code. Rollback: `git revert` this commit to restore the workflows. They still work mechanically; trunk-based just doesn't need them. The `staging` branch on the remote is deleted in a follow-up step (`git push origin --delete staging`) after this PR merges, so reviewers can confirm CI runs cleanly on the new shape before the ref disappears. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/auto-promote-on-e2e.yml | 467 ----------------- .github/workflows/auto-promote-staging.yml | 492 ------------------ .../workflows/auto-promote-stale-alarm.yml | 83 --- .github/workflows/auto-sync-canary.yml | 404 -------------- .../workflows/auto-sync-main-to-staging.yml | 255 --------- .../workflows/retarget-main-to-staging.yml | 276 ---------- 6 files changed, 1977 deletions(-) delete mode 100644 .github/workflows/auto-promote-on-e2e.yml delete mode 100644 .github/workflows/auto-promote-staging.yml delete mode 100644 .github/workflows/auto-promote-stale-alarm.yml delete mode 100644 .github/workflows/auto-sync-canary.yml delete mode 100644 .github/workflows/auto-sync-main-to-staging.yml delete mode 100644 .github/workflows/retarget-main-to-staging.yml diff --git a/.github/workflows/auto-promote-on-e2e.yml b/.github/workflows/auto-promote-on-e2e.yml deleted file mode 100644 index a4daef2b..00000000 --- a/.github/workflows/auto-promote-on-e2e.yml +++ /dev/null @@ -1,467 +0,0 @@ -name: Auto-promote :latest after main image build - -# Retags `ghcr.io/molecule-ai/{platform,platform-tenant}:staging-` -# → `:latest` after either the image build or E2E completes on a `main` -# push, gated on E2E Staging SaaS not being red for that SHA. -# -# Why two triggers: -# -# `publish-workspace-server-image` and `e2e-staging-saas` are both -# paths-filtered, but with DIFFERENT path sets: -# -# publish-workspace-server-image: -# workspace-server/**, canvas/**, manifest.json -# -# e2e-staging-saas (full lifecycle): -# workspace-server/internal/handlers/{registry,workspace_provision, -# a2a_proxy}.go, workspace-server/internal/middleware/**, -# workspace-server/internal/provisioner/**, tests/e2e/test_staging_full_saas.sh -# -# The E2E set is a strict SUBSET of the publish set. So: -# - canvas/** changes → publish fires, E2E does not -# - workspace-server/cmd/** changes → publish fires, E2E does not -# - workspace-server/internal/sweep/** → publish fires, E2E does not -# -# The previous version triggered ONLY on E2E completion, which meant -# non-E2E-path changes (canvas, cmd, sweep, etc.) rebuilt the image -# but never advanced `:latest`. Result: as of 2026-04-28 this workflow -# had run zero times since merge despite eight main pushes — `:latest` -# was ~7 hours / 9 PRs behind main with no human realising. See -# `molecule-core` Slack discussion 2026-04-28. -# -# Adding `publish-workspace-server-image` as a second trigger closes -# the gap: any image rebuild on main eligibly advances `:latest`. -# -# Why E2E remains a kill-switch (not the trigger): -# -# When E2E DID run for this SHA and ended red, we abort — `:latest` -# stays on the prior known-good digest. When E2E didn't run (paths -# filtered out), we proceed: pre-merge gates already validated this -# SHA on staging via auto-promote-staging requiring CI + E2E Canvas + -# E2E API + CodeQL all green. Image content for non-E2E-paths -# (canvas, cmd, sweep) is exercised by those staging gates. -# -# Why `main` only: -# -# `:latest` is what prod tenants pull. We only want SHAs that have -# reached main (via auto-promote-staging) to advance `:latest`. -# Triggering on staging would let a staging-only revert advance -# `:latest` to a SHA that never reaches main, breaking the "production -# runs what's on main" invariant. -# -# Idempotency: -# -# When a SHA touches paths that match BOTH publish and E2E, both -# workflows fire and complete. Both trigger this workflow on -# completion → two runs race. Both retag `:staging-` → -# `:latest`. crane tag is idempotent (re-tagging the same digest is a -# no-op), so the second run is harmless. concurrency group serializes -# them anyway. - -on: - workflow_run: - workflows: - - 'E2E Staging SaaS (full lifecycle)' - - 'publish-workspace-server-image' - types: [completed] - branches: [main] - workflow_dispatch: - inputs: - sha: - description: 'Short sha to promote (override; defaults to upstream workflow_run head_sha)' - required: false - type: string - -permissions: - contents: read - packages: write - -concurrency: - # Serialize promotes per-SHA so the publish+E2E both-fired race lands - # cleanly. Different SHAs can promote in parallel. - group: auto-promote-latest-${{ github.event.workflow_run.head_sha || github.event.inputs.sha || github.sha }} - cancel-in-progress: false - -env: - IMAGE_NAME: ghcr.io/molecule-ai/platform - TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant - -jobs: - promote: - # Proceed if upstream succeeded OR manual dispatch. Upstream-failure - # paths are filtered here; the E2E-was-red kill-switch lives in the - # gate-check step below (covers the case where upstream is publish - # success but E2E for the same SHA failed). - if: | - github.event_name == 'workflow_dispatch' || - (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') - runs-on: ubuntu-latest - steps: - - name: Compute short sha - id: sha - run: | - set -euo pipefail - if [ -n "${{ github.event.inputs.sha }}" ]; then - FULL="${{ github.event.inputs.sha }}" - else - FULL="${{ github.event.workflow_run.head_sha }}" - fi - echo "short=${FULL:0:7}" >> "$GITHUB_OUTPUT" - echo "full=${FULL}" >> "$GITHUB_OUTPUT" - - - name: Gate — E2E Staging SaaS state for this SHA - # When upstream IS E2E success, we know it's green (filtered by - # the job-level `if` already). When upstream is publish, look up - # E2E state for the same SHA. Four buckets: - # - # - completed/success: E2E confirmed safe → proceed - # - completed/failure|cancelled|timed_out: E2E found a - # regression → ABORT (exit 1), `:latest` stays put - # - in_progress|queued|requested: E2E is RACING with publish - # for a runtime-touching SHA. publish typically completes - # ~5-10min before E2E (~10-15min). If we promote on the - # publish signal here, a later E2E failure can't roll back - # `:latest` — it'd already be wrongly advanced. So we DEFER: - # skip subsequent steps (proceed=false) and let E2E's own - # completion event re-fire this workflow, which then takes - # the upstream-is-E2E path. exit 0 so the run shows as - # success rather than a noisy fake-failure. - # - none/none: E2E was paths-filtered out for this SHA (the - # change touched canvas/cmd/sweep/etc. — paths covered by - # publish but not by E2E). pre-merge gates on staging - # already validated this SHA → proceed. - # - # Manual dispatch skips this check — operator override. - id: gate - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO: ${{ github.repository }} - SHA: ${{ steps.sha.outputs.full }} - UPSTREAM_NAME: ${{ github.event.workflow_run.name }} - EVENT_NAME: ${{ github.event_name }} - run: | - set -euo pipefail - - if [ "$EVENT_NAME" = "workflow_dispatch" ]; then - echo "proceed=true" >> "$GITHUB_OUTPUT" - echo "::notice::Manual dispatch — skipping E2E gate (operator override)" - exit 0 - fi - - if [ "$UPSTREAM_NAME" = "E2E Staging SaaS (full lifecycle)" ]; then - echo "proceed=true" >> "$GITHUB_OUTPUT" - echo "::notice::Upstream is E2E itself (success per job-level if) — gate trivially satisfied" - exit 0 - fi - - # Upstream is publish-workspace-server-image. Check E2E state - # for the same SHA via Gitea's commit-status API. - # - # GitHub-era this was `gh run list --workflow=X --commit=SHA - # --json status,conclusion` returning either `[]` (no run on - # this SHA) or `[{status, conclusion}]` (the run's state). - # Gitea has NO workflow-runs API at all — `/api/v1/repos/.../ - # actions/runs` returns 404 (verified 2026-05-07, issue #75). - # However Gitea Actions DOES emit a commit status per workflow - # job, with `context = " / ()"`, - # which is exactly what we need: each E2E run leg becomes one - # status row on the SHA, and the aggregate state encodes the - # run's outcome. - # - # Mapping: - # 0 matched contexts → "none/none" (E2E paths- - # filtered - # out — same - # semantic - # as before) - # any context = pending → "in_progress/none" (defer) - # any context = error|failure → "completed/failure" (abort) - # all contexts = success → "completed/success" (proceed) - # - # The "completed/cancelled" and "completed/timed_out" buckets - # don't have direct Gitea analogs (Gitea statuses are - # success / failure / error / pending / warning). Per-SHA - # concurrency cancellation surfaces as `error` on Gitea, which - # we map to "completed/failure" rather than "completed/cancelled" - # — losing the soft-defer semantic of the cancelled bucket on - # this fleet. Tradeoff: the staleness alarm (auto-promote-stale- - # alarm.yml) still catches a stuck :latest within 4h, and a - # legitimate cancel is rare enough that aborting + manual - # re-dispatch is acceptable. If we measure cancel frequency - # > 1/week, revisit by reading the run-step-summary text via - # a follow-up script. - # - # Network or auth blips collapse to "none/none" via the curl - # `|| true` fallback, matching the pre-Gitea behaviour where - # an empty list also degenerated to none/none. - GITEA_API_URL="${GITHUB_SERVER_URL:-https://git.moleculesai.app}/api/v1" - STATUSES_JSON=$(curl --fail-with-body -sS \ - -H "Authorization: token ${GH_TOKEN}" \ - -H "Accept: application/json" \ - "${GITEA_API_URL}/repos/${REPO}/commits/${SHA}/statuses?limit=100" \ - 2>/dev/null || echo "[]") - RESULT=$(printf '%s' "$STATUSES_JSON" | jq -r ' - # Filter to E2E Staging SaaS (full lifecycle) statuses. - # Match by leading workflow-name prefix so the " - # ()" tail is irrelevant. Gitea emits the workflow - # name verbatim from the YAML `name:` field. - [.[] | select(.context | startswith("E2E Staging SaaS (full lifecycle) /"))] as $rows - | if ($rows | length) == 0 then - "none/none" - elif any($rows[]; .status == "pending") then - "in_progress/none" - elif any($rows[]; .status == "failure" or .status == "error") then - "completed/failure" - elif all($rows[]; .status == "success") then - "completed/success" - else - # Mixed / unknown — fall through to *) bucket below. - "completed/" + ($rows[0].status // "unknown") - end - ' 2>/dev/null || echo "none/none") - - echo "E2E Staging SaaS for ${SHA:0:7}: $RESULT" - - case "$RESULT" in - completed/success) - echo "proceed=true" >> "$GITHUB_OUTPUT" - echo "::notice::E2E green for this SHA — proceeding with promote" - ;; - completed/failure|completed/timed_out) - echo "proceed=false" >> "$GITHUB_OUTPUT" - { - echo "## ❌ Auto-promote aborted — E2E Staging SaaS failed" - echo - echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\`" - echo "\`:latest\` stays on the prior known-good digest." - echo - echo "If the failure was a flake, manually dispatch this workflow with the same sha to override." - } >> "$GITHUB_STEP_SUMMARY" - exit 1 - ;; - completed/cancelled) - # GitHub-era only: cancelled ≠ failure. Gitea statuses - # don't expose a "cancelled" state — a per-SHA concurrency - # cancellation surfaces as `failure` or `error` on Gitea - # and is now handled by the failure branch above. This - # arm is kept for backwards compatibility / dual-host - # operation (if we ever add a non-Gitea fallback) but - # under the post-#75 flow it's unreachable. - echo "proceed=false" >> "$GITHUB_OUTPUT" - { - echo "## ⏭ Auto-promote deferred — E2E Staging SaaS was cancelled" - echo - echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\`" - echo "Likely per-SHA concurrency (newer push superseded this E2E run)." - echo "The newer SHA's E2E will fire its own promote when it lands." - echo "If you need this specific SHA promoted, manually dispatch." - } >> "$GITHUB_STEP_SUMMARY" - ;; - in_progress/*|queued/*|requested/*|waiting/*|pending/*) - echo "proceed=false" >> "$GITHUB_OUTPUT" - { - echo "## ⏳ Auto-promote deferred — E2E Staging SaaS still running" - echo - echo "Publish completed before E2E for \`${SHA:0:7}\` (state: \`$RESULT\`)." - echo "Skipping retag here — E2E's own completion event will re-fire this workflow." - echo "If E2E ends green, that run promotes \`:latest\`. If red, it aborts." - } >> "$GITHUB_STEP_SUMMARY" - ;; - none/none) - echo "proceed=true" >> "$GITHUB_OUTPUT" - echo "::notice::E2E paths-filtered out for this SHA — pre-merge staging gates carry" - ;; - *) - echo "proceed=false" >> "$GITHUB_OUTPUT" - { - echo "## ❓ Auto-promote aborted — unexpected E2E state" - echo - echo "E2E Staging SaaS for \`${SHA:0:7}\`: \`$RESULT\` (unhandled)" - echo "Manual investigation needed; re-dispatch with the same sha once resolved." - } >> "$GITHUB_STEP_SUMMARY" - exit 1 - ;; - esac - - - if: steps.gate.outputs.proceed == 'true' - uses: imjasonh/setup-crane@6da1ae018866400525525ce74ff892880c099987 # v0.5 - - - name: GHCR login - if: steps.gate.outputs.proceed == 'true' - run: | - echo "${{ secrets.GITHUB_TOKEN }}" | \ - crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin - - - name: Verify :staging- exists for both images - # Better to fail fast with a clear message than to half-tag - # (platform retagged but platform-tenant missing → tenants pull - # a stale image). - if: steps.gate.outputs.proceed == 'true' - run: | - set -euo pipefail - for img in "${IMAGE_NAME}" "${TENANT_IMAGE_NAME}"; do - tag="${img}:staging-${{ steps.sha.outputs.short }}" - if ! crane manifest "$tag" >/dev/null 2>&1; then - echo "::error::Missing tag: $tag" - echo "::error::publish-workspace-server-image must complete on this SHA before auto-promote can retag :latest." - exit 1 - fi - echo " ok: $tag exists" - done - - - name: Ancestry check — refuse to promote :latest backwards - # #2244: workflow_run completions arrive in arbitrary order. If - # SHA-A and SHA-B both reach main within ~10 min and SHA-B's E2E - # completes before SHA-A's, this workflow can fire for SHA-A - # AFTER it already promoted SHA-B → :latest goes backwards. The - # orphan-reconciler "next run corrects it" doesn't apply: there's - # no auto-corrective re-promote, :latest stays wrong until the - # next main push lands. - # - # Detection: read current :latest's `org.opencontainers.image.revision` - # label (set by publish-workspace-server-image.yml at build time) - # and ask the GitHub compare API whether the candidate SHA is - # ahead-of / identical-to / behind / diverged-from current. - # Hard-fail on `behind` and `diverged` per the approved design — - # silent-bypass is the class we're moving away from. Workflow - # goes red, oncall sees it, operator decides how to recover - # (manual dispatch with the right SHA, force-promote, etc.). - # - # Manual dispatch skips this check — operator override semantics - # match the gate-check step above. - # - # Backward-compat: when current :latest carries no revision - # label (legacy image pre-publish-with-label), skip-with-warning. - # All :latest images on main are post-label as of 2026-04-29, so - # this branch will be dead within 90 days; remove then. - if: steps.gate.outputs.proceed == 'true' && github.event_name != 'workflow_dispatch' - id: ancestry - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO: ${{ github.repository }} - TARGET_SHA: ${{ steps.sha.outputs.full }} - run: | - set -euo pipefail - - # Read the current :latest config and pull the revision label. - # `crane config` returns the OCI image config blob (not the manifest); - # labels live under `.config.Labels`. `// empty` makes jq return "" - # rather than the literal "null" so the test below works. - CURRENT_REVISION=$(crane config "${IMAGE_NAME}:latest" 2>/dev/null \ - | jq -r '.config.Labels["org.opencontainers.image.revision"] // empty' \ - || true) - - if [ -z "$CURRENT_REVISION" ]; then - echo "decision=skip-no-label" >> "$GITHUB_OUTPUT" - { - echo "## ⚠ Ancestry check skipped — current :latest has no revision label" - echo - echo "Likely a legacy image built before \`org.opencontainers.image.revision\` was set." - echo "Falling through to retag. After all \`:latest\` images are post-label (TODO 90 days), this branch is dead and should be removed." - } >> "$GITHUB_STEP_SUMMARY" - echo "::warning::Current :latest carries no revision label — skipping ancestry check (legacy image)" - exit 0 - fi - - if [ "$CURRENT_REVISION" = "$TARGET_SHA" ]; then - echo "decision=identical" >> "$GITHUB_OUTPUT" - echo "::notice:::latest already at ${TARGET_SHA:0:7} — retag will be a no-op" - exit 0 - fi - - # Ask GitHub which side of the merge graph TARGET_SHA sits on - # relative to CURRENT_REVISION. Returns one of: ahead | identical - # | behind | diverged. Network or auth errors collapse to "error" - # via the explicit fallback so the case below always matches. - STATUS=$(gh api \ - "repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}" \ - --jq '.status' 2>/dev/null || echo "error") - - echo "ancestry compare ${CURRENT_REVISION:0:7} → ${TARGET_SHA:0:7}: $STATUS" - - case "$STATUS" in - ahead) - echo "decision=ahead" >> "$GITHUB_OUTPUT" - echo "::notice::Target ${TARGET_SHA:0:7} is ahead of current :latest (${CURRENT_REVISION:0:7}) — proceeding with retag" - ;; - identical) - echo "decision=identical" >> "$GITHUB_OUTPUT" - echo "::notice::Target identical to :latest — retag will be a no-op" - ;; - behind) - echo "decision=behind" >> "$GITHUB_OUTPUT" - { - echo "## ❌ Auto-promote refused — target is BEHIND current :latest" - echo - echo "| Field | Value |" - echo "|---|---|" - echo "| Target SHA | \`$TARGET_SHA\` |" - echo "| Current :latest revision | \`$CURRENT_REVISION\` |" - echo "| GitHub compare status | \`behind\` |" - echo - echo "This guard catches the workflow_run-completion-order race (#2244):" - echo "two rapid main pushes whose E2Es complete out-of-order can otherwise" - echo "promote \`:latest\` backwards. \`:latest\` stays on \`${CURRENT_REVISION:0:7}\`." - echo - echo "**Recovery:** if this is a legitimate revert that should land on \`:latest\`," - echo "manually dispatch this workflow with the target sha as input — the manual-dispatch" - echo "path skips the ancestry check (operator override)." - } >> "$GITHUB_STEP_SUMMARY" - exit 1 - ;; - diverged) - echo "decision=diverged" >> "$GITHUB_OUTPUT" - { - echo "## ❓ Auto-promote refused — history diverged" - echo - echo "| Field | Value |" - echo "|---|---|" - echo "| Target SHA | \`$TARGET_SHA\` |" - echo "| Current :latest revision | \`$CURRENT_REVISION\` |" - echo "| GitHub compare status | \`diverged\` |" - echo - echo "Likely cause: force-push rewrote main's history, leaving the previous" - echo "\`:latest\` revision orphaned. Needs human review before \`:latest\` advances." - } >> "$GITHUB_STEP_SUMMARY" - exit 1 - ;; - error|*) - echo "decision=error" >> "$GITHUB_OUTPUT" - { - echo "## ❌ Auto-promote aborted — ancestry-check API error" - echo - echo "\`gh api repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}\` returned unexpected status: \`$STATUS\`" - echo - echo "Manual dispatch with the target sha bypasses this check." - } >> "$GITHUB_STEP_SUMMARY" - exit 1 - ;; - esac - - - name: Retag platform :staging- → :latest - if: steps.gate.outputs.proceed == 'true' - run: | - crane tag "${IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest - - - name: Retag tenant :staging- → :latest - if: steps.gate.outputs.proceed == 'true' - run: | - crane tag "${TENANT_IMAGE_NAME}:staging-${{ steps.sha.outputs.short }}" latest - - - name: Summary - if: steps.gate.outputs.proceed == 'true' - run: | - { - echo "## :latest promoted to ${{ steps.sha.outputs.short }}" - echo - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - echo "- Trigger: manual dispatch" - else - echo "- Upstream: \`${{ github.event.workflow_run.name }}\` ([run](${{ github.event.workflow_run.html_url }}))" - fi - echo "- platform:staging-${{ steps.sha.outputs.short }} → :latest" - echo "- platform-tenant:staging-${{ steps.sha.outputs.short }} → :latest" - echo - echo "Tenant fleet auto-pulls within 5 min via IMAGE_AUTO_REFRESH=true." - echo "Force immediate fanout: dispatch redeploy-tenants-on-main.yml." - } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/auto-promote-staging.yml b/.github/workflows/auto-promote-staging.yml deleted file mode 100644 index 7d2ce310..00000000 --- a/.github/workflows/auto-promote-staging.yml +++ /dev/null @@ -1,492 +0,0 @@ -name: Auto-promote staging → main - -# Fires after any of the staging-branch quality gates complete. When ALL -# required gates are green on the same staging SHA, opens (or re-uses) -# a PR `staging → main` and schedules Gitea auto-merge so the PR lands -# automatically once approval + status checks are satisfied. -# -# ============================================================ -# What this workflow does -# ============================================================ -# -# 1. On a workflow_run completion event for one of the staging gate -# workflows (CI, E2E Staging Canvas, E2E API Smoke, CodeQL), -# checks if the combined status on the staging head SHA is green. -# 2. If green, opens (or re-uses) a PR `head: staging → base: main` -# via Gitea REST `POST /api/v1/repos/.../pulls`. -# 3. Schedules auto-merge via `POST /api/v1/repos/.../pulls/{index}/merge` -# with `merge_when_checks_succeed: true`. Gitea waits for the -# approval requirement on `main` (`required_approvals: 1`) and -# the status-check gates, then merges. -# 4. The merge commit lands on `main` and fires -# `publish-workspace-server-image.yml` naturally via its -# `on: push: branches: [main]` trigger — no explicit dispatch -# needed (see "Why no workflow_dispatch tail" below). -# -# `auto-sync-main-to-staging.yml` is the reverse-direction -# counterpart (main → staging, fast-forward push). Together they -# keep the staging-superset-of-main invariant tight. -# -# ============================================================ -# Why Gitea REST (and not `gh pr create`) -# ============================================================ -# -# Pre-2026-05-06 this workflow used `gh pr create`, `gh pr merge --auto`, -# `gh run list`, and `gh workflow run` against GitHub. After the -# GitHub→Gitea cutover those calls fail because: -# -# - `gh pr create / merge / view / list` route to GitHub GraphQL -# (`/api/graphql`). Gitea does not expose a GraphQL endpoint; -# every call returns `HTTP 405 Method Not Allowed` — same root -# cause as #65 (auto-sync) which PR #66 fixed by dropping `gh` -# entirely. -# - `gh run list --workflow=...` GitHub-shape; Gitea has the -# simpler `GET /repos/.../commits/{ref}/status` combined-status -# endpoint instead. -# - `gh workflow run X.yml` calls `POST /repos/.../actions/workflows/{id}/dispatches`, -# which does NOT exist on Gitea 1.22.6 (verified via swagger.v1.json). -# -# So this workflow uses direct `curl` calls to Gitea REST. No `gh` -# CLI dependency, no GraphQL, no missing-endpoint footgun. -# -# ============================================================ -# Why no workflow_dispatch tail (was load-bearing on GitHub, dead on Gitea) -# ============================================================ -# -# The GitHub-era version had a 60-line polling step that waited for -# the promote PR to merge, then explicitly dispatched -# `publish-workspace-server-image.yml` on `--ref main`. That step -# existed because GitHub's GITHUB_TOKEN-initiated merges suppress -# downstream `on: push` workflows (the documented "no recursion" rule -# — https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow). -# The explicit dispatch was the workaround. -# -# Gitea Actions does NOT have this no-recursion rule. PR #66's auto- -# sync merge to main fired `auto-promote-staging` on the next push -# trigger naturally. So the cascade fires on the natural push event; -# the explicit dispatch is dead code. (And even if we wanted to -# preserve it, Gitea has no `workflow_dispatch` REST endpoint.) -# -# Removed in this rewrite. If we ever observe the cascade misfire, -# operator can push an empty commit to `main` to wake it. -# -# ============================================================ -# Why open a PR (and not direct push) -# ============================================================ -# -# `main` branch protection has `enable_push: false` with NO -# `push_whitelist_usernames`. Direct push is impossible for any -# persona, including admins. PR-mediated merge is the only path, -# which is intentional: prod state mutations (and staging→main IS a -# prod mutation, since the next deploy fans out to tenants) require -# Hongming's approval per `feedback_prod_apply_needs_hongming_chat_go`. -# -# The auto-merge schedule preserves this gate: `merge_when_checks_succeed` -# does NOT bypass `required_approvals: 1`. Gitea waits for BOTH -# approval AND green checks before merging. Hongming reviews via the -# canvas/chat-handle of the PR notification, approves, and Gitea -# auto-merges within seconds. -# -# ============================================================ -# Identity + token (anti-bot-ring per saved-memory -# `feedback_per_agent_gitea_identity_default`) -# ============================================================ -# -# This workflow uses `secrets.AUTO_SYNC_TOKEN` — a personal access -# token issued to the `devops-engineer` Gitea persona. NOT the -# founder PAT. The bot-ring fingerprint that triggered the GitHub -# org suspension on 2026-05-06 was characterised by founder PAT -# acting as CI at machine speed. -# -# Token scope: `push: true` (read+write) on this repo. The persona -# can: open PRs, comment on PRs, schedule auto-merge. The persona -# CANNOT bypass main's branch protection (`required_approvals: 1` -# still applies — only Hongming's review unblocks merge). -# -# Authorship: the PR is opened by `devops-engineer`; the merge -# commit credits Hongming-as-approver and `devops-engineer` as -# the merger. -# -# ============================================================ -# Failure modes & operational notes -# ============================================================ -# -# A — staging gates not all green at trigger time: -# - The combined-status check returns `state: pending|failure`. -# Workflow exits 0 with a step-summary "not all green; staying -# on current main". Re-fires on the next gate completion. -# -# B — Gitea PR-create returns non-201 (e.g. 422 already-exists): -# - Idempotent: the workflow first GETs the existing open -# staging→main PR. If found, reuse it; if not, POST a new one. -# 422 should never surface; if it does (race), step summary -# captures the body and the next workflow_run picks up. -# -# C — `merge_when_checks_succeed` schedule fails: -# - 422 with "Pull request is not mergeable" if there are -# conflicts or stale base. Step summary surfaces it; operator -# (or `auto-sync-main-to-staging`) needs to bring staging up -# to date with main first. Workflow exits 1 to surface red. -# -# D — `AUTO_SYNC_TOKEN` rotated / wrong scope: -# - 401/403 on first REST call. Step summary surfaces it. -# Re-issue the token from `~/.molecule-ai/personas/` on the -# operator host and update the repo Actions secret. -# -# ============================================================ -# Loop safety -# ============================================================ -# -# When the promote PR merges to main, `auto-sync-main-to-staging.yml` -# fires (on:push:main) and pushes the merge commit back to staging. -# That push to staging is by `devops-engineer`, NOT this workflow's -# token, and triggers the staging gate workflows. When they all -# complete, we end up back here — but the tree-diff guard catches -# it: staging tree == main tree (the merge commit changes nothing), -# so we skip and the cycle terminates. - -on: - workflow_run: - workflows: - - CI - - E2E Staging Canvas (Playwright) - - E2E API Smoke Test - - CodeQL - types: [completed] - workflow_dispatch: - inputs: - force: - description: "Force promote even when AUTO_PROMOTE_ENABLED is unset (manual override)" - required: false - default: "false" - -permissions: - contents: read - pull-requests: write - -# Serialize auto-promote runs. Multiple staging gate completions can land -# in quick succession (CI + E2E + CodeQL all finish within seconds of -# each other on a green PR) — without this, two parallel runs both: -# 1. Would race the GET-or-POST PR step. -# 2. Would both call merge-schedule (idempotent — fine on Gitea). -# cancel-in-progress: false because the second run on a fresh staging -# tip should NOT kill the first which has already opened the PR. -concurrency: - group: auto-promote-staging - cancel-in-progress: false - -jobs: - check-all-gates-green: - # Only consider staging pushes. PRs into staging don't promote. - if: > - (github.event_name == 'workflow_run' && - github.event.workflow_run.head_branch == 'staging' && - github.event.workflow_run.event == 'push') - || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-latest - outputs: - all_green: ${{ steps.gates.outputs.all_green }} - head_sha: ${{ steps.gates.outputs.head_sha }} - steps: - # Skip empty-tree promotes (the perpetual auto-promote↔auto-sync - # cycle observed pre-cutover on GitHub). On Gitea the cycle shape - # is different (auto-sync uses fast-forward, no merge commit), - # but the tree-diff guard is cheap insurance and protects against - # any future merge-style regression. - - name: Checkout for tree-diff check - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - fetch-depth: 0 - ref: staging - - - name: Skip if staging tree == main tree (cycle-break safety) - id: tree-diff - env: - HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }} - run: | - set -eu - git fetch origin main --depth=50 || { echo "::warning::git fetch main failed — proceeding (fail-open)"; exit 0; } - if git diff --quiet origin/main "$HEAD_SHA" -- 2>/dev/null; then - { - echo "## Skipped — no code to promote" - echo - echo "staging tip (\`${HEAD_SHA:0:8}\`) and \`main\` have identical trees." - echo "Skipping to avoid opening an empty promote PR." - } >> "$GITHUB_STEP_SUMMARY" - echo "::notice::auto-promote: staging tree == main tree — no code to promote, skipping" - echo "skip=true" >> "$GITHUB_OUTPUT" - else - echo "skip=false" >> "$GITHUB_OUTPUT" - fi - - - name: Check combined status on staging head - if: steps.tree-diff.outputs.skip != 'true' - id: gates - env: - GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }} - HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }} - REPO: ${{ github.repository }} - GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }} - run: | - set -euo pipefail - - # Gitea-native combined-status endpoint aggregates every - # check context attached to a SHA. This is structurally - # cleaner than the GitHub-era per-workflow `gh run list` - # loop because: - # - # 1. There's no risk of "workflow name collision" (the - # GitHub-era code had to switch from `--workflow=NAME` - # to `--workflow=FILE.YML` to disambiguate "CodeQL" - # between the explicit workflow and GitHub's UI- - # configured default setup; Gitea has no such - # duplicate-name surface). - # 2. Gitea's combined state already encodes the AND - # across all contexts: success only if EVERY context - # is success. Pending or failure on any context - # produces non-success state. - # - # See https://docs.gitea.com/api/1.22 for the schema — - # `state` is one of: success, pending, failure, error. - - echo "head_sha=${HEAD_SHA}" >> "$GITHUB_OUTPUT" - echo "Checking combined status on SHA ${HEAD_SHA}" - - # `set +o pipefail` for the http-code capture pattern; restore - # immediately. Pattern hardened per `feedback_curl_status_capture_pollution`. - BODY_FILE=$(mktemp) - set +e - STATUS=$(curl -sS \ - -H "Authorization: token ${GITEA_TOKEN}" \ - -H "Accept: application/json" \ - -o "${BODY_FILE}" \ - -w "%{http_code}" \ - "${GITEA_HOST}/api/v1/repos/${REPO}/commits/${HEAD_SHA}/status") - CURL_RC=$? - set -e - - if [ "${CURL_RC}" -ne 0 ] || [ "${STATUS}" != "200" ]; then - echo "::error::combined-status fetch failed: curl=${CURL_RC} http=${STATUS}" - cat "${BODY_FILE}" | head -c 500 || true - rm -f "${BODY_FILE}" - echo "all_green=false" >> "$GITHUB_OUTPUT" - exit 0 - fi - - STATE=$(jq -r '.state // "missing"' < "${BODY_FILE}") - TOTAL=$(jq -r '.total_count // 0' < "${BODY_FILE}") - rm -f "${BODY_FILE}" - - echo "Combined status: state=${STATE} total_count=${TOTAL}" - - if [ "${STATE}" = "success" ] && [ "${TOTAL}" -gt 0 ]; then - echo "all_green=true" >> "$GITHUB_OUTPUT" - echo "::notice::All gates green on ${HEAD_SHA} (${TOTAL} contexts)" - else - echo "all_green=false" >> "$GITHUB_OUTPUT" - { - echo "## Not promoting — combined status not green" - echo - echo "- SHA: \`${HEAD_SHA:0:8}\`" - echo "- Combined state: \`${STATE}\`" - echo "- Context count: ${TOTAL}" - echo - echo "Will re-fire on the next gate completion. Investigate any red gate via the Actions UI." - } >> "$GITHUB_STEP_SUMMARY" - echo "::notice::auto-promote: combined status is ${STATE} on ${HEAD_SHA} — staying on current main" - fi - - promote: - needs: check-all-gates-green - if: needs.check-all-gates-green.outputs.all_green == 'true' - runs-on: ubuntu-latest - steps: - - name: Check rollout gate - env: - AUTO_PROMOTE_ENABLED: ${{ vars.AUTO_PROMOTE_ENABLED }} - FORCE_INPUT: ${{ github.event.inputs.force }} - run: | - set -eu - # Repo variable AUTO_PROMOTE_ENABLED=true flips this on. While - # it's unset, the workflow dry-runs (logs what it would have - # done) but doesn't open the promote PR. Set the variable in - # Settings → Actions → Variables. - if [ "${AUTO_PROMOTE_ENABLED:-}" != "true" ] && [ "${FORCE_INPUT:-false}" != "true" ]; then - { - echo "## Auto-promote disabled" - echo - echo "Repo variable \`AUTO_PROMOTE_ENABLED\` is not set to \`true\`." - echo "All gates are green on staging; would have opened a promote PR to \`main\`." - echo - echo "To enable: Settings → Actions → Variables → \`AUTO_PROMOTE_ENABLED=true\`." - echo "To test once manually: workflow_dispatch with \`force=true\`." - } >> "$GITHUB_STEP_SUMMARY" - echo "::notice::auto-promote disabled — dry run only" - exit 0 - fi - - - name: Open or reuse promote PR + schedule auto-merge - if: ${{ vars.AUTO_PROMOTE_ENABLED == 'true' || github.event.inputs.force == 'true' }} - env: - GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }} - REPO: ${{ github.repository }} - TARGET_SHA: ${{ needs.check-all-gates-green.outputs.head_sha }} - GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }} - run: | - set -euo pipefail - - API="${GITEA_HOST}/api/v1/repos/${REPO}" - AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json") - - # http_status_get RESULT_VAR URL - # Sets RESULT_VAR to ":". Curl status - # capture pattern per `feedback_curl_status_capture_pollution`: - # http_code goes to its own tempfile-equivalent (-w), body to - # another tempfile, set +e/-e bracket protects pipeline state. - http_get() { - local body_file="$1"; shift - local url="$1"; shift - set +e - local code - code=$(curl -sS "${AUTH[@]}" -o "${body_file}" -w "%{http_code}" "${url}") - local rc=$? - set -e - if [ "${rc}" -ne 0 ]; then - echo "::error::curl GET failed (rc=${rc}) on ${url}" - return 99 - fi - echo "${code}" - } - http_post_json() { - local body_file="$1"; shift - local data="$1"; shift - local url="$1"; shift - set +e - local code - code=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \ - -X POST -d "${data}" -o "${body_file}" -w "%{http_code}" "${url}") - local rc=$? - set -e - if [ "${rc}" -ne 0 ]; then - echo "::error::curl POST failed (rc=${rc}) on ${url}" - return 99 - fi - echo "${code}" - } - - # Step 1: look for an existing open staging→main promote PR - # (idempotent on workflow re-run). Gitea doesn't have a - # head/base filter on the list endpoint that's as ergonomic - # as gh's, but the dedicated `/pulls/{base}/{head}` lookup - # works. - BODY=$(mktemp) - STATUS=$(http_get "${BODY}" "${API}/pulls/main/staging") || true - - PR_NUM="" - if [ "${STATUS}" = "200" ]; then - STATE=$(jq -r '.state // "missing"' < "${BODY}") - if [ "${STATE}" = "open" ]; then - PR_NUM=$(jq -r '.number // ""' < "${BODY}") - echo "::notice::Re-using existing open promote PR #${PR_NUM}" - fi - fi - rm -f "${BODY}" - - # Step 2: if no open PR, create one. - if [ -z "${PR_NUM}" ]; then - TITLE="staging → main: auto-promote ${TARGET_SHA:0:7}" - BODY_TEXT=$(cat <> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/auto-promote-stale-alarm.yml b/.github/workflows/auto-promote-stale-alarm.yml deleted file mode 100644 index 58667c6f..00000000 --- a/.github/workflows/auto-promote-stale-alarm.yml +++ /dev/null @@ -1,83 +0,0 @@ -name: auto-promote-stale-alarm - -# Hourly cron + on-demand alarm for the silent-block failure mode that -# motivated issue #2975: -# - The auto-promote-staging.yml workflow opened a PR + armed -# auto-merge, but main's branch protection requires a human review -# (reviewDecision=REVIEW_REQUIRED). The PR sat BLOCKED with no -# surface-up-the-stack for 12+ hours, holding 25 commits hostage -# including the Memory v2 redesign and a reno-stars data-loss fix. -# -# This workflow runs `scripts/check-stale-promote-pr.sh` against the -# repo's open auto-promote PRs (base=main head=staging). When a PR has -# been BLOCKED on REVIEW_REQUIRED for >4h, it: -# 1. Emits a workflow-level warning (visible in run summary + the -# Actions UI feed). -# 2. Posts a comment on the PR (idempotent — one alarm per PR). -# -# The detection logic lives in scripts/check-stale-promote-pr.sh so -# it's unit-testable with stubbed `gh` (see test-check-stale-promote-pr.sh). -# This file is the schedule + invocation surface only — SSOT for the -# detector itself. - -on: - schedule: - # Hourly. Cheap (one `gh pr list` + jq), and 1h granularity is - # plenty for a 4h staleness threshold — operators see the alarm - # within at most 1h of crossing the threshold. - - cron: "27 * * * *" # at :27 to dodge the cron herd at :00 - workflow_dispatch: - inputs: - stale_hours: - description: "Hours after which a BLOCKED+REVIEW_REQUIRED PR is stale (default 4)" - required: false - default: "4" - post_comment: - description: "Post a comment on stale PRs (default true)" - required: false - default: "true" - -permissions: - contents: read - pull-requests: write # post comments on stale PRs - -# Serialize so the on-demand and scheduled runs don't double-comment -# the same PR. cancel-in-progress=false because the script is idempotent -# (existing comment marker prevents dupes), but a scheduled run firing -# while a manual one runs would just re-list the same PR set. -concurrency: - group: auto-promote-stale-alarm - cancel-in-progress: false - -jobs: - scan: - runs-on: ubuntu-latest - steps: - - name: Checkout (need scripts/ only) - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - sparse-checkout: | - scripts/check-stale-promote-pr.sh - sparse-checkout-cone-mode: false - - name: Run stale-PR detector - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_REPOSITORY: ${{ github.repository }} - STALE_HOURS: ${{ inputs.stale_hours || '4' }} - POST_COMMENT: ${{ inputs.post_comment || 'true' }} - run: | - # The script's exit code reflects the count of stale PRs. - # We don't want a stale finding to fail the workflow run — - # the warning + comment are the signal, the green/red is - # noise. So convert any non-zero exit to a workflow notice - # and exit 0. - set +e - bash scripts/check-stale-promote-pr.sh - rc=$? - set -e - if [ "$rc" -ne 0 ]; then - echo "::notice::Stale PR detector found $rc PR(s) needing attention. See warnings above + comments on the PRs." - fi - # Always succeed — operator-facing surface is the warning, - # not the workflow status. - exit 0 diff --git a/.github/workflows/auto-sync-canary.yml b/.github/workflows/auto-sync-canary.yml deleted file mode 100644 index f6b0437b..00000000 --- a/.github/workflows/auto-sync-canary.yml +++ /dev/null @@ -1,404 +0,0 @@ -name: Auto-sync canary — AUTO_SYNC_TOKEN rotation drift - -# Synthetic health check for the AUTO_SYNC_TOKEN secret consumed by -# auto-sync-main-to-staging.yml (PR #66) and publish-workspace-server-image.yml. -# -# ============================================================ -# Why this workflow exists -# ============================================================ -# -# PR #66 fixed auto-sync (replaced GitHub-era `gh pr create` — which -# 405s on Gitea's GraphQL endpoint — with a direct git push from the -# `devops-engineer` persona's `AUTO_SYNC_TOKEN`). Hostile self-review -# weakest spot #3 of that PR: -# -# "Token rotation silently breaks auto-sync. If AUTO_SYNC_TOKEN is -# rotated without updating the repo secret, every push to main -# fails red on the auto-sync push step. The workflow surfaces the -# failure mode in the step summary (failure mode B in the header), -# but there's no proactive monitoring." -# -# Detection latency under the status quo: rotation is only caught on -# the next push to `main`. During quiet periods (no main push for -# many hours) the staging-superset-of-main invariant silently breaks. -# -# This workflow closes the gap: every 6 hours, it fires the auth -# surface that auto-sync depends on and emits a red workflow status -# if AUTO_SYNC_TOKEN has drifted out of validity. -# -# ============================================================ -# What this checks (Option B — read-only verify) -# ============================================================ -# -# 1. `GET /api/v1/user` against Gitea with the token → validates the -# token authenticates AND resolves to `devops-engineer` (catches -# the case where the token was regenerated under a different -# persona by mistake). -# 2. `GET /api/v1/repos/molecule-ai/molecule-core` with the token → -# validates the token has `read:repository` scope on this repo -# (the v2 scope contract — see saved memory -# `reference_persona_token_v2_scope`). -# 3. `git push --dry-run` of the current staging SHA back to -# `refs/heads/staging` via `https://oauth2:@/...` -# → validates the EXACT HTTPS basic-auth path that -# `actions/checkout` + `git push origin staging` use inside -# auto-sync-main-to-staging.yml. NOP by construction (push the -# current tip to itself = "Everything up-to-date"); auth is -# checked at the smart-protocol handshake BEFORE the empty-diff -# computation, so bad token → exit 128 with "Authentication -# failed". `git ls-remote` is NOT used here because Gitea -# falls back to anonymous read on public repos and would -# silently green-light a rotated token. -# -# Each step exits non-zero with an actionable error message if it -# fails. The workflow status itself is the operator-facing surface. -# -# ============================================================ -# What this does NOT check (intentional) -# ============================================================ -# -# - **Branch-protection authz** (failure mode C in auto-sync header): -# would require an actual write to staging. Already monitored by -# `branch-protection-drift.yml` daily. Don't duplicate. -# - **Conflict resolution** (failure mode A): a real conflict is data- -# driven, not auth-driven; can't synthesise it without polluting -# staging. Already surfaces immediately on the next main push. -# - **Concurrency** (failure mode D): handled by workflow concurrency -# group on auto-sync, not a credential issue. -# -# ============================================================ -# Why Option B (read-only) and not the alternatives -# ============================================================ -# -# Considered + rejected (see issue #72 for full write-up): -# -# - **Option A — full auto-sync on schedule**: every run creates a -# no-op merge commit on staging when main hasn't advanced. 4 noise -# commits/day. And races the real `push:` trigger when main has -# advanced. Rejected. -# -# - **Option C — push to dedicated `auto-sync-canary` branch**: would -# exercise authz too, but adds branch noise on Gitea AND requires -# maintaining a second branch protection (or expanding staging's -# whitelist to a junk branch). Authz already covered by -# `branch-protection-drift.yml`. Rejected. -# -# Prior art for the chosen Option B shape: -# - Cloudflare's `/user/tokens/verify` endpoint (read-only auth -# probe explicitly designed for credential canaries). -# - AWS Secrets Manager rotation Lambda's `testSecret` step (auth -# probe before promoting AWSPENDING → AWSCURRENT). -# - HashiCorp Vault's `vault token lookup` for renewal canaries. -# -# ============================================================ -# Operator runbook — what to do when this workflow goes RED -# ============================================================ -# -# 1. **Identify which step failed**: -# - Step "Verify token authenticates as devops-engineer" red → -# token is invalid OR resolves to wrong persona. -# - Step "Verify token has repo read scope" red → token valid but -# stripped of `read:repository` scope (or repo perms changed). -# - Step "Verify git HTTPS auth path via no-op dry-run push to -# staging" red → token rotated/revoked OR Gitea git-HTTPS -# surface is broken (rare). Auth check happens on the -# smart-protocol handshake, separate from the API path. -# -# 2. **Re-issue the token** on the operator host: -# ``` -# ssh root@5.78.80.188 'docker exec --user git molecule-gitea-1 \ -# gitea admin user generate-access-token \ -# --username devops-engineer \ -# --token-name persona-devops-engineer-vN \ -# --scopes "read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc"' -# ``` -# Update `/etc/molecule-bootstrap/agent-secrets.env` in place -# (per `feedback_unified_credentials_file`). The previous token -# file lands at `.bak.`. -# -# 3. **Update the repo Actions secret** at: -# Settings → Secrets and variables → Actions → AUTO_SYNC_TOKEN -# Paste the new token. (Don't echo it in chat — but per -# `feedback_passwords_in_chat_are_burned`, a paste in a 1:1 -# Claude session is within trust boundary.) -# -# 4. **Re-run this canary** via workflow_dispatch. Confirm GREEN. -# -# 5. **Backfill any missed main → staging syncs** by re-running -# `auto-sync-main-to-staging.yml` from its workflow_dispatch -# surface, OR by pushing an empty commit to main (if you'd -# rather force a real trigger). -# -# ============================================================ -# Security notes -# ============================================================ -# -# - Token usage: read-only (`GET /api/v1/user`, `GET /api/v1/repos/...`, -# `git ls-remote`). No write paths. Same blast-radius profile as -# `actions/checkout` on a public repo. -# - The token NEVER appears in logs: every `curl` uses a header -# variable, never inline; the `git ls-remote` URL builds the -# `oauth2:$TOKEN@host` form into a single env var that's not -# echoed. GitHub Actions secret-masking covers anything that does -# slip through. -# - No new token introduced — same `AUTO_SYNC_TOKEN` the workflow -# under monitor uses. Per least-privilege we deliberately do NOT -# broaden scope for the canary. - -on: - schedule: - # Every 6 hours at :17 (offsets the cron herd at :00). Justification - # from issue #72: cheap to run (~5s wall-clock, no quota), 3h average - # detection latency, 6h max. 1h would be 24× the runs for marginal - # benefit; daily would be 6× longer latency and worse than status - # quo on a quiet-main day. - - cron: '17 */6 * * *' - workflow_dispatch: - -# No concurrency group needed — the canary is read-only and idempotent. -# Two parallel runs (e.g. operator dispatch during a scheduled tick) are -# harmless: same result, doubled HTTPS calls, no shared state. - -permissions: - contents: read - -jobs: - verify-token: - name: Verify AUTO_SYNC_TOKEN validity - runs-on: ubuntu-latest - # 2 min surfaces hangs (Gitea API stall, DNS issue) within one - # cron interval. Realistic worst case is ~10s: 2 curls + 1 git - # ls-remote, each capped by the explicit timeouts below. - timeout-minutes: 2 - - env: - # Pinned in env so individual steps can read it without - # repeating the secret reference. GitHub masks the value in - # logs automatically. - AUTO_SYNC_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }} - # MUST stay in sync with auto-sync-main-to-staging.yml's - # `git config user.name "devops-engineer"` line. Renaming the - # devops-engineer persona requires updating both files (and - # the staging branch protection's `push_whitelist_usernames`). - EXPECTED_PERSONA: devops-engineer - GITEA_HOST: git.moleculesai.app - REPO_PATH: molecule-ai/molecule-core - - steps: - - name: Verify AUTO_SYNC_TOKEN secret is configured - # Schedule-vs-dispatch behaviour split, per - # `feedback_schedule_vs_dispatch_secrets_hardening`: - # - # - schedule: hard-fail when the secret is missing. The - # whole point of the canary is to surface drift; soft- - # skipping on missing-secret would make the canary - # itself drift-invisible (sweep-cf-orphans #2088 lesson). - # - workflow_dispatch: hard-fail too — there's no scenario - # where an operator wants this canary to silently no-op. - # The workflow has no other ad-hoc utility; if you ran - # it, you wanted the answer. - run: | - if [ -z "${AUTO_SYNC_TOKEN}" ]; then - echo "::error::AUTO_SYNC_TOKEN secret is not set on this repo." >&2 - echo "::error::Set it at Settings → Secrets and variables → Actions." >&2 - echo "::error::Without it, auto-sync-main-to-staging.yml will fail every push to main." >&2 - exit 1 - fi - echo "AUTO_SYNC_TOKEN is configured (value masked)." - - - name: Verify token authenticates as ${{ env.EXPECTED_PERSONA }} - # Calls Gitea's `/api/v1/user` — the canonical - # auth-probe-with-no-side-effects endpoint (mirrors - # Cloudflare's /user/tokens/verify). - # - # Failure surfaces: - # - HTTP 401: token invalid (rotated, revoked, or never - # correctly registered). - # - HTTP 200 but username != devops-engineer: token was - # regenerated under the wrong persona — this would let - # auth pass but commit attribution would be wrong, and - # branch-protection authz would fail because only - # `devops-engineer` is whitelisted. - run: | - set -euo pipefail - response_file="$(mktemp)" - code_file="$(mktemp)" - # `--max-time 30`: full call ceiling. `--connect-timeout 10`: - # DNS + TCP. `-w "%{http_code}"` routed to a tempfile so curl's - # exit code can't pollute the captured status — see - # feedback_curl_status_capture_pollution + the - # `lint-curl-status-capture.yml` gate that rejects the unsafe - # `$(curl ... || echo "000")` shape. - set +e - curl -sS -o "$response_file" \ - --max-time 30 --connect-timeout 10 \ - -w "%{http_code}" \ - -H "Authorization: token ${AUTO_SYNC_TOKEN}" \ - -H "Accept: application/json" \ - "https://${GITEA_HOST}/api/v1/user" >"$code_file" 2>/dev/null - set -e - status=$(cat "$code_file" 2>/dev/null || true) - [ -z "$status" ] && status="000" - - if [ "$status" != "200" ]; then - echo "::error::Token rotation suspected: GET /api/v1/user returned HTTP $status (expected 200)." >&2 - echo "::error::Likely cause: AUTO_SYNC_TOKEN has been rotated/revoked on Gitea but the repo Actions secret was not updated." >&2 - echo "::error::Runbook: see header comment of this workflow file." >&2 - # Print response body but redact anything that looks like a token. - sed -E 's/[A-Fa-f0-9]{32,}//g' "$response_file" >&2 || true - exit 1 - fi - - username=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('login',''))" "$response_file") - if [ "$username" != "${EXPECTED_PERSONA}" ]; then - echo "::error::Token resolves to user '$username', expected '${EXPECTED_PERSONA}'." >&2 - echo "::error::AUTO_SYNC_TOKEN must be the devops-engineer persona PAT (not founder PAT, not another persona)." >&2 - echo "::error::Auto-sync push will fail because only 'devops-engineer' is whitelisted on staging branch protection." >&2 - exit 1 - fi - echo "Token authenticates as: $username ✓" - - - name: Verify token has repo read scope - # `GET /api/v1/repos//` requires `read:repository` - # on the persona's v2 scope contract. If the scope was - # narrowed/dropped on rotation we catch it here, before the - # next main push reveals it via a checkout failure. - run: | - set -euo pipefail - response_file="$(mktemp)" - code_file="$(mktemp)" - # See first probe step for the rationale on the tempfile-routed - # `-w "%{http_code}"` pattern — the unsafe `|| echo "000"` shape - # is rejected by lint-curl-status-capture.yml. - set +e - curl -sS -o "$response_file" \ - --max-time 30 --connect-timeout 10 \ - -w "%{http_code}" \ - -H "Authorization: token ${AUTO_SYNC_TOKEN}" \ - -H "Accept: application/json" \ - "https://${GITEA_HOST}/api/v1/repos/${REPO_PATH}" >"$code_file" 2>/dev/null - set -e - status=$(cat "$code_file" 2>/dev/null || true) - [ -z "$status" ] && status="000" - - if [ "$status" != "200" ]; then - echo "::error::Token lacks read:repository scope on ${REPO_PATH}: HTTP $status." >&2 - echo "::error::Auto-sync's actions/checkout step will fail with this token." >&2 - echo "::error::Re-issue with v2 scope contract: read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc" >&2 - sed -E 's/[A-Fa-f0-9]{32,}//g' "$response_file" >&2 || true - exit 1 - fi - echo "Token has read:repository on ${REPO_PATH} ✓" - - - name: Verify git HTTPS auth path via no-op dry-run push to staging - # Final probe: exercise the EXACT auth path that - # `actions/checkout` + `git push origin staging` use in - # auto-sync-main-to-staging.yml. Gitea's API and git-HTTPS - # surfaces share the token-lookup code path internally but - # the wire-level error shapes differ — historically (#173) - # the API path was healthy while git-HTTPS rejected, so - # checking only the API would have given false-green. - # - # IMPORTANT: `git ls-remote` on a public repo (which - # molecule-core is) succeeds even with a junk token because - # Gitea falls back to anonymous-read. `ls-remote` therefore - # CANNOT validate auth on this surface. We use - # `git push --dry-run` instead — push is auth-gated even on - # public repos. - # - # NOP shape: read the current staging SHA via authenticated - # ls-remote (the SHA itself is public; auth is incidental - # here, used only to colocate the discovery in one step), then - # `git push --dry-run :refs/heads/staging`. Pushing the - # current tip back to itself is "Everything up-to-date" with - # exit 0 when auth succeeds. With a bad token Gitea returns - # HTTP 401 in the smart-protocol handshake and git exits 128 - # with "Authentication failed". - # - # The dry-run never reaches Gitea's pre-receive hook (which - # is where branch-protection authz runs), so this probe does - # not validate failure mode C. That's intentional — - # branch-protection-drift.yml owns authz monitoring; this - # canary owns auth. - env: - # Don't hang waiting for password prompt if auth fails on a - # terminal-attached run. (In Actions there's no terminal, - # but the env-var hardens against an interactive runner - # config.) - GIT_TERMINAL_PROMPT: "0" - run: | - set -euo pipefail - # Token is in $AUTO_SYNC_TOKEN (job-level env). Compose the - # URL as a local var that's never echoed. - url="https://oauth2:${AUTO_SYNC_TOKEN}@${GITEA_HOST}/${REPO_PATH}" - - # Step a: read current staging SHA. ~1KB; auth-gated only - # on private repos but always works on public — used here - # only to discover the SHA, not to validate auth. - staging_ref=$(timeout 30s git ls-remote --refs "$url" refs/heads/staging 2>&1) || { - redacted=$(echo "$staging_ref" | sed -E "s|oauth2:[^@]+@|oauth2:@|g") - echo "::error::ls-remote against staging failed (network/DNS issue):" >&2 - echo "$redacted" >&2 - exit 1 - } - if ! echo "$staging_ref" | grep -qE '^[0-9a-f]{40}[[:space:]]+refs/heads/staging$'; then - echo "::error::ls-remote returned unexpected shape:" >&2 - echo "$staging_ref" | sed -E "s|oauth2:[^@]+@|oauth2:@|g" >&2 - exit 1 - fi - staging_sha=$(echo "$staging_ref" | awk '{print $1}') - - # Step b: spin up an ephemeral local repo. `git push` always - # requires a local repo even when pushing a remote SHA that - # isn't in the local object DB (the protocol negotiates and - # discovers we don't need to send any objects). We don't use - # `actions/checkout` for this — it would clone the whole - # repo (~hundreds of MB) for what's essentially `git init`. - tmp_repo="$(mktemp -d)" - trap 'rm -rf "$tmp_repo"' EXIT - git -C "$tmp_repo" init -q - # Author config required for any git operation; values are - # arbitrary because nothing gets committed here. - git -C "$tmp_repo" config user.email canary@auto-sync.local - git -C "$tmp_repo" config user.name auto-sync-canary - - # Step c: dry-run push the current staging SHA back to - # staging. NOP by construction — the remote tip equals the - # SHA we're pushing, so "Everything up-to-date" is the - # success path. - # - # Authentication is checked at the smart-protocol handshake, - # BEFORE the dry-run can compute an empty diff. Bad token - # → "Authentication failed", exit 128. Good token → exit 0. - set +e - push_out=$(timeout 30s git -C "$tmp_repo" push --dry-run "$url" "${staging_sha}:refs/heads/staging" 2>&1) - push_rc=$? - set -e - - if [ "$push_rc" -ne 0 ]; then - redacted=$(echo "$push_out" | sed -E "s|oauth2:[^@]+@|oauth2:@|g") - echo "::error::Token rotation suspected: git push --dry-run against staging failed via the AUTO_SYNC_TOKEN HTTPS auth path (exit $push_rc)." >&2 - echo "::error::This is the EXACT auth path that actions/checkout + git push use in auto-sync-main-to-staging.yml." >&2 - echo "::error::Likely cause: AUTO_SYNC_TOKEN was rotated/revoked on Gitea but the repo Actions secret was not updated. Runbook: see header." >&2 - echo "$redacted" >&2 - exit 1 - fi - - echo "git HTTPS auth path: NOP push --dry-run to staging → ${staging_sha:0:8} ✓" - - - name: Summarise canary result - # Everything passed — surface a green summary. (Failures - # already wrote ::error:: lines and exited above; if we got - # here, all three probes passed.) - run: | - { - echo "## Auto-sync canary: GREEN" - echo "" - echo "AUTO_SYNC_TOKEN is healthy:" - echo "- Authenticates as \`${EXPECTED_PERSONA}\` ✓" - echo "- Has \`read:repository\` scope on \`${REPO_PATH}\` ✓" - echo "- Git HTTPS auth path: no-op dry-run push to \`refs/heads/staging\` succeeds ✓" - echo "" - echo "Auto-sync main → staging will succeed on the next push to main." - echo "If this canary ever goes RED, see the runbook in this workflow's header." - } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/auto-sync-main-to-staging.yml b/.github/workflows/auto-sync-main-to-staging.yml deleted file mode 100644 index c0173a3d..00000000 --- a/.github/workflows/auto-sync-main-to-staging.yml +++ /dev/null @@ -1,255 +0,0 @@ -name: Auto-sync main → staging - -# Reflects every push to `main` back onto `staging` so the -# staging-as-superset-of-main invariant holds. -# -# ============================================================ -# What this workflow does -# ============================================================ -# -# On every push to `main`: -# 1. Checks if staging already contains main → no-op. -# 2. Fetches both branches, merges main into staging in the -# runner workspace (fast-forward if possible, else -# `--no-ff` merge commit). -# 3. Pushes staging directly to origin via the -# `devops-engineer` persona's `AUTO_SYNC_TOKEN`. -# -# Authoritative path: a single `git push origin staging` from -# inside this workflow is the SSOT for advancing staging after -# a main push. No PR, no merge queue, no human approval — -# staging is mechanically maintained as a superset of main. -# -# `auto-promote-staging.yml` is the reverse-direction -# counterpart (staging → main, gated on green CI). Together -# they keep the staging-superset-of-main invariant tight. -# -# ============================================================ -# Why direct push (and not "open a PR") -# ============================================================ -# -# Pre-2026-05-06 the canonical SCM was GitHub.com, where: -# - The `staging` branch had a `merge_queue` ruleset that -# blocked ALL direct pushes (no bypass even for org -# admins or the GitHub Actions integration). -# - Therefore this workflow opened a PR via `gh pr create` -# and let auto-merge land it through the queue. -# -# Post-2026-05-06 the canonical SCM is Gitea -# (`git.moleculesai.app/molecule-ai/molecule-core`). Gitea: -# - Has no `merge_queue` concept. -# - Allows direct push to protected branches via per-user -# `push_whitelist_usernames` on the branch protection. -# - Does not expose a GraphQL endpoint, so `gh pr create` -# returns `HTTP 405 Method Not Allowed -# (https://git.moleculesai.app/api/graphql)` — the -# pre-suspension architecture cannot work on Gitea. -# -# The molecule-ai/molecule-core staging branch protection -# (verified via `GET /api/v1/repos/.../branch_protections`) -# whitelists `devops-engineer` for direct push. So the -# correct Gitea-shape architecture is: authenticate as -# `devops-engineer`, merge locally, push staging directly. -# -# This is structurally simpler than the GitHub-era PR dance -# and removes the dependence on `gh` CLI / GraphQL entirely. -# -# ============================================================ -# Identity + token (anti-bot-ring per saved-memory -# `feedback_per_agent_gitea_identity_default`) -# ============================================================ -# -# This workflow uses `secrets.AUTO_SYNC_TOKEN`, which is a -# personal access token issued to the `devops-engineer` -# persona on Gitea — NOT the founder PAT. The bot-ring -# fingerprint that triggered the GitHub org suspension on -# 2026-05-06 was characterised by founder PAT acting as CI -# at machine speed; per-persona identities split the -# attribution honestly. -# -# Token scope on Gitea: repo write. Push target restricted -# to `staging` (this workflow is the only writer; main is -# untouched). Compromise blast radius: bounded to staging -# branch + this repo's read surface. -# -# Commits are authored by the persona email -# `devops-engineer@agents.moleculesai.app` so commit history -# reflects which automation produced the merge. -# -# ============================================================ -# Failure modes & operational notes -# ============================================================ -# -# A — staging has commits main doesn't, and the merge -# conflicts: -# - The `--no-ff` merge step exits non-zero. Workflow -# fails red. Operator (devops-engineer or human) -# resolves manually: -# git fetch origin -# git checkout staging -# git merge --no-ff origin/main -# # resolve conflicts -# git push origin staging -# - Step summary surfaces the conflict so the failed run -# is self-explanatory. -# -# B — `AUTO_SYNC_TOKEN` rotated / wrong scope: -# - `git push` step exits non-zero with `HTTP 401` / -# `403`. Step summary surfaces the failed push. -# - Re-issue the token from `~/.molecule-ai/personas/` -# on the operator host and update the repo Actions -# secret. Re-run the workflow. -# -# C — staging branch protection no longer whitelists -# `devops-engineer`: -# - `git push` exits non-zero with a Gitea protected- -# branch rejection. Step summary surfaces it. -# - Re-add `devops-engineer` to -# `push_whitelist_usernames` on the staging -# protection (Settings → Branches → staging). -# -# D — concurrent push to main while a sync is in flight: -# - The `concurrency` group below serialises runs. -# The second waits for the first; if main advances -# again while we're syncing, the second run picks -# up the new tip on its own fetch. -# -# ============================================================ -# Loop safety -# ============================================================ -# -# The push to staging from this workflow does NOT itself -# fire a `push: branches: [main]` event (different branch), -# so there's no risk of self-recursion. `auto-promote-staging.yml` -# fires on `workflow_run` of CI etc. — it sees the new -# staging tip on its next gate-completion event, NOT on this -# push directly. No loop. - -on: - push: - branches: [main] - # workflow_dispatch lets operators manually backfill a - # missed sync (e.g. if AUTO_SYNC_TOKEN was rotated and a - # main push slipped through while the secret was stale). - workflow_dispatch: - -permissions: - contents: write - -concurrency: - group: auto-sync-main-to-staging - cancel-in-progress: false - -jobs: - sync-staging: - runs-on: ubuntu-latest - steps: - - name: Checkout staging (with devops-engineer push token) - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - fetch-depth: 0 - ref: staging - # AUTO_SYNC_TOKEN authenticates as the - # `devops-engineer` Gitea persona — the only - # identity whitelisted for direct push to - # staging. See header comment for context. - token: ${{ secrets.AUTO_SYNC_TOKEN }} - - - name: Configure git author - run: | - # Per-persona identity, NOT founder PAT. - # `feedback_per_agent_gitea_identity_default`. - git config user.name "devops-engineer" - git config user.email "devops-engineer@agents.moleculesai.app" - - - name: Check if staging already contains main - id: check - run: | - set -euo pipefail - git fetch origin main - if git merge-base --is-ancestor origin/main HEAD; then - echo "needs_sync=false" >> "$GITHUB_OUTPUT" - { - echo "## No-op" - echo - echo "staging already contains \`origin/main\` ($(git rev-parse --short=8 origin/main))." - } >> "$GITHUB_STEP_SUMMARY" - else - echo "needs_sync=true" >> "$GITHUB_OUTPUT" - MAIN_SHORT=$(git rev-parse --short=8 origin/main) - echo "main_short=${MAIN_SHORT}" >> "$GITHUB_OUTPUT" - echo "::notice::staging is missing main's tip (${MAIN_SHORT}) — merging in-runner and pushing" - fi - - - name: Merge main into staging (in-runner) - if: steps.check.outputs.needs_sync == 'true' - id: merge - run: | - set -euo pipefail - # Already on staging from checkout. Try fast-forward - # first (cleanest history); fall back to merge commit - # if staging has commits main doesn't. - if git merge --ff-only origin/main; then - echo "did_ff=true" >> "$GITHUB_OUTPUT" - echo "::notice::Fast-forwarded staging to origin/main" - else - echo "did_ff=false" >> "$GITHUB_OUTPUT" - if ! git merge --no-ff origin/main \ - -m "chore: sync main → staging (auto, ${{ steps.check.outputs.main_short }})"; then - # Hygiene: leave the work tree clean before failing. - git merge --abort || true - { - echo "## Conflict" - echo - echo "Auto-merge \`main → staging\` failed with conflicts." - echo "A human (or devops-engineer persona) needs to resolve manually:" - echo - echo '```' - echo "git fetch origin" - echo "git checkout staging" - echo "git merge --no-ff origin/main" - echo "# resolve conflicts" - echo "git push origin staging" - echo '```' - } >> "$GITHUB_STEP_SUMMARY" - exit 1 - fi - fi - - - name: Push staging to origin - if: steps.check.outputs.needs_sync == 'true' - run: | - set -euo pipefail - # Direct push to staging. devops-engineer persona is - # whitelisted for direct push on the staging branch - # protection (Settings → Branches → staging). - # - # No --force / --force-with-lease: a fast-forward or - # legitimate merge commit on top of current staging - # is the only thing we'd ever push. If origin/staging - # advanced under us (concurrent merge), the push - # legitimately rejects and the next run picks up the - # new state. - if ! git push origin staging; then - { - echo "## Push rejected" - echo - echo "Direct push to \`staging\` failed. Likely causes:" - echo "- \`AUTO_SYNC_TOKEN\` rotated / wrong scope (HTTP 401/403)" - echo "- \`devops-engineer\` no longer in" - echo " \`push_whitelist_usernames\` on the staging" - echo " branch protection (HTTP 422)" - echo "- staging advanced concurrently — re-running this" - echo " workflow on the new main tip will pick it up" - } >> "$GITHUB_STEP_SUMMARY" - exit 1 - fi - - { - echo "## Auto-sync succeeded" - echo - echo "- staging advanced to: \`$(git rev-parse --short=8 HEAD)\`" - echo "- main tip: \`${{ steps.check.outputs.main_short }}\`" - echo "- Strategy: $([ "${{ steps.merge.outputs.did_ff }}" = "true" ] && echo "fast-forward" || echo "merge commit")" - echo "- Pushed by: \`devops-engineer\` (per-agent persona, anti-bot-ring)" - } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/retarget-main-to-staging.yml b/.github/workflows/retarget-main-to-staging.yml deleted file mode 100644 index 5c5d81f8..00000000 --- a/.github/workflows/retarget-main-to-staging.yml +++ /dev/null @@ -1,276 +0,0 @@ -name: Retarget main PRs to staging - -# Mechanical enforcement of SHARED_RULES rule 8 ("Staging-first -# workflow, no exceptions"). When a bot opens a PR against `main`, -# retarget it to `staging` automatically and leave an explanatory -# comment. Human / CEO-authored PRs (the staging→main promotion -# PRs, etc.) are left alone — they're the authorised exception -# to the rule. -# -# ============================================================ -# What this workflow does -# ============================================================ -# -# On `pull_request_target` opened/reopened against `main`: -# 1. If the PR head is `staging`, skip (the auto-promote PRs -# MUST stay base=main). -# 2. If the PR author is a bot, retarget the PR base to -# `staging` via Gitea REST `PATCH /pulls/{N}` body -# `{"base":"staging"}`. -# 3. If the retarget returns 422 "pull request already exists -# for base branch 'staging'" (issue #1884 case: another PR -# on the same head already targets staging), close the -# now-redundant main-PR via Gitea REST instead of failing -# red. -# 4. Post an explainer comment on the retargeted PR via -# Gitea REST `POST /issues/{N}/comments`. -# -# ============================================================ -# Why Gitea REST (and not `gh api / gh pr close / gh pr comment`) -# ============================================================ -# -# Pre-2026-05-06 this workflow used `gh api -X PATCH "repos/{owner}/{repo}/pulls/{N}" -f base=staging` -# plus `gh pr close` and `gh pr comment`. After the GitHub→Gitea -# cutover those calls fail because: -# -# - `gh` CLI defaults to `api.github.com`. Even with `GH_HOST` -# pointing at Gitea, `gh pr close / comment` route through -# GraphQL (`/api/graphql`) which Gitea does not expose. -# Empirical: every `gh pr *` call returns -# `HTTP 405 Method Not Allowed (https://git.moleculesai.app/api/graphql)` -# — same root cause as #65 (auto-sync, fixed in PR #66) and -# #73/#195 (auto-promote, fixed in PR #78). -# - `gh api -X PATCH /pulls/{N}` happens to use a REST path -# that Gitea also has, but the `gh` host-resolution layer -# and pagination/retry logic don't always hit Gitea cleanly, -# and the cost of switching to direct `curl` is one extra -# line of code. -# -# So this workflow uses direct `curl` calls to Gitea REST. No -# `gh` CLI dependency, no GraphQL, no flaky host-resolution. -# -# ============================================================ -# Identity + token (anti-bot-ring per saved-memory -# `feedback_per_agent_gitea_identity_default`) -# ============================================================ -# -# Pre-fix this workflow used the per-job ephemeral -# `secrets.GITHUB_TOKEN`. On Gitea Actions that token has -# narrow scope and unpredictable cross-PR write capability. -# -# Post-fix: `secrets.AUTO_SYNC_TOKEN` (the `devops-engineer` -# Gitea persona). Same persona used by `auto-sync-main-to-staging.yml` -# (PR #66) and `auto-promote-staging.yml` (PR #78). Token scope: -# `push: true` repo write, sufficient for PR-edit + close + comment. -# -# Why this token does NOT need branch-protection bypass: -# patching a PR's base ref is a PR-level operation that does not -# require push perms on either branch (the PR's own commits stay -# put; only the metadata changes). -# -# ============================================================ -# Failure modes & operational notes -# ============================================================ -# -# A — PATCH base→staging returns 422 "pull request already exists" -# (issue #1884 case): -# - Detected by string-match on response body. Workflow -# falls through to closing the now-redundant main-PR -# (Gitea REST `PATCH /pulls/{N}` with `state: closed`) -# and posts an explanation comment. Step summary surfaces. -# -# B — `AUTO_SYNC_TOKEN` rotated / wrong scope: -# - First REST call returns 401/403. Step summary surfaces. -# Re-issue token from `~/.molecule-ai/personas/` on the -# operator host and update repo Actions secret. -# -# C — PR was deleted between trigger and run: -# - REST call returns 404. Workflow exits 0 with a notice -# (the rule was already enforced or the PR is gone). -# -# D — author is not actually a bot but the filter mis-fires: -# - Filter is conservative: only triggers on -# `user.type == 'Bot'`, `login` ends with `[bot]`, or -# known bot logins (`molecule-ai[bot]`, `app/molecule-ai`). -# Human PRs slip through unaffected. If a NEW bot login -# starts shipping main-PRs, add it to the filter. - -on: - pull_request_target: - types: [opened, reopened] - branches: [main] - -permissions: - pull-requests: write - -jobs: - retarget: - name: Retarget to staging - runs-on: ubuntu-latest - # Only fire for bot-authored PRs. Human CEO PRs (staging→main - # promotion) are intentional and pass through. - # - # Head-ref guard: never retarget a PR whose head IS `staging` - # — those are the auto-promote staging→main PRs (opened by - # `devops-engineer` since PR #78 / #195 fix). Retargeting - # head=staging onto base=staging fails with HTTP 422 "no new - # commits between base 'staging' and head 'staging'", which - # would surface as a noisy red workflow run on every - # auto-promote (caught 2026-05-03 on the GitHub-era PR #2588). - if: >- - github.event.pull_request.head.ref != 'staging' - && ( - github.event.pull_request.user.type == 'Bot' - || endsWith(github.event.pull_request.user.login, '[bot]') - || github.event.pull_request.user.login == 'app/molecule-ai' - || github.event.pull_request.user.login == 'molecule-ai[bot]' - || github.event.pull_request.user.login == 'devops-engineer' - ) - steps: - - name: Retarget PR base to staging via Gitea REST - id: retarget - env: - GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }} - GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }} - REPO: ${{ github.repository }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PR_AUTHOR: ${{ github.event.pull_request.user.login }} - # Issue #1884 case: when the bot opens a PR against main - # and there's already another PR on the same head branch - # targeting staging, Gitea's PATCH returns 422 with a - # body mentioning "pull request already exists for base - # branch 'staging'" (the Gitea message wording is - # slightly different from GitHub's; the substring match - # below covers both for forward/back compat). - # The retarget can't proceed — but the right response is - # to close the now-redundant main-PR, not to fail the - # workflow noisily. Detect that specific 422 and close - # instead. - run: | - set -euo pipefail - - API="${GITEA_HOST}/api/v1/repos/${REPO}" - AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json") - - echo "Retargeting PR #${PR_NUMBER} (author: ${PR_AUTHOR}) from main → staging" - - # Curl-status-capture pattern per `feedback_curl_status_capture_pollution`: - # http_code via -w to its own scalar, body to a tempfile, set +e/-e - # bracket so curl's non-zero-on-4xx doesn't pollute the script's exit chain. - BODY_FILE=$(mktemp) - REQ='{"base":"staging"}' - - set +e - STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \ - -X PATCH -d "${REQ}" \ - -o "${BODY_FILE}" -w "%{http_code}" \ - "${API}/pulls/${PR_NUMBER}") - CURL_RC=$? - set -e - - if [ "${CURL_RC}" -ne 0 ]; then - echo "::error::curl PATCH failed (rc=${CURL_RC})" - rm -f "${BODY_FILE}" - exit 1 - fi - - if [ "${STATUS}" = "201" ] || [ "${STATUS}" = "200" ]; then - NEW_BASE=$(jq -r '.base.ref // "?"' < "${BODY_FILE}") - rm -f "${BODY_FILE}" - if [ "${NEW_BASE}" = "staging" ]; then - echo "::notice::Retargeted PR #${PR_NUMBER} → staging" - echo "outcome=retargeted" >> "$GITHUB_OUTPUT" - exit 0 - fi - echo "::error::PATCH returned ${STATUS} but base.ref is '${NEW_BASE}', not 'staging'" - exit 1 - fi - - # Specifically match the 422 duplicate-base/head error so - # any OTHER PATCH failure (auth, deleted PR, etc.) still - # surfaces as a real workflow failure. - BODY=$(cat "${BODY_FILE}" || true) - rm -f "${BODY_FILE}" - - if [ "${STATUS}" = "422" ] && echo "${BODY}" | grep -qE "(pull request already exists for base branch 'staging'|already exists.*base.*staging)"; then - echo "::notice::PR #${PR_NUMBER}: duplicate target-staging PR exists on same head — closing this main-PR as redundant." - - # Close the now-redundant main-PR via Gitea REST - # (PATCH state=closed). Post comment explaining - # rationale BEFORE close so the comment lands on the - # PR (commenting on a closed PR works on Gitea, but - # historically caused notification ordering surprises). - - CLOSE_BODY_FILE=$(mktemp) - CMT_REQ=$(jq -n '{body:"[retarget-bot] Closing — another PR on the same head branch already targets `staging`. This PR is redundant. See issue #1884 for the rationale."}') - set +e - CMT_STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \ - -X POST -d "${CMT_REQ}" \ - -o "${CLOSE_BODY_FILE}" -w "%{http_code}" \ - "${API}/issues/${PR_NUMBER}/comments") - set -e - if [ "${CMT_STATUS}" != "201" ]; then - echo "::warning::dup-close comment POST returned ${CMT_STATUS}; continuing to close anyway" - cat "${CLOSE_BODY_FILE}" | head -c 300 || true - fi - rm -f "${CLOSE_BODY_FILE}" - - CLOSE_REQ='{"state":"closed"}' - CLOSE_RESP=$(mktemp) - set +e - CL_STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \ - -X PATCH -d "${CLOSE_REQ}" \ - -o "${CLOSE_RESP}" -w "%{http_code}" \ - "${API}/pulls/${PR_NUMBER}") - set -e - if [ "${CL_STATUS}" = "201" ] || [ "${CL_STATUS}" = "200" ]; then - echo "::notice::Closed PR #${PR_NUMBER} as redundant" - echo "outcome=closed-as-duplicate" >> "$GITHUB_OUTPUT" - rm -f "${CLOSE_RESP}" - exit 0 - fi - echo "::error::Failed to close redundant PR: HTTP ${CL_STATUS}" - cat "${CLOSE_RESP}" | head -c 300 || true - rm -f "${CLOSE_RESP}" - exit 1 - fi - - echo "::error::Retarget PATCH failed and was NOT a duplicate-base error: HTTP ${STATUS}" - echo "${BODY}" | head -c 500 >&2 - exit 1 - - - name: Post explainer comment - if: steps.retarget.outputs.outcome == 'retargeted' - env: - GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }} - GITEA_HOST: ${{ vars.GITEA_HOST || 'https://git.moleculesai.app' }} - REPO: ${{ github.repository }} - PR_NUMBER: ${{ github.event.pull_request.number }} - run: | - set -euo pipefail - - API="${GITEA_HOST}/api/v1/repos/${REPO}" - AUTH=(-H "Authorization: token ${GITEA_TOKEN}" -H "Accept: application/json") - - # PR comments live on the issue endpoint in Gitea - # (PRs ARE issues — same endpoint, different sub-resources - # for diffs/files/etc.). The body uses jq to safely - # encode the multi-line markdown without shell-quote - # nightmares. - REQ=$(jq -n '{body:"[retarget-bot] This PR was opened against `main` and has been retargeted to `staging` automatically.\n\n**Why:** per [SHARED_RULES rule 8](https://git.moleculesai.app/molecule-ai/molecule-ai-org-template-molecule-dev/src/branch/main/SHARED_RULES.md), all feature work targets `staging` first; the CEO promotes `staging → main` separately.\n\n**What changed:** just the base branch — no code change. CI will re-run against `staging`. If you get merge conflicts, rebase on `staging`.\n\n**If this PR is the CEO`s staging→main promotion:** the Action skipped you (only bot-authored PRs are retargeted, head=staging is also exempted). If you see this comment on your CEO PR, that`s a bug — please tag @hongmingwang."}') - - BODY_FILE=$(mktemp) - set +e - STATUS=$(curl -sS "${AUTH[@]}" -H "Content-Type: application/json" \ - -X POST -d "${REQ}" \ - -o "${BODY_FILE}" -w "%{http_code}" \ - "${API}/issues/${PR_NUMBER}/comments") - set -e - - if [ "${STATUS}" = "201" ]; then - echo "::notice::Posted explainer comment on PR #${PR_NUMBER}" - else - echo "::warning::Failed to post explainer (HTTP ${STATUS}) — retarget itself succeeded" - cat "${BODY_FILE}" | head -c 300 || true - fi - rm -f "${BODY_FILE}"