From bfc393c065e7173d6025b2da2516e127826940ed Mon Sep 17 00:00:00 2001 From: claude-ceo-assistant Date: Thu, 7 May 2026 15:23:03 -0700 Subject: [PATCH 1/6] ci: add AUTO_SYNC_TOKEN rotation drift canary (#72) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a 6h-cron synthetic check that fires the auth surface used by auto-sync-main-to-staging.yml (PR #66) and emits a red workflow status when AUTO_SYNC_TOKEN has drifted out of validity. Closes hostile-self-review weakest-spot #3 from PR #66 (token-rotation detection latency). Read-only verification — no writes, no synthetic merge commits, no canary branch noise. Three probes: 1. GET /api/v1/user → token authenticates as devops-engineer 2. GET /api/v1/repos/molecule-ai/molecule-core → read:repository scope 3. git ls-remote refs/heads/staging → exact HTTPS auth path used by actions/checkout in the real auto-sync workflow Hard-fail on missing AUTO_SYNC_TOKEN secret on both schedule and workflow_dispatch — per feedback_schedule_vs_dispatch_secrets_hardening, a silent soft-skip would make the canary itself drift-invisible (the sweep-cf-orphans #2088 lesson). Operator runbook in workflow header. Token reuse: same AUTO_SYNC_TOKEN as the workflow under monitor; no new credential introduced. Read-only paths only. Refs: #72, hostile-self-review #66 --- .github/workflows/auto-sync-canary.yml | 324 +++++++++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 .github/workflows/auto-sync-canary.yml diff --git a/.github/workflows/auto-sync-canary.yml b/.github/workflows/auto-sync-canary.yml new file mode 100644 index 00000000..9f55aa19 --- /dev/null +++ b/.github/workflows/auto-sync-canary.yml @@ -0,0 +1,324 @@ +name: Auto-sync canary — AUTO_SYNC_TOKEN rotation drift + +# Synthetic health check for the AUTO_SYNC_TOKEN secret consumed by +# auto-sync-main-to-staging.yml (PR #66) and publish-workspace-server-image.yml. 
+# +# ============================================================ +# Why this workflow exists +# ============================================================ +# +# PR #66 fixed auto-sync (replaced GitHub-era `gh pr create` — which +# 405s on Gitea's GraphQL endpoint — with a direct git push from the +# `devops-engineer` persona's `AUTO_SYNC_TOKEN`). Hostile self-review +# weakest spot #3 of that PR: +# +# "Token rotation silently breaks auto-sync. If AUTO_SYNC_TOKEN is +# rotated without updating the repo secret, every push to main +# fails red on the auto-sync push step. The workflow surfaces the +# failure mode in the step summary (failure mode B in the header), +# but there's no proactive monitoring." +# +# Detection latency under the status quo: rotation is only caught on +# the next push to `main`. During quiet periods (no main push for +# many hours) the staging-superset-of-main invariant silently breaks. +# +# This workflow closes the gap: every 6 hours, it fires the auth +# surface that auto-sync depends on and emits a red workflow status +# if AUTO_SYNC_TOKEN has drifted out of validity. +# +# ============================================================ +# What this checks (Option B — read-only verify) +# ============================================================ +# +# 1. `GET /api/v1/user` against Gitea with the token → validates the +# token authenticates AND resolves to `devops-engineer` (catches +# the case where the token was regenerated under a different +# persona by mistake). +# 2. `GET /api/v1/repos/molecule-ai/molecule-core` with the token → +# validates the token has `read:repository` scope on this repo +# (the v2 scope contract — see saved memory +# `reference_persona_token_v2_scope`). +# 3. `git ls-remote https://oauth2:@/.../molecule-core +# refs/heads/staging` → validates the EXACT HTTPS basic-auth path +# that `actions/checkout` uses inside auto-sync-main-to-staging.yml. 
+# Without this we'd be testing the API surface but not the git +# HTTPS surface; they don't share an auth code path on Gitea. +# +# Each step exits non-zero with an actionable error message if it +# fails. The workflow status itself is the operator-facing surface. +# +# ============================================================ +# What this does NOT check (intentional) +# ============================================================ +# +# - **Branch-protection authz** (failure mode C in auto-sync header): +# would require an actual write to staging. Already monitored by +# `branch-protection-drift.yml` daily. Don't duplicate. +# - **Conflict resolution** (failure mode A): a real conflict is data- +# driven, not auth-driven; can't synthesise it without polluting +# staging. Already surfaces immediately on the next main push. +# - **Concurrency** (failure mode D): handled by workflow concurrency +# group on auto-sync, not a credential issue. +# +# ============================================================ +# Why Option B (read-only) and not the alternatives +# ============================================================ +# +# Considered + rejected (see issue #72 for full write-up): +# +# - **Option A — full auto-sync on schedule**: every run creates a +# no-op merge commit on staging when main hasn't advanced. 4 noise +# commits/day. And races the real `push:` trigger when main has +# advanced. Rejected. +# +# - **Option C — push to dedicated `auto-sync-canary` branch**: would +# exercise authz too, but adds branch noise on Gitea AND requires +# maintaining a second branch protection (or expanding staging's +# whitelist to a junk branch). Authz already covered by +# `branch-protection-drift.yml`. Rejected. +# +# Prior art for the chosen Option B shape: +# - Cloudflare's `/user/tokens/verify` endpoint (read-only auth +# probe explicitly designed for credential canaries). 
+# - AWS Secrets Manager rotation Lambda's `testSecret` step (auth +# probe before promoting AWSPENDING → AWSCURRENT). +# - HashiCorp Vault's `vault token lookup` for renewal canaries. +# +# ============================================================ +# Operator runbook — what to do when this workflow goes RED +# ============================================================ +# +# 1. **Identify which step failed**: +# - Step "Verify token authenticates as devops-engineer" red → +# token is invalid OR resolves to wrong persona. +# - Step "Verify token has repo read scope" red → token valid but +# stripped of `read:repository` scope (or repo perms changed). +# - Step "Verify git HTTPS auth path works" red → API works but +# git HTTPS auth path is broken (rare; usually means a Gitea +# config drift, not a token issue). +# +# 2. **Re-issue the token** on the operator host: +# ``` +# ssh root@5.78.80.188 'docker exec --user git molecule-gitea-1 \ +# gitea admin user generate-access-token \ +# --username devops-engineer \ +# --token-name persona-devops-engineer-vN \ +# --scopes "read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc"' +# ``` +# Update `/etc/molecule-bootstrap/agent-secrets.env` in place +# (per `feedback_unified_credentials_file`). The previous token +# file lands at `.bak.`. +# +# 3. **Update the repo Actions secret** at: +# Settings → Secrets and variables → Actions → AUTO_SYNC_TOKEN +# Paste the new token. (Don't echo it in chat — but per +# `feedback_passwords_in_chat_are_burned`, a paste in a 1:1 +# Claude session is within trust boundary.) +# +# 4. **Re-run this canary** via workflow_dispatch. Confirm GREEN. +# +# 5. **Backfill any missed main → staging syncs** by re-running +# `auto-sync-main-to-staging.yml` from its workflow_dispatch +# surface, OR by pushing an empty commit to main (if you'd +# rather force a real trigger). 
+# +# ============================================================ +# Security notes +# ============================================================ +# +# - Token usage: read-only (`GET /api/v1/user`, `GET /api/v1/repos/...`, +# `git ls-remote`). No write paths. Same blast-radius profile as +# `actions/checkout` on a public repo. +# - The token NEVER appears in logs: every `curl` uses a header +# variable, never inline; the `git ls-remote` URL builds the +# `oauth2:$TOKEN@host` form into a single env var that's not +# echoed. GitHub Actions secret-masking covers anything that does +# slip through. +# - No new token introduced — same `AUTO_SYNC_TOKEN` the workflow +# under monitor uses. Per least-privilege we deliberately do NOT +# broaden scope for the canary. + +on: + schedule: + # Every 6 hours at :17 (offsets the cron herd at :00). Justification + # from issue #72: cheap to run (~5s wall-clock, no quota), 3h average + # detection latency, 6h max. 1h would be 24× the runs for marginal + # benefit; daily would be 6× longer latency and worse than status + # quo on a quiet-main day. + - cron: '17 */6 * * *' + workflow_dispatch: + +# No concurrency group needed — the canary is read-only and idempotent. +# Two parallel runs (e.g. operator dispatch during a scheduled tick) are +# harmless: same result, doubled HTTPS calls, no shared state. + +permissions: + contents: read + +jobs: + verify-token: + name: Verify AUTO_SYNC_TOKEN validity + runs-on: ubuntu-latest + # 2 min surfaces hangs (Gitea API stall, DNS issue) within one + # cron interval. Realistic worst case is ~10s: 2 curls + 1 git + # ls-remote, each capped by the explicit timeouts below. + timeout-minutes: 2 + + env: + # Pinned in env so individual steps can read it without + # repeating the secret reference. GitHub masks the value in + # logs automatically. 
+ AUTO_SYNC_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }} + EXPECTED_PERSONA: devops-engineer + GITEA_HOST: git.moleculesai.app + REPO_PATH: molecule-ai/molecule-core + + steps: + - name: Verify AUTO_SYNC_TOKEN secret is configured + # Schedule-vs-dispatch behaviour split, per + # `feedback_schedule_vs_dispatch_secrets_hardening`: + # + # - schedule: hard-fail when the secret is missing. The + # whole point of the canary is to surface drift; soft- + # skipping on missing-secret would make the canary + # itself drift-invisible (sweep-cf-orphans #2088 lesson). + # - workflow_dispatch: hard-fail too — there's no scenario + # where an operator wants this canary to silently no-op. + # The workflow has no other ad-hoc utility; if you ran + # it, you wanted the answer. + run: | + if [ -z "${AUTO_SYNC_TOKEN}" ]; then + echo "::error::AUTO_SYNC_TOKEN secret is not set on this repo." >&2 + echo "::error::Set it at Settings → Secrets and variables → Actions." >&2 + echo "::error::Without it, auto-sync-main-to-staging.yml will fail every push to main." >&2 + exit 1 + fi + echo "AUTO_SYNC_TOKEN is configured (value masked)." + + - name: Verify token authenticates as ${{ env.EXPECTED_PERSONA }} + # Calls Gitea's `/api/v1/user` — the canonical + # auth-probe-with-no-side-effects endpoint (mirrors + # Cloudflare's /user/tokens/verify). + # + # Failure surfaces: + # - HTTP 401: token invalid (rotated, revoked, or never + # correctly registered). + # - HTTP 200 but username != devops-engineer: token was + # regenerated under the wrong persona — this would let + # auth pass but commit attribution would be wrong, and + # branch-protection authz would fail because only + # `devops-engineer` is whitelisted. + run: | + set -euo pipefail + response_file="$(mktemp)" + # `--max-time 30`: full call ceiling. `--connect-timeout 10`: + # DNS + TCP. `-w "%{http_code}"` to a separate var (not + # response body — see feedback_curl_status_capture_pollution). 
+ status=$(curl -sS -o "$response_file" \ + --max-time 30 --connect-timeout 10 \ + -w "%{http_code}" \ + -H "Authorization: token ${AUTO_SYNC_TOKEN}" \ + -H "Accept: application/json" \ + "https://${GITEA_HOST}/api/v1/user" || echo "000") + + if [ "$status" != "200" ]; then + echo "::error::Token rotation suspected: GET /api/v1/user returned HTTP $status (expected 200)." >&2 + echo "::error::Likely cause: AUTO_SYNC_TOKEN has been rotated/revoked on Gitea but the repo Actions secret was not updated." >&2 + echo "::error::Runbook: see header comment of this workflow file." >&2 + # Print response body but redact anything that looks like a token. + sed -E 's/[A-Fa-f0-9]{32,}//g' "$response_file" >&2 || true + exit 1 + fi + + username=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('login',''))" "$response_file") + if [ "$username" != "${EXPECTED_PERSONA}" ]; then + echo "::error::Token resolves to user '$username', expected '${EXPECTED_PERSONA}'." >&2 + echo "::error::AUTO_SYNC_TOKEN must be the devops-engineer persona PAT (not founder PAT, not another persona)." >&2 + echo "::error::Auto-sync push will fail because only 'devops-engineer' is whitelisted on staging branch protection." >&2 + exit 1 + fi + echo "Token authenticates as: $username ✓" + + - name: Verify token has repo read scope + # `GET /api/v1/repos//` requires `read:repository` + # on the persona's v2 scope contract. If the scope was + # narrowed/dropped on rotation we catch it here, before the + # next main push reveals it via a checkout failure. + run: | + set -euo pipefail + response_file="$(mktemp)" + status=$(curl -sS -o "$response_file" \ + --max-time 30 --connect-timeout 10 \ + -w "%{http_code}" \ + -H "Authorization: token ${AUTO_SYNC_TOKEN}" \ + -H "Accept: application/json" \ + "https://${GITEA_HOST}/api/v1/repos/${REPO_PATH}" || echo "000") + + if [ "$status" != "200" ]; then + echo "::error::Token lacks read:repository scope on ${REPO_PATH}: HTTP $status." 
>&2 + echo "::error::Auto-sync's actions/checkout step will fail with this token." >&2 + echo "::error::Re-issue with v2 scope contract: read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc" >&2 + sed -E 's/[A-Fa-f0-9]{32,}//g' "$response_file" >&2 || true + exit 1 + fi + echo "Token has read:repository on ${REPO_PATH} ✓" + + - name: Verify git HTTPS auth path resolves staging tip + # Final probe: exercise the EXACT auth path that + # `actions/checkout` uses in auto-sync-main-to-staging.yml. + # Gitea's API and git-HTTPS surfaces share the token but + # take different code paths internally — historically (#173) + # the API path was healthy while git-HTTPS rejected, so + # checking only the API would have given false-green. + # + # `git ls-remote --refs` is read-only: lists remote refs + # without fetching pack data. ~1KB on the wire. + env: + # Build the URL inline so the token never appears as a + # literal string anywhere — it's an env-var interpolation, + # subject to GitHub's automatic secret-masking on output. + GIT_TERMINAL_PROMPT: "0" # don't hang waiting for password if auth fails + run: | + set -euo pipefail + # Token is in $AUTO_SYNC_TOKEN (job-level env). Compose the + # URL as a local var that's never echoed. + url="https://oauth2:${AUTO_SYNC_TOKEN}@${GITEA_HOST}/${REPO_PATH}" + + # `timeout 30s` covers the (rare) case where the network + # path stalls without curl-style timeout flags — git + # honours GIT_HTTP_LOW_SPEED_TIME/LIMIT but not a hard wall. + if ! out=$(timeout 30s git ls-remote --refs "$url" refs/heads/staging 2>&1); then + # Redact any accidental token leak in the error output. + redacted=$(echo "$out" | sed -E "s|oauth2:[^@]+@|oauth2:@|g") + echo "::error::git ls-remote against staging failed via the AUTO_SYNC_TOKEN HTTPS auth path." >&2 + echo "::error::API probes passed but git HTTPS surface is broken — likely Gitea config drift, not a token rotation." 
>&2 + echo "$redacted" >&2 + exit 1 + fi + + # Sanity-check: response should be one line " refs/heads/staging". + if ! echo "$out" | grep -qE '^[0-9a-f]{40}[[:space:]]+refs/heads/staging$'; then + echo "::error::ls-remote returned unexpected shape:" >&2 + echo "$out" | sed -E "s|oauth2:[^@]+@|oauth2:@|g" >&2 + exit 1 + fi + + staging_sha=$(echo "$out" | awk '{print $1}') + echo "git HTTPS auth path resolves staging → ${staging_sha:0:8} ✓" + + - name: Summarise canary result + # Everything passed — surface a green summary. (Failures + # already wrote ::error:: lines and exited above; if we got + # here, all three probes passed.) + run: | + { + echo "## Auto-sync canary: GREEN" + echo "" + echo "AUTO_SYNC_TOKEN is healthy:" + echo "- Authenticates as \`${EXPECTED_PERSONA}\` ✓" + echo "- Has \`read:repository\` scope on \`${REPO_PATH}\` ✓" + echo "- Git HTTPS auth path resolves \`refs/heads/staging\` ✓" + echo "" + echo "Auto-sync main → staging will succeed on the next push to main." + echo "If this canary ever goes RED, see the runbook in this workflow's header." + } >> "$GITHUB_STEP_SUMMARY" From 0cef033a6a2c8df7d930be2e0986e6b35881e6d9 Mon Sep 17 00:00:00 2001 From: claude-ceo-assistant Date: Thu, 7 May 2026 15:26:22 -0700 Subject: [PATCH 2/6] ci(canary): route curl -w to tempfile to satisfy status-capture lint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two API probes used the unsafe shape rejected by lint-curl-status-capture.yml (per feedback_curl_status_capture_pollution): status=$(curl ... -w '%{http_code}' ... || echo "000") When curl exits non-zero (transport error, --fail-with-body 4xx/5xx), the `-w` already wrote a code; the `|| echo "000"` then APPENDS another "000", yielding "000000" or "409000" — passes shape checks while looking right. Switch to the canonical safe shape (set +e + tempfile + cat): set +e curl ... 
-w '%{http_code}' >code_file 2>/dev/null set -e status=$(cat code_file 2>/dev/null || true) [ -z "$status" ] && status="000" Inline comment in both probe steps explains the lint constraint so the next editor doesn't re-introduce the bad pattern. Refs: #72, lint failure on PR #77 (1/22 red → 22/22 expected green) --- .github/workflows/auto-sync-canary.yml | 28 ++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/.github/workflows/auto-sync-canary.yml b/.github/workflows/auto-sync-canary.yml index 9f55aa19..0c0573db 100644 --- a/.github/workflows/auto-sync-canary.yml +++ b/.github/workflows/auto-sync-canary.yml @@ -211,15 +211,23 @@ jobs: run: | set -euo pipefail response_file="$(mktemp)" + code_file="$(mktemp)" # `--max-time 30`: full call ceiling. `--connect-timeout 10`: - # DNS + TCP. `-w "%{http_code}"` to a separate var (not - # response body — see feedback_curl_status_capture_pollution). - status=$(curl -sS -o "$response_file" \ + # DNS + TCP. `-w "%{http_code}"` routed to a tempfile so curl's + # exit code can't pollute the captured status — see + # feedback_curl_status_capture_pollution + the + # `lint-curl-status-capture.yml` gate that rejects the unsafe + # `$(curl ... || echo "000")` shape. + set +e + curl -sS -o "$response_file" \ --max-time 30 --connect-timeout 10 \ -w "%{http_code}" \ -H "Authorization: token ${AUTO_SYNC_TOKEN}" \ -H "Accept: application/json" \ - "https://${GITEA_HOST}/api/v1/user" || echo "000") + "https://${GITEA_HOST}/api/v1/user" >"$code_file" 2>/dev/null + set -e + status=$(cat "$code_file" 2>/dev/null || true) + [ -z "$status" ] && status="000" if [ "$status" != "200" ]; then echo "::error::Token rotation suspected: GET /api/v1/user returned HTTP $status (expected 200)." 
>&2 @@ -247,12 +255,20 @@ jobs: run: | set -euo pipefail response_file="$(mktemp)" - status=$(curl -sS -o "$response_file" \ + code_file="$(mktemp)" + # See first probe step for the rationale on the tempfile-routed + # `-w "%{http_code}"` pattern — the unsafe `|| echo "000"` shape + # is rejected by lint-curl-status-capture.yml. + set +e + curl -sS -o "$response_file" \ --max-time 30 --connect-timeout 10 \ -w "%{http_code}" \ -H "Authorization: token ${AUTO_SYNC_TOKEN}" \ -H "Accept: application/json" \ - "https://${GITEA_HOST}/api/v1/repos/${REPO_PATH}" || echo "000") + "https://${GITEA_HOST}/api/v1/repos/${REPO_PATH}" >"$code_file" 2>/dev/null + set -e + status=$(cat "$code_file" 2>/dev/null || true) + [ -z "$status" ] && status="000" if [ "$status" != "200" ]; then echo "::error::Token lacks read:repository scope on ${REPO_PATH}: HTTP $status." >&2 From 62629eda4a34deb14b91d73609018f3d020f7cf5 Mon Sep 17 00:00:00 2001 From: claude-ceo-assistant Date: Thu, 7 May 2026 15:34:34 -0700 Subject: [PATCH 3/6] ci(canary): rewrite Probe 3 to actually validate auth (NOP push --dry-run) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While verifying Phase 4, found a real flaw in Probe 3 (`git ls-remote refs/heads/staging`). On a public repo (which molecule-core is), Gitea falls back to anonymous read on bad auth, so `ls-remote` succeeds even with a junk token. The probe was therefore green-lighting rotated tokens — false-green, the worst possible canary failure mode. Rewritten to use `git push --dry-run` of the current staging SHA back to `refs/heads/staging`: - Push always authenticates (auth-gated on smart-protocol handshake, before the dry-run can compute the empty-diff). - NOP by construction: pushing the current tip back to itself is "Everything up-to-date" with exit 0. - Bad token → "Authentication failed", exit 128. 
- Doesn't reach pre-receive (where branch-protection authz runs), so scope is "auth only" — matches the design intent (failure mode B); authz already covered daily by branch-protection-drift.yml. Implementation note: `git push` requires a local repo. Spinning up a fresh `git init` in a tempdir (~1KB, ~50ms) instead of pulling the full repo via actions/checkout — actions/checkout would clone ~hundreds of MB for what amounts to "a place to run git from." Local mutation tests pass: - Real token: "Everything up-to-date" exit 0 - Junk token: "Authentication failed" exit 128 with actionable ::error:: messages pointing at the runbook Header comment + runbook step-mapping updated to reflect new probe shape. Refs: #72 --- .github/workflows/auto-sync-canary.yml | 132 ++++++++++++++++++------- 1 file changed, 96 insertions(+), 36 deletions(-) diff --git a/.github/workflows/auto-sync-canary.yml b/.github/workflows/auto-sync-canary.yml index 0c0573db..f5304761 100644 --- a/.github/workflows/auto-sync-canary.yml +++ b/.github/workflows/auto-sync-canary.yml @@ -38,11 +38,17 @@ name: Auto-sync canary — AUTO_SYNC_TOKEN rotation drift # validates the token has `read:repository` scope on this repo # (the v2 scope contract — see saved memory # `reference_persona_token_v2_scope`). -# 3. `git ls-remote https://oauth2:@/.../molecule-core -# refs/heads/staging` → validates the EXACT HTTPS basic-auth path -# that `actions/checkout` uses inside auto-sync-main-to-staging.yml. -# Without this we'd be testing the API surface but not the git -# HTTPS surface; they don't share an auth code path on Gitea. +# 3. `git push --dry-run` of the current staging SHA back to +# `refs/heads/staging` via `https://oauth2:<token>@<host>/...` +# → validates the EXACT HTTPS basic-auth path that +# `actions/checkout` + `git push origin staging` use inside +# auto-sync-main-to-staging.yml.
NOP by construction (push the +# current tip to itself = "Everything up-to-date"); auth is +# checked at the smart-protocol handshake BEFORE the empty-diff +# computation, so bad token → exit 128 with "Authentication +# failed". `git ls-remote` is NOT used here because Gitea +# falls back to anonymous read on public repos and would +# silently green-light a rotated token. # # Each step exits non-zero with an actionable error message if it # fails. The workflow status itself is the operator-facing surface. @@ -93,9 +99,10 @@ name: Auto-sync canary — AUTO_SYNC_TOKEN rotation drift # token is invalid OR resolves to wrong persona. # - Step "Verify token has repo read scope" red → token valid but # stripped of `read:repository` scope (or repo perms changed). -# - Step "Verify git HTTPS auth path works" red → API works but -# git HTTPS auth path is broken (rare; usually means a Gitea -# config drift, not a token issue). +# - Step "Verify git HTTPS auth path via no-op dry-run push to +# staging" red → token rotated/revoked OR Gitea git-HTTPS +# surface is broken (rare). Auth check happens on the +# smart-protocol handshake, separate from the API path. # # 2. **Re-issue the token** on the operator host: # ``` @@ -279,48 +286,101 @@ jobs: fi echo "Token has read:repository on ${REPO_PATH} ✓" - - name: Verify git HTTPS auth path resolves staging tip + - name: Verify git HTTPS auth path via no-op dry-run push to staging # Final probe: exercise the EXACT auth path that - # `actions/checkout` uses in auto-sync-main-to-staging.yml. - # Gitea's API and git-HTTPS surfaces share the token but - # take different code paths internally — historically (#173) + # `actions/checkout` + `git push origin staging` use in + # auto-sync-main-to-staging.yml. 
Gitea's API and git-HTTPS + # surfaces share the token-lookup code path internally but + # the wire-level error shapes differ — historically (#173) # the API path was healthy while git-HTTPS rejected, so # checking only the API would have given false-green. # - # `git ls-remote --refs` is read-only: lists remote refs - # without fetching pack data. ~1KB on the wire. + # IMPORTANT: `git ls-remote` on a public repo (which + # molecule-core is) succeeds even with a junk token because + # Gitea falls back to anonymous-read. `ls-remote` therefore + # CANNOT validate auth on this surface. We use + # `git push --dry-run` instead — push is auth-gated even on + # public repos. + # + # NOP shape: read the current staging SHA via authenticated + # ls-remote (the SHA itself is public; auth is incidental + # here, used only to colocate the discovery in one step), then + # `git push --dry-run :refs/heads/staging`. Pushing the + # current tip back to itself is "Everything up-to-date" with + # exit 0 when auth succeeds. With a bad token Gitea returns + # HTTP 401 in the smart-protocol handshake and git exits 128 + # with "Authentication failed". + # + # The dry-run never reaches Gitea's pre-receive hook (which + # is where branch-protection authz runs), so this probe does + # not validate failure mode C. That's intentional — + # branch-protection-drift.yml owns authz monitoring; this + # canary owns auth. env: - # Build the URL inline so the token never appears as a - # literal string anywhere — it's an env-var interpolation, - # subject to GitHub's automatic secret-masking on output. - GIT_TERMINAL_PROMPT: "0" # don't hang waiting for password if auth fails + # Don't hang waiting for password prompt if auth fails on a + # terminal-attached run. (In Actions there's no terminal, + # but the env-var hardens against an interactive runner + # config.) + GIT_TERMINAL_PROMPT: "0" run: | set -euo pipefail # Token is in $AUTO_SYNC_TOKEN (job-level env). 
Compose the # URL as a local var that's never echoed. url="https://oauth2:${AUTO_SYNC_TOKEN}@${GITEA_HOST}/${REPO_PATH}" - # `timeout 30s` covers the (rare) case where the network - # path stalls without curl-style timeout flags — git - # honours GIT_HTTP_LOW_SPEED_TIME/LIMIT but not a hard wall. - if ! out=$(timeout 30s git ls-remote --refs "$url" refs/heads/staging 2>&1); then - # Redact any accidental token leak in the error output. - redacted=$(echo "$out" | sed -E "s|oauth2:[^@]+@|oauth2:@|g") - echo "::error::git ls-remote against staging failed via the AUTO_SYNC_TOKEN HTTPS auth path." >&2 - echo "::error::API probes passed but git HTTPS surface is broken — likely Gitea config drift, not a token rotation." >&2 + # Step a: read current staging SHA. ~1KB; auth-gated only + # on private repos but always works on public — used here + # only to discover the SHA, not to validate auth. + staging_ref=$(timeout 30s git ls-remote --refs "$url" refs/heads/staging 2>&1) || { + redacted=$(echo "$staging_ref" | sed -E "s|oauth2:[^@]+@|oauth2:@|g") + echo "::error::ls-remote against staging failed (network/DNS issue):" >&2 + echo "$redacted" >&2 + exit 1 + } + if ! echo "$staging_ref" | grep -qE '^[0-9a-f]{40}[[:space:]]+refs/heads/staging$'; then + echo "::error::ls-remote returned unexpected shape:" >&2 + echo "$staging_ref" | sed -E "s|oauth2:[^@]+@|oauth2:@|g" >&2 + exit 1 + fi + staging_sha=$(echo "$staging_ref" | awk '{print $1}') + + # Step b: spin up an ephemeral local repo. `git push` always + # requires a local repo even when pushing a remote SHA that + # isn't in the local object DB (the protocol negotiates and + # discovers we don't need to send any objects). We don't use + # `actions/checkout` for this — it would clone the whole + # repo (~hundreds of MB) for what's essentially `git init`. 
+ tmp_repo="$(mktemp -d)" + trap 'rm -rf "$tmp_repo"' EXIT + git -C "$tmp_repo" init -q + # Author config required for any git operation; values are + # arbitrary because nothing gets committed here. + git -C "$tmp_repo" config user.email canary@auto-sync.local + git -C "$tmp_repo" config user.name auto-sync-canary + + # Step c: dry-run push the current staging SHA back to + # staging. NOP by construction — the remote tip equals the + # SHA we're pushing, so "Everything up-to-date" is the + # success path. + # + # Authentication is checked at the smart-protocol handshake, + # BEFORE the dry-run can compute an empty diff. Bad token + # → "Authentication failed", exit 128. Good token → exit 0. + set +e + push_out=$(timeout 30s git -C "$tmp_repo" push --dry-run "$url" "${staging_sha}:refs/heads/staging" 2>&1) + push_rc=$? + set -e + + if [ "$push_rc" -ne 0 ]; then + redacted=$(echo "$push_out" | sed -E "s|oauth2:[^@]+@|oauth2:@|g") + echo "::error::Token rotation suspected: git push --dry-run against staging failed via the AUTO_SYNC_TOKEN HTTPS auth path (exit $push_rc)." >&2 + echo "::error::This is the EXACT auth path that actions/checkout + git push use in auto-sync-main-to-staging.yml." >&2 + echo "::error::Likely cause: AUTO_SYNC_TOKEN was rotated/revoked on Gitea but the repo Actions secret was not updated. Runbook: see header." >&2 echo "$redacted" >&2 exit 1 fi - # Sanity-check: response should be one line " refs/heads/staging". - if ! echo "$out" | grep -qE '^[0-9a-f]{40}[[:space:]]+refs/heads/staging$'; then - echo "::error::ls-remote returned unexpected shape:" >&2 - echo "$out" | sed -E "s|oauth2:[^@]+@|oauth2:@|g" >&2 - exit 1 - fi - - staging_sha=$(echo "$out" | awk '{print $1}') - echo "git HTTPS auth path resolves staging → ${staging_sha:0:8} ✓" + echo "git HTTPS auth path: NOP push --dry-run to staging → ${staging_sha:0:8} ✓" - name: Summarise canary result # Everything passed — surface a green summary. 
(Failures @@ -333,7 +393,7 @@ jobs: echo "AUTO_SYNC_TOKEN is healthy:" echo "- Authenticates as \`${EXPECTED_PERSONA}\` ✓" echo "- Has \`read:repository\` scope on \`${REPO_PATH}\` ✓" - echo "- Git HTTPS auth path resolves \`refs/heads/staging\` ✓" + echo "- Git HTTPS auth path: no-op dry-run push to \`refs/heads/staging\` succeeds ✓" echo "" echo "Auto-sync main → staging will succeed on the next push to main." echo "If this canary ever goes RED, see the runbook in this workflow's header." From e4e1bf4080d563fe90d4d61eead80019dcf58d29 Mon Sep 17 00:00:00 2001 From: claude-ceo-assistant Date: Thu, 7 May 2026 15:35:22 -0700 Subject: [PATCH 4/6] ci(canary): annotate EXPECTED_PERSONA dual-update constraint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hostile-self-review weakest-spot #2: if the devops-engineer persona is ever renamed, the canary will go red even if everything else is fine. Add an inline comment pointing the next editor at both files that must update together (auto-sync-main-to-staging.yml's git config + this canary's EXPECTED_PERSONA + the staging branch protection's push_whitelist_usernames). No behaviour change — comment-only. --- .github/workflows/auto-sync-canary.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/auto-sync-canary.yml b/.github/workflows/auto-sync-canary.yml index f5304761..f6b0437b 100644 --- a/.github/workflows/auto-sync-canary.yml +++ b/.github/workflows/auto-sync-canary.yml @@ -176,6 +176,10 @@ jobs: # repeating the secret reference. GitHub masks the value in # logs automatically. AUTO_SYNC_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }} + # MUST stay in sync with auto-sync-main-to-staging.yml's + # `git config user.name "devops-engineer"` line. Renaming the + # devops-engineer persona requires updating both files (and + # the staging branch protection's `push_whitelist_usernames`). 
EXPECTED_PERSONA: devops-engineer GITEA_HOST: git.moleculesai.app REPO_PATH: molecule-ai/molecule-core From 5b3ce5c81879b118df8423c8c4479d47d0b79a88 Mon Sep 17 00:00:00 2001 From: devops-engineer Date: Thu, 7 May 2026 15:38:57 -0700 Subject: [PATCH 5/6] fix(ci): replace gh run list with Gitea commit-status query (#75 class F) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Part of the post-#66 sweep to remove `gh` CLI dependencies that fail silently against Gitea. Class F covers `gh run list --workflow=X --commit=SHA` shapes — querying whether a specific workflow ran (and how it finished) for a specific SHA. Why this is the only call site in class F: `gh run list` hits GitHub's `/repos/.../actions/runs` REST endpoint. Gitea exposes ZERO endpoints under `/repos/.../actions/runs` — verified 2026-05-07 via swagger inspection: only secrets, variables, and runner-registration tokens live under /actions/. There's no way to query workflow run state via the Gitea v1 API directly. However, every Gitea Actions job DOES emit a commit status with `context = "<workflow name> / <job name> (<event>)"` (verified 2026-05-07 by reading /repos/.../commits/{sha}/statuses on a recent main SHA). That surface is exactly what we need: each workflow run leg is one status row, the aggregate state encodes the run outcome, and Gitea exposes it under `/api/v1/repos/.../commits/{sha}/statuses` which IS available. Affected: `auto-promote-on-e2e.yml` (lines 172-180): Old: `gh run list --workflow e2e-staging-saas.yml --commit $SHA --json status,conclusion --jq ...` returning a 5-bucket string like `completed/success` | `in_progress/none` | `none/none` | `completed/failure` | `completed/cancelled`. New: `curl /api/v1/repos/.../commits/$SHA/statuses` + jq filter on contexts whose name starts with `"E2E Staging SaaS (full lifecycle) /"`.
Mapping: 0 matched contexts → "none/none" (E2E paths-filtered out — same as before) any context = pending → "in_progress/none" (defer) any context = error|failure → "completed/failure" (abort) all contexts = success → "completed/success" (proceed) The `completed/cancelled` arm of the case statement becomes unreachable: Gitea status API doesn't expose a `cancelled` state (it has success/failure/error/pending/warning), so per-SHA concurrency cancellations now surface as `failure` or `error` and are handled by the failure branch. Documented in-place; the cancelled arm is kept as defense-in-depth for any future dual-host operation. Verification: - Live curl against the current main SHA returns `none/none` (E2E was paths-filtered for that change set — expected). - Synthetic-input jq tests verify all four mapping buckets: no contexts → "none/none" one context = pending → "in_progress/none" success + success → "completed/success" success + failure → "completed/failure" - YAML syntax validates. Token: continues to use act_runner's GITHUB_TOKEN (per-run, repo read scope). The `/commits/{sha}/statuses` endpoint is repo-scoped, no extra perms needed. Closes part of #75. Master tracking issue at #75; companion PRs: #80 (class A — `gh pr ...`), #81 (class D — `gh api ...`). Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/auto-promote-on-e2e.yml | 106 +++++++++++++++------- 1 file changed, 72 insertions(+), 34 deletions(-) diff --git a/.github/workflows/auto-promote-on-e2e.yml b/.github/workflows/auto-promote-on-e2e.yml index 82d771a6..a4daef2b 100644 --- a/.github/workflows/auto-promote-on-e2e.yml +++ b/.github/workflows/auto-promote-on-e2e.yml @@ -154,30 +154,71 @@ jobs: exit 0 fi - # Upstream is publish-workspace-server-image. Check E2E state. - # The jq filter must defend against TWO empty cases that gh - # CLI emits indistinguishably: - # 1. gh exits non-zero (network blip, auth issue) → handled - # by the `|| echo "none/none"` fallback below. - # 2.
gh exits zero but returns `[]` (no E2E run on this - # main SHA — the common case for canvas-only / cmd-only - # / sweep-only changes whose paths don't trigger E2E). - # Without `(.[0] // {})`, jq sees `null` and emits - # "null/none" — which the case statement below has no - # branch for, so it falls into *) → exit 1. - # Surfaced 2026-04-30 the first time the App-token chain - # (#2389) actually fired auto-promote-on-e2e from a publish - # upstream — every prior run was E2E-upstream which - # short-circuits before this gate. - RESULT=$(gh run list \ - --repo "$REPO" \ - --workflow e2e-staging-saas.yml \ - --branch main \ - --commit "$SHA" \ - --limit 1 \ - --json status,conclusion \ - --jq '(.[0] // {}) | "\(.status // "none")/\(.conclusion // "none")"' \ - 2>/dev/null || echo "none/none") + # Upstream is publish-workspace-server-image. Check E2E state + # for the same SHA via Gitea's commit-status API. + # + # GitHub-era this was `gh run list --workflow=X --commit=SHA + # --json status,conclusion` returning either `[]` (no run on + # this SHA) or `[{status, conclusion}]` (the run's state). + # Gitea has NO workflow-runs API at all — `/api/v1/repos/.../ + # actions/runs` returns 404 (verified 2026-05-07, issue #75). + # However Gitea Actions DOES emit a commit status per workflow + # job, with `context = "<workflow name> / <job name> (<event>)"`, + # which is exactly what we need: each E2E run leg becomes one + # status row on the SHA, and the aggregate state encodes the + # run's outcome. + # + # Mapping: + # 0 matched contexts → "none/none" (E2E paths- + # filtered + # out — same + # semantic + # as before) + # any context = pending → "in_progress/none" (defer) + # any context = error|failure → "completed/failure" (abort) + # all contexts = success → "completed/success" (proceed) + # + # The "completed/cancelled" and "completed/timed_out" buckets + # don't have direct Gitea analogs (Gitea statuses are + # success / failure / error / pending / warning).
Per-SHA + # concurrency cancellation surfaces as `failure` or `error` on Gitea, which + # we map to "completed/failure" rather than "completed/cancelled" + # — losing the soft-defer semantic of the cancelled bucket on + # this fleet. Tradeoff: the staleness alarm (auto-promote-stale- + # alarm.yml) still catches a stuck :latest within 4h, and a + # legitimate cancel is rare enough that aborting + manual + # re-dispatch is acceptable. If we measure cancel frequency + # > 1/week, revisit by reading the run-step-summary text via + # a follow-up script. + # + # Network or auth blips collapse to "none/none" via the curl + # `|| echo "[]"` fallback, matching the pre-Gitea behaviour where + # an empty list also degenerated to none/none. NOTE(review): with `--fail-with-body`, curl still writes an HTTP-error response body to stdout ahead of the `[]` fallback — confirm that case also ends up as none/none. + GITEA_API_URL="${GITHUB_SERVER_URL:-https://git.moleculesai.app}/api/v1" + STATUSES_JSON=$(curl --fail-with-body -sS \ + -H "Authorization: token ${GH_TOKEN}" \ + -H "Accept: application/json" \ + "${GITEA_API_URL}/repos/${REPO}/commits/${SHA}/statuses?limit=100" \ + 2>/dev/null || echo "[]") + RESULT=$(printf '%s' "$STATUSES_JSON" | jq -r ' + # Filter to E2E Staging SaaS (full lifecycle) statuses. + # Match by leading workflow-name prefix so the "<job name> + # (<event>)" tail is irrelevant. Gitea emits the workflow + # name verbatim from the YAML `name:` field. + [.[] | select(.context | startswith("E2E Staging SaaS (full lifecycle) /"))] as $rows + | if ($rows | length) == 0 then + "none/none" + elif any($rows[]; .status == "pending") then + "in_progress/none" + elif any($rows[]; .status == "failure" or .status == "error") then + "completed/failure" + elif all($rows[]; .status == "success") then + "completed/success" + else + # Mixed / unknown — fall through to *) bucket below. + "completed/" + ($rows[0].status // "unknown") + end + ' 2>/dev/null || echo "none/none") echo "E2E Staging SaaS for ${SHA:0:7}: $RESULT" @@ -199,16 +240,13 @@ jobs: exit 1 ;; completed/cancelled) - # cancelled ≠ failure.
Per-SHA concurrency cancels older E2E - # runs when a newer push lands (memory: - # feedback_concurrency_group_per_sha) — the newer SHA will - # have its own E2E + promote chain. Treat the same as - # in_progress: defer without aborting, let the next E2E run - # promote when it lands. - # - # Caught 2026-05-05 02:03 on sha 31f9a5e — auto-promote - # blocked the whole chain because this case fell through to - # exit 1 instead of clean defer. + # GitHub-era only: cancelled ≠ failure. Gitea statuses + # don't expose a "cancelled" state — a per-SHA concurrency + # cancellation surfaces as `failure` or `error` on Gitea + # and is now handled by the failure branch above. This + # arm is kept for backwards compatibility / dual-host + # operation (if we ever add a non-Gitea fallback) but + # under the post-#75 flow it's unreachable. echo "proceed=false" >> "$GITHUB_OUTPUT" { echo "## ⏭ Auto-promote deferred — E2E Staging SaaS was cancelled" From 8885f7cd12ffedd3cfc22c65623889b155f2c94e Mon Sep 17 00:00:00 2001 From: devops-engineer Date: Thu, 7 May 2026 16:54:44 -0700 Subject: [PATCH 6/6] fix(ci): pin actions/upload-artifact + download-artifact to @v3 for Gitea compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit actions/upload-artifact@v4+ and download-artifact@v4+ use the GHES 3.10+ artifact protocol that Gitea Actions (act_runner v0.6 / Gitea 1.22.x) does NOT implement. Failure cite from PR #54 run 1325 jobs/2: ::error::@actions/artifact v2.0.0+, upload-artifact@v4+ and download-artifact@v4+ are not currently supported on GHES. Pinned all 3 references to v3.2.2 (latest v3) at SHA-pinned form for supply-chain hygiene, matching the existing `uses:` style in this repo. 
Affected workflows: - ci.yml (Canvas Next.js coverage upload, blocks `CI / Canvas (Next.js)` required check on every PR — was the merge-queue blocker for #53, #54, #69, #71, #76, #81) - e2e-staging-canvas.yml (Playwright report + screenshots on failure) No download-artifact callers in the repo, so v3-pin doesn't compose-break anywhere. Drop these pins post-Gitea-1.23+ when the v4 artifact protocol ships, or migrate to a Gitea-native action. Closes #210. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 8 +++++++- .github/workflows/e2e-staging-canvas.yml | 9 +++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6b447291..9350f114 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -235,7 +235,13 @@ jobs: run: npx vitest run --coverage - name: Upload coverage summary as artifact if: needs.changes.outputs.canvas == 'true' && always() - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + # Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses + # the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT + # implement, surfacing as `GHESNotSupportedError: @actions/artifact + # v2.0.0+, upload-artifact@v4+ and download-artifact@v4+ are not + # currently supported on GHES`. Drop this pin when Gitea ships + # the v4 protocol (tracked: post-Gitea-1.23 followup). 
+ uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2 with: name: canvas-coverage-${{ github.run_id }} path: canvas/coverage/ diff --git a/.github/workflows/e2e-staging-canvas.yml b/.github/workflows/e2e-staging-canvas.yml index 0bc152df..30a38e5f 100644 --- a/.github/workflows/e2e-staging-canvas.yml +++ b/.github/workflows/e2e-staging-canvas.yml @@ -139,7 +139,11 @@ jobs: - name: Upload Playwright report on failure if: failure() && needs.detect-changes.outputs.canvas == 'true' - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + # Pinned to v3 for Gitea act_runner v0.6 compatibility — v4+ uses + # the GHES 3.10+ artifact protocol that Gitea 1.22.x does NOT + # implement (see ci.yml upload step for the canonical error + # cite). Drop this pin when Gitea ships the v4 protocol. + uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2 with: name: playwright-report-staging path: canvas/playwright-report-staging/ @@ -147,7 +151,8 @@ jobs: - name: Upload screenshots on failure if: failure() && needs.detect-changes.outputs.canvas == 'true' - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + # Pinned to v3 for Gitea act_runner v0.6 compatibility (see above). + uses: actions/upload-artifact@c6a366c94c3e0affe28c06c8df20a878f24da3cf # v3.2.2 with: name: playwright-screenshots path: canvas/test-results/