# File: .github/workflows/auto-sync-canary.yml
name: Auto-sync canary — AUTO_SYNC_TOKEN rotation drift

# Synthetic health check for the AUTO_SYNC_TOKEN secret consumed by
# auto-sync-main-to-staging.yml (PR #66) and publish-workspace-server-image.yml.
#
# ============================================================
# Why this workflow exists
# ============================================================
#
# PR #66 fixed auto-sync (replaced GitHub-era `gh pr create` — which
# 405s on Gitea's GraphQL endpoint — with a direct git push from the
# `devops-engineer` persona's `AUTO_SYNC_TOKEN`). Hostile self-review
# weakest spot #3 of that PR:
#
#   "Token rotation silently breaks auto-sync. If AUTO_SYNC_TOKEN is
#   rotated without updating the repo secret, every push to main
#   fails red on the auto-sync push step. The workflow surfaces the
#   failure mode in the step summary (failure mode B in the header),
#   but there's no proactive monitoring."
#
# Detection latency under the status quo: rotation is only caught on
# the next push to `main`. During quiet periods (no main push for
# many hours) the staging-superset-of-main invariant silently breaks.
#
# This workflow closes the gap: every 6 hours, it fires the auth
# surface that auto-sync depends on and emits a red workflow status
# if AUTO_SYNC_TOKEN has drifted out of validity.
#
# ============================================================
# What this checks (Option B — read-only verify)
# ============================================================
#
# 1. `GET /api/v1/user` against Gitea with the token → validates the
#    token authenticates AND resolves to `devops-engineer` (catches
#    the case where the token was regenerated under a different
#    persona by mistake).
# 2. `GET /api/v1/repos/molecule-ai/molecule-core` with the token →
#    validates the token has `read:repository` scope on this repo
#    (the v2 scope contract — see saved memory
#    `reference_persona_token_v2_scope`).
# 3. `git ls-remote https://oauth2:<token>@<host>/.../molecule-core
#    refs/heads/staging` → validates the EXACT HTTPS basic-auth path
#    that `actions/checkout` uses inside auto-sync-main-to-staging.yml.
#    Without this we'd be testing the API surface but not the git
#    HTTPS surface; they don't share an auth code path on Gitea.
#
# Each step exits non-zero with an actionable error message if it
# fails. The workflow status itself is the operator-facing surface.
#
# ============================================================
# What this does NOT check (intentional)
# ============================================================
#
# - **Branch-protection authz** (failure mode C in auto-sync header):
#   would require an actual write to staging. Already monitored by
#   `branch-protection-drift.yml` daily. Don't duplicate.
# - **Conflict resolution** (failure mode A): a real conflict is data-
#   driven, not auth-driven; can't synthesise it without polluting
#   staging. Already surfaces immediately on the next main push.
# - **Concurrency** (failure mode D): handled by workflow concurrency
#   group on auto-sync, not a credential issue.
#
# ============================================================
# Why Option B (read-only) and not the alternatives
# ============================================================
#
# Considered + rejected (see issue #72 for full write-up):
#
# - **Option A — full auto-sync on schedule**: every run creates a
#   no-op merge commit on staging when main hasn't advanced. 4 noise
#   commits/day. And races the real `push:` trigger when main has
#   advanced. Rejected.
#
# - **Option C — push to dedicated `auto-sync-canary` branch**: would
#   exercise authz too, but adds branch noise on Gitea AND requires
#   maintaining a second branch protection (or expanding staging's
#   whitelist to a junk branch). Authz already covered by
#   `branch-protection-drift.yml`. Rejected.
#
# Prior art for the chosen Option B shape:
# - Cloudflare's `/user/tokens/verify` endpoint (read-only auth
#   probe explicitly designed for credential canaries).
# - AWS Secrets Manager rotation Lambda's `testSecret` step (auth
#   probe before promoting AWSPENDING → AWSCURRENT).
# - HashiCorp Vault's `vault token lookup` for renewal canaries.
#
# ============================================================
# Operator runbook — what to do when this workflow goes RED
# ============================================================
#
# 1. **Identify which step failed**:
#    - Step "Verify token authenticates as devops-engineer" red →
#      token is invalid OR resolves to wrong persona.
#    - Step "Verify token has repo read scope" red → token valid but
#      stripped of `read:repository` scope (or repo perms changed).
#    - Step "Verify git HTTPS auth path resolves staging tip" red →
#      API works but git HTTPS auth path is broken (rare; usually
#      means a Gitea config drift, not a token issue).
#    - Any API probe reporting "HTTP 000" → Gitea could not be
#      reached at all (DNS/TLS/connect/timeout). Token validity is
#      unknown; restore connectivity and re-run BEFORE re-issuing.
#
# 2. **Re-issue the token** on the operator host:
#    ```
#    ssh root@5.78.80.188 'docker exec --user git molecule-gitea-1 \
#      gitea admin user generate-access-token \
#      --username devops-engineer \
#      --token-name persona-devops-engineer-vN \
#      --scopes "read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc"'
#    ```
#    Update `/etc/molecule-bootstrap/agent-secrets.env` in place
#    (per `feedback_unified_credentials_file`). The previous token
#    file lands at `.bak.<timestamp>`.
#
# 3. **Update the repo Actions secret** at:
#    Settings → Secrets and variables → Actions → AUTO_SYNC_TOKEN
#    Paste the new token. (Don't echo it in chat — but per
#    `feedback_passwords_in_chat_are_burned`, a paste in a 1:1
#    Claude session is within trust boundary.)
#
# 4. **Re-run this canary** via workflow_dispatch. Confirm GREEN.
#
# 5. **Backfill any missed main → staging syncs** by re-running
#    `auto-sync-main-to-staging.yml` from its workflow_dispatch
#    surface, OR by pushing an empty commit to main (if you'd
#    rather force a real trigger).
#
# ============================================================
# Security notes
# ============================================================
#
# - Token usage: read-only (`GET /api/v1/user`, `GET /api/v1/repos/...`,
#   `git ls-remote`). No write paths. Same blast-radius profile as
#   `actions/checkout` on a public repo.
# - The token NEVER appears in logs: every `curl` uses a header
#   variable, never inline; the `git ls-remote` URL builds the
#   `oauth2:$TOKEN@host` form into a single shell variable that's
#   never echoed. GitHub Actions secret-masking covers anything that
#   does slip through.
# - No new token introduced — same `AUTO_SYNC_TOKEN` the workflow
#   under monitor uses. Per least-privilege we deliberately do NOT
#   broaden scope for the canary.

on:
  schedule:
    # Every 6 hours at :17 (offsets the cron herd at :00). Justification
    # from issue #72: cheap to run (~5s wall-clock, no quota), 3h average
    # detection latency, 6h max. 1h would be 6× the runs (24/day vs
    # 4/day) for marginal benefit; daily would be 4× longer latency and
    # worse than status quo on a quiet-main day.
    - cron: '17 */6 * * *'
  workflow_dispatch:

# No concurrency group needed — the canary is read-only and idempotent.
# Two parallel runs (e.g. operator dispatch during a scheduled tick) are
# harmless: same result, doubled HTTPS calls, no shared state.

permissions:
  contents: read

jobs:
  verify-token:
    name: Verify AUTO_SYNC_TOKEN validity
    runs-on: ubuntu-latest
    # 2 min surfaces hangs (Gitea API stall, DNS issue) within one
    # cron interval. Realistic worst case is ~10s: 2 curls + 1 git
    # ls-remote, each capped by the explicit timeouts below (30s
    # apiece, so at most ~90s even if every probe stalls).
    timeout-minutes: 2

    env:
      # Pinned in env so individual steps can read it without
      # repeating the secret reference. GitHub masks the value in
      # logs automatically.
      AUTO_SYNC_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
      EXPECTED_PERSONA: devops-engineer
      GITEA_HOST: git.moleculesai.app
      REPO_PATH: molecule-ai/molecule-core

    steps:
      - name: Verify AUTO_SYNC_TOKEN secret is configured
        # Schedule-vs-dispatch behaviour split, per
        # `feedback_schedule_vs_dispatch_secrets_hardening`:
        #
        # - schedule: hard-fail when the secret is missing. The
        #   whole point of the canary is to surface drift; soft-
        #   skipping on missing-secret would make the canary
        #   itself drift-invisible (sweep-cf-orphans #2088 lesson).
        # - workflow_dispatch: hard-fail too — there's no scenario
        #   where an operator wants this canary to silently no-op.
        #   The workflow has no other ad-hoc utility; if you ran
        #   it, you wanted the answer.
        run: |
          if [ -z "${AUTO_SYNC_TOKEN}" ]; then
            echo "::error::AUTO_SYNC_TOKEN secret is not set on this repo." >&2
            echo "::error::Set it at Settings → Secrets and variables → Actions." >&2
            echo "::error::Without it, auto-sync-main-to-staging.yml will fail every push to main." >&2
            exit 1
          fi
          echo "AUTO_SYNC_TOKEN is configured (value masked)."

      - name: Verify token authenticates as ${{ env.EXPECTED_PERSONA }}
        # Calls Gitea's `/api/v1/user` — the canonical
        # auth-probe-with-no-side-effects endpoint (mirrors
        # Cloudflare's /user/tokens/verify).
        #
        # Failure surfaces:
        # - HTTP 000: Gitea unreachable (DNS/TLS/connect/timeout);
        #   says nothing about the token itself.
        # - HTTP 401: token invalid (rotated, revoked, or never
        #   correctly registered).
        # - HTTP 200 but username != devops-engineer: token was
        #   regenerated under the wrong persona — this would let
        #   auth pass but commit attribution would be wrong, and
        #   branch-protection authz would fail because only
        #   `devops-engineer` is whitelisted.
        run: |
          set -euo pipefail
          response_file="$(mktemp)"
          # The body may carry account details; never leave it behind
          # on a runner that outlives the job.
          trap 'rm -f -- "$response_file"' EXIT

          # `--max-time 30`: full call ceiling. `--connect-timeout 10`:
          # DNS + TCP. `-w "%{http_code}"` to a separate var (not
          # response body — see feedback_curl_status_capture_pollution).
          # The fallback sits OUTSIDE the substitution: on a transport
          # failure curl itself already prints "000", so an inner
          # `|| echo "000"` would yield "000000".
          status=$(curl -sS -o "$response_file" \
            --max-time 30 --connect-timeout 10 \
            -w "%{http_code}" \
            -H "Authorization: token ${AUTO_SYNC_TOKEN}" \
            -H "Accept: application/json" \
            "https://${GITEA_HOST}/api/v1/user") || status="000"

          if [ "$status" = "000" ]; then
            # No HTTP response at all — do not send the operator off to
            # rotate a token that may be perfectly healthy.
            echo "::error::Could not reach https://${GITEA_HOST}/api/v1/user (DNS, TLS, connection failure or timeout)." >&2
            echo "::error::Token validity is UNKNOWN: this is a connectivity failure, not evidence of rotation. Check Gitea availability, then re-run." >&2
            exit 1
          fi

          if [ "$status" != "200" ]; then
            echo "::error::Token rotation suspected: GET /api/v1/user returned HTTP $status (expected 200)." >&2
            echo "::error::Likely cause: AUTO_SYNC_TOKEN has been rotated/revoked on Gitea but the repo Actions secret was not updated." >&2
            echo "::error::Runbook: see header comment of this workflow file." >&2
            # Print response body but redact anything that looks like a token.
            sed -E 's/[A-Fa-f0-9]{32,}/<redacted>/g' "$response_file" >&2 || true
            exit 1
          fi

          # A malformed body must end in the actionable error below,
          # not in a bare Python traceback under `set -e`.
          username=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1])).get('login',''))" "$response_file") || username=""
          if [ "$username" != "${EXPECTED_PERSONA}" ]; then
            echo "::error::Token resolves to user '$username', expected '${EXPECTED_PERSONA}'." >&2
            echo "::error::AUTO_SYNC_TOKEN must be the devops-engineer persona PAT (not founder PAT, not another persona)." >&2
            echo "::error::Auto-sync push will fail because only 'devops-engineer' is whitelisted on staging branch protection." >&2
            exit 1
          fi
          echo "Token authenticates as: $username ✓"

      - name: Verify token has repo read scope
        # `GET /api/v1/repos/<owner>/<repo>` requires `read:repository`
        # on the persona's v2 scope contract. If the scope was
        # narrowed/dropped on rotation we catch it here, before the
        # next main push reveals it via a checkout failure.
        run: |
          set -euo pipefail
          response_file="$(mktemp)"
          trap 'rm -f -- "$response_file"' EXIT

          # Fallback outside the substitution so a transport failure
          # yields exactly "000" (curl's own -w output), not "000000".
          status=$(curl -sS -o "$response_file" \
            --max-time 30 --connect-timeout 10 \
            -w "%{http_code}" \
            -H "Authorization: token ${AUTO_SYNC_TOKEN}" \
            -H "Accept: application/json" \
            "https://${GITEA_HOST}/api/v1/repos/${REPO_PATH}") || status="000"

          if [ "$status" = "000" ]; then
            echo "::error::Could not reach https://${GITEA_HOST}/api/v1/repos/${REPO_PATH} (DNS, TLS, connection failure or timeout)." >&2
            echo "::error::Token scope is UNKNOWN: this is a connectivity failure, not a scope problem. Check Gitea availability, then re-run." >&2
            exit 1
          fi

          if [ "$status" != "200" ]; then
            echo "::error::Token lacks read:repository scope on ${REPO_PATH}: HTTP $status." >&2
            echo "::error::Auto-sync's actions/checkout step will fail with this token." >&2
            echo "::error::Re-issue with v2 scope contract: read:repository,write:repository,read:user,read:organization,read:issue,write:issue,read:notification,read:misc" >&2
            sed -E 's/[A-Fa-f0-9]{32,}/<redacted>/g' "$response_file" >&2 || true
            exit 1
          fi
          echo "Token has read:repository on ${REPO_PATH} ✓"

      - name: Verify git HTTPS auth path resolves staging tip
        # Final probe: exercise the EXACT auth path that
        # `actions/checkout` uses in auto-sync-main-to-staging.yml.
        # Gitea's API and git-HTTPS surfaces share the token but
        # take different code paths internally — historically (#173)
        # the API path was healthy while git-HTTPS rejected, so
        # checking only the API would have given false-green.
        #
        # `git ls-remote --refs` is read-only: lists remote refs
        # without fetching pack data. ~1KB on the wire.
        env:
          # Make git fail fast instead of waiting on an interactive
          # credential prompt when the server rejects the token.
          GIT_TERMINAL_PROMPT: "0"
        run: |
          set -euo pipefail
          # Token is in $AUTO_SYNC_TOKEN (job-level env). Compose the
          # URL as a local var that's never echoed; the token is only
          # ever an env-var interpolation, subject to automatic
          # secret-masking on output.
          url="https://oauth2:${AUTO_SYNC_TOKEN}@${GITEA_HOST}/${REPO_PATH}"

          # `timeout 30s` covers the (rare) case where the network
          # path stalls without curl-style timeout flags — git
          # honours GIT_HTTP_LOW_SPEED_TIME/LIMIT but not a hard wall.
          # stderr is folded in so the failure branch can show git's
          # own diagnostics.
          if ! out=$(timeout 30s git ls-remote --refs "$url" refs/heads/staging 2>&1); then
            # Redact any accidental token leak in the error output.
            redacted=$(printf '%s\n' "$out" | sed -E "s|oauth2:[^@]+@|oauth2:<redacted>@|g")
            echo "::error::git ls-remote against staging failed via the AUTO_SYNC_TOKEN HTTPS auth path." >&2
            echo "::error::API probes passed but git HTTPS surface is broken — likely Gitea config drift, not a token rotation." >&2
            printf '%s\n' "$redacted" >&2
            exit 1
          fi

          # Sanity-check: expect a line "<sha> refs/heads/staging".
          # Because stderr was merged above, pick out that one line
          # rather than trusting the whole capture — a stray git
          # warning must not end up inside the reported SHA.
          ref_line=$(printf '%s\n' "$out" \
            | grep -E '^[0-9a-f]{40}[[:space:]]+refs/heads/staging$' \
            | head -n 1 || true)
          if [ -z "$ref_line" ]; then
            echo "::error::ls-remote returned unexpected shape:" >&2
            printf '%s\n' "$out" | sed -E "s|oauth2:[^@]+@|oauth2:<redacted>@|g" >&2
            exit 1
          fi

          # Strip everything from the first whitespace on → bare SHA.
          staging_sha="${ref_line%%[[:space:]]*}"
          echo "git HTTPS auth path resolves staging → ${staging_sha:0:8} ✓"

      - name: Summarise canary result
        # Everything passed — surface a green summary. (Failures
        # already wrote ::error:: lines and exited above; if we got
        # here, all three probes passed.)
        run: |
          {
            echo "## Auto-sync canary: GREEN"
            echo ""
            echo "AUTO_SYNC_TOKEN is healthy:"
            echo "- Authenticates as \`${EXPECTED_PERSONA}\` ✓"
            echo "- Has \`read:repository\` scope on \`${REPO_PATH}\` ✓"
            echo "- Git HTTPS auth path resolves \`refs/heads/staging\` ✓"
            echo ""
            echo "Auto-sync main → staging will succeed on the next push to main."
            echo "If this canary ever goes RED, see the runbook in this workflow's header."
          } >> "$GITHUB_STEP_SUMMARY"