name: Railway pin audit (drift detection)

# Daily audit of Railway env vars for drift-prone image-tag pins —
# automation-cadence layer over the detection script + regression test
# shipped in PR #2168 (#2001 closure).
#
# Background: on 2026-04-24 a stale `:staging-a14cf86` SHA pin in CP's
# TENANT_IMAGE caused 3+ hours of E2E failure with the appearance that
# "every fix didn't propagate" — really the tenant image was so old it
# didn't read the env vars those fixes produced. The audit script
# (scripts/ops/audit-railway-sha-pins.sh) flags drift; this workflow
# runs the same check unattended on a daily cron.
#
# Cadence: once a day, 13:00 UTC (06:00 PT). Daily is the right
# cadence for variables-tier config — Railway env var changes are
# deliberate operator actions, low-frequency. Hourly would risk
# Railway API rate-limit surprises and is overkill for the change rate.
#
# Issue-on-failure: drift triggers a priority-high issue, mirroring
# .github/workflows/e2e-staging-sanity.yml's pattern. Drift is
# medium-priority "config slipped, fix at next ops window," not
# active-outage paging.
#
# Secret hardening: per feedback_schedule_vs_dispatch_secrets_hardening,
# the schedule trigger HARD-FAILS on missing RAILWAY_AUDIT_TOKEN
# (silent-success on schedule was the failure-mode class that bit the
# team before; cron firing without checking anything is worse than no
# cron). The workflow_dispatch trigger SOFT-SKIPS on missing secret so
# an operator can dry-run the workflow shape during initial provisioning
# without tripping a fake red.
on:
  schedule:
    # Once a day at 13:00 UTC.
    - cron: '0 13 * * *'
  workflow_dispatch:

# Never run two audits at once; a queued run waits instead of cancelling
# the one in flight, so an in-progress issue update is not interrupted.
concurrency:
  group: railway-pin-audit
  cancel-in-progress: false

# issues: write  — file / comment on / close the drift issue.
# contents: read — check out the audit script.
permissions:
  issues: write
  contents: read

jobs:
  audit:
    name: Audit Railway env vars for drift-prone pins
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          # No later step needs git credentials, so don't leave the token
          # in .git/config for the rest of the job.
          persist-credentials: false

      - name: Verify RAILWAY_AUDIT_TOKEN present
        id: secret_check
        # Schedule trigger: hard-fail when the secret is missing, so token
        # rot shows up as an explicit red run instead of a cron that fires
        # without checking anything.
        # Dispatch trigger: soft-skip — operator may be dry-running the
        # workflow shape before provisioning the secret. Logged as a
        # workflow notice, not a failure.
        # Output: have_secret ('true' | 'false') gates every later step.
        env:
          RAILWAY_AUDIT_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
          EVENT_NAME: ${{ github.event_name }}
        run: |
          set -euo pipefail
          if [ -n "${RAILWAY_AUDIT_TOKEN:-}" ]; then
            echo "have_secret=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          echo "have_secret=false" >> "$GITHUB_OUTPUT"
          if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
            echo "::notice::RAILWAY_AUDIT_TOKEN not configured — soft-skipping (manual dispatch)"
            exit 0
          fi
          echo "::error::RAILWAY_AUDIT_TOKEN secret missing — schedule trigger requires it. Provision the token (read-only \`variables\` scope on the molecule-platform Railway project) and store as repo secret RAILWAY_AUDIT_TOKEN."
          exit 1

      - name: Install Railway CLI
        if: steps.secret_check.outputs.have_secret == 'true'
        # NOTE(review): the installer is fetched from railway.com and piped
        # to `sh` with no version or checksum pin, so each run installs
        # whatever the script currently serves. TODO: pin to the Railway
        # CLI version documented by the audit script and bump in tandem.
        run: |
          set -euo pipefail
          curl -fsSL https://railway.com/install.sh | sh
          # The installer drops the binary in ~/.railway/bin
          echo "$HOME/.railway/bin" >> "$GITHUB_PATH"

      - name: Verify Railway CLI authenticated
        if: steps.secret_check.outputs.have_secret == 'true'
        # NOTE(review): Railway's CLI distinguishes RAILWAY_TOKEN (project
        # token) from RAILWAY_API_TOKEN (account/team token), and `whoami`
        # / `link` may behave differently depending on which kind
        # RAILWAY_AUDIT_TOKEN is — TODO confirm against the Railway docs.
        env:
          RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
        run: |
          set -euo pipefail
          # `railway whoami` exits non-zero when the token is
          # unauthenticated or doesn't have any project access.
          if ! railway whoami >/dev/null 2>&1; then
            echo "::error::Railway CLI failed to authenticate with RAILWAY_AUDIT_TOKEN — token may be revoked or scoped incorrectly"
            exit 2
          fi

      - name: Link molecule-platform project
        if: steps.secret_check.outputs.have_secret == 'true'
        env:
          RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
        # Project ID from reference_production_stack: molecule-platform
        # / 7ccc8c68-61f4-42ab-9be5-586eeee11768. Linking is per-process,
        # so we re-link in this CI shell (the audit script comment says
        # it deliberately doesn't chdir for you because the linked
        # project's identity matters).
        run: |
          set -euo pipefail
          railway link --project 7ccc8c68-61f4-42ab-9be5-586eeee11768

      - name: Run drift audit
        if: steps.secret_check.outputs.have_secret == 'true'
        id: audit
        # Outputs:
        #   rc  — exit code of the audit script
        #   log — combined stdout/stderr of the script (multiline)
        env:
          RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
        run: |
          # Errexit is disabled on purpose: the script's exit code must be
          # recorded as an output before this step decides how to exit.
          set +e
          bash scripts/ops/audit-railway-sha-pins.sh 2>&1 | tee /tmp/audit.log
          rc=${PIPESTATUS[0]}
          echo "rc=$rc" >> "$GITHUB_OUTPUT"

          # Capture the audit log for the issue body as a multiline output.
          # The delimiter is randomized so a line of log text can never
          # terminate the value early; the static prefix keeps it non-empty
          # even if the random part fails to generate.
          delim="AUDIT_LOG_EOF_$(dd if=/dev/urandom bs=15 count=1 status=none | base64)"
          {
            echo "log<<${delim}"
            # printf + command substitution guarantees exactly one trailing
            # newline, so the delimiter always lands on its own line.
            printf '%s\n' "$(cat /tmp/audit.log)"
            echo "${delim}"
          } >> "$GITHUB_OUTPUT"

          # Exit codes from the script:
          #   0 — no drift; workflow goes green
          #   1 — drift detected; we'll file an issue and fail the run
          #   2 — railway CLI unauthenticated / project unlinked; fail
          # Anything else: also fail.
          case "$rc" in
            0)
              exit 0
              ;;
            1)
              echo "::warning::Drift-prone pin(s) detected — issue will be filed"
              exit 1
              ;;
            2)
              echo "::error::Railway CLI auth/link failed mid-script — token or project ID drift"
              exit 2
              ;;
            *)
              echo "::error::Unexpected audit rc=$rc"
              exit 1
              ;;
          esac

      - name: Open / update drift issue
        # Only for genuine drift (rc == 1); auth/link failures (rc == 2)
        # fail the run without filing an issue.
        if: failure() && steps.audit.outputs.rc == '1'
        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3  # v9.0.0
        env:
          # Passed via env rather than interpolated into the script so log
          # content is never evaluated as JavaScript.
          AUDIT_LOG: ${{ steps.audit.outputs.log }}
        with:
          script: |
            const title = "🚨 Railway env-var drift detected";
            const runURL = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;

            // GitHub rejects issue/comment bodies longer than 65536
            // characters; cap the log so an unusually long audit output
            // cannot make the API call fail and leave drift unreported.
            const MAX_LOG_CHARS = 60000;
            const rawLog = process.env.AUDIT_LOG || '(log unavailable)';
            const auditLog = rawLog.length > MAX_LOG_CHARS
              ? `${rawLog.slice(0, MAX_LOG_CHARS)}\n… (truncated — see the workflow run for the full log)`
              : rawLog;

            const body =
              `Daily Railway pin audit found drift-prone image-tag pins in the molecule-platform Railway project.\n\n` +
              `**What this means:** an env var (likely on \`controlplane\`) is pinned to a SHA-shaped or semver tag instead of a floating tag. ` +
              `Same pattern that caused the 2026-04-24 TENANT_IMAGE incident — fix-PRs land but the running service doesn't pick them up.\n\n` +
              `**Recovery:** open the Railway dashboard, replace the flagged value with a floating tag (\`:staging-latest\`, \`:main\`) unless the pin is intentional and documented in the ops runbook.\n\n` +
              `**Audit output:**\n\n\`\`\`\n${auditLog}\n\`\`\`\n\n` +
              `Run: ${runURL}\n\n` +
              `Closes automatically when a subsequent daily run reports clean.`;

            const { data: existing } = await github.rest.issues.listForRepo({
              owner: context.repo.owner,
              repo: context.repo.repo,
              state: 'open',
              labels: 'railway-drift',
              per_page: 100,
            });

            // The issues endpoint also returns pull requests; only a real
            // issue with the exact title counts as the tracking issue.
            const match = existing.find(i => !i.pull_request && i.title === title);

            if (match) {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: match.number,
                body: `Still drifting. ${runURL}\n\n\`\`\`\n${auditLog}\n\`\`\``,
              });
            } else {
              await github.rest.issues.create({
                owner: context.repo.owner,
                repo: context.repo.repo,
                title,
                body,
                labels: ['railway-drift', 'bug', 'priority-high'],
              });
            }

      - name: Close stale drift issue on clean run
        # When a previously-flagged drift gets fixed by an operator,
        # the next daily run goes green. Close any open `railway-drift`
        # issue with a confirmation comment so the queue doesn't carry
        # stale ones.
        if: success() && steps.audit.outputs.rc == '0'
        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3  # v9.0.0
        with:
          script: |
            const runURL = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;

            const { data: existing } = await github.rest.issues.listForRepo({
              owner: context.repo.owner,
              repo: context.repo.repo,
              state: 'open',
              labels: 'railway-drift',
              per_page: 100,
            });

            // The issues endpoint also returns pull requests; skip them so
            // a PR that happens to carry the label is never closed here.
            const driftIssues = existing.filter(i => !i.pull_request);

            for (const issue of driftIssues) {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: issue.number,
                body: `Daily audit clean — drift resolved. ${runURL}`,
              });
              await github.rest.issues.update({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: issue.number,
                state: 'closed',
                state_reason: 'completed',
              });
            }