208 lines
9.4 KiB
YAML
208 lines
9.4 KiB
YAML
# Workflow display name.
name: 'Railway pin audit (drift detection)'
|
|
|
|
# Daily audit of Railway env vars for drift-prone image-tag pins —
|
|
# automation-cadence layer over the detection script + regression test
|
|
# shipped in PR #2168 (#2001 closure).
|
|
#
|
|
# Background: on 2026-04-24 a stale `:staging-a14cf86` SHA pin in CP's
|
|
# TENANT_IMAGE caused 3+ hours of E2E failure with the appearance that
|
|
# "every fix didn't propagate" — really the tenant image was so old it
|
|
# didn't read the env vars those fixes produced. The audit script
|
|
# (scripts/ops/audit-railway-sha-pins.sh) flags drift; this workflow
|
|
# runs the same check unattended on a daily cron.
|
|
#
|
|
# Cadence: once a day, 13:00 UTC (06:00 PT). Daily is the right
|
|
# cadence for variables-tier config — Railway env var changes are
|
|
# deliberate operator actions, low-frequency. Hourly would risk
|
|
# Railway API rate-limit surprises and is overkill for the change rate.
|
|
#
|
|
# Issue-on-failure: drift opens (or comments on) an issue labeled
# `priority-high`, mirroring .github/workflows/e2e-staging-sanity.yml's
# pattern. Despite the label, drift is medium-priority "config slipped,
# fix at next ops window," not active-outage paging.
|
|
#
|
|
# Secret hardening: per feedback_schedule_vs_dispatch_secrets_hardening,
|
|
# the schedule trigger HARD-FAILS on missing RAILWAY_AUDIT_TOKEN
|
|
# (silent-success on schedule was the failure-mode class that bit the
|
|
# team before; cron firing without checking anything is worse than no
|
|
# cron). The workflow_dispatch trigger SOFT-SKIPS on missing secret so
|
|
# an operator can dry-run the workflow shape during initial provisioning
|
|
# without tripping a fake red.
|
|
|
|
# Triggers: one scheduled run per day at 13:00 UTC, plus manual dispatch.
on:
  workflow_dispatch:
  schedule:
    - cron: "0 13 * * *"

# Every run shares a single concurrency group; a run that is already in
# progress is not cancelled when another one is triggered.
concurrency:
  cancel-in-progress: false
  group: railway-pin-audit

# Token scopes granted to the job: read repository contents, write issues
# (the issue-filing and issue-closing steps need the latter).
permissions:
  contents: read
  issues: write
|
|
|
|
jobs:
  # Single job: check out the repo, then run the Railway audit steps.
  audit:
    name: Audit Railway env vars for drift-prone pins
    runs-on: ubuntu-latest
    # Hard cap on the whole job's runtime.
    timeout-minutes: 10

    steps:
      # Checkout is pinned to a full commit SHA; the trailing comment
      # records the release tag that SHA corresponds to. A later step
      # runs scripts/ops/audit-railway-sha-pins.sh from this checkout.
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
|
|
|
- name: Verify RAILWAY_AUDIT_TOKEN present
  id: secret_check
  # Publishes `have_secret` (true/false) as a step output; the later
  # Railway steps are gated on it.
  #
  # Token missing on a non-dispatch event (i.e. the schedule): hard-fail,
  # so a cron run can never go green without having checked anything.
  # Token missing on workflow_dispatch: soft-skip with a notice, so an
  # operator can dry-run the workflow shape before provisioning the
  # secret.
  env:
    EVENT_NAME: ${{ github.event_name }}
    RAILWAY_AUDIT_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
  run: |
    set -euo pipefail

    # Record the token state once, whatever happens next.
    token_state=false
    [ -z "${RAILWAY_AUDIT_TOKEN:-}" ] || token_state=true
    echo "have_secret=${token_state}" >> "$GITHUB_OUTPUT"

    # Token present: nothing more to decide.
    if [ "$token_state" = "true" ]; then
      exit 0
    fi

    # Token absent: the outcome depends on what triggered the run.
    case "$EVENT_NAME" in
      workflow_dispatch)
        echo "::notice::RAILWAY_AUDIT_TOKEN not configured — soft-skipping (manual dispatch)"
        exit 0
        ;;
      *)
        echo "::error::RAILWAY_AUDIT_TOKEN secret missing — schedule trigger requires it. Provision the token (read-only \`variables\` scope on the molecule-platform Railway project) and store as repo secret RAILWAY_AUDIT_TOKEN."
        exit 1
        ;;
    esac
|
|
|
|
- name: Install Railway CLI
  if: steps.secret_check.outputs.have_secret == 'true'
  # NOTE(review): nothing here is pinned. The step downloads and runs
  # whatever installer railway.com serves at run time, so neither the
  # script nor the CLI version is fixed. If the audit script documents
  # a required Railway CLI version, pin it here in tandem — TODO confirm
  # the installer's supported way of selecting a version.
  run: |
    set -euo pipefail
    curl -fsSL https://railway.com/install.sh | sh

    # Expose ~/.railway/bin (where the installer is expected to drop the
    # binary) to all later steps.
    echo "$HOME/.railway/bin" >> "$GITHUB_PATH"

    # $GITHUB_PATH only takes effect in subsequent steps, so extend PATH
    # locally and confirm the binary resolves. Without this check a
    # broken install only shows up in the next step, where the discarded
    # `railway whoami` output makes it look like an auth failure.
    export PATH="$HOME/.railway/bin:$PATH"
    if ! command -v railway >/dev/null 2>&1; then
      echo "::error::Railway CLI not found on PATH after install — the installer failed or changed its install location"
      exit 1
    fi

    # Log the installed version so each run records which CLI it used.
    railway --version
|
|
|
|
- name: Verify Railway CLI authenticated
  if: steps.secret_check.outputs.have_secret == 'true'
  env:
    RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
  run: |
    set -euo pipefail

    # Auth probe: `railway whoami` is expected to exit non-zero when the
    # token is unauthenticated or has no project access. Its output is
    # discarded; only the exit status matters. Any failure is reported
    # as an auth problem and the step exits 2.
    railway whoami >/dev/null 2>&1 || {
      echo "::error::Railway CLI failed to authenticate with RAILWAY_AUDIT_TOKEN — token may be revoked or scoped incorrectly"
      exit 2
    }
|
|
|
|
- name: Link molecule-platform project
  if: steps.secret_check.outputs.have_secret == 'true'
  env:
    RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
  # The project ID comes from reference_production_stack
  # (molecule-platform). The link is redone in this CI shell because,
  # per the audit script's own comment, the script deliberately does
  # not pick a project for you — the linked project's identity matters.
  run: |
    set -euo pipefail

    # molecule-platform
    project_id='7ccc8c68-61f4-42ab-9be5-586eeee11768'
    railway link --project "$project_id"
|
|
|
|
- name: Run drift audit
  if: steps.secret_check.outputs.have_secret == 'true'
  id: audit
  env:
    RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
  # Runs the audit script, mirrors its output to the job log, and
  # publishes two step outputs:
  #   rc  — the script's exit code
  #   log — the captured script output (used in the issue body)
  run: |
    # errexit is off on purpose: the script's exit code has to be
    # captured and published before this step decides how to exit.
    set +e
    bash scripts/ops/audit-railway-sha-pins.sh 2>&1 | tee /tmp/audit.log
    # PIPESTATUS[0] is the script's status, not tee's.
    rc=${PIPESTATUS[0]}
    echo "rc=$rc" >> "$GITHUB_OUTPUT"

    # Capture the audit log for the issue body.
    # The log is arbitrary text, so a fixed heredoc delimiter is unsafe:
    # a log line equal to the delimiter would end the value early and
    # let the remaining lines be parsed as extra outputs. Use a
    # per-run random delimiter instead.
    delim="AUDIT_EOF_$(od -An -N8 -tx1 /dev/urandom | tr -d ' \n')"
    {
      echo "log<<${delim}"
      cat /tmp/audit.log
      # The terminator must sit on a line of its own. If the log does
      # not end in a newline, add one ($(...) strips a trailing
      # newline, so a non-empty result means the last byte is not one).
      if [ -n "$(tail -c 1 /tmp/audit.log)" ]; then
        echo
      fi
      echo "${delim}"
    } >> "$GITHUB_OUTPUT"

    # Exit codes from the script:
    #   0 — no drift; workflow goes green
    #   1 — drift detected; we'll file an issue and fail the run
    #   2 — railway CLI unauthenticated / project unlinked; fail
    # Anything else: also fail.
    case "$rc" in
      0) exit 0 ;;
      1) echo "::warning::Drift-prone pin(s) detected — issue will be filed"; exit 1 ;;
      2) echo "::error::Railway CLI auth/link failed mid-script — token or project ID drift"; exit 2 ;;
      *) echo "::error::Unexpected audit rc=$rc"; exit 1 ;;
    esac
|
|
|
|
- name: Open / update drift issue
  # Runs only when the job has failed AND the audit step reported rc 1
  # (drift). Other failure codes do not file an issue.
  if: failure() && steps.audit.outputs.rc == '1'
  uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
  env:
    AUDIT_LOG: ${{ steps.audit.outputs.log }}
  with:
    script: |
      // Files one tracking issue per drift episode: if an open issue
      // with the `railway-drift` label and this exact title exists,
      // comment on it; otherwise create it.
      const title = "🚨 Railway env-var drift detected";
      const repoRef = { owner: context.repo.owner, repo: context.repo.repo };
      const runURL = `https://github.com/${repoRef.owner}/${repoRef.repo}/actions/runs/${context.runId}`;

      // Audit output wrapped in a Markdown code fence; shared by the
      // new-issue body and the follow-up comment.
      const auditLog = process.env.AUDIT_LOG || '(log unavailable)';
      const fencedLog = "```\n" + auditLog + "\n```";

      const body = [
        `Daily Railway pin audit found drift-prone image-tag pins in the molecule-platform Railway project.\n\n`,
        `**What this means:** an env var (likely on \`controlplane\`) is pinned to a SHA-shaped or semver tag instead of a floating tag. `,
        `Same pattern that caused the 2026-04-24 TENANT_IMAGE incident — fix-PRs land but the running service doesn't pick them up.\n\n`,
        `**Recovery:** open the Railway dashboard, replace the flagged value with a floating tag (\`:staging-latest\`, \`:main\`) unless the pin is intentional and documented in the ops runbook.\n\n`,
        `**Audit output:**\n\n${fencedLog}\n\n`,
        `Run: ${runURL}\n\n`,
        `Closes automatically when a subsequent daily run reports clean.`,
      ].join('');

      const { data: openDrift } = await github.rest.issues.listForRepo({
        ...repoRef,
        state: 'open',
        labels: 'railway-drift',
      });
      const tracked = openDrift.find((candidate) => candidate.title === title);

      // No tracking issue yet: open one and stop.
      if (!tracked) {
        await github.rest.issues.create({
          ...repoRef,
          title,
          body,
          labels: ['railway-drift', 'bug', 'priority-high'],
        });
        return;
      }

      // Tracking issue already open: append this run's result.
      await github.rest.issues.createComment({
        ...repoRef,
        issue_number: tracked.number,
        body: `Still drifting. ${runURL}\n\n${fencedLog}`,
      });
|
|
|
|
- name: Close stale drift issue on clean run
  # When a previously-flagged drift gets fixed by an operator, the next
  # daily run goes green. Close every open `railway-drift` issue with a
  # confirmation comment so the queue doesn't carry stale ones.
  # Runs only when the job is succeeding AND the audit reported rc 0.
  if: success() && steps.audit.outputs.rc == '0'
  uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
  with:
    script: |
      const runURL = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
      const { data: existing } = await github.rest.issues.listForRepo({
        owner: context.repo.owner, repo: context.repo.repo,
        state: 'open', labels: 'railway-drift',
      });
      for (const issue of existing) {
        // The issues list endpoint also returns pull requests (marked
        // by a `pull_request` key). Skip them: an open PR that carries
        // the `railway-drift` label must not be closed by this audit.
        if (issue.pull_request) {
          continue;
        }
        await github.rest.issues.createComment({
          owner: context.repo.owner, repo: context.repo.repo,
          issue_number: issue.number,
          body: `Daily audit clean — drift resolved. ${runURL}`,
        });
        await github.rest.issues.update({
          owner: context.repo.owner, repo: context.repo.repo,
          issue_number: issue.number,
          state: 'closed',
          state_reason: 'completed',
        });
      }
|