ops: add Railway SHA-pin drift audit script + regression test (#2001)
#2000 fixed one symptom — TENANT_IMAGE pinned to `staging-a14cf86` (10 days stale) silently no-op'd four upstream fixes on 2026-04-24. This adds the audit pattern as a re-runnable script so the broader class is observable on demand without new CI infrastructure. Audit results today (2026-04-27): controlplane / production: 54 vars audited, 0 drift-prone pins; controlplane / staging: 52 vars audited, 0 drift-prone pins. So the immediate audit deliverable is clean — TENANT_IMAGE is the only known violation and #2000 already fixed it. The script makes the ongoing audit a 5-second command instead of a manual one. Detection regex catches: * branch-SHA suffixes (`(staging|main|prod|production)-<6+ hex>`) — the exact 2026-04-24 incident shape; * version pins after `:` or `=` (`:v1.2.3`, `=v0.1.16`) — same drift class, just rendered differently. Anchoring on `:` or `=` keeps prose like "version 1.2.3 of the api" out of the false-positive set. UUIDs, ARNs, AMI IDs, secrets, and floating tags (`:staging-latest`, `:main`) pass through untouched. Regression test (tests/ops/test_audit_railway_sha_pins.sh) pins 20 representative cases — 9 should-flag (covering all four branch prefixes + semver variants + middle-of-value matches) and 11 should-pass (the false-positive guards). The same regex is inlined in both files so a future tweak that weakens detection fails the test in lockstep with weakening the audit. Both files are shellcheck-clean. The CI gate (the acceptance criterion's "regression: add a CI check") is deliberately scoped out — querying Railway from CI requires plumbing RAILWAY_TOKEN as a repo secret, which is a multi-step setup. The re-runnable script + test cover the same surface today; the CI workflow is a small follow-up once the token is provisioned. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
7cf77f274a
commit
026f5e51d9
99
scripts/ops/audit-railway-sha-pins.sh
Executable file
99
scripts/ops/audit-railway-sha-pins.sh
Executable file
@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env bash
|
||||
# Audit Railway env vars for drift-prone image-tag pins.
|
||||
#
|
||||
# Background (#2001): on 2026-04-24 a stale `:staging-a14cf86` SHA pin
|
||||
# in CP's TENANT_IMAGE caused 3+ hours of E2E failure with the
|
||||
# appearance that "every fix didn't propagate" — really the tenant
|
||||
# image was so old it didn't read the env vars those fixes produced.
|
||||
# This script flags anywhere we've re-introduced that pattern.
|
||||
#
|
||||
# Pattern matched: any env-var value ending in `<branch>-<hex>` (e.g.
|
||||
# `staging-a14cf86`) or `:vN.M.P` semver tag, OR containing such a
|
||||
# substring (catches embedded refs like `repo/img:staging-abc1234`).
|
||||
# Floating tags (`:staging-latest`, `:main`, `:latest`) and other
|
||||
# values pass through untouched.
|
||||
#
|
||||
# Usage:
|
||||
# bash scripts/ops/audit-railway-sha-pins.sh # both envs
|
||||
# bash scripts/ops/audit-railway-sha-pins.sh production # one env
|
||||
# bash scripts/ops/audit-railway-sha-pins.sh staging
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 — no drift-prone pins
|
||||
# 1 — drift detected, list printed
|
||||
# 2 — invalid argument, or railway CLI unauthenticated / project unlinked
|
||||
#
|
||||
# Pre-req: run from a directory linked to a Railway project
|
||||
# (e.g. molecule-controlplane). The script does not chdir for you
|
||||
# because the linked project's identity matters.
|
||||
set -euo pipefail

# Optional first argument selects a single environment; with no argument
# both environments are audited. Anything else is a usage error (exit 2).
ENV_FILTER="${1:-}"
ENVS=()
if [[ -z "$ENV_FILTER" ]]; then
  ENVS=(production staging)
elif [[ "$ENV_FILTER" == "production" || "$ENV_FILTER" == "staging" ]]; then
  ENVS=("$ENV_FILTER")
else
  echo "usage: $0 [production|staging]" >&2
  exit 2
fi
|
||||
|
||||
# Services audited within the linked Railway project. The list is kept
# by hand instead of being discovered; append new services here.
SERVICES=("controlplane")

# Extended regex describing a drift-prone pin — the same class as the
# 2026-04-24 TENANT_IMAGE incident. It is applied to the KEY=VALUE lines
# printed by `railway variables --kv` and has two alternatives:
#
#   1. branch-SHA (e.g. `staging-a14cf86`):
#      one of staging/main/prod/production, a dash, then 6 or more
#      lowercase hex characters. Requiring the branch-name prefix is what
#      keeps hex-looking values such as UUIDs from tripping the audit.
#
#   2. semver pin (e.g. `:v1.2.3`, `=v0.1.16`):
#      a `:` or `=`, an optional `v`, then N.M.P. The leading `:`/`=`
#      requirement keeps prose like "version 1.2.3 of the api" out. The
#      patch number must be followed by end-of-line or by a character that
#      is not a lowercase letter or digit, so a tag like `:1.2.3rc1` is
#      not treated as a plain N.M.P pin.
#
# tests/ops/test_audit_railway_sha_pins.sh carries a copy of this exact
# regex; change both together.
DRIFT_REGEX='(staging|main|prod|production)-[a-f0-9]{6,}|[:=]v?[0-9]+\.[0-9]+\.[0-9]+([^a-z0-9]|$)'
|
||||
|
||||
# ── Audit loop ─────────────────────────────────────────────────────────
# For every (environment, service) pair: dump the variables as KEY=VALUE
# lines, flag the lines matching DRIFT_REGEX, and keep running totals.
# Reads ENVS, SERVICES and DRIFT_REGEX, which are set earlier in this script.

# Without this check a missing binary would land in the "skipped" path
# below and the script would finish with a misleading "Clean" / exit 0.
if ! command -v railway >/dev/null 2>&1; then
  echo " ❌ railway CLI not found on PATH" >&2
  exit 2
fi

drift_count=0
skipped_count=0
for env in "${ENVS[@]}"; do
  for svc in "${SERVICES[@]}"; do
    echo "─── env=$env service=$svc ───"
    if ! out=$(railway variables --service "$svc" --environment "$env" --kv 2>&1); then
      # Tell "not authenticated" / "no linked project" (fatal) apart from
      # other failures such as "service not found" (skip this pair).
      # A here-string is used instead of `echo | grep -q`: under pipefail
      # an early-exiting `grep -q` can make the pipeline report failure.
      if grep -qiE 'not (authenticated|logged in)|unlinked|no project' <<<"$out"; then
        echo " ❌ railway CLI not authenticated or project not linked" >&2
        exit 2
      fi
      echo " (skipped: $out)" >&2
      skipped_count=$((skipped_count + 1))
      continue
    fi

    # Match the regex against the whole KEY=VALUE line. Prefixing it with
    # `=.*` would swallow the `=` delimiter, so a bare `KEY=v1.2.3` pin
    # could never satisfy the `[:=]` anchor and would go unreported.
    # NOTE(review): the regex also matches a bare dotted value directly
    # after `=` (e.g. an IPv4 address); confirm whether that is acceptable.
    matched=$(grep -nE -- "$DRIFT_REGEX" <<<"$out" || true)
    if [ -z "$matched" ]; then
      # `grep -c` prints 0 *and* exits 1 when nothing matches, so the
      # fallback must not print a second 0.
      total=$(grep -c '=' <<<"$out" || true)
      echo " ✓ $total env vars audited, no drift-prone pins"
    else
      lines=$(grep -c '' <<<"$matched")
      drift_count=$((drift_count + lines))
      echo " ⚠ $lines drift-prone pin(s):"
      # Cap each reported line at 80 characters (plus an ellipsis) so a
      # very long value does not flood the terminal, then indent it.
      sed -E -e 's/(.{80}).+/\1.../' -e 's/^/ /' <<<"$matched"
    fi
  done
done

# ── Summary ────────────────────────────────────────────────────────────
# Skipped pairs were not audited; say so on stderr so a "Clean" result is
# not over-read. The exit code is unaffected by skips.
if [ "$skipped_count" -gt 0 ]; then
  echo "⚠ $skipped_count env/service pair(s) skipped and NOT audited (see above)." >&2
fi

if [ "$drift_count" -gt 0 ]; then
  echo
  echo "Total drift-prone pins: $drift_count"
  echo "Replace with floating tags (e.g. :staging-latest, :main) unless"
  echo "intentional and documented in the ops runbook."
  exit 1
fi

echo
echo "✓ Clean — no drift-prone image pins in any audited env."
exit 0
|
||||
64
tests/ops/test_audit_railway_sha_pins.sh
Executable file
64
tests/ops/test_audit_railway_sha_pins.sh
Executable file
@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env bash
|
||||
# Regression test for scripts/ops/audit-railway-sha-pins.sh — pins the
|
||||
# drift-detection regex's behavior against a curated set of should-flag
|
||||
# and should-pass values. A future regex tweak that weakens detection
|
||||
# (e.g. drops the substring branch, narrows the SHA length, etc.) fails
|
||||
# loud here.
|
||||
# Unset variables are errors and pipelines report failures; errexit is
# not enabled in this script.
set -uo pipefail

# Copy of the audit script's regex (scripts/ops/audit-railway-sha-pins.sh).
# The two must be edited together.
DRIFT_REGEX='(staging|main|prod|production)-[a-f0-9]{6,}|[:=]v?[0-9]+\.[0-9]+\.[0-9]+([^a-z0-9]|$)'

# Running tallies of assertions that met / missed their expectation.
PASS=0 FAIL=0
|
||||
|
||||
#######################################
# Check one value against DRIFT_REGEX and record whether the outcome
# matches the expectation.
# Globals:   DRIFT_REGEX (read), PASS and FAIL (incremented)
# Arguments: $1 - human-readable label for the case
#            $2 - value to test
#            $3 - expected outcome: "flag" (regex matches) or "pass"
# Outputs:   a ✓ line on stdout when the expectation holds, a ✗ line
#            with the details on stderr when it does not
# Returns:   0 in both cases; failures are tracked via FAIL
#######################################
assert() {
  local label="$1" value="$2" want="$3"
  local hit="pass"

  # Feed the value through a here-string rather than `echo "$value" |`:
  # echo would swallow values shaped like its own options (-n, -e), and
  # under pipefail an early-exiting `grep -q` can fail the pipeline.
  if grep -qE -- "$DRIFT_REGEX" <<<"$value"; then
    hit="flag"
  fi

  if [[ "$hit" == "$want" ]]; then
    printf ' ✓ %s\n' "$label"
    PASS=$((PASS + 1))
  else
    printf ' ✗ %s: value=%s expected=%s got=%s\n' \
      "$label" "$value" "$want" "$hit" >&2
    FAIL=$((FAIL + 1))
  fi
}
|
||||
|
||||
echo "Test: drift-detection regex"
echo

# Run assert over a list of (label, value) pairs that all share the same
# expected outcome, given as the first argument ("flag" or "pass").
check_group() {
  local expected="$1"
  shift
  while [ "$#" -ge 2 ]; do
    assert "$1" "$2" "$expected"
    shift 2
  done
}

# ── drift-prone values: every one must be flagged ──────────────────────
echo "Should flag (drift-prone):"
check_group flag \
  "branch-SHA suffix (the 2026-04-24 incident)" "ghcr.io/molecule/tenant:staging-a14cf86" \
  "main-SHA suffix" "ghcr.io/molecule/tenant:main-d3adb33f" \
  "prod-SHA suffix" "ghcr.io/molecule/tenant:prod-cafef00d" \
  "production-SHA suffix" "ghcr.io/molecule/tenant:production-1234567890ab" \
  "semver tag :v1.2.3" "ghcr.io/molecule/tenant:v1.2.3" \
  "semver tag :1.2.3 (no v)" "ghcr.io/molecule/tenant:1.2.3" \
  "semver patch-zero :v2.0.0" "ghcr.io/molecule/tenant:v2.0.0" \
  "semver in middle of value" "TEMPLATE_PIN=v0.1.16/extra" \
  "branch-SHA as part of longer value" "image=foo:staging-abc1234,other=bar"

# ── floating tags and unrelated values: none may be flagged ────────────
echo
echo "Should pass (floating / unrelated):"
check_group pass \
  "floating tag :staging-latest" "ghcr.io/molecule/tenant:staging-latest" \
  "floating tag :main" "ghcr.io/molecule/tenant:main" \
  "floating tag :latest" "ghcr.io/molecule/tenant:latest" \
  "URL" "https://api.moleculesai.app/v1" \
  "secret-shaped string" "cfut_loLRZGHCF0ySpUeESUL0OB" \
  "human name" "Hongming Wang" \
  "uuid" "a034108e-da16-d131-ef7f-766b923ef464" \
  "AWS ARN" "arn:aws:secretsmanager:us-east-2:123:secret/foo" \
  "short hash (under 6 chars)" "ghcr.io/molecule/tenant:staging-abc12" \
  "version field, not tag (no leading colon)" "version 1.2.3 of the api" \
  "AMI id" "ami-0abcd1234efgh5678"

echo
echo "passed=$PASS failed=$FAIL"
# The script's exit status is the result of this final check.
[ "$FAIL" = "0" ]
|
||||
Loading…
Reference in New Issue
Block a user