From 026f5e51d99f345a6e8f48e98eeb3258cb48f96c Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Mon, 27 Apr 2026 05:01:23 -0700
Subject: [PATCH] ops: add Railway SHA-pin drift audit script + regression test (#2001)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#2000 fixed one symptom — TENANT_IMAGE pinned to `staging-a14cf86`
(10 days stale) silently no-op'd four upstream fixes on 2026-04-24.
This adds the audit pattern as a re-runnable script so the broader
class is observable on demand without new CI infrastructure.

Audit results today (2026-04-27):
  controlplane / production: 54 vars audited, 0 drift-prone pins
  controlplane / staging:    52 vars audited, 0 drift-prone pins

So the immediate audit deliverable is clean — TENANT_IMAGE is the only
known violation and #2000 already fixed it. The script makes the
ongoing audit a 5-second command instead of a manual one.

Detection regex catches:
  * branch-SHA suffixes (`(staging|main|prod|production)-<6+ hex>`) —
    the exact 2026-04-24 incident shape
  * version pins after `:` or `=` (`:v1.2.3`, `=v0.1.16`) — same drift
    class, just rendered differently

Anchoring on `:` or `=` keeps prose like "version 1.2.3 of the api"
out of the false-positive set. UUIDs, ARNs, AMI IDs, secrets, and
floating tags (`:staging-latest`, `:main`) pass through untouched.

Regression test (tests/ops/test_audit_railway_sha_pins.sh) pins 20
representative cases — 9 should-flag (covering all four branch
prefixes + semver variants + middle-of-value matches) and 11
should-pass (the false-positive guards). Same regex inlined in both
files so a future tweak that weakens detection fails the test in
lockstep with weakening the audit.

Both files shellcheck clean.

CI gate (acceptance criterion's "regression: add a CI check") is
deliberately scoped out — querying Railway from CI requires plumbing
RAILWAY_TOKEN as a repo secret, which is multi-step setup.
The re-runnable script + test cover the same surface today; the CI
workflow is a small follow-up once the token is provisioned.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 scripts/ops/audit-railway-sha-pins.sh    | 104 ++++++++++++++++++++++++
 tests/ops/test_audit_railway_sha_pins.sh |  64 +++++++++++++++
 2 files changed, 168 insertions(+)
 create mode 100755 scripts/ops/audit-railway-sha-pins.sh
 create mode 100755 tests/ops/test_audit_railway_sha_pins.sh

diff --git a/scripts/ops/audit-railway-sha-pins.sh b/scripts/ops/audit-railway-sha-pins.sh
new file mode 100755
index 00000000..2c1140a3
--- /dev/null
+++ b/scripts/ops/audit-railway-sha-pins.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+# Audit Railway env vars for drift-prone image-tag pins.
+#
+# Background (#2001): on 2026-04-24 a stale `:staging-a14cf86` SHA pin
+# in CP's TENANT_IMAGE caused 3+ hours of E2E failure with the
+# appearance that "every fix didn't propagate" — really the tenant
+# image was so old it didn't read the env vars those fixes produced.
+# This script flags anywhere we've re-introduced that pattern.
+#
+# Pattern matched: any env-var value containing a `<branch>-<hex SHA>`
+# pin (e.g. `staging-a14cf86`) or a `:vN.M.P` / `=vN.M.P` semver pin,
+# whether it ends the value or is embedded in it (catches refs like
+# `repo/img:staging-abc1234`). Floating tags (`:staging-latest`,
+# `:main`, `:latest`) and other values pass through untouched.
+#
+# Usage:
+#   bash scripts/ops/audit-railway-sha-pins.sh              # both envs
+#   bash scripts/ops/audit-railway-sha-pins.sh production   # one env
+#   bash scripts/ops/audit-railway-sha-pins.sh staging
+#
+# Exit codes:
+#   0 — no drift-prone pins
+#   1 — drift detected, list printed
+#   2 — bad usage, or railway CLI unauthenticated / project unlinked
+#
+# Pre-req: run from a directory linked to a Railway project
+# (e.g. molecule-controlplane). The script does not chdir for you
+# because the linked project's identity matters.
+set -euo pipefail
+
+ENV_FILTER="${1:-}"
+ENVS=()
+case "$ENV_FILTER" in
+  "") ENVS=(production staging) ;;
+  production|staging) ENVS=("$ENV_FILTER") ;;
+  *) echo "usage: $0 [production|staging]" >&2; exit 2 ;;
+esac
+
+# All services in the linked Railway project. Discovery isn't worth
+# the complexity — list them explicitly and add new services here.
+SERVICES=(controlplane)
+
+# Drift-prone patterns — same class as the 2026-04-24 TENANT_IMAGE
+# incident. One regex, matched as a substring of each full env-var
+# line (KEY=VALUE), with two alternatives:
+#
+# branch-SHA (e.g. `staging-a14cf86`):
+#   anchored by branch-name prefix + 6+ hex chars, so a UUID hex
+#   run that happens to look hex-shaped doesn't trip the audit
+#   (UUIDs use dashes, ARNs use colons).
+#
+# semver pin (`:v1.2.3`, `=v0.1.16`):
+#   requires `:` or `=` immediately before, so prose like
+#   "version 1.2.3 of the api" is NOT flagged. The trailing
+#   `([^a-z0-9]|$)` requires the version to end at a boundary, so
+#   `:1.2.3abc` is not treated as a pin.
+#   NOTE: a dotted quad such as `HOST=10.0.0.1` also matches this
+#   alternative; treat such hits as false positives.
+DRIFT_REGEX='(staging|main|prod|production)-[a-f0-9]{6,}|[:=]v?[0-9]+\.[0-9]+\.[0-9]+([^a-z0-9]|$)'
+
+drift_count=0
+for env in "${ENVS[@]}"; do
+  for svc in "${SERVICES[@]}"; do
+    echo "─── env=$env service=$svc ───"
+    if ! out=$(railway variables --service "$svc" --environment "$env" --kv 2>&1); then
+      # Detect "not authenticated" / "no linked project" vs "service not found"
+      if grep -qiE 'not (authenticated|logged in)|unlinked|no project' <<<"$out"; then
+        echo " ❌ railway CLI not authenticated or project not linked" >&2
+        exit 2
+      fi
+      echo " (skipped: $out)" >&2
+      continue
+    fi
+    # Match the regex against the whole KEY=VALUE line. Prefixing it with
+    # `=.*` would consume the only `=` in `TEMPLATE_PIN=v0.1.16`, leaving
+    # the semver alternative's `[:=]` anchor nothing to match, so
+    # `=vN.M.P` pins would go unreported.
+    matched=$(grep -nE -- "$DRIFT_REGEX" <<<"$out" || true)
+    if [ -z "$matched" ]; then
+      # grep -c already prints 0 when nothing matches (it only exits 1),
+      # so swallow the status instead of echoing a second 0.
+      total=$(grep -c '=' <<<"$out" || true)
+      echo " ✓ $total env vars audited, no drift-prone pins"
+    else
+      lines=$(echo "$matched" | wc -l | tr -d ' ')
+      drift_count=$((drift_count + lines))
+      echo " ⚠ $lines drift-prone pin(s):"
+      # Truncate values past 80 chars so a tokenful one-liner doesn't
+      # hide the relevant suffix off-screen.
+      echo "$matched" | sed -E 's/(.{80}).+/\1.../' | sed 's/^/ /'
+    fi
+  done
+done
+
+if [ "$drift_count" -gt 0 ]; then
+  echo
+  echo "Total drift-prone pins: $drift_count"
+  echo "Replace with floating tags (e.g. :staging-latest, :main) unless"
+  echo "intentional and documented in the ops runbook."
+  exit 1
+fi
+echo
+echo "✓ Clean — no drift-prone image pins in any audited env."
+exit 0
diff --git a/tests/ops/test_audit_railway_sha_pins.sh b/tests/ops/test_audit_railway_sha_pins.sh
new file mode 100755
index 00000000..c13fea32
--- /dev/null
+++ b/tests/ops/test_audit_railway_sha_pins.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+# Regression test for scripts/ops/audit-railway-sha-pins.sh — pins the
+# drift-detection regex's behavior against a curated set of should-flag
+# and should-pass values. A future regex tweak that weakens detection
+# (e.g. drops the substring branch, narrows the SHA length, etc.) fails
+# loud here.
+set -uo pipefail
+
+# Same regex as the audit script. Keep these two locked in step.
+DRIFT_REGEX='(staging|main|prod|production)-[a-f0-9]{6,}|[:=]v?[0-9]+\.[0-9]+\.[0-9]+([^a-z0-9]|$)'
+
+# Running tallies, updated by assert().
+PASS=0 FAIL=0
+
+# assert LABEL VALUE WANT: classify VALUE with DRIFT_REGEX ("flag" on a
+# match, "pass" otherwise), compare with WANT, and tally PASS / FAIL.
+assert() {
+  local label="$1" value="$2" want="$3"
+  local verdict="pass"
+  if echo "$value" | grep -qE "$DRIFT_REGEX"; then
+    verdict="flag"
+  fi
+  if [ "$verdict" != "$want" ]; then
+    echo " ✗ $label: value=$value expected=$want got=$verdict" >&2
+    FAIL=$((FAIL+1))
+    return
+  fi
+  echo " ✓ $label"
+  PASS=$((PASS+1))
+}
+
+# Banner, followed by a blank separator line.
+printf '%s\n\n' "Test: drift-detection regex"
+
+# ── should FLAG ────────────────────────────────────────────────────────
+echo "Should flag (drift-prone):"
+assert "branch-SHA suffix (the 2026-04-24 incident)" "ghcr.io/molecule/tenant:staging-a14cf86" flag
+assert "main-SHA suffix" "ghcr.io/molecule/tenant:main-d3adb33f" flag
+assert "prod-SHA suffix" "ghcr.io/molecule/tenant:prod-cafef00d" flag
+assert "production-SHA suffix" "ghcr.io/molecule/tenant:production-1234567890ab" flag
+assert "semver tag :v1.2.3" "ghcr.io/molecule/tenant:v1.2.3" flag
+assert "semver tag :1.2.3 (no v)" "ghcr.io/molecule/tenant:1.2.3" flag
+assert "semver patch-zero :v2.0.0" "ghcr.io/molecule/tenant:v2.0.0" flag
+assert "semver in middle of value" "TEMPLATE_PIN=v0.1.16/extra" flag
+assert "branch-SHA as part of longer value" "image=foo:staging-abc1234,other=bar" flag
+
+# ── should PASS ────────────────────────────────────────────────────────
+echo
+echo "Should pass (floating / unrelated):"
+assert "floating tag :staging-latest" "ghcr.io/molecule/tenant:staging-latest" pass
+assert "floating tag :main" "ghcr.io/molecule/tenant:main" pass
+assert "floating tag :latest" "ghcr.io/molecule/tenant:latest" pass
+assert "URL" "https://api.moleculesai.app/v1" pass
+assert "secret-shaped string" "cfut_loLRZGHCF0ySpUeESUL0OB" pass
+assert "human name" "Hongming Wang" pass
+assert "uuid" "a034108e-da16-d131-ef7f-766b923ef464" pass
+assert "AWS ARN" "arn:aws:secretsmanager:us-east-2:123:secret/foo" pass
+assert "short hash (under 6 chars)" "ghcr.io/molecule/tenant:staging-abc12" pass
+assert "version field, not tag (no leading colon)" "version 1.2.3 of the api" pass
+assert "AMI id" "ami-0abcd1234efgh5678" pass
+
+# Summary; the script's exit status is non-zero if any case failed.
+printf '\npassed=%s failed=%s\n' "$PASS" "$FAIL"
+[ "$FAIL" = "0" ]