From 41d5f9558f46fa49f1507b89e2782a09740271b1 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 13:13:47 -0700 Subject: [PATCH] =?UTF-8?q?ops:=20scripts/ops/check-prod-versions.sh=20?= =?UTF-8?q?=E2=80=94=20one-line=20"is=20each=20tenant=20on=20latest=3F"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Iterates a list of tenant slugs (default canary set on production, operator-supplied on staging), curls each tenant's /buildinfo plus canvas's /api/buildinfo, compares to origin/main's HEAD SHA, prints a table with one of {current, stale, unreachable} per surface. Returns non-zero if any surface is stale or unreachable, so it can be wired into a periodic alert later. Why this exists: every "is the fix live?" question used to be answered with a one-off curl + git rev-parse + manual diff. This script does that uniformly across every public surface (workspace tenants + canvas) and is parseable. The redeploy verifier (#2398) covers the deploy moment; this covers any-time-after. Reads EXPECTED_SHA from `gh api repos/Molecule-AI/molecule-core/ commits/main` so it always reflects the actual upstream tip, not local working-copy state. Falls back to local origin/main with a WARN if `gh` isn't logged in — debugging is still useful even if the comparison may lag. Depends on: - #2409 (TenantGuard /buildinfo allowlist) — without it every tenant looks "unreachable" because the route 404s before the handler. Already merged on staging; will hit production after the next staging→main fast-forward + redeploy. - #2407 (canvas /api/buildinfo) — already on main + Vercel. Usage: ./scripts/ops/check-prod-versions.sh # production canary set TENANT_SLUGS="a b c" ./scripts/ops/check-prod-versions.sh # custom set ENV=staging TENANT_SLUGS="..." 
#!/usr/bin/env bash
# Check whether production (or staging) tenants and canvas are running the
# latest commit of the expected ref (default: main).
#
# Usage:
#   ./scripts/ops/check-prod-versions.sh                        # production canary set
#   TENANT_SLUGS="a b c" ./scripts/ops/check-prod-versions.sh   # custom set
#   ENV=staging TENANT_SLUGS="..." ./scripts/ops/check-prod-versions.sh
#
# Environment:
#   ENV           production (default) | staging
#   EXPECTED_REF  branch whose tip is the expected SHA (default: main)
#   TENANT_SLUGS  whitespace-separated tenant slugs. Defaults to the
#                 production canary set; empty by default on staging.
#
# Output:
#   stdout: a table of {surface, live_sha, expected_sha, status}, then a
#           summary line.
#   stderr: warnings and errors, kept off stdout so the table stays
#           parseable.
#
# Exit status:
#   0  every checked surface is current
#   1  at least one surface is stale or unreachable
#   2  bad ENV, missing tool, or the expected SHA cannot be resolved
#
# Why this exists: every time someone hits a "is the fix live?" question,
# they have to remember the curl pattern + cross-reference with
# `git rev-parse origin/main`. This script does that check uniformly across
# every public surface (workspace tenants + canvas) and gives a one-line
# verdict instead of a stack of one-off curls.

set -euo pipefail

STALE_COUNT=0
UNREACHABLE_COUNT=0
EXPECTED_SHA=""
EXPECTED_SHORT=""

#######################################
# Print one table row with the fixed column layout.
# Arguments: $1 surface, $2 live SHA, $3 expected SHA, $4 status text
# Outputs:   one formatted line on stdout
#######################################
print_row() {
  printf '%-25s %-9s %-9s %s\n' "$1" "$2" "$3" "$4"
}

#######################################
# Resolve the commit SHA the surfaces are expected to run.
# Tries GitHub first so the answer reflects the upstream tip; falls back to
# the local origin/<ref> tracking branch (which may lag) with a warning.
# Arguments: $1 - ref name (e.g. main)
# Outputs:   the full SHA on stdout; WARN/ERROR diagnostics on stderr
# Returns:   0 on success, 2 if neither source yields a SHA
#######################################
resolve_expected_sha() {
  local ref=$1
  local sha=""

  if command -v gh >/dev/null 2>&1; then
    # A failed `gh api` call can still write a response body to stdout, so
    # only trust the output when gh succeeded AND it looks like a full SHA.
    # Anything else falls through to the local-git fallback.
    if sha=$(gh api "repos/Molecule-AI/molecule-core/commits/${ref}" \
        --jq '.sha' 2>/dev/null) && [[ "$sha" =~ ^[0-9a-f]{40}$ ]]; then
      printf '%s\n' "$sha"
      return 0
    fi
  fi

  if sha=$(git rev-parse --verify --quiet "origin/${ref}^{commit}" 2>/dev/null); then
    echo "[check-prod-versions] WARN: gh unavailable, using local origin/${ref}=${sha:0:7} (may lag)" >&2
    printf '%s\n' "$sha"
    return 0
  fi

  echo "[check-prod-versions] ERROR: cannot resolve expected SHA — gh not logged in and origin/${ref} not fetched" >&2
  return 2
}

#######################################
# Fetch a buildinfo endpoint and extract its .git_sha field.
# Any failure (network error, timeout, non-JSON body, missing field) is
# deliberately reduced to empty output; the caller reports it as
# "unreachable".
# Arguments: $1 - buildinfo URL
# Outputs:   the reported SHA on stdout, or nothing
# Returns:   always 0
#######################################
fetch_live_sha() {
  local url=$1
  local body=""
  body=$(curl -sS --max-time 15 "$url" 2>/dev/null) || body=""
  jq -r '.git_sha // ""' <<<"$body" 2>/dev/null || true
}

#######################################
# Classify one surface against the expected SHA and print its table row.
# Globals:   EXPECTED_SHA, EXPECTED_SHORT (read)
#            STALE_COUNT, UNREACHABLE_COUNT (incremented)
# Arguments: $1 - surface label
#            $2 - live SHA (empty means unreachable)
#            $3 - optional text appended to the "unreachable" status
# Outputs:   one table row on stdout
#######################################
report_surface() {
  local label=$1
  local live_sha=$2
  local unreachable_hint=${3:-}

  if [[ -z "$live_sha" ]]; then
    print_row "$label" "—" "$EXPECTED_SHORT" "⚠ unreachable${unreachable_hint}"
    UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
  elif [[ "$live_sha" == "$EXPECTED_SHA" ]]; then
    print_row "$label" "${live_sha:0:7}" "$EXPECTED_SHORT" "✓ current"
  else
    print_row "$label" "${live_sha:0:7}" "$EXPECTED_SHORT" "✗ stale"
    STALE_COUNT=$((STALE_COUNT + 1))
  fi
}

main() {
  local tool
  # Without this check a missing jq/curl would be swallowed by the
  # best-effort fetch and every surface would silently read "unreachable".
  for tool in curl jq; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      echo "[check-prod-versions] ERROR: required tool '${tool}' not found in PATH" >&2
      exit 2
    fi
  done

  ENV="${ENV:-production}"
  EXPECTED_REF="${EXPECTED_REF:-main}"

  case "$ENV" in
    production)
      TENANT_DOMAIN="moleculesai.app"
      CANVAS_URL="https://canvas.moleculesai.app"
      # Default canary tenants for production. Override via TENANT_SLUGS=
      # to cover a custom set.
      DEFAULT_TENANTS="hongmingwang reno-stars"
      ;;
    staging)
      TENANT_DOMAIN="staging.moleculesai.app"
      CANVAS_URL="https://canvas-staging.moleculesai.app"
      DEFAULT_TENANTS="" # staging tenants are ephemeral; user must specify
      ;;
    *)
      echo "Unknown ENV=$ENV (expected: production | staging)" >&2
      exit 2
      ;;
  esac

  TENANT_SLUGS="${TENANT_SLUGS:-$DEFAULT_TENANTS}"

  # Split on whitespace into an array so slugs are never glob-expanded.
  # `read -d ''` returns non-zero at end of input, hence the `|| true`.
  local -a tenant_slugs=()
  read -r -d '' -a tenant_slugs <<<"$TENANT_SLUGS" || true
  if ((${#tenant_slugs[@]} == 0)); then
    echo "[check-prod-versions] WARN: no tenant slugs for ENV=${ENV}; checking canvas only (set TENANT_SLUGS=\"a b c\")" >&2
  fi

  EXPECTED_SHA=$(resolve_expected_sha "$EXPECTED_REF") || exit 2
  EXPECTED_SHORT="${EXPECTED_SHA:0:7}"

  echo "Checking ${ENV} surfaces against ${EXPECTED_REF}=${EXPECTED_SHORT}"
  echo ""
  print_row "Surface" "Live" "Expected" "Status"
  print_row "-------" "----" "--------" "------"

  # Tenant surfaces — workspace-server /buildinfo (added in PR #2398).
  # The ${arr[@]+...} form keeps an empty array safe under `set -u` on
  # older bash releases.
  local slug
  for slug in ${tenant_slugs[@]+"${tenant_slugs[@]}"}; do
    report_surface "tenant: $slug" \
      "$(fetch_live_sha "https://${slug}.${TENANT_DOMAIN}/buildinfo")"
  done

  # Canvas — Next.js /api/buildinfo (PR #2407). Vercel injects
  # VERCEL_GIT_COMMIT_SHA at build time so this reflects the deployed
  # commit, not the request time.
  local canvas_sha
  canvas_sha=$(fetch_live_sha "${CANVAS_URL}/api/buildinfo")
  if [[ "$canvas_sha" == "dev" ]]; then
    # "dev" is reported when the build had no commit SHA injected; it is
    # counted with unreachable because the deployed commit is unknown.
    print_row "canvas" "dev" "$EXPECTED_SHORT" \
      "⚠ dev sentinel (Vercel env not injected — check VERCEL_GIT_COMMIT_SHA)"
    UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
  else
    report_surface "canvas" "$canvas_sha" " (route may not be deployed yet)"
  fi

  echo ""
  if ((STALE_COUNT == 0 && UNREACHABLE_COUNT == 0)); then
    echo "All surfaces current."
    exit 0
  fi
  echo "Summary: ${STALE_COUNT} stale, ${UNREACHABLE_COUNT} unreachable."
  # Stale is a deploy gap; unreachable is operational (DNS, CF, route absent).
  # Both are signal — exit non-zero so cron / CI can alert.
  exit 1
}

main "$@"