forked from molecule-ai/molecule-core
Iterates a list of tenant slugs (default canary set on production,
operator-supplied on staging), curls each tenant's /buildinfo plus
canvas's /api/buildinfo, compares to origin/main's HEAD SHA, prints a
table with one of {current, stale, unreachable} per surface. Returns
non-zero if any surface is stale or unreachable, so it can be wired
into a periodic alert later.
Why this exists: every "is the fix live?" question used to be
answered with a one-off curl + git rev-parse + manual diff. This
script does that uniformly across every public surface (workspace
tenants + canvas) and is parseable. The redeploy verifier (#2398)
covers the deploy moment; this covers any-time-after.
Reads EXPECTED_SHA from `gh api repos/Molecule-AI/molecule-core/
commits/main` so it always reflects the actual upstream tip, not
local working-copy state. Falls back to local origin/main with a
WARN if `gh` isn't logged in — debugging is still useful even if
the comparison may lag.
Depends on:
- #2409 (TenantGuard /buildinfo allowlist) — without it every
tenant looks "unreachable" because the route 404s before the
handler. Already merged on staging; will hit production after
the next staging→main fast-forward + redeploy.
- #2407 (canvas /api/buildinfo) — already on main + Vercel.
Usage:
./scripts/ops/check-prod-versions.sh # production canary set
TENANT_SLUGS="a b c" ./scripts/ops/check-prod-versions.sh # custom set
ENV=staging TENANT_SLUGS="..." ./scripts/ops/check-prod-versions.sh
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
113 lines
4.7 KiB
Bash
Executable File
113 lines
4.7 KiB
Bash
Executable File
#!/usr/bin/env bash
# Check whether production tenants and canvas are running latest main.
#
# Usage:
#   ./scripts/ops/check-prod-versions.sh                # production
#   ENV=staging ./scripts/ops/check-prod-versions.sh    # staging tenants
#
# Environment:
#   ENV           production (default) | staging
#   TENANT_SLUGS  whitespace-separated tenant slugs to probe; defaults to
#                 the per-environment canary set (empty on staging)
#   EXPECTED_REF  upstream ref every surface is compared against
#                 (default: main)
#
# Outputs a table of {surface, current_sha, expected_sha, status}. Returns
# non-zero if any surface is stale or unreachable so this can be wired into
# a periodic alert. Exits 2 when ENV is unknown or the expected SHA cannot
# be resolved.
#
# Why this exists: every time someone hits a "is the fix live?" question,
# they have to remember the curl pattern + cross-reference with
# `git rev-parse origin/main`. This script does that check uniformly across
# every public surface (workspace tenants + canvas) and gives a one-line
# verdict instead of a stack of one-off curls.

set -euo pipefail

ENV="${ENV:-production}"
EXPECTED_REF="${EXPECTED_REF:-main}"

# Per-environment endpoints and the default tenant list.
case "$ENV" in
  production)
    TENANT_DOMAIN="moleculesai.app"
    CANVAS_URL="https://canvas.moleculesai.app"
    # Default canary tenant for production. Override via TENANT_SLUGS=
    # to cover a custom set.
    DEFAULT_TENANTS="hongmingwang reno-stars"
    ;;
  staging)
    TENANT_DOMAIN="staging.moleculesai.app"
    CANVAS_URL="https://canvas-staging.moleculesai.app"
    DEFAULT_TENANTS="" # staging tenants are ephemeral; user must specify
    ;;
  *)
    echo "Unknown ENV=$ENV (expected: production | staging)" >&2
    exit 2
    ;;
esac

TENANT_SLUGS="${TENANT_SLUGS:-$DEFAULT_TENANTS}"

# An empty list is allowed (canvas is still checked), but say so on stderr
# so a forgotten TENANT_SLUGS on staging does not pass for full coverage.
if [[ -z "${TENANT_SLUGS//[[:space:]]/}" ]]; then
  echo "[check-prod-versions] NOTE: no tenant slugs for ENV=${ENV}; only canvas will be checked (set TENANT_SLUGS=\"a b c\")" >&2
fi
# Preflight: the probes further down shell out to curl and jq and treat
# any failure as "unreachable", so a missing tool would otherwise be
# reported as an outage. Stop with a setup error (exit 2) instead.
for required_tool in curl jq; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "[check-prod-versions] ERROR: required tool '${required_tool}' not found in PATH" >&2
    exit 2
  fi
done

# Pull EXPECTED_SHA from GitHub. Falls back to local git if gh isn't
# usable — local main may lag origin but is usually close enough for
# debugging, and we still report the comparison clearly.
EXPECTED_SHA=""
if command -v gh >/dev/null 2>&1; then
  # Best-effort on purpose: stderr is dropped and a failing gh is tolerated
  # so the local-git fallback below can take over.
  EXPECTED_SHA=$(gh api "repos/Molecule-AI/molecule-core/commits/${EXPECTED_REF}" --jq '.sha' 2>/dev/null || true)
  # `|| true` keeps whatever gh wrote to stdout even when it failed (which
  # may be an API error body rather than a SHA). Only trust a value that
  # looks like a full commit SHA; anything else falls through to git.
  if [[ ! "$EXPECTED_SHA" =~ ^[0-9a-f]{40}$ ]]; then
    EXPECTED_SHA=""
  fi
fi

if [[ -z "$EXPECTED_SHA" ]]; then
  if EXPECTED_SHA=$(git rev-parse --verify --quiet "origin/${EXPECTED_REF}" 2>/dev/null); then
    # Diagnostics go to stderr so stdout carries only the report.
    echo "[check-prod-versions] WARN: gh unavailable, using local origin/${EXPECTED_REF}=${EXPECTED_SHA:0:7} (may lag)" >&2
  else
    echo "[check-prod-versions] ERROR: cannot resolve expected SHA — gh not logged in and origin/${EXPECTED_REF} not fetched" >&2
    exit 2
  fi
fi

# Seven-character abbreviation used in the table columns.
EXPECTED_SHORT="${EXPECTED_SHA:0:7}"
# Banner, then the table header. Columns: surface label, live short SHA,
# expected short SHA, status.
echo "Checking ${ENV} surfaces against ${EXPECTED_REF}=${EXPECTED_SHORT}"
echo ""
# printf re-applies the format to each further group of four arguments,
# so one call emits both the title row and the underline row.
printf "%-25s %-9s %-9s %s\n" \
  "Surface" "Live" "Expected" "Status" \
  "-------" "----" "--------" "------"

# Running tallies; the closing summary and the exit code are derived
# from these two counters.
STALE_COUNT=0
UNREACHABLE_COUNT=0
#######################################
# Fetch a buildinfo endpoint and print its git_sha field.
# Arguments: $1 - full URL of the endpoint
# Outputs:   the .git_sha value on stdout; empty output when the request
#            fails, the body is not parseable by jq, or the field is
#            absent/null
# Returns:   0 (failures are reported as empty output, not as a status)
#######################################
fetch_git_sha() {
  local url=$1
  local body
  # Best-effort by design: curl and jq errors are silenced and collapse to
  # an empty string, which the caller reports as "unreachable".
  body=$(curl -sS --max-time 15 "$url" 2>/dev/null || echo "")
  jq -r '.git_sha // ""' <<<"$body" 2>/dev/null || echo ""
}

#######################################
# Print one table row for a surface and bump the matching counter.
# Globals:   EXPECTED_SHA, EXPECTED_SHORT (read)
#            STALE_COUNT, UNREACHABLE_COUNT (written)
# Arguments: $1 - surface label shown in the first column
#            $2 - live SHA ("" means no usable answer was obtained)
#            $3 - optional text appended to the "unreachable" status
# Outputs:   one formatted row on stdout
#######################################
report_surface() {
  local label=$1
  local live_sha=$2
  local hint=${3:-}

  if [[ -z "$live_sha" ]]; then
    printf "%-25s %-9s %-9s ⚠ unreachable%s\n" "$label" "—" "$EXPECTED_SHORT" "$hint"
    UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
  elif [[ "$live_sha" == "$EXPECTED_SHA" ]]; then
    printf "%-25s %-9s %-9s ✓ current\n" "$label" "${live_sha:0:7}" "$EXPECTED_SHORT"
  else
    printf "%-25s %-9s %-9s ✗ stale\n" "$label" "${live_sha:0:7}" "$EXPECTED_SHORT"
    STALE_COUNT=$((STALE_COUNT + 1))
  fi
}

# Tenant surfaces — workspace-server /buildinfo (added in PR #2398).
# TENANT_SLUGS is split on whitespace deliberately; globbing is switched
# off around the loop so a slug containing * or ? is never expanded
# against the working directory.
set -f
for slug in $TENANT_SLUGS; do
  ACTUAL_SHA=$(fetch_git_sha "https://${slug}.${TENANT_DOMAIN}/buildinfo")
  report_surface "tenant: $slug" "$ACTUAL_SHA"
done
set +f

# Canvas — Next.js /api/buildinfo (PR #2407). Vercel injects
# VERCEL_GIT_COMMIT_SHA at build time so this reflects the deployed
# commit, not the request time.
CANVAS_SHA=$(fetch_git_sha "${CANVAS_URL}/api/buildinfo")
if [[ "$CANVAS_SHA" == "dev" ]]; then
  # "dev" is a placeholder rather than a commit, so it is tallied with the
  # unreachable surfaces instead of being compared as a SHA.
  printf "%-25s %-9s %-9s ⚠ dev sentinel (Vercel env not injected — check VERCEL_GIT_COMMIT_SHA)\n" "canvas" "dev" "$EXPECTED_SHORT"
  UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
else
  report_surface "canvas" "$CANVAS_SHA" " (route may not be deployed yet)"
fi
# Closing verdict. Exit status: 0 when every surface is current, 1 when
# at least one surface is stale or unreachable.
echo ""
# Arithmetic test rather than `[ $VAR -eq 0 ]`: an unquoted empty value
# would turn the latter into a syntax error instead of a comparison.
if (( STALE_COUNT == 0 && UNREACHABLE_COUNT == 0 )); then
  echo "All surfaces current."
  exit 0
fi
echo "Summary: ${STALE_COUNT} stale, ${UNREACHABLE_COUNT} unreachable."
# Stale is a deploy gap; unreachable is operational (DNS, CF, route absent).
# Both are signal — exit non-zero so cron / CI can alert.
exit 1