# Self-review of PR #2810 caught a regression: my mass-fix added `2>/dev/null`
# to every curl invocation, suppressing stderr. The original `|| echo "000"`
# shape only swallowed exit codes — stderr (curl's `-sS`-shown dial errors,
# timeouts, DNS failures) still went to the runner log so operators could see
# WHY a connection failed. After PR #2810 the next deploy failure would log
# only the bare HTTP code with no context. That's exactly the kind of
# diagnostic loss that makes outages take longer to triage. Drop `2>/dev/null`
# from each curl line — keep it on the `cat` fallback (which legitimately
# suppresses "no such file" when curl crashed before -w ran). The `>tempfile`
# redirect alone captures curl's stdout (where -w writes) without touching
# stderr. Same 8 files as #2810: redeploy-tenants-on-{main,staging},
# sweep-stale-e2e-orgs, e2e-staging-{sanity,saas,external,canvas},
# canary-staging.
#
# Tests:
#   - All 8 files pass the lint
#   - YAML valid
#
# Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
# 397 lines · 19 KiB · YAML (file-viewer metadata left over from the paste)
name: redeploy-tenants-on-main

# Auto-refresh prod tenant EC2s after every main merge.
#
# Why this workflow exists: publish-workspace-server-image builds and
# pushes a new platform-tenant:latest + :<sha> to GHCR on every merge
# to main, but running tenants pulled their image once at boot and
# never re-pull. Users see stale code indefinitely.
#
# This workflow closes the gap by calling the control-plane admin
# endpoint that performs a canary-first, batched, health-gated rolling
# redeploy across every live tenant. Implemented in Molecule-AI/
# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
# (feat/tenant-auto-redeploy, landing alongside this workflow).
#
# Runtime ordering:
#   1. publish-workspace-server-image completes → new :latest in GHCR.
#   2. This workflow fires via workflow_run, waits 30s for GHCR's
#      CDN to propagate the new tag to the region the tenants pull from.
#   3. Calls redeploy-fleet with canary_slug=hongming and a 60s
#      soak. Canary proves the image boots; batches follow.
#   4. Any failure aborts the rollout and leaves older tenants on the
#      prior image — safer default than half-and-half state.
#
# Rollback path: re-run this workflow with a specific SHA pinned via
# the workflow_dispatch input. That calls redeploy-fleet with
# target_tag=<sha>, re-pulling the older image on every tenant.
on:
  workflow_run:
    workflows: ['publish-workspace-server-image']
    types: [completed]
    branches: [main]
  workflow_dispatch:
    inputs:
      target_tag:
        # Empty default → auto-trigger and dispatch-without-input both
        # resolve to `staging-<short_head_sha>` (the digest publish-image
        # just pushed). Pre-fix this defaulted to 'latest', which only
        # gets retagged by canary-verify's promote-to-latest job — and
        # that job soft-skips when CANARY_TENANT_URLS is unset (the
        # current state, until Phase 2 canary fleet is live). Result:
        # `:latest` had been pinned to a 4-day-old digest (2026-04-28)
        # while every main push pushed fresh `staging-<sha>` images;
        # every prod redeploy pulled the stale `:latest` and the verify
        # step correctly flagged 3/3 tenants STALE. Pulling the
        # just-published `staging-<sha>` directly skips the dead retag
        # path. When canary fleet is real, this workflow should chain
        # on canary-verify completion (workflow_run from canary-verify),
        # not publish-image — separate, smaller PR.
        description: 'Tenant image tag to deploy (e.g. "latest", "staging-a59f1a6c"). Empty = auto staging-<head_sha>.'
        required: false
        type: string
        default: ''
      canary_slug:
        description: 'Tenant slug to deploy first + soak (empty = skip canary, fan out immediately).'
        required: false
        type: string
        # Must be an actual prod tenant slug (current: hongming,
        # chloe-dong, reno-stars). The previous default 'hongmingwang'
        # didn't match any tenant — CP soft-skipped the missing canary
        # and the fleet rolled out without the soak gate, defeating the
        # whole point of canary-first.
        default: 'hongming'
      soak_seconds:
        description: 'Seconds to wait after canary before fanning out.'
        required: false
        type: string
        default: '60'
      batch_size:
        description: 'How many tenants SSM redeploys in parallel per batch.'
        required: false
        type: string
        default: '3'
      dry_run:
        description: 'Plan only — do not actually redeploy.'
        required: false
        type: boolean
        default: false
permissions:
  contents: read
  # No write scopes needed — the workflow hits an external CP endpoint,
  # not the GitHub API.

# Serialize redeploys so two rapid main pushes' redeploys don't overlap
# and cause confusing per-tenant SSM state. Without this, GitHub's
# implicit workflow_run queueing would *probably* serialize them, but
# the explicit block makes the invariant defensible. Mirrors the
# concurrency block on redeploy-tenants-on-staging.yml for shape parity.
#
# cancel-in-progress: false → aborting a half-rolled-out fleet would
# leave tenants stuck on whatever image they happened to be on when
# cancelled. Better to finish the in-flight rollout before starting
# the next one.
concurrency:
  group: redeploy-tenants-on-main
  cancel-in-progress: false
jobs:
  redeploy:
    # Skip the auto-trigger if publish-workspace-server-image didn't
    # actually succeed. workflow_run fires on any completion state; we
    # don't want to redeploy against a half-built image.
    if: |
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
    runs-on: ubuntu-latest
    timeout-minutes: 25
    steps:
      - name: Wait for GHCR tag propagation
        # GHCR's edge cache takes ~15-30s to consistently serve the new
        # manifest after the registry accepts the push. Without this
        # sleep, the first tenant's docker pull sometimes races and
        # fetches the previous digest; sleeping is the cheapest way to
        # reduce that without polling GHCR for the new digest.
        run: sleep 30
- name: Compute target tag
|
|
id: tag
|
|
# Resolution order:
|
|
# 1. Operator-supplied input (workflow_dispatch with explicit
|
|
# tag) → used verbatim. Lets ops pin `latest` for emergency
|
|
# rollback to last canary-verified digest, or pin a specific
|
|
# `staging-<sha>` to roll back to a known-good build.
|
|
# 2. Default → `staging-<short_head_sha>`. The just-published
|
|
# digest. Bypasses the `:latest` retag path that's currently
|
|
# dead (canary-verify soft-skips without canary fleet, so
|
|
# the only thing retagging `:latest` today is the manual
|
|
# promote-latest.yml — last run 2026-04-28). Auto-trigger
|
|
# from workflow_run uses workflow_run.head_sha; manual
|
|
# dispatch with no input falls through to github.sha.
|
|
env:
|
|
INPUT_TAG: ${{ inputs.target_tag }}
|
|
HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
|
|
run: |
|
|
set -euo pipefail
|
|
if [ -n "${INPUT_TAG:-}" ]; then
|
|
echo "target_tag=$INPUT_TAG" >> "$GITHUB_OUTPUT"
|
|
echo "Using operator-pinned tag: $INPUT_TAG"
|
|
else
|
|
SHORT="${HEAD_SHA:0:7}"
|
|
echo "target_tag=staging-$SHORT" >> "$GITHUB_OUTPUT"
|
|
echo "Using auto tag: staging-$SHORT (head_sha=$HEAD_SHA)"
|
|
fi
|
|
|
|
- name: Call CP redeploy-fleet
|
|
# CP_ADMIN_API_TOKEN must be set as a repo/org secret on
|
|
# Molecule-AI/molecule-core, matching the staging/prod CP's
|
|
# CP_ADMIN_API_TOKEN env. Stored in Railway, mirrored to this
|
|
# repo's secrets for CI.
|
|
env:
|
|
CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }}
|
|
CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
|
|
TARGET_TAG: ${{ steps.tag.outputs.target_tag }}
|
|
CANARY_SLUG: ${{ inputs.canary_slug || 'hongming' }}
|
|
SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
|
|
BATCH_SIZE: ${{ inputs.batch_size || '3' }}
|
|
DRY_RUN: ${{ inputs.dry_run || false }}
|
|
run: |
|
|
set -euo pipefail
|
|
|
|
if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
|
|
echo "::error::CP_ADMIN_API_TOKEN secret not set — skipping redeploy"
|
|
echo "::notice::Set CP_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
|
|
exit 1
|
|
fi
|
|
|
|
BODY=$(jq -nc \
|
|
--arg tag "$TARGET_TAG" \
|
|
--arg canary "$CANARY_SLUG" \
|
|
--argjson soak "$SOAK_SECONDS" \
|
|
--argjson batch "$BATCH_SIZE" \
|
|
--argjson dry "$DRY_RUN" \
|
|
'{
|
|
target_tag: $tag,
|
|
canary_slug: $canary,
|
|
soak_seconds: $soak,
|
|
batch_size: $batch,
|
|
dry_run: $dry
|
|
}')
|
|
|
|
echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
|
|
echo " body: $BODY"
|
|
|
|
HTTP_RESPONSE=$(mktemp)
|
|
HTTP_CODE_FILE=$(mktemp)
|
|
# Route -w into its own tempfile so curl's exit code (e.g. 56
|
|
# on connection-reset, 22 on --fail-with-body 4xx/5xx) can't
|
|
# pollute the captured stdout. The previous inline-substitution
|
|
# shape produced "000000" on connection reset (curl wrote
|
|
# "000" via -w, then the inline echo-fallback appended another
|
|
# "000") — caught on the 2026-05-04 redeploy of sha 2b862f6.
|
|
# set +e/-e keeps the non-zero curl exit from tripping the
|
|
# outer pipeline. See lint-curl-status-capture.yml for the
|
|
# CI gate that pins this fix shape.
|
|
set +e
|
|
curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
|
|
-m 1200 \
|
|
-H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
|
|
-H "Content-Type: application/json" \
|
|
-X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
|
|
-d "$BODY" >"$HTTP_CODE_FILE"
|
|
set -e
|
|
# Stderr from curl (e.g. dial errors with -sS) goes to the runner
|
|
# log so operators can see WHY a connection failed. Stdout is
|
|
# captured to $HTTP_CODE_FILE because that's where -w writes.
|
|
HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
|
|
[ -z "$HTTP_CODE" ] && HTTP_CODE="000"
|
|
|
|
echo "HTTP $HTTP_CODE"
|
|
cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
|
|
|
|
# Pretty-print per-tenant results in the job summary so
|
|
# ops can see which tenants were redeployed without drilling
|
|
# into the raw response.
|
|
{
|
|
echo "## Tenant redeploy fleet"
|
|
echo ""
|
|
echo "**Target tag:** \`$TARGET_TAG\`"
|
|
echo "**Canary:** \`$CANARY_SLUG\` (soak ${SOAK_SECONDS}s)"
|
|
echo "**Batch size:** $BATCH_SIZE"
|
|
echo "**Dry run:** $DRY_RUN"
|
|
echo "**HTTP:** $HTTP_CODE"
|
|
echo ""
|
|
echo "### Per-tenant result"
|
|
echo ""
|
|
echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
|
|
echo '|------|-------|------------|------|---------|-------|'
|
|
jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
|
|
} >> "$GITHUB_STEP_SUMMARY"
|
|
|
|
if [ "$HTTP_CODE" != "200" ]; then
|
|
echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
|
|
exit 1
|
|
fi
|
|
OK=$(jq -r '.ok' "$HTTP_RESPONSE")
|
|
if [ "$OK" != "true" ]; then
|
|
echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
|
|
exit 1
|
|
fi
|
|
echo "::notice::Tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..."
|
|
|
|
# Stash the response for the verify step. $RUNNER_TEMP outlasts
|
|
# the step boundary; $HTTP_RESPONSE doesn't.
|
|
cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json"
|
|
|
|
- name: Verify each tenant /buildinfo matches published SHA
|
|
# ROOT FIX FOR #2395.
|
|
#
|
|
# `redeploy-fleet`'s `ssm_status=Success` means "the SSM RPC
|
|
# didn't error" — NOT "the new image is running on the tenant."
|
|
# `:latest` lives in the local Docker daemon's image cache; if
|
|
# the SSM document does `docker compose up -d` without an
|
|
# explicit `docker pull`, the daemon serves the previously-
|
|
# cached digest and the container restarts on stale code.
|
|
# 2026-04-30 incident: hongmingwang's tenant reported
|
|
# ssm_status=Success at 17:00:53Z but kept serving pre-501a42d7
|
|
# chat_files for 30+ min — the lazy-heal fix never reached the
|
|
# user despite green deploy + green redeploy.
|
|
#
|
|
# This step closes the gap by curling each tenant's /buildinfo
|
|
# endpoint (added in workspace-server/internal/buildinfo +
|
|
# /Dockerfile* GIT_SHA build-arg, this PR) and comparing the
|
|
# returned git_sha to the SHA the workflow expects. Mismatches
|
|
# fail the workflow, which is what `ok=true` should have
|
|
# guaranteed all along.
|
|
#
|
|
# When the redeploy was triggered by workflow_dispatch with a
|
|
# specific tag (target_tag != "latest"), the expected SHA may
|
|
# not equal ${{ github.sha }} — in that case we resolve via
|
|
# GHCR's manifest. For workflow_run (default :latest) the
|
|
# workflow_run.head_sha is the SHA that just published.
|
|
env:
|
|
EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
|
|
TARGET_TAG: ${{ steps.tag.outputs.target_tag }}
|
|
# Tenant subdomain template — slugs from the response are
|
|
# appended. Production CP issues `<slug>.moleculesai.app`;
|
|
# staging CP issues `<slug>.staging.moleculesai.app`. This
|
|
# workflow runs on main → prod CP → no `staging.` infix.
|
|
TENANT_DOMAIN: 'moleculesai.app'
|
|
run: |
|
|
set -euo pipefail
|
|
|
|
EXPECTED_SHORT="${EXPECTED_SHA:0:7}"
|
|
if [ "$TARGET_TAG" != "latest" ] \
|
|
&& [ "$TARGET_TAG" != "$EXPECTED_SHA" ] \
|
|
&& [ "$TARGET_TAG" != "staging-$EXPECTED_SHORT" ]; then
|
|
# workflow_dispatch with a pinned tag that isn't the head
|
|
# SHA — operator is rolling back / pinning. Skip the
|
|
# verification because we don't have the expected SHA in
|
|
# this context (would need to crane-inspect the GHCR
|
|
# manifest, which is a follow-up). Failing-open here is
|
|
# safe: the operator chose the tag deliberately.
|
|
#
|
|
# `staging-<short_head_sha>` IS verified — it's the new
|
|
# auto-trigger default (see Compute target tag step) and
|
|
# the digest under that tag SHOULD match EXPECTED_SHA.
|
|
echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification."
|
|
exit 0
|
|
fi
|
|
|
|
RESP="$RUNNER_TEMP/redeploy-response.json"
|
|
if [ ! -s "$RESP" ]; then
|
|
echo "::error::redeploy-response.json missing or empty — verify step ran without a response to read"
|
|
exit 1
|
|
fi
|
|
|
|
# Pull only successfully-redeployed tenants. Any tenant that
|
|
# halted the rollout already failed the previous step, so we
|
|
# don't double-count them here.
|
|
mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP")
|
|
if [ ${#SLUGS[@]} -eq 0 ]; then
|
|
echo "::warning::No tenants reported healthz_ok — nothing to verify"
|
|
exit 0
|
|
fi
|
|
|
|
echo "Verifying ${#SLUGS[@]} tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..."
|
|
|
|
# Two distinct failure modes — STALE (the #2395 bug class, hard-fail)
|
|
# vs UNREACHABLE (teardown race, soft-warn). See the staging variant's
|
|
# comment for the full rationale; same logic applies on prod even
|
|
# though prod has fewer ephemeral tenants — the asymmetry would be a
|
|
# gratuitous fork.
|
|
STALE_COUNT=0
|
|
UNREACHABLE_COUNT=0
|
|
STALE_LINES=()
|
|
UNREACHABLE_LINES=()
|
|
for slug in "${SLUGS[@]}"; do
|
|
URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
|
|
# 30s total: tenant just SSM-restarted, may still be coming
|
|
# up. Retry-on-empty rather than retry-on-status — we want
|
|
# to fail fast on "responded with wrong SHA", not "still
|
|
# warming up".
|
|
BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true)
|
|
ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
|
|
if [ -z "$ACTUAL_SHA" ]; then
|
|
UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
|
|
UNREACHABLE_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ⚠ unreachable (likely teardown race) |")
|
|
continue
|
|
fi
|
|
if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
|
|
echo " $slug: ${ACTUAL_SHA:0:7} ✓"
|
|
else
|
|
STALE_COUNT=$((STALE_COUNT + 1))
|
|
STALE_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |")
|
|
fi
|
|
done
|
|
|
|
{
|
|
echo ""
|
|
echo "### Per-tenant /buildinfo verification"
|
|
echo ""
|
|
echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`"
|
|
echo ""
|
|
if [ $STALE_COUNT -gt 0 ]; then
|
|
echo "**${STALE_COUNT} STALE tenant(s) — these did NOT pick up the new image despite ssm_status=Success:**"
|
|
echo ""
|
|
echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
|
|
echo "|------|----------------------|----------|--------|"
|
|
for line in "${STALE_LINES[@]}"; do echo "$line"; done
|
|
echo ""
|
|
fi
|
|
if [ $UNREACHABLE_COUNT -gt 0 ]; then
|
|
echo "**${UNREACHABLE_COUNT} unreachable tenant(s) — likely teardown race (soft-warn, not failing):**"
|
|
echo ""
|
|
echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
|
|
echo "|------|----------------------|----------|--------|"
|
|
for line in "${UNREACHABLE_LINES[@]}"; do echo "$line"; done
|
|
echo ""
|
|
fi
|
|
if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then
|
|
echo "All ${#SLUGS[@]} tenants returned matching SHA. ✓"
|
|
fi
|
|
} >> "$GITHUB_STEP_SUMMARY"
|
|
|
|
if [ $UNREACHABLE_COUNT -gt 0 ]; then
|
|
echo "::warning::$UNREACHABLE_COUNT tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
|
|
fi
|
|
|
|
# Belt-and-suspenders sanity floor: same logic as the staging
|
|
# variant — see that file's comment for the full rationale.
|
|
# Floor only applies when fleet >= 4; below that, canary-verify
|
|
# is the actual gate.
|
|
TOTAL_VERIFIED=${#SLUGS[@]}
|
|
if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
|
|
echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
|
|
exit 1
|
|
fi
|
|
|
|
if [ $STALE_COUNT -gt 0 ]; then
|
|
echo "::error::$STALE_COUNT tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
|
|
exit 1
|
|
fi
|
|
|
|
echo "::notice::Tenant fleet redeploy complete — all reachable tenants on ${EXPECTED_SHA:0:7} (${UNREACHABLE_COUNT} unreachable, soft-warned)."
|