forked from molecule-ai/molecule-core
Parity with #2337's redeploy-tenants-on-staging.yml. Both prod and staging redeploys now have explicit serialization: group: redeploy-tenants-on-main (per-workflow, global) group: redeploy-tenants-on-staging (per-workflow, global) cancel-in-progress: false on both — aborting a half-rolled-out fleet would leave tenants stuck on whatever image they happened to be on when cancelled. Better to finish the in-flight rollout before starting the next one. Pre-fix this workflow relied on GitHub's implicit workflow_run queueing, which is "probably fine" but not defensible — explicit > implicit for load-bearing pipeline behavior. Picked up as a #2337 review nit (architecture finding 1: concurrency asymmetry between the two redeploy workflows). No behavior change in the common case. The change matters only when two main pushes land within seconds AND the first redeploy is still mid-rollout — currently rare; will become more common once #2335 (staging-trigger publish) feeds main more frequently via auto-promote.
179 lines
7.2 KiB
YAML
name: redeploy-tenants-on-main

# Auto-refresh prod tenant EC2s after every main merge.
#
# Why this workflow exists: publish-workspace-server-image builds and
# pushes a new platform-tenant:latest + :<sha> to GHCR on every merge
# to main, but running tenants pulled their image once at boot and
# never re-pull. Users see stale code indefinitely.
#
# This workflow closes the gap by calling the control-plane admin
# endpoint that performs a canary-first, batched, health-gated rolling
# redeploy across every live tenant. Implemented in Molecule-AI/
# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
# (feat/tenant-auto-redeploy, landing alongside this workflow).
#
# Runtime ordering:
#   1. publish-workspace-server-image completes → new :latest in GHCR.
#   2. This workflow fires via workflow_run, waits 30s for GHCR's
#      CDN to propagate the new tag to the region the tenants pull from.
#   3. Calls redeploy-fleet with canary_slug=hongmingwang and a 60s
#      soak. Canary proves the image boots; batches follow.
#   4. Any failure aborts the rollout and leaves older tenants on the
#      prior image — safer default than half-and-half state.
#
# Rollback path: re-run this workflow with a specific SHA pinned via
# the workflow_dispatch input. That calls redeploy-fleet with
# target_tag=<sha>, re-pulling the older image on every tenant.

on:
  workflow_run:
    workflows: ['publish-workspace-server-image']
    types: [completed]
    branches: [main]
  workflow_dispatch:
    inputs:
      target_tag:
        description: 'Tenant image tag to deploy (e.g. "latest" or "a59f1a6c"). Defaults to latest when empty.'
        required: false
        type: string
        default: 'latest'
      canary_slug:
        description: 'Tenant slug to deploy first + soak (empty = skip canary, fan out immediately).'
        required: false
        type: string
        default: 'hongmingwang'
      soak_seconds:
        description: 'Seconds to wait after canary before fanning out.'
        required: false
        type: string
        default: '60'
      batch_size:
        description: 'How many tenants SSM redeploys in parallel per batch.'
        required: false
        type: string
        default: '3'
      dry_run:
        description: 'Plan only — do not actually redeploy.'
        required: false
        type: boolean
        default: false

permissions:
  contents: read
  # No write scopes needed — the workflow hits an external CP endpoint,
  # not the GitHub API.

# Serialize redeploys so two rapid main pushes' redeploys don't overlap
# and cause confusing per-tenant SSM state. Without this, GitHub's
# implicit workflow_run queueing would *probably* serialize them, but
# the explicit block makes the invariant defensible. Mirrors the
# concurrency block on redeploy-tenants-on-staging.yml for shape parity.
#
# cancel-in-progress: false → aborting a half-rolled-out fleet would
# leave tenants stuck on whatever image they happened to be on when
# cancelled. Better to finish the in-flight rollout before starting
# the next one.
concurrency:
  group: redeploy-tenants-on-main
  cancel-in-progress: false

jobs:
  redeploy:
    # Skip the auto-trigger if publish-workspace-server-image didn't
    # actually succeed. workflow_run fires on any completion state; we
    # don't want to redeploy against a half-built image.
    if: |
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
    runs-on: ubuntu-latest
    timeout-minutes: 25
    steps:
      - name: Wait for GHCR tag propagation
        # GHCR's edge cache takes ~15-30s to consistently serve the new
        # :latest manifest after the registry accepts the push. Without
        # this sleep, the first tenant's docker pull sometimes races
        # and fetches the previous digest; sleeping is the cheapest
        # way to reduce that without polling GHCR for the new digest.
        run: sleep 30

      - name: Call CP redeploy-fleet
        # CP_ADMIN_API_TOKEN must be set as a repo/org secret on
        # Molecule-AI/molecule-core, matching the staging/prod CP's
        # CP_ADMIN_API_TOKEN env. Stored in Railway, mirrored to this
        # repo's secrets for CI.
        env:
          CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }}
          CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
          TARGET_TAG: ${{ inputs.target_tag || 'latest' }}
          CANARY_SLUG: ${{ inputs.canary_slug || 'hongmingwang' }}
          SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
          BATCH_SIZE: ${{ inputs.batch_size || '3' }}
          DRY_RUN: ${{ inputs.dry_run || false }}
        run: |
          set -euo pipefail

          if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
            echo "::error::CP_ADMIN_API_TOKEN secret not set — skipping redeploy"
            echo "::notice::Set CP_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
            exit 1
          fi

          # Build the request body with jq so tags/slugs are JSON-escaped
          # safely; soak/batch/dry are passed as raw JSON (numbers/bool).
          BODY=$(jq -nc \
            --arg tag "$TARGET_TAG" \
            --arg canary "$CANARY_SLUG" \
            --argjson soak "$SOAK_SECONDS" \
            --argjson batch "$BATCH_SIZE" \
            --argjson dry "$DRY_RUN" \
            '{
              target_tag: $tag,
              canary_slug: $canary,
              soak_seconds: $soak,
              batch_size: $batch,
              dry_run: $dry
            }')

          echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
          echo " body: $BODY"

          HTTP_RESPONSE=$(mktemp)
          # Assign-then-fallback rather than `|| echo "000"` inside the
          # command substitution: curl's -w prints its own "000" on most
          # failures, so the old form could capture "000" twice and set
          # HTTP_CODE to "000000". This form always yields exactly "000"
          # on transport failure and keeps `set -e` from aborting.
          HTTP_CODE=$(curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
            -m 1200 \
            -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
            -H "Content-Type: application/json" \
            -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
            -d "$BODY") || HTTP_CODE="000"

          echo "HTTP $HTTP_CODE"
          # Pretty-print when the body is JSON; fall back to raw output
          # (e.g. an HTML error page from a proxy) without failing the step.
          jq . "$HTTP_RESPONSE" || cat "$HTTP_RESPONSE"

          # Pretty-print per-tenant results in the job summary so
          # ops can see which tenants were redeployed without drilling
          # into the raw response.
          {
            echo "## Tenant redeploy fleet"
            echo ""
            echo "**Target tag:** \`$TARGET_TAG\`"
            echo "**Canary:** \`$CANARY_SLUG\` (soak ${SOAK_SECONDS}s)"
            echo "**Batch size:** $BATCH_SIZE"
            echo "**Dry run:** $DRY_RUN"
            echo "**HTTP:** $HTTP_CODE"
            echo ""
            echo "### Per-tenant result"
            echo ""
            echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
            echo '|------|-------|------------|------|---------|-------|'
            jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
          } >> "$GITHUB_STEP_SUMMARY"

          if [ "$HTTP_CODE" != "200" ]; then
            echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
            exit 1
          fi
          OK=$(jq -r '.ok' "$HTTP_RESPONSE")
          if [ "$OK" != "true" ]; then
            echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
            exit 1
          fi
          echo "::notice::Tenant fleet redeploy complete."