diff --git a/.github/workflows/canary-verify.yml b/.github/workflows/canary-verify.yml deleted file mode 100644 index f97bdc463..000000000 --- a/.github/workflows/canary-verify.yml +++ /dev/null @@ -1,255 +0,0 @@ -name: canary-verify - -# Runs the canary smoke suite against the staging canary tenant fleet -# after a new :staging- image lands in ECR. On green, calls the -# CP redeploy-fleet endpoint to promote :staging- → :latest so -# the prod tenant fleet's 5-minute auto-updater picks up the verified -# digest. On red, :latest stays on the prior known-good digest and -# prod is untouched. -# -# Registry note (2026-05-10): This workflow previously used GHCR -# (ghcr.io/molecule-ai/platform-tenant) — that registry was retired -# during the 2026-05-06 Gitea suspension migration when publish- -# workspace-server-image.yml switched to the operator's ECR org -# (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/ -# platform-tenant). The GHCR → ECR migration was never applied to -# this file, so canary-verify was silently smoke-testing the stale -# GHCR image while the actual staging/prod tenants ran the ECR image. -# Result: smoke tests could not catch a broken ECR build. Fix: -# - Wait step: reads SHA from running canary /health (tenant- -# agnostic, works regardless of registry). -# - Promote step: calls CP redeploy-fleet endpoint with target_tag= -# staging-, same mechanism as redeploy-tenants-on-main.yml. -# No longer attempts GHCR crane ops. -# -# Dependencies: -# - publish-workspace-server-image.yml publishes :staging- -# to ECR on staging and main merges. -# - Canary tenants are configured to pull :staging- from ECR -# (TENANT_IMAGE env set to the ECR :staging- tag). -# - Repo secrets CANARY_TENANT_URLS / CANARY_ADMIN_TOKENS / -# CANARY_CP_SHARED_SECRET are populated. 
- -on: - workflow_run: - workflows: ["publish-workspace-server-image"] - types: [completed] - workflow_dispatch: - -permissions: - contents: read - packages: write - actions: read - -env: - # ECR registry (post-2026-05-06 SSOT for tenant images). - # publish-workspace-server-image.yml pushes here. - IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform - TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant - # CP endpoint for redeploy-fleet (used in promote step below). - CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }} - -jobs: - canary-smoke: - # Skip when the upstream workflow failed — no image to test against. - if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }} - runs-on: ubuntu-latest - outputs: - sha: ${{ steps.compute.outputs.sha }} - smoke_ran: ${{ steps.smoke.outputs.ran }} - steps: - - name: Checkout - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - - name: Compute sha - id: compute - run: echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT" - - - name: Wait for canary tenants to pick up :staging- - # Poll canary health endpoints every 30s for up to 7 min instead - # of a fixed 6-min sleep. Exits as soon as ALL canaries report - # the new SHA (~2-3 min typical vs 6 min fixed). Falls back to - # proceeding after 7 min even if not all canaries responded — - # the smoke suite will catch any that didn't update. - # - # NOTE: The SHA is read from the running tenant's /health response, - # NOT from a registry lookup. This is registry-agnostic and works - # regardless of whether the tenant pulls from ECR, GHCR, or any - # other registry — the canary is telling us what it's actually - # running, which is the ground truth for smoke testing. 
- env: - CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }} - EXPECTED_SHA: ${{ steps.compute.outputs.sha }} - run: | - if [ -z "$CANARY_TENANT_URLS" ]; then - echo "No canary URLs configured — falling back to 60s wait" - sleep 60 - exit 0 - fi - IFS=',' read -ra URLS <<< "$CANARY_TENANT_URLS" - MAX_WAIT=420 # 7 minutes - INTERVAL=30 - ELAPSED=0 - while [ $ELAPSED -lt $MAX_WAIT ]; do - ALL_READY=true - for url in "${URLS[@]}"; do - HEALTH=$(curl -s --max-time 5 "${url}/health" 2>/dev/null || echo "{}") - SHA=$(echo "$HEALTH" | grep -o "\"sha\":\"[^\"]*\"" | head -1 | cut -d'"' -f4) - if [ "$SHA" != "$EXPECTED_SHA" ]; then - ALL_READY=false - break - fi - done - if $ALL_READY; then - echo "All canaries running staging-${EXPECTED_SHA} after ${ELAPSED}s" - exit 0 - fi - echo "Waiting for canaries... (${ELAPSED}s / ${MAX_WAIT}s)" - sleep $INTERVAL - ELAPSED=$((ELAPSED + INTERVAL)) - done - echo "Timeout after ${MAX_WAIT}s — proceeding anyway (smoke suite will validate)" - - - name: Run canary smoke suite - id: smoke - # Graceful-skip when no canary fleet is configured (Phase 2 not yet - # stood up — see molecule-controlplane/docs/canary-tenants.md). - # Sets `ran=false` on skip so promote-to-latest stays off (we don't - # want every main merge auto-promoting without gating). Manual - # promote-latest.yml is the release gate while canary is absent. - # Once the fleet is real: delete the early-exit branch. 
- env: - CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }} - CANARY_ADMIN_TOKENS: ${{ secrets.CANARY_ADMIN_TOKENS }} - CANARY_CP_BASE_URL: https://staging-api.moleculesai.app - CANARY_CP_SHARED_SECRET: ${{ secrets.CANARY_CP_SHARED_SECRET }} - run: | - set -euo pipefail - if [ -z "${CANARY_TENANT_URLS:-}" ] \ - || [ -z "${CANARY_ADMIN_TOKENS:-}" ] \ - || [ -z "${CANARY_CP_SHARED_SECRET:-}" ]; then - { - echo "## ⚠️ canary-verify skipped" - echo - echo "One or more canary secrets are unset (\`CANARY_TENANT_URLS\`, \`CANARY_ADMIN_TOKENS\`, \`CANARY_CP_SHARED_SECRET\`)." - echo "Phase 2 canary fleet has not been stood up yet —" - echo "see [canary-tenants.md](https://git.moleculesai.app/molecule-ai/molecule-controlplane/blob/main/docs/canary-tenants.md)." - echo - echo "**Skipped — promote-to-latest will NOT auto-fire.** Dispatch \`promote-latest.yml\` manually when ready." - } >> "$GITHUB_STEP_SUMMARY" - echo "ran=false" >> "$GITHUB_OUTPUT" - echo "::notice::canary-verify: skipped — no canary fleet configured" - exit 0 - fi - bash scripts/canary-smoke.sh - echo "ran=true" >> "$GITHUB_OUTPUT" - - - name: Summary on failure - if: ${{ failure() }} - run: | - { - echo "## Canary smoke FAILED" - echo - echo "Canary tenants rejected image \`staging-${{ steps.compute.outputs.sha }}\`." - echo ":latest stays pinned to the prior good digest — prod is untouched." - echo - echo "Fix forward and merge again, or investigate the specific failed" - echo "assertions in the canary-smoke step log above." - } >> "$GITHUB_STEP_SUMMARY" - - promote-to-latest: - # On green, calls the CP redeploy-fleet endpoint with target_tag= - # staging- to promote the verified ECR image. This is the same - # mechanism as redeploy-tenants-on-main.yml — no GHCR crane ops. - # - # Pre-fix history: the old GHCR promote step used `crane tag` against - # ghcr.io/molecule-ai/platform-tenant, but publish-workspace-server- - # image.yml had already migrated to ECR on 2026-05-07 (commit - # 10e510f5). 
The GHCR tags were never updated, so this step was - # silently promoting a stale GHCR image while actual prod tenants - # pulled from ECR. Canary smoke tests were GHCR-targeted and could - # not catch a broken ECR build. - needs: canary-smoke - if: ${{ needs.canary-smoke.result == 'success' && needs.canary-smoke.outputs.smoke_ran == 'true' }} - runs-on: ubuntu-latest - env: - SHA: ${{ needs.canary-smoke.outputs.sha }} - CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }} - # CP_ADMIN_API_TOKEN gates write access to the redeploy endpoint. - # Stored at the repo level so all workflows pick it up automatically. - CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }} - # canary_slug pin: deploy the verified :staging- to the canary - # first (soak 120s), then fan out to the rest of the fleet. - CANARY_SLUG: ${{ vars.CANARY_PROMOTE_SLUG || '' }} - SOAK_SECONDS: ${{ vars.CANARY_PROMOTE_SOAK || '120' }} - BATCH_SIZE: ${{ vars.CANARY_PROMOTE_BATCH || '3' }} - steps: - - name: Check CP credentials - run: | - if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then - echo "::error::CP_ADMIN_API_TOKEN secret is not set — promote step cannot call redeploy-fleet." - echo "::error::Set it at: repo Settings → Actions → Variables and Secrets → New Secret." - exit 1 - fi - - - name: Promote verified ECR image to :latest - run: | - set -euo pipefail - - TARGET_TAG="staging-${SHA}" - BODY=$(jq -nc \ - --arg tag "$TARGET_TAG" \ - --argjson soak "${SOAK_SECONDS:-120}" \ - --argjson batch "${BATCH_SIZE:-3}" \ - --argjson dry false \ - '{ - target_tag: $tag, - soak_seconds: $soak, - batch_size: $batch, - dry_run: $dry - }') - - if [ -n "${CANARY_SLUG:-}" ]; then - BODY=$(jq '. 
* {canary_slug: $slug}' --arg slug "$CANARY_SLUG" <<<"$BODY") - fi - - echo "Calling: POST $CP_URL/cp/admin/tenants/redeploy-fleet" - echo " target_tag: $TARGET_TAG" - echo " body: $BODY" - - HTTP_RESPONSE=$(mktemp) - HTTP_CODE_FILE=$(mktemp) - set +e - curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \ - -m 1200 \ - -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \ - -H "Content-Type: application/json" \ - -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \ - -d "$BODY" >"$HTTP_CODE_FILE" - CURL_EXIT=$? - set -e - - HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000") - [ -z "$HTTP_CODE" ] && HTTP_CODE="000" - - echo "HTTP $HTTP_CODE (curl exit $CURL_EXIT)" - cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE" - - if [ "$HTTP_CODE" -ge 400 ]; then - echo "::error::CP redeploy-fleet returned HTTP $HTTP_CODE — refusing to proceed." - exit 1 - fi - - - name: Summary - run: | - { - echo "## Canary verified — :latest promoted via CP redeploy-fleet" - echo "" - echo "- **Target tag:** \`staging-${{ needs.canary-smoke.outputs.sha }}\`" - echo "- **Registry:** ECR (\`${TENANT_IMAGE_NAME}\`)" - echo "- **Canary slug:** \`${CANARY_SLUG:-}\` (soak ${SOAK_SECONDS}s)" - echo "- **Batch size:** ${BATCH_SIZE:-3}" - echo "" - echo "CP redeploy-fleet is rolling out the verified image across the prod fleet." - echo "The fleet's 5-minute health-check loop will pick up the update automatically." - } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/redeploy-tenants-on-main.yml b/.github/workflows/redeploy-tenants-on-main.yml deleted file mode 100644 index 786da188f..000000000 --- a/.github/workflows/redeploy-tenants-on-main.yml +++ /dev/null @@ -1,400 +0,0 @@ -name: redeploy-tenants-on-main - -# Auto-refresh prod tenant EC2s after every main merge. -# -# Why this workflow exists: publish-workspace-server-image builds and -# pushes a new platform-tenant : to ECR on every merge to main, -# but running tenants pulled their image once at boot and never re-pull. 
-# Users see stale code indefinitely. -# -# This workflow closes the gap by calling the control-plane admin -# endpoint that performs a canary-first, batched, health-gated rolling -# redeploy across every live tenant. Implemented in molecule-ai/ -# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet -# (feat/tenant-auto-redeploy, landing alongside this workflow). -# -# Registry: ECR (153263036946.dkr.ecr.us-east-2.amazonaws.com/ -# molecule-ai/platform-tenant). GHCR was retired 2026-05-07 during the -# Gitea suspension migration. The canary-verify.yml promote step now -# uses the same redeploy-fleet endpoint (fixes the silent-GHCR gap). -# -# Runtime ordering: -# 1. publish-workspace-server-image completes → new :staging- in ECR. -# 2. This workflow fires via workflow_run, calls redeploy-fleet with -# target_tag=staging-. No CDN propagation wait needed — -# ECR image manifest is consistent immediately after push. -# 3. Calls redeploy-fleet with canary_slug (if set) and a soak -# period. Canary proves the image boots; batches follow. -# 4. Any failure aborts the rollout and leaves older tenants on the -# prior image — safer default than half-and-half state. -# -# Rollback path: re-run this workflow with a specific SHA pinned via -# the workflow_dispatch input. That calls redeploy-fleet with -# target_tag=, re-pulling the older image on every tenant. - -on: - workflow_run: - workflows: ['publish-workspace-server-image'] - types: [completed] - branches: [main] - workflow_dispatch: - inputs: - target_tag: - # Empty default → auto-trigger and dispatch-without-input both - # resolve to `staging-` (the digest publish-image - # just pushed). Pre-fix this defaulted to 'latest', which only - # gets retagged by canary-verify's promote-to-latest job — and - # that job soft-skips when CANARY_TENANT_URLS is unset (the - # current state, until Phase 2 canary fleet is live). 
Result: - # `:latest` had been pinned to a 4-day-old digest (2026-04-28) - # while every main push pushed fresh `staging-` images; - # every prod redeploy pulled the stale `:latest` and the verify - # step correctly flagged 3/3 tenants STALE. Pulling the - # just-published `staging-` directly skips the dead retag - # path. When canary fleet is real, this workflow should chain - # on canary-verify completion (workflow_run from canary-verify), - # not publish-image — separate, smaller PR. - description: 'Tenant image tag to deploy (e.g. "latest", "staging-a59f1a6c"). Empty = auto staging-.' - required: false - type: string - default: '' - canary_slug: - description: 'Tenant slug to deploy first + soak (empty = skip canary, fan out immediately).' - required: false - type: string - # Must be an actual prod tenant slug (current: hongming, - # chloe-dong, reno-stars). The previous default 'hongmingwang' - # didn't match any tenant — CP soft-skipped the missing canary - # and the fleet rolled out without the soak gate, defeating the - # whole point of canary-first. - default: 'hongming' - soak_seconds: - description: 'Seconds to wait after canary before fanning out.' - required: false - type: string - default: '60' - batch_size: - description: 'How many tenants SSM redeploys in parallel per batch.' - required: false - type: string - default: '3' - dry_run: - description: 'Plan only — do not actually redeploy.' - required: false - type: boolean - default: false - -permissions: - contents: read - # No write scopes needed — the workflow hits an external CP endpoint, - # not the GitHub API. - -# Serialize redeploys so two rapid main pushes' redeploys don't overlap -# and cause confusing per-tenant SSM state. Without this, GitHub's -# implicit workflow_run queueing would *probably* serialize them, but -# the explicit block makes the invariant defensible. Mirrors the -# concurrency block on redeploy-tenants-on-staging.yml for shape parity. 
-# -# cancel-in-progress: false → aborting a half-rolled-out fleet would -# leave tenants stuck on whatever image they happened to be on when -# cancelled. Better to finish the in-flight rollout before starting -# the next one. -concurrency: - group: redeploy-tenants-on-main - cancel-in-progress: false - -jobs: - redeploy: - # Skip the auto-trigger if publish-workspace-server-image didn't - # actually succeed. workflow_run fires on any completion state; we - # don't want to redeploy against a half-built image. - if: | - github.event_name == 'workflow_dispatch' || - (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success') - runs-on: ubuntu-latest - timeout-minutes: 25 - steps: - - name: Note on ECR propagation - # ECR image manifests are consistent immediately after push — no - # CDN cache to wait for. The old GHCR-based workflow had a 30s - # sleep to avoid race conditions; ECR makes that unnecessary. - run: echo "ECR image available immediately after push — proceeding." - - - name: Compute target tag - id: tag - # Resolution order: - # 1. Operator-supplied input (workflow_dispatch with explicit - # tag) → used verbatim. Lets ops pin `latest` for emergency - # rollback to last canary-verified digest, or pin a specific - # `staging-` to roll back to a known-good build. - # 2. Default → `staging-`. The just-published - # digest. Bypasses the `:latest` retag path that's currently - # dead (canary-verify soft-skips without canary fleet, so - # the only thing retagging `:latest` today is the manual - # promote-latest.yml — last run 2026-04-28). Auto-trigger - # from workflow_run uses workflow_run.head_sha; manual - # dispatch with no input falls through to github.sha. 
- env: - INPUT_TAG: ${{ inputs.target_tag }} - HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }} - run: | - set -euo pipefail - if [ -n "${INPUT_TAG:-}" ]; then - echo "target_tag=$INPUT_TAG" >> "$GITHUB_OUTPUT" - echo "Using operator-pinned tag: $INPUT_TAG" - else - SHORT="${HEAD_SHA:0:7}" - echo "target_tag=staging-$SHORT" >> "$GITHUB_OUTPUT" - echo "Using auto tag: staging-$SHORT (head_sha=$HEAD_SHA)" - fi - - - name: Call CP redeploy-fleet - # CP_ADMIN_API_TOKEN must be set as a repo/org secret on - # molecule-ai/molecule-core, matching the staging/prod CP's - # CP_ADMIN_API_TOKEN env. Stored in Railway, mirrored to this - # repo's secrets for CI. - env: - CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }} - CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }} - TARGET_TAG: ${{ steps.tag.outputs.target_tag }} - CANARY_SLUG: ${{ inputs.canary_slug || 'hongming' }} - SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }} - BATCH_SIZE: ${{ inputs.batch_size || '3' }} - DRY_RUN: ${{ inputs.dry_run || false }} - run: | - set -euo pipefail - - if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then - echo "::error::CP_ADMIN_API_TOKEN secret not set — skipping redeploy" - echo "::notice::Set CP_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy." - exit 1 - fi - - BODY=$(jq -nc \ - --arg tag "$TARGET_TAG" \ - --arg canary "$CANARY_SLUG" \ - --argjson soak "$SOAK_SECONDS" \ - --argjson batch "$BATCH_SIZE" \ - --argjson dry "$DRY_RUN" \ - '{ - target_tag: $tag, - canary_slug: $canary, - soak_seconds: $soak, - batch_size: $batch, - dry_run: $dry - }') - - echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet" - echo " body: $BODY" - - HTTP_RESPONSE=$(mktemp) - HTTP_CODE_FILE=$(mktemp) - # Route -w into its own tempfile so curl's exit code (e.g. 56 - # on connection-reset, 22 on --fail-with-body 4xx/5xx) can't - # pollute the captured stdout. 
The previous inline-substitution - # shape produced "000000" on connection reset (curl wrote - # "000" via -w, then the inline echo-fallback appended another - # "000") — caught on the 2026-05-04 redeploy of sha 2b862f6. - # set +e/-e keeps the non-zero curl exit from tripping the - # outer pipeline. See lint-curl-status-capture.yml for the - # CI gate that pins this fix shape. - set +e - curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \ - -m 1200 \ - -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \ - -H "Content-Type: application/json" \ - -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \ - -d "$BODY" >"$HTTP_CODE_FILE" - set -e - # Stderr from curl (e.g. dial errors with -sS) goes to the runner - # log so operators can see WHY a connection failed. Stdout is - # captured to $HTTP_CODE_FILE because that's where -w writes. - HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000") - [ -z "$HTTP_CODE" ] && HTTP_CODE="000" - - echo "HTTP $HTTP_CODE" - cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE" - - # Pretty-print per-tenant results in the job summary so - # ops can see which tenants were redeployed without drilling - # into the raw response. - { - echo "## Tenant redeploy fleet" - echo "" - echo "**Target tag:** \`$TARGET_TAG\`" - echo "**Canary:** \`$CANARY_SLUG\` (soak ${SOAK_SECONDS}s)" - echo "**Batch size:** $BATCH_SIZE" - echo "**Dry run:** $DRY_RUN" - echo "**HTTP:** $HTTP_CODE" - echo "" - echo "### Per-tenant result" - echo "" - echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |' - echo '|------|-------|------------|------|---------|-------|' - jq -r '.results[]? 
| "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true - } >> "$GITHUB_STEP_SUMMARY" - - if [ "$HTTP_CODE" != "200" ]; then - echo "::error::redeploy-fleet returned HTTP $HTTP_CODE" - exit 1 - fi - OK=$(jq -r '.ok' "$HTTP_RESPONSE") - if [ "$OK" != "true" ]; then - echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)" - exit 1 - fi - echo "::notice::Tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..." - - # Stash the response for the verify step. $RUNNER_TEMP outlasts - # the step boundary; $HTTP_RESPONSE doesn't. - cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json" - - - name: Verify each tenant /buildinfo matches published SHA - # ROOT FIX FOR #2395. - # - # `redeploy-fleet`'s `ssm_status=Success` means "the SSM RPC - # didn't error" — NOT "the new image is running on the tenant." - # `:latest` lives in the local Docker daemon's image cache; if - # the SSM document does `docker compose up -d` without an - # explicit `docker pull`, the daemon serves the previously- - # cached digest and the container restarts on stale code. - # 2026-04-30 incident: hongmingwang's tenant reported - # ssm_status=Success at 17:00:53Z but kept serving pre-501a42d7 - # chat_files for 30+ min — the lazy-heal fix never reached the - # user despite green deploy + green redeploy. - # - # This step closes the gap by curling each tenant's /buildinfo - # endpoint (added in workspace-server/internal/buildinfo + - # /Dockerfile* GIT_SHA build-arg, this PR) and comparing the - # returned git_sha to the SHA the workflow expects. Mismatches - # fail the workflow, which is what `ok=true` should have - # guaranteed all along. 
- # - # When the redeploy was triggered by workflow_dispatch with a - # specific tag (target_tag != "latest"), the expected SHA may - # not equal ${{ github.sha }} — in that case we resolve via - # GHCR's manifest. For workflow_run (default :latest) the - # workflow_run.head_sha is the SHA that just published. - env: - EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }} - TARGET_TAG: ${{ steps.tag.outputs.target_tag }} - # Tenant subdomain template — slugs from the response are - # appended. Production CP issues `.moleculesai.app`; - # staging CP issues `.staging.moleculesai.app`. This - # workflow runs on main → prod CP → no `staging.` infix. - TENANT_DOMAIN: 'moleculesai.app' - run: | - set -euo pipefail - - EXPECTED_SHORT="${EXPECTED_SHA:0:7}" - if [ "$TARGET_TAG" != "latest" ] \ - && [ "$TARGET_TAG" != "$EXPECTED_SHA" ] \ - && [ "$TARGET_TAG" != "staging-$EXPECTED_SHORT" ]; then - # workflow_dispatch with a pinned tag that isn't the head - # SHA — operator is rolling back / pinning. Skip the - # verification because we don't have the expected SHA in - # this context (would need to crane-inspect the GHCR - # manifest, which is a follow-up). Failing-open here is - # safe: the operator chose the tag deliberately. - # - # `staging-` IS verified — it's the new - # auto-trigger default (see Compute target tag step) and - # the digest under that tag SHOULD match EXPECTED_SHA. - echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification." - exit 0 - fi - - RESP="$RUNNER_TEMP/redeploy-response.json" - if [ ! -s "$RESP" ]; then - echo "::error::redeploy-response.json missing or empty — verify step ran without a response to read" - exit 1 - fi - - # Pull only successfully-redeployed tenants. Any tenant that - # halted the rollout already failed the previous step, so we - # don't double-count them here. - mapfile -t SLUGS < <(jq -r '.results[]? 
| select(.healthz_ok == true) | .slug' "$RESP") - if [ ${#SLUGS[@]} -eq 0 ]; then - echo "::warning::No tenants reported healthz_ok — nothing to verify" - exit 0 - fi - - echo "Verifying ${#SLUGS[@]} tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..." - - # Two distinct failure modes — STALE (the #2395 bug class, hard-fail) - # vs UNREACHABLE (teardown race, soft-warn). See the staging variant's - # comment for the full rationale; same logic applies on prod even - # though prod has fewer ephemeral tenants — the asymmetry would be a - # gratuitous fork. - STALE_COUNT=0 - UNREACHABLE_COUNT=0 - STALE_LINES=() - UNREACHABLE_LINES=() - for slug in "${SLUGS[@]}"; do - URL="https://${slug}.${TENANT_DOMAIN}/buildinfo" - # 30s total: tenant just SSM-restarted, may still be coming - # up. Retry-on-empty rather than retry-on-status — we want - # to fail fast on "responded with wrong SHA", not "still - # warming up". - BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true) - ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "") - if [ -z "$ACTUAL_SHA" ]; then - UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1)) - UNREACHABLE_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ⚠ unreachable (likely teardown race) |") - continue - fi - if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then - echo " $slug: ${ACTUAL_SHA:0:7} ✓" - else - STALE_COUNT=$((STALE_COUNT + 1)) - STALE_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |") - fi - done - - { - echo "" - echo "### Per-tenant /buildinfo verification" - echo "" - echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`" - echo "" - if [ $STALE_COUNT -gt 0 ]; then - echo "**${STALE_COUNT} STALE tenant(s) — these did NOT pick up the new image despite ssm_status=Success:**" - echo "" - echo "| Slug | Actual /buildinfo SHA | Expected | Status |" - echo "|------|----------------------|----------|--------|" - for line in "${STALE_LINES[@]}"; do echo "$line"; done - 
echo "" - fi - if [ $UNREACHABLE_COUNT -gt 0 ]; then - echo "**${UNREACHABLE_COUNT} unreachable tenant(s) — likely teardown race (soft-warn, not failing):**" - echo "" - echo "| Slug | Actual /buildinfo SHA | Expected | Status |" - echo "|------|----------------------|----------|--------|" - for line in "${UNREACHABLE_LINES[@]}"; do echo "$line"; done - echo "" - fi - if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then - echo "All ${#SLUGS[@]} tenants returned matching SHA. ✓" - fi - } >> "$GITHUB_STEP_SUMMARY" - - if [ $UNREACHABLE_COUNT -gt 0 ]; then - echo "::warning::$UNREACHABLE_COUNT tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages." - fi - - # Belt-and-suspenders sanity floor: same logic as the staging - # variant — see that file's comment for the full rationale. - # Floor only applies when fleet >= 4; below that, canary-verify - # is the actual gate. - TOTAL_VERIFIED=${#SLUGS[@]} - if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then - echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race." - exit 1 - fi - - if [ $STALE_COUNT -gt 0 ]; then - echo "::error::$STALE_COUNT tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary." - exit 1 - fi - - echo "::notice::Tenant fleet redeploy complete — all reachable tenants on ${EXPECTED_SHA:0:7} (${UNREACHABLE_COUNT} unreachable, soft-warned)." diff --git a/.github/workflows/redeploy-tenants-on-staging.yml b/.github/workflows/redeploy-tenants-on-staging.yml deleted file mode 100644 index 695f66432..000000000 --- a/.github/workflows/redeploy-tenants-on-staging.yml +++ /dev/null @@ -1,362 +0,0 @@ -name: redeploy-tenants-on-staging - -# Auto-refresh staging tenant EC2s after every staging-branch merge. 
-# -# Mirror of redeploy-tenants-on-main.yml, with the staging-CP host and -# the :staging-latest tag. Sister workflow exists for prod (rolls -# :latest after canary-verify). Both share the same shape — just -# different CP_URL + target_tag + admin token secret. -# -# Why this workflow exists: publish-workspace-server-image now builds -# on every staging-branch push (PR #2335), pushing -# platform-tenant:staging-latest to GHCR. Existing tenants pulled -# their image once at boot and never re-pull, so the new image just -# sits unused until the tenant is reprovisioned. -# -# This workflow closes the gap by calling staging-CP's -# /cp/admin/tenants/redeploy-fleet, which performs a canary-first, -# batched, health-gated SSM redeploy across every live staging tenant. -# Same endpoint shape as prod CP — only the host differs. -# -# Runtime ordering: -# 1. publish-workspace-server-image completes on staging branch → -# new :staging-latest in GHCR. -# 2. This workflow fires via workflow_run, waits 30s for GHCR's CDN -# to propagate the new tag. -# 3. Calls redeploy-fleet with no canary (staging IS canary; we don't -# need a sub-canary inside it). Soak still applies to the first -# tenant in case of bad-deploy detection. -# 4. Any failure aborts the rollout and leaves older tenants on the -# prior image — safer default than half-and-half state. -# -# Rollback path: re-run with workflow_dispatch + target_tag=staging- -# of a known-good build. - -on: - workflow_run: - workflows: ['publish-workspace-server-image'] - types: [completed] - branches: [main] - workflow_dispatch: - inputs: - target_tag: - description: 'Tenant image tag to deploy (e.g. "staging-latest" or "staging-a59f1a6c"). Defaults to staging-latest when empty.' - required: false - type: string - default: 'staging-latest' - canary_slug: - description: 'Tenant slug to deploy first + soak (empty = skip canary, fan out immediately). Default empty for staging since staging itself is the canary.' 
-        required: false
-        type: string
-        default: ''
-      soak_seconds:
-        description: 'Seconds to wait after canary before fanning out. Only meaningful if canary_slug is set.'
-        required: false
-        type: string
-        default: '60'
-      batch_size:
-        description: 'How many tenants SSM redeploys in parallel per batch.'
-        required: false
-        type: string
-        default: '3'
-      dry_run:
-        description: 'Plan only — do not actually redeploy.'
-        required: false
-        type: boolean
-        default: false
-
-permissions:
-  contents: read
-  # No write scopes needed — the workflow hits an external CP endpoint,
-  # not the GitHub API.
-
-# Serialize per-branch so two rapid staging pushes' redeploys don't
-# overlap and cause confusing per-tenant SSM state. cancel-in-progress
-# is false because aborting a half-rolled-out fleet leaves tenants
-# stuck on whatever image they happened to be on when cancelled.
-concurrency:
-  group: redeploy-tenants-on-staging
-  cancel-in-progress: false
-
-jobs:
-  # redeploy: three sequential steps —
-  #   1. fixed 30s wait for the freshly pushed image tag to propagate,
-  #   2. POST to the staging control plane's redeploy-fleet endpoint and
-  #      classify the response (hard-fail vs. ephemeral-tenant soft-warn),
-  #   3. poll each healthy tenant's /buildinfo and compare its git_sha
-  #      against the SHA that triggered this run.
-  redeploy:
-    # Skip the auto-trigger if publish-workspace-server-image didn't
-    # actually succeed. workflow_run fires on any completion state; we
-    # don't want to redeploy against a half-built image.
-    if: |
-      github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
-    runs-on: ubuntu-latest
-    timeout-minutes: 25
-    steps:
-      - name: Wait for GHCR tag propagation
-        # GHCR's edge cache takes ~15-30s to consistently serve the new
-        # :staging-latest manifest after the registry accepts the push.
-        # Same rationale as redeploy-tenants-on-main.yml.
-        # NOTE(review): the step name and rationale refer to GHCR —
-        # confirm that is still the registry the staging image is
-        # published to; if not, update the wording (the sleep itself is
-        # registry-agnostic).
-        run: sleep 30
-
-      - name: Call staging-CP redeploy-fleet
-        # The id lets the verify step below detect the operator soft-skip
-        # (outputs.skipped == 'true') instead of hard-failing on a
-        # redeploy-response.json that was never written.
-        id: redeploy
-        # CP_STAGING_ADMIN_API_TOKEN must be set as a repo/org secret
-        # on molecule-ai/molecule-core, matching staging-CP's
-        # CP_ADMIN_API_TOKEN env var (visible in Railway controlplane
-        # / staging environment). Stored separately from the prod
-        # CP_ADMIN_API_TOKEN so a leak of one doesn't auth the other.
-        env:
-          CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
-          CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
-          TARGET_TAG: ${{ inputs.target_tag || 'staging-latest' }}
-          CANARY_SLUG: ${{ inputs.canary_slug || '' }}
-          SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
-          BATCH_SIZE: ${{ inputs.batch_size || '3' }}
-          DRY_RUN: ${{ inputs.dry_run || false }}
-        run: |
-          set -euo pipefail
-
-          # Schedule-vs-dispatch hardening (mirrors sweep-cf-orphans
-          # and sweep-cf-tunnels): hard-fail on auto-trigger when the
-          # secret is missing so a misconfigured-repo doesn't silently
-          # serve stale staging tenants. Soft-skip on operator dispatch.
-          if [ -z "${CP_STAGING_ADMIN_API_TOKEN:-}" ]; then
-            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
-              echo "::warning::CP_STAGING_ADMIN_API_TOKEN secret not set — skipping redeploy"
-              echo "::warning::Set CP_STAGING_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
-              echo "::notice::Pull the value from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
-              # Tell the verify step there is no response to verify;
-              # without this the "soft" skip still turns the job red.
-              echo "skipped=true" >> "$GITHUB_OUTPUT"
-              exit 0
-            fi
-            echo "::error::staging redeploy cannot run — CP_STAGING_ADMIN_API_TOKEN secret missing"
-            echo "::error::set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
-            exit 1
-          fi
-
-          # --argjson (not --arg) so soak/batch/dry land in the body as
-          # JSON numbers/booleans rather than strings; jq rejects a
-          # non-JSON value here and set -e stops the step.
-          BODY=$(jq -nc \
-            --arg tag "$TARGET_TAG" \
-            --arg canary "$CANARY_SLUG" \
-            --argjson soak "$SOAK_SECONDS" \
-            --argjson batch "$BATCH_SIZE" \
-            --argjson dry "$DRY_RUN" \
-            '{
-              target_tag: $tag,
-              canary_slug: $canary,
-              soak_seconds: $soak,
-              batch_size: $batch,
-              dry_run: $dry
-            }')
-
-          echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
-          echo " body: $BODY"
-
-          HTTP_RESPONSE=$(mktemp)
-          HTTP_CODE_FILE=$(mktemp)
-          # Route -w into its own tempfile so curl's exit code (e.g. 56
-          # on connection-reset) can't pollute the captured stdout. The
-          # previous inline-substitution shape produced "000000" on
-          # connection reset — caught on main variant 2026-05-04
-          # redeploying sha 2b862f6. Same fix shape as the synth-E2E
-          # §9c gate (PR #2797). See lint-curl-status-capture.yml for
-          # the CI gate that pins this fix shape.
-          set +e
-          curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
-            -m 1200 \
-            -H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \
-            -H "Content-Type: application/json" \
-            -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
-            -d "$BODY" >"$HTTP_CODE_FILE"
-          set -e
-          # Stderr from curl (-sS shows dial errors etc.) goes to the
-          # runner log so operators can see WHY a connection failed.
-          HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
-          [ -z "$HTTP_CODE" ] && HTTP_CODE="000"
-
-          echo "HTTP $HTTP_CODE"
-          # Pretty-print when the body is JSON; otherwise dump it raw.
-          cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
-
-          {
-            echo "## Staging tenant redeploy fleet"
-            echo ""
-            echo "**Target tag:** \`$TARGET_TAG\`"
-            echo "**Canary:** \`${CANARY_SLUG:-(none — staging is itself the canary)}\` (soak ${SOAK_SECONDS}s)"
-            echo "**Batch size:** $BATCH_SIZE"
-            echo "**Dry run:** $DRY_RUN"
-            echo "**HTTP:** $HTTP_CODE"
-            echo ""
-            echo "### Per-tenant result"
-            echo ""
-            echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
-            echo '|------|-------|------------|------|---------|-------|'
-            jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
-          } >> "$GITHUB_STEP_SUMMARY"
-
-          # Distinguish "real fleet failure" from "E2E teardown race".
-          #
-          # CP returns HTTP 500 + ok=false whenever ANY tenant in the
-          # fleet failed SSM or healthz. In practice the recurring source
-          # of these is ephemeral test tenants being torn down by their
-          # parent E2E run mid-redeploy: the EC2 dies → SSM exit=2 or
-          # healthz timeout → CP marks the fleet failed → this workflow
-          # goes red even though every operator-facing tenant rolled fine.
-          #
-          # Ephemeral slug prefixes (kept in sync with sweep-stale-e2e-orgs.yml
-          # — see that file for the source-of-truth list and rationale):
-          #   - e2e-*    — canvas/saas/ext E2E suites
-          #   - rt-e2e-* — runtime-test harness fixtures (RFC #2251)
-          # Long-lived prefixes that are NOT ephemeral and MUST hard-fail:
-          #   demo-prep, dryrun-*, dryrun2-*, plus all human tenant slugs.
-          #
-          # Filter: if HTTP=500/ok=false AND every failed slug matches an
-          # ephemeral prefix, treat as soft-warn and let the verify step
-          # downstream handle unreachable-vs-stale (#2402). Any non-ephemeral
-          # failure or a non-500 HTTP response remains a hard failure.
-          #
-          # Default OK to "false" when the body is not parseable JSON
-          # (e.g. a proxy error page). Unguarded, jq's non-zero exit would
-          # trip `set -e` right here and the explicit ::error:: lines
-          # below would never be printed. Same guard shape as
-          # FAILED_SLUGS.
-          OK=$(jq -r '.ok // "false"' "$HTTP_RESPONSE" 2>/dev/null || echo "false")
-          FAILED_SLUGS=$(jq -r '
-            .results[]?
-            | select((.healthz_ok != true) or (.ssm_status != "Success"))
-            | .slug' "$HTTP_RESPONSE" 2>/dev/null || true)
-          EPHEMERAL_PREFIX_RE='^(e2e-|rt-e2e-)'
-          NON_EPHEMERAL_FAILED=$(printf '%s\n' "$FAILED_SLUGS" | grep -v '^$' | grep -Ev "$EPHEMERAL_PREFIX_RE" || true)
-
-          if [ "$HTTP_CODE" = "200" ] && [ "$OK" = "true" ]; then
-            : # happy path — fall through to verification
-          elif [ "$HTTP_CODE" = "500" ] && [ -z "$NON_EPHEMERAL_FAILED" ] && [ -n "$FAILED_SLUGS" ]; then
-            COUNT=$(printf '%s\n' "$FAILED_SLUGS" | grep -Ec "$EPHEMERAL_PREFIX_RE" || true)
-            echo "::warning::redeploy-fleet returned HTTP 500 but every failed tenant ($COUNT) is ephemeral (e2e-*/rt-e2e-*) — treating as teardown race, soft-warning."
-            printf '%s\n' "$FAILED_SLUGS" | sed 's/^/::warning:: failed: /'
-          elif [ "$HTTP_CODE" != "200" ]; then
-            echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
-            if [ -n "$NON_EPHEMERAL_FAILED" ]; then
-              echo "::error::non-ephemeral tenant(s) failed:"
-              printf '%s\n' "$NON_EPHEMERAL_FAILED" | sed 's/^/::error:: /'
-            fi
-            exit 1
-          else
-            # HTTP=200 but ok=false (shouldn't happen with current CP
-            # but keep the gate for completeness).
-            echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
-            exit 1
-          fi
-          echo "::notice::Staging tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..."
-
-          # Hand the raw response to the verify step via RUNNER_TEMP.
-          cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json"
-
-      - name: Verify each staging tenant /buildinfo matches published SHA
-        # Mirror of the verify step in redeploy-tenants-on-main.yml — see
-        # there for the rationale (#2395 root fix). Staging has the same
-        # ssm_status-success-but-stale-image hazard and benefits from the
-        # same gate. Diff: TENANT_DOMAIN includes the `staging.` infix.
-        #
-        # Skipped when the redeploy step soft-skipped (secret missing on
-        # operator dispatch): no redeploy happened, so there is no
-        # response file and nothing to verify.
-        # NOTE(review): this step also runs when dry_run is true —
-        # confirm what CP reports for healthz_ok on a dry run, since
-        # tenants that were not actually rolled would compare as stale.
-        if: steps.redeploy.outputs.skipped != 'true'
-        env:
-          EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
-          TARGET_TAG: ${{ inputs.target_tag || 'staging-latest' }}
-          TENANT_DOMAIN: 'staging.moleculesai.app'
-        run: |
-          set -euo pipefail
-
-          # staging-latest is the staging-side moving tag; treat it the
-          # same way main treats `latest`. Operator-pinned SHAs skip
-          # verification (see main variant for why).
-          if [ "$TARGET_TAG" != "staging-latest" ] && [ "$TARGET_TAG" != "latest" ] && [ "$TARGET_TAG" != "$EXPECTED_SHA" ]; then
-            echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification."
-            exit 0
-          fi
-
-          RESP="$RUNNER_TEMP/redeploy-response.json"
-          if [ ! -s "$RESP" ]; then
-            echo "::error::redeploy-response.json missing or empty"
-            exit 1
-          fi
-
-          # Only tenants the redeploy reported healthy are verified.
-          mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP")
-          if [ ${#SLUGS[@]} -eq 0 ]; then
-            echo "::warning::No staging tenants reported healthz_ok — nothing to verify"
-            exit 0
-          fi
-
-          echo "Verifying ${#SLUGS[@]} staging tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..."
-
-          # Two distinct failure modes here:
-          #   STALE_COUNT — tenant returned a SHA that doesn't match. THIS is
-          #     the #2395 bug class: tenant up + serving old code.
-          #     Always hard-fail the workflow.
-          #   UNREACHABLE_COUNT — tenant didn't respond. Almost always a benign
-          #     teardown race: redeploy-fleet snapshot says
-          #     healthz_ok=true, then the E2E suite tears the
-          #     ephemeral tenant down before this step runs (the
-          #     e2e-* fixtures churn 5-10/hour on staging). Soft-
-          #     warn so we don't block staging→main on cleanup.
-          #     Real "tenant up but unreachable" is caught by CP's
-          #     own healthz monitor + the post-redeploy alert; we
-          #     don't need to double-count it here.
-          STALE_COUNT=0
-          UNREACHABLE_COUNT=0
-          STALE_LINES=()
-          UNREACHABLE_LINES=()
-          for slug in "${SLUGS[@]}"; do
-            URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
-            # `|| true` / `|| echo ""`: a failed fetch or unparseable body
-            # yields an empty ACTUAL_SHA and is counted as unreachable
-            # rather than aborting the loop under set -e.
-            BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true)
-            ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
-            if [ -z "$ACTUAL_SHA" ]; then
-              UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
-              UNREACHABLE_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ⚠ unreachable (likely teardown race) |")
-              continue
-            fi
-            if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
-              echo " $slug: ${ACTUAL_SHA:0:7} ✓"
-            else
-              STALE_COUNT=$((STALE_COUNT + 1))
-              STALE_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |")
-            fi
-          done
-
-          {
-            echo ""
-            echo "### Per-tenant /buildinfo verification (staging)"
-            echo ""
-            echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`"
-            echo ""
-            if [ $STALE_COUNT -gt 0 ]; then
-              echo "**${STALE_COUNT} STALE tenant(s) — these did NOT pick up the new image despite ssm_status=Success:**"
-              echo ""
-              echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
-              echo "|------|----------------------|----------|--------|"
-              for line in "${STALE_LINES[@]}"; do echo "$line"; done
-              echo ""
-            fi
-            if [ $UNREACHABLE_COUNT -gt 0 ]; then
-              echo "**${UNREACHABLE_COUNT} unreachable tenant(s) — likely E2E teardown race (soft-warn, not failing):**"
-              echo ""
-              echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
-              echo "|------|----------------------|----------|--------|"
-              for line in "${UNREACHABLE_LINES[@]}"; do echo "$line"; done
-              echo ""
-            fi
-            if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then
-              echo "All ${#SLUGS[@]} staging tenants returned matching SHA. ✓"
-            fi
-          } >> "$GITHUB_STEP_SUMMARY"
-
-          if [ $UNREACHABLE_COUNT -gt 0 ]; then
-            echo "::warning::$UNREACHABLE_COUNT staging tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
-          fi
-
-          # Belt-and-suspenders sanity floor: if MORE than half the fleet is
-          # unreachable AND the fleet is large enough that "half down" is
-          # statistically meaningful, this is a real outage (e.g. new image
-          # crashes on startup), not a teardown race. Hard-fail.
-          #
-          # Floor only applies when TOTAL_VERIFIED >= 4 — below that, the
-          # canary-verify step is the actual gate for "all tenants down"
-          # detection (it runs against the canary first and aborts the
-          # rollout if the canary fails to come up). Without the >=4 gate,
-          # a 1-tenant fleet (e.g. a single ephemeral e2e-* tenant on a
-          # quiet staging push) would re-flake on the exact teardown-race
-          # condition #2402 fixed: 1 of 1 unreachable = 100% > 50% → fail.
-          TOTAL_VERIFIED=${#SLUGS[@]}
-          if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
-            echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED staging tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
-            exit 1
-          fi
-
-          if [ $STALE_COUNT -gt 0 ]; then
-            echo "::error::$STALE_COUNT staging tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
-            exit 1
-          fi
-
-          echo "::notice::Staging tenant fleet redeploy complete — all reachable tenants on ${EXPECTED_SHA:0:7} (${UNREACHABLE_COUNT} unreachable, soft-warned)."