diff --git a/.github/workflows/canary-verify.yml b/.github/workflows/canary-verify.yml index e19c1619..f97bdc46 100644 --- a/.github/workflows/canary-verify.yml +++ b/.github/workflows/canary-verify.yml @@ -1,19 +1,34 @@ name: canary-verify # Runs the canary smoke suite against the staging canary tenant fleet -# after a new :staging-<sha> image lands in GHCR. On green, promotes -# :staging-<sha> → :latest so the prod tenant fleet's 5-minute -# auto-updater picks up the verified digest. On red, :latest stays -# on the prior known-good digest and prod is untouched. +# after a new :staging-<sha> image lands in ECR. On green, calls the +# CP redeploy-fleet endpoint to promote :staging-<sha> → :latest so +# the prod tenant fleet's 5-minute auto-updater picks up the verified +# digest. On red, :latest stays on the prior known-good digest and +# prod is untouched. +# +# Registry note (2026-05-10): This workflow previously used GHCR +# (ghcr.io/molecule-ai/platform-tenant) — that registry was retired +# during the 2026-05-06 Gitea suspension migration when publish- +# workspace-server-image.yml switched to the operator's ECR org +# (153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/ +# platform-tenant). The GHCR → ECR migration was never applied to +# this file, so canary-verify was silently smoke-testing the stale +# GHCR image while the actual staging/prod tenants ran the ECR image. +# Result: smoke tests could not catch a broken ECR build. Fix: +# - Wait step: reads SHA from running canary /health (registry- +# agnostic, works regardless of registry). +# - Promote step: calls CP redeploy-fleet endpoint with target_tag= +# staging-<sha>, same mechanism as redeploy-tenants-on-main.yml. +# No longer attempts GHCR crane ops. 
# # Dependencies: # - publish-workspace-server-image.yml publishes :staging-<sha> -# (NOT :latest) on main merge -# - canary tenants are configured to pull :staging-<sha> as their -# tenant image (set TENANT_IMAGE=ghcr.io/…:staging-<sha> on the -# canary provisioner code path OR rotate via an admin endpoint) +# to ECR on staging and main merges. +# - Canary tenants are configured to pull :staging-<sha> from ECR +# (TENANT_IMAGE env set to the ECR :staging-<sha> tag). # - Repo secrets CANARY_TENANT_URLS / CANARY_ADMIN_TOKENS / -# CANARY_CP_SHARED_SECRET are populated +# CANARY_CP_SHARED_SECRET are populated. on: workflow_run: @@ -27,8 +42,12 @@ permissions: actions: read env: - IMAGE_NAME: ghcr.io/molecule-ai/platform - TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant + # ECR registry (post-2026-05-06 SSOT for tenant images). + # publish-workspace-server-image.yml pushes here. + IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform + TENANT_IMAGE_NAME: 153263036946.dkr.ecr.us-east-2.amazonaws.com/molecule-ai/platform-tenant + # CP endpoint for redeploy-fleet (used in promote step below). + CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }} jobs: canary-smoke: @@ -52,6 +71,12 @@ jobs: # the new SHA (~2-3 min typical vs 6 min fixed). Falls back to # proceeding after 7 min even if not all canaries responded — # the smoke suite will catch any that didn't update. + # + # NOTE: The SHA is read from the running tenant's /health response, + # NOT from a registry lookup. This is registry-agnostic and works + # regardless of whether the tenant pulls from ECR, GHCR, or any + # other registry — the canary is telling us what it's actually + # running, which is the ground truth for smoke testing. env: CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }} EXPECTED_SHA: ${{ steps.compute.outputs.sha }} @@ -133,42 +158,98 @@ jobs: } >> "$GITHUB_STEP_SUMMARY" promote-to-latest: - # On green, retag :staging-<sha> → :latest for BOTH images. 
- # crane is a lightweight registry client (no Docker daemon needed on - # the runner) that can retag remotely with a single API call each. - # Gated on smoke_ran=true — without a real canary fleet the smoke - # step no-ops with success, and we don't want that to silently - # auto-promote every main merge. + # On green, calls the CP redeploy-fleet endpoint with target_tag= + # staging-<sha> to promote the verified ECR image. This is the same + # mechanism as redeploy-tenants-on-main.yml — no GHCR crane ops. + # + # Pre-fix history: the old GHCR promote step used `crane tag` against + # ghcr.io/molecule-ai/platform-tenant, but publish-workspace-server- + # image.yml had already migrated to ECR on 2026-05-07 (commit + # 10e510f5). The GHCR tags were never updated, so this step was + # silently promoting a stale GHCR image while actual prod tenants + # pulled from ECR. Canary smoke tests were GHCR-targeted and could + # not catch a broken ECR build. needs: canary-smoke if: ${{ needs.canary-smoke.result == 'success' && needs.canary-smoke.outputs.smoke_ran == 'true' }} runs-on: ubuntu-latest + env: + SHA: ${{ needs.canary-smoke.outputs.sha }} + CP_URL: ${{ vars.CP_URL || 'https://staging-api.moleculesai.app' }} + # CP_ADMIN_API_TOKEN gates write access to the redeploy endpoint. + # Stored at the repo level so all workflows pick it up automatically. + CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }} + # canary_slug pin: deploy the verified :staging-<sha> to the canary + # first (soak 120s), then fan out to the rest of the fleet. 
+ CANARY_SLUG: ${{ vars.CANARY_PROMOTE_SLUG || '' }} + SOAK_SECONDS: ${{ vars.CANARY_PROMOTE_SOAK || '120' }} + BATCH_SIZE: ${{ vars.CANARY_PROMOTE_BATCH || '3' }} steps: - - uses: imjasonh/setup-crane@6da1ae018866400525525ce74ff892880c099987 # v0.5 - - - name: GHCR login + - name: Check CP credentials run: | - echo "${{ secrets.GITHUB_TOKEN }}" | \ - crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin + if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then + echo "::error::CP_ADMIN_API_TOKEN secret is not set — promote step cannot call redeploy-fleet." + echo "::error::Set it at: repo Settings → Actions → Variables and Secrets → New Secret." + exit 1 + fi - - name: Retag platform :staging- → :latest + - name: Promote verified ECR image to :latest run: | - crane tag \ - "${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \ - latest + set -euo pipefail - - name: Retag tenant :staging- → :latest - run: | - crane tag \ - "${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \ - latest + TARGET_TAG="staging-${SHA}" + BODY=$(jq -nc \ + --arg tag "$TARGET_TAG" \ + --argjson soak "${SOAK_SECONDS:-120}" \ + --argjson batch "${BATCH_SIZE:-3}" \ + --argjson dry false \ + '{ + target_tag: $tag, + soak_seconds: $soak, + batch_size: $batch, + dry_run: $dry + }') + + if [ -n "${CANARY_SLUG:-}" ]; then + BODY=$(jq '. * {canary_slug: $slug}' --arg slug "$CANARY_SLUG" <<<"$BODY") + fi + + echo "Calling: POST $CP_URL/cp/admin/tenants/redeploy-fleet" + echo " target_tag: $TARGET_TAG" + echo " body: $BODY" + + HTTP_RESPONSE=$(mktemp) + HTTP_CODE_FILE=$(mktemp) + set +e + curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \ + -m 1200 \ + -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \ + -H "Content-Type: application/json" \ + -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \ + -d "$BODY" >"$HTTP_CODE_FILE" + CURL_EXIT=$? 
+ set -e + + HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000") + [ -z "$HTTP_CODE" ] && HTTP_CODE="000" + + echo "HTTP $HTTP_CODE (curl exit $CURL_EXIT)" + cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE" + + if [ "$CURL_EXIT" -ne 0 ] || [ "$HTTP_CODE" -lt 200 ] || [ "$HTTP_CODE" -ge 300 ]; then # curl reports HTTP 000 on connect/timeout failures, which a bare ">= 400" gate would pass as success; only a clean 2xx counts as promoted + echo "::error::CP redeploy-fleet failed (HTTP $HTTP_CODE, curl exit $CURL_EXIT) — refusing to proceed." + exit 1 + fi - name: Summary run: | { - echo "## Canary verified — :latest promoted" - echo - echo "- \`${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${IMAGE_NAME}:latest\`" - echo "- \`${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${TENANT_IMAGE_NAME}:latest\`" - echo - echo "Prod tenant fleet will pick up the new digest on its next 5-min auto-update cycle." + echo "## Canary verified — :latest promoted via CP redeploy-fleet" + echo "" + echo "- **Target tag:** \`staging-${{ needs.canary-smoke.outputs.sha }}\`" + echo "- **Registry:** ECR (\`${TENANT_IMAGE_NAME}\`)" + echo "- **Canary slug:** \`${CANARY_SLUG:-}\` (soak ${SOAK_SECONDS}s)" + echo "- **Batch size:** ${BATCH_SIZE:-3}" + echo "" + echo "CP redeploy-fleet is rolling out the verified image across the prod fleet." + echo "The fleet's 5-minute health-check loop will pick up the update automatically." } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/redeploy-tenants-on-main.yml b/.github/workflows/redeploy-tenants-on-main.yml index 0625fc3f..786da188 100644 --- a/.github/workflows/redeploy-tenants-on-main.yml +++ b/.github/workflows/redeploy-tenants-on-main.yml @@ -3,9 +3,9 @@ name: redeploy-tenants-on-main # Auto-refresh prod tenant EC2s after every main merge. # # Why this workflow exists: publish-workspace-server-image builds and -# pushes a new platform-tenant:latest + : to GHCR on every merge -# to main, but running tenants pulled their image once at boot and -# never re-pull. Users see stale code indefinitely. 
+# pushes a new platform-tenant :<sha> to ECR on every merge to main, +# but running tenants pulled their image once at boot and never re-pull. +# Users see stale code indefinitely. # # This workflow closes the gap by calling the control-plane admin # endpoint that performs a canary-first, batched, health-gated rolling @@ -13,12 +13,18 @@ name: redeploy-tenants-on-main # molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet # (feat/tenant-auto-redeploy, landing alongside this workflow). # +# Registry: ECR (153263036946.dkr.ecr.us-east-2.amazonaws.com/ +# molecule-ai/platform-tenant). GHCR was retired 2026-05-07 during the +# Gitea suspension migration. The canary-verify.yml promote step now +# uses the same redeploy-fleet endpoint (fixes the silent-GHCR gap). +# # Runtime ordering: -# 1. publish-workspace-server-image completes → new :latest in GHCR. -# 2. This workflow fires via workflow_run, waits 30s for GHCR's -# CDN to propagate the new tag to the region the tenants pull from. -# 3. Calls redeploy-fleet with canary_slug=hongming and a 60s -# soak. Canary proves the image boots; batches follow. +# 1. publish-workspace-server-image completes → new :staging-<sha> in ECR. +# 2. This workflow fires via workflow_run, calls redeploy-fleet with +# target_tag=staging-<sha>. No CDN propagation wait needed — +# ECR image manifest is consistent immediately after push. +# 3. That same call passes canary_slug (if set) and a soak +# period. Canary proves the image boots; batches follow. # 4. Any failure aborts the rollout and leaves older tenants on the # prior image — safer default than half-and-half state. # @@ -108,13 +114,11 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 25 steps: - - name: Wait for GHCR tag propagation - # GHCR's edge cache takes ~15-30s to consistently serve the new - # manifest after the registry accepts the push. 
Without this - # sleep, the first tenant's docker pull sometimes races and - # fetches the previous digest; sleeping is the cheapest way to - # reduce that without polling GHCR for the new digest. - run: sleep 30 + - name: Note on ECR propagation + # ECR image manifests are consistent immediately after push — no + # CDN cache to wait for. The old GHCR-based workflow had a 30s + # sleep to avoid race conditions; ECR makes that unnecessary. + run: echo "ECR image available immediately after push — proceeding." - name: Compute target tag id: tag diff --git a/scripts/canary-smoke.sh b/scripts/canary-smoke.sh index 0d549de2..32a9fee6 100755 --- a/scripts/canary-smoke.sh +++ b/scripts/canary-smoke.sh @@ -1,10 +1,15 @@ #!/bin/bash # canary-smoke.sh — runs the post-deploy smoke suite against the # staging canary tenant fleet. Called by the canary-verify.yml GitHub -# Actions workflow after a new workspace-server image gets pushed to -# GHCR; exits non-zero on any failure so the workflow can skip the -# :staging-sha → :latest retag that would otherwise release broken -# code to the prod tenant fleet. +# Actions workflow after a new workspace-server image lands in ECR; +# exits non-zero on any failure so the workflow can block the +# redeploy-fleet promotion that would otherwise release broken code +# to the prod tenant fleet. +# +# Registry note: GHCR was retired 2026-05-06. Images are now pushed +# to the operator's ECR org (153263036946.dkr.ecr.us-east-2.amazonaws.com/ +# molecule-ai/platform-tenant). The registry URL is a runtime concern for +# the CI push step; this script tests the running tenant directly. # # Environment: # CANARY_TENANT_URLS space-sep list of canary tenant base URLs @@ -108,6 +113,43 @@ for i in "${!URLS[@]}"; do # 5. Negative: unauth'd admin call must 401 (C4 regression gate). 
unauth_code=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "$base/admin/liveness" || echo "000") check "unauth'd /admin/liveness returns 401" "401" "$unauth_code" + + # 6. POST /org/import unauth → 401. Proves the route is compiled in + # and AdminAuth is enforced. A missing route returns 404 (the failure + # mode caught by issue #213). Regression guard for the silent-GHCR- + # migration gap: canary-verify was testing a stale GHCR image while + # actual tenants ran ECR — this test would have caught a missing-route + # binary before it reached prod. + unauth_code=$(curl -sS -o /dev/null -w '%{http_code}' \ + --max-time 10 -X POST "$base/org/import" || echo "000") + check "POST /org/import unauth returns 401 (not 404)" "401" "$unauth_code" + + # 7. POST /org/import authed → 400/404/422 (never 401/403/5xx). + # Proves AdminAuth passed and the handler ran. Using a body that names + # a nonexistent template so we hit the handler's validation, not a + # business-logic error that might return 500 in some states. + bad_code=$(curl -sS -o /dev/null -w '%{http_code}' \ + --max-time 10 -X POST \ + -H "Authorization: Bearer $token" \ + -H "Content-Type: application/json" \ + --data '{"dir":"nonexistent-org-template"}' \ + "$base/org/import" || echo "000") + # Accept only 400 (bad request / validation), 404 (template not found; + # check 6 already proved the route exists), or 422 (unprocessable). + # Reject everything else: 401/403 (auth did not pass), 5xx, and curl failures (-w prints 000 and the || echo fallback appends another, giving "000000"). + if [ "$bad_code" != "400" ] && [ "$bad_code" != "404" ] && [ "$bad_code" != "422" ]; then + printf " FAIL POST /org/import authed returns HTTP %s (expected 400/404/422)\n" "$bad_code" >&2 + FAIL=$((FAIL + 1)) + else + printf " PASS POST /org/import authed returns HTTP %s (route compiled + AdminAuth enforced)\n" "$bad_code" + PASS=$((PASS + 1)) + fi + + # 8. POST /workspaces unauth → 401. Proves the route is compiled in. + # GET /workspaces was already covered in step 2; POST was the gap. 
+ unauth_code=$(curl -sS -o /dev/null -w '%{http_code}' \ + --max-time 10 -X POST "$base/workspaces" || echo "000") + check "POST /workspaces unauth returns 401 (not 404)" "401" "$unauth_code" done # ── Summary ──────────────────────────────────────────────────────────────