molecule-core/.github/workflows/redeploy-tenants-on-main.yml

name: redeploy-tenants-on-main

# Auto-refresh prod tenant EC2s after every main merge.
#
# Why this workflow exists: publish-workspace-server-image builds and
# pushes a new platform-tenant:latest + :<sha> to GHCR on every merge
# to main, but running tenants pulled their image once at boot and
# never re-pull. Users see stale code indefinitely.
#
# This workflow closes the gap by calling the control-plane admin
# endpoint that performs a canary-first, batched, health-gated rolling
# redeploy across every live tenant. Implemented in Molecule-AI/
# molecule-controlplane as POST /cp/admin/tenants/redeploy-fleet
# (feat/tenant-auto-redeploy, landing alongside this workflow).
#
# Runtime ordering:
#   1. publish-workspace-server-image completes → new :latest in GHCR.
#   2. This workflow fires via workflow_run, waits 30s for GHCR's
#      CDN to propagate the new tag to the region the tenants pull from.
#   3. Calls redeploy-fleet with canary_slug=hongming and a 60s
#      soak. Canary proves the image boots; batches follow.
#   4. Any failure aborts the rollout and leaves older tenants on the
#      prior image — safer default than half-and-half state.
#
# Rollback path: re-run this workflow with a specific SHA pinned via
# the workflow_dispatch input. That calls redeploy-fleet with
# target_tag=<sha>, re-pulling the older image on every tenant.

on:
  workflow_run:
    workflows: ['publish-workspace-server-image']
    types: [completed]
    branches: [main]
  workflow_dispatch:
    inputs:
      target_tag:
        # Empty default → auto-trigger and dispatch-without-input both
        # resolve to `staging-<short_head_sha>` (the digest publish-image
        # just pushed). Pre-fix this defaulted to 'latest', which only
        # gets retagged by canary-verify's promote-to-latest job — and
        # that job soft-skips when CANARY_TENANT_URLS is unset (the
        # current state, until Phase 2 canary fleet is live). Result:
        # `:latest` had been pinned to a 4-day-old digest (2026-04-28)
        # while every main push pushed fresh `staging-<sha>` images;
        # every prod redeploy pulled the stale `:latest` and the verify
        # step correctly flagged 3/3 tenants STALE. Pulling the
        # just-published `staging-<sha>` directly skips the dead retag
        # path. When canary fleet is real, this workflow should chain
        # on canary-verify completion (workflow_run from canary-verify),
        # not publish-image — separate, smaller PR.
        description: 'Tenant image tag to deploy (e.g. "latest", "staging-a59f1a6c"). Empty = auto staging-<head_sha>.'
        required: false
        type: string
        default: ''
      canary_slug:
        description: 'Tenant slug to deploy first + soak (empty = skip canary, fan out immediately).'
        required: false
        type: string
        # Must be an actual prod tenant slug (current: hongming,
        # chloe-dong, reno-stars). The previous default 'hongmingwang'
        # didn't match any tenant — CP soft-skipped the missing canary
        # and the fleet rolled out without the soak gate, defeating the
        # whole point of canary-first.
        default: 'hongming'
      soak_seconds:
        description: 'Seconds to wait after canary before fanning out.'
        required: false
        type: string
        default: '60'
      batch_size:
        description: 'How many tenants SSM redeploys in parallel per batch.'
        required: false
        type: string
        default: '3'
      dry_run:
        description: 'Plan only — do not actually redeploy.'
        required: false
        type: boolean
        default: false

permissions:
  contents: read
  # No write scopes needed — the workflow hits an external CP endpoint,
  # not the GitHub API.

# Serialize redeploys so two rapid main pushes' redeploys don't overlap
# and cause confusing per-tenant SSM state. Without this, GitHub's
# implicit workflow_run queueing would *probably* serialize them, but
# the explicit block makes the invariant defensible. Mirrors the
# concurrency block on redeploy-tenants-on-staging.yml for shape parity.
#
# cancel-in-progress: false → aborting a half-rolled-out fleet would
# leave tenants stuck on whatever image they happened to be on when
# cancelled. Better to finish the in-flight rollout before starting
# the next one.
concurrency:
  group: redeploy-tenants-on-main
  cancel-in-progress: false

jobs:
  redeploy:
    # Skip the auto-trigger if publish-workspace-server-image didn't
    # actually succeed. workflow_run fires on any completion state; we
    # don't want to redeploy against a half-built image.
    if: |
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
    runs-on: ubuntu-latest
    timeout-minutes: 25
    steps:
      - name: Wait for GHCR tag propagation
        # GHCR's edge cache takes ~15-30s to consistently serve the new
        # manifest after the registry accepts the push. Without this
        # sleep, the first tenant's docker pull sometimes races and
        # fetches the previous digest; sleeping is the cheapest way to
        # reduce that without polling GHCR for the new digest.
        run: sleep 30

      - name: Compute target tag
        id: tag
        # Resolution order:
        #   1. Operator-supplied input (workflow_dispatch with explicit
        #      tag) → used verbatim. Lets ops pin `latest` for emergency
        #      rollback to last canary-verified digest, or pin a specific
        #      `staging-<sha>` to roll back to a known-good build.
        #   2. Default → `staging-<short_head_sha>`. The just-published
        #      digest. Bypasses the `:latest` retag path that's currently
        #      dead (canary-verify soft-skips without canary fleet, so
        #      the only thing retagging `:latest` today is the manual
        #      promote-latest.yml — last run 2026-04-28). Auto-trigger
        #      from workflow_run uses workflow_run.head_sha; manual
        #      dispatch with no input falls through to github.sha.
        env:
          INPUT_TAG: ${{ inputs.target_tag }}
          HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
        run: |
          set -euo pipefail
          if [ -n "${INPUT_TAG:-}" ]; then
            echo "target_tag=$INPUT_TAG" >> "$GITHUB_OUTPUT"
            echo "Using operator-pinned tag: $INPUT_TAG"
          else
            SHORT="${HEAD_SHA:0:7}"
            echo "target_tag=staging-$SHORT" >> "$GITHUB_OUTPUT"
            echo "Using auto tag: staging-$SHORT (head_sha=$HEAD_SHA)"
          fi

      - name: Call CP redeploy-fleet
        # CP_ADMIN_API_TOKEN must be set as a repo/org secret on
        # Molecule-AI/molecule-core, matching the staging/prod CP's
        # CP_ADMIN_API_TOKEN env. Stored in Railway, mirrored to this
        # repo's secrets for CI.
        env:
          CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }}
          CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
          TARGET_TAG: ${{ steps.tag.outputs.target_tag }}
          CANARY_SLUG: ${{ inputs.canary_slug || 'hongming' }}
          SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
          BATCH_SIZE: ${{ inputs.batch_size || '3' }}
          DRY_RUN: ${{ inputs.dry_run || false }}
        run: |
          set -euo pipefail

          if [ -z "${CP_ADMIN_API_TOKEN:-}" ]; then
            echo "::error::CP_ADMIN_API_TOKEN secret not set — skipping redeploy"
            echo "::notice::Set CP_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
            exit 1
          fi

          BODY=$(jq -nc \
            --arg tag "$TARGET_TAG" \
            --arg canary "$CANARY_SLUG" \
            --argjson soak "$SOAK_SECONDS" \
            --argjson batch "$BATCH_SIZE" \
            --argjson dry "$DRY_RUN" \
            '{
              target_tag: $tag,
              canary_slug: $canary,
              soak_seconds: $soak,
              batch_size: $batch,
              dry_run: $dry
            }')

          echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
          echo "  body: $BODY"

          HTTP_RESPONSE=$(mktemp)
          HTTP_CODE_FILE=$(mktemp)
          # Route -w into its own tempfile so curl's exit code (e.g. 56
          # on connection-reset, 22 on --fail-with-body 4xx/5xx) can't
          # pollute the captured stdout. The previous inline-substitution
          # shape produced "000000" on connection reset (curl wrote
          # "000" via -w, then the inline echo-fallback appended another
          # "000") — caught on the 2026-05-04 redeploy of sha 2b862f6.
          # set +e/-e keeps the non-zero curl exit from tripping the
          # outer pipeline. See lint-curl-status-capture.yml for the
          # CI gate that pins this fix shape.
          set +e
          curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
            -m 1200 \
            -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
            -H "Content-Type: application/json" \
            -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
            -d "$BODY" >"$HTTP_CODE_FILE"
          set -e
          # Stderr from curl (e.g. dial errors with -sS) goes to the runner
          # log so operators can see WHY a connection failed. Stdout is
          # captured to $HTTP_CODE_FILE because that's where -w writes.
          HTTP_CODE=$(cat "$HTTP_CODE_FILE" 2>/dev/null || echo "000")
          [ -z "$HTTP_CODE" ] && HTTP_CODE="000"

          echo "HTTP $HTTP_CODE"
          cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"

          # Pretty-print per-tenant results in the job summary so
          # ops can see which tenants were redeployed without drilling
          # into the raw response.
          {
            echo "## Tenant redeploy fleet"
            echo ""
            echo "**Target tag:** \`$TARGET_TAG\`"
            echo "**Canary:** \`$CANARY_SLUG\` (soak ${SOAK_SECONDS}s)"
            echo "**Batch size:** $BATCH_SIZE"
            echo "**Dry run:** $DRY_RUN"
            echo "**HTTP:** $HTTP_CODE"
            echo ""
            echo "### Per-tenant result"
            echo ""
            echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
            echo '|------|-------|------------|------|---------|-------|'
            jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
          } >> "$GITHUB_STEP_SUMMARY"

          if [ "$HTTP_CODE" != "200" ]; then
            echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
            exit 1
          fi
          OK=$(jq -r '.ok' "$HTTP_RESPONSE")
          if [ "$OK" != "true" ]; then
            echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
            exit 1
          fi
          echo "::notice::Tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..."

          # Stash the response for the verify step. $RUNNER_TEMP outlasts
          # the step boundary; $HTTP_RESPONSE doesn't.
          cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json"

      - name: Verify each tenant /buildinfo matches published SHA
        # ROOT FIX FOR #2395.
        #
        # `redeploy-fleet`'s `ssm_status=Success` means "the SSM RPC
        # didn't error" — NOT "the new image is running on the tenant."
        # `:latest` lives in the local Docker daemon's image cache; if
        # the SSM document does `docker compose up -d` without an
        # explicit `docker pull`, the daemon serves the previously-
        # cached digest and the container restarts on stale code.
        # 2026-04-30 incident: hongmingwang's tenant reported
        # ssm_status=Success at 17:00:53Z but kept serving pre-501a42d7
        # chat_files for 30+ min — the lazy-heal fix never reached the
        # user despite green deploy + green redeploy.
        #
        # This step closes the gap by curling each tenant's /buildinfo
        # endpoint (added in workspace-server/internal/buildinfo +
        # /Dockerfile* GIT_SHA build-arg, this PR) and comparing the
        # returned git_sha to the SHA the workflow expects. Mismatches
        # fail the workflow, which is what `ok=true` should have
        # guaranteed all along.
        #
        # When the redeploy was triggered by workflow_dispatch with a
        # specific tag (target_tag != "latest"), the expected SHA may
        # not equal ${{ github.sha }} — in that case we resolve via
        # GHCR's manifest. For workflow_run (default :latest) the
        # workflow_run.head_sha is the SHA that just published.
        env:
          EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
          TARGET_TAG: ${{ steps.tag.outputs.target_tag }}
          # Tenant subdomain template — slugs from the response are
          # appended. Production CP issues `<slug>.moleculesai.app`;
          # staging CP issues `<slug>.staging.moleculesai.app`. This
          # workflow runs on main → prod CP → no `staging.` infix.
          TENANT_DOMAIN: 'moleculesai.app'
        run: |
          set -euo pipefail

          EXPECTED_SHORT="${EXPECTED_SHA:0:7}"
          if [ "$TARGET_TAG" != "latest" ] \
             && [ "$TARGET_TAG" != "$EXPECTED_SHA" ] \
             && [ "$TARGET_TAG" != "staging-$EXPECTED_SHORT" ]; then
            # workflow_dispatch with a pinned tag that isn't the head
            # SHA — operator is rolling back / pinning. Skip the
            # verification because we don't have the expected SHA in
            # this context (would need to crane-inspect the GHCR
            # manifest, which is a follow-up). Failing-open here is
            # safe: the operator chose the tag deliberately.
            #
            # `staging-<short_head_sha>` IS verified — it's the new
            # auto-trigger default (see Compute target tag step) and
            # the digest under that tag SHOULD match EXPECTED_SHA.
            echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification."
            exit 0
          fi

          RESP="$RUNNER_TEMP/redeploy-response.json"
          if [ ! -s "$RESP" ]; then
            echo "::error::redeploy-response.json missing or empty — verify step ran without a response to read"
            exit 1
          fi

          # Pull only successfully-redeployed tenants. Any tenant that
          # halted the rollout already failed the previous step, so we
          # don't double-count them here.
          mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP")
          if [ ${#SLUGS[@]} -eq 0 ]; then
            echo "::warning::No tenants reported healthz_ok — nothing to verify"
            exit 0
          fi

          echo "Verifying ${#SLUGS[@]} tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..."

          # Two distinct failure modes — STALE (the #2395 bug class, hard-fail)
          # vs UNREACHABLE (teardown race, soft-warn). See the staging variant's
          # comment for the full rationale; same logic applies on prod even
          # though prod has fewer ephemeral tenants — the asymmetry would be a
          # gratuitous fork.
          STALE_COUNT=0
          UNREACHABLE_COUNT=0
          STALE_LINES=()
          UNREACHABLE_LINES=()
          for slug in "${SLUGS[@]}"; do
            URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
            # 30s total: tenant just SSM-restarted, may still be coming
            # up. Retry-on-empty rather than retry-on-status — we want
            # to fail fast on "responded with wrong SHA", not "still
            # warming up".
            BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true)
            ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
            if [ -z "$ACTUAL_SHA" ]; then
              UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
              UNREACHABLE_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ⚠ unreachable (likely teardown race) |")
              continue
            fi
            if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
              echo "  $slug: ${ACTUAL_SHA:0:7} ✓"
            else
              STALE_COUNT=$((STALE_COUNT + 1))
              STALE_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |")
            fi
          done

          {
            echo ""
            echo "### Per-tenant /buildinfo verification"
            echo ""
            echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`"
            echo ""
            if [ $STALE_COUNT -gt 0 ]; then
              echo "**${STALE_COUNT} STALE tenant(s) — these did NOT pick up the new image despite ssm_status=Success:**"
              echo ""
              echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
              echo "|------|----------------------|----------|--------|"
              for line in "${STALE_LINES[@]}"; do echo "$line"; done
              echo ""
            fi
            if [ $UNREACHABLE_COUNT -gt 0 ]; then
              echo "**${UNREACHABLE_COUNT} unreachable tenant(s) — likely teardown race (soft-warn, not failing):**"
              echo ""
              echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
              echo "|------|----------------------|----------|--------|"
              for line in "${UNREACHABLE_LINES[@]}"; do echo "$line"; done
              echo ""
            fi
            if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then
              echo "All ${#SLUGS[@]} tenants returned matching SHA. ✓"
            fi
          } >> "$GITHUB_STEP_SUMMARY"

          if [ $UNREACHABLE_COUNT -gt 0 ]; then
            echo "::warning::$UNREACHABLE_COUNT tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
          fi

          # Belt-and-suspenders sanity floor: same logic as the staging
          # variant — see that file's comment for the full rationale.
          # Floor only applies when fleet >= 4; below that, canary-verify
          # is the actual gate.
          TOTAL_VERIFIED=${#SLUGS[@]}
          if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
            echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
            exit 1
          fi

          if [ $STALE_COUNT -gt 0 ]; then
            echo "::error::$STALE_COUNT tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
            exit 1
          fi

          echo "::notice::Tenant fleet redeploy complete — all reachable tenants on ${EXPECTED_SHA:0:7} (${UNREACHABLE_COUNT} unreachable, soft-warned)."