From 115f1f5e6409247d6eed9d9a6c025150dcc44d6d Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Fri, 1 May 2026 23:17:59 -0700 Subject: [PATCH] fix(redeploy-main): pull staging-<short_sha> instead of stale :latest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Auto-trigger from publish-workspace-server-image now resolves target_tag to the just-published `staging-<short_sha>` digest instead of `:latest`. Bypasses the dead retag path that was leaving prod tenants on a 4-day-old image. The chain pre-fix: publish-image → pushes :staging-<short_sha> + :staging-latest (NOT :latest) canary-verify → soft-skips (CANARY_TENANT_URLS unset, fleet not stood up) promote-latest → manual workflow_dispatch only, last run 2026-04-28 redeploy-main → pulls :latest → 2026-04-28 digest → all 3 tenants STALE Today's incident: e7375348 (main) → publish-image green → redeploy fired → tenants pulled :latest (76c604fb digest from prior canary-verified state) → hongming /buildinfo returned 76c604fb instead of e7375348 → verify step correctly flagged 3/3 STALE → workflow failed. Today's PRs (#2473 smoke wedge, #2487 panic recovery, #2496 sweeper followups) shipped to GHCR as :staging-<short_sha> but never reached prod. Fix: - workflow_dispatch input default '' (was 'latest'); empty input triggers auto-compute path - new "Compute target tag" step resolves: 1. operator-supplied input → verbatim (rollback / pin) 2. else → staging-<short_sha> (auto) - verify step's operator-pin detection now allows staging-<short_sha> as a non-pin (verification still runs) When canary fleet is real, this workflow should chain on canary-verify completion (workflow_run from canary-verify, gated on promote-to-latest success) instead of publish-image — separate, smaller PR. Today's fix unblocks prod deploys without that prerequisite. Companion: promote-latest.yml dispatched 2026-05-02 against e7375348 to unstick existing prod tenants. This PR prevents recurrence.
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../workflows/redeploy-tenants-on-main.yml | 67 ++++++++++++++++--- 1 file changed, 58 insertions(+), 9 deletions(-) diff --git a/.github/workflows/redeploy-tenants-on-main.yml b/.github/workflows/redeploy-tenants-on-main.yml index 46743347..a46f56f1 100644 --- a/.github/workflows/redeploy-tenants-on-main.yml +++ b/.github/workflows/redeploy-tenants-on-main.yml @@ -34,10 +34,24 @@ on: workflow_dispatch: inputs: target_tag: - description: 'Tenant image tag to deploy (e.g. "latest" or "a59f1a6c"). Defaults to latest when empty.' + # Empty default → auto-trigger and dispatch-without-input both + # resolve to `staging-<short_sha>` (the digest publish-image + # just pushed). Pre-fix this defaulted to 'latest', which only + # gets retagged by canary-verify's promote-to-latest job — and + # that job soft-skips when CANARY_TENANT_URLS is unset (the + # current state, until Phase 2 canary fleet is live). Result: + # `:latest` had been pinned to a 4-day-old digest (2026-04-28) + # while every main push pushed fresh `staging-<short_sha>` images; + # every prod redeploy pulled the stale `:latest` and the verify + # step correctly flagged 3/3 tenants STALE. Pulling the + # just-published `staging-<short_sha>` directly skips the dead retag + # path. When canary fleet is real, this workflow should chain + # on canary-verify completion (workflow_run from canary-verify), + # not publish-image — separate, smaller PR. + description: 'Tenant image tag to deploy (e.g. "latest", "staging-a59f1a6c"). Empty = auto staging-<short_sha>.' required: false type: string - default: 'latest' + default: '' canary_slug: description: 'Tenant slug to deploy first + soak (empty = skip canary, fan out immediately).' required: false @@ -91,12 +105,40 @@ jobs: steps: - name: Wait for GHCR tag propagation # GHCR's edge cache takes ~15-30s to consistently serve the new - # :latest manifest after the registry accepts the push.
Without - # this sleep, the first tenant's docker pull sometimes races - # and fetches the previous digest; sleeping is the cheapest - # way to reduce that without polling GHCR for the new digest. + # manifest after the registry accepts the push. Without this + # sleep, the first tenant's docker pull sometimes races and + # fetches the previous digest; sleeping is the cheapest way to + # reduce that without polling GHCR for the new digest. run: sleep 30 + - name: Compute target tag + id: tag + # Resolution order: + # 1. Operator-supplied input (workflow_dispatch with explicit + # tag) → used verbatim. Lets ops pin `latest` for emergency + # rollback to last canary-verified digest, or pin a specific + # `staging-<short_sha>` to roll back to a known-good build. + # 2. Default → `staging-<short_sha>`. The just-published + # digest. Bypasses the `:latest` retag path that's currently + # dead (canary-verify soft-skips without canary fleet, so + # the only thing retagging `:latest` today is the manual + # promote-latest.yml — last run 2026-04-28). Auto-trigger + # from workflow_run uses workflow_run.head_sha; manual + # dispatch with no input falls through to github.sha.
+ env: + INPUT_TAG: ${{ inputs.target_tag }} + HEAD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }} + run: | + set -euo pipefail + if [ -n "${INPUT_TAG:-}" ]; then + echo "target_tag=$INPUT_TAG" >> "$GITHUB_OUTPUT" + echo "Using operator-pinned tag: $INPUT_TAG" + else + SHORT="${HEAD_SHA:0:7}" + echo "target_tag=staging-$SHORT" >> "$GITHUB_OUTPUT" + echo "Using auto tag: staging-$SHORT (head_sha=$HEAD_SHA)" + fi + - name: Call CP redeploy-fleet # CP_ADMIN_API_TOKEN must be set as a repo/org secret on # Molecule-AI/molecule-core, matching the staging/prod CP's @@ -105,7 +147,7 @@ jobs: env: CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }} CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }} - TARGET_TAG: ${{ inputs.target_tag || 'latest' }} + TARGET_TAG: ${{ steps.tag.outputs.target_tag }} CANARY_SLUG: ${{ inputs.canary_slug || 'hongmingwang' }} SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }} BATCH_SIZE: ${{ inputs.batch_size || '3' }} @@ -209,7 +251,7 @@ jobs: # workflow_run.head_sha is the SHA that just published. env: EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }} - TARGET_TAG: ${{ inputs.target_tag || 'latest' }} + TARGET_TAG: ${{ steps.tag.outputs.target_tag }} # Tenant subdomain template — slugs from the response are # appended. Production CP issues `<slug>.moleculesai.app`; # staging CP issues `<slug>.staging.moleculesai.app`. This @@ -218,13 +260,20 @@ jobs: run: | set -euo pipefail - if [ "$TARGET_TAG" != "latest" ] && [ "$TARGET_TAG" != "$EXPECTED_SHA" ]; then + EXPECTED_SHORT="${EXPECTED_SHA:0:7}" + if [ "$TARGET_TAG" != "latest" ] \ + && [ "$TARGET_TAG" != "$EXPECTED_SHA" ] \ + && [ "$TARGET_TAG" != "staging-$EXPECTED_SHORT" ]; then # workflow_dispatch with a pinned tag that isn't the head # SHA — operator is rolling back / pinning. Skip the # verification because we don't have the expected SHA in # this context (would need to crane-inspect the GHCR # manifest, which is a follow-up).
Failing-open here is # safe: the operator chose the tag deliberately. + # + # `staging-<short_sha>` IS verified — it's the new + # auto-trigger default (see Compute target tag step) and + # the digest under that tag SHOULD match EXPECTED_SHA. echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification." exit 0 fi