From 60a516bc8da6ac804ae959b68cb94efa24cc6d91 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 3 May 2026 05:06:01 -0700 Subject: [PATCH] =?UTF-8?q?ci(redeploy):=20fix=20stale=20canary=5Fslug=20d?= =?UTF-8?q?efault=20'hongmingwang'=20=E2=86=92=20'hongming'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The workflow_dispatch input default and the workflow_run env fallback both pointed at 'hongmingwang', which doesn't match any current prod tenant (slugs are: hongming, chloe-dong, reno-stars). CP silently skipped the missing canary and put every tenant in batch-1 in parallel, defeating the canary-first soak gate that exists to catch image-boot regressions before they hit the whole fleet. Concrete example from today's c0838d6 redeploy at 11:53Z (run 25278434388): the dispatched body was `{"target_tag":"staging-c0838d6","canary_slug":"hongmingwang",...}` and the CP response showed all 3 tenants in `"phase":"batch-1"` — no soak, no canary. The deploy happened to be safe, but a broken image would have hit hongming + chloe-dong + reno-stars simultaneously. Fixed in three places: the runtime ordering comment, the workflow_dispatch default, and the env fallback used by the workflow_run trigger. Comment documents the rationale so the next slug rename doesn't silently regress this again. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/redeploy-tenants-on-main.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/redeploy-tenants-on-main.yml b/.github/workflows/redeploy-tenants-on-main.yml index a46f56f1..85acda60 100644 --- a/.github/workflows/redeploy-tenants-on-main.yml +++ b/.github/workflows/redeploy-tenants-on-main.yml @@ -17,7 +17,7 @@ name: redeploy-tenants-on-main # 1. publish-workspace-server-image completes → new :latest in GHCR. # 2. This workflow fires via workflow_run, waits 30s for GHCR's # CDN to propagate the new tag to the region the tenants pull from. -# 3. Calls redeploy-fleet with canary_slug=hongmingwang and a 60s +# 3. Calls redeploy-fleet with canary_slug=hongming and a 60s # soak. Canary proves the image boots; batches follow. # 4. Any failure aborts the rollout and leaves older tenants on the # prior image — safer default than half-and-half state. @@ -56,7 +56,12 @@ on: description: 'Tenant slug to deploy first + soak (empty = skip canary, fan out immediately).' required: false type: string - default: 'hongmingwang' + # Must be an actual prod tenant slug (current: hongming, + # chloe-dong, reno-stars). The previous default 'hongmingwang' + # didn't match any tenant — CP soft-skipped the missing canary + # and the fleet rolled out without the soak gate, defeating the + # whole point of canary-first. + default: 'hongming' soak_seconds: description: 'Seconds to wait after canary before fanning out.' required: false @@ -148,7 +153,7 @@ jobs: CP_URL: ${{ vars.CP_URL || 'https://api.moleculesai.app' }} CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }} TARGET_TAG: ${{ steps.tag.outputs.target_tag }} - CANARY_SLUG: ${{ inputs.canary_slug || 'hongmingwang' }} + CANARY_SLUG: ${{ inputs.canary_slug || 'hongming' }} SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }} BATCH_SIZE: ${{ inputs.batch_size || '3' }} DRY_RUN: ${{ inputs.dry_run || false }}