From bc11ed8a2b5a005d005b096ccd22d3b8e9ffffe2 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 3 May 2026 07:13:26 -0700 Subject: [PATCH] fix(auto-promote): use App token for auto-merge to fire downstream cascade (#2357) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GITHUB_TOKEN-initiated merges suppress the downstream `push` event on main per GitHub's documented limitation: https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow Result before this fix: every staging→main promote landed silently — publish-workspace-server-image, canary-verify, and redeploy-tenants-on-main all stayed dark. The polling tail was the SOLE cascade trigger; if it ever 30-min-timed-out the chain dead-locked invisibly. Symptom (from the issue body, 2026-04-30): | Time | Event | Triggered? | |----------|--------------------------------------------------|-----------| | 05:48:04 | Promote PR #2352 merged (c140ad28) | No fired | | 06:07:29 | Promote PR #2356 merged (5973c9bd) | No fired | Fix: mint the molecule-ai App token BEFORE the promote-PR step and hand it to the auto-merge call. App-token-initiated merges DO trigger downstream workflow_run cascades. The polling tail stays as defense-in-depth (with comments updated): once we've observed >=10 successful natural cascades it can be dropped. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/auto-promote-staging.yml | 85 +++++++++++----------- 1 file changed, 41 insertions(+), 44 deletions(-) diff --git a/.github/workflows/auto-promote-staging.yml b/.github/workflows/auto-promote-staging.yml index de6ce46a..9151835b 100644 --- a/.github/workflows/auto-promote-staging.yml +++ b/.github/workflows/auto-promote-staging.yml @@ -209,10 +209,25 @@ jobs: exit 0 fi + # Mint the App token BEFORE the promote-PR step so the auto-merge + # call can use it. 
GITHUB_TOKEN-initiated merges suppress the + # downstream `push` event on main, breaking the + # publish-workspace-server-image → canary-verify → redeploy-tenants + # chain (issue #2357). Using the App token here means the + # merge-queue-landed merge IS able to fire the cascade naturally; + # the polling tail below stays as defense-in-depth. + - name: Mint App token for promote-PR + downstream dispatch + if: ${{ vars.AUTO_PROMOTE_ENABLED == 'true' || github.event.inputs.force == 'true' }} + id: app-token + uses: actions/create-github-app-token@1b10c78c7865c340bc4f6099eb2f838309f1e8c3 # v3.1.1 + with: + app-id: ${{ secrets.MOLECULE_AI_APP_ID }} + private-key: ${{ secrets.MOLECULE_AI_APP_PRIVATE_KEY }} + - name: Open (or reuse) staging → main promote PR + enable auto-merge if: ${{ vars.AUTO_PROMOTE_ENABLED == 'true' || github.event.inputs.force == 'true' }} env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_TOKEN: ${{ steps.app-token.outputs.token }} REPO: ${{ github.repository }} TARGET_SHA: ${{ needs.check-all-gates-green.outputs.head_sha }} run: | @@ -267,52 +282,34 @@ jobs: echo "promote_pr_num=${PR_NUM}" >> "$GITHUB_OUTPUT" id: promote_pr - # Mint a short-lived GitHub App installation token for the dispatch - # step below. We CANNOT use `secrets.GITHUB_TOKEN` to dispatch the - # downstream publish chain — workflow runs created by GITHUB_TOKEN - # do not fire `workflow_run` triggers on completion (the - # documented "no recursion" rule — - # https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow). - # - # Symptom this caused (root-caused on 2026-04-30): publish-image - # ran successfully twice (21313dc 14:41Z, 59dec57 15:21Z) but - # canary-verify and redeploy-tenants-on-main never chained, - # because the publish run's `triggering_actor` was - # `github-actions[bot]` (i.e. GITHUB_TOKEN). 
A manual dispatch - # earlier in the day with the operator's PAT (d850ec7 06:52Z) did - # chain — same workflow file, only the actor differed. - # - # An App token's triggering_actor is the App user (e.g. - # `molecule-ai[bot]`), which IS allowed to fire downstream - # workflow_run cascades. - - name: Mint App token for downstream dispatch - if: steps.promote_pr.outputs.promote_pr_num != '' - id: app-token - uses: actions/create-github-app-token@1b10c78c7865c340bc4f6099eb2f838309f1e8c3 # v3.1.1 - with: - app-id: ${{ secrets.MOLECULE_AI_APP_ID }} - private-key: ${{ secrets.MOLECULE_AI_APP_PRIVATE_KEY }} - + # The App token minted above (before the promote-PR step) is + # also used by the polling tail below. Defense-in-depth: with + # the merge-queue-landed merge now using the App token, the + # main-branch push event SHOULD fire the publish/canary/redeploy + # cascade naturally — but if for any reason it doesn't (e.g. an + # unrelated event-suppression edge case), the explicit dispatches + # below still wake the chain. - name: Wait for promote merge, then dispatch publish + redeploy (#2357) - # GITHUB_TOKEN-initiated merges suppress downstream `push` events - # (https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow). - # Result: when the merge queue lands the promote PR, the resulting - # main-branch push DOES NOT fire publish-workspace-server-image, - # so canary-verify and redeploy-tenants-on-main never run and - # tenants stay on stale code (issue #2357). + # Defense-in-depth dispatch. 
With the auto-merge call above + # now using the App token (this commit), the merge-queue-landed + # merge SHOULD fire publish-workspace-server-image naturally + # via on:push:[main] — App-token-initiated pushes DO trigger + # workflow_run cascades, unlike GITHUB_TOKEN-initiated ones + # (the documented "no recursion" rule — + # https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow). # - # Workaround: poll for the merge to land, then explicitly - # `gh workflow run` publish-workspace-server-image. The dispatch - # MUST authenticate as the molecule-ai App (App token minted - # above) — not GITHUB_TOKEN — so that the resulting publish - # run's completion event can fire the workflow_run cascade - # into canary-verify + redeploy-tenants-on-main. See the prior - # step's comment for the GITHUB_TOKEN no-recursion details. + # This explicit dispatch stays as belt-and-suspenders for any + # edge case where the natural cascade misfires. If the natural + # cascade has already fired by the time we get here (i.e. the + # publish workflow has already started), this second dispatch + # is a harmless no-op (publish-workspace-server-image has its + # own concurrency group that dedupes). # - # Long-term fix: switch the auto-merge call above to use the - # same App token, so the merge's push event fires - # publish-workspace-server-image naturally and this polling tail - # becomes unnecessary. Tracked in #2357. + # See PR for #2357: pre-fix the merge action was via + # GITHUB_TOKEN, suppressing the cascade and forcing this tail + # to be the SOLE chain trigger. With the auto-merge token swap + # the tail becomes redundant in the happy path; keep until + # we've observed >=10 successful natural cascades, then drop. if: steps.promote_pr.outputs.promote_pr_num != '' env: GH_TOKEN: ${{ steps.app-token.outputs.token }}