diff --git a/.github/workflows/e2e-api.yml b/.github/workflows/e2e-api.yml index 201d42a1..30356d40 100644 --- a/.github/workflows/e2e-api.yml +++ b/.github/workflows/e2e-api.yml @@ -27,7 +27,17 @@ on: workflow_dispatch: concurrency: - group: e2e-api-${{ github.ref }} + # Per-SHA grouping (changed 2026-04-28 from per-ref). Per-ref had the + # same auto-promote-staging brittleness as e2e-staging-canvas — + # back-to-back staging pushes share refs/heads/staging, so the older + # push's queued run gets cancelled when a newer push lands. Then + # auto-promote-staging sees `completed/cancelled` for the older SHA + # and stays put; the newer SHA's gates may eventually save the day, + # but if the newer push gets cancelled too, we deadlock. + # + # See the matching concurrency block in e2e-staging-canvas.yml for the + # full rationale and the 2026-04-28 incident reference. + group: e2e-api-${{ github.event.pull_request.head.sha || github.sha }} cancel-in-progress: false jobs: diff --git a/.github/workflows/e2e-staging-canvas.yml b/.github/workflows/e2e-staging-canvas.yml index aa26ef64..01e94690 100644 --- a/.github/workflows/e2e-staging-canvas.yml +++ b/.github/workflows/e2e-staging-canvas.yml @@ -37,7 +37,25 @@ on: - cron: '0 8 * * 0' concurrency: - group: e2e-staging-canvas + # Per-SHA grouping (changed 2026-04-28 from a single global group). The + # global group made auto-promote-staging brittle: when a staging push + # queued behind an in-flight run and a third entrant (a PR run, a + # follow-on push) entered the group, the staging push got cancelled — + # leaving auto-promote-staging looking at `completed/cancelled` for a + # required gate and refusing to advance main. Observed 2026-04-28 + # 23:51-23:53 on staging tip 3f99fede. + # + # The original intent of the global group was to throttle parallel + # E2E provisions (each spins a fresh EC2). 
At our scale that throttle + # isn't worth the correctness cost — fresh-org-per-run isolates the + # state, and the cost of two parallel runs (~$0.001/min × 10min × 2) + # is rounding error vs. the cost of a stuck pipeline. + # + # Per-SHA still keeps repeat triggers for the SAME SHA from running + # in parallel. It does NOT cancel obsolete-PR-version runs on + # force-push; that wasted CI is acceptable given the alternative is + # losing staging-tip data that auto-promote-staging needs. + group: e2e-staging-canvas-${{ github.event.pull_request.head.sha || github.sha }} cancel-in-progress: false jobs: