From 31d25b5a745a8a46b052212e91255445e66595f9 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Tue, 28 Apr 2026 12:43:26 -0700
Subject: [PATCH 1/2] fix(ci): e2e gates always emit a result so auto-promote
 can read it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The auto-promote-staging.yml gate-check (line 99) treats "workflow
didn't run" as failure. Path-filtered triggers on E2E API Smoke Test
and E2E Staging Canvas meant a platform-only or test-only push to
staging — say, the prior PR #2201 which only touched
tests/e2e/test_staging_full_saas.sh — never triggered the canvas
workflow, and auto-promote saw `missing/none`, marked all_green=false,
and aborted. The same failure class applies to any push that doesn't
touch a gate's watched paths: a dead-lock by design that went unnoticed
because the gate was new.

Fix per Design B (always-run + fast-skip):

- Drop `paths:` from the push/pull_request triggers on both gate
  workflows. The workflow now always fires on every staging+main
  push/PR.
- Add a `detect-changes` job using `dorny/paths-filter@v3` that
  decides whether to do real work, scoped to the same paths the
  trigger filter used to watch.
- Real work job (e2e-api / playwright) gates on
  `needs: detect-changes; if: needs.detect-changes.outputs.X == 'true'`.
- Add a sibling `no-op` job that runs when the filter output is
  false, emitting `::notice::… no-op pass`. The workflow run's
  conclusion is `success` either way — auto-promote sees green and
  proceeds.

Manual `workflow_dispatch` and the weekly canvas `schedule` short-circuit
detect-changes to always-run — those triggers exist precisely to exercise
the suite and shouldn't be silently no-op'd.

Why this approach over making auto-promote-staging smarter:

The alternative (Design A, considered + rejected) was to teach
auto-promote-staging to read each gate's `paths:` filter and treat
"no run because filter excluded the commit" as conditional pass.
That couples auto-promote to other workflows' YAML schema and breaks
silently if a gate is renamed or its filter changes. Design B keeps
the auto-promote contract simple ("each gate emits success") and
makes each gate self-describing — adding a new gate doesn't require
touching auto-promote.

Cost: ~10-30s of runner overhead per gate per push for the no-op when
paths don't match. Negligible vs the alternative of dead-locked
auto-promote chains.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/e2e-api.yml            | 62 +++++++++++++++++++++---
 .github/workflows/e2e-staging-canvas.yml | 53 +++++++++++++++++---
 2 files changed, 101 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/e2e-api.yml b/.github/workflows/e2e-api.yml
index 89c69b88..d7d6ea09 100644
--- a/.github/workflows/e2e-api.yml
+++ b/.github/workflows/e2e-api.yml
@@ -1,27 +1,73 @@
 name: E2E API Smoke Test
 # Extracted from ci.yml so workflow-level concurrency can protect this job
 # from run-level cancellation (issue #458).
+#
+# Trigger model (changed 2026-04-28 — see auto-promote gap below):
+#
+# This workflow always FIRES on push/pull_request to staging+main, but
+# only does real work when paths under `workspace-server/`,
+# `tests/e2e/`, or this workflow file changed. The detect-changes job
+# uses dorny/paths-filter to decide; the e2e-api job runs only if
+# changes match. Otherwise the no-op job emits success so the workflow
+# always produces a `completed/success` run record.
+#
+# Why: auto-promote-staging.yml's gate-check (line 99) treats "workflow
+# didn't run" as failure, which dead-locked any platform-only or
+# test-only push to staging that didn't touch workspace-server paths.
+# Dropping the path filter on the trigger and gating real work
+# internally guarantees the workflow always emits a result that the
+# auto-promote chain can read. Same pattern applied to
+# e2e-staging-canvas.yml in the same PR.
 
 on:
   push:
     branches: [main, staging]
-    paths:
-      - 'workspace-server/**'
-      - 'tests/e2e/**'
-      - '.github/workflows/e2e-api.yml'
   pull_request:
     branches: [main, staging]
-    paths:
-      - 'workspace-server/**'
-      - 'tests/e2e/**'
-      - '.github/workflows/e2e-api.yml'
+  workflow_dispatch:
 
 concurrency:
   group: e2e-api-${{ github.ref }}
   cancel-in-progress: false
 
 jobs:
+  detect-changes:
+    runs-on: ubuntu-latest
+    outputs:
+      api: ${{ steps.decide.outputs.api }}  # 'true' => run the real e2e-api job
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dorny/paths-filter@v3
+        id: filter
+        # Manual dispatch has no diff context to filter against; skip the lookup.
+        if: github.event_name != 'workflow_dispatch'
+        with:
+          filters: |
+            api:
+              - 'workspace-server/**'
+              - 'tests/e2e/**'
+              - '.github/workflows/e2e-api.yml'
+      - id: decide
+        # Ops dispatching this expect the suite to actually exercise the platform.
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            echo "api=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "api=${{ steps.filter.outputs.api }}" >> "$GITHUB_OUTPUT"
+          fi
+
+  no-op:  # stand-in job for commits outside the watched paths
+    needs: detect-changes
+    if: needs.detect-changes.outputs.api != 'true'  # complement of the e2e-api job's condition
+    runs-on: ubuntu-latest
+    steps:
+      - run: |
+          echo "No workspace-server / tests/e2e / workflow changes — E2E API gate satisfied without running tests."
+          echo "::notice::E2E API Smoke Test no-op pass (paths filter excluded this commit)."
+
   e2e-api:
+    needs: detect-changes
+    if: needs.detect-changes.outputs.api == 'true'
     name: E2E API Smoke Test
     runs-on: ubuntu-latest
     timeout-minutes: 15
diff --git a/.github/workflows/e2e-staging-canvas.yml b/.github/workflows/e2e-staging-canvas.yml
index 143d9469..310e16f3 100644
--- a/.github/workflows/e2e-staging-canvas.yml
+++ b/.github/workflows/e2e-staging-canvas.yml
@@ -13,16 +13,23 @@ name: E2E Staging Canvas (Playwright)
 # workflow — mirrors what PR #1891 does for e2e-api.yml.
 
 on:
+  # Trigger model (changed 2026-04-28 — see auto-promote gap below):
+  #
+  # Always fires on push/pull_request; only does real work when canvas/
+  # or this workflow file changed. The detect-changes job uses
+  # dorny/paths-filter to decide; the playwright job runs only if
+  # changes match. Otherwise no-op emits success so the workflow always
+  # produces a `completed/success` run record.
+  #
+  # Why: auto-promote-staging.yml's gate-check (line 99) treats
+  # "workflow didn't run" as failure, which dead-locked platform-only
+  # pushes to staging. Dropping the trigger path filter and gating real
+  # work internally guarantees a result the auto-promote chain can
+  # read. Same pattern applied to e2e-api.yml in the same PR.
   push:
     branches: [main, staging]
-    paths:
-      - 'canvas/**'
-      - '.github/workflows/e2e-staging-canvas.yml'
   pull_request:
     branches: [main, staging]
-    paths:
-      - 'canvas/**'
-      - '.github/workflows/e2e-staging-canvas.yml'
   workflow_dispatch:
   schedule:
     # Weekly on Sunday 08:00 UTC — catches Chrome / Playwright / Next.js
@@ -34,7 +41,41 @@ concurrency:
   cancel-in-progress: false
 
 jobs:
+  detect-changes:
+    runs-on: ubuntu-latest
+    outputs:
+      canvas: ${{ steps.decide.outputs.canvas }}  # 'true' => run the playwright job
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dorny/paths-filter@v3
+        id: filter
+        # Skip on dispatch/cron: no push/PR diff to inspect (output stays empty).
+        if: github.event_name != 'workflow_dispatch' && github.event_name != 'schedule'
+        with:
+          filters: |
+            canvas:
+              - 'canvas/**'
+              - '.github/workflows/e2e-staging-canvas.yml'
+      - id: decide  # dispatch + weekly cron always run the suite, regardless of diff
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ] || [ "${{ github.event_name }}" = "schedule" ]; then
+            echo "canvas=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "canvas=${{ steps.filter.outputs.canvas }}" >> "$GITHUB_OUTPUT"
+          fi
+
+  no-op:  # stand-in job for commits outside the watched paths
+    needs: detect-changes
+    if: needs.detect-changes.outputs.canvas != 'true'  # complement of the playwright job's condition
+    runs-on: ubuntu-latest
+    steps:
+      - run: |
+          echo "No canvas / workflow changes — E2E Staging Canvas gate satisfied without running tests."
+          echo "::notice::E2E Staging Canvas no-op pass (paths filter excluded this commit)."
+
   playwright:
+    needs: detect-changes
+    if: needs.detect-changes.outputs.canvas == 'true'
     name: Canvas tabs E2E
     runs-on: ubuntu-latest
     timeout-minutes: 40

From 17018745d0772bb2a6e2fe02273681f2fafe65da Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Tue, 28 Apr 2026 13:15:13 -0700
Subject: [PATCH 2/2] fix(ci): auto-promote gate-check uses workflow file
 paths, not names
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Observed 2026-04-28: auto-promote ran for staging head 96955f7b with
all gates actually green (verified via /commits/<sha>/check-runs API)
yet `check-all-gates-green` reported `CodeQL → missing/none` and
aborted. Same SHA was promotable; auto-promote couldn't see it.

Cause: `gh run list --workflow="CodeQL"` matched two workflows in
this repo:

  - codeql.yml (explicit, scans both staging and main)
  - codeql       (GitHub UI-configured Code-quality default setup,
                  internal, scans default branch only)

gh CLI rejects ambiguous `--workflow=<name>` lookups and returns no
result → the gate fell through to `missing/none` and ALL_GREEN was
set false. Every staging push since both names existed has been
silently dead-locked.

Fix: switch GATES from display-name strings to workflow file paths.
File paths are the unique identifier for a workflow file in
.github/workflows/; display names are decoration and can collide.
The same `gh run list --workflow=<file.yml>` query that fails on
"CodeQL" succeeds on "codeql.yml" because the file path resolves
unambiguously.

No behavior change for the other three gates (CI, E2E Canvas, E2E
API Smoke) since their names didn't collide — they keep working and
are now identified by ci.yml / e2e-staging-canvas.yml / e2e-api.yml.
The log line shape changes from `CI → completed/success` to
`ci.yml → completed/success`, which is fine for ops grep.

When adding/removing a gate going forward: file paths only. Keep
branch-protection required-checks (check-run display names) in
sync as a separate manual step.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/auto-promote-staging.yml | 29 +++++++++++++++++-----
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/auto-promote-staging.yml b/.github/workflows/auto-promote-staging.yml
index c3427787..118d0c83 100644
--- a/.github/workflows/auto-promote-staging.yml
+++ b/.github/workflows/auto-promote-staging.yml
@@ -61,13 +61,30 @@ jobs:
         run: |
           set -euo pipefail
 
-          # Required gate workflow names. Must match the `name:` field
-          # in the respective .github/workflows/*.yml files.
+          # Required gate workflow files. Use file paths (relative to
+          # .github/workflows/) rather than display names because:
+          #
+          #   1. `gh run list --workflow=<name>` is ambiguous when two
+          #      workflows have the same `name:` — observed 2026-04-28
+          #      with "CodeQL" matching both `codeql.yml` (explicit) and
+          #      GitHub's UI-configured Code-quality default setup
+          #      (internal "codeql"). gh CLI returns "could not resolve
+          #      to a unique workflow" → empty result → gate evaluated
+          #      as missing/none → auto-promote dead-locked despite all
+          #      checks actually passing.
+          #
+          #   2. File paths are the unique identifier for workflows;
+          #      `name:` is just a display string and can collide.
+          #
+          # When adding/removing a gate, update this list AND the
+          # branch-protection required-checks list (which uses check-run
+          # display names, not workflow names; the two are decoupled and
+          # should be kept in sync manually).
           GATES=(
-            "CI"
-            "E2E Staging Canvas (Playwright)"
-            "E2E API Smoke Test"
-            "CodeQL"
+            "ci.yml"
+            "e2e-staging-canvas.yml"
+            "e2e-api.yml"
+            "codeql.yml"
           )
 
           echo "head_sha=${HEAD_SHA}" >> "$GITHUB_OUTPUT"