From 7cc1c39c49c97d504b62ebaaff982d75c6925bcd Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Mon, 4 May 2026 20:21:59 -0700
Subject: [PATCH] ci: e2e coverage matrix + branch-protection-as-code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #9.

Three pieces, all small:

1. **docs/e2e-coverage.md** — source of truth for which E2E suites
   guard which surfaces. Today three were running but informational
   only on staging; that's how the org-import silent-drop bug shipped
   without a test catching it pre-merge. Now the matrix shows what's
   required where + a follow-up note for the two suites that need an
   always-emit refactor before they can be required.

2. **tools/branch-protection/apply.sh** — branch protection as code.
   Lets `staging` and `main` required-checks live in a reviewable
   shell script instead of UI clicks that get lost between admins.
   This PR's net change: add `E2E API Smoke Test` and `Canvas tabs E2E`
   as required on staging. Both already use the always-emit path-filter
   pattern (no-op step emits SUCCESS when the workflow's paths weren't
   touched), so making them required can't deadlock unrelated PRs.

3. **branch-protection-drift.yml** — daily cron + drift_check.sh
   that compares live protection against apply.sh's desired state.
   Catches out-of-band UI edits before they drift further. Fails the
   workflow on mismatch; ops re-runs apply.sh or updates the script.

Out of scope (filed as follow-ups):
- e2e-staging-saas + e2e-staging-external use plain `paths:` filters
  and never trigger when paths are unchanged. They need refactoring
  to the always-emit shape (same as e2e-api / e2e-staging-canvas)
  before they can be required.
- main branch protection mirrors staging here; if main wants the
  E2E SaaS / External added later, do it in apply.sh and rerun.

Operator must apply once after merge:
  bash tools/branch-protection/apply.sh
The drift check picks it up from there.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/branch-protection-drift.yml |  42 ++++++
 docs/e2e-coverage.md                          |  58 ++++++++
 tools/branch-protection/apply.sh              | 133 ++++++++++++++++++
 tools/branch-protection/drift_check.sh        |  44 ++++++
 4 files changed, 277 insertions(+)
 create mode 100644 .github/workflows/branch-protection-drift.yml
 create mode 100644 docs/e2e-coverage.md
 create mode 100755 tools/branch-protection/apply.sh
 create mode 100755 tools/branch-protection/drift_check.sh

diff --git a/.github/workflows/branch-protection-drift.yml b/.github/workflows/branch-protection-drift.yml
new file mode 100644
index 00000000..7b05f09f
--- /dev/null
+++ b/.github/workflows/branch-protection-drift.yml
@@ -0,0 +1,42 @@
+name: branch-protection drift check
+
+# Catches out-of-band edits to branch protection (UI clicks, manual gh
+# api PATCH from a one-off ops session) by comparing live state against
+# tools/branch-protection/apply.sh's desired state every day. Fails the
+# workflow when they drift; the failure is the signal.
+#
+# When it fails: re-run apply.sh to put the live state back to the
+# script's intent, OR update apply.sh to encode the new intent and
+# commit. Either way the script is the source of truth.
+
+on:
+  schedule:
+    # 14:00 UTC daily. Off-hours for most teams; gives a fresh signal
+    # at the start of every working day.
+    - cron: '0 14 * * *'
+  workflow_dispatch:
+  pull_request:
+    branches: [staging, main]
+    paths:
+      - 'tools/branch-protection/**'
+      - '.github/workflows/branch-protection-drift.yml'
+
+permissions:
+  contents: read
+
+jobs:
+  drift:
+    name: Branch protection drift
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - name: Run drift check
+        env:
+          # GH_TOKEN_FOR_ADMIN_API — repo-admin scope, needed for the
+          # /branches/:b/protection endpoint. Falls back to GITHUB_TOKEN
+          # for read-only mode (gh api on the protection endpoint
+          # returns 200 to maintainers, 404 to read-only). Configure at
+          # Settings → Secrets and variables → Actions.
+          GH_TOKEN: ${{ secrets.GH_TOKEN_FOR_ADMIN_API || secrets.GITHUB_TOKEN }}
+        run: bash tools/branch-protection/drift_check.sh
diff --git a/docs/e2e-coverage.md b/docs/e2e-coverage.md
new file mode 100644
index 00000000..48ca0e19
--- /dev/null
+++ b/docs/e2e-coverage.md
@@ -0,0 +1,58 @@
+# E2E coverage matrix
+
+This document is the source of truth for which E2E suites guard which surfaces and which gates are wired up where. Read this before adding a new E2E or moving a check between branches.
+
+## Suites
+
+| Workflow file | Job (= required-check name) | What it covers | Cron |
+|---|---|---|---|
+| `e2e-api.yml` | `E2E API Smoke Test` | A2A handshake, registry/register, /workspaces/:id/a2a forward, structured-event emission. Lightweight enough to run on every PR. | — |
+| `e2e-staging-canvas.yml` | `Canvas tabs E2E` | Canvas-tab Playwright UX checks against staging — config tab, secrets tab, agent-card tab, Activity hydration. | weekly Sun 08:00 UTC |
+| `e2e-staging-saas.yml` | `E2E Staging SaaS` | Full lifecycle: org creation → workspace provision (CP path) → A2A delegation → status/heartbeat → workspace delete → EC2 termination. The integration test that catches the silent-drop bug class (#2486 / #2811 / #2813 / #2814). | daily 07:00 UTC |
+| `e2e-staging-external.yml` | `E2E Staging External Runtime` | External-runtime registration + heartbeat staleness sweep + `/registry/peers` resolution. Validates the OSS-templated workspace path. | daily 07:30 UTC |
+| `e2e-staging-sanity.yml` | `Intentional-failure teardown sanity` | Inverted assertion — the run MUST fail. Validates the leak-detection self-check itself; not for general gating. | weekly Mon 06:00 UTC |
+| `continuous-synth-e2e.yml` | `Synthetic E2E against staging` | Standing background coverage between PR runs. Catches drift in production-like staging that PR-time E2Es miss. | every 15 min |
+
+## Required-check status (branch protection)
+
+| Suite | staging required | main required |
+|---|---|---|
+| `E2E API Smoke Test` | ✅ this PR | ✅ |
+| `Canvas tabs E2E` | ✅ this PR | (see follow-up) |
+| `E2E Staging SaaS` | ❌ — needs always-emit refactor | ❌ |
+| `E2E Staging External Runtime` | ❌ — needs always-emit refactor | ❌ |
+| `Intentional-failure teardown sanity` | ❌ inverted assertion, never required | ❌ |
+| `Synthetic E2E against staging` | ❌ cron-only, not a per-PR gate | ❌ |
+
+## Why the always-emit pattern matters
+
+Branch protection requires a *check name* to land at SUCCESS for every PR. Workflows with `paths:` filters that exclude a PR never run, so the check name never appears, and the PR sits BLOCKED forever.
+
+The pattern that supports being required is:
+
+1. Workflow always triggers on push/PR to the protected branch.
+2. A `detect-changes` job uses `dorny/paths-filter` to decide if real work runs.
+3. The protected job runs unconditionally and either (a) does real work when paths matched, or (b) emits a no-op SUCCESS step when paths skipped.
+
+`e2e-api.yml` and `e2e-staging-canvas.yml` already have this shape. `e2e-staging-saas.yml` and `e2e-staging-external.yml` use plain `paths:` filters and need the refactor before they can be required (filed as follow-up).
+
+## Adding a new E2E suite
+
+1. Pick a verb: smoke test, full lifecycle, fault-injection, drift detection. Pre-existing suites split along these lines.
+2. Use the always-emit shape so the check name can be made required.
+3. Add a row to the matrix above.
+4. Decide cron cadence based on cost + how fast drift would otherwise be caught.
+5. If you want it required, add to the relevant branch protection via `tools/branch-protection/apply.sh` (this PR adds the script).
+
+## When to break glass — temporarily skip a required E2E
+
+Don't. If an E2E is intermittently flaky, fix the test or move it out of required. The point of a required check is that it's load-bearing; bypassing one with admin override teaches the next operator the gate is optional.
+
+If a Production incident requires bypassing, document the override in the incident postmortem with a same-week followup to either fix the test or rip the check out of required.
+
+## Related issues / PRs
+
+- #2486 — silent-drop bug class that the SaaS E2E now catches
+- PR #2811 — `provisionWorkspaceAuto` consolidation (org-import SaaS gate)
+- PR #2824 — `StopWorkspaceAuto` mirror (closes #2813 + #2814)
+- Follow-up: refactor `e2e-staging-saas` + `e2e-staging-external` to always-emit (so they can be required)
diff --git a/tools/branch-protection/apply.sh b/tools/branch-protection/apply.sh
new file mode 100755
index 00000000..928d2829
--- /dev/null
+++ b/tools/branch-protection/apply.sh
@@ -0,0 +1,133 @@
+#!/usr/bin/env bash
+# tools/branch-protection/apply.sh — idempotently apply branch
+# protection to molecule-core's `staging` and `main` branches.
+#
+# Why a script: GitHub's branch protection lives in repo settings, so
+# changes are usually clicked through the UI and lost between admins.
+# This script makes the config reproducible — diff it against the live
+# state, change the file, run it, done. Single source of truth that
+# shows up in code review.
+#
+# Usage:
+#   tools/branch-protection/apply.sh                # apply both branches
+#   tools/branch-protection/apply.sh --dry-run      # show payload only
+#   tools/branch-protection/apply.sh --branch staging
+#
+# Requires: gh CLI authenticated as a repo admin. The script uses gh's
+# token (no separate PAT needed).
+
+set -euo pipefail
+
+REPO="Molecule-AI/molecule-core"
+DRY_RUN=0
+ONLY_BRANCH=""
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --dry-run) DRY_RUN=1; shift ;;
+    --branch)  ONLY_BRANCH="$2"; shift 2 ;;
+    -h|--help)
+      echo "Usage: $0 [--dry-run] [--branch <name>]"
+      exit 0
+      ;;
+    *) echo "Unknown arg: $1" >&2; exit 1 ;;
+  esac
+done
+
+# Required-check matrices. Each branch's set is the canonical list of
+# check NAMES (from each workflow's job-name). Adding/removing a check
+# here is the place to do it. Match docs/e2e-coverage.md.
+#
+# Why staging gets E2E API + Canvas E2E (this PR's addition): both
+# already use the always-emit pattern (path-filter no-ops emit SUCCESS),
+# so making them required can't deadlock a PR that doesn't touch their
+# paths. The other E2Es (SaaS, External) need a refactor to that
+# pattern before they can be required — tracked as follow-up.
+
+read -r -d '' STAGING_CHECKS <<'EOF' || true
+Analyze (go)
+Analyze (javascript-typescript)
+Analyze (python)
+Canvas (Next.js)
+Canvas tabs E2E
+Detect changes
+E2E API Smoke Test
+Platform (Go)
+Python Lint & Test
+Scan diff for credential-shaped strings
+Shellcheck (E2E scripts)
+EOF
+
+read -r -d '' MAIN_CHECKS <<'EOF' || true
+Analyze (go)
+Analyze (javascript-typescript)
+Analyze (python)
+Canvas (Next.js)
+Canvas tabs E2E
+Detect changes
+E2E API Smoke Test
+PR-built wheel + import smoke
+Platform (Go)
+Python Lint & Test
+Scan diff for credential-shaped strings
+Shellcheck (E2E scripts)
+EOF
+
+build_payload() {
+  local checks="$1"
+  local require_reviews="$2"  # true / false
+  local checks_json
+  checks_json=$(printf '%s\n' "$checks" | jq -Rs '
+    split("\n")
+    | map(select(length > 0))
+    | map({context: ., app_id: -1})
+  ')
+  jq -n \
+    --argjson checks "$checks_json" \
+    --argjson reviews "$require_reviews" \
+    '{
+      required_status_checks: {
+        strict: false,
+        checks: $checks
+      },
+      enforce_admins: false,
+      required_pull_request_reviews: (
+        if $reviews then
+          { required_approving_review_count: 1, dismiss_stale_reviews: true }
+        else null end
+      ),
+      restrictions: null,
+      allow_deletions: false,
+      allow_force_pushes: false,
+      block_creations: false,
+      required_conversation_resolution: true,
+      required_linear_history: false,
+      lock_branch: false,
+      allow_fork_syncing: true
+    }'
+}
+
+apply_branch() {
+  local branch="$1"
+  local checks="$2"
+  local require_reviews="$3"
+  local payload
+  payload=$(build_payload "$checks" "$require_reviews")
+  if [[ "$DRY_RUN" -eq 1 ]]; then
+    echo "=== branch: $branch ==="
+    echo "$payload" | jq .
+    return
+  fi
+  echo "Applying branch protection on $branch..."
+  printf '%s' "$payload" | gh api -X PUT \
+    "repos/$REPO/branches/$branch/protection" \
+    --input -
+  echo "Applied: $branch"
+}
+
+if [[ -z "$ONLY_BRANCH" || "$ONLY_BRANCH" == "staging" ]]; then
+  apply_branch staging "$STAGING_CHECKS" true
+fi
+if [[ -z "$ONLY_BRANCH" || "$ONLY_BRANCH" == "main" ]]; then
+  apply_branch main "$MAIN_CHECKS" true
+fi
diff --git a/tools/branch-protection/drift_check.sh b/tools/branch-protection/drift_check.sh
new file mode 100755
index 00000000..b25e23f0
--- /dev/null
+++ b/tools/branch-protection/drift_check.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# tools/branch-protection/drift_check.sh — compare the live branch
+# protection on staging + main against what apply.sh would set. Used
+# by branch-protection-drift.yml (cron) to catch out-of-band UI edits.
+#
+# Exit codes:
+#   0 — live state matches the script
+#   1 — drift detected (output shows the diff)
+#   2 — gh API call failed
+
+set -euo pipefail
+
+REPO="Molecule-AI/molecule-core"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+EXIT_CODE=0
+
+check_branch() {
+  local branch="$1"
+  local want
+  want=$(bash "$SCRIPT_DIR/apply.sh" --dry-run --branch "$branch" 2>&1 |
+    sed -n '/^{$/,/^}$/p' |
+    jq -S '.required_status_checks.checks | map(.context) | sort')
+  local have
+  if ! have=$(gh api "repos/$REPO/branches/$branch/protection/required_status_checks" 2>/dev/null |
+    jq -S '.checks | map(.context) | sort'); then
+    echo "drift_check: FAIL to fetch $branch protection (gh API error)"
+    return 2
+  fi
+  if [[ "$want" != "$have" ]]; then
+    echo "=== DRIFT on $branch ==="
+    echo "want:"; echo "$want"
+    echo "have:"; echo "$have"
+    diff <(echo "$want") <(echo "$have") || true
+    return 1
+  fi
+  echo "OK: $branch matches desired state"
+}
+
+for b in staging main; do
+  if ! check_branch "$b"; then
+    EXIT_CODE=1
+  fi
+done
+exit "$EXIT_CODE"