diff --git a/.github/workflows/branch-protection-drift.yml b/.github/workflows/branch-protection-drift.yml index 7b05f09f..960f03c8 100644 --- a/.github/workflows/branch-protection-drift.yml +++ b/.github/workflows/branch-protection-drift.yml @@ -31,12 +31,34 @@ jobs: timeout-minutes: 5 steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + # Per memory feedback_schedule_vs_dispatch_secrets_hardening.md: + # schedule + pull_request triggers MUST hard-fail when the admin + # token is missing — silent soft-skip masks the gate disappearing. + # workflow_dispatch keeps soft-skip so an operator can run a + # diagnostic one-off without configuring the secret first. + - name: Verify admin token present (hard-fail on schedule/PR) + if: github.event_name != 'workflow_dispatch' + env: + GH_TOKEN_FOR_ADMIN_API: ${{ secrets.GH_TOKEN_FOR_ADMIN_API }} + run: | + if [[ -z "$GH_TOKEN_FOR_ADMIN_API" ]]; then + echo "::error::GH_TOKEN_FOR_ADMIN_API secret missing." >&2 + echo "" >&2 + echo "drift_check requires repo-admin scope to read /branches/:b/protection." >&2 + echo "GITHUB_TOKEN does not have that scope." >&2 + echo "Set GH_TOKEN_FOR_ADMIN_API at Settings → Secrets and variables → Actions." >&2 + echo "" >&2 + echo "On workflow_dispatch this step soft-skips for one-off operator runs." >&2 + exit 1 + fi + - name: Run drift check env: # GH_TOKEN_FOR_ADMIN_API — repo-admin scope, needed for the # /branches/:b/protection endpoint. Falls back to GITHUB_TOKEN - # for read-only mode (gh api on the protection endpoint - # returns 200 to maintainers, 404 to read-only). Configure at - # Settings → Secrets and variables → Actions. + # only on workflow_dispatch (operator override); the verify + # step above hard-fails any other trigger when the secret is + # missing. GH_TOKEN: ${{ secrets.GH_TOKEN_FOR_ADMIN_API || secrets.GITHUB_TOKEN }} run: bash tools/branch-protection/drift_check.sh diff --git a/tools/branch-protection/apply.sh b/tools/branch-protection/apply.sh index 928d2829..21b72ac2 100755 --- a/tools/branch-protection/apply.sh +++ b/tools/branch-protection/apply.sh @@ -2,16 +2,32 @@ # tools/branch-protection/apply.sh — idempotently apply branch # protection to molecule-core's `staging` and `main` branches. # -# Why a script: GitHub's branch protection lives in repo settings, so -# changes are usually clicked through the UI and lost between admins. -# This script makes the config reproducible — diff it against the live -# state, change the file, run it, done. Single source of truth that -# shows up in code review. +# Single source of truth for the protection settings. Diff this file +# against the live state (drift_check.sh handles that nightly + on +# every PR that touches this directory). +# +# Why each branch has its OWN payload section instead of a shared +# template: pre-2026-05-05 the script generated both branches from a +# shared template that hard-coded enforce_admins=false, +# dismiss_stale_reviews=true, strict=false, allow_fork_syncing=true, +# and dropped bypass_pull_request_allowances. Live staging had +# enforce_admins=true, dismiss_stale_reviews=false, strict=true, +# allow_fork_syncing=false, and a bypass list. Running the script +# would have silently weakened protection on every dimension at once. +# Per-branch payloads codify the deliberate per-branch policy that +# already lives on the repo, with the script's net contribution +# being ONLY the explicit additions to required_status_checks. +# +# Per memory feedback_dismiss_stale_reviews_blocks_promote.md, +# dismiss_stale_reviews=true silently re-blocks every auto-promote PR +# (cost the user 2.5h once already on staging — confirming we keep +# this OFF on staging is load-bearing for the auto-promote chain). # # Usage: # tools/branch-protection/apply.sh # apply both branches # tools/branch-protection/apply.sh --dry-run # show payload only # tools/branch-protection/apply.sh --branch staging +# tools/branch-protection/apply.sh --skip-preflight # skip check-name validation # # Requires: gh CLI authenticated as a repo admin. The script uses gh's # token (no separate PAT needed). @@ -21,28 +37,25 @@ set -euo pipefail REPO="Molecule-AI/molecule-core" DRY_RUN=0 ONLY_BRANCH="" +SKIP_PREFLIGHT=0 while [[ $# -gt 0 ]]; do case "$1" in --dry-run) DRY_RUN=1; shift ;; --branch) ONLY_BRANCH="$2"; shift 2 ;; + --skip-preflight) SKIP_PREFLIGHT=1; shift ;; -h|--help) - echo "Usage: $0 [--dry-run] [--branch ]" + echo "Usage: $0 [--dry-run] [--branch ] [--skip-preflight]" exit 0 ;; *) echo "Unknown arg: $1" >&2; exit 1 ;; esac done -# Required-check matrices. Each branch's set is the canonical list of -# check NAMES (from each workflow's job-name). Adding/removing a check -# here is the place to do it. Match docs/e2e-coverage.md. -# -# Why staging gets E2E API + Canvas E2E (this PR's addition): both -# already use the always-emit pattern (path-filter no-ops emit SUCCESS), -# so making them required can't deadlock a PR that doesn't touch their -# paths. The other E2Es (SaaS, External) need a refactor to that -# pattern before they can be required — tracked as follow-up. +# ─── Required-check matrices ────────────────────────────────────── +# Each branch's set is the canonical list of check NAMES (from each +# workflow's job-name). Adding/removing a check here is the place to +# do it. Match docs/e2e-coverage.md. read -r -d '' STAGING_CHECKS <<'EOF' || true Analyze (go) @@ -73,29 +86,41 @@ Scan diff for credential-shaped strings Shellcheck (E2E scripts) EOF -build_payload() { - local checks="$1" - local require_reviews="$2" # true / false - local checks_json - checks_json=$(printf '%s\n' "$checks" | jq -Rs ' +checks_to_json() { + printf '%s\n' "$1" | jq -Rs ' split("\n") | map(select(length > 0)) | map({context: ., app_id: -1}) - ') + ' +} + +# ─── Per-branch payloads (each preserves live-state policy) ─────── +# Staging payload — preserves the live values that pre-2026-05-05's +# apply.sh would have silently rewritten: +# enforce_admins=true, dismiss_stale_reviews=false, strict=true, +# allow_fork_syncing=false, bypass list = HongmingWang-Rabbit + molecule-ai app. +build_staging_payload() { + local checks_json + checks_json=$(checks_to_json "$STAGING_CHECKS") jq -n \ --argjson checks "$checks_json" \ - --argjson reviews "$require_reviews" \ '{ required_status_checks: { - strict: false, + strict: true, checks: $checks }, - enforce_admins: false, - required_pull_request_reviews: ( - if $reviews then - { required_approving_review_count: 1, dismiss_stale_reviews: true } - else null end - ), + enforce_admins: true, + required_pull_request_reviews: { + required_approving_review_count: 1, + dismiss_stale_reviews: false, + require_code_owner_reviews: false, + require_last_push_approval: false, + bypass_pull_request_allowances: { + users: ["HongmingWang-Rabbit"], + teams: [], + apps: ["molecule-ai"] + } + }, restrictions: null, allow_deletions: false, allow_force_pushes: false, @@ -103,21 +128,101 @@ build_payload() { required_conversation_resolution: true, required_linear_history: false, lock_branch: false, - allow_fork_syncing: true + allow_fork_syncing: false }' } +# Main payload — preserves the live values: +# enforce_admins=false, dismiss_stale_reviews=true, strict=true, +# allow_fork_syncing=false, NO bypass list. +# main intentionally has different settings than staging because main +# is the deploy target — the auto-promote app pushes to main without +# the friction of an admin-bypass list, and stale-review dismissal +# is acceptable here because every change has already cleared +# staging review. +build_main_payload() { + local checks_json + checks_json=$(checks_to_json "$MAIN_CHECKS") + jq -n \ + --argjson checks "$checks_json" \ + '{ + required_status_checks: { + strict: true, + checks: $checks + }, + enforce_admins: false, + required_pull_request_reviews: { + required_approving_review_count: 1, + dismiss_stale_reviews: true, + require_code_owner_reviews: false, + require_last_push_approval: false + }, + restrictions: null, + allow_deletions: false, + allow_force_pushes: false, + block_creations: false, + required_conversation_resolution: true, + required_linear_history: false, + lock_branch: false, + allow_fork_syncing: false + }' +} + +# ─── R3 preflight: validate every desired check name has at least +# one historical run ────────────────────────────────────────────── +# Pre-fix the script accepted arbitrary strings into +# required_status_checks.checks. A typo like "Canvas Tabs E2E" vs +# "Canvas tabs E2E" → GH accepts → every PR is blocked forever +# waiting for a context that never emits. The preflight hits the +# /commits/{sha}/check-runs endpoint and asserts each desired name +# has at least one matching run. Skippable via --skip-preflight for +# the case where you're adding a brand-new workflow whose first run +# hasn't fired yet. +preflight_check_names() { + local branch="$1" + local checks="$2" + local sha + sha=$(gh api "repos/$REPO/commits/$branch" --jq '.sha' 2>/dev/null || echo "") + if [[ -z "$sha" ]]; then + echo "preflight: WARN cannot resolve $branch tip SHA, skipping check-name validation" >&2 + return 0 + fi + local known_names + known_names=$(gh api "repos/$REPO/commits/$sha/check-runs?per_page=100" \ + --jq '.check_runs | map(.name)' 2>/dev/null || echo "[]") + local missing=() + while IFS= read -r name; do + [[ -z "$name" ]] && continue + if ! echo "$known_names" | jq -e --arg n "$name" 'index($n) != null' >/dev/null; then + missing+=("$name") + fi + done <<< "$checks" + if [[ ${#missing[@]} -gt 0 ]]; then + echo "preflight: $branch — these check names are NOT in the historical check-runs for the tip SHA:" >&2 + printf ' - %s\n' "${missing[@]}" >&2 + echo "If they're truly new (workflow added but never run), re-run with --skip-preflight." >&2 + echo "Otherwise typos here will permanently block every PR — fix the names." >&2 + return 1 + fi +} + apply_branch() { local branch="$1" local checks="$2" - local require_reviews="$3" + local payload_fn="$3" local payload - payload=$(build_payload "$checks" "$require_reviews") + payload=$($payload_fn) if [[ "$DRY_RUN" -eq 1 ]]; then echo "=== branch: $branch ===" echo "$payload" | jq . return fi + if [[ "$SKIP_PREFLIGHT" -eq 0 ]]; then + if ! preflight_check_names "$branch" "$checks"; then + echo "FAIL: preflight on $branch caught typos or missing workflows. Aborting." >&2 + return 1 + fi + fi echo "Applying branch protection on $branch..." printf '%s' "$payload" | gh api -X PUT \ "repos/$REPO/branches/$branch/protection" \ @@ -126,8 +231,8 @@ apply_branch() { } if [[ -z "$ONLY_BRANCH" || "$ONLY_BRANCH" == "staging" ]]; then - apply_branch staging "$STAGING_CHECKS" true + apply_branch staging "$STAGING_CHECKS" build_staging_payload fi if [[ -z "$ONLY_BRANCH" || "$ONLY_BRANCH" == "main" ]]; then - apply_branch main "$MAIN_CHECKS" true + apply_branch main "$MAIN_CHECKS" build_main_payload fi diff --git a/tools/branch-protection/drift_check.sh b/tools/branch-protection/drift_check.sh index b25e23f0..86d120f2 100755 --- a/tools/branch-protection/drift_check.sh +++ b/tools/branch-protection/drift_check.sh @@ -3,6 +3,12 @@ # protection on staging + main against what apply.sh would set. Used # by branch-protection-drift.yml (cron) to catch out-of-band UI edits. # +# Pre-2026-05-05 version diffed only required_status_checks.checks — +# would have missed a UI click that flipped enforce_admins or +# dismiss_stale_reviews. Now compares the full normalized payload so +# any silent rewrite of admin/review/lock/deletion settings trips the +# drift gate. +# # Exit codes: # 0 — live state matches the script # 1 — drift detected (output shows the diff) @@ -14,22 +20,129 @@ REPO="Molecule-AI/molecule-core" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" EXIT_CODE=0 +# Normalise the GET /branches/:b/protection response so we can compare +# against apply.sh's payload. The GET response inflates booleans into +# {url, enabled} sub-objects and bypass list users/apps into full +# user/app objects with avatar_url etc — strip those down to match +# the input shape. +NORMALISE_LIVE='{ + required_status_checks: ( + .required_status_checks + | { strict: .strict, + checks: (.checks | map({context}) | sort_by(.context)) } + ), + enforce_admins: ( + if (.enforce_admins | type) == "object" + then .enforce_admins.enabled + else .enforce_admins end + ), + required_pull_request_reviews: ( + .required_pull_request_reviews + | if . == null then null else + { required_approving_review_count, + dismiss_stale_reviews, + require_code_owner_reviews, + require_last_push_approval, + bypass_pull_request_allowances: ( + if .bypass_pull_request_allowances == null then null + else { + users: (.bypass_pull_request_allowances.users // [] | map(.login) | sort), + teams: (.bypass_pull_request_allowances.teams // [] | map(.slug) | sort), + apps: (.bypass_pull_request_allowances.apps // [] | map(.slug) | sort) + } end + ) + } + end + ), + restrictions: ( + if .restrictions == null then null + else { users: (.restrictions.users | map(.login) | sort), + teams: (.restrictions.teams | map(.slug) | sort), + apps: (.restrictions.apps | map(.slug) | sort) } + end + ), + allow_deletions: ( + if (.allow_deletions | type) == "object" then .allow_deletions.enabled + else (.allow_deletions // false) end + ), + allow_force_pushes: ( + if (.allow_force_pushes | type) == "object" then .allow_force_pushes.enabled + else (.allow_force_pushes // false) end + ), + block_creations: ( + if (.block_creations | type) == "object" then .block_creations.enabled + else (.block_creations // false) end + ), + required_conversation_resolution: ( + if (.required_conversation_resolution | type) == "object" + then .required_conversation_resolution.enabled + else (.required_conversation_resolution // false) end + ), + required_linear_history: ( + if (.required_linear_history | type) == "object" then .required_linear_history.enabled + else (.required_linear_history // false) end + ), + lock_branch: ( + if (.lock_branch | type) == "object" then .lock_branch.enabled + else (.lock_branch // false) end + ), + allow_fork_syncing: ( + if (.allow_fork_syncing | type) == "object" then .allow_fork_syncing.enabled + else (.allow_fork_syncing // false) end + ) +}' + +# Apply.sh's payload is already in the input shape; we just need to +# canonicalise the checks order and fill in optional fields with their +# defaults so the comparison aligns. +NORMALISE_SCRIPT='{ + required_status_checks: { + strict: .required_status_checks.strict, + checks: (.required_status_checks.checks | map({context}) | sort_by(.context)) + }, + enforce_admins: .enforce_admins, + required_pull_request_reviews: ( + if .required_pull_request_reviews == null then null else + { required_approving_review_count: .required_pull_request_reviews.required_approving_review_count, + dismiss_stale_reviews: .required_pull_request_reviews.dismiss_stale_reviews, + require_code_owner_reviews: (.required_pull_request_reviews.require_code_owner_reviews // false), + require_last_push_approval: (.required_pull_request_reviews.require_last_push_approval // false), + bypass_pull_request_allowances: ( + if .required_pull_request_reviews.bypass_pull_request_allowances == null then null + else { + users: (.required_pull_request_reviews.bypass_pull_request_allowances.users // [] | sort), + teams: (.required_pull_request_reviews.bypass_pull_request_allowances.teams // [] | sort), + apps: (.required_pull_request_reviews.bypass_pull_request_allowances.apps // [] | sort) + } end + ) + } + end + ), + restrictions: .restrictions, + allow_deletions: (.allow_deletions // false), + allow_force_pushes: (.allow_force_pushes // false), + block_creations: (.block_creations // false), + required_conversation_resolution: (.required_conversation_resolution // false), + required_linear_history: (.required_linear_history // false), + lock_branch: (.lock_branch // false), + allow_fork_syncing: (.allow_fork_syncing // false) +}' + check_branch() { local branch="$1" local want want=$(bash "$SCRIPT_DIR/apply.sh" --dry-run --branch "$branch" 2>&1 | sed -n '/^{$/,/^}$/p' | - jq -S '.required_status_checks.checks | map(.context) | sort') - local have - if ! have=$(gh api "repos/$REPO/branches/$branch/protection/required_status_checks" 2>/dev/null | - jq -S '.checks | map(.context) | sort'); then + jq -S "$NORMALISE_SCRIPT") + local have_raw + if ! have_raw=$(gh api "repos/$REPO/branches/$branch/protection" 2>/dev/null); then echo "drift_check: FAIL to fetch $branch protection (gh API error)" return 2 fi + local have + have=$(echo "$have_raw" | jq -S "$NORMALISE_LIVE") if [[ "$want" != "$have" ]]; then echo "=== DRIFT on $branch ===" - echo "want:"; echo "$want" - echo "have:"; echo "$have" diff <(echo "$want") <(echo "$have") || true return 1 fi