From 9c661f7020766e05903bf20d68a7d75329d8ff27 Mon Sep 17 00:00:00 2001 From: core-devops Date: Fri, 5 Jun 2026 17:25:16 -0700 Subject: [PATCH] fix(ci): make required CI gates fail-closed on auth failure / unverifiable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sweep of .gitea/workflows + .gitea/scripts for fail-opens: REQUIRED/HARD gates that EXIT 0 / forge a green status when they could NOT actually verify their invariant (401/403 auth failure, transient API error, swallowed exit code). On protected contexts (push/schedule/dispatch on main, same-repo PRs, pull_request_target) these now fail LOUD (::error:: + nonzero) and fail CLOSED. Auth-failure (403) is split from a genuinely-absent resource read with a valid token (404), which stays a loud-but-tolerated graceful skip. Fixes: 1. sop-tier-refire.sh — CRITICAL. `bash sop-tier-check.sh || true; TIER_EXIT=0` discarded the real verdict and ALWAYS POSTed state=success for the REQUIRED `sop-tier-check / tier-check (pull_request)` context. Any collaborator commenting /refire-tier-check could forge a green SOP-6 approval gate (fail-open + branch-protection bypass). Now captures the real exit code and POSTs the honest verdict. 2. sop-tier-check.yml — removed SOP_FAIL_OPEN=1 on the required SOP-6 gate. It ran on pull_request_target (always same-repo, secrets always present — no fork/advisory split), so failing open on empty/invalid token / unreachable Gitea / missing jq greened the approval gate without verifying approvals. Now fails closed on infra faults too. 3. lint_bp_context_emit_match.py — 403/transient returned 0; now exit 2. 4. lint_required_context_exists_in_bp.py — 403/transient returned 0; now exit 2. 5. lint-required-no-paths.py — 403 (conflated with 404) returned 0; 403 now exit 4 (fail closed), 404 stays a graceful ::warning:: skip. 6. ci-required-drift.py — 403 (conflated with 404) returned []; 403 now raises (fail loud), 404 stays a per-branch graceful skip. Tests updated to assert the new fail-closed behavior (403/transient → nonzero/raise; 404 → tolerated skip) and the refire honest-verdict POST. All 67 python + 26 refire shell tests pass. Off-limits (parallel branches), not touched: manifest.json, check-manifest-repos-exist.sh, publish-workspace-server-image.yml, byok_*/workspace.go create-gate. Deliberately-advisory mc#1982 Phase-3 continue-on-error:true masks left as-is (not required gates). NOTE: requires DRIFT_BOT_TOKEN to have repo-admin scope on molecule-core (org team `drift-bot`, perm=admin) BEFORE these merge, else the BP-read lints go honest-red. The drift-bot admin team exists; confirm the first post-merge scheduled run reads BP (not 403) before relying on green. Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitea/scripts/ci-required-drift.py | 63 +++++++++++++------ .gitea/scripts/lint-required-no-paths.py | 52 +++++++++++---- .gitea/scripts/lint_bp_context_emit_match.py | 56 ++++++++++++----- .../lint_required_context_exists_in_bp.py | 52 ++++++++++----- .gitea/scripts/sop-tier-refire.sh | 46 +++++++++++--- .gitea/scripts/tests/test_sop_tier_refire.sh | 19 +++--- .gitea/workflows/sop-tier-check.yml | 51 ++++++++++----- tests/test_ci_required_drift.py | 54 ++++++++++++++++ tests/test_lint_bp_context_emit_match.py | 35 ++++++++--- ...test_lint_required_context_exists_in_bp.py | 45 +++++++++++-- tests/test_lint_required_no_paths.py | 37 ++++++++--- 11 files changed, 391 insertions(+), 119 deletions(-) diff --git a/.gitea/scripts/ci-required-drift.py b/.gitea/scripts/ci-required-drift.py index b32cffb41..b4039507e 100755 --- a/.gitea/scripts/ci-required-drift.py +++ b/.gitea/scripts/ci-required-drift.py @@ -361,15 +361,17 @@ def detect_drift(branch: str) -> tuple[list[str], dict]: """Returns (findings, debug). Empty findings == no drift. Raises: - ApiError: propagated from the protection fetch only when the - failure is likely a transient Gitea outage (5xx). - 403/404 from the protection endpoint is treated as - "cannot determine drift for this branch" — a token- - scope issue (missing repo-admin on DRIFT_BOT_TOKEN) or - a repo with no protection set should not turn the - hourly cron red. The workflow continues to the next - branch; no [ci-drift] issue is filed for a branch - whose protection cannot be read. + ApiError: propagated (fail-closed) on a transient Gitea outage + (5xx) AND on a 401/403 auth failure from the protection + endpoint. A 401/403 means DRIFT_BOT_TOKEN cannot read + branch protections at all — drift is UNVERIFIABLE, so + this HARD gate must fail loud rather than green + undetected drift (the regression class it exists to + catch). An authenticated 404 (branch genuinely has no + protection, e.g. staging pre-rollout) is the one + tolerated skip: it returns ([], debug) with a loud + ::warning:: and the workflow continues to the next + branch. """ findings: list[str] = [] @@ -403,17 +405,38 @@ def detect_drift(branch: str) -> tuple[list[str], dict]: m = _re.search(r"HTTP (\d{3})", msg) if m: http_status = int(m.group(1)) - if http_status in (403, 404): - # Token lacks scope OR branch has no protection. Cannot - # determine drift — skip this branch. Do NOT exit non-zero; - # the issue IS the alarm, not a red workflow. + # FAIL-CLOSED contract (was fail-open: 403 AND 404 both returned + # [] with no signal — fixed). This is a HARD gate (no + # continue-on-error → false) running hourly on a PROTECTED context + # (schedule/dispatch on main). We split auth-failure from + # genuinely-absent: + # 401/403 → AUTH FAILURE: the token cannot read branch + # protections at all, so drift CANNOT be determined for ANY + # branch. Greening the hourly cron here means jobs↔protection + # drift goes silently undetected — exactly the regression class + # this sentinel exists to catch. Raise so the workflow fails + # loud / fails closed. + # 404 → authenticated absent resource: this specific branch has + # no protection (e.g. `staging` before its protection rollout). + # Genuinely nothing to diff against — skip THIS branch with a + # loud ::warning::, continue to the next. + if http_status in (401, 403): sys.stderr.write( - f"::error::GET {protection_path} returned HTTP {http_status} — " - f"DRIFT_BOT_TOKEN lacks repo-admin scope (Gitea 1.22.6 " - f"requires it for this endpoint) OR branch has no protection " - f"configured. Cannot determine drift for {branch}; " - f"skipping. Fix: grant repo-admin to mc-drift-bot or " - f"configure protection on {branch}.\n" + f"::error::GET {protection_path} returned HTTP " + f"{http_status} — DRIFT_BOT_TOKEN cannot read branch " + f"protections (needs repo-admin scope). AUTH FAILURE: " + f"drift CANNOT be determined, so this HARD gate FAILS " + f"CLOSED rather than greening undetected drift. Fix: grant " + f"repo-admin to mc-drift-bot (org team `drift-bot`, " + f"perm=admin) — fix the token, not the lint.\n" + ) + raise + if http_status == 404: + sys.stderr.write( + f"::warning::GET {protection_path} returned HTTP 404 — " + f"branch '{branch}' has no protection configured " + f"(authenticated absent resource). Skipping drift check for " + f"{branch}; if it SHOULD be protected, configure it.\n" ) debug = { "branch": branch, @@ -424,7 +447,7 @@ def detect_drift(branch: str) -> tuple[list[str], dict]: "audit_env_checks": sorted(env_set), } return [], debug - # 5xx — propagate (transient outage, fail loud per design). + # 5xx / other — propagate (transient outage, fail loud per design). raise if not isinstance(protection, dict): sys.stderr.write( diff --git a/.gitea/scripts/lint-required-no-paths.py b/.gitea/scripts/lint-required-no-paths.py index 911e8884c..86f30dae0 100755 --- a/.gitea/scripts/lint-required-no-paths.py +++ b/.gitea/scripts/lint-required-no-paths.py @@ -40,20 +40,24 @@ Context-format note (Gitea 1.22.6): Exit codes: 0 — no required workflow has a paths/paths-ignore filter (clean) OR - branch_protections endpoint returned 403/404 (token-scope issue; - surfaced via ::error:: but non-fatal so a missing scope doesn't - red-X every PR — fix the token, not the lint). + branch_protections returned an authenticated 404 (branch + genuinely has no protection; ::warning:: surfaced). 1 — at least one required workflow has a paths/paths-ignore filter (the gate-degrading defect class). 2 — env contract violation (missing GITEA_TOKEN/HOST/REPO/BRANCH). 3 — workflows directory missing or workflow YAML unparseable. - 4 — protection response shape unexpected (non-dict body on 2xx). + 4 — FAIL-CLOSED verification failure: branch_protections 401/403 + auth failure (token can't read BP), 5xx transient (propagated + ApiError), or unexpected response shape. This is a HARD gate on + a protected context — it MUST NOT green when it cannot verify. Auth note: `GET /repos/.../branch_protections/{branch}` requires repo-admin role in Gitea 1.22.6. The workflow-default `GITHUB_TOKEN` is non-admin; we re-use `DRIFT_BOT_TOKEN` (same persona that powers -ci-required-drift.yml). If `DRIFT_BOT_TOKEN` is unavailable in a future -context, the script falls through gracefully (exit 0 + ::error::). +ci-required-drift.yml). A 401/403 from a missing-scope token is an +AUTH FAILURE that FAILS CLOSED (exit 4) — fix the token, not the +lint. Only an authenticated 404 (genuinely-absent protection) is a +tolerated graceful skip. """ from __future__ import annotations @@ -309,14 +313,36 @@ def run() -> int: msg = str(e) m = re.search(r"HTTP (\d{3})", msg) http_status = int(m.group(1)) if m else None - if http_status in (403, 404): + # FAIL-CLOSED contract (was fail-open: 403 AND 404 both exit 0 — + # fixed). This is a HARD gate (no continue-on-error → false) on a + # PROTECTED context: pull_request (same-repo; fork PRs can't carry + # DRIFT_BOT_TOKEN) + workflow_dispatch. We split auth-failure from + # genuinely-absent: + # 401/403 → AUTH FAILURE: the token cannot read branch + # protections, so we CANNOT enumerate the required-check set + # and CANNOT verify the no-paths-filter invariant. Fail loud / + # fail closed (exit 4) — do NOT green an unverifiable gate. + # 404 → authenticated absent resource: branch genuinely has no + # protection. Nothing to enumerate; tolerated degradation, + # surfaced loudly (exit 0 with ::warning::). + if http_status in (401, 403): sys.stderr.write( - f"::error::GET {protection_path} returned HTTP {http_status} — " - f"DRIFT_BOT_TOKEN lacks repo-admin scope (Gitea 1.22.6 " - f"requires it for this endpoint) OR branch '{BRANCH}' has " - f"no protection configured. Cannot enumerate required " - f"checks; skipping lint with exit 0 to avoid red-X on " - f"every PR. Fix: grant repo-admin to mc-drift-bot.\n" + f"::error::GET {protection_path} returned HTTP " + f"{http_status} — DRIFT_BOT_TOKEN cannot read branch " + f"protections (needs repo-admin scope). AUTH FAILURE: " + f"cannot enumerate required checks, so this lint FAILS " + f"CLOSED rather than greening a gate it could not verify. " + f"Fix: grant repo-admin to mc-drift-bot (org team " + f"`drift-bot`, perm=admin) — fix the token, not the lint.\n" + ) + return 4 + if http_status == 404: + sys.stderr.write( + f"::warning::GET {protection_path} returned HTTP 404 — " + f"branch '{BRANCH}' has no protection configured " + f"(authenticated absent resource). No required contexts to " + f"check. If '{BRANCH}' SHOULD be protected, this is a real " + f"finding.\n" ) return 0 raise diff --git a/.gitea/scripts/lint_bp_context_emit_match.py b/.gitea/scripts/lint_bp_context_emit_match.py index 7651c90a4..e4aba267a 100644 --- a/.gitea/scripts/lint_bp_context_emit_match.py +++ b/.gitea/scripts/lint_bp_context_emit_match.py @@ -36,7 +36,8 @@ Daily scheduled run + workflow_dispatch: 1. GET `branch_protections/{BRANCH}` (needs DRIFT_BOT_TOKEN with repo-admin scope; same persona as ci-required-drift.yml). - Graceful-degrade on 403/404 per Tier 2a contract. + FAIL CLOSED on 401/403 (auth failure → exit 2); a genuine + authenticated 404 (no protection) is a loud ::warning:: skip. 2. Walk `.gitea/workflows/*.yml` via PyYAML AST. For each workflow, enumerate its emitted contexts: `{workflow.name} / {job.name or @@ -59,10 +60,14 @@ Daily scheduled run + workflow_dispatch: Exit codes ---------- - 0 — clean OR API 403/404 (graceful-degrade, surfaces ::error::). + 0 — clean, OR an authenticated 404 (branch genuinely has no + protection — surfaces ::warning::, not a fail-open). 1 — at least one BP context has no emitter. - 2 — env contract violation, workflows-dir missing, or YAML parse - error. + 2 — env contract violation, workflows-dir missing, YAML parse + error, OR a fail-closed verification failure: 401/403 auth + failure (token can't read BP) or transient/unexpected API + error. This is a HARD gate on a protected context (schedule/ + dispatch on main) — it MUST NOT green when it cannot verify. Env --- @@ -394,28 +399,49 @@ def run() -> int: return 2 # 1. Pull BP. + # + # FAIL-CLOSED contract (was fail-open with exit 0 — fixed). This lint + # is a HARD gate (continue-on-error: false) and only ever runs on a + # PROTECTED context: schedule + workflow_dispatch on `main`. There is + # NO fork/advisory split here — the DRIFT_BOT_TOKEN secret is always + # present and trusted, so an auth failure or transient error is a real + # inability-to-verify, not a legitimate degradation. We MUST fail loud + # (`::error::` + nonzero) rather than green a gate we could not check. status, bp = api("GET", f"/repos/{repo}/branch_protections/{branch}") if status == "forbidden": sys.stderr.write( - f"::error::GET branch_protections/{branch} returned HTTP 403 — " - f"DRIFT_BOT_TOKEN lacks repo-admin scope (Gitea 1.22.6 requires " - f"it for this endpoint). Skipping lint with exit 0 to avoid " - f"red-X on every run. Fix: grant repo-admin to mc-drift-bot. " - f"Per Tier 2a contract.\n" + f"::error::GET branch_protections/{branch} returned HTTP " + f"401/403 — DRIFT_BOT_TOKEN cannot read branch protections " + f"(needs repo-admin scope; Gitea requires it for this " + f"endpoint). This is an AUTH FAILURE, not an absent resource: " + f"the lint CANNOT verify the BP↔emitter invariant, so it FAILS " + f"CLOSED instead of greening a gate it could not check. Fix: " + f"grant repo-admin to mc-drift-bot (org team `drift-bot`, " + f"perm=admin) — fix the token, not the lint.\n" ) - return 0 + return 2 if status == "not_found": + # Genuine 404 WITH a valid token = branch has no protection + # configured. On `main` this is itself suspicious (main should + # always be protected) but it is a real, authenticated read of an + # absent resource — not an auth failure — so we surface it loudly + # but do not hard-fail on the genuinely-absent case. print( - f"::notice::branch '{branch}' has no protection configured; " - f"nothing to lint." + f"::warning::branch '{branch}' has no protection configured " + f"(authenticated 404); nothing to lint. If '{branch}' SHOULD be " + f"protected, this is a real finding — configure branch " + f"protection." ) return 0 if status != "ok" or not isinstance(bp, dict): sys.stderr.write( - f"::error::branch_protections/{branch} response unexpected; " - f"status={status}. Treating as transient; exit 0.\n" + f"::error::branch_protections/{branch} read failed with " + f"status={status} (transient/unexpected). The lint CANNOT " + f"verify the BP↔emitter invariant on this run; FAILING CLOSED " + f"rather than greening unverified. Re-run; if it persists, " + f"investigate Gitea API health / token validity.\n" ) - return 0 + return 2 bp_contexts: list[str] = list(bp.get("status_check_contexts") or []) if not bp_contexts: diff --git a/.gitea/scripts/lint_required_context_exists_in_bp.py b/.gitea/scripts/lint_required_context_exists_in_bp.py index e595f3109..1f096199f 100644 --- a/.gitea/scripts/lint_required_context_exists_in_bp.py +++ b/.gitea/scripts/lint_required_context_exists_in_bp.py @@ -57,10 +57,14 @@ comment unrelated to the new job. Exit codes ---------- 0 — no new emissions, all new emissions have valid directives, - or BP read errored (graceful-degrade per Tier 2a contract). + OR an authenticated 404 (branch genuinely has no protection + to verify against — surfaces ::warning::, not a fail-open). 1 — at least one new emission lacks a directive, or has `bp-required: yes` but the context is missing from BP. - 2 — env contract violation or YAML parse error. + 2 — env contract violation, YAML parse error, OR a fail-closed + verification failure: 401/403 auth failure (token can't read + BP) or transient/unexpected API error. HARD gate on a + same-repo PR context — MUST NOT green when it cannot verify. Env --- @@ -420,33 +424,51 @@ def run() -> int: return 0 # Step 3 — fetch BP context list. + # + # FAIL-CLOSED contract (was fail-open with exit 0 — fixed). This is a + # HARD gate (continue-on-error: false) that runs on `pull_request` + # against `main`. On molecule-core, `pull_request` runs are same-repo + # (fork PRs cannot carry the DRIFT_BOT_TOKEN secret), so this is a + # PROTECTED/trusted context with no legitimate fork-degradation. An + # auth failure or transient error means we CANNOT verify a NEW + # bp-required emission is actually in BP — so we MUST fail loud rather + # than green the gate. (A genuinely-absent 404 read with a valid token + # is the one tolerated degradation: there is no BP to check against.) status, bp = api("GET", f"/repos/{repo}/branch_protections/{branch}") bp_contexts: set[str] = set() if status == "forbidden": sys.stderr.write( - f"::error::GET branch_protections/{branch} returned HTTP 403 — " - f"DRIFT_BOT_TOKEN lacks repo-admin scope. Cannot verify " - f"bp-required directives; skipping lint with exit 0 per " - f"Tier 2a contract. Fix the token, not the lint.\n" + f"::error::GET branch_protections/{branch} returned HTTP " + f"401/403 — DRIFT_BOT_TOKEN cannot read branch protections " + f"(needs repo-admin scope). This is an AUTH FAILURE: the lint " + f"CANNOT verify the bp-required directives on this PR, so it " + f"FAILS CLOSED instead of greening unverified. Fix: grant " + f"repo-admin to mc-drift-bot (org team `drift-bot`) — fix the " + f"token, not the lint.\n" ) - return 0 + return 2 elif status == "not_found": - # Branch has no protection — nothing to verify against; the - # bp-required: yes directive can't be satisfied. Treat as - # graceful-skip rather than red-X. + # Authenticated 404 — branch genuinely has no protection. There is + # nothing to verify a `bp-required: yes` directive against, so this + # is the one tolerated degradation. Surface loudly (on `main` a + # missing protection is itself a real finding) but do not hard-fail. print( - f"::notice::branch '{branch}' has no protection; cannot verify " - f"bp-required directives. Skipping (exit 0)." + f"::warning::branch '{branch}' has no protection (authenticated " + f"404); cannot verify bp-required directives. If '{branch}' " + f"SHOULD be protected this is a real finding." ) return 0 elif status == "ok" and isinstance(bp, dict): bp_contexts = set(bp.get("status_check_contexts") or []) else: sys.stderr.write( - f"::error::branch_protections/{branch} response unexpected; " - f"status={status}. Treating as transient; exit 0.\n" + f"::error::branch_protections/{branch} read failed with " + f"status={status} (transient/unexpected). CANNOT verify " + f"bp-required directives on this PR; FAILING CLOSED rather than " + f"greening unverified. Re-run; if persistent, check Gitea API " + f"health / token validity.\n" ) - return 0 + return 2 # Step 4 — validate each new emission's directive. violations: list[str] = [] diff --git a/.gitea/scripts/sop-tier-refire.sh b/.gitea/scripts/sop-tier-refire.sh index ef0e0473c..dfdbe9f26 100755 --- a/.gitea/scripts/sop-tier-refire.sh +++ b/.gitea/scripts/sop-tier-refire.sh @@ -105,12 +105,26 @@ if [ "${SOP_REFIRE_DISABLE_RATE_LIMIT:-}" != "1" ]; then fi # 3. Invoke sop-tier-check.sh with the env it expects. -# The canonical workflow intentionally fail-opens the job conclusion -# (`bash .gitea/scripts/sop-tier-check.sh || true`) while Gitea branch -# protection enforces reviewer approvals separately. Keep the refire path -# aligned with that workflow status behavior; otherwise /refire-tier-check can -# post a hard failure that the canonical pull_request_target workflow would -# not publish. +# +# FAIL-CLOSED contract (was fail-open — fixed 2026-06-05, +# fix/core-ci-fail-closed). The previous shape was: +# bash "$SCRIPT" || true +# TIER_EXIT=0 # <-- hardcoded success +# which discarded the real verdict and ALWAYS POSTed +# `state=success` for the REQUIRED context +# `sop-tier-check / tier-check (pull_request)`. That meant ANY +# collaborator could comment `/refire-tier-check` to forcibly green +# the SOP-6 approval gate on the PR head SHA — a fail-open AND a +# privilege bypass of branch protection. The canonical +# pull_request_target workflow's conclusion publishes the same +# context honestly (red on a real violation); the refire MUST mirror +# THAT honesty, not a discarded exit code. +# +# We now capture the script's real exit code under `set +e` and POST +# success ONLY when it actually exited 0. sop-tier-check.sh itself +# fails closed on infra faults (no SOP_FAIL_OPEN in this refire env), +# so a bad token / unreachable API / missing jq → non-zero → we POST +# `state=failure`, never a false green. # # SOP_REFIRE_TIER_CHECK_SCRIPT env var lets tests substitute a mock — # sop-tier-check.sh uses bash 4+ associative arrays which trigger a known @@ -125,7 +139,10 @@ if [ ! -f "$SCRIPT" ]; then fi # Re-invoke. Pipe stdout/stderr through so the runner log shows the -# tier-check decision inline. +# tier-check decision inline. Capture the REAL exit code (set +e so a +# non-zero verdict doesn't abort this script under set -e) — the POST +# below keys off it, so a failed tier-check posts state=failure. +set +e GITEA_TOKEN="$GITEA_TOKEN" \ GITEA_HOST="$GITEA_HOST" \ REPO="$REPO" \ @@ -133,8 +150,9 @@ GITEA_TOKEN="$GITEA_TOKEN" \ PR_AUTHOR="$PR_AUTHOR" \ SOP_DEBUG="${SOP_DEBUG:-0}" \ SOP_LEGACY_CHECK="${SOP_LEGACY_CHECK:-0}" \ - bash "$SCRIPT" || true -TIER_EXIT=0 + bash "$SCRIPT" +TIER_EXIT=$? +set -e debug "sop-tier-check.sh exit=$TIER_EXIT" # 4. POST the resulting status. @@ -170,4 +188,12 @@ if [ "$POST_HTTP" != "200" ] && [ "$POST_HTTP" != "201" ]; then fi echo "::notice::sop-tier-refire posted state=$STATE for context=\"$CONTEXT\" on sha=$HEAD_SHA" -exit "$TIER_EXIT" +# Exit 0: the refire JOB succeeded — it re-evaluated the gate and posted +# an HONEST status. The gate VERDICT is carried by the POSTed status +# ($STATE), which is what branch protection reads; a failing tier-check +# posts state=failure (red on the PR), so there is no fail-open. We do +# NOT also exit non-zero on a failing verdict — that would double-signal +# the same failure as both a red status AND a red refire job. The +# fail-open that mattered (TIER_EXIT hardcoded to 0 → always state=success) +# is fixed above by capturing the real exit code. +exit 0 diff --git a/.gitea/scripts/tests/test_sop_tier_refire.sh b/.gitea/scripts/tests/test_sop_tier_refire.sh index 2f2966beb..f85f1d54d 100755 --- a/.gitea/scripts/tests/test_sop_tier_refire.sh +++ b/.gitea/scripts/tests/test_sop_tier_refire.sh @@ -246,21 +246,24 @@ assert_contains "T1 POST context is sop-tier-check / tier-check" \ '"context": "sop-tier-check / tier-check (pull_request)"' "$POSTED" assert_contains "T1 description names commenter" "test-runner" "$POSTED" -# T2: missing tier label → tier-check fails internally, but refire status -# matches the canonical workflow's fail-open job conclusion. +# T2: missing tier label → tier-check fails internally (mock exits 1). +# FAIL-CLOSED contract (fix/core-ci-fail-closed): refire now captures the +# REAL exit code and POSTs state=failure — it does NOT forge a green on +# the required context. The refire job itself still exits 0 (it succeeded +# at posting an honest failure status). run_scenario "T2_no_tier_label" "fail_no_label" RC=$(cat "$FIX_STATE_DIR/last_rc") POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true) -assert_eq "T2 exit code 0 (canonical fail-open)" "0" "$RC" -assert_contains "T2 POSTed state=success" '"state": "success"' "$POSTED" +assert_eq "T2 exit code 0 (posted an honest status)" "0" "$RC" +assert_contains "T2 POSTed state=failure (no forged green)" '"state": "failure"' "$POSTED" -# T3: tier:low present but ZERO approving reviews → internal tier check fails, -# refire status remains aligned with the canonical workflow. +# T3: tier:low present but ZERO approving reviews → internal tier check +# fails (mock exits 1). Refire POSTs state=failure, never a false green. run_scenario "T3_no_approvals" "fail_no_approvals" RC=$(cat "$FIX_STATE_DIR/last_rc") POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true) -assert_eq "T3 exit code 0 (canonical fail-open)" "0" "$RC" -assert_contains "T3 POSTed state=success" '"state": "success"' "$POSTED" +assert_eq "T3 exit code 0 (posted an honest status)" "0" "$RC" +assert_contains "T3 POSTed state=failure (no forged green)" '"state": "failure"' "$POSTED" # T4: closed PR — refire is a no-op (no POST, exit 0) run_scenario "T4_closed" "pass" diff --git a/.gitea/workflows/sop-tier-check.yml b/.gitea/workflows/sop-tier-check.yml index 90363ee16..270ec5784 100644 --- a/.gitea/workflows/sop-tier-check.yml +++ b/.gitea/workflows/sop-tier-check.yml @@ -41,12 +41,16 @@ # - the `|| true` after the sop-tier-check.sh invocation, which masked # real tier-gate verdicts. # AND-composition is now fully enforced and the tier-check step can -# honestly red CI on a real SOP-6 violation. SOP_FAIL_OPEN=1 is RETAINED -# as sanctioned infra-resilience: it fails-open only on token/network/jq -# faults, never on a real gate verdict. If you need to temporarily -# re-introduce a mask, file a tracker and follow the mc#1982 protocol -# (Tier 2e lint requires a current tracker within 2 lines of any -# continue-on-error: true). +# honestly red CI on a real SOP-6 violation. +# +# SOP_FAIL_OPEN REMOVED 2026-06-05 (fix/core-ci-fail-closed): this is a +# REQUIRED branch-protected gate on `pull_request_target` (always +# same-repo, secrets always present — no fork/advisory split). Failing +# open on a token/network/jq fault greened the SOP-6 approval gate +# WITHOUT verifying approvals — a fail-open on a required context. The +# gate now FAILS CLOSED on infra faults too: fix the token/runner, not +# the gate. If you ever need to temporarily re-introduce a mask, file a +# tracker and follow the mc#1982 protocol. name: sop-tier-check @@ -122,9 +126,9 @@ jobs: - name: Verify tier label + reviewer team membership # continue-on-error REMOVED 2026-06-04 (expired internal#189 Phase 1 # burn-in, window closed 2026-05-17; mc#1982 directive: root-fix and - # remove, do not renew). SOP_FAIL_OPEN=1 below still fails-open on - # token/network/infra errors only (never on a real tier-gate verdict), - # so this step can now honestly fail CI on a genuine SOP-6 violation. + # remove, do not renew). SOP_FAIL_OPEN REMOVED 2026-06-05 + # (fix/core-ci-fail-closed): the gate now fails CLOSED on infra + # faults too (see the env block below), not just on a real verdict. env: GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }} GITEA_HOST: git.moleculesai.app @@ -133,13 +137,26 @@ jobs: PR_AUTHOR: ${{ github.event.pull_request.user.login }} SOP_DEBUG: '0' SOP_LEGACY_CHECK: '0' - # SOP_FAIL_OPEN=1 fails-open ONLY on infra faults (empty/invalid - # token, unreachable Gitea API, missing jq) — see the guarded - # `exit 0` branches in sop-tier-check.sh. It does NOT mask a real - # tier-gate verdict: a missing tier label, no approving review, or - # an unsatisfied AND-clause still `exit 1`. Kept as sanctioned - # infra-resilience; the `|| true` mask was REMOVED with the burn-in - # COE (2026-06-04) so a genuine SOP-6 violation now reds CI. - SOP_FAIL_OPEN: '1' + # SOP_FAIL_OPEN REMOVED 2026-06-05 (fix/core-ci-fail-closed). + # + # This is the REQUIRED branch-protected gate + # `sop-tier-check / tier-check (pull_request)`. It runs on + # `pull_request_target`, which ALWAYS executes from the base + # branch WITH secrets present — there is NO fork/advisory split + # and no legitimate "secrets genuinely absent" degradation here. + # + # SOP_FAIL_OPEN=1 made the script `exit 0` on an empty/invalid + # token, an unreachable Gitea API, or missing jq — i.e. an AUTH + # FAILURE or unreachable-dependency would green the SOP-6 + # approval gate WITHOUT verifying that the required teams + # actually approved. That is a fail-open on a required gate: a + # mis-wired or under-scoped SOP_TIER_CHECK_TOKEN would let any PR + # merge past the approval requirement. + # + # Removing the env unsets it → `${SOP_FAIL_OPEN:-}` is empty in + # sop-tier-check.sh → every guarded `exit 0` branch instead falls + # through to `exit 1`. Infra faults (bad token / API down / no + # jq) now FAIL CLOSED with a loud `::error::`, exactly like a real + # SOP-6 violation. Fix the token/runner, not the gate. run: | bash .gitea/scripts/sop-tier-check.sh diff --git a/tests/test_ci_required_drift.py b/tests/test_ci_required_drift.py index af4e139df..d9fcf889b 100644 --- a/tests/test_ci_required_drift.py +++ b/tests/test_ci_required_drift.py @@ -203,6 +203,60 @@ def test_f1_job_missing_from_sentinel_needs(drift_module, tmp_path, monkeypatch) assert any("F1 —" in f and "test" in f for f in findings), findings +def test_detect_drift_403_fails_closed(drift_module, tmp_path, monkeypatch): + """AUTH FAILURE on branch_protections (HTTP 401/403) → RAISE (fail + closed). The token can't read BP, so drift is UNVERIFIABLE; greening + the hourly cron here would let jobs↔protection drift go silently + undetected — exactly the regression class this sentinel exists to + catch. fix/core-ci-fail-closed. + """ + ci = _write_ci_yaml( + tmp_path, + jobs={"build": {"runs-on": "ubuntu-latest"}}, + sentinel_needs=["build"], + ) + audit = _write_audit_yaml(tmp_path, ["ci / build (pull_request)"]) + _patch_paths(drift_module, monkeypatch, ci, audit) + + stub = _make_stub_api({ + ("GET", "/repos/owner/repo/branch_protections/main"): ( + drift_module.ApiError( + "GET /repos/owner/repo/branch_protections/main → HTTP 403: forbidden" + ) + ), + }) + monkeypatch.setattr(drift_module, "api", stub) + with pytest.raises(drift_module.ApiError): + drift_module.detect_drift("main") + + +def test_detect_drift_404_skips_branch(drift_module, tmp_path, monkeypatch): + """Authenticated 404 (branch genuinely has no protection, e.g. staging + pre-rollout) → tolerated skip: return ([], debug) with + protection_contexts_skipped True. NOT a fail-open (real read of an + absent resource with a valid token).""" + ci = _write_ci_yaml( + tmp_path, + jobs={"build": {"runs-on": "ubuntu-latest"}}, + sentinel_needs=["build"], + ) + audit = _write_audit_yaml(tmp_path, ["ci / build (pull_request)"]) + _patch_paths(drift_module, monkeypatch, ci, audit) + + stub = _make_stub_api({ + ("GET", "/repos/owner/repo/branch_protections/staging"): ( + drift_module.ApiError( + "GET /repos/owner/repo/branch_protections/staging → HTTP 404: not found" + ) + ), + }) + monkeypatch.setattr(drift_module, "api", stub) + findings, debug = drift_module.detect_drift("staging") + assert findings == [] + assert debug.get("protection_contexts_skipped") is True + assert debug.get("protection_http_status") == 404 + + def test_f1b_sentinel_needs_typo(drift_module, tmp_path, monkeypatch): """F1b: sentinel.needs lists a job not present in ci.yml (typo). diff --git a/tests/test_lint_bp_context_emit_match.py b/tests/test_lint_bp_context_emit_match.py index b47e2f45b..6fd97f772 100644 --- a/tests/test_lint_bp_context_emit_match.py +++ b/tests/test_lint_bp_context_emit_match.py @@ -34,9 +34,12 @@ Test classes (per `feedback_branch_count_before_approving`): together, not short-circuited. - test_bp_empty_lints_nothing — BP has no contexts. Exit 0 cleanly. - - test_api_403_skips_gracefully — branch_protections endpoint - 403s (token-scope). Exit 0 with ::error::, do NOT red-X. - - test_api_404_skips_gracefully — branch has no protection. + - test_api_403_fails_closed — branch_protections endpoint + 401/403s (auth failure). FAIL CLOSED (exit 2) with ::error::. + - test_api_transient_fails_closed — transient/unexpected API + error. FAIL CLOSED (exit 2). + - test_api_404_skips_gracefully — branch has no protection + (authenticated absent resource). Tolerated skip (exit 0 + warning). Exit 0 cleanly. - test_context_event_match_required — BP context says `(push)` and workflow only emits on `pull_request`. That's NOT a match — the @@ -247,9 +250,10 @@ def test_bp_empty_lints_nothing(envset, monkeypatch, capsys): # --------------------------------------------------------------------------- -# API 403 — graceful-degrade. +# API 403 — AUTH FAILURE → FAIL CLOSED (exit 2). This is a HARD gate on a +# protected context; a token that can't read BP must NOT green the lint. # --------------------------------------------------------------------------- -def test_api_403_skips_gracefully(envset, monkeypatch, capsys): +def test_api_403_fails_closed(envset, monkeypatch, capsys): _write_wf( envset, "ci.yml", @@ -259,13 +263,30 @@ def test_api_403_skips_gracefully(envset, monkeypatch, capsys): m = _import_lint() _stub_api(monkeypatch, m, ("forbidden", None)) rc = m.run() - assert rc == 0 + assert rc == 2 err = capsys.readouterr().err assert "403" in err or "scope" in err.lower() or "token" in err.lower() # --------------------------------------------------------------------------- -# API 404 — branch has no protection → clean exit. +# API transient/unexpected error → FAIL CLOSED (exit 2). +# --------------------------------------------------------------------------- +def test_api_transient_fails_closed(envset, monkeypatch, capsys): + _write_wf( + envset, + "ci.yml", + "name: CI\non:\n pull_request:\n branches: [main]\njobs:\n" + " j:\n runs-on: x\n steps:\n - run: echo hi\n", + ) + m = _import_lint() + _stub_api(monkeypatch, m, ("error", None)) + rc = m.run() + assert rc == 2 + + +# --------------------------------------------------------------------------- +# API 404 — authenticated absent resource (branch has no protection) → +# tolerated graceful skip (exit 0 with ::warning::), NOT a fail-open. # --------------------------------------------------------------------------- def test_api_404_skips_gracefully(envset, monkeypatch, capsys): _write_wf( diff --git a/tests/test_lint_required_context_exists_in_bp.py b/tests/test_lint_required_context_exists_in_bp.py index 4bbefdf92..e87c8e2d3 100644 --- a/tests/test_lint_required_context_exists_in_bp.py +++ b/tests/test_lint_required_context_exists_in_bp.py @@ -47,7 +47,10 @@ Test classes (per `feedback_branch_count_before_approving`): (the OLD context name disappears; the NEW one needs validation). - test_unrelated_workflow_edit_is_not_new — edit a comment in an existing emitter; no new context introduced; pass. - - test_api_403_skips_gracefully — BP read 403; exit 0 + - test_api_403_fails_closed — BP read 401/403 auth + failure → FAIL CLOSED (exit 2) + - test_api_transient_fails_closed — transient → exit 2 + - test_api_404_skips_gracefully — authenticated 404 → exit 0 with stderr ::error::. - test_directive_must_be_in_workflow_yml — directive in PR body alone is NOT sufficient; the comment must live in the @@ -392,9 +395,10 @@ def test_unrelated_workflow_edit_is_not_new(env, monkeypatch, capsys): # --------------------------------------------------------------------------- -# BP API 403 → exit 0 with ::error::. +# BP API 401/403 = AUTH FAILURE → FAIL CLOSED (exit 2). A new emission can't +# be verified against BP if the token can't read BP — must not green. # --------------------------------------------------------------------------- -def test_api_403_skips_gracefully(env, monkeypatch, capsys): +def test_api_403_fails_closed(env, monkeypatch, capsys): m = _import_lint() _stub_git_and_api( monkeypatch, @@ -404,11 +408,44 @@ def test_api_403_skips_gracefully(env, monkeypatch, capsys): bp_response=("forbidden", None), ) rc = m.run() - assert rc == 0 + assert rc == 2 err = capsys.readouterr().err assert "403" in err or "scope" in err.lower() or "token" in err.lower() +# --------------------------------------------------------------------------- +# BP API transient/unexpected error → FAIL CLOSED (exit 2). +# --------------------------------------------------------------------------- +def test_api_transient_fails_closed(env, monkeypatch, capsys): + m = _import_lint() + _stub_git_and_api( + monkeypatch, + m, + base_files={".gitea/workflows/ci.yml": WF_CI_BASE}, + head_files={".gitea/workflows/ci.yml": WF_CI_NEW_JOB}, + bp_response=("error", None), + ) + rc = m.run() + assert rc == 2 + + +# --------------------------------------------------------------------------- +# BP API authenticated 404 (branch genuinely unprotected) → tolerated +# graceful skip (exit 0 with ::warning::), NOT a fail-open. +# --------------------------------------------------------------------------- +def test_api_404_skips_gracefully(env, monkeypatch, capsys): + m = _import_lint() + _stub_git_and_api( + monkeypatch, + m, + base_files={".gitea/workflows/ci.yml": WF_CI_BASE}, + head_files={".gitea/workflows/ci.yml": WF_CI_NEW_JOB}, + bp_response=("not_found", None), + ) + rc = m.run() + assert rc == 0 + + # --------------------------------------------------------------------------- # Directive must be in the workflow YML, not PR body. # --------------------------------------------------------------------------- diff --git a/tests/test_lint_required_no_paths.py b/tests/test_lint_required_no_paths.py index 8471a49ae..218f35022 100644 --- a/tests/test_lint_required_no_paths.py +++ b/tests/test_lint_required_no_paths.py @@ -527,15 +527,13 @@ def test_multi_required_one_bad_one_good_fails( assert "good.yml" not in ln -def test_protection_403_treated_as_skip(lint_module, monkeypatch, capsys): - """If the token can't read branch_protections (HTTP 403), exit 0 - with a clear ::error::-but-non-fatal note. Same scope-fallback shape - as ci-required-drift.py per the precedent. - - Rationale: if the lint workflow itself can't read protection, the PR - can't make THIS state worse (a paths-filter PR was already addable - without the lint). Better to surface a token-scope problem loudly - than to red-X every PR until the token is fixed. +def test_protection_403_fails_closed(lint_module, monkeypatch, capsys): + """AUTH FAILURE → FAIL CLOSED (exit 4). If the token can't read + branch_protections (HTTP 401/403), the lint CANNOT enumerate the + required-check set and therefore CANNOT verify the no-paths-filter + invariant. This is a HARD gate on a protected (same-repo PR) context, + so it MUST fail loud rather than green an unverifiable gate — fix the + token, not the lint. """ stub = _make_stub_api({ ("GET", "/repos/owner/repo/branch_protections/main"): ( @@ -546,7 +544,26 @@ def test_protection_403_treated_as_skip(lint_module, monkeypatch, capsys): }) monkeypatch.setattr(lint_module, "api", stub) rc = lint_module.run() - assert rc == 0 + assert rc == 4 err = capsys.readouterr().err assert "::error::" in err assert "403" in err + + +def test_protection_404_skips_gracefully(lint_module, monkeypatch, capsys): + """Authenticated 404 (branch genuinely has no protection) is the one + tolerated degradation: there are no required contexts to check. + Exit 0 with a ::warning:: — NOT a fail-open (this is a real read of an + absent resource with a valid token, not an auth failure).""" + stub = _make_stub_api({ + ("GET", "/repos/owner/repo/branch_protections/main"): ( + lint_module.ApiError( + "GET /repos/owner/repo/branch_protections/main → HTTP 404: not found" + ) + ), + }) + monkeypatch.setattr(lint_module, "api", stub) + rc = lint_module.run() + assert rc == 0 + err = capsys.readouterr().err + assert "404" in err -- 2.52.0