fix(ci): fail-closed CI integrity sweep — no fail-open gates #2323
@@ -361,15 +361,17 @@ def detect_drift(branch: str) -> tuple[list[str], dict]:
|
||||
"""Returns (findings, debug). Empty findings == no drift.
|
||||
|
||||
Raises:
|
||||
ApiError: propagated from the protection fetch only when the
|
||||
failure is likely a transient Gitea outage (5xx).
|
||||
403/404 from the protection endpoint is treated as
|
||||
"cannot determine drift for this branch" — a token-
|
||||
scope issue (missing repo-admin on DRIFT_BOT_TOKEN) or
|
||||
a repo with no protection set should not turn the
|
||||
hourly cron red. The workflow continues to the next
|
||||
branch; no [ci-drift] issue is filed for a branch
|
||||
whose protection cannot be read.
|
||||
ApiError: propagated (fail-closed) on a transient Gitea outage
|
||||
(5xx) AND on a 401/403 auth failure from the protection
|
||||
endpoint. A 401/403 means DRIFT_BOT_TOKEN cannot read
|
||||
branch protections at all — drift is UNVERIFIABLE, so
|
||||
this HARD gate must fail loud rather than green
|
||||
undetected drift (the regression class it exists to
|
||||
catch). An authenticated 404 (branch genuinely has no
|
||||
protection, e.g. staging pre-rollout) is the one
|
||||
tolerated skip: it returns ([], debug) with a loud
|
||||
::warning:: and the workflow continues to the next
|
||||
branch.
|
||||
"""
|
||||
findings: list[str] = []
|
||||
|
||||
@@ -403,17 +405,38 @@ def detect_drift(branch: str) -> tuple[list[str], dict]:
|
||||
m = _re.search(r"HTTP (\d{3})", msg)
|
||||
if m:
|
||||
http_status = int(m.group(1))
|
||||
if http_status in (403, 404):
|
||||
# Token lacks scope OR branch has no protection. Cannot
|
||||
# determine drift — skip this branch. Do NOT exit non-zero;
|
||||
# the issue IS the alarm, not a red workflow.
|
||||
# FAIL-CLOSED contract (was fail-open: 403 AND 404 both returned
|
||||
# [] with no signal — fixed). This is a HARD gate (no
|
||||
# continue-on-error → false) running hourly on a PROTECTED context
|
||||
# (schedule/dispatch on main). We split auth-failure from
|
||||
# genuinely-absent:
|
||||
# 401/403 → AUTH FAILURE: the token cannot read branch
|
||||
# protections at all, so drift CANNOT be determined for ANY
|
||||
# branch. Greening the hourly cron here means jobs↔protection
|
||||
# drift goes silently undetected — exactly the regression class
|
||||
# this sentinel exists to catch. Raise so the workflow fails
|
||||
# loud / fails closed.
|
||||
# 404 → authenticated absent resource: this specific branch has
|
||||
# no protection (e.g. `staging` before its protection rollout).
|
||||
# Genuinely nothing to diff against — skip THIS branch with a
|
||||
# loud ::warning::, continue to the next.
|
||||
if http_status in (401, 403):
|
||||
sys.stderr.write(
|
||||
f"::error::GET {protection_path} returned HTTP {http_status} — "
|
||||
f"DRIFT_BOT_TOKEN lacks repo-admin scope (Gitea 1.22.6 "
|
||||
f"requires it for this endpoint) OR branch has no protection "
|
||||
f"configured. Cannot determine drift for {branch}; "
|
||||
f"skipping. Fix: grant repo-admin to mc-drift-bot or "
|
||||
f"configure protection on {branch}.\n"
|
||||
f"::error::GET {protection_path} returned HTTP "
|
||||
f"{http_status} — DRIFT_BOT_TOKEN cannot read branch "
|
||||
f"protections (needs repo-admin scope). AUTH FAILURE: "
|
||||
f"drift CANNOT be determined, so this HARD gate FAILS "
|
||||
f"CLOSED rather than greening undetected drift. Fix: grant "
|
||||
f"repo-admin to mc-drift-bot (org team `drift-bot`, "
|
||||
f"perm=admin) — fix the token, not the lint.\n"
|
||||
)
|
||||
raise
|
||||
if http_status == 404:
|
||||
sys.stderr.write(
|
||||
f"::warning::GET {protection_path} returned HTTP 404 — "
|
||||
f"branch '{branch}' has no protection configured "
|
||||
f"(authenticated absent resource). Skipping drift check for "
|
||||
f"{branch}; if it SHOULD be protected, configure it.\n"
|
||||
)
|
||||
debug = {
|
||||
"branch": branch,
|
||||
@@ -424,7 +447,7 @@ def detect_drift(branch: str) -> tuple[list[str], dict]:
|
||||
"audit_env_checks": sorted(env_set),
|
||||
}
|
||||
return [], debug
|
||||
# 5xx — propagate (transient outage, fail loud per design).
|
||||
# 5xx / other — propagate (transient outage, fail loud per design).
|
||||
raise
|
||||
if not isinstance(protection, dict):
|
||||
sys.stderr.write(
|
||||
|
||||
@@ -40,20 +40,24 @@ Context-format note (Gitea 1.22.6):
|
||||
|
||||
Exit codes:
|
||||
0 — no required workflow has a paths/paths-ignore filter (clean) OR
|
||||
branch_protections endpoint returned 403/404 (token-scope issue;
|
||||
surfaced via ::error:: but non-fatal so a missing scope doesn't
|
||||
red-X every PR — fix the token, not the lint).
|
||||
branch_protections returned an authenticated 404 (branch
|
||||
genuinely has no protection; ::warning:: surfaced).
|
||||
1 — at least one required workflow has a paths/paths-ignore filter
|
||||
(the gate-degrading defect class).
|
||||
2 — env contract violation (missing GITEA_TOKEN/HOST/REPO/BRANCH).
|
||||
3 — workflows directory missing or workflow YAML unparseable.
|
||||
4 — protection response shape unexpected (non-dict body on 2xx).
|
||||
4 — FAIL-CLOSED verification failure: branch_protections 401/403
|
||||
auth failure (token can't read BP), 5xx transient (propagated
|
||||
ApiError), or unexpected response shape. This is a HARD gate on
|
||||
a protected context — it MUST NOT green when it cannot verify.
|
||||
|
||||
Auth note: `GET /repos/.../branch_protections/{branch}` requires
|
||||
repo-admin role in Gitea 1.22.6. The workflow-default `GITHUB_TOKEN`
|
||||
is non-admin; we re-use `DRIFT_BOT_TOKEN` (same persona that powers
|
||||
ci-required-drift.yml). If `DRIFT_BOT_TOKEN` is unavailable in a future
|
||||
context, the script falls through gracefully (exit 0 + ::error::).
|
||||
ci-required-drift.yml). A 401/403 from a missing-scope token is an
|
||||
AUTH FAILURE that FAILS CLOSED (exit 4) — fix the token, not the
|
||||
lint. Only an authenticated 404 (genuinely-absent protection) is a
|
||||
tolerated graceful skip.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -309,14 +313,36 @@ def run() -> int:
|
||||
msg = str(e)
|
||||
m = re.search(r"HTTP (\d{3})", msg)
|
||||
http_status = int(m.group(1)) if m else None
|
||||
if http_status in (403, 404):
|
||||
# FAIL-CLOSED contract (was fail-open: 403 AND 404 both exit 0 —
|
||||
# fixed). This is a HARD gate (no continue-on-error → false) on a
|
||||
# PROTECTED context: pull_request (same-repo; fork PRs can't carry
|
||||
# DRIFT_BOT_TOKEN) + workflow_dispatch. We split auth-failure from
|
||||
# genuinely-absent:
|
||||
# 401/403 → AUTH FAILURE: the token cannot read branch
|
||||
# protections, so we CANNOT enumerate the required-check set
|
||||
# and CANNOT verify the no-paths-filter invariant. Fail loud /
|
||||
# fail closed (exit 4) — do NOT green an unverifiable gate.
|
||||
# 404 → authenticated absent resource: branch genuinely has no
|
||||
# protection. Nothing to enumerate; tolerated degradation,
|
||||
# surfaced loudly (exit 0 with ::warning::).
|
||||
if http_status in (401, 403):
|
||||
sys.stderr.write(
|
||||
f"::error::GET {protection_path} returned HTTP {http_status} — "
|
||||
f"DRIFT_BOT_TOKEN lacks repo-admin scope (Gitea 1.22.6 "
|
||||
f"requires it for this endpoint) OR branch '{BRANCH}' has "
|
||||
f"no protection configured. Cannot enumerate required "
|
||||
f"checks; skipping lint with exit 0 to avoid red-X on "
|
||||
f"every PR. Fix: grant repo-admin to mc-drift-bot.\n"
|
||||
f"::error::GET {protection_path} returned HTTP "
|
||||
f"{http_status} — DRIFT_BOT_TOKEN cannot read branch "
|
||||
f"protections (needs repo-admin scope). AUTH FAILURE: "
|
||||
f"cannot enumerate required checks, so this lint FAILS "
|
||||
f"CLOSED rather than greening a gate it could not verify. "
|
||||
f"Fix: grant repo-admin to mc-drift-bot (org team "
|
||||
f"`drift-bot`, perm=admin) — fix the token, not the lint.\n"
|
||||
)
|
||||
return 4
|
||||
if http_status == 404:
|
||||
sys.stderr.write(
|
||||
f"::warning::GET {protection_path} returned HTTP 404 — "
|
||||
f"branch '{BRANCH}' has no protection configured "
|
||||
f"(authenticated absent resource). No required contexts to "
|
||||
f"check. If '{BRANCH}' SHOULD be protected, this is a real "
|
||||
f"finding.\n"
|
||||
)
|
||||
return 0
|
||||
raise
|
||||
|
||||
@@ -36,7 +36,8 @@ Daily scheduled run + workflow_dispatch:
|
||||
|
||||
1. GET `branch_protections/{BRANCH}` (needs DRIFT_BOT_TOKEN with
|
||||
repo-admin scope; same persona as ci-required-drift.yml).
|
||||
Graceful-degrade on 403/404 per Tier 2a contract.
|
||||
FAIL CLOSED on 401/403 (auth failure → exit 2); a genuine
|
||||
authenticated 404 (no protection) is a loud ::warning:: skip.
|
||||
|
||||
2. Walk `.gitea/workflows/*.yml` via PyYAML AST. For each workflow,
|
||||
enumerate its emitted contexts: `{workflow.name} / {job.name or
|
||||
@@ -59,10 +60,14 @@ Daily scheduled run + workflow_dispatch:
|
||||
|
||||
Exit codes
|
||||
----------
|
||||
0 — clean OR API 403/404 (graceful-degrade, surfaces ::error::).
|
||||
0 — clean, OR an authenticated 404 (branch genuinely has no
|
||||
protection — surfaces ::warning::, not a fail-open).
|
||||
1 — at least one BP context has no emitter.
|
||||
2 — env contract violation, workflows-dir missing, or YAML parse
|
||||
error.
|
||||
2 — env contract violation, workflows-dir missing, YAML parse
|
||||
error, OR a fail-closed verification failure: 401/403 auth
|
||||
failure (token can't read BP) or transient/unexpected API
|
||||
error. This is a HARD gate on a protected context (schedule/
|
||||
dispatch on main) — it MUST NOT green when it cannot verify.
|
||||
|
||||
Env
|
||||
---
|
||||
@@ -394,28 +399,49 @@ def run() -> int:
|
||||
return 2
|
||||
|
||||
# 1. Pull BP.
|
||||
#
|
||||
# FAIL-CLOSED contract (was fail-open with exit 0 — fixed). This lint
|
||||
# is a HARD gate (continue-on-error: false) and only ever runs on a
|
||||
# PROTECTED context: schedule + workflow_dispatch on `main`. There is
|
||||
# NO fork/advisory split here — the DRIFT_BOT_TOKEN secret is always
|
||||
# present and trusted, so an auth failure or transient error is a real
|
||||
# inability-to-verify, not a legitimate degradation. We MUST fail loud
|
||||
# (`::error::` + nonzero) rather than green a gate we could not check.
|
||||
status, bp = api("GET", f"/repos/{repo}/branch_protections/{branch}")
|
||||
if status == "forbidden":
|
||||
sys.stderr.write(
|
||||
f"::error::GET branch_protections/{branch} returned HTTP 403 — "
|
||||
f"DRIFT_BOT_TOKEN lacks repo-admin scope (Gitea 1.22.6 requires "
|
||||
f"it for this endpoint). Skipping lint with exit 0 to avoid "
|
||||
f"red-X on every run. Fix: grant repo-admin to mc-drift-bot. "
|
||||
f"Per Tier 2a contract.\n"
|
||||
f"::error::GET branch_protections/{branch} returned HTTP "
|
||||
f"401/403 — DRIFT_BOT_TOKEN cannot read branch protections "
|
||||
f"(needs repo-admin scope; Gitea requires it for this "
|
||||
f"endpoint). This is an AUTH FAILURE, not an absent resource: "
|
||||
f"the lint CANNOT verify the BP↔emitter invariant, so it FAILS "
|
||||
f"CLOSED instead of greening a gate it could not check. Fix: "
|
||||
f"grant repo-admin to mc-drift-bot (org team `drift-bot`, "
|
||||
f"perm=admin) — fix the token, not the lint.\n"
|
||||
)
|
||||
return 0
|
||||
return 2
|
||||
if status == "not_found":
|
||||
# Genuine 404 WITH a valid token = branch has no protection
|
||||
# configured. On `main` this is itself suspicious (main should
|
||||
# always be protected) but it is a real, authenticated read of an
|
||||
# absent resource — not an auth failure — so we surface it loudly
|
||||
# but do not hard-fail on the genuinely-absent case.
|
||||
print(
|
||||
f"::notice::branch '{branch}' has no protection configured; "
|
||||
f"nothing to lint."
|
||||
f"::warning::branch '{branch}' has no protection configured "
|
||||
f"(authenticated 404); nothing to lint. If '{branch}' SHOULD be "
|
||||
f"protected, this is a real finding — configure branch "
|
||||
f"protection."
|
||||
)
|
||||
return 0
|
||||
if status != "ok" or not isinstance(bp, dict):
|
||||
sys.stderr.write(
|
||||
f"::error::branch_protections/{branch} response unexpected; "
|
||||
f"status={status}. Treating as transient; exit 0.\n"
|
||||
f"::error::branch_protections/{branch} read failed with "
|
||||
f"status={status} (transient/unexpected). The lint CANNOT "
|
||||
f"verify the BP↔emitter invariant on this run; FAILING CLOSED "
|
||||
f"rather than greening unverified. Re-run; if it persists, "
|
||||
f"investigate Gitea API health / token validity.\n"
|
||||
)
|
||||
return 0
|
||||
return 2
|
||||
|
||||
bp_contexts: list[str] = list(bp.get("status_check_contexts") or [])
|
||||
if not bp_contexts:
|
||||
|
||||
@@ -57,10 +57,14 @@ comment unrelated to the new job.
|
||||
Exit codes
|
||||
----------
|
||||
0 — no new emissions, all new emissions have valid directives,
|
||||
or BP read errored (graceful-degrade per Tier 2a contract).
|
||||
OR an authenticated 404 (branch genuinely has no protection
|
||||
to verify against — surfaces ::warning::, not a fail-open).
|
||||
1 — at least one new emission lacks a directive, or has
|
||||
`bp-required: yes` but the context is missing from BP.
|
||||
2 — env contract violation or YAML parse error.
|
||||
2 — env contract violation, YAML parse error, OR a fail-closed
|
||||
verification failure: 401/403 auth failure (token can't read
|
||||
BP) or transient/unexpected API error. HARD gate on a
|
||||
same-repo PR context — MUST NOT green when it cannot verify.
|
||||
|
||||
Env
|
||||
---
|
||||
@@ -420,33 +424,51 @@ def run() -> int:
|
||||
return 0
|
||||
|
||||
# Step 3 — fetch BP context list.
|
||||
#
|
||||
# FAIL-CLOSED contract (was fail-open with exit 0 — fixed). This is a
|
||||
# HARD gate (continue-on-error: false) that runs on `pull_request`
|
||||
# against `main`. On molecule-core, `pull_request` runs are same-repo
|
||||
# (fork PRs cannot carry the DRIFT_BOT_TOKEN secret), so this is a
|
||||
# PROTECTED/trusted context with no legitimate fork-degradation. An
|
||||
# auth failure or transient error means we CANNOT verify a NEW
|
||||
# bp-required emission is actually in BP — so we MUST fail loud rather
|
||||
# than green the gate. (A genuinely-absent 404 read with a valid token
|
||||
# is the one tolerated degradation: there is no BP to check against.)
|
||||
status, bp = api("GET", f"/repos/{repo}/branch_protections/{branch}")
|
||||
bp_contexts: set[str] = set()
|
||||
if status == "forbidden":
|
||||
sys.stderr.write(
|
||||
f"::error::GET branch_protections/{branch} returned HTTP 403 — "
|
||||
f"DRIFT_BOT_TOKEN lacks repo-admin scope. Cannot verify "
|
||||
f"bp-required directives; skipping lint with exit 0 per "
|
||||
f"Tier 2a contract. Fix the token, not the lint.\n"
|
||||
f"::error::GET branch_protections/{branch} returned HTTP "
|
||||
f"401/403 — DRIFT_BOT_TOKEN cannot read branch protections "
|
||||
f"(needs repo-admin scope). This is an AUTH FAILURE: the lint "
|
||||
f"CANNOT verify the bp-required directives on this PR, so it "
|
||||
f"FAILS CLOSED instead of greening unverified. Fix: grant "
|
||||
f"repo-admin to mc-drift-bot (org team `drift-bot`) — fix the "
|
||||
f"token, not the lint.\n"
|
||||
)
|
||||
return 0
|
||||
return 2
|
||||
elif status == "not_found":
|
||||
# Branch has no protection — nothing to verify against; the
|
||||
# bp-required: yes directive can't be satisfied. Treat as
|
||||
# graceful-skip rather than red-X.
|
||||
# Authenticated 404 — branch genuinely has no protection. There is
|
||||
# nothing to verify a `bp-required: yes` directive against, so this
|
||||
# is the one tolerated degradation. Surface loudly (on `main` a
|
||||
# missing protection is itself a real finding) but do not hard-fail.
|
||||
print(
|
||||
f"::notice::branch '{branch}' has no protection; cannot verify "
|
||||
f"bp-required directives. Skipping (exit 0)."
|
||||
f"::warning::branch '{branch}' has no protection (authenticated "
|
||||
f"404); cannot verify bp-required directives. If '{branch}' "
|
||||
f"SHOULD be protected this is a real finding."
|
||||
)
|
||||
return 0
|
||||
elif status == "ok" and isinstance(bp, dict):
|
||||
bp_contexts = set(bp.get("status_check_contexts") or [])
|
||||
else:
|
||||
sys.stderr.write(
|
||||
f"::error::branch_protections/{branch} response unexpected; "
|
||||
f"status={status}. Treating as transient; exit 0.\n"
|
||||
f"::error::branch_protections/{branch} read failed with "
|
||||
f"status={status} (transient/unexpected). CANNOT verify "
|
||||
f"bp-required directives on this PR; FAILING CLOSED rather than "
|
||||
f"greening unverified. Re-run; if persistent, check Gitea API "
|
||||
f"health / token validity.\n"
|
||||
)
|
||||
return 0
|
||||
return 2
|
||||
|
||||
# Step 4 — validate each new emission's directive.
|
||||
violations: list[str] = []
|
||||
|
||||
@@ -105,12 +105,26 @@ if [ "${SOP_REFIRE_DISABLE_RATE_LIMIT:-}" != "1" ]; then
|
||||
fi
|
||||
|
||||
# 3. Invoke sop-tier-check.sh with the env it expects.
|
||||
# The canonical workflow intentionally fail-opens the job conclusion
|
||||
# (`bash .gitea/scripts/sop-tier-check.sh || true`) while Gitea branch
|
||||
# protection enforces reviewer approvals separately. Keep the refire path
|
||||
# aligned with that workflow status behavior; otherwise /refire-tier-check can
|
||||
# post a hard failure that the canonical pull_request_target workflow would
|
||||
# not publish.
|
||||
#
|
||||
# FAIL-CLOSED contract (was fail-open — fixed 2026-06-05,
|
||||
# fix/core-ci-fail-closed). The previous shape was:
|
||||
# bash "$SCRIPT" || true
|
||||
# TIER_EXIT=0 # <-- hardcoded success
|
||||
# which discarded the real verdict and ALWAYS POSTed
|
||||
# `state=success` for the REQUIRED context
|
||||
# `sop-tier-check / tier-check (pull_request)`. That meant ANY
|
||||
# collaborator could comment `/refire-tier-check` to forcibly green
|
||||
# the SOP-6 approval gate on the PR head SHA — a fail-open AND a
|
||||
# privilege bypass of branch protection. The canonical
|
||||
# pull_request_target workflow's conclusion publishes the same
|
||||
# context honestly (red on a real violation); the refire MUST mirror
|
||||
# THAT honesty, not a discarded exit code.
|
||||
#
|
||||
# We now capture the script's real exit code under `set +e` and POST
|
||||
# success ONLY when it actually exited 0. sop-tier-check.sh itself
|
||||
# fails closed on infra faults (no SOP_FAIL_OPEN in this refire env),
|
||||
# so a bad token / unreachable API / missing jq → non-zero → we POST
|
||||
# `state=failure`, never a false green.
|
||||
#
|
||||
# SOP_REFIRE_TIER_CHECK_SCRIPT env var lets tests substitute a mock —
|
||||
# sop-tier-check.sh uses bash 4+ associative arrays which trigger a known
|
||||
@@ -125,7 +139,10 @@ if [ ! -f "$SCRIPT" ]; then
|
||||
fi
|
||||
|
||||
# Re-invoke. Pipe stdout/stderr through so the runner log shows the
|
||||
# tier-check decision inline.
|
||||
# tier-check decision inline. Capture the REAL exit code (set +e so a
|
||||
# non-zero verdict doesn't abort this script under set -e) — the POST
|
||||
# below keys off it, so a failed tier-check posts state=failure.
|
||||
set +e
|
||||
GITEA_TOKEN="$GITEA_TOKEN" \
|
||||
GITEA_HOST="$GITEA_HOST" \
|
||||
REPO="$REPO" \
|
||||
@@ -133,8 +150,9 @@ GITEA_TOKEN="$GITEA_TOKEN" \
|
||||
PR_AUTHOR="$PR_AUTHOR" \
|
||||
SOP_DEBUG="${SOP_DEBUG:-0}" \
|
||||
SOP_LEGACY_CHECK="${SOP_LEGACY_CHECK:-0}" \
|
||||
bash "$SCRIPT" || true
|
||||
TIER_EXIT=0
|
||||
bash "$SCRIPT"
|
||||
TIER_EXIT=$?
|
||||
set -e
|
||||
debug "sop-tier-check.sh exit=$TIER_EXIT"
|
||||
|
||||
# 4. POST the resulting status.
|
||||
@@ -170,4 +188,12 @@ if [ "$POST_HTTP" != "200" ] && [ "$POST_HTTP" != "201" ]; then
|
||||
fi
|
||||
|
||||
echo "::notice::sop-tier-refire posted state=$STATE for context=\"$CONTEXT\" on sha=$HEAD_SHA"
|
||||
exit "$TIER_EXIT"
|
||||
# Exit 0: the refire JOB succeeded — it re-evaluated the gate and posted
|
||||
# an HONEST status. The gate VERDICT is carried by the POSTed status
|
||||
# ($STATE), which is what branch protection reads; a failing tier-check
|
||||
# posts state=failure (red on the PR), so there is no fail-open. We do
|
||||
# NOT also exit non-zero on a failing verdict — that would double-signal
|
||||
# the same failure as both a red status AND a red refire job. The
|
||||
# fail-open that mattered (TIER_EXIT hardcoded to 0 → always state=success)
|
||||
# is fixed above by capturing the real exit code.
|
||||
exit 0
|
||||
|
||||
@@ -246,21 +246,24 @@ assert_contains "T1 POST context is sop-tier-check / tier-check" \
|
||||
'"context": "sop-tier-check / tier-check (pull_request)"' "$POSTED"
|
||||
assert_contains "T1 description names commenter" "test-runner" "$POSTED"
|
||||
|
||||
# T2: missing tier label → tier-check fails internally, but refire status
|
||||
# matches the canonical workflow's fail-open job conclusion.
|
||||
# T2: missing tier label → tier-check fails internally (mock exits 1).
|
||||
# FAIL-CLOSED contract (fix/core-ci-fail-closed): refire now captures the
|
||||
# REAL exit code and POSTs state=failure — it does NOT forge a green on
|
||||
# the required context. The refire job itself still exits 0 (it succeeded
|
||||
# at posting an honest failure status).
|
||||
run_scenario "T2_no_tier_label" "fail_no_label"
|
||||
RC=$(cat "$FIX_STATE_DIR/last_rc")
|
||||
POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
|
||||
assert_eq "T2 exit code 0 (canonical fail-open)" "0" "$RC"
|
||||
assert_contains "T2 POSTed state=success" '"state": "success"' "$POSTED"
|
||||
assert_eq "T2 exit code 0 (posted an honest status)" "0" "$RC"
|
||||
assert_contains "T2 POSTed state=failure (no forged green)" '"state": "failure"' "$POSTED"
|
||||
|
||||
# T3: tier:low present but ZERO approving reviews → internal tier check fails,
|
||||
# refire status remains aligned with the canonical workflow.
|
||||
# T3: tier:low present but ZERO approving reviews → internal tier check
|
||||
# fails (mock exits 1). Refire POSTs state=failure, never a false green.
|
||||
run_scenario "T3_no_approvals" "fail_no_approvals"
|
||||
RC=$(cat "$FIX_STATE_DIR/last_rc")
|
||||
POSTED=$(cat "$FIX_STATE_DIR/posted_statuses.jsonl" 2>/dev/null || true)
|
||||
assert_eq "T3 exit code 0 (canonical fail-open)" "0" "$RC"
|
||||
assert_contains "T3 POSTed state=success" '"state": "success"' "$POSTED"
|
||||
assert_eq "T3 exit code 0 (posted an honest status)" "0" "$RC"
|
||||
assert_contains "T3 POSTed state=failure (no forged green)" '"state": "failure"' "$POSTED"
|
||||
|
||||
# T4: closed PR — refire is a no-op (no POST, exit 0)
|
||||
run_scenario "T4_closed" "pass"
|
||||
|
||||
@@ -41,12 +41,16 @@
|
||||
# - the `|| true` after the sop-tier-check.sh invocation, which masked
|
||||
# real tier-gate verdicts.
|
||||
# AND-composition is now fully enforced and the tier-check step can
|
||||
# honestly red CI on a real SOP-6 violation. SOP_FAIL_OPEN=1 is RETAINED
|
||||
# as sanctioned infra-resilience: it fails-open only on token/network/jq
|
||||
# faults, never on a real gate verdict. If you need to temporarily
|
||||
# re-introduce a mask, file a tracker and follow the mc#1982 protocol
|
||||
# (Tier 2e lint requires a current tracker within 2 lines of any
|
||||
# continue-on-error: true).
|
||||
# honestly red CI on a real SOP-6 violation.
|
||||
#
|
||||
# SOP_FAIL_OPEN REMOVED 2026-06-05 (fix/core-ci-fail-closed): this is a
|
||||
# REQUIRED branch-protected gate on `pull_request_target` (always
|
||||
# same-repo, secrets always present — no fork/advisory split). Failing
|
||||
# open on a token/network/jq fault greened the SOP-6 approval gate
|
||||
# WITHOUT verifying approvals — a fail-open on a required context. The
|
||||
# gate now FAILS CLOSED on infra faults too: fix the token/runner, not
|
||||
# the gate. If you ever need to temporarily re-introduce a mask, file a
|
||||
# tracker and follow the mc#1982 protocol.
|
||||
|
||||
name: sop-tier-check
|
||||
|
||||
@@ -122,9 +126,9 @@ jobs:
|
||||
- name: Verify tier label + reviewer team membership
|
||||
# continue-on-error REMOVED 2026-06-04 (expired internal#189 Phase 1
|
||||
# burn-in, window closed 2026-05-17; mc#1982 directive: root-fix and
|
||||
# remove, do not renew). SOP_FAIL_OPEN=1 below still fails-open on
|
||||
# token/network/infra errors only (never on a real tier-gate verdict),
|
||||
# so this step can now honestly fail CI on a genuine SOP-6 violation.
|
||||
# remove, do not renew). SOP_FAIL_OPEN REMOVED 2026-06-05
|
||||
# (fix/core-ci-fail-closed): the gate now fails CLOSED on infra
|
||||
# faults too (see the env block below), not just on a real verdict.
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
GITEA_HOST: git.moleculesai.app
|
||||
@@ -133,13 +137,26 @@ jobs:
|
||||
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
|
||||
SOP_DEBUG: '0'
|
||||
SOP_LEGACY_CHECK: '0'
|
||||
# SOP_FAIL_OPEN=1 fails-open ONLY on infra faults (empty/invalid
|
||||
# token, unreachable Gitea API, missing jq) — see the guarded
|
||||
# `exit 0` branches in sop-tier-check.sh. It does NOT mask a real
|
||||
# tier-gate verdict: a missing tier label, no approving review, or
|
||||
# an unsatisfied AND-clause still `exit 1`. Kept as sanctioned
|
||||
# infra-resilience; the `|| true` mask was REMOVED with the burn-in
|
||||
# COE (2026-06-04) so a genuine SOP-6 violation now reds CI.
|
||||
SOP_FAIL_OPEN: '1'
|
||||
# SOP_FAIL_OPEN REMOVED 2026-06-05 (fix/core-ci-fail-closed).
|
||||
#
|
||||
# This is the REQUIRED branch-protected gate
|
||||
# `sop-tier-check / tier-check (pull_request)`. It runs on
|
||||
# `pull_request_target`, which ALWAYS executes from the base
|
||||
# branch WITH secrets present — there is NO fork/advisory split
|
||||
# and no legitimate "secrets genuinely absent" degradation here.
|
||||
#
|
||||
# SOP_FAIL_OPEN=1 made the script `exit 0` on an empty/invalid
|
||||
# token, an unreachable Gitea API, or missing jq — i.e. an AUTH
|
||||
# FAILURE or unreachable-dependency would green the SOP-6
|
||||
# approval gate WITHOUT verifying that the required teams
|
||||
# actually approved. That is a fail-open on a required gate: a
|
||||
# mis-wired or under-scoped SOP_TIER_CHECK_TOKEN would let any PR
|
||||
# merge past the approval requirement.
|
||||
#
|
||||
# Removing the env unsets it → `${SOP_FAIL_OPEN:-}` is empty in
|
||||
# sop-tier-check.sh → every guarded `exit 0` branch instead falls
|
||||
# through to `exit 1`. Infra faults (bad token / API down / no
|
||||
# jq) now FAIL CLOSED with a loud `::error::`, exactly like a real
|
||||
# SOP-6 violation. Fix the token/runner, not the gate.
|
||||
run: |
|
||||
bash .gitea/scripts/sop-tier-check.sh
|
||||
|
||||
@@ -203,6 +203,60 @@ def test_f1_job_missing_from_sentinel_needs(drift_module, tmp_path, monkeypatch)
|
||||
assert any("F1 —" in f and "test" in f for f in findings), findings
|
||||
|
||||
|
||||
def test_detect_drift_403_fails_closed(drift_module, tmp_path, monkeypatch):
|
||||
"""AUTH FAILURE on branch_protections (HTTP 401/403) → RAISE (fail
|
||||
closed). The token can't read BP, so drift is UNVERIFIABLE; greening
|
||||
the hourly cron here would let jobs↔protection drift go silently
|
||||
undetected — exactly the regression class this sentinel exists to
|
||||
catch. fix/core-ci-fail-closed.
|
||||
"""
|
||||
ci = _write_ci_yaml(
|
||||
tmp_path,
|
||||
jobs={"build": {"runs-on": "ubuntu-latest"}},
|
||||
sentinel_needs=["build"],
|
||||
)
|
||||
audit = _write_audit_yaml(tmp_path, ["ci / build (pull_request)"])
|
||||
_patch_paths(drift_module, monkeypatch, ci, audit)
|
||||
|
||||
stub = _make_stub_api({
|
||||
("GET", "/repos/owner/repo/branch_protections/main"): (
|
||||
drift_module.ApiError(
|
||||
"GET /repos/owner/repo/branch_protections/main → HTTP 403: forbidden"
|
||||
)
|
||||
),
|
||||
})
|
||||
monkeypatch.setattr(drift_module, "api", stub)
|
||||
with pytest.raises(drift_module.ApiError):
|
||||
drift_module.detect_drift("main")
|
||||
|
||||
|
||||
def test_detect_drift_404_skips_branch(drift_module, tmp_path, monkeypatch):
|
||||
"""Authenticated 404 (branch genuinely has no protection, e.g. staging
|
||||
pre-rollout) → tolerated skip: return ([], debug) with
|
||||
protection_contexts_skipped True. NOT a fail-open (real read of an
|
||||
absent resource with a valid token)."""
|
||||
ci = _write_ci_yaml(
|
||||
tmp_path,
|
||||
jobs={"build": {"runs-on": "ubuntu-latest"}},
|
||||
sentinel_needs=["build"],
|
||||
)
|
||||
audit = _write_audit_yaml(tmp_path, ["ci / build (pull_request)"])
|
||||
_patch_paths(drift_module, monkeypatch, ci, audit)
|
||||
|
||||
stub = _make_stub_api({
|
||||
("GET", "/repos/owner/repo/branch_protections/staging"): (
|
||||
drift_module.ApiError(
|
||||
"GET /repos/owner/repo/branch_protections/staging → HTTP 404: not found"
|
||||
)
|
||||
),
|
||||
})
|
||||
monkeypatch.setattr(drift_module, "api", stub)
|
||||
findings, debug = drift_module.detect_drift("staging")
|
||||
assert findings == []
|
||||
assert debug.get("protection_contexts_skipped") is True
|
||||
assert debug.get("protection_http_status") == 404
|
||||
|
||||
|
||||
def test_f1b_sentinel_needs_typo(drift_module, tmp_path, monkeypatch):
|
||||
"""F1b: sentinel.needs lists a job not present in ci.yml (typo).
|
||||
|
||||
|
||||
@@ -34,9 +34,12 @@ Test classes (per `feedback_branch_count_before_approving`):
|
||||
together, not short-circuited.
|
||||
- test_bp_empty_lints_nothing — BP has no contexts.
|
||||
Exit 0 cleanly.
|
||||
- test_api_403_skips_gracefully — branch_protections endpoint
|
||||
403s (token-scope). Exit 0 with ::error::, do NOT red-X.
|
||||
- test_api_404_skips_gracefully — branch has no protection.
|
||||
- test_api_403_fails_closed — branch_protections endpoint
|
||||
401/403s (auth failure). FAIL CLOSED (exit 2) with ::error::.
|
||||
- test_api_transient_fails_closed — transient/unexpected API
|
||||
error. FAIL CLOSED (exit 2).
|
||||
- test_api_404_skips_gracefully — branch has no protection
|
||||
(authenticated absent resource). Tolerated skip (exit 0 + warning).
|
||||
Exit 0 cleanly.
|
||||
- test_context_event_match_required — BP context says `(push)` and
|
||||
workflow only emits on `pull_request`. That's NOT a match — the
|
||||
@@ -247,9 +250,10 @@ def test_bp_empty_lints_nothing(envset, monkeypatch, capsys):
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# API 403 — graceful-degrade.
|
||||
# API 403 — AUTH FAILURE → FAIL CLOSED (exit 2). This is a HARD gate on a
|
||||
# protected context; a token that can't read BP must NOT green the lint.
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_api_403_skips_gracefully(envset, monkeypatch, capsys):
|
||||
def test_api_403_fails_closed(envset, monkeypatch, capsys):
|
||||
_write_wf(
|
||||
envset,
|
||||
"ci.yml",
|
||||
@@ -259,13 +263,30 @@ def test_api_403_skips_gracefully(envset, monkeypatch, capsys):
|
||||
m = _import_lint()
|
||||
_stub_api(monkeypatch, m, ("forbidden", None))
|
||||
rc = m.run()
|
||||
assert rc == 0
|
||||
assert rc == 2
|
||||
err = capsys.readouterr().err
|
||||
assert "403" in err or "scope" in err.lower() or "token" in err.lower()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# API 404 — branch has no protection → clean exit.
|
||||
# API transient/unexpected error → FAIL CLOSED (exit 2).
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_api_transient_fails_closed(envset, monkeypatch, capsys):
|
||||
_write_wf(
|
||||
envset,
|
||||
"ci.yml",
|
||||
"name: CI\non:\n pull_request:\n branches: [main]\njobs:\n"
|
||||
" j:\n runs-on: x\n steps:\n - run: echo hi\n",
|
||||
)
|
||||
m = _import_lint()
|
||||
_stub_api(monkeypatch, m, ("error", None))
|
||||
rc = m.run()
|
||||
assert rc == 2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# API 404 — authenticated absent resource (branch has no protection) →
|
||||
# tolerated graceful skip (exit 0 with ::warning::), NOT a fail-open.
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_api_404_skips_gracefully(envset, monkeypatch, capsys):
|
||||
_write_wf(
|
||||
|
||||
@@ -47,7 +47,10 @@ Test classes (per `feedback_branch_count_before_approving`):
|
||||
(the OLD context name disappears; the NEW one needs validation).
|
||||
- test_unrelated_workflow_edit_is_not_new — edit a comment in
|
||||
an existing emitter; no new context introduced; pass.
|
||||
- test_api_403_skips_gracefully — BP read 403; exit 0
|
||||
- test_api_403_fails_closed — BP read 401/403 auth
|
||||
failure → FAIL CLOSED (exit 2)
|
||||
- test_api_transient_fails_closed — transient → exit 2
|
||||
- test_api_404_skips_gracefully — authenticated 404 → exit 0
|
||||
with stderr ::error::.
|
||||
- test_directive_must_be_in_workflow_yml — directive in PR
|
||||
body alone is NOT sufficient; the comment must live in the
|
||||
@@ -392,9 +395,10 @@ def test_unrelated_workflow_edit_is_not_new(env, monkeypatch, capsys):
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# BP API 403 → exit 0 with ::error::.
|
||||
# BP API 401/403 = AUTH FAILURE → FAIL CLOSED (exit 2). A new emission can't
|
||||
# be verified against BP if the token can't read BP — must not green.
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_api_403_skips_gracefully(env, monkeypatch, capsys):
|
||||
def test_api_403_fails_closed(env, monkeypatch, capsys):
|
||||
m = _import_lint()
|
||||
_stub_git_and_api(
|
||||
monkeypatch,
|
||||
@@ -404,11 +408,44 @@ def test_api_403_skips_gracefully(env, monkeypatch, capsys):
|
||||
bp_response=("forbidden", None),
|
||||
)
|
||||
rc = m.run()
|
||||
assert rc == 0
|
||||
assert rc == 2
|
||||
err = capsys.readouterr().err
|
||||
assert "403" in err or "scope" in err.lower() or "token" in err.lower()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# BP API transient/unexpected error → FAIL CLOSED (exit 2).
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_api_transient_fails_closed(env, monkeypatch, capsys):
|
||||
m = _import_lint()
|
||||
_stub_git_and_api(
|
||||
monkeypatch,
|
||||
m,
|
||||
base_files={".gitea/workflows/ci.yml": WF_CI_BASE},
|
||||
head_files={".gitea/workflows/ci.yml": WF_CI_NEW_JOB},
|
||||
bp_response=("error", None),
|
||||
)
|
||||
rc = m.run()
|
||||
assert rc == 2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# BP API authenticated 404 (branch genuinely unprotected) → tolerated
|
||||
# graceful skip (exit 0 with ::warning::), NOT a fail-open.
|
||||
# ---------------------------------------------------------------------------
|
||||
def test_api_404_skips_gracefully(env, monkeypatch, capsys):
|
||||
m = _import_lint()
|
||||
_stub_git_and_api(
|
||||
monkeypatch,
|
||||
m,
|
||||
base_files={".gitea/workflows/ci.yml": WF_CI_BASE},
|
||||
head_files={".gitea/workflows/ci.yml": WF_CI_NEW_JOB},
|
||||
bp_response=("not_found", None),
|
||||
)
|
||||
rc = m.run()
|
||||
assert rc == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Directive must be in the workflow YML, not PR body.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@@ -527,15 +527,13 @@ def test_multi_required_one_bad_one_good_fails(
|
||||
assert "good.yml" not in ln
|
||||
|
||||
|
||||
def test_protection_403_treated_as_skip(lint_module, monkeypatch, capsys):
|
||||
"""If the token can't read branch_protections (HTTP 403), exit 0
|
||||
with a clear ::error::-but-non-fatal note. Same scope-fallback shape
|
||||
as ci-required-drift.py per the precedent.
|
||||
|
||||
Rationale: if the lint workflow itself can't read protection, the PR
|
||||
can't make THIS state worse (a paths-filter PR was already addable
|
||||
without the lint). Better to surface a token-scope problem loudly
|
||||
than to red-X every PR until the token is fixed.
|
||||
def test_protection_403_fails_closed(lint_module, monkeypatch, capsys):
|
||||
"""AUTH FAILURE → FAIL CLOSED (exit 4). If the token can't read
|
||||
branch_protections (HTTP 401/403), the lint CANNOT enumerate the
|
||||
required-check set and therefore CANNOT verify the no-paths-filter
|
||||
invariant. This is a HARD gate on a protected (same-repo PR) context,
|
||||
so it MUST fail loud rather than green an unverifiable gate — fix the
|
||||
token, not the lint.
|
||||
"""
|
||||
stub = _make_stub_api({
|
||||
("GET", "/repos/owner/repo/branch_protections/main"): (
|
||||
@@ -546,7 +544,26 @@ def test_protection_403_treated_as_skip(lint_module, monkeypatch, capsys):
|
||||
})
|
||||
monkeypatch.setattr(lint_module, "api", stub)
|
||||
rc = lint_module.run()
|
||||
assert rc == 0
|
||||
assert rc == 4
|
||||
err = capsys.readouterr().err
|
||||
assert "::error::" in err
|
||||
assert "403" in err
|
||||
|
||||
|
||||
def test_protection_404_skips_gracefully(lint_module, monkeypatch, capsys):
|
||||
"""Authenticated 404 (branch genuinely has no protection) is the one
|
||||
tolerated degradation: there are no required contexts to check.
|
||||
Exit 0 with a ::warning:: — NOT a fail-open (this is a real read of an
|
||||
absent resource with a valid token, not an auth failure)."""
|
||||
stub = _make_stub_api({
|
||||
("GET", "/repos/owner/repo/branch_protections/main"): (
|
||||
lint_module.ApiError(
|
||||
"GET /repos/owner/repo/branch_protections/main → HTTP 404: not found"
|
||||
)
|
||||
),
|
||||
})
|
||||
monkeypatch.setattr(lint_module, "api", stub)
|
||||
rc = lint_module.run()
|
||||
assert rc == 0
|
||||
err = capsys.readouterr().err
|
||||
assert "404" in err
|
||||
|
||||
Reference in New Issue
Block a user