ci(lint): forbid continue-on-error on required-context jobs (SOP#765) #2541

Merged
devops-engineer merged 1 commits from ci/guard-no-coe-on-required into main 2026-06-10 20:47:13 +00:00
4 changed files with 358 additions and 0 deletions
+12
View File
@@ -0,0 +1,12 @@
# SSOT: the branch-protection REQUIRED status contexts on molecule-core
# main. Authoritative for lint-no-coe-on-required (CI cannot always read
# branch_protections; when a repo-admin token is present the lint also
# live-cross-checks this list against BP and fails on drift).
#
# Form: "<workflow name> / <job name>" (event suffix stripped). Verified
# 2026-06-10 against GET /api/v1/repos/molecule-ai/molecule-core/branch_protections.
# Keep in sync with branch protection; lint-required-context-exists-in-bp
# is the cousin that guards the other direction.
CI / all-required
E2E API Smoke Test / E2E API Smoke Test
Handlers Postgres Integration / Handlers Postgres Integration
+172
View File
@@ -0,0 +1,172 @@
#!/usr/bin/env python3
"""lint_no_coe_on_required — forbid continue-on-error on REQUIRED jobs.
Forbidden shape
---------------
A job in `.gitea/workflows/*.yml` that BOTH:
- has `continue-on-error: true` (job-level), AND
- emits a commit-status context that is in the repo's required
branch-protection set.
`continue-on-error: true` makes a failed step roll up to a *success*
job status (Gitea Quirk #10). On a job whose context branch-protection
treats as REQUIRED, that converts a real failure into a green gate —
exactly the mc#1982 masking incident (continue-on-error on platform-build
hid regressions for ~3 weeks; SOP#765). This makes SOP#765 mechanical.
Required-context SSOT
---------------------
A checked-in allowlist (REQUIRED_CONTEXTS_FILE, default
.gitea/required-contexts.txt — one context per line, `#` comments). This
is authoritative because the CI token cannot always read
branch_protections (cp returns 403). When a token IS available
(GITEA_TOKEN + repo admin) the script ALSO live-reads branch_protections
and fails if the checked-in allowlist has drifted from live BP — but a
403/absent token degrades gracefully to allowlist-only (warn, don't fail
on the read).
Context derivation
------------------
Gitea emits the per-job status context as `"{workflow_name} / {job_name
or job_key}{suffix}"` where suffix is ` (pull_request)` / ` (push)` on
those events. The allowlist stores the bare `workflow / job` form; we
match a required context if its event-stripped form equals a job's
`workflow / job`.
"""
import os
import re
import sys
try:
import yaml
except ImportError:
print("FAIL: PyYAML not available", file=sys.stderr)
sys.exit(2)
# Runtime configuration, all overridable through the environment.
# Directory whose *.yml / *.yaml workflow files are scanned for jobs.
WORKFLOWS_DIR = os.environ.get("WORKFLOWS_DIR", ".gitea/workflows")
# Checked-in allowlist of required contexts (one per line, `#` comments).
REQUIRED_FILE = os.environ.get("REQUIRED_CONTEXTS_FILE", ".gitea/required-contexts.txt")
# Token, host and "owner/repo" slug for the optional live branch_protections
# read; the live read is skipped when GITEA_TOKEN or REPO is empty.
GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "")
GITEA_HOST = os.environ.get("GITEA_HOST", "git.moleculesai.app")
REPO = os.environ.get("REPO", "")
# Trailing " (pull_request)" / " (push)" / " (pull_request_target)" marker
# that is appended to a status context, plus any surrounding whitespace.
EVENT_SUFFIX = re.compile(r"\s*\((pull_request|push|pull_request_target)\)\s*$")


def strip_event(ctx):
    """Return *ctx* without its trailing event suffix and outer whitespace.

    A context that carries no recognised suffix is only whitespace-trimmed.
    """
    without_suffix = EVENT_SUFFIX.sub("", ctx)
    return without_suffix.strip()
def load_required_allowlist(path):
    """Load the checked-in required-context allowlist.

    The file holds one context per line; everything from the first `#` on
    a line is a comment, and blank lines are ignored. Each entry is passed
    through strip_event so that it is stored in bare `workflow / job` form.

    Returns:
        A set of contexts, or None when *path* is not an existing file
        (the caller treats a missing allowlist as a failure).
    """
    if not os.path.isfile(path):
        return None
    # Decode explicitly as UTF-8: the default is the locale encoding, which
    # varies between CI runners and can mis-decode non-ASCII text.
    with open(path, encoding="utf-8") as f:
        entries = (line.split("#", 1)[0].strip() for line in f)
        return {strip_event(entry) for entry in entries if entry}
def job_contexts(workflows_dir):
    """Map every status context emitted by the workflows to its job.

    Scans the `.yml` / `.yaml` files directly inside *workflows_dir* (in
    sorted order) and derives one context per job as
    `"{workflow name} / {job name}"`, falling back to the file stem when
    the workflow has no `name` and to the job key when the job has none.

    Returns:
        dict: context -> (file path, job key, continue_on_error_bool).

    Notes:
        * `continue-on-error` counts as set for the boolean `true` and for
          the string "true" (any case, surrounding whitespace ignored).
          Any other value, including a `${{ ... }}` expression, is
          reported as False.
        * When several jobs produce the same context, an entry that has
          continue-on-error set is never replaced by one that does not, so
          a clean duplicate cannot hide an offending job. Otherwise the
          last job seen wins.
        * Files that fail to parse as YAML are skipped with a warning.
    """
    contexts = {}
    for fn in sorted(os.listdir(workflows_dir)):
        if not fn.endswith((".yml", ".yaml")):
            continue
        path = os.path.join(workflows_dir, fn)
        try:
            with open(path, encoding="utf-8") as f:
                doc = yaml.safe_load(f)
        except yaml.YAMLError as e:
            # Best-effort: keep linting the other workflows, but say so
            # instead of silently dropping this file from the scan.
            print(f"::warning:: skipping unparseable workflow {path} ({e})")
            continue
        if not isinstance(doc, dict):
            continue
        wf_name = doc.get("name") or os.path.splitext(fn)[0]
        jobs = doc.get("jobs") or {}
        if not isinstance(jobs, dict):
            continue
        for jkey, jval in jobs.items():
            if not isinstance(jval, dict):
                continue
            jname = jval.get("name") or jkey
            coe = jval.get("continue-on-error", False)
            # Gitea coerces string "true" truthy.
            coe_bool = coe is True or (isinstance(coe, str) and coe.strip().lower() == "true")
            ctx = strip_event(f"{wf_name} / {jname}")
            previous = contexts.get(ctx)
            if previous is not None and previous[2] and not coe_bool:
                # Same context already recorded from a job WITH
                # continue-on-error; keep that one so it still gets flagged.
                continue
            contexts[ctx] = (path, jkey, coe_bool)
    return contexts
def live_required_contexts():
    """Read the required status contexts from live branch protection.

    Best-effort: returns None (so the caller falls back to the checked-in
    allowlist) when GITEA_TOKEN or REPO is empty, or when anything goes
    wrong during the request or while decoding the response; in the latter
    case a warning is printed. Otherwise returns the set of event-stripped
    contexts from every protection whose `branch_name` is "main" or absent.
    """
    if not GITEA_TOKEN or not REPO:
        return None
    try:
        import json
        import urllib.request

        request = urllib.request.Request(
            f"https://{GITEA_HOST}/api/v1/repos/{REPO}/branch_protections",
            headers={"Authorization": f"token {GITEA_TOKEN}"},
        )
        with urllib.request.urlopen(request, timeout=20) as response:
            protections = json.load(response)
        found = set()
        for rule in protections:
            if rule.get("branch_name") not in ("main", None):
                continue
            found.update(strip_event(c) for c in (rule.get("status_check_contexts") or []))
        return found
    except Exception as e:
        # Deliberately broad: a 403, a timeout or an unexpected payload must
        # degrade to allowlist-only rather than fail the lint.
        print(f"::warning:: live branch_protections read failed ({e}); using checked-in allowlist only")
        return None
def main():
    """Run the lint and return the process exit code.

    Returns 0 when WORKFLOWS_DIR does not exist or when no required
    context is emitted by a job with continue-on-error set. Returns 1 when
    the allowlist file is missing, when live branch protection lists a
    context the allowlist lacks, or when a required job has
    continue-on-error set.
    """
    if not os.path.isdir(WORKFLOWS_DIR):
        print(f"OK: no {WORKFLOWS_DIR}")
        return 0

    required = load_required_allowlist(REQUIRED_FILE)
    if required is None:
        print(f"FAIL: required-contexts allowlist {REQUIRED_FILE} is missing — "
              f"this file is the SSOT for which contexts are merge-required.")
        return 1

    # Drift guard, only when the live read succeeded (None means "degraded
    # to allowlist-only", which is not a failure).
    live = live_required_contexts()
    only_live = set() if live is None else live - required
    if only_live:
        print("FAIL: branch-protection required contexts NOT in the checked-in allowlist "
              f"({REQUIRED_FILE}) — allowlist has drifted from live BP:")
        for c in sorted(only_live):
            print(f" - {c}")
        print(" Add them to the allowlist (or remove from BP).")
        return 1

    emitted = job_contexts(WORKFLOWS_DIR)
    fails = []
    for ctx in sorted(required):
        if ctx not in emitted:
            # Required but emitted by no job: that is the concern of the
            # required-context-exists lint, not this one.
            continue
        path, jkey, coe = emitted[ctx]
        if not coe:
            continue
        fails.append(f"{path}: job `{jkey}` (context `{ctx}`) is branch-protection REQUIRED "
                     f"but has continue-on-error: true")

    if not fails:
        print(f"OK: no continue-on-error on any of the {len(required)} required contexts.")
        return 0

    print("FAIL: continue-on-error: true on a REQUIRED branch-protection job (mc#1982 / SOP#765):")
    for failure in fails:
        print(f" - {failure}")
    print()
    print("Why: continue-on-error makes a failed step roll up to a SUCCESS")
    print(" job status (Gitea Quirk #10). On a REQUIRED context that turns")
    print(" a real failure into a green gate — the mc#1982 masking incident.")
    print(" Remove continue-on-error from required jobs (SOP#765).")
    return 1


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,81 @@
name: lint-no-coe-on-required
# Static workflow-shape lint: forbid `continue-on-error: true` on any job
# that emits a REQUIRED branch-protection status context. Makes SOP#765
# mechanical (the mc#1982 masking incident).
#
# Forbidden shape
# ---------------
# A job in `.gitea/workflows/*.yml` that is BOTH continue-on-error: true
# AND emits a context in `.gitea/required-contexts.txt` (the checked-in
# SSOT of merge-required contexts).
#
# Why this rule exists (mc#1982 / SOP#765)
# ----------------------------------------
# `continue-on-error: true` makes a failed step roll up to a SUCCESS job
# status (Gitea Quirk #10). On a REQUIRED context that silently converts
# a real failure into a green gate — continue-on-error on platform-build
# masked regressions for ~3 weeks before #656 surfaced them. SOP#765
# banned it on required jobs; this lint enforces it at PR time.
#
# Required-context SSOT + drift guard
# -----------------------------------
# `.gitea/required-contexts.txt` is authoritative (CI cannot always read
# branch_protections — cp returns 403). When a repo-admin token is
# present (DRIFT_BOT_TOKEN) the lint ALSO live-reads BP and fails if the
# checked-in allowlist has DRIFTED from live BP. A 403/absent token
# degrades gracefully to allowlist-only (warn, not fail).
#
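# The pull_request trigger is path-filtered (workflows, the allowlist,
# this script, its tests), but BP can change out-of-band with no repo
# diff at all; the unfiltered push trigger and the daily schedule run the
# lint (and, when the token is present, its live cross-check) regardless.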
on:
pull_request:
types: [opened, synchronize, reopened]
paths:
- '.gitea/workflows/**'
- '.gitea/required-contexts.txt'
- '.gitea/scripts/lint_no_coe_on_required.py'
- 'tests/test_lint_no_coe_on_required.py'
push:
branches: [main, staging]
schedule:
# Daily — catches BP-drift introduced out-of-band (a required context
# added to BP whose emitting job already has continue-on-error).
- cron: '23 14 * * *'
workflow_dispatch:
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
permissions:
contents: read
concurrency:
group: lint-no-coe-on-required-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
# bp-exempt: meta-lint guarding the required set; standalone red-status
# lint, not itself a branch-protection required context.
lint:
name: lint-no-coe-on-required
runs-on: ubuntu-latest
timeout-minutes: 10
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
- name: Install PyYAML
run: python -m pip install --quiet 'PyYAML==6.0.2'
- name: Run lint-no-coe-on-required
env:
GITEA_TOKEN: ${{ secrets.DRIFT_BOT_TOKEN }}
GITEA_HOST: git.moleculesai.app
REPO: ${{ github.repository }}
run: python3 .gitea/scripts/lint_no_coe_on_required.py
- name: Run unit tests
run: |
python -m pip install --quiet pytest
python3 -m pytest tests/test_lint_no_coe_on_required.py -q
+93
View File
@@ -0,0 +1,93 @@
"""Unit tests for lint_no_coe_on_required — fixture catch + clean."""
import importlib.util
import os
import textwrap
# Path of the lint script, resolved relative to this test file.
HERE = os.path.dirname(__file__)
SCRIPT = os.path.join(HERE, "..", ".gitea", "scripts", "lint_no_coe_on_required.py")
# Load the script by file path and execute its top level once; the tests
# below call its functions through `mod`.
spec = importlib.util.spec_from_file_location("lint_no_coe_on_required", SCRIPT)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
def _wf(tmp_path, name, body):
    """Write *body*, dedented, as workflow file *name* under tmp_path/.gitea/workflows."""
    workflows_dir = tmp_path / ".gitea" / "workflows"
    workflows_dir.mkdir(parents=True, exist_ok=True)
    target = workflows_dir / name
    target.write_text(textwrap.dedent(body))
def _allow(tmp_path, contexts):
    """Write *contexts*, one per line, to tmp_path/.gitea/required-contexts.txt."""
    gitea_dir = tmp_path / ".gitea"
    gitea_dir.mkdir(parents=True, exist_ok=True)
    allowlist = gitea_dir / "required-contexts.txt"
    allowlist.write_text("\n".join(contexts) + "\n")
def test_coe_on_required_job_flagged(tmp_path):
    """A job-level boolean continue-on-error is reported as True for its context."""
    workflow = """\
        name: CI
        on: [pull_request]
        jobs:
          all-required:
            runs-on: ubuntu-latest
            continue-on-error: true
            steps:
              - run: echo gate
        """
    _wf(tmp_path, "ci.yml", workflow)
    workflows_dir = tmp_path / ".gitea" / "workflows"
    _path, _job_key, coe = mod.job_contexts(str(workflows_dir))["CI / all-required"]
    assert coe is True  # continue-on-error detected
def test_coe_string_true_flagged(tmp_path):
    """The quoted string "true" counts as continue-on-error being set."""
    workflow = """\
        name: CI
        on: [pull_request]
        jobs:
          gate:
            runs-on: ubuntu-latest
            continue-on-error: "true"
            steps:
              - run: echo hi
        """
    _wf(tmp_path, "ci.yml", workflow)
    workflows_dir = tmp_path / ".gitea" / "workflows"
    found = mod.job_contexts(str(workflows_dir))
    _path, _job_key, coe = found["CI / gate"]
    assert coe is True
def test_required_job_without_coe_clean(tmp_path):
    """A job with no continue-on-error key is reported as False."""
    workflow = """\
        name: CI
        on: [pull_request]
        jobs:
          all-required:
            runs-on: ubuntu-latest
            steps:
              - run: echo gate
        """
    _wf(tmp_path, "ci.yml", workflow)
    workflows_dir = tmp_path / ".gitea" / "workflows"
    _path, _job_key, coe = mod.job_contexts(str(workflows_dir))["CI / all-required"]
    assert coe is False
def test_named_job_context_uses_name_not_key(tmp_path):
    """The context is built from the job's `name`, not from its key."""
    workflow = """\
        name: E2E API Smoke Test
        on: [pull_request]
        jobs:
          e2e-api:
            name: E2E API Smoke Test
            runs-on: ubuntu-latest
            steps:
              - run: echo hi
        """
    _wf(tmp_path, "e2e.yml", workflow)
    workflows_dir = tmp_path / ".gitea" / "workflows"
    found = mod.job_contexts(str(workflows_dir))
    assert "E2E API Smoke Test / E2E API Smoke Test" in found
def test_strip_event_suffix():
    """Event suffixes are removed; a context without one is returned unchanged."""
    expectations = [
        ("CI / all-required (pull_request)", "CI / all-required"),
        ("ci / build (push)", "ci / build"),
        ("X / y", "X / y"),
    ]
    for raw, bare in expectations:
        assert mod.strip_event(raw) == bare
def test_allowlist_load(tmp_path):
    """Comments are dropped and entries are trimmed and event-stripped on load."""
    lines = ["# comment", "CI / all-required", " ci / build (push) "]
    _allow(tmp_path, lines)
    allowlist_path = tmp_path / ".gitea" / "required-contexts.txt"
    loaded = mod.load_required_allowlist(str(allowlist_path))
    assert loaded == {"CI / all-required", "ci / build"}