forked from molecule-ai/molecule-core
#1569 Phase 1 discovery (2026-05-02) found six historical credential exposures in molecule-core git history. All confirmed dead — but the reason they got committed in the first place was that the local pre-commit hook had two gaps that the canonical CI gate (and the runtime's hook) didn't:

1. **Pattern set was incomplete.** Local hook checked `sk-ant-|sk-proj-|ghp_|gho_|AKIA|mol_pk_|cfut_` — missing `ghs_*`, `ghu_*`, `ghr_*`, `github_pat_*`, `sk-svcacct-`, `sk-cp-`, `xox[baprs]-`, `ASIA*`. The historical leaks were 5× `ghs_*` (App installation tokens) + 1× `github_pat_*` — none of which the local hook would have caught even if it ran.
2. **`*.md` and `docs/` were skip-listed.** The leaked tokens lived in `tick-reflections-temp.md`, `qa-audit-2026-04-21.md`, and `docs/incidents/INCIDENT_LOG.md` — exactly the file types the skip-list excluded. The hook ran and silently passed.

This commit:

- Replaces the local hook's hard-coded inline regex with the canonical 13-pattern array (byte-aligned with `.github/workflows/secret-scan.yml` and the workspace runtime's `pre-commit-checks.sh`).
- Removes the `\.md$|docs/` skip — keeps only binary, lockfile, and hook-self exclusions.
- Adds the local hook to `lint_secret_pattern_drift.py` as an in-repo consumer (read-from-disk, no network — the hook lives in the same checkout the lint runs against). Drift now fails the lint when canonical changes without the local hook updating in lockstep.
- Adds `.githooks/pre-commit` to the drift-lint workflow's path filter so consumer-side edits also trigger the lint.
- Adopts the canonical's "don't echo the matched value" defense (the prior version would have round-tripped a leaked credential into scrollback / CI logs).

Verified: `python3 .github/scripts/lint_secret_pattern_drift.py` reports both consumers aligned at 13 patterns. The hook's existing six other gates (canvas 'use client', dark theme, SQL injection, go-build, etc.) are untouched.
Companion change (already applied via API, no diff here): `Scan diff for credential-shaped strings` is now in the required-checks list on both `staging` and `main` branch protection — was previously a soft gate (workflow ran, exited 1, but didn't block merge). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
167 lines
6.5 KiB
Python
167 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
|
"""Lint SECRET_PATTERNS drift across known consumers of molecule-core's canonical.
|
|
|
|
The canonical SECRET_PATTERNS array in
|
|
.github/workflows/secret-scan.yml is mirrored by every other side
|
|
that scans for credentials: the workspace-runtime's bundled
|
|
pre-commit hook, the molecule-controlplane inlined copy, etc. The
|
|
mirror is enforced socially today — when someone adds a new pattern
|
|
to canonical (e.g. the sk-cp- MiniMax token after F1088), the other
|
|
sides are supposed to be updated in lockstep.
|
|
|
|
This script automates the check. It diffs the canonical's pattern set
against each known consumer — public mirrors fetched by URL and in-repo
copies read from disk — and exits non-zero on any mismatch. Wired into
a daily cron + on-push gate via
.github/workflows/secret-pattern-drift.yml.
|
|
|
|
Private-repo consumers (currently molecule-controlplane's inlined
|
|
copy) are out of scope here because the molecule-core workflow's
|
|
GITHUB_TOKEN can't read other private repos in the org. They're
|
|
expected to self-monitor via their own copy of this script — not a
|
|
hard barrier, just a future expansion.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import sys
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
# Workflow file holding the canonical SECRET_PATTERNS array. The path is
# relative, so it is resolved against the current working directory.
CANONICAL_FILE = Path(".github/workflows/secret-scan.yml")
|
|
|
|
# Public mirrors of the pattern array, fetched by URL. Every entry is a
# (label, raw_url) pair; raw_url serves the file's RAW content from the
# consumer's default branch (or staging where applicable). Append a pair
# here whenever another public repo starts shipping its own
# SECRET_PATTERNS array.
CONSUMERS: list[tuple[str, str]] = [
    (
        "molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh",
        "https://raw.githubusercontent.com/Molecule-AI/molecule-ai-workspace-runtime/main/molecule_runtime/scripts/pre-commit-checks.sh",
    ),
]
|
|
|
|
# Consumers that live in this repository: (label, path) pairs read
# straight from the workflow checkout. Reading from disk sidesteps the
# staging→main lag the URL fetcher would hit (a freshly-edited canonical
# wouldn't yet be on the consumer's default branch). Drift semantics are
# the same as for the URL consumers, with no network involved.
LOCAL_CONSUMERS: list[tuple[str, Path]] = [
    (
        ".githooks/pre-commit (molecule-core local hook)",
        Path(".githooks/pre-commit"),
    ),
]
|
|
|
|
# Matches the SECRET_PATTERNS=( ... ) array in either yaml-indented
# (the canonical workflow's `run:` block) or shell-flat (runtime
# hook) format. Patterns inside are single-quoted Bash strings.
#
# Closing `)` is anchored to the start of a line (possibly indented)
# because pattern comments like `# GitHub PAT (classic)` contain
# their own `)` mid-line — a non-anchored regex would match through
# the comment's paren and capture only the first pattern.
_ARRAY_RE = re.compile(r"SECRET_PATTERNS=\((.*?)^\s*\)", re.DOTALL | re.MULTILINE)
_PATTERN_RE = re.compile(r"'([^']+)'")

# Scanner used to walk the array body: at each position it matches EITHER
# a single-quoted pattern (group 1) OR a `#` comment running to end of
# line (group 1 is None). Trying the quoted alternative first keeps a `#`
# inside a pattern intact; consuming whole comments stops apostrophes in
# comment prose ("don't") and commented-out patterns from being read as
# live patterns.
_ITEM_OR_COMMENT_RE = re.compile(_PATTERN_RE.pattern + r"|#[^\n]*")


def extract_patterns(content: str, source_label: str) -> list[str]:
    """Return the single-quoted patterns of the SECRET_PATTERNS array.

    Args:
        content: Full text of a workflow file or shell hook.
        source_label: Name of the source, used only in the error message.

    Returns:
        The patterns in file order, quotes stripped. Anything inside a
        ``#`` comment (including a commented-out pattern) is ignored.

    Raises:
        SystemExit: If no ``SECRET_PATTERNS=( ... )`` array is found.
    """
    array_match = _ARRAY_RE.search(content)
    if array_match is None:
        raise SystemExit(f"::error::{source_label}: SECRET_PATTERNS=(...) array not found")

    array_body = array_match.group(1)
    return [
        token.group(1)
        for token in _ITEM_OR_COMMENT_RE.finditer(array_body)
        # Comment tokens leave group 1 unset; only quoted patterns count.
        if token.group(1) is not None
    ]
|
|
|
|
|
|
def fetch(url: str) -> str:
    """Download ``url`` and return the response body decoded as UTF-8.

    The request carries a fixed User-Agent header and a 30-second
    timeout. Errors from ``urllib`` or from decoding propagate to the
    caller.
    """
    headers = {"User-Agent": "secret-pattern-drift-lint/1"}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    return payload.decode("utf-8")
|
|
|
|
|
|
def diff_patterns(canonical: list[str], consumer: list[str]) -> tuple[list[str], list[str]]:
    """Compare two pattern lists as sets.

    Returns ``(missing_from_consumer, extra_in_consumer)``, each sorted.
    Duplicates and ordering within either input are ignored.
    """
    expected = set(canonical)
    actual = set(consumer)
    missing_from_consumer = sorted(expected.difference(actual))
    extra_in_consumer = sorted(actual.difference(expected))
    return missing_from_consumer, extra_in_consumer
|
|
|
|
|
|
def _report_consumer(label: str, canonical: list[str], consumer: list[str]) -> bool:
    """Print one consumer's verdict; return True when it drifts from canonical."""
    missing, extra = diff_patterns(canonical, consumer)
    if not missing and not extra:
        print(f" ✓ {label}: aligned ({len(consumer)} patterns)")
        return False

    print(f"::error::DRIFT in {label}:")
    for pattern in missing:
        print(f" - missing from consumer: {pattern!r}")
    for pattern in extra:
        print(f" - extra in consumer (not in canonical): {pattern!r}")
    return True


def main() -> int:
    """Compare every known consumer's SECRET_PATTERNS with the canonical.

    Returns:
        0 when all consumers that could be checked are aligned; 1 when the
        canonical file is missing or yields no patterns, when an in-repo
        consumer file is missing, or when any consumer has drifted.

    Raises:
        SystemExit: Propagated from ``extract_patterns`` when a file that
            was read has no SECRET_PATTERNS array.
    """
    if not CANONICAL_FILE.exists():
        print(f"::error::canonical not found at {CANONICAL_FILE}")
        return 1

    # Read as UTF-8 explicitly (matching what fetch() does for remote
    # consumers) instead of relying on the locale's default encoding.
    canonical = extract_patterns(
        CANONICAL_FILE.read_text(encoding="utf-8"), str(CANONICAL_FILE)
    )
    print(f"canonical ({CANONICAL_FILE}): {len(canonical)} patterns")
    if not canonical:
        # Fail closed: with nothing to compare against, an equally empty
        # consumer would be reported as "aligned" and the gate would pass.
        print(f"::error::{CANONICAL_FILE}: SECRET_PATTERNS array contains no patterns")
        return 1

    drift = False

    # In-repo consumers first — these are read from the workflow's own
    # checkout, so they never lag behind the canonical and a missing
    # file IS a real error (not a fetch warning).
    for label, path in LOCAL_CONSUMERS:
        if not path.exists():
            print(f"::error::{label}: file not found at {path}")
            drift = True
            continue
        consumer = extract_patterns(path.read_text(encoding="utf-8"), label)
        if _report_consumer(label, canonical, consumer):
            drift = True

    for label, url in CONSUMERS:
        try:
            content = fetch(url)
        except Exception as e:
            # Fetch failures are warnings, not errors. A consumer
            # whose default branch was just renamed (or whose file
            # moved) shouldn't fail the lint until someone updates
            # the URL above. Real drift is the failure mode this
            # gate exists to catch — fetch reliability isn't.
            print(f"::warning::{label}: fetch failed ({e}) — skipping")
            continue

        consumer = extract_patterns(content, label)
        if _report_consumer(label, canonical, consumer):
            drift = True

    if drift:
        print()
        print("::error::SECRET_PATTERNS drift detected. Bring consumer(s) into")
        print("alignment with the canonical SECRET_PATTERNS array in")
        print(f"{CANONICAL_FILE} by adding the missing patterns and removing")
        print("any extras. The two sides must stay byte-aligned on the pattern")
        print("list — the runtime hook is the developer's local pre-commit,")
        print("the canonical is the org-wide CI gate, divergence means a token")
        print("can pass one but get rejected by the other.")
        return 1

    print()
    print("✓ All known consumers aligned with canonical SECRET_PATTERNS.")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit
    # status.
    exit_status = main()
    sys.exit(exit_status)
|