diff --git a/.githooks/pre-commit b/.githooks/pre-commit
index ecbacd6d..4959bb51 100755
--- a/.githooks/pre-commit
+++ b/.githooks/pre-commit
@@ -129,19 +129,61 @@ fi
 # ──────────────────────────────────────────────────────────
 # 6. Secrets: No tokens/keys in staged files
 # ──────────────────────────────────────────────────────────
+#
+# Pattern set MUST match .github/workflows/secret-scan.yml SECRET_PATTERNS
+# and molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh —
+# .github/workflows/secret-pattern-drift.yml lints this invariant. Rebuilt
+# against canonical 2026-05-02 after #1569 Phase 1 discovery surfaced
+# real ghs_*/github_pat_* leaks that the prior pattern set
+# ('sk-ant-|sk-proj-|ghp_|gho_|AKIA|mol_pk_|cfut_') would have missed:
+# (a) it lacked ghs_ / ghu_ / ghr_ / github_pat_ / sk-svcacct- / sk-cp- /
+# xox[baprs]- / ASIA prefixes, (b) it skipped *.md and docs/* — but the
+# actual leaks lived in tick-reflections-temp.md, qa-audit-2026-04-21.md,
+# docs/incidents/INCIDENT_LOG.md.
+SECRET_PATTERNS=(
+  'ghp_[A-Za-z0-9]{36,}'  # GitHub PAT (classic)
+  'ghs_[A-Za-z0-9]{36,}'  # GitHub App installation token
+  'gho_[A-Za-z0-9]{36,}'  # GitHub OAuth user-to-server
+  'ghu_[A-Za-z0-9]{36,}'  # GitHub OAuth user
+  'ghr_[A-Za-z0-9]{36,}'  # GitHub OAuth refresh
+  'github_pat_[A-Za-z0-9_]{82,}'  # GitHub fine-grained PAT
+  'sk-ant-[A-Za-z0-9_-]{40,}'  # Anthropic API key
+  'sk-proj-[A-Za-z0-9_-]{40,}'  # OpenAI project key
+  'sk-svcacct-[A-Za-z0-9_-]{40,}'  # OpenAI service-account key
+  'sk-cp-[A-Za-z0-9_-]{60,}'  # MiniMax API key (F1088 vector — caught only after the fact)
+  'xox[baprs]-[A-Za-z0-9-]{20,}'  # Slack tokens (bot/app/user/refresh)
+  'AKIA[0-9A-Z]{16}'  # AWS access key ID
+  'ASIA[0-9A-Z]{16}'  # AWS STS temp access key ID
+)
 
 ALL_STAGED=$(git diff --cached --name-only --diff-filter=ACM || true)
 if [ -n "$ALL_STAGED" ]; then
   for f in $ALL_STAGED; do
-    # Skip binary, known safe files, hooks, docs, and markdown
-    if echo "$f" | grep -qE '\.png$|\.jpg$|\.ico$|\.woff|node_modules|\.lock$|\.githooks/|\.md$|docs/'; then
+    # Skip ONLY binary + lockfiles + the hook itself. Markdown +
+    # docs/* are NOT skipped — that was the bug (#1569 leaks were
+    # all in *.md). If a doc legitimately needs a token-shaped
+    # placeholder, use ghs_EXAMPLE_TOKEN_DO_NOT_USE — short enough
+    # to dodge the {36,} length suffix.
+    if echo "$f" | grep -qE '\.png$|\.jpg$|\.ico$|\.woff|node_modules|\.lock$|\.githooks/'; then
       continue
     fi
-    DIFF=$(git diff --cached "$f" 2>/dev/null | grep '^+' | grep -v '^+++' || true)
-    if echo "$DIFF" | grep -qE 'sk-ant-|sk-proj-|ghp_|gho_|AKIA[A-Z0-9]|mol_pk_|cfut_' 2>/dev/null; then
-      echo "❌ POSSIBLE SECRET in $f — do not commit API keys or tokens"
-      ERRORS=$((ERRORS + 1))
-    fi
+    # Keep only added lines. The file header (incl. '+++ b/<path>') is cut
+    # by position — everything before the first '@@' — not by pattern, so
+    # an added line whose own text starts with '+' (a '+ item' markdown
+    # bullet, a staged .patch file) is still scanned.
+    DIFF=$(git diff --cached --no-color --unified=0 -- "$f" 2>/dev/null | sed -n '/^@@/,$p' | grep '^+' || true)
+    [ -z "$DIFF" ] && continue
+    for pattern in "${SECRET_PATTERNS[@]}"; do
+      if echo "$DIFF" | grep -qE "$pattern"; then
+        echo "❌ POSSIBLE SECRET in $f (matched: ${pattern})"
+        echo " The actual matched value is NOT echoed here — round-tripping a"
+        echo " leaked credential into scrollback widens the blast radius."
+        echo " If false positive (test/docs example), use a short placeholder"
+        echo " like ghs_EXAMPLE_TOKEN_DO_NOT_USE that doesn't satisfy the length."
+        ERRORS=$((ERRORS + 1))
+        break
+      fi
+    done
   done
 fi
 
diff --git a/.github/scripts/lint_secret_pattern_drift.py b/.github/scripts/lint_secret_pattern_drift.py
index 6c1b7965..076d2719 100644
--- a/.github/scripts/lint_secret_pattern_drift.py
+++ b/.github/scripts/lint_secret_pattern_drift.py
@@ -41,6 +41,17 @@ CONSUMERS: list[tuple[str, str]] = [
     ),
 ]
 
+# In-repo consumers — paths read locally from the workflow checkout.
+# Read-from-disk avoids the staging→main lag that the URL fetcher
+# would hit (a freshly-edited canonical wouldn't yet be on the
+# consumer's default branch). Same drift semantics, no network.
+LOCAL_CONSUMERS: list[tuple[str, Path]] = [
+    (
+        ".githooks/pre-commit (molecule-core local hook)",
+        Path(".githooks/pre-commit"),
+    ),
+]
+
 # Matches the SECRET_PATTERNS=( ... ) array in either yaml-indented
 # (the canonical workflow's `run:` block) or shell-flat (runtime
 # hook) format. Patterns inside are single-quoted Bash strings; we
@@ -89,6 +100,28 @@ def main() -> int:
     print(f"canonical ({CANONICAL_FILE}): {len(canonical)} patterns")
 
     drift = False
+
+    # In-repo consumers first — these are read from the workflow's own
+    # checkout, so they never lag behind the canonical and a missing
+    # file IS a real error (not a fetch warning).
+    for label, path in LOCAL_CONSUMERS:
+        if not path.is_file():
+            print(f"::error::{label}: file not found at {path}")
+            drift = True
+            continue
+        # Explicit UTF-8: the hook contains non-ASCII text (e.g. "❌").
+        consumer = extract_patterns(path.read_text(encoding="utf-8"), label)
+        missing, extra = diff_patterns(canonical, consumer)
+        if not missing and not extra:
+            print(f" ✓ {label}: aligned ({len(consumer)} patterns)")
+            continue
+        drift = True
+        print(f"::error::DRIFT in {label}:")
+        for p in missing:
+            print(f" - missing from consumer: {p!r}")
+        for p in extra:
+            print(f" - extra in consumer (not in canonical): {p!r}")
+
     for label, url in CONSUMERS:
         try:
             content = fetch(url)
diff --git a/.github/workflows/secret-pattern-drift.yml b/.github/workflows/secret-pattern-drift.yml
index 7d4435fe..a9d8cc94 100644
--- a/.github/workflows/secret-pattern-drift.yml
+++ b/.github/workflows/secret-pattern-drift.yml
@@ -34,6 +34,7 @@ on:
       - ".github/workflows/secret-scan.yml"
       - ".github/workflows/secret-pattern-drift.yml"
       - ".github/scripts/lint_secret_pattern_drift.py"
+      - ".githooks/pre-commit"
   workflow_dispatch:
 
 # GITHUB_TOKEN scoped to read-only. The lint only does git checkout