The bundled pre-commit hook is the runtime-side mirror of the SECRET_PATTERNS array in molecule-core's canonical .github/workflows/secret-scan.yml. The two drifted: the canonical list gained the MiniMax sk-cp- pattern (F1088 vector — caught only after the fact), but this side was never updated. As a result, a workspace developer's local pre-commit hook would let through a sk-cp- token that the org-wide CI scan would then reject — useless friction. This change brings the two pattern lists back into byte-for-byte alignment. The drift is exactly the maintenance gap that task #139's upcoming molecule-core CI lint is designed to surface automatically; this PR closes the gap so the lint passes from day 1. Refs: task #139.
213 lines
8.9 KiB
Bash
213 lines
8.9 KiB
Bash
#!/bin/bash
# pre-commit hook — defense-in-depth checks against agent foot-guns.
#
# Two independent gates, run in order:
#
#   1. Secret scan (ALL repos, including third-party). Refuses any commit
#      whose staged additions contain a recognisable credential. Catches
#      the canonical leak vectors regardless of how the secret arrived in
#      the working tree (npm copying _authToken into package.json, an
#      agent persisting its own GITHUB_TOKEN to a config file, a
#      mis-configured `git clone https://x-access-token:GHS@github.com/...`
#      remote, etc.). Same regex set as the tenant-proxy CI scanner —
#      keep them aligned.
#
#   2. Internal-paths block (Molecule-AI public repos only). Refuses
#      commits that add `research/`, `marketing/`, etc. to the public
#      monorepo. Was the original purpose of this hook; lives here so
#      every agent commit only pays one hook-install + one git config
#      surface area, not two.
#
# Installed via `core.hooksPath` set by molecule_runtime.precommit_hook
# at workspace startup.

set -e

# Skip silently when GIT_AUTHOR_NAME and GIT_COMMITTER_NAME are both unset
# or empty — likely a non-agent context (operator manually running git
# inside the container for debug). Both gates assume the agent profile.
if [ -z "${GIT_AUTHOR_NAME:-}${GIT_COMMITTER_NAME:-}" ]; then
  exit 0
fi

# Skip during rebase / cherry-pick / merge / revert — these REPLAY
# existing commits and the staged file set is whatever was already
# committed upstream. Blocking them forces interactive history rewriting
# that most agents won't do, leaving DIRTY PRs unmergeable. Both gates
# share this skip.
#
# The directory is kept in a private lowercase variable on purpose:
# GIT_DIR is an environment variable git itself reads, so assigning to it
# here could redirect every later git call in this hook.
git_dir=$(git rev-parse --git-dir 2>/dev/null || echo .git)
for state_marker in rebase-merge rebase-apply CHERRY_PICK_HEAD MERGE_HEAD REVERT_HEAD; do
  if [ -e "${git_dir}/${state_marker}" ]; then
    exit 0
  fi
done

# URL of the `origin` remote, or empty when there is none. Read by the
# internal-paths gate to decide whether this repo is one of its targets.
REMOTE=$(git remote get-url origin 2>/dev/null || echo "")

# ─── 1. Secret scan ────────────────────────────────────────────────────
#
# Scans the staged diff (added lines only) for credential patterns.
# Refuses the commit if any pattern matches. The error message names the
# file + the pattern that matched but never echoes the secret value
# itself — avoids round-tripping the leaked credential into terminal
# scrollback / agent context windows.
#
# Pattern set covers the high-value GitHub family (the actual #2090
# incident vector), the most common cloud + LLM provider tokens, and
# AWS access keys. Regex anchored on prefixes that have low false-
# positive rates against agent-generated content (no plain hex blobs).
SECRET_PATTERNS=(
  'ghp_[A-Za-z0-9]{36,}'            # GitHub PAT (classic)
  'ghs_[A-Za-z0-9]{36,}'            # GitHub App installation token
  'gho_[A-Za-z0-9]{36,}'            # GitHub OAuth user-to-server
  'ghu_[A-Za-z0-9]{36,}'            # GitHub OAuth user
  'ghr_[A-Za-z0-9]{36,}'            # GitHub OAuth refresh
  'github_pat_[A-Za-z0-9_]{82,}'    # GitHub fine-grained PAT
  'sk-ant-[A-Za-z0-9_-]{40,}'       # Anthropic API key
  'sk-proj-[A-Za-z0-9_-]{40,}'      # OpenAI project key
  'sk-svcacct-[A-Za-z0-9_-]{40,}'   # OpenAI service-account key
  'sk-cp-[A-Za-z0-9_-]{60,}'        # MiniMax API key (F1088 vector — caught only after the fact)
  'xox[baprs]-[A-Za-z0-9-]{20,}'    # Slack tokens (bot/app/user/refresh)
  'AKIA[0-9A-Z]{16}'                # AWS access key ID
  'ASIA[0-9A-Z]{16}'                # AWS STS temp access key ID
)

# Reduce a unified diff on stdin to its added lines (each still carrying
# the leading `+`).
#
# Per-file header lines (`diff --git`, `index`, `---`, `+++`) always come
# before that file's first `@@` hunk header, so tracking "inside a hunk"
# separates real additions from the `+++ b/path` header. Unlike a
# `^\+[^+]` filter this keeps added lines whose own content starts
# with `+`.
added_lines() {
  awk '
    /^diff --git / { in_hunk = 0; next }
    /^@@ /         { in_hunk = 1; next }
    in_hunk && substr($0, 1, 1) == "+"
  '
}

# Print the added lines of the staged diff. Extra arguments are passed to
# `git diff` (e.g. `-- <path>` to restrict the diff to one file).
staged_additions() {
  git diff --cached --no-color --unified=0 "$@" 2>/dev/null | added_lines
}

# Only check ADDITIONS. This avoids re-flagging unchanged secrets that may
# already exist on disk from a previous commit (those are tracked
# separately by repo scanners; the local hook only enforces "don't add
# new ones").
DIFF=$(staged_additions)

if [ -n "$DIFF" ]; then
  SECRET_HITS=""

  # Staged paths, NUL-delimited so names with spaces, glob characters or
  # non-ASCII bytes arrive intact (`-z` also turns off git's path quoting).
  # --no-renames reports a renamed file as an addition of its new path, so
  # it is still a candidate for attribution.
  SCAN_FILES=()
  while IFS= read -r -d '' f; do
    SCAN_FILES+=("$f")
  done < <(git diff --cached --name-only -z --no-renames --diff-filter=AM 2>/dev/null)

  for pattern in "${SECRET_PATTERNS[@]}"; do
    # Cheap check against the whole diff first; only walk the staged
    # files individually (for file attribution) when the pattern hit.
    # `git diff --cached --name-only -G<pattern>` would be cleaner but
    # -G operates on hunks not added lines, and we explicitly want only
    # added lines.
    if grep -qE -e "$pattern" <<<"$DIFF"; then
      attributed=0
      for f in "${SCAN_FILES[@]}"; do
        if staged_additions -- "$f" | grep -qE -e "$pattern"; then
          SECRET_HITS+=" - ${f} (matched: ${pattern})"$'\n'
          attributed=1
        fi
      done
      # Fail closed: the pattern IS in the staged additions, so the commit
      # must be refused even when no single file could be blamed for it.
      if [ "$attributed" -eq 0 ]; then
        SECRET_HITS+=" - <staged change, file not identified> (matched: ${pattern})"$'\n'
      fi
    fi
  done

  if [ -n "$SECRET_HITS" ]; then
    {
      echo
      echo "Refusing commit: staged additions contain credential-shaped strings."
      echo
      echo "Offending files:"
      # '%s' keeps file names out of the format string (a `%` or `\` in a
      # path must be printed literally).
      printf '%s' "$SECRET_HITS"
      echo
      echo "The actual matched values are NOT printed here, deliberately —"
      echo "echoing them back into scrollback / agent context would round-trip"
      echo "the leak into another surface."
      echo
      echo "Recovery:"
      echo " 1. git restore --staged <file>"
      echo " 2. Remove the secret from <file> and replace with an env var"
      echo " reference (e.g. \${GITHUB_TOKEN}). Never inline credentials."
      echo " 3. If the credential was already exposed (committed locally OR"
      echo " visible to the agent in any prior tool output), treat it as"
      echo " compromised — rotate it immediately, do not just remove it."
      echo " 4. git add <file> && git commit"
      echo
      echo "If the match is a false positive (test fixture / docs example),"
      echo "use a clearly-fake placeholder like ghs_EXAMPLE_TOKEN_DO_NOT_USE"
      echo "that doesn't satisfy the length suffix."
      echo
      echo "Hook source: molecule_runtime/scripts/pre-commit-checks.sh"
    } >&2
    exit 1
  fi
fi

# ─── 2. Internal-paths block (public Molecule-AI repos only) ──────────
#
# Despite SHARED_RULES.md, .gitignore, and a CI gate, agents still try
# to `git add /research/...` from their cwd in `molecule-monorepo`. Each
# leak attempt costs ~5 cycles (PR opens, CI fails, agent retries with
# workaround) and pollutes git history with reverts. This gate converts
# the failure mode from "PR fails" → "commit refused at the agent's
# local git" — instant feedback with the redirect command in the same
# error message.
case "$REMOTE" in
  *Molecule-AI/molecule-monorepo*|*Molecule-AI/molecule-core*)
    ;;
  *)
    # Non-target repo (internal, plugins, templates, third-party) — let it through.
    exit 0
    ;;
esac

# Probe once so a failing `git diff` still aborts the hook with git's own
# status; the process substitution below would otherwise hide the failure
# and look like "nothing staged".
git diff --cached --name-only --no-renames --diff-filter=AM >/dev/null || exit

# Staged paths, NUL-delimited: `-z` disables git's quoting of unusual
# names (a quoted `"research/..."` would never match `^research/`) and
# keeps names containing spaces or glob characters in one piece.
# --no-renames reports a file renamed INTO a forbidden directory as an
# addition of its new path, so it is checked too.
STAGED_PATHS=()
while IFS= read -r -d '' staged_path; do
  STAGED_PATHS+=("$staged_path")
done < <(git diff --cached --name-only -z --no-renames --diff-filter=AM)

if [ "${#STAGED_PATHS[@]}" -eq 0 ]; then
  exit 0
fi

FORBIDDEN_PATTERNS=(
  "^research/"
  "^marketing/"
  "^docs/marketing/"
  "^comment-[0-9]+\.json$"
  "^test-pmm.*\.(txt|md)$"
  "^tick-reflections.*\.(txt|md)$"
  ".*-temp\.(md|txt)$"
)

# Collect one report line per offending path, naming the first pattern
# it matched.
OFFENDING=""
for staged_path in "${STAGED_PATHS[@]}"; do
  for pattern in "${FORBIDDEN_PATTERNS[@]}"; do
    # Unquoted right-hand side: bash treats it as an extended regex.
    if [[ "$staged_path" =~ $pattern ]]; then
      OFFENDING+=" - ${staged_path} (matched: ${pattern})"$'\n'
      break
    fi
  done
done

if [ -z "$OFFENDING" ]; then
  exit 0
fi

{
  echo
  echo "Refusing commit: internal-flavored paths cannot live in the public monorepo."
  echo
  echo "Offending files:"
  # '%s' keeps file names out of the format string (a `%` or `\` in a
  # path must be printed literally).
  printf '%s' "$OFFENDING"
  echo
  echo "These belong in Molecule-AI/internal. Redirect:"
  echo
  echo " mkdir -p ~/repos"
  echo " test -d ~/repos/internal || gh repo clone Molecule-AI/internal ~/repos/internal"
  echo " cd ~/repos/internal"
  echo " git pull origin main"
  echo " git checkout -b <my-role>/<topic>-<date>"
  echo " mkdir -p <area> # research, marketing, runbooks, etc."
  echo " # move your file from the monorepo into <area>/<slug>.md"
  echo " git add <area>/<slug>.md"
  echo " git commit -m '<area>: add <slug>'"
  echo " git push -u origin HEAD"
  echo " gh pr create --base main --fill"
  echo
  echo "If your file is genuinely public-facing (final blog post, public"
  echo "tutorial, customer-shippable doc), use one of these monorepo paths"
  echo "instead — these are not blocked:"
  echo " - docs/blog/<slug>.md"
  echo " - docs/tutorials/<slug>.md"
  echo " - docs/devrel/<slug>.md"
  echo " - docs/api/<slug>.md"
  echo
  echo "If you legitimately need a new top-level path that matches a"
  echo "forbidden pattern, edit:"
  echo " .github/workflows/block-internal-paths.yml"
  echo "with reviewer signoff and a public-facing justification — do NOT"
  echo "work around the gate by renaming."
  echo
  echo "Hook source: molecule_runtime/scripts/pre-commit-checks.sh"
} >&2

exit 1