#!/bin/bash
# pre-commit hook — defense-in-depth checks against agent foot-guns.
#
# Two independent gates, run in order:
#
#   1. Secret scan (ALL repos, including third-party). Refuses any commit
#      whose staged additions contain a recognisable credential. Catches
#      the canonical leak vectors regardless of how the secret arrived in
#      the working tree (npm copying _authToken into package.json, an
#      agent persisting its own GITHUB_TOKEN to a config file, a
#      mis-configured `git clone https://x-access-token:GHS@github.com/...`
#      remote, etc.). Same regex set as the tenant-proxy CI scanner —
#      keep them aligned.
#
#   2. Internal-paths block (Molecule-AI public repos only). Refuses
#      commits that add `research/`, `marketing/`, etc. to the public
#      monorepo. Was the original purpose of this hook; lives here so
#      every agent commit only pays one hook-install + one git config
#      surface area, not two.
#
# Installed via `core.hooksPath` set by molecule_runtime.precommit_hook
# at workspace startup.
#
# Exit status: 0 lets the commit proceed, 1 refuses it. All diagnostics
# go to stderr.

set -e

# ─── Pattern tables ────────────────────────────────────────────────────
#
# Pattern set covers the high-value GitHub family (the actual #2090
# incident vector), the most common cloud + LLM provider tokens, and
# AWS access keys. Regex anchored on prefixes that have low false-
# positive rates against agent-generated content (no plain hex blobs).
SECRET_PATTERNS=(
  'ghp_[A-Za-z0-9]{36,}'            # GitHub PAT (classic)
  'ghs_[A-Za-z0-9]{36,}'            # GitHub App installation token
  'gho_[A-Za-z0-9]{36,}'            # GitHub OAuth user-to-server
  'ghu_[A-Za-z0-9]{36,}'            # GitHub OAuth user
  'ghr_[A-Za-z0-9]{36,}'            # GitHub OAuth refresh
  'github_pat_[A-Za-z0-9_]{82,}'    # GitHub fine-grained PAT
  'sk-ant-[A-Za-z0-9_-]{40,}'       # Anthropic API key
  'sk-proj-[A-Za-z0-9_-]{40,}'      # OpenAI project key
  'sk-svcacct-[A-Za-z0-9_-]{40,}'   # OpenAI service-account key
  'sk-cp-[A-Za-z0-9_-]{60,}'        # MiniMax API key (F1088 vector — caught only after the fact)
  'xox[baprs]-[A-Za-z0-9-]{20,}'    # Slack tokens (bot/app/user/refresh)
  'AKIA[0-9A-Z]{16}'                # AWS access key ID
  'ASIA[0-9A-Z]{16}'                # AWS STS temp access key ID
)
readonly SECRET_PATTERNS

# Extended regexes matched against repo-relative staged paths.
FORBIDDEN_PATTERNS=(
  "^research/"
  "^marketing/"
  "^docs/marketing/"
  "^comment-[0-9]+\.json$"
  "^test-pmm.*\.(txt|md)$"
  "^tick-reflections.*\.(txt|md)$"
  ".*-temp\.(md|txt)$"
)
readonly FORBIDDEN_PATTERNS

# ─── Helpers ───────────────────────────────────────────────────────────

#######################################
# Filter a unified diff down to the text of its added lines.
# Only lines inside a hunk (after an `@@ ` header) count, so the
# `+++ b/...` file header is never mistaken for content and an added
# line whose own text starts with `+` is still kept.
# Inputs:  unified diff on stdin
# Outputs: added lines on stdout, leading `+` marker stripped
#######################################
added_lines() {
  LC_ALL=C awk '
    /^diff --git / { in_hunk = 0; next }
    /^@@ /         { in_hunk = 1; next }
    in_hunk && substr($0, 1, 1) == "+" { print substr($0, 2) }
  '
}

#######################################
# Test whether stdin contains a match for an extended regex.
# LC_ALL=C keeps bracket ranges such as [A-Z] byte-wise regardless of
# the caller's locale.
# Arguments: $1 - extended regular expression
# Returns:   0 on match, non-zero otherwise
#######################################
has_match() {
  LC_ALL=C grep -qE -- "$1"
}

#######################################
# Check a path against FORBIDDEN_PATTERNS.
# Arguments: $1 - repo-relative path
# Outputs:   the first matching pattern on stdout
# Returns:   0 if the path is forbidden, 1 otherwise
#######################################
path_is_forbidden() {
  local candidate=$1
  local pattern
  for pattern in "${FORBIDDEN_PATTERNS[@]}"; do
    if [[ "$candidate" =~ $pattern ]]; then
      printf '%s\n' "$pattern"
      return 0
    fi
  done
  return 1
}

#######################################
# Detect a rebase / cherry-pick / merge / revert in progress.
# These REPLAY existing commits and the staged file set is whatever was
# already committed upstream. Blocking them forces interactive history
# rewriting that most agents won't do, leaving DIRTY PRs unmergeable.
# Returns: 0 if a replay operation is in progress, 1 otherwise
#######################################
replay_in_progress() {
  # Deliberately a local, not GIT_DIR: assigning GIT_DIR would change the
  # repository every child `git` invocation operates on.
  local git_dir marker
  git_dir=$(git rev-parse --git-dir 2>/dev/null || echo .git)
  for marker in rebase-merge rebase-apply CHERRY_PICK_HEAD MERGE_HEAD REVERT_HEAD; do
    if [[ -e "${git_dir}/${marker}" ]]; then
      return 0
    fi
  done
  return 1
}

# ─── 1. Secret scan ────────────────────────────────────────────────────

#######################################
# Scan the staged diff (added lines only) for credential patterns.
# Only ADDITIONS are checked: unchanged secrets that already exist from a
# previous commit are tracked separately by repo scanners; the local hook
# only enforces "don't add new ones".
# The error message names the file + the pattern that matched but never
# echoes the secret value itself — avoids round-tripping the leaked
# credential into terminal scrollback / agent context windows.
# Globals: SECRET_PATTERNS (read)
# Outputs: refusal message on stderr when a pattern matches
# Returns: 0 if clean, 1 if the commit must be refused
#######################################
scan_staged_secrets() {
  # --literal-pathspecs: staged file names are data, not glob patterns.
  local -a diff_cmd=(git --literal-pathspecs diff --cached --no-color
    --no-ext-diff --unified=0)
  local -a matched=() diff_paths=()
  local additions file_additions pattern status staged_path hits=''

  additions=$("${diff_cmd[@]}" 2>/dev/null | added_lines)
  [[ -n "$additions" ]] || return 0

  # Fast path: one pass over the whole diff decides whether any per-file
  # work is needed at all, and which patterns are worth attributing.
  for pattern in "${SECRET_PATTERNS[@]}"; do
    if has_match "$pattern" <<<"$additions"; then
      matched+=("$pattern")
    fi
  done
  (( ${#matched[@]} > 0 )) || return 0

  # Attribute each matched pattern to a file. Entries are NUL-delimited so
  # paths containing spaces or other odd characters survive intact. Every
  # status except "deleted" is walked; for renames/copies the record holds
  # <old> <new>, and both are passed to `git diff` so rename detection
  # still pairs them and only the genuinely added lines are examined.
  while IFS= read -r -d '' -u 3 status; do
    IFS= read -r -d '' -u 3 staged_path || break
    diff_paths=("$staged_path")
    case "$status" in
      R* | C*)
        IFS= read -r -d '' -u 3 staged_path || break
        diff_paths+=("$staged_path")
        ;;
    esac
    file_additions=$("${diff_cmd[@]}" -- "${diff_paths[@]}" 2>/dev/null | added_lines)
    [[ -n "$file_additions" ]] || continue
    for pattern in "${matched[@]}"; do
      if has_match "$pattern" <<<"$file_additions"; then
        hits+=" - ${staged_path} (matched: ${pattern})"$'\n'
      fi
    done
  done 3< <(git diff --cached --name-status -z --diff-filter=ACMRT 2>/dev/null)

  # Fail closed: the staged diff matched, so refuse even if no single
  # file could be blamed for it.
  if [[ -z "$hits" ]]; then
    for pattern in "${matched[@]}"; do
      hits+=" - (file could not be attributed) (matched: ${pattern})"$'\n'
    done
  fi

  {
    cat <<'EOF'

Refusing commit: staged additions contain credential-shaped strings.

Offending files:
EOF
    # '%s' keeps `%` and `\` in file names from being read as a format.
    printf '%s' "$hits"
    cat <<'EOF'

The actual matched values are NOT printed here, deliberately —
echoing them back into scrollback / agent context would round-trip
the leak into another surface.

Recovery:
  1. git restore --staged <file>
  2. Remove the secret from <file> and replace with an env var
     reference (e.g. ${GITHUB_TOKEN}). Never inline credentials.
  3. If the credential was already exposed (committed locally OR
     visible to the agent in any prior tool output), treat it as
     compromised — rotate it immediately, do not just remove it.
  4. git add <file> && git commit

If the match is a false positive (test fixture / docs example),
use a clearly-fake placeholder like ghs_EXAMPLE_TOKEN_DO_NOT_USE
that doesn't satisfy the length suffix.

Hook source: molecule_runtime/scripts/pre-commit-checks.sh
EOF
  } >&2
  return 1
}

# ─── 2. Internal-paths block (public Molecule-AI repos only) ──────────

#######################################
# Refuse commits that add internal-flavored paths to the public repos.
# Despite SHARED_RULES.md, .gitignore, and a CI gate, agents still try
# to `git add /research/...` from their cwd in `molecule-monorepo`. Each
# leak attempt costs ~5 cycles (PR opens, CI fails, agent retries with
# workaround) and pollutes git history with reverts. This gate converts
# the failure mode from "PR fails" → "commit refused at the agent's
# local git" — instant feedback with the redirect command in the same
# error message.
# Globals: FORBIDDEN_PATTERNS (read, via path_is_forbidden)
# Outputs: refusal message on stderr when a staged path is forbidden
# Returns: 0 if allowed (or not a target repo), 1 if refused
#######################################
check_internal_paths() {
  local remote_url staged_path pattern offending=''

  remote_url=$(git remote get-url origin 2>/dev/null || echo "")
  case "$remote_url" in
    *Molecule-AI/molecule-monorepo* | *Molecule-AI/molecule-core*) ;;
    *)
      # Non-target repo (internal, plugins, templates, third-party) — let it through.
      return 0
      ;;
  esac

  # --no-renames reports a rename as delete + add, so moving an existing
  # file INTO a forbidden directory is checked like any other addition.
  # -z keeps paths with whitespace in one piece.
  while IFS= read -r -d '' -u 3 staged_path; do
    if pattern=$(path_is_forbidden "$staged_path"); then
      offending+=" - ${staged_path} (matched: ${pattern})"$'\n'
    fi
  done 3< <(git diff --cached --no-renames --name-only -z --diff-filter=ACMRT)

  [[ -n "$offending" ]] || return 0

  {
    cat <<'EOF'

Refusing commit: internal-flavored paths cannot live in the public monorepo.

Offending files:
EOF
    printf '%s' "$offending"
    cat <<'EOF'

These belong in Molecule-AI/internal. Redirect:

  mkdir -p ~/repos
  test -d ~/repos/internal || gh repo clone Molecule-AI/internal ~/repos/internal
  cd ~/repos/internal
  git pull origin main
  git checkout -b <prefix>/<topic>-<suffix>
  mkdir -p <dir>    # research, marketing, runbooks, etc.
  # move your file from the monorepo into <dir>/<name>.md
  git add <dir>/<name>.md
  git commit -m '<dir>: add <name>'
  git push -u origin HEAD
  gh pr create --base main --fill

If your file is genuinely public-facing (final blog post, public
tutorial, customer-shippable doc), use one of these monorepo paths
instead — these are not blocked:
  - docs/blog/<name>.md
  - docs/tutorials/<name>.md
  - docs/devrel/<name>.md
  - docs/api/<name>.md

If you legitimately need a new top-level path that matches a
forbidden pattern, edit:
  .github/workflows/block-internal-paths.yml
with reviewer signoff and a public-facing justification — do NOT
work around the gate by renaming.

Hook source: molecule_runtime/scripts/pre-commit-checks.sh
EOF
  } >&2
  return 1
}

# ─── Entry point ───────────────────────────────────────────────────────

#######################################
# Run both gates in order and report the hook verdict.
# Returns: 0 to allow the commit, 1 to refuse it
#######################################
main() {
  # Skip silently when GIT_AUTHOR_NAME/COMMITTER_NAME is unset — likely a
  # non-agent context (operator manually running git inside the container
  # for debug). Both gates assume the agent profile.
  if [[ -z "${GIT_AUTHOR_NAME:-}${GIT_COMMITTER_NAME:-}" ]]; then
    return 0
  fi

  # Both gates share the replay skip.
  if replay_in_progress; then
    return 0
  fi

  scan_staged_secrets || return 1
  check_internal_paths || return 1
  return 0
}

# Last command of the script: its status is the hook's exit status.
main "$@"