molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh

#!/bin/bash
# pre-commit hook — defense-in-depth checks against agent foot-guns.
#
# Two independent gates, run in order:
#
#   1. Secret scan (ALL repos, including third-party). Refuses any commit
#      whose staged additions contain a recognisable credential. Catches
#      the canonical leak vectors regardless of how the secret arrived in
#      the working tree (npm copying _authToken into package.json, an
#      agent persisting its own GITHUB_TOKEN to a config file, a
#      mis-configured `git clone https://x-access-token:GHS@github.com/...`
#      remote, etc.). Same regex set as the tenant-proxy CI scanner —
#      keep them aligned.
#
#   2. Internal-paths block (Molecule-AI public repos only). Refuses
#      commits that add `research/`, `marketing/`, etc. to the public
#      monorepo. This was the original purpose of this hook; it lives
#      here so every agent commit only pays for one hook install and
#      one git-config surface area, not two.
#
# Installed via `core.hooksPath` set by molecule_runtime.precommit_hook
# at workspace startup.

set -e

# Neither gate applies outside the agent profile. When no author AND no
# committer identity is exported we are most likely in a non-agent
# context (e.g. an operator running git by hand inside the container to
# debug), so bow out quietly.
if [[ -z "${GIT_AUTHOR_NAME:-}" && -z "${GIT_COMMITTER_NAME:-}" ]]; then
    exit 0
fi

# Rebase / cherry-pick / merge / revert only REPLAY commits that already
# exist, so the staged file set is whatever was committed upstream.
# Refusing those would force interactive history rewriting that most
# agents won't do, leaving DIRTY PRs unmergeable. Git records each of
# these operations as a marker inside the git dir; if any marker is
# present, skip both gates.
# NOTE(review): GIT_DIR is also the name of git's own environment
# variable — the name is kept as-is here; confirm nothing relies on it
# before renaming.
GIT_DIR=$(git rev-parse --git-dir 2>/dev/null || echo .git)
for replay_marker in rebase-merge rebase-apply CHERRY_PICK_HEAD MERGE_HEAD REVERT_HEAD; do
    [[ ! -e "${GIT_DIR}/${replay_marker}" ]] || exit 0
done

# URL of the `origin` remote, or the empty string when it cannot be
# resolved (no such remote, not a repository, ...).
if ! REMOTE=$(git remote get-url origin 2>/dev/null); then
    REMOTE=""
fi

# ─── 1. Secret scan ────────────────────────────────────────────────────
#
# Scans the staged diff (added lines only) for credential patterns.
# Refuses the commit if any pattern matches. The error message names the
# file + the pattern that matched but never echoes the secret value
# itself — avoids round-tripping the leaked credential into terminal
# scrollback / agent context windows.
#
# Pattern set covers the high-value GitHub family (the actual #2090
# incident vector), the most common cloud + LLM provider tokens, and
# AWS access keys. Regex anchored on prefixes that have low false-
# positive rates against agent-generated content (no plain hex blobs).
SECRET_PATTERNS=(
    'ghp_[A-Za-z0-9]{36,}'           # GitHub PAT (classic)
    'ghs_[A-Za-z0-9]{36,}'           # GitHub App installation token
    'gho_[A-Za-z0-9]{36,}'           # GitHub OAuth user-to-server
    'ghu_[A-Za-z0-9]{36,}'           # GitHub OAuth user
    'ghr_[A-Za-z0-9]{36,}'           # GitHub OAuth refresh
    'github_pat_[A-Za-z0-9_]{82,}'   # GitHub fine-grained PAT
    'sk-ant-[A-Za-z0-9_-]{40,}'      # Anthropic API key
    'sk-proj-[A-Za-z0-9_-]{40,}'     # OpenAI project key
    'sk-svcacct-[A-Za-z0-9_-]{40,}'  # OpenAI service-account key
    'sk-cp-[A-Za-z0-9_-]{60,}'       # MiniMax API key (F1088 vector — caught only after the fact)
    'xox[baprs]-[A-Za-z0-9-]{20,}'   # Slack tokens (bot/app/user/refresh)
    'AKIA[0-9A-Z]{16}'               # AWS access key ID
    'ASIA[0-9A-Z]{16}'               # AWS STS temp access key ID
)

# added_lines_from_diff — filter a unified diff down to its added lines.
#
# Input:   unified diff text (as printed by `git diff`) on stdin.
# Output:  every added line, leading `+` included, on stdout.
#
# The filter tracks diff structure instead of pattern-matching on the
# `+++` file header: a line counts as an addition only when it starts
# with `+` AND sits inside a hunk (after an `@@` header, before the next
# `diff --git` header). The `+++ b/<file>` header is outside any hunk,
# so it is dropped, while added lines whose own content begins with `+`
# (rendered as `++…` / `+++…`) are kept — a plain `^\+[^+]` grep would
# silently skip those and let a secret on such a line through.
added_lines_from_diff() {
    awk '
        /^diff --git / { in_hunk = 0; next }
        /^@@/          { in_hunk = 1; next }
        in_hunk && substr($0, 1, 1) == "+"
    '
}

# Only check ADDITIONS. This avoids re-flagging unchanged secrets that
# may already exist on disk from a previous commit (those are tracked
# separately by repo scanners; the local hook only enforces "don't add
# new ones").
DIFF=$(git diff --cached --no-color --unified=0 2>/dev/null | added_lines_from_diff || true)

if [ -n "$DIFF" ]; then
    # One entry per offending (file, pattern) pair. Kept in an array and
    # printed with a fixed format string so file names containing `%` or
    # backslashes cannot be interpreted by printf.
    SECRET_HITS=()
    for pattern in "${SECRET_PATTERNS[@]}"; do
        # Cheap global gate first; only patterns that match somewhere in
        # the staged additions pay for per-file attribution.
        if ! grep -qE -e "$pattern" <<<"$DIFF"; then
            continue
        fi

        attributed=0
        # Walk staged files individually for the file attribution.
        #   * `-z` yields NUL-delimited, unquoted names, so names with
        #     spaces, glob characters or non-ASCII bytes stay intact.
        #   * ACMRT (everything except deletions) so a renamed, copied
        #     or type-changed file that gained a secret is attributed.
        #   * --literal-pathspecs so a name is never read as a glob or
        #     pathspec magic.
        while IFS= read -r -d '' f; do
            if git --literal-pathspecs diff --cached --no-color --unified=0 -- "$f" 2>/dev/null \
                | added_lines_from_diff \
                | grep -qE -e "$pattern"; then
                SECRET_HITS+=("  - ${f}  (matched: ${pattern})")
                attributed=1
            fi
        done < <(git diff --cached --name-only -z --diff-filter=ACMRT 2>/dev/null)

        # Fail closed: the pattern is present in the staged additions
        # even though no single file could be pinned down.
        if [ "$attributed" -eq 0 ]; then
            SECRET_HITS+=("  - (file could not be identified)  (matched: ${pattern})")
        fi
    done

    if [ "${#SECRET_HITS[@]}" -gt 0 ]; then
        {
            echo
            echo "Refusing commit: staged additions contain credential-shaped strings."
            echo
            echo "Offending files:"
            printf '%s\n' "${SECRET_HITS[@]}"
            echo
            echo "The actual matched values are NOT printed here, deliberately —"
            echo "echoing them back into scrollback / agent context would round-trip"
            echo "the leak into another surface."
            echo
            echo "Recovery:"
            echo "  1. git restore --staged <file>"
            echo "  2. Remove the secret from <file> and replace with an env var"
            echo "     reference (e.g. \${GITHUB_TOKEN}). Never inline credentials."
            echo "  3. If the credential was already exposed (committed locally OR"
            echo "     visible to the agent in any prior tool output), treat it as"
            echo "     compromised — rotate it immediately, do not just remove it."
            echo "  4. git add <file> && git commit"
            echo
            echo "If the match is a false positive (test fixture / docs example),"
            echo "use a clearly-fake placeholder like ghs_EXAMPLE_TOKEN_DO_NOT_USE"
            echo "that doesn't satisfy the length suffix."
            echo
            echo "Hook source: molecule_runtime/scripts/pre-commit-checks.sh"
        } >&2
        exit 1
    fi
fi

# ─── 2. Internal-paths block (public Molecule-AI repos only) ──────────
#
# Despite SHARED_RULES.md, .gitignore, and a CI gate, agents still try
# to `git add /research/...` from their cwd in `molecule-monorepo`. Each
# leak attempt costs ~5 cycles (PR opens, CI fails, agent retries with
# workaround) and pollutes git history with reverts. This gate converts
# the failure mode from "PR fails" → "commit refused at the agent's
# local git" — instant feedback with the redirect command in the same
# error message.
#
# Reads $REMOTE (the origin URL resolved earlier in this script) to
# decide whether the gate applies. Always terminates the hook: exit 0
# when nothing is blocked, exit 1 (with guidance on stderr) otherwise.
case "$REMOTE" in
    *Molecule-AI/molecule-monorepo*|*Molecule-AI/molecule-core*)
        ;;
    *)
        # Non-target repo (internal, plugins, templates, third-party) — let it through.
        exit 0
        ;;
esac

# Extended regexes matched against each staged path (repo-relative).
FORBIDDEN_PATTERNS=(
    "^research/"
    "^marketing/"
    "^docs/marketing/"
    "^comment-[0-9]+\.json$"
    "^test-pmm.*\.(txt|md)$"
    "^tick-reflections.*\.(txt|md)$"
    ".*-temp\.(md|txt)$"
)

# One entry per blocked path. Stored in an array and printed with a
# fixed format string so a path containing `%` or a backslash cannot be
# interpreted by printf.
OFFENDING=()

# Enumerate every staged path that will exist after the commit:
#   * ACMRT = everything except deletions. Renames and copies are
#     reported under their destination name, so `git mv notes.md
#     research/notes.md` is checked instead of slipping past an
#     added/modified-only filter.
#   * `-z` gives NUL-delimited, unquoted names: paths with spaces are
#     not split into bogus fragments, and non-ASCII paths are not
#     wrapped in quotes that would defeat the `^` anchors.
while IFS= read -r -d '' staged_path; do
    for pattern in "${FORBIDDEN_PATTERNS[@]}"; do
        # $pattern is deliberately unquoted so it is treated as a regex.
        if [[ "$staged_path" =~ $pattern ]]; then
            OFFENDING+=("  - ${staged_path}  (matched: ${pattern})")
            break   # first matching pattern is enough for the report
        fi
    done
done < <(git diff --cached --name-only -z --diff-filter=ACMRT)

if [ "${#OFFENDING[@]}" -eq 0 ]; then
    exit 0
fi

{
    echo
    echo "Refusing commit: internal-flavored paths cannot live in the public monorepo."
    echo
    echo "Offending files:"
    printf '%s\n' "${OFFENDING[@]}"
    echo
    echo "These belong in Molecule-AI/internal. Redirect:"
    echo
    echo "  mkdir -p ~/repos"
    echo "  test -d ~/repos/internal || gh repo clone Molecule-AI/internal ~/repos/internal"
    echo "  cd ~/repos/internal"
    echo "  git pull origin main"
    echo "  git checkout -b <my-role>/<topic>-<date>"
    echo "  mkdir -p <area>            # research, marketing, runbooks, etc."
    echo "  # move your file from the monorepo into <area>/<slug>.md"
    echo "  git add <area>/<slug>.md"
    echo "  git commit -m '<area>: add <slug>'"
    echo "  git push -u origin HEAD"
    echo "  gh pr create --base main --fill"
    echo
    echo "If your file is genuinely public-facing (final blog post, public"
    echo "tutorial, customer-shippable doc), use one of these monorepo paths"
    echo "instead — these are not blocked:"
    echo "  - docs/blog/<slug>.md"
    echo "  - docs/tutorials/<slug>.md"
    echo "  - docs/devrel/<slug>.md"
    echo "  - docs/api/<slug>.md"
    echo
    echo "If you legitimately need a new top-level path that matches a"
    echo "forbidden pattern, edit:"
    echo "  .github/workflows/block-internal-paths.yml"
    echo "with reviewer signoff and a public-facing justification — do NOT"
    echo "work around the gate by renaming."
    echo
    echo "Hook source: molecule_runtime/scripts/pre-commit-checks.sh"
} >&2

exit 1