diff --git a/molecule_runtime/precommit_hook.py b/molecule_runtime/precommit_hook.py index 78a39cd..0f511b9 100644 --- a/molecule_runtime/precommit_hook.py +++ b/molecule_runtime/precommit_hook.py @@ -1,35 +1,40 @@ -"""Pre-commit hook installer — block internal-flavored paths in public monorepo. +"""Pre-commit hook installer — secret scan + internal-paths block. Companion to ``credential_helper`` (#1933). Lifts the per-template wiring that would otherwise need to live in each Dockerfile + entrypoint.sh: copy the hook script to a known location, set ``core.hooksPath`` so git -finds it, and now every ``git commit`` inside the public monorepo gets a -local refusal with the redirect command in the same error. +finds it, and now every ``git commit`` runs through two defense-in-depth +gates with the recovery command printed in the same error. + +Two gates, single hook file +=========================== + +The bundled ``pre-commit-checks.sh`` runs: + +1. **Secret scan** — ALL repos. Refuses any commit whose staged + additions contain a recognisable credential (GitHub PAT/installation + tokens, Anthropic / OpenAI / Slack / AWS keys). Catches the leak + regardless of how the secret arrived in the working tree (npm + copying ``_authToken`` into ``package.json``, an agent persisting + its own ``GITHUB_TOKEN`` to a config file, etc.). + +2. **Internal-paths block** — ``Molecule-AI/molecule-monorepo`` and + ``Molecule-AI/molecule-core`` only. Refuses commits that add + ``research/``, ``marketing/``, etc. to the public monorepo with a + redirect to ``Molecule-AI/internal``. + +Both gates skip during rebase / cherry-pick / merge / revert (they +replay existing commits and blocking would force interactive history +rewriting). Why a local hook is the right layer =================================== -Agents try to ``git add /research/foo.md`` despite SHARED_RULES, the -``.gitignore`` patterns, and the CI gate. Each leak attempt costs ~5 -cycles: PR opens, CI fails (now that ``Block forbidden paths`` is -required), agent retries with workaround. Pollutes git history with -reverts. - -A pre-commit hook converts the failure from "PR opens then fails 5 -minutes later" to "commit refused immediately, with the recovery -command printed in the same response cycle the agent ran ``git commit`` -in." Agents act on what's in the current response context — putting the -redirect command literally in the error message they see is the highest -density of feedback we can provide. - -Scoped to public monorepo only -============================== - -The hook checks ``git remote get-url origin`` and only enforces when the -repo is ``Molecule-AI/molecule-monorepo`` or ``Molecule-AI/molecule-core``. -Inside ``Molecule-AI/internal`` (where ``research/``, ``marketing/`` -etc. legitimately belong) the hook is a no-op. Same for plugin / -template / third-party repos. +Agents act on what's in the current response context. A pre-commit hook +converts "PR opens then fails 5 minutes later" into "commit refused +immediately, with the recovery command printed in the same response +cycle the agent ran ``git commit`` in." That's the highest density of +feedback we can provide. Idempotence + non-clobber ========================= @@ -53,7 +58,7 @@ log = logging.getLogger(__name__) _HOOKS_DIR = Path(os.environ.get("HOME", "/home/agent")) / ".molecule-runtime" / "git-hooks" _HOOK_SCRIPT_NAME = "pre-commit" -_BUNDLED_SCRIPT = "pre-commit-block-internal-paths.sh" +_BUNDLED_SCRIPT = "pre-commit-checks.sh" def _existing_hooks_path() -> str | None: diff --git a/molecule_runtime/scripts/pre-commit-block-internal-paths.sh b/molecule_runtime/scripts/pre-commit-block-internal-paths.sh deleted file mode 100644 index 29a49d8..0000000 --- a/molecule_runtime/scripts/pre-commit-block-internal-paths.sh +++ /dev/null @@ -1,126 +0,0 @@ -#!/bin/bash -# pre-commit hook — refuse commits that add internal-flavored paths to the -# public monorepo. Only enforces in the Molecule-AI public repos; no-op in -# every other repo (including the canonical internal one) so agents can -# still write `research/foo.md` inside `Molecule-AI/internal`. -# -# Why this hook exists -# ==================== -# -# Despite SHARED_RULES.md, .gitignore, and a CI gate, agents still try to -# `git add /research/...` from their cwd in `molecule-monorepo`. Each leak -# attempt costs ~5 cycles (PR opens, CI fails, agent retries with -# workaround) and pollutes git history with reverts. This hook converts -# the failure mode from "PR fails" → "commit refused at the agent's local -# git" — instant feedback with the redirect command in the same error -# message. -# -# Installed via `core.hooksPath` set by molecule_runtime.precommit_hook -# at workspace startup. - -set -e - -# Skip silently when GIT_AUTHOR_EMAIL/USER is unset — likely a non-agent -# context (operator manually running git inside the container for debug). -# Agents always have the provisioner-set GIT_AUTHOR_NAME. -if [ -z "${GIT_AUTHOR_NAME:-}${GIT_COMMITTER_NAME:-}" ]; then - exit 0 -fi - -# Skip during rebase / cherry-pick / merge / revert — these REPLAY existing -# commits and the staged file set is whatever was already committed -# upstream. Blocking those forces the agent to manually rewrite history -# (interactive rebase + manual file deletion + commit amend) which most -# agents won't do — net effect was 15+ DIRTY PRs sitting unmergeable on -# molecule-core after the hook landed (cycle 107 trace, 2026-04-24). -# -# Detection via .git state directories. These exist only during the -# corresponding operation and get cleaned up at the end. -GIT_DIR=$(git rev-parse --git-dir 2>/dev/null || echo .git) -for state_dir in rebase-merge rebase-apply CHERRY_PICK_HEAD MERGE_HEAD REVERT_HEAD; do - if [ -e "${GIT_DIR}/${state_dir}" ]; then - exit 0 - fi -done - -# Determine if we're in a public Molecule-AI repo. `git remote get-url` -# returns nothing in repos without a remote (fine — exit clean). -REMOTE=$(git remote get-url origin 2>/dev/null || echo "") - -case "$REMOTE" in - *Molecule-AI/molecule-monorepo*|*Molecule-AI/molecule-core*) - # Continue — this is a public repo we enforce on. - ;; - *) - # Non-target repo (internal, plugins, templates, third-party) — let it through. - exit 0 - ;; -esac - -# Files added or modified in this commit. --diff-filter=AM excludes -# deletions so cleanup commits don't trip the gate. -STAGED=$(git diff --cached --name-only --diff-filter=AM) -[ -z "$STAGED" ] && exit 0 - -FORBIDDEN_PATTERNS=( - "^research/" - "^marketing/" - "^docs/marketing/" - "^comment-[0-9]+\.json$" - "^test-pmm.*\.(txt|md)$" - "^tick-reflections.*\.(txt|md)$" - ".*-temp\.(md|txt)$" -) - -OFFENDING="" -for path in $STAGED; do - for pattern in "${FORBIDDEN_PATTERNS[@]}"; do - if echo "$path" | grep -qE "$pattern"; then - OFFENDING="${OFFENDING} - ${path} (matched: ${pattern})\n" - break - fi - done -done - -[ -z "$OFFENDING" ] && exit 0 - -# Refuse the commit with the redirect instructions in the same message. -{ - echo - echo "Refusing commit: internal-flavored paths cannot live in the public monorepo." - echo - echo "Offending files:" - printf "$OFFENDING" - echo - echo "These belong in Molecule-AI/internal. Redirect:" - echo - echo " mkdir -p ~/repos" - echo " test -d ~/repos/internal || gh repo clone Molecule-AI/internal ~/repos/internal" - echo " cd ~/repos/internal" - echo " git pull origin main" - echo " git checkout -b /-" - echo " mkdir -p # research, marketing, runbooks, etc." - echo " # move your file from the monorepo into /.md" - echo " git add /.md" - echo " git commit -m ': add '" - echo " git push -u origin HEAD" - echo " gh pr create --base main --fill" - echo - echo "If your file is genuinely public-facing (final blog post, public" - echo "tutorial, customer-shippable doc), use one of these monorepo paths" - echo "instead — these are not blocked:" - echo " - docs/blog/.md" - echo " - docs/tutorials/.md" - echo " - docs/devrel/.md" - echo " - docs/api/.md" - echo - echo "If you legitimately need a new top-level path that matches a" - echo "forbidden pattern, edit:" - echo " .github/workflows/block-internal-paths.yml" - echo "with reviewer signoff and a public-facing justification — do NOT" - echo "work around the gate by renaming." - echo - echo "Hook source: molecule_runtime/scripts/pre-commit-block-internal-paths.sh" -} >&2 - -exit 1 diff --git a/molecule_runtime/scripts/pre-commit-checks.sh b/molecule_runtime/scripts/pre-commit-checks.sh new file mode 100644 index 0000000..1bc8248 --- /dev/null +++ b/molecule_runtime/scripts/pre-commit-checks.sh @@ -0,0 +1,211 @@ +#!/bin/bash +# pre-commit hook — defense-in-depth checks against agent foot-guns. +# +# Two independent gates, run in order: +# +# 1. Secret scan (ALL repos, including third-party). Refuses any commit +# whose staged additions contain a recognisable credential. Catches +# the canonical leak vectors regardless of how the secret arrived in +# the working tree (npm copying _authToken into package.json, an +# agent persisting its own GITHUB_TOKEN to a config file, a +# mis-configured `git clone https://x-access-token:GHS@github.com/...` +# remote, etc.). Same regex set as the tenant-proxy CI scanner — +# keep them aligned. +# +# 2. Internal-paths block (Molecule-AI public repos only). Refuses +# commits that add `research/`, `marketing/`, etc. to the public +# monorepo. Was the original purpose of this hook; lives here so +# every agent commit only pays one hook-install + one git config +# surface area, not two. +# +# Installed via `core.hooksPath` set by molecule_runtime.precommit_hook +# at workspace startup. + +set -e + +# Skip silently when GIT_AUTHOR_NAME/COMMITTER_NAME is unset — likely a +# non-agent context (operator manually running git inside the container +# for debug). Both gates assume the agent profile. +if [ -z "${GIT_AUTHOR_NAME:-}${GIT_COMMITTER_NAME:-}" ]; then + exit 0 +fi + +# Skip during rebase / cherry-pick / merge / revert — these REPLAY +# existing commits and the staged file set is whatever was already +# committed upstream. Blocking them forces interactive history rewriting +# that most agents won't do, leaving DIRTY PRs unmergeable. Both gates +# share this skip. +GIT_DIR=$(git rev-parse --git-dir 2>/dev/null || echo .git) +for state_dir in rebase-merge rebase-apply CHERRY_PICK_HEAD MERGE_HEAD REVERT_HEAD; do + if [ -e "${GIT_DIR}/${state_dir}" ]; then + exit 0 + fi +done + +REMOTE=$(git remote get-url origin 2>/dev/null || echo "") + +# ─── 1. Secret scan ──────────────────────────────────────────────────── +# +# Scans the staged diff (added lines only) for credential patterns. +# Refuses the commit if any pattern matches. The error message names the +# file + the pattern that matched but never echoes the secret value +# itself — avoids round-tripping the leaked credential into terminal +# scrollback / agent context windows. +# +# Pattern set covers the high-value GitHub family (the actual #2090 +# incident vector), the most common cloud + LLM provider tokens, and +# AWS access keys. Regex anchored on prefixes that have low false- +# positive rates against agent-generated content (no plain hex blobs). +SECRET_PATTERNS=( + 'ghp_[A-Za-z0-9]{36,}' # GitHub PAT (classic) + 'ghs_[A-Za-z0-9]{36,}' # GitHub App installation token + 'gho_[A-Za-z0-9]{36,}' # GitHub OAuth user-to-server + 'ghu_[A-Za-z0-9]{36,}' # GitHub OAuth user + 'ghr_[A-Za-z0-9]{36,}' # GitHub OAuth refresh + 'github_pat_[A-Za-z0-9_]{82,}' # GitHub fine-grained PAT + 'sk-ant-[A-Za-z0-9_-]{40,}' # Anthropic API key + 'sk-proj-[A-Za-z0-9_-]{40,}' # OpenAI project key + 'sk-svcacct-[A-Za-z0-9_-]{40,}' # OpenAI service-account key + 'xox[baprs]-[A-Za-z0-9-]{20,}' # Slack tokens (bot/app/user/refresh) + 'AKIA[0-9A-Z]{16}' # AWS access key ID + 'ASIA[0-9A-Z]{16}' # AWS STS temp access key ID +) + +# Only check ADDITIONS (lines starting with + but not the +++ header). +# This avoids re-flagging unchanged secrets that may already exist on +# disk from a previous commit (those are tracked separately by repo +# scanners; the local hook only enforces "don't add new ones"). +DIFF=$(git diff --cached --no-color --unified=0 2>/dev/null | grep -E '^\+[^+]' || true) + +if [ -n "$DIFF" ]; then + SECRET_HITS="" + for pattern in "${SECRET_PATTERNS[@]}"; do + # Use grep -lE on the diff to find which file the match came from. + # `git diff --cached --name-only -G` would be cleaner but + # -G operates on hunks not added lines, and we explicitly want + # only added lines. Walk staged files individually for the file + # attribution. + if echo "$DIFF" | grep -qE "$pattern"; then + for f in $(git diff --cached --name-only --diff-filter=AM); do + if git diff --cached --no-color --unified=0 -- "$f" 2>/dev/null \ + | grep -E '^\+[^+]' \ + | grep -qE "$pattern"; then + SECRET_HITS="${SECRET_HITS} - ${f} (matched: ${pattern})\n" + fi + done + fi + done + + if [ -n "$SECRET_HITS" ]; then + { + echo + echo "Refusing commit: staged additions contain credential-shaped strings." + echo + echo "Offending files:" + printf "$SECRET_HITS" + echo + echo "The actual matched values are NOT printed here, deliberately —" + echo "echoing them back into scrollback / agent context would round-trip" + echo "the leak into another surface." + echo + echo "Recovery:" + echo " 1. git restore --staged " + echo " 2. Remove the secret from and replace with an env var" + echo " reference (e.g. \${GITHUB_TOKEN}). Never inline credentials." + echo " 3. If the credential was already exposed (committed locally OR" + echo " visible to the agent in any prior tool output), treat it as" + echo " compromised — rotate it immediately, do not just remove it." + echo " 4. git add && git commit" + echo + echo "If the match is a false positive (test fixture / docs example)," + echo "use a clearly-fake placeholder like ghs_EXAMPLE_TOKEN_DO_NOT_USE" + echo "that doesn't satisfy the length suffix." + echo + echo "Hook source: molecule_runtime/scripts/pre-commit-checks.sh" + } >&2 + exit 1 + fi +fi + +# ─── 2. Internal-paths block (public Molecule-AI repos only) ────────── +# +# Despite SHARED_RULES.md, .gitignore, and a CI gate, agents still try +# to `git add /research/...` from their cwd in `molecule-monorepo`. Each +# leak attempt costs ~5 cycles (PR opens, CI fails, agent retries with +# workaround) and pollutes git history with reverts. This gate converts +# the failure mode from "PR fails" → "commit refused at the agent's +# local git" — instant feedback with the redirect command in the same +# error message. +case "$REMOTE" in + *Molecule-AI/molecule-monorepo*|*Molecule-AI/molecule-core*) + ;; + *) + # Non-target repo (internal, plugins, templates, third-party) — let it through. + exit 0 + ;; +esac + +STAGED=$(git diff --cached --name-only --diff-filter=AM) +[ -z "$STAGED" ] && exit 0 + +FORBIDDEN_PATTERNS=( + "^research/" + "^marketing/" + "^docs/marketing/" + "^comment-[0-9]+\.json$" + "^test-pmm.*\.(txt|md)$" + "^tick-reflections.*\.(txt|md)$" + ".*-temp\.(md|txt)$" +) + +OFFENDING="" +for path in $STAGED; do + for pattern in "${FORBIDDEN_PATTERNS[@]}"; do + if echo "$path" | grep -qE "$pattern"; then + OFFENDING="${OFFENDING} - ${path} (matched: ${pattern})\n" + break + fi + done +done + +[ -z "$OFFENDING" ] && exit 0 + +{ + echo + echo "Refusing commit: internal-flavored paths cannot live in the public monorepo." + echo + echo "Offending files:" + printf "$OFFENDING" + echo + echo "These belong in Molecule-AI/internal. Redirect:" + echo + echo " mkdir -p ~/repos" + echo " test -d ~/repos/internal || gh repo clone Molecule-AI/internal ~/repos/internal" + echo " cd ~/repos/internal" + echo " git pull origin main" + echo " git checkout -b /-" + echo " mkdir -p # research, marketing, runbooks, etc." + echo " # move your file from the monorepo into /.md" + echo " git add /.md" + echo " git commit -m ': add '" + echo " git push -u origin HEAD" + echo " gh pr create --base main --fill" + echo + echo "If your file is genuinely public-facing (final blog post, public" + echo "tutorial, customer-shippable doc), use one of these monorepo paths" + echo "instead — these are not blocked:" + echo " - docs/blog/.md" + echo " - docs/tutorials/.md" + echo " - docs/devrel/.md" + echo " - docs/api/.md" + echo + echo "If you legitimately need a new top-level path that matches a" + echo "forbidden pattern, edit:" + echo " .github/workflows/block-internal-paths.yml" + echo "with reviewer signoff and a public-facing justification — do NOT" + echo "work around the gate by renaming." + echo + echo "Hook source: molecule_runtime/scripts/pre-commit-checks.sh" +} >&2 + +exit 1 diff --git a/pyproject.toml b/pyproject.toml index c823145..018990c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "molecule-ai-workspace-runtime" -version = "0.1.15" +version = "0.1.16" description = "Molecule AI workspace runtime — shared infrastructure for all agent adapters" requires-python = ">=3.11" diff --git a/tests/test_precommit_hook.py b/tests/test_precommit_hook.py new file mode 100644 index 0000000..cf2750f --- /dev/null +++ b/tests/test_precommit_hook.py @@ -0,0 +1,141 @@ +"""End-to-end test for the bundled pre-commit hook script. + +The hook runs as a real bash subprocess inside a real temp git repo with +real staged content — there's no Python-side simulation. This is the +only way to exercise the actual contract (refuses commit on secret +match, lets clean commits through) and catch shell-level regressions +like accidental ``set -e`` interactions or pattern-matching drift. + +Two paths covered: + +1. **Secret scan** — refuses any repo when staged additions contain a + credential-shaped string. Tested with a ``ghs_*`` token shape (the + actual #2090 incident vector) so the most important regression case + is locked. + +2. **Clean commit through** — verifies the hook is a no-op for benign + content, confirming we haven't shipped a check that fails open or + blocks every commit. + +Skipped on platforms without ``bash`` on PATH (Windows CI without WSL). +""" +from __future__ import annotations + +import os +import shutil +import subprocess +from importlib import resources +from pathlib import Path + +import pytest + +_BASH = shutil.which("bash") + + +def _run(cmd: list[str], cwd: Path, env: dict | None = None) -> subprocess.CompletedProcess: + """Run a subprocess + return result. Always capture both streams.""" + full_env = os.environ.copy() + if env: + full_env.update(env) + return subprocess.run( + cmd, cwd=cwd, env=full_env, + capture_output=True, text=True, check=False, + ) + + +def _init_repo(repo: Path) -> None: + """Create a fresh git repo with the agent identity set so the hook + doesn't bail on the GIT_AUTHOR_NAME-empty fast path.""" + _run(["git", "init", "-q", "-b", "main"], cwd=repo).check_returncode() + _run(["git", "config", "user.email", "agent@molecule.ai"], cwd=repo).check_returncode() + _run(["git", "config", "user.name", "test-agent"], cwd=repo).check_returncode() + + +def _install_hook(repo: Path) -> Path: + """Copy the bundled hook into the repo's local .git/hooks/pre-commit + and chmod +x. Return the installed path.""" + src = resources.files("molecule_runtime").joinpath("scripts", "pre-commit-checks.sh") + hook_dir = repo / ".git" / "hooks" + hook_dir.mkdir(parents=True, exist_ok=True) + target = hook_dir / "pre-commit" + target.write_bytes(src.read_bytes()) + target.chmod(0o755) + return target + + +@pytest.fixture +def repo(tmp_path: Path) -> Path: + """Initialised git repo with the bundled hook installed.""" + _init_repo(tmp_path) + _install_hook(tmp_path) + return tmp_path + + +@pytest.mark.skipif(_BASH is None, reason="bash not on PATH") +def test_secret_scan_refuses_github_installation_token(repo: Path) -> None: + """A staged file containing a ghs_-prefixed token must abort the commit. + + Lock for the #2090 incident: ``package.json`` with a + ``"_authToken": "ghs_..."`` entry should never reach git history. + """ + pkg = repo / "package.json" + pkg.write_text( + '{\n' + ' "name": "tenant-proxy",\n' + ' "publishConfig": {\n' + ' "//npm.pkg.github.com/:_authToken": "ghs_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"\n' + ' }\n' + '}\n' + ) + _run(["git", "add", "package.json"], cwd=repo).check_returncode() + + result = _run( + ["git", "commit", "-m", "feat: add token", "--no-gpg-sign"], + cwd=repo, + env={"GIT_AUTHOR_NAME": "test-agent", "GIT_COMMITTER_NAME": "test-agent"}, + ) + assert result.returncode != 0, "commit should be refused" + assert "Refusing commit" in result.stderr + assert "credential-shaped" in result.stderr + assert "package.json" in result.stderr + assert "ghs_" in result.stderr # the pattern name is OK to surface + # The actual matched value must NOT appear — the secret stays out of + # scrollback. Spot-check the exact suffix string. + assert "ghs_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" not in result.stderr + + +@pytest.mark.skipif(_BASH is None, reason="bash not on PATH") +def test_clean_commit_passes_through(repo: Path) -> None: + """Benign content must commit cleanly — the hook is not allowed to + fail open OR block every commit. This is the regression guard + against shipping a hook that breaks every agent's git workflow.""" + f = repo / "README.md" + f.write_text("# Test\n\nNo secrets here.\n") + _run(["git", "add", "README.md"], cwd=repo).check_returncode() + + result = _run( + ["git", "commit", "-m", "docs: readme", "--no-gpg-sign"], + cwd=repo, + env={"GIT_AUTHOR_NAME": "test-agent", "GIT_COMMITTER_NAME": "test-agent"}, + ) + assert result.returncode == 0, f"clean commit refused: {result.stderr}" + + +@pytest.mark.skipif(_BASH is None, reason="bash not on PATH") +def test_secret_scan_runs_on_third_party_repos(repo: Path) -> None: + """The secret scan must NOT be scoped to Molecule-AI public repos — + it runs on every repo. Internal-paths block was the original gate + and was scoped; secrets are universal.""" + # No remote set → not a Molecule-AI repo. Internal-paths block would + # exit clean here (good); secret scan must still fire. + leaky = repo / "config.yml" + leaky.write_text("anthropic_key: sk-ant-abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOP\n") + _run(["git", "add", "config.yml"], cwd=repo).check_returncode() + + result = _run( + ["git", "commit", "-m", "config: anthropic", "--no-gpg-sign"], + cwd=repo, + env={"GIT_AUTHOR_NAME": "test-agent", "GIT_COMMITTER_NAME": "test-agent"}, + ) + assert result.returncode != 0, "secret scan must fire even without a Molecule-AI remote" + assert "sk-ant-" in result.stderr