diff --git a/.github/workflows/secret-scan.yml b/.github/workflows/secret-scan.yml
new file mode 100644
index 00000000..089bb4d4
--- /dev/null
+++ b/.github/workflows/secret-scan.yml
@@ -0,0 +1,230 @@
+name: Secret scan
+
+# Hard CI gate. Refuses any PR / push whose diff additions contain a
+# recognisable credential. Defense-in-depth for the #2090-class incident
+# (2026-04-24): GitHub's hosted Copilot Coding Agent leaked a ghs_*
+# installation token into tenant-proxy/package.json via `npm init`
+# slurping the URL from a token-embedded origin remote. We can't fix
+# upstream's clone hygiene, so we gate here.
+#
+# Also the canonical reusable workflow for the rest of the org. Other
+# Molecule-AI repos enroll with a single 3-line workflow:
+#
+#   jobs:
+#     secret-scan:
+#       uses: Molecule-AI/molecule-monorepo/.github/workflows/secret-scan.yml@main
+#
+# Same regex set as the runtime's bundled pre-commit hook
+# (molecule-ai-workspace-runtime: molecule_runtime/scripts/pre-commit-checks.sh).
+# Keep the two sides aligned when adding patterns.
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+  push:
+    branches: [main, staging]
+  # Required for GitHub merge queue: the queue's pre-merge CI run on
+  # `gh-readonly-queue/...` refs needs this check to fire so the queue
+  # gets a real result instead of stalling forever AWAITING_CHECKS.
+  merge_group:
+    types: [checks_requested]
+  # Reusable workflow entry point for other Molecule-AI repos.
+  workflow_call:
+
+jobs:
+  scan:
+    name: Scan diff for credential-shaped strings
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 2  # need previous commit to diff against on push events
+
+      - name: Refuse if credential-shaped strings appear in diff additions
+        shell: bash
+        # Event fields reach the script as environment variables instead of
+        # being spliced into the script text, so payload values are never
+        # parsed as shell source.
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
+          PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+          MQ_BASE_SHA: ${{ github.event.merge_group.base_sha }}
+          MQ_HEAD_SHA: ${{ github.event.merge_group.head_sha }}
+          PUSH_BEFORE_SHA: ${{ github.event.before }}
+          PUSH_AFTER_SHA: ${{ github.event.after }}
+        run: |
+          set -euo pipefail
+
+          # Pattern set covers GitHub family (the actual #2090 vector),
+          # Anthropic / OpenAI / Slack / AWS. Anchored on prefixes with low
+          # false-positive rates against agent-generated content. Mirror of
+          # molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh
+          # — keep aligned.
+          SECRET_PATTERNS=(
+            'ghp_[A-Za-z0-9]{36,}'           # GitHub PAT (classic)
+            'ghs_[A-Za-z0-9]{36,}'           # GitHub App installation token
+            'gho_[A-Za-z0-9]{36,}'           # GitHub OAuth user-to-server
+            'ghu_[A-Za-z0-9]{36,}'           # GitHub OAuth user
+            'ghr_[A-Za-z0-9]{36,}'           # GitHub OAuth refresh
+            'github_pat_[A-Za-z0-9_]{82,}'   # GitHub fine-grained PAT
+            'sk-ant-[A-Za-z0-9_-]{40,}'      # Anthropic API key
+            'sk-proj-[A-Za-z0-9_-]{40,}'     # OpenAI project key
+            'sk-svcacct-[A-Za-z0-9_-]{40,}'  # OpenAI service-account key
+            'xox[baprs]-[A-Za-z0-9-]{20,}'   # Slack tokens
+            'AKIA[0-9A-Z]{16}'               # AWS access key ID
+            'ASIA[0-9A-Z]{16}'               # AWS STS temp access key ID
+          )
+
+          # Self-exclude: this workflow file legitimately contains the
+          # pattern strings as regex literals. Without an exclude it would
+          # block its own merge.
+          SELF=".github/workflows/secret-scan.yml"
+
+          # Succeeds when the given commit object exists in the local
+          # (shallow) clone.
+          have_commit() {
+            git cat-file -e "$1^{commit}" 2>/dev/null
+          }
+
+          # Pick the diff endpoints for the triggering event. merge_group
+          # payloads carry merge_group.base_sha / head_sha and have no
+          # before / after fields.
+          case "$EVENT_NAME" in
+            pull_request)
+              BASE_SHA="$PR_BASE_SHA"
+              HEAD_SHA="$PR_HEAD_SHA"
+              ;;
+            merge_group)
+              BASE_SHA="$MQ_BASE_SHA"
+              HEAD_SHA="$MQ_HEAD_SHA"
+              ;;
+            *)
+              BASE_SHA="$PUSH_BEFORE_SHA"
+              HEAD_SHA="$PUSH_AFTER_SHA"
+              ;;
+          esac
+
+          # Diff mode needs both endpoints locally. The checkout is shallow,
+          # so fetch whichever one is missing (PR base far behind HEAD,
+          # multi-commit push, merge-queue base). With no usable base (new
+          # branch, all-zero SHA, fetch failure) fall back to treating the
+          # entire tree as added content. Slower, but errs on the side of
+          # scanning more.
+          USE_RANGE=false
+          if [ -n "$BASE_SHA" ] && [ -n "$HEAD_SHA" ] && ! [[ "$BASE_SHA" =~ ^0+$ ]]; then
+            for sha in "$BASE_SHA" "$HEAD_SHA"; do
+              have_commit "$sha" || git fetch --no-tags --depth=1 origin "$sha" || true
+            done
+            if have_commit "$BASE_SHA" && have_commit "$HEAD_SHA"; then
+              USE_RANGE=true
+            else
+              echo "::warning::Diff base unavailable; scanning the whole tree instead."
+            fi
+          fi
+
+          WORK_DIR=$(mktemp -d)
+          trap 'rm -rf "$WORK_DIR"' EXIT
+          FILE_LIST="$WORK_DIR/files"
+          ADDED_TEXT="$WORK_DIR/added"
+
+          # Files added or modified in this change. NUL-delimited so paths
+          # with spaces or non-ASCII characters survive intact, and
+          # --no-renames so a renamed-and-edited file is reported as an
+          # addition instead of being dropped by the filter. Written to a
+          # file first so a git failure aborts the step (set -e).
+          if [ "$USE_RANGE" = true ]; then
+            git diff -z --name-only --no-renames --diff-filter=AMT \
+              "$BASE_SHA" "$HEAD_SHA" > "$FILE_LIST"
+          else
+            git ls-tree -r -z --name-only HEAD > "$FILE_LIST"
+          fi
+          mapfile -d '' -t CHANGED < "$FILE_LIST"
+
+          if [ "${#CHANGED[@]}" -eq 0 ]; then
+            echo "No changed files to inspect."
+            exit 0
+          fi
+
+          OFFENDING=()
+          for f in "${CHANGED[@]}"; do
+            if [ "$f" = "$SELF" ]; then
+              continue
+            fi
+
+            if [ "$USE_RANGE" = true ]; then
+              # Keep only '+' lines found inside hunks; the '+++ b/<path>'
+              # file header sits before the first '@@' and is skipped.
+              git --literal-pathspecs diff --no-color --no-ext-diff --no-renames \
+                --unified=0 "$BASE_SHA" "$HEAD_SHA" -- "$f" \
+                | awk '
+                    /^diff --git / { in_hunk = 0; next }
+                    /^@@/ { in_hunk = 1; next }
+                    in_hunk && /^\+/ { print }
+                  ' > "$ADDED_TEXT"
+            elif [ -f "$f" ]; then
+              # Whole-tree mode: scan the full file contents as if every
+              # line were new.
+              cat -- "$f" > "$ADDED_TEXT"
+            else
+              # Tree entry without a regular file behind it (e.g. a
+              # submodule link); nothing to read.
+              continue
+            fi
+            if [ ! -s "$ADDED_TEXT" ]; then
+              continue
+            fi
+
+            for pattern in "${SECRET_PATTERNS[@]}"; do
+              rc=0
+              grep -qE -e "$pattern" -- "$ADDED_TEXT" || rc=$?
+              if [ "$rc" -eq 0 ]; then
+                # Record path and pattern only, never the matched text.
+                OFFENDING+=("${f} (matched: ${pattern})")
+                break
+              elif [ "$rc" -ne 1 ]; then
+                # grep exits 1 for "no match"; anything else is a failure
+                # and must not be mistaken for a clean result.
+                echo "::error::grep failed (exit ${rc}) while scanning; refusing to pass."
+                exit 2
+              fi
+            done
+          done
+
+          if [ "${#OFFENDING[@]}" -gt 0 ]; then
+            echo "::error::Credential-shaped strings detected in diff additions:"
+            printf '%s\n' "${OFFENDING[@]}"
+            echo ""
+            echo "The actual matched values are NOT echoed here, deliberately —"
+            echo "round-tripping a leaked credential into CI logs widens the blast"
+            echo "radius (logs are searchable + retained)."
+            echo ""
+            echo "Recovery:"
+            echo "  1. Remove the secret from the file. Replace with an env var"
+            # The opening delimiter below is split across two quoted strings
+            # on purpose: GitHub Actions expands any dollar-double-brace
+            # expression it finds in a run script before bash starts, and a
+            # backslash does not prevent that.
+            echo '     reference (e.g. $''{{ secrets.GITHUB_TOKEN }} in workflows,'
+            echo "     process.env.X in code)."
+            echo "  2. If the credential was already pushed (this PR's commit"
+            echo "     history reaches a public ref), treat it as compromised —"
+            echo "     ROTATE it immediately, do not just remove it. The token"
+            echo "     remains valid in git history forever and may be in any"
+            echo "     log/cache that consumed this branch."
+            echo "  3. Force-push the cleaned commit (or stack a revert) and"
+            echo "     re-run CI."
+            echo ""
+            echo "If the match is a false positive (test fixture, docs example,"
+            echo "or this workflow's own regex literals): use a clearly-fake"
+            echo "placeholder like ghs_EXAMPLE_DO_NOT_USE that doesn't satisfy"
+            echo "the length suffix, OR add the file path to the SELF exclude"
+            echo "list in this workflow with a short reason."
+            echo ""
+            echo "Mirror of the regex set lives in the runtime's bundled"
+            echo "pre-commit hook (molecule-ai-workspace-runtime:"
+            echo "molecule_runtime/scripts/pre-commit-checks.sh) — keep aligned."
+            exit 1
+          fi
+
+          echo "✓ No credential-shaped strings in this change."