name: Secret scan

# Hard CI gate. Refuses any PR / push whose diff additions contain a
# recognisable credential. Defense-in-depth for the #2090-class incident
# (2026-04-24): GitHub's hosted Copilot Coding Agent leaked a ghs_*
# installation token into tenant-proxy/package.json via `npm init`
# slurping the URL from a token-embedded origin remote. We can't fix
# upstream's clone hygiene, so we gate here.
#
# Same regex set as the runtime's bundled pre-commit hook
# (molecule-ai-workspace-runtime: molecule_runtime/scripts/pre-commit-checks.sh).
# Keep the two sides aligned when adding patterns.
#
# Ported from .github/workflows/secret-scan.yml so the gate actually
# fires on Gitea Actions. Differences from the GitHub version:
#   - drops `merge_group` event (Gitea has no merge queue)
#   - drops `workflow_call` (no cross-repo reusable invocation on Gitea)
#   - SELF path updated to .gitea/workflows/secret-scan.yml
# The job name + step name are identical to the GitHub workflow so the
# status-check context (`Secret scan / Scan diff for credential-shaped
# strings (pull_request)`) matches branch protection on molecule-core/main.

on:
  pull_request:
    types: [opened, synchronize, reopened]
  push:
    branches: [main, staging]

jobs:
  scan:
    name: Scan diff for credential-shaped strings
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 2  # need previous commit to diff against on push events

      # For pull_request events the diff base may be many commits behind
      # HEAD and absent from the shallow clone. Fetch it explicitly.
      # The SHA is passed through env (not interpolated into the script
      # text) so the shell only ever sees it as a quoted variable.
      - name: Fetch PR base SHA (pull_request events only)
        if: github.event_name == 'pull_request'
        env:
          PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
        run: git fetch --depth=1 origin "$PR_BASE_SHA"

      - name: Refuse if credential-shaped strings appear in diff additions
        env:
          # Plumb event-specific values through env so the script doesn't
          # need any expression interpolation of its own.
          # github.event.before/after only exist on push events;
          # pull_request has pull_request.base.sha / pull_request.head.sha.
          EVENT_NAME: ${{ github.event_name }}
          PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
          PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
          PUSH_BEFORE: ${{ github.event.before }}
          PUSH_AFTER: ${{ github.event.after }}
        run: |
          # NOTE: workflow expressions are expanded everywhere inside this
          # script, including comments and quoted strings, and a backslash
          # does not escape them. Keep expression syntax out of this block;
          # add new inputs to the env block above instead.

          # Pattern set covers GitHub family (the actual #2090 vector),
          # Anthropic / OpenAI / Slack / AWS. Anchored on prefixes with low
          # false-positive rates against agent-generated content. Mirror of
          # molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh
          # — keep aligned.
          SECRET_PATTERNS=(
            'ghp_[A-Za-z0-9]{36,}'           # GitHub PAT (classic)
            'ghs_[A-Za-z0-9]{36,}'           # GitHub App installation token
            'gho_[A-Za-z0-9]{36,}'           # GitHub OAuth user-to-server
            'ghu_[A-Za-z0-9]{36,}'           # GitHub OAuth user
            'ghr_[A-Za-z0-9]{36,}'           # GitHub OAuth refresh
            'github_pat_[A-Za-z0-9_]{82,}'   # GitHub fine-grained PAT
            'sk-ant-[A-Za-z0-9_-]{40,}'      # Anthropic API key
            'sk-proj-[A-Za-z0-9_-]{40,}'     # OpenAI project key
            'sk-svcacct-[A-Za-z0-9_-]{40,}'  # OpenAI service-account key
            'sk-cp-[A-Za-z0-9_-]{60,}'       # MiniMax API key (F1088 vector — caught only after the fact)
            'xox[baprs]-[A-Za-z0-9-]{20,}'   # Slack tokens
            'AKIA[0-9A-Z]{16}'               # AWS access key ID
            'ASIA[0-9A-Z]{16}'               # AWS STS temp access key ID
          )

          # True when the argument is the all-zero "no previous commit" SHA.
          is_null_sha() { [[ "$1" =~ ^0+$ ]]; }

          # Determine the diff base. Each event type stores its SHAs in
          # a different place — see the env block above.
          case "$EVENT_NAME" in
            pull_request)
              BASE="$PR_BASE_SHA"
              HEAD="$PR_HEAD_SHA"
              ;;
            *)
              BASE="$PUSH_BEFORE"
              HEAD="$PUSH_AFTER"
              ;;
          esac

          # On push events with shallow clones, BASE may be present in
          # the event payload but absent from the local object DB
          # (fetch-depth=2 doesn't always reach the previous commit
          # across true merges). Try fetching it on demand. If the
          # fetch fails — e.g. the SHA was force-overwritten — BASE stays
          # unusable and we scan the entire tree as if every file were
          # new. Correct, just slow.
          BASE_USABLE=false
          if [ -n "$BASE" ] && ! is_null_sha "$BASE"; then
            if ! git cat-file -e "$BASE" 2>/dev/null; then
              git fetch --depth=1 origin "$BASE" 2>/dev/null || true
            fi
            if git cat-file -e "$BASE" 2>/dev/null; then
              BASE_USABLE=true
            fi
          fi

          # Files added or modified in this change.
          #
          # core.quotePath=false: by default git C-quotes paths containing
          # non-ASCII bytes; the quoted form does not name a real file, so
          # the per-file lookup below would come back empty and the file
          # would be skipped without being scanned.
          # (Paths containing control characters, double quotes or
          # backslashes are still quoted by git even with this setting.)
          #
          # --no-renames: with rename detection on (git's default), a file
          # that is renamed AND edited is reported with status R, which the
          # AM filter drops. Disabling it reports the new path as an
          # addition, so its full content is scanned.
          if [ "$BASE_USABLE" = true ]; then
            CHANGED=$(git -c core.quotePath=false diff --no-renames --name-only --diff-filter=AM "$BASE" "$HEAD")
          else
            # New branch / no previous SHA / BASE unreachable — check the
            # entire tree as added content. Slower, but correct on first
            # push.
            CHANGED=$(git -c core.quotePath=false ls-tree -r --name-only HEAD)
          fi

          if [ -z "$CHANGED" ]; then
            echo "No changed files to inspect."
            exit 0
          fi

          # Self-exclude: this workflow file legitimately contains the
          # pattern strings as regex literals. Without an exclude it would
          # block its own merge. Both the .github/ original and this
          # .gitea/ port are excluded so a sync between them stays clean.
          SELF_GITHUB=".github/workflows/secret-scan.yml"
          SELF_GITEA=".gitea/workflows/secret-scan.yml"

          OFFENDING=""
          # `while IFS= read -r` (not `for f in $CHANGED`) so filenames
          # containing whitespace don't word-split silently — a path
          # with a space would otherwise produce two iterations on
          # tokens that aren't real filenames, breaking the
          # self-exclude + diff lookup.
          while IFS= read -r f; do
            [ -z "$f" ] && continue
            [ "$f" = "$SELF_GITHUB" ] && continue
            [ "$f" = "$SELF_GITEA" ] && continue

            if [ "$BASE_USABLE" = true ]; then
              # Keep only added lines. Header lines (including the
              # `+++ b/<path>` line) all precede the first `@@` hunk
              # marker, so start collecting after it. This also keeps
              # added lines whose own content begins with `+`.
              ADDED=$(git diff --no-color --no-renames --unified=0 "$BASE" "$HEAD" -- "$f" 2>/dev/null \
                | awk '/^@@/ { in_hunk = 1; next } in_hunk && /^[+]/' || true)
            else
              # No diff range (new branch first push) — scan the full file
              # contents as if every line were new.
              ADDED=$(cat "$f" 2>/dev/null || true)
            fi
            [ -z "$ADDED" ] && continue

            for pattern in "${SECRET_PATTERNS[@]}"; do
              # Here-string instead of `echo | grep -q`: grep -q exits on
              # the first match, which can kill the writer with SIGPIPE on
              # large input; if the shell runs with pipefail that would
              # turn a real match into a non-zero (i.e. "no match") status.
              if grep -qE -e "$pattern" <<< "$ADDED"; then
                # Append a real newline so the report can be printed
                # verbatim, without interpreting anything in the filename.
                OFFENDING+="${f} (matched: ${pattern})"$'\n'
                break
              fi
            done
          done <<< "$CHANGED"

          if [ -n "$OFFENDING" ]; then
            echo "::error::Credential-shaped strings detected in diff additions:"
            # '%s' prints OFFENDING byte-for-byte: neither `%` nor
            # backslash sequences inside a filename are interpreted.
            printf '%s' "$OFFENDING"
            echo ""
            echo "The actual matched values are NOT echoed here, deliberately —"
            echo "round-tripping a leaked credential into CI logs widens the blast"
            echo "radius (logs are searchable + retained)."
            echo ""
            echo "Recovery:"
            echo "  1. Remove the secret from the file. Replace with an env var"
            echo "     reference (e.g. the secrets.GITHUB_TOKEN context in workflows,"
            echo "     process.env.X in code)."
            echo "  2. If the credential was already pushed (this PR's commit"
            echo "     history reaches a public ref), treat it as compromised —"
            echo "     ROTATE it immediately, do not just remove it. The token"
            echo "     remains valid in git history forever and may be in any"
            echo "     log/cache that consumed this branch."
            echo "  3. Force-push the cleaned commit (or stack a revert) and"
            echo "     re-run CI."
            echo ""
            echo "If the match is a false positive (test fixture, docs example,"
            echo "or this workflow's own regex literals): use a clearly-fake"
            echo "placeholder like ghs_EXAMPLE_DO_NOT_USE that doesn't satisfy"
            echo "the length suffix, OR add the file path to the SELF exclude"
            echo "list in this workflow with a short reason."
            echo ""
            echo "Mirror of the regex set lives in the runtime's bundled"
            echo "pre-commit hook (molecule-ai-workspace-runtime:"
            echo "molecule_runtime/scripts/pre-commit-checks.sh) — keep aligned."
            exit 1
          fi

          echo "✓ No credential-shaped strings in this change."