diff --git a/.gitea/workflows/secret-scan.yml b/.gitea/workflows/secret-scan.yml
new file mode 100644
index 00000000..6f1583f4
--- /dev/null
+++ b/.gitea/workflows/secret-scan.yml
@@ -0,0 +1,211 @@
+name: Secret scan
+
+# Hard CI gate. Refuses any PR / push whose diff additions contain a
+# recognisable credential. Defense-in-depth for the #2090-class incident
+# (2026-04-24): GitHub's hosted Copilot Coding Agent leaked a ghs_*
+# installation token into tenant-proxy/package.json via `npm init`
+# slurping the URL from a token-embedded origin remote. We can't fix
+# upstream's clone hygiene, so we gate here.
+#
+# Same regex set as the runtime's bundled pre-commit hook
+# (molecule-ai-workspace-runtime: molecule_runtime/scripts/pre-commit-checks.sh).
+# Keep the two sides aligned when adding patterns.
+#
+# Ported from .github/workflows/secret-scan.yml so the gate actually
+# fires on Gitea Actions. Differences from the GitHub version:
+#   - drops `merge_group` event (Gitea has no merge queue)
+#   - drops `workflow_call` (no cross-repo reusable invocation on Gitea)
+#   - SELF path updated to .gitea/workflows/secret-scan.yml
+# The job name + step name are identical to the GitHub workflow so the
+# status-check context (`Secret scan / Scan diff for credential-shaped
+# strings (pull_request)`) matches branch protection on molecule-core/main.
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+  push:
+    branches: [main, staging]
+
+jobs:
+  scan:
+    name: Scan diff for credential-shaped strings
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          fetch-depth: 2 # need previous commit to diff against on push events
+
+      # For pull_request events the diff base may be many commits behind
+      # HEAD and absent from the shallow clone. Fetch it explicitly.
+      # The SHA is handed to the script through env instead of being
+      # interpolated into the command text, like the scan step's inputs.
+      - name: Fetch PR base SHA (pull_request events only)
+        if: github.event_name == 'pull_request'
+        env:
+          PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
+        run: git fetch --depth=1 origin "$PR_BASE_SHA"
+
+      - name: Refuse if credential-shaped strings appear in diff additions
+        env:
+          # Plumb event-specific SHAs through env so the script doesn't
+          # need conditional `${{ ... }}` interpolation per event type.
+          # github.event.before/after only exist on push events;
+          # pull_request has pull_request.base.sha / pull_request.head.sha.
+          EVENT_NAME: ${{ github.event_name }}
+          PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
+          PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+          PUSH_BEFORE: ${{ github.event.before }}
+          PUSH_AFTER: ${{ github.event.after }}
+        run: |
+          # Pattern set covers GitHub family (the actual #2090 vector),
+          # Anthropic / OpenAI / Slack / AWS. Anchored on prefixes with low
+          # false-positive rates against agent-generated content. Mirror of
+          # molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh
+          # — keep aligned.
+          SECRET_PATTERNS=(
+            'ghp_[A-Za-z0-9]{36,}'           # GitHub PAT (classic)
+            'ghs_[A-Za-z0-9]{36,}'           # GitHub App installation token
+            'gho_[A-Za-z0-9]{36,}'           # GitHub OAuth user-to-server
+            'ghu_[A-Za-z0-9]{36,}'           # GitHub OAuth user
+            'ghr_[A-Za-z0-9]{36,}'           # GitHub OAuth refresh
+            'github_pat_[A-Za-z0-9_]{82,}'   # GitHub fine-grained PAT
+            'sk-ant-[A-Za-z0-9_-]{40,}'      # Anthropic API key
+            'sk-proj-[A-Za-z0-9_-]{40,}'     # OpenAI project key
+            'sk-svcacct-[A-Za-z0-9_-]{40,}'  # OpenAI service-account key
+            'sk-cp-[A-Za-z0-9_-]{60,}'       # MiniMax API key (F1088 vector — caught only after the fact)
+            'xox[baprs]-[A-Za-z0-9-]{20,}'   # Slack tokens
+            'AKIA[0-9A-Z]{16}'               # AWS access key ID
+            'ASIA[0-9A-Z]{16}'               # AWS STS temp access key ID
+          )
+
+          # Determine the diff base. Each event type stores its SHAs in
+          # a different place — see the env block above.
+          case "$EVENT_NAME" in
+            pull_request)
+              BASE="$PR_BASE_SHA"
+              HEAD="$PR_HEAD_SHA"
+              ;;
+            *)
+              BASE="$PUSH_BEFORE"
+              HEAD="$PUSH_AFTER"
+              ;;
+          esac
+
+          # On push events with shallow clones, BASE may be present in
+          # the event payload but absent from the local object DB
+          # (fetch-depth=2 doesn't always reach the previous commit
+          # across true merges). Try fetching it on demand. If the
+          # fetch fails — e.g. the SHA was force-overwritten — we fall
+          # through to the empty-BASE branch below, which scans the
+          # entire tree as if every file were new. Correct, just slow.
+          if [ -n "$BASE" ] && ! echo "$BASE" | grep -qE '^0+$'; then
+            if ! git cat-file -e "$BASE" 2>/dev/null; then
+              git fetch --depth=1 origin "$BASE" 2>/dev/null || true
+            fi
+          fi
+
+          # Files added or modified in this change.
+          #   - core.quotePath=false keeps non-ASCII paths unescaped so the
+          #     names listed here match the pathspecs used in the loop below.
+          #   - --no-renames reports a renamed or copied file as a plain
+          #     add. With rename detection on, such a file has status R and
+          #     --diff-filter=AM would drop it, edits included.
+          #   - T covers type changes (e.g. symlink replaced by a file).
+          if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$' || ! git cat-file -e "$BASE" 2>/dev/null; then
+            # New branch / no previous SHA / BASE unreachable — check the
+            # entire tree as added content. Slower, but correct on first
+            # push.
+            CHANGED=$(git -c core.quotePath=false ls-tree -r --name-only HEAD)
+            DIFF_RANGE=""
+          else
+            CHANGED=$(git -c core.quotePath=false diff --no-renames --name-only --diff-filter=AMT "$BASE" "$HEAD")
+            DIFF_RANGE="$BASE $HEAD"
+          fi
+
+          if [ -z "$CHANGED" ]; then
+            echo "No changed files to inspect."
+            exit 0
+          fi
+
+          # Self-exclude: this workflow file legitimately contains the
+          # pattern strings as regex literals. Without an exclude it would
+          # block its own merge. Both the .github/ original and this
+          # .gitea/ port are excluded so a sync between them stays clean.
+          SELF_GITHUB=".github/workflows/secret-scan.yml"
+          SELF_GITEA=".gitea/workflows/secret-scan.yml"
+
+          OFFENDING=""
+          # `while IFS= read -r` (not `for f in $CHANGED`) so filenames
+          # containing whitespace don't word-split silently — a path
+          # with a space would otherwise produce two iterations on
+          # tokens that aren't real filenames, breaking the
+          # self-exclude + diff lookup.
+          while IFS= read -r f; do
+            [ -z "$f" ] && continue
+            [ "$f" = "$SELF_GITHUB" ] && continue
+            [ "$f" = "$SELF_GITEA" ] && continue
+            if [ -n "$DIFF_RANGE" ]; then
+              # Keep only added lines from inside the hunks. Everything
+              # before the first @@ (including the `+++ b/<path>` header)
+              # is skipped by position rather than by a `^\+[^+]` match,
+              # which would also discard added lines whose own text
+              # starts with `+` (patch files, for instance).
+              ADDED=$(git diff --no-color --no-renames --unified=0 "$BASE" "$HEAD" -- "$f" | awk '/^@@/ { in_hunk = 1; next } in_hunk && /^\+/')
+            else
+              # No diff range (new branch first push) — scan the full file
+              # contents as if every line were new.
+              ADDED=$(cat "$f" 2>/dev/null || true)
+            fi
+            [ -z "$ADDED" ] && continue
+            for pattern in "${SECRET_PATTERNS[@]}"; do
+              # Here-string instead of `echo | grep -q`: grep -q exits on
+              # the first hit, and if the shell runs with pipefail a
+              # still-writing echo killed by SIGPIPE would make the
+              # pipeline report failure, i.e. a missed match.
+              if grep -qE -e "$pattern" <<< "$ADDED"; then
+                OFFENDING+="${f} (matched: ${pattern})"$'\n'
+                break
+              fi
+            done
+          done <<< "$CHANGED"
+
+          if [ -n "$OFFENDING" ]; then
+            echo "::error::Credential-shaped strings detected in diff additions:"
+            # Entries are joined with real newlines and printed through a
+            # fixed '%s' format, so neither `%` nor backslash sequences in
+            # a filename are interpreted on the way to the log.
+            printf '%s' "$OFFENDING"
+            echo ""
+            echo "The actual matched values are NOT echoed here, deliberately —"
+            echo "round-tripping a leaked credential into CI logs widens the blast"
+            echo "radius (logs are searchable + retained)."
+            echo ""
+            echo "Recovery:"
+            echo " 1. Remove the secret from the file. Replace with an env var"
+            # The `$` and `{{` are emitted from separate quoted pieces so the
+            # workflow expression syntax never appears verbatim in this
+            # script: the runner substitutes expressions in `run:` text
+            # before bash starts, and a backslash does not stop that.
+            echo " reference (e.g. \$""{{ secrets.GITHUB_TOKEN }} in workflows,"
+            echo " process.env.X in code)."
+            echo " 2. If the credential was already pushed (this PR's commit"
+            echo " history reaches a public ref), treat it as compromised —"
+            echo " ROTATE it immediately, do not just remove it. The token"
+            echo " remains valid in git history forever and may be in any"
+            echo " log/cache that consumed this branch."
+            echo " 3. Force-push the cleaned commit (or stack a revert) and"
+            echo " re-run CI."
+            echo ""
+            echo "If the match is a false positive (test fixture, docs example,"
+            echo "or this workflow's own regex literals): use a clearly-fake"
+            echo "placeholder like ghs_EXAMPLE_DO_NOT_USE that doesn't satisfy"
+            echo "the length suffix, OR add the file path to the SELF exclude"
+            echo "list in this workflow with a short reason."
+            echo ""
+            echo "Mirror of the regex set lives in the runtime's bundled"
+            echo "pre-commit hook (molecule-ai-workspace-runtime:"
+            echo "molecule_runtime/scripts/pre-commit-checks.sh) — keep aligned."
+            exit 1
+          fi
+
+          echo "✓ No credential-shaped strings in this change."