Merge pull request 'ci(audit-force-merge): fan §SOP-6 force-merge audit to molecule-core' (#150) from fan/audit-force-merge into main
All checks were successful
Secret scan / Scan diff for credential-shaped strings (push) Successful in 4s

This commit is contained in:
Molecule AI · dev-lead 2026-05-09 03:13:26 +00:00
commit c0abbe33ef
3 changed files with 367 additions and 0 deletions

View File

@ -0,0 +1,118 @@
#!/usr/bin/env bash
# audit-force-merge — detect a §SOP-6 force-merge after PR close, emit
# `incident.force_merge` to stdout as structured JSON.
#
# Vector's docker_logs source picks up runner stdout; the JSON gets
# shipped to Loki on molecule-canonical-obs, indexable by event_type.
# Query example:
#
# {host="operator"} |= "event_type" |= "incident.force_merge" | json
#
# A force-merge is detected when a PR closed-with-merged=true had at
# least one of the repo's required-status-check contexts in a state
# other than "success" at the merge commit's SHA. That's exactly what
# the Gitea force_merge:true API call lets through, so it's a faithful
# detector of the override path.
#
# Triggers on `pull_request_target: closed` (loaded from base branch
# per §SOP-6 security model). No-op when merged=false.
#
# Required env (set by the workflow):
# GITEA_TOKEN, GITEA_HOST, REPO, PR_NUMBER, REQUIRED_CHECKS
#
# REQUIRED_CHECKS is a newline-separated list of status-check context
# names that branch protection requires. Declared in the workflow YAML
# rather than fetched from /branch_protections (which needs admin
# scope — sop-tier-bot has read-only). Trade dynamism for simplicity:
# when the required-check set changes, update both branch protection
# AND this env. Keeping them in sync is less complexity than granting
# the audit bot admin perms on every repo.
set -euo pipefail
: "${GITEA_TOKEN:?required}"
: "${GITEA_HOST:?required}"
: "${REPO:?required}"
: "${PR_NUMBER:?required}"
: "${REQUIRED_CHECKS:?required (newline-separated context names)}"
OWNER="${REPO%%/*}"
NAME="${REPO##*/}"
API="https://${GITEA_HOST}/api/v1"
AUTH="Authorization: token ${GITEA_TOKEN}"
# 1. Fetch the PR. If not merged, no-op.
PR=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}")
MERGED=$(echo "$PR" | jq -r '.merged // false')
if [ "$MERGED" != "true" ]; then
echo "::notice::PR #${PR_NUMBER} closed without merge — no audit emission."
exit 0
fi
MERGE_SHA=$(echo "$PR" | jq -r '.merge_commit_sha // empty')
MERGED_BY=$(echo "$PR" | jq -r '.merged_by.login // "unknown"')
TITLE=$(echo "$PR" | jq -r '.title // ""')
BASE_BRANCH=$(echo "$PR" | jq -r '.base.ref // "main"')
HEAD_SHA=$(echo "$PR" | jq -r '.head.sha // empty')
if [ -z "$MERGE_SHA" ]; then
echo "::warning::PR #${PR_NUMBER} merged=true but no merge_commit_sha — cannot evaluate force-merge."
exit 0
fi
# 2. Required status checks declared in the workflow env.
REQUIRED="$REQUIRED_CHECKS"
if [ -z "${REQUIRED//[[:space:]]/}" ]; then
echo "::notice::REQUIRED_CHECKS empty — force-merge not applicable."
exit 0
fi
# 3. Status-check state at the PR HEAD (where checks ran). The merge
# commit doesn't get its own checks; we evaluate the PR's last
# commit, which is what branch protection compared against.
STATUS=$(curl -sS -H "$AUTH" \
"${API}/repos/${OWNER}/${NAME}/commits/${HEAD_SHA}/status")
declare -A CHECK_STATE
while IFS=$'\t' read -r ctx state; do
[ -n "$ctx" ] && CHECK_STATE[$ctx]="$state"
done < <(echo "$STATUS" | jq -r '.statuses // [] | .[] | "\(.context)\t\(.status)"')
# 4. For each required check, was it green at merge? YAML block scalars
# (`|`) leave a trailing newline; skip blank/whitespace-only lines.
FAILED_CHECKS=()
while IFS= read -r req; do
trimmed="${req#"${req%%[![:space:]]*}"}" # ltrim
trimmed="${trimmed%"${trimmed##*[![:space:]]}"}" # rtrim
[ -z "$trimmed" ] && continue
state="${CHECK_STATE[$trimmed]:-missing}"
if [ "$state" != "success" ]; then
FAILED_CHECKS+=("${trimmed}=${state}")
fi
done <<< "$REQUIRED"
if [ "${#FAILED_CHECKS[@]}" -eq 0 ]; then
echo "::notice::PR #${PR_NUMBER} merged with all required checks green — not a force-merge."
exit 0
fi
# 5. Emit structured audit event.
NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
FAILED_JSON=$(printf '%s\n' "${FAILED_CHECKS[@]}" | jq -R . | jq -s .)
# Print as a single-line JSON so Vector's parse_json transform can pick
# it up cleanly from docker_logs.
jq -nc \
--arg event_type "incident.force_merge" \
--arg ts "$NOW" \
--arg repo "$REPO" \
--argjson pr "$PR_NUMBER" \
--arg title "$TITLE" \
--arg base "$BASE_BRANCH" \
--arg merged_by "$MERGED_BY" \
--arg merge_sha "$MERGE_SHA" \
--argjson failed_checks "$FAILED_JSON" \
'{event_type: $event_type, ts: $ts, repo: $repo, pr: $pr, title: $title,
base_branch: $base, merged_by: $merged_by, merge_sha: $merge_sha,
failed_checks: $failed_checks}'
echo "::warning::FORCE-MERGE detected on PR #${PR_NUMBER} by ${MERGED_BY}: ${#FAILED_CHECKS[@]} required check(s) not green at merge time."

View File

@ -0,0 +1,58 @@
# audit-force-merge — emit `incident.force_merge` to runner stdout when
# a PR is merged with required-status-checks not green. Vector picks
# the JSON line off docker_logs and ships to Loki on
# molecule-canonical-obs (per `reference_obs_stack_phase1`); query as:
#
# {host="operator"} |= "event_type" |= "incident.force_merge" | json
#
# Closes the §SOP-6 audit gap (the doc says force-merges write to
# `structure_events`, but that table lives in the platform DB, not
# Gitea-side; Loki is the practical equivalent for Gitea Actions
# events). When the credential / observability stack converges later,
# this can sync into structure_events from Loki via a backfill job —
# the structured JSON shape is forward-compatible.
#
# Logic in `.gitea/scripts/audit-force-merge.sh` per the same script-
# extract pattern as sop-tier-check.
name: audit-force-merge
# pull_request_target loads from the base branch — same security model
# as sop-tier-check. Without this, an attacker could rewrite the
# workflow on a PR and skip the audit emission for their own
# force-merge. See `.gitea/workflows/sop-tier-check.yml` for the full
# rationale.
on:
pull_request_target:
types: [closed]
jobs:
audit:
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: read
# Skip when PR is closed without merge — saves a runner.
if: github.event.pull_request.merged == true
steps:
- name: Check out base branch (for the script)
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
ref: ${{ github.event.pull_request.base.sha }}
- name: Detect force-merge + emit audit event
env:
# Same org-level secret the sop-tier-check workflow uses.
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
GITEA_HOST: git.moleculesai.app
REPO: ${{ github.repository }}
PR_NUMBER: ${{ github.event.pull_request.number }}
# Required-status-check contexts to evaluate at merge time.
# Newline-separated. Mirror this against branch protection
# (settings → branches → protected branch → required checks).
# Declared here rather than fetched from /branch_protections
# because that endpoint requires admin write — sop-tier-bot is
# read-only by design (least-privilege).
REQUIRED_CHECKS: |
sop-tier-check / tier-check (pull_request)
Secret scan / Scan diff for credential-shaped strings (pull_request)
run: bash .gitea/scripts/audit-force-merge.sh

View File

@ -0,0 +1,191 @@
name: Secret scan
# Hard CI gate. Refuses any PR / push whose diff additions contain a
# recognisable credential. Defense-in-depth for the #2090-class incident
# (2026-04-24): GitHub's hosted Copilot Coding Agent leaked a ghs_*
# installation token into tenant-proxy/package.json via `npm init`
# slurping the URL from a token-embedded origin remote. We can't fix
# upstream's clone hygiene, so we gate here.
#
# Same regex set as the runtime's bundled pre-commit hook
# (molecule-ai-workspace-runtime: molecule_runtime/scripts/pre-commit-checks.sh).
# Keep the two sides aligned when adding patterns.
#
# Ported from .github/workflows/secret-scan.yml so the gate actually
# fires on Gitea Actions. Differences from the GitHub version:
# - drops `merge_group` event (Gitea has no merge queue)
# - drops `workflow_call` (no cross-repo reusable invocation on Gitea)
# - SELF path updated to .gitea/workflows/secret-scan.yml
# The job name + step name are identical to the GitHub workflow so the
# status-check context (`Secret scan / Scan diff for credential-shaped
# strings (pull_request)`) matches branch protection on molecule-core/main.
on:
pull_request:
types: [opened, synchronize, reopened]
push:
branches: [main, staging]
jobs:
scan:
name: Scan diff for credential-shaped strings
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 2 # need previous commit to diff against on push events
# For pull_request events the diff base may be many commits behind
# HEAD and absent from the shallow clone. Fetch it explicitly.
- name: Fetch PR base SHA (pull_request events only)
if: github.event_name == 'pull_request'
run: git fetch --depth=1 origin ${{ github.event.pull_request.base.sha }}
- name: Refuse if credential-shaped strings appear in diff additions
env:
# Plumb event-specific SHAs through env so the script doesn't
# need conditional `${{ ... }}` interpolation per event type.
# github.event.before/after only exist on push events;
# pull_request has pull_request.base.sha / pull_request.head.sha.
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
PUSH_BEFORE: ${{ github.event.before }}
PUSH_AFTER: ${{ github.event.after }}
run: |
# Pattern set covers GitHub family (the actual #2090 vector),
# Anthropic / OpenAI / Slack / AWS. Anchored on prefixes with low
# false-positive rates against agent-generated content. Mirror of
# molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh
# — keep aligned.
SECRET_PATTERNS=(
'ghp_[A-Za-z0-9]{36,}' # GitHub PAT (classic)
'ghs_[A-Za-z0-9]{36,}' # GitHub App installation token
'gho_[A-Za-z0-9]{36,}' # GitHub OAuth user-to-server
'ghu_[A-Za-z0-9]{36,}' # GitHub OAuth user
'ghr_[A-Za-z0-9]{36,}' # GitHub OAuth refresh
'github_pat_[A-Za-z0-9_]{82,}' # GitHub fine-grained PAT
'sk-ant-[A-Za-z0-9_-]{40,}' # Anthropic API key
'sk-proj-[A-Za-z0-9_-]{40,}' # OpenAI project key
'sk-svcacct-[A-Za-z0-9_-]{40,}' # OpenAI service-account key
'sk-cp-[A-Za-z0-9_-]{60,}' # MiniMax API key (F1088 vector — caught only after the fact)
'xox[baprs]-[A-Za-z0-9-]{20,}' # Slack tokens
'AKIA[0-9A-Z]{16}' # AWS access key ID
'ASIA[0-9A-Z]{16}' # AWS STS temp access key ID
)
# Determine the diff base. Each event type stores its SHAs in
# a different place — see the env block above.
case "${{ github.event_name }}" in
pull_request)
BASE="$PR_BASE_SHA"
HEAD="$PR_HEAD_SHA"
;;
*)
BASE="$PUSH_BEFORE"
HEAD="$PUSH_AFTER"
;;
esac
# On push events with shallow clones, BASE may be present in
# the event payload but absent from the local object DB
# (fetch-depth=2 doesn't always reach the previous commit
# across true merges). Try fetching it on demand. If the
# fetch fails — e.g. the SHA was force-overwritten — we fall
# through to the empty-BASE branch below, which scans the
# entire tree as if every file were new. Correct, just slow.
if [ -n "$BASE" ] && ! echo "$BASE" | grep -qE '^0+$'; then
if ! git cat-file -e "$BASE" 2>/dev/null; then
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
fi
fi
# Files added or modified in this change.
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$' || ! git cat-file -e "$BASE" 2>/dev/null; then
# New branch / no previous SHA / BASE unreachable — check the
# entire tree as added content. Slower, but correct on first
# push.
CHANGED=$(git ls-tree -r --name-only HEAD)
DIFF_RANGE=""
else
CHANGED=$(git diff --name-only --diff-filter=AM "$BASE" "$HEAD")
DIFF_RANGE="$BASE $HEAD"
fi
if [ -z "$CHANGED" ]; then
echo "No changed files to inspect."
exit 0
fi
# Self-exclude: this workflow file legitimately contains the
# pattern strings as regex literals. Without an exclude it would
# block its own merge. Both the .github/ original and this
# .gitea/ port are excluded so a sync between them stays clean.
SELF_GITHUB=".github/workflows/secret-scan.yml"
SELF_GITEA=".gitea/workflows/secret-scan.yml"
OFFENDING=""
# `while IFS= read -r` (not `for f in $CHANGED`) so filenames
# containing whitespace don't word-split silently — a path
# with a space would otherwise produce two iterations on
# tokens that aren't real filenames, breaking the
# self-exclude + diff lookup.
while IFS= read -r f; do
[ -z "$f" ] && continue
[ "$f" = "$SELF_GITHUB" ] && continue
[ "$f" = "$SELF_GITEA" ] && continue
if [ -n "$DIFF_RANGE" ]; then
ADDED=$(git diff --no-color --unified=0 "$BASE" "$HEAD" -- "$f" 2>/dev/null | grep -E '^\+[^+]' || true)
else
# No diff range (new branch first push) — scan the full file
# contents as if every line were new.
ADDED=$(cat "$f" 2>/dev/null || true)
fi
[ -z "$ADDED" ] && continue
for pattern in "${SECRET_PATTERNS[@]}"; do
if echo "$ADDED" | grep -qE "$pattern"; then
OFFENDING="${OFFENDING}${f} (matched: ${pattern})\n"
break
fi
done
done <<< "$CHANGED"
if [ -n "$OFFENDING" ]; then
echo "::error::Credential-shaped strings detected in diff additions:"
# `printf '%b' "$OFFENDING"` interprets backslash escapes
# (the literal `\n` we appended above becomes a newline)
# WITHOUT treating OFFENDING as a format string. Plain
# `printf "$OFFENDING"` is a format-string sink: a filename
# containing `%` would be interpreted as a conversion
# specifier, corrupting the error message (or printing
# `%(missing)` artifacts).
printf '%b' "$OFFENDING"
echo ""
echo "The actual matched values are NOT echoed here, deliberately —"
echo "round-tripping a leaked credential into CI logs widens the blast"
echo "radius (logs are searchable + retained)."
echo ""
echo "Recovery:"
echo " 1. Remove the secret from the file. Replace with an env var"
echo " reference (e.g. \${{ secrets.GITHUB_TOKEN }} in workflows,"
echo " process.env.X in code)."
echo " 2. If the credential was already pushed (this PR's commit"
echo " history reaches a public ref), treat it as compromised —"
echo " ROTATE it immediately, do not just remove it. The token"
echo " remains valid in git history forever and may be in any"
echo " log/cache that consumed this branch."
echo " 3. Force-push the cleaned commit (or stack a revert) and"
echo " re-run CI."
echo ""
echo "If the match is a false positive (test fixture, docs example,"
echo "or this workflow's own regex literals): use a clearly-fake"
echo "placeholder like ghs_EXAMPLE_DO_NOT_USE that doesn't satisfy"
echo "the length suffix, OR add the file path to the SELF exclude"
echo "list in this workflow with a short reason."
echo ""
echo "Mirror of the regex set lives in the runtime's bundled"
echo "pre-commit hook (molecule-ai-workspace-runtime:"
echo "molecule_runtime/scripts/pre-commit-checks.sh) — keep aligned."
exit 1
fi
echo "✓ No credential-shaped strings in this change."