Compare commits
1 Commits
main
...
feat/githu
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d03fec794e |
@ -1,118 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# audit-force-merge — detect a §SOP-6 force-merge after PR close, emit
|
||||
# `incident.force_merge` to stdout as structured JSON.
|
||||
#
|
||||
# Vector's docker_logs source picks up runner stdout; the JSON gets
|
||||
# shipped to Loki on molecule-canonical-obs, indexable by event_type.
|
||||
# Query example:
|
||||
#
|
||||
# {host="operator"} |= "event_type" |= "incident.force_merge" | json
|
||||
#
|
||||
# A force-merge is detected when a PR closed-with-merged=true had at
|
||||
# least one of the repo's required-status-check contexts in a state
|
||||
# other than "success" at the merge commit's SHA. That's exactly what
|
||||
# the Gitea force_merge:true API call lets through, so it's a faithful
|
||||
# detector of the override path.
|
||||
#
|
||||
# Triggers on `pull_request_target: closed` (loaded from base branch
|
||||
# per §SOP-6 security model). No-op when merged=false.
|
||||
#
|
||||
# Required env (set by the workflow):
|
||||
# GITEA_TOKEN, GITEA_HOST, REPO, PR_NUMBER, REQUIRED_CHECKS
|
||||
#
|
||||
# REQUIRED_CHECKS is a newline-separated list of status-check context
|
||||
# names that branch protection requires. Declared in the workflow YAML
|
||||
# rather than fetched from /branch_protections (which needs admin
|
||||
# scope — sop-tier-bot has read-only). Trade dynamism for simplicity:
|
||||
# when the required-check set changes, update both branch protection
|
||||
# AND this env. Keeping them in sync is less complexity than granting
|
||||
# the audit bot admin perms on every repo.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
: "${GITEA_TOKEN:?required}"
|
||||
: "${GITEA_HOST:?required}"
|
||||
: "${REPO:?required}"
|
||||
: "${PR_NUMBER:?required}"
|
||||
: "${REQUIRED_CHECKS:?required (newline-separated context names)}"
|
||||
|
||||
OWNER="${REPO%%/*}"
|
||||
NAME="${REPO##*/}"
|
||||
API="https://${GITEA_HOST}/api/v1"
|
||||
AUTH="Authorization: token ${GITEA_TOKEN}"
|
||||
|
||||
# 1. Fetch the PR. If not merged, no-op.
|
||||
PR=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}")
|
||||
MERGED=$(echo "$PR" | jq -r '.merged // false')
|
||||
if [ "$MERGED" != "true" ]; then
|
||||
echo "::notice::PR #${PR_NUMBER} closed without merge — no audit emission."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
MERGE_SHA=$(echo "$PR" | jq -r '.merge_commit_sha // empty')
|
||||
MERGED_BY=$(echo "$PR" | jq -r '.merged_by.login // "unknown"')
|
||||
TITLE=$(echo "$PR" | jq -r '.title // ""')
|
||||
BASE_BRANCH=$(echo "$PR" | jq -r '.base.ref // "main"')
|
||||
HEAD_SHA=$(echo "$PR" | jq -r '.head.sha // empty')
|
||||
|
||||
if [ -z "$MERGE_SHA" ]; then
|
||||
echo "::warning::PR #${PR_NUMBER} merged=true but no merge_commit_sha — cannot evaluate force-merge."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# 2. Required status checks declared in the workflow env.
|
||||
REQUIRED="$REQUIRED_CHECKS"
|
||||
if [ -z "${REQUIRED//[[:space:]]/}" ]; then
|
||||
echo "::notice::REQUIRED_CHECKS empty — force-merge not applicable."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# 3. Status-check state at the PR HEAD (where checks ran). The merge
|
||||
# commit doesn't get its own checks; we evaluate the PR's last
|
||||
# commit, which is what branch protection compared against.
|
||||
STATUS=$(curl -sS -H "$AUTH" \
|
||||
"${API}/repos/${OWNER}/${NAME}/commits/${HEAD_SHA}/status")
|
||||
declare -A CHECK_STATE
|
||||
while IFS=$'\t' read -r ctx state; do
|
||||
[ -n "$ctx" ] && CHECK_STATE[$ctx]="$state"
|
||||
done < <(echo "$STATUS" | jq -r '.statuses // [] | .[] | "\(.context)\t\(.status)"')
|
||||
|
||||
# 4. For each required check, was it green at merge? YAML block scalars
|
||||
# (`|`) leave a trailing newline; skip blank/whitespace-only lines.
|
||||
FAILED_CHECKS=()
|
||||
while IFS= read -r req; do
|
||||
trimmed="${req#"${req%%[![:space:]]*}"}" # ltrim
|
||||
trimmed="${trimmed%"${trimmed##*[![:space:]]}"}" # rtrim
|
||||
[ -z "$trimmed" ] && continue
|
||||
state="${CHECK_STATE[$trimmed]:-missing}"
|
||||
if [ "$state" != "success" ]; then
|
||||
FAILED_CHECKS+=("${trimmed}=${state}")
|
||||
fi
|
||||
done <<< "$REQUIRED"
|
||||
|
||||
if [ "${#FAILED_CHECKS[@]}" -eq 0 ]; then
|
||||
echo "::notice::PR #${PR_NUMBER} merged with all required checks green — not a force-merge."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# 5. Emit structured audit event.
|
||||
NOW=$(date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||
FAILED_JSON=$(printf '%s\n' "${FAILED_CHECKS[@]}" | jq -R . | jq -s .)
|
||||
|
||||
# Print as a single-line JSON so Vector's parse_json transform can pick
|
||||
# it up cleanly from docker_logs.
|
||||
jq -nc \
|
||||
--arg event_type "incident.force_merge" \
|
||||
--arg ts "$NOW" \
|
||||
--arg repo "$REPO" \
|
||||
--argjson pr "$PR_NUMBER" \
|
||||
--arg title "$TITLE" \
|
||||
--arg base "$BASE_BRANCH" \
|
||||
--arg merged_by "$MERGED_BY" \
|
||||
--arg merge_sha "$MERGE_SHA" \
|
||||
--argjson failed_checks "$FAILED_JSON" \
|
||||
'{event_type: $event_type, ts: $ts, repo: $repo, pr: $pr, title: $title,
|
||||
base_branch: $base, merged_by: $merged_by, merge_sha: $merge_sha,
|
||||
failed_checks: $failed_checks}'
|
||||
|
||||
echo "::warning::FORCE-MERGE detected on PR #${PR_NUMBER} by ${MERGED_BY}: ${#FAILED_CHECKS[@]} required check(s) not green at merge time."
|
||||
@ -1,149 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# sop-tier-check — verify a Gitea PR satisfies the §SOP-6 approval gate.
|
||||
#
|
||||
# Reads the PR's tier label, walks approving reviewers, and checks each
|
||||
# approver's Gitea team membership against the tier's eligible-team set.
|
||||
# Marks pass only when at least one non-author approver is in an eligible
|
||||
# team.
|
||||
#
|
||||
# Invoked from `.gitea/workflows/sop-tier-check.yml`. The workflow sets
|
||||
# the env vars below; this script does no IO outside of stdout/stderr +
|
||||
# the Gitea API.
|
||||
#
|
||||
# Required env:
|
||||
# GITEA_TOKEN — bot PAT with read:organization,read:user,
|
||||
# read:issue,read:repository scopes
|
||||
# GITEA_HOST — e.g. git.moleculesai.app
|
||||
# REPO — owner/name (from github.repository)
|
||||
# PR_NUMBER — int (from github.event.pull_request.number)
|
||||
# PR_AUTHOR — login (from github.event.pull_request.user.login)
|
||||
#
|
||||
# Optional:
|
||||
# SOP_DEBUG=1 — print per-API-call diagnostic lines (HTTP codes,
|
||||
# raw response bodies). Default: off.
|
||||
#
|
||||
# Stale-status caveat: Gitea Actions does not always re-fire workflows
|
||||
# on `labeled` / `pull_request_review:submitted` events. If the
|
||||
# sop-tier-check status is stale (e.g. red after labels/approvals were
|
||||
# added), push an empty commit to the PR branch to force a synchronize
|
||||
# event, OR re-request reviews. Tracked: internal#46.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
debug() {
|
||||
if [ "${SOP_DEBUG:-}" = "1" ]; then
|
||||
echo " [debug] $*" >&2
|
||||
fi
|
||||
}
|
||||
|
||||
# Validate env
|
||||
: "${GITEA_TOKEN:?GITEA_TOKEN required}"
|
||||
: "${GITEA_HOST:?GITEA_HOST required}"
|
||||
: "${REPO:?REPO required (owner/name)}"
|
||||
: "${PR_NUMBER:?PR_NUMBER required}"
|
||||
: "${PR_AUTHOR:?PR_AUTHOR required}"
|
||||
|
||||
OWNER="${REPO%%/*}"
|
||||
NAME="${REPO##*/}"
|
||||
API="https://${GITEA_HOST}/api/v1"
|
||||
AUTH="Authorization: token ${GITEA_TOKEN}"
|
||||
echo "::notice::tier-check start: repo=$OWNER/$NAME pr=$PR_NUMBER author=$PR_AUTHOR"
|
||||
|
||||
# Sanity: token resolves to a user
|
||||
WHOAMI=$(curl -sS -H "$AUTH" "${API}/user" | jq -r '.login // ""')
|
||||
if [ -z "$WHOAMI" ]; then
|
||||
echo "::error::GITEA_TOKEN cannot resolve a user via /api/v1/user — check the token scope and that the secret is wired correctly."
|
||||
exit 1
|
||||
fi
|
||||
echo "::notice::token resolves to user: $WHOAMI"
|
||||
|
||||
# 1. Read tier label
|
||||
LABELS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/issues/${PR_NUMBER}/labels" | jq -r '.[].name')
|
||||
TIER=""
|
||||
for L in $LABELS; do
|
||||
case "$L" in
|
||||
tier:low|tier:medium|tier:high)
|
||||
if [ -n "$TIER" ]; then
|
||||
echo "::error::Multiple tier labels: $TIER + $L. Apply exactly one."
|
||||
exit 1
|
||||
fi
|
||||
TIER="$L"
|
||||
;;
|
||||
esac
|
||||
done
|
||||
if [ -z "$TIER" ]; then
|
||||
echo "::error::PR has no tier:low|tier:medium|tier:high label. Apply one before merge."
|
||||
exit 1
|
||||
fi
|
||||
debug "tier=$TIER"
|
||||
|
||||
# 2. Tier → eligible teams
|
||||
case "$TIER" in
|
||||
tier:low) ELIGIBLE="engineers managers ceo" ;;
|
||||
tier:medium) ELIGIBLE="managers ceo" ;;
|
||||
tier:high) ELIGIBLE="ceo" ;;
|
||||
esac
|
||||
debug "eligible_teams=$ELIGIBLE"
|
||||
|
||||
# Resolve team-name → team-id once. /orgs/{org}/teams/{slug}/... endpoints
|
||||
# don't exist on Gitea 1.22; we have to use /teams/{id}.
|
||||
ORG_TEAMS_FILE=$(mktemp)
|
||||
trap 'rm -f "$ORG_TEAMS_FILE"' EXIT
|
||||
HTTP_CODE=$(curl -sS -o "$ORG_TEAMS_FILE" -w '%{http_code}' -H "$AUTH" \
|
||||
"${API}/orgs/${OWNER}/teams")
|
||||
debug "teams-list HTTP=$HTTP_CODE size=$(wc -c <"$ORG_TEAMS_FILE")"
|
||||
if [ "${SOP_DEBUG:-}" = "1" ]; then
|
||||
echo " [debug] teams-list body (first 300 chars):" >&2
|
||||
head -c 300 "$ORG_TEAMS_FILE" >&2; echo >&2
|
||||
fi
|
||||
if [ "$HTTP_CODE" != "200" ]; then
|
||||
echo "::error::GET /orgs/${OWNER}/teams returned HTTP $HTTP_CODE — token likely lacks read:org scope. Add a SOP_TIER_CHECK_TOKEN secret with read:organization scope at the org level."
|
||||
exit 1
|
||||
fi
|
||||
declare -A TEAM_ID
|
||||
for T in $ELIGIBLE; do
|
||||
ID=$(jq -r --arg t "$T" '.[] | select(.name==$t) | .id' <"$ORG_TEAMS_FILE" | head -1)
|
||||
if [ -z "$ID" ] || [ "$ID" = "null" ]; then
|
||||
VISIBLE=$(jq -r '.[]?.name? // empty' <"$ORG_TEAMS_FILE" 2>/dev/null | tr '\n' ' ')
|
||||
echo "::error::Team \"$T\" not found in org $OWNER. Teams visible: $VISIBLE"
|
||||
exit 1
|
||||
fi
|
||||
TEAM_ID[$T]="$ID"
|
||||
debug "team-id: $T → $ID"
|
||||
done
|
||||
|
||||
# 3. Read approving reviewers
|
||||
REVIEWS=$(curl -sS -H "$AUTH" "${API}/repos/${OWNER}/${NAME}/pulls/${PR_NUMBER}/reviews")
|
||||
APPROVERS=$(echo "$REVIEWS" | jq -r '[.[] | select(.state=="APPROVED") | .user.login] | unique | .[]')
|
||||
if [ -z "$APPROVERS" ]; then
|
||||
echo "::error::No approving reviews. Tier $TIER requires approval from {$ELIGIBLE} (non-author)."
|
||||
exit 1
|
||||
fi
|
||||
debug "approvers: $(echo "$APPROVERS" | tr '\n' ' ')"
|
||||
|
||||
# 4. For each approver: check non-author + team membership (by id)
|
||||
OK=""
|
||||
for U in $APPROVERS; do
|
||||
if [ "$U" = "$PR_AUTHOR" ]; then
|
||||
debug "skip self-review by $U"
|
||||
continue
|
||||
fi
|
||||
for T in $ELIGIBLE; do
|
||||
ID="${TEAM_ID[$T]}"
|
||||
CODE=$(curl -sS -o /dev/null -w '%{http_code}' -H "$AUTH" \
|
||||
"${API}/teams/${ID}/members/${U}")
|
||||
debug "probe: $U in team $T (id=$ID) → HTTP $CODE"
|
||||
if [ "$CODE" = "200" ] || [ "$CODE" = "204" ]; then
|
||||
echo "::notice::approver $U is in team $T (eligible for $TIER)"
|
||||
OK="yes"
|
||||
break
|
||||
fi
|
||||
done
|
||||
[ -n "$OK" ] && break
|
||||
done
|
||||
|
||||
if [ -z "$OK" ]; then
|
||||
echo "::error::Tier $TIER requires approval from a non-author member of {$ELIGIBLE}. Got approvers: $APPROVERS — none of them satisfied team membership. Set SOP_DEBUG=1 to see per-probe HTTP codes."
|
||||
exit 1
|
||||
fi
|
||||
echo "::notice::sop-tier-check passed: $TIER, approver in {$ELIGIBLE}"
|
||||
@ -1,58 +0,0 @@
|
||||
# audit-force-merge — emit `incident.force_merge` to runner stdout when
|
||||
# a PR is merged with required-status-checks not green. Vector picks
|
||||
# the JSON line off docker_logs and ships to Loki on
|
||||
# molecule-canonical-obs (per `reference_obs_stack_phase1`); query as:
|
||||
#
|
||||
# {host="operator"} |= "event_type" |= "incident.force_merge" | json
|
||||
#
|
||||
# Closes the §SOP-6 audit gap (the doc says force-merges write to
|
||||
# `structure_events`, but that table lives in the platform DB, not
|
||||
# Gitea-side; Loki is the practical equivalent for Gitea Actions
|
||||
# events). When the credential / observability stack converges later,
|
||||
# this can sync into structure_events from Loki via a backfill job —
|
||||
# the structured JSON shape is forward-compatible.
|
||||
#
|
||||
# Logic in `.gitea/scripts/audit-force-merge.sh` per the same script-
|
||||
# extract pattern as sop-tier-check.
|
||||
|
||||
name: audit-force-merge
|
||||
|
||||
# pull_request_target loads from the base branch — same security model
|
||||
# as sop-tier-check. Without this, an attacker could rewrite the
|
||||
# workflow on a PR and skip the audit emission for their own
|
||||
# force-merge. See `.gitea/workflows/sop-tier-check.yml` for the full
|
||||
# rationale.
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [closed]
|
||||
|
||||
jobs:
|
||||
audit:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read
|
||||
# Skip when PR is closed without merge — saves a runner.
|
||||
if: github.event.pull_request.merged == true
|
||||
steps:
|
||||
- name: Check out base branch (for the script)
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.base.sha }}
|
||||
- name: Detect force-merge + emit audit event
|
||||
env:
|
||||
# Same org-level secret the sop-tier-check workflow uses.
|
||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
GITEA_HOST: git.moleculesai.app
|
||||
REPO: ${{ github.repository }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
# Required-status-check contexts to evaluate at merge time.
|
||||
# Newline-separated. Mirror this against branch protection
|
||||
# (settings → branches → protected branch → required checks).
|
||||
# Declared here rather than fetched from /branch_protections
|
||||
# because that endpoint requires admin write — sop-tier-bot is
|
||||
# read-only by design (least-privilege).
|
||||
REQUIRED_CHECKS: |
|
||||
sop-tier-check / tier-check (pull_request)
|
||||
Secret scan / Scan diff for credential-shaped strings (pull_request)
|
||||
run: bash .gitea/scripts/audit-force-merge.sh
|
||||
@ -1,191 +0,0 @@
|
||||
name: Secret scan
|
||||
|
||||
# Hard CI gate. Refuses any PR / push whose diff additions contain a
|
||||
# recognisable credential. Defense-in-depth for the #2090-class incident
|
||||
# (2026-04-24): GitHub's hosted Copilot Coding Agent leaked a ghs_*
|
||||
# installation token into tenant-proxy/package.json via `npm init`
|
||||
# slurping the URL from a token-embedded origin remote. We can't fix
|
||||
# upstream's clone hygiene, so we gate here.
|
||||
#
|
||||
# Same regex set as the runtime's bundled pre-commit hook
|
||||
# (molecule-ai-workspace-runtime: molecule_runtime/scripts/pre-commit-checks.sh).
|
||||
# Keep the two sides aligned when adding patterns.
|
||||
#
|
||||
# Ported from .github/workflows/secret-scan.yml so the gate actually
|
||||
# fires on Gitea Actions. Differences from the GitHub version:
|
||||
# - drops `merge_group` event (Gitea has no merge queue)
|
||||
# - drops `workflow_call` (no cross-repo reusable invocation on Gitea)
|
||||
# - SELF path updated to .gitea/workflows/secret-scan.yml
|
||||
# The job name + step name are identical to the GitHub workflow so the
|
||||
# status-check context (`Secret scan / Scan diff for credential-shaped
|
||||
# strings (pull_request)`) matches branch protection on molecule-core/main.
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
push:
|
||||
branches: [main, staging]
|
||||
|
||||
jobs:
|
||||
scan:
|
||||
name: Scan diff for credential-shaped strings
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
fetch-depth: 2 # need previous commit to diff against on push events
|
||||
|
||||
# For pull_request events the diff base may be many commits behind
|
||||
# HEAD and absent from the shallow clone. Fetch it explicitly.
|
||||
- name: Fetch PR base SHA (pull_request events only)
|
||||
if: github.event_name == 'pull_request'
|
||||
run: git fetch --depth=1 origin ${{ github.event.pull_request.base.sha }}
|
||||
|
||||
- name: Refuse if credential-shaped strings appear in diff additions
|
||||
env:
|
||||
# Plumb event-specific SHAs through env so the script doesn't
|
||||
# need conditional `${{ ... }}` interpolation per event type.
|
||||
# github.event.before/after only exist on push events;
|
||||
# pull_request has pull_request.base.sha / pull_request.head.sha.
|
||||
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
|
||||
PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
|
||||
PUSH_BEFORE: ${{ github.event.before }}
|
||||
PUSH_AFTER: ${{ github.event.after }}
|
||||
run: |
|
||||
# Pattern set covers GitHub family (the actual #2090 vector),
|
||||
# Anthropic / OpenAI / Slack / AWS. Anchored on prefixes with low
|
||||
# false-positive rates against agent-generated content. Mirror of
|
||||
# molecule-ai-workspace-runtime/molecule_runtime/scripts/pre-commit-checks.sh
|
||||
# — keep aligned.
|
||||
SECRET_PATTERNS=(
|
||||
'ghp_[A-Za-z0-9]{36,}' # GitHub PAT (classic)
|
||||
'ghs_[A-Za-z0-9]{36,}' # GitHub App installation token
|
||||
'gho_[A-Za-z0-9]{36,}' # GitHub OAuth user-to-server
|
||||
'ghu_[A-Za-z0-9]{36,}' # GitHub OAuth user
|
||||
'ghr_[A-Za-z0-9]{36,}' # GitHub OAuth refresh
|
||||
'github_pat_[A-Za-z0-9_]{82,}' # GitHub fine-grained PAT
|
||||
'sk-ant-[A-Za-z0-9_-]{40,}' # Anthropic API key
|
||||
'sk-proj-[A-Za-z0-9_-]{40,}' # OpenAI project key
|
||||
'sk-svcacct-[A-Za-z0-9_-]{40,}' # OpenAI service-account key
|
||||
'sk-cp-[A-Za-z0-9_-]{60,}' # MiniMax API key (F1088 vector — caught only after the fact)
|
||||
'xox[baprs]-[A-Za-z0-9-]{20,}' # Slack tokens
|
||||
'AKIA[0-9A-Z]{16}' # AWS access key ID
|
||||
'ASIA[0-9A-Z]{16}' # AWS STS temp access key ID
|
||||
)
|
||||
|
||||
# Determine the diff base. Each event type stores its SHAs in
|
||||
# a different place — see the env block above.
|
||||
case "${{ github.event_name }}" in
|
||||
pull_request)
|
||||
BASE="$PR_BASE_SHA"
|
||||
HEAD="$PR_HEAD_SHA"
|
||||
;;
|
||||
*)
|
||||
BASE="$PUSH_BEFORE"
|
||||
HEAD="$PUSH_AFTER"
|
||||
;;
|
||||
esac
|
||||
|
||||
# On push events with shallow clones, BASE may be present in
|
||||
# the event payload but absent from the local object DB
|
||||
# (fetch-depth=2 doesn't always reach the previous commit
|
||||
# across true merges). Try fetching it on demand. If the
|
||||
# fetch fails — e.g. the SHA was force-overwritten — we fall
|
||||
# through to the empty-BASE branch below, which scans the
|
||||
# entire tree as if every file were new. Correct, just slow.
|
||||
if [ -n "$BASE" ] && ! echo "$BASE" | grep -qE '^0+$'; then
|
||||
if ! git cat-file -e "$BASE" 2>/dev/null; then
|
||||
git fetch --depth=1 origin "$BASE" 2>/dev/null || true
|
||||
fi
|
||||
fi
|
||||
|
||||
# Files added or modified in this change.
|
||||
if [ -z "$BASE" ] || echo "$BASE" | grep -qE '^0+$' || ! git cat-file -e "$BASE" 2>/dev/null; then
|
||||
# New branch / no previous SHA / BASE unreachable — check the
|
||||
# entire tree as added content. Slower, but correct on first
|
||||
# push.
|
||||
CHANGED=$(git ls-tree -r --name-only HEAD)
|
||||
DIFF_RANGE=""
|
||||
else
|
||||
CHANGED=$(git diff --name-only --diff-filter=AM "$BASE" "$HEAD")
|
||||
DIFF_RANGE="$BASE $HEAD"
|
||||
fi
|
||||
|
||||
if [ -z "$CHANGED" ]; then
|
||||
echo "No changed files to inspect."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Self-exclude: this workflow file legitimately contains the
|
||||
# pattern strings as regex literals. Without an exclude it would
|
||||
# block its own merge. Both the .github/ original and this
|
||||
# .gitea/ port are excluded so a sync between them stays clean.
|
||||
SELF_GITHUB=".github/workflows/secret-scan.yml"
|
||||
SELF_GITEA=".gitea/workflows/secret-scan.yml"
|
||||
|
||||
OFFENDING=""
|
||||
# `while IFS= read -r` (not `for f in $CHANGED`) so filenames
|
||||
# containing whitespace don't word-split silently — a path
|
||||
# with a space would otherwise produce two iterations on
|
||||
# tokens that aren't real filenames, breaking the
|
||||
# self-exclude + diff lookup.
|
||||
while IFS= read -r f; do
|
||||
[ -z "$f" ] && continue
|
||||
[ "$f" = "$SELF_GITHUB" ] && continue
|
||||
[ "$f" = "$SELF_GITEA" ] && continue
|
||||
if [ -n "$DIFF_RANGE" ]; then
|
||||
ADDED=$(git diff --no-color --unified=0 "$BASE" "$HEAD" -- "$f" 2>/dev/null | grep -E '^\+[^+]' || true)
|
||||
else
|
||||
# No diff range (new branch first push) — scan the full file
|
||||
# contents as if every line were new.
|
||||
ADDED=$(cat "$f" 2>/dev/null || true)
|
||||
fi
|
||||
[ -z "$ADDED" ] && continue
|
||||
for pattern in "${SECRET_PATTERNS[@]}"; do
|
||||
if echo "$ADDED" | grep -qE "$pattern"; then
|
||||
OFFENDING="${OFFENDING}${f} (matched: ${pattern})\n"
|
||||
break
|
||||
fi
|
||||
done
|
||||
done <<< "$CHANGED"
|
||||
|
||||
if [ -n "$OFFENDING" ]; then
|
||||
echo "::error::Credential-shaped strings detected in diff additions:"
|
||||
# `printf '%b' "$OFFENDING"` interprets backslash escapes
|
||||
# (the literal `\n` we appended above becomes a newline)
|
||||
# WITHOUT treating OFFENDING as a format string. Plain
|
||||
# `printf "$OFFENDING"` is a format-string sink: a filename
|
||||
# containing `%` would be interpreted as a conversion
|
||||
# specifier, corrupting the error message (or printing
|
||||
# `%(missing)` artifacts).
|
||||
printf '%b' "$OFFENDING"
|
||||
echo ""
|
||||
echo "The actual matched values are NOT echoed here, deliberately —"
|
||||
echo "round-tripping a leaked credential into CI logs widens the blast"
|
||||
echo "radius (logs are searchable + retained)."
|
||||
echo ""
|
||||
echo "Recovery:"
|
||||
echo " 1. Remove the secret from the file. Replace with an env var"
|
||||
echo " reference (e.g. \${{ secrets.GITHUB_TOKEN }} in workflows,"
|
||||
echo " process.env.X in code)."
|
||||
echo " 2. If the credential was already pushed (this PR's commit"
|
||||
echo " history reaches a public ref), treat it as compromised —"
|
||||
echo " ROTATE it immediately, do not just remove it. The token"
|
||||
echo " remains valid in git history forever and may be in any"
|
||||
echo " log/cache that consumed this branch."
|
||||
echo " 3. Force-push the cleaned commit (or stack a revert) and"
|
||||
echo " re-run CI."
|
||||
echo ""
|
||||
echo "If the match is a false positive (test fixture, docs example,"
|
||||
echo "or this workflow's own regex literals): use a clearly-fake"
|
||||
echo "placeholder like ghs_EXAMPLE_DO_NOT_USE that doesn't satisfy"
|
||||
echo "the length suffix, OR add the file path to the SELF exclude"
|
||||
echo "list in this workflow with a short reason."
|
||||
echo ""
|
||||
echo "Mirror of the regex set lives in the runtime's bundled"
|
||||
echo "pre-commit hook (molecule-ai-workspace-runtime:"
|
||||
echo "molecule_runtime/scripts/pre-commit-checks.sh) — keep aligned."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "✓ No credential-shaped strings in this change."
|
||||
@ -1,81 +0,0 @@
|
||||
# sop-tier-check — canonical Gitea Actions workflow for §SOP-6 enforcement.
|
||||
#
|
||||
# Logic lives in `.gitea/scripts/sop-tier-check.sh` (extracted 2026-05-09
|
||||
# from the previous inline-bash version). The script is the single source
|
||||
# of truth; this workflow file just sets env + invokes it.
|
||||
#
|
||||
# Copy BOTH files (`.gitea/workflows/sop-tier-check.yml` +
|
||||
# `.gitea/scripts/sop-tier-check.sh`) into any repo that wants the
|
||||
# §SOP-6 PR gate enforced. Pair with branch protection on the protected
|
||||
# branch:
|
||||
# required_status_checks: ["sop-tier-check / tier-check (pull_request)"]
|
||||
# required_approving_reviews: 1
|
||||
# approving_review_teams: ["ceo", "managers", "engineers"]
|
||||
#
|
||||
# Tier → eligible-team mapping (mirror of dev-sop §SOP-6):
|
||||
# tier:low → engineers, managers, ceo
|
||||
# tier:medium → managers, ceo
|
||||
# tier:high → ceo
|
||||
#
|
||||
# Force-merge: Owners-team override remains available out-of-band via
|
||||
# the Gitea merge API; force-merge writes `incident.force_merge` to
|
||||
# `structure_events` per §Persistent structured logging gate (Phase 3).
|
||||
#
|
||||
# Set `SOP_DEBUG: '1'` in the env block to enable per-API-call diagnostic
|
||||
# lines — useful when diagnosing token-scope or team-id-resolution
|
||||
# issues. Default off.
|
||||
|
||||
name: sop-tier-check
|
||||
|
||||
# SECURITY: triggers MUST use `pull_request_target`, not `pull_request`.
|
||||
# `pull_request_target` loads the workflow definition from the BASE
|
||||
# branch (i.e. `main`), not the PR's HEAD. With `pull_request`, anyone
|
||||
# with write access to a feature branch could rewrite this file in
|
||||
# their PR to dump SOP_TIER_CHECK_TOKEN (org-read scope) to logs and
|
||||
# exfiltrate it. Verified 2026-05-09 against Gitea 1.22.6 —
|
||||
# `pull_request_target` (added in Gitea 1.21 via go-gitea/gitea#25229)
|
||||
# is the documented mitigation.
|
||||
#
|
||||
# This workflow does NOT call `actions/checkout` of PR HEAD code, so no
|
||||
# untrusted code is ever executed in the runner — we only HTTP-call the
|
||||
# Gitea API. If a future change adds a checkout step, it MUST pin to
|
||||
# `${{ github.event.pull_request.base.sha }}` (NOT `head.sha`) to keep
|
||||
# the trust boundary.
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, edited, synchronize, reopened, labeled, unlabeled]
|
||||
pull_request_review:
|
||||
types: [submitted, dismissed, edited]
|
||||
|
||||
jobs:
|
||||
tier-check:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read
|
||||
steps:
|
||||
- name: Check out base branch (for the script)
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
# Pin to base.sha — pull_request_target's protection only
|
||||
# works if we never check out PR HEAD. Same SHA the workflow
|
||||
# itself was loaded from.
|
||||
ref: ${{ github.event.pull_request.base.sha }}
|
||||
- name: Verify tier label + reviewer team membership
|
||||
env:
|
||||
# SOP_TIER_CHECK_TOKEN is the org-level secret for the
|
||||
# sop-tier-bot PAT (read:organization,read:user,read:issue,
|
||||
# read:repository). Stored at the org level
|
||||
# (/api/v1/orgs/molecule-ai/actions/secrets) so per-repo
|
||||
# configuration is unnecessary — every repo in the org
|
||||
# picks it up automatically.
|
||||
# Falls back to GITHUB_TOKEN with a clear error if missing.
|
||||
GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
GITEA_HOST: git.moleculesai.app
|
||||
REPO: ${{ github.repository }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
PR_AUTHOR: ${{ github.event.pull_request.user.login }}
|
||||
# Set to '1' for diagnostic per-API-call output. Off by default
|
||||
# so production logs aren't noisy.
|
||||
SOP_DEBUG: '0'
|
||||
run: bash .gitea/scripts/sop-tier-check.sh
|
||||
@ -17,24 +17,6 @@ import { dirname, join } from "node:path";
|
||||
// update one heuristic. Production is unaffected: `output: "standalone"`
|
||||
// bakes resolved env into the build, and the marker file isn't shipped.
|
||||
loadMonorepoEnv();
|
||||
// Boot-time matched-pair guard for ADMIN_TOKEN / NEXT_PUBLIC_ADMIN_TOKEN.
|
||||
// When ADMIN_TOKEN is set on the workspace-server (server-side bearer
|
||||
// gate, wsauth_middleware.go ~L245), the canvas MUST send the matching
|
||||
// NEXT_PUBLIC_ADMIN_TOKEN as `Authorization: Bearer ...` on every API
|
||||
// call. If only one is set, every workspace API call 401s silently —
|
||||
// the canvas hydrates with empty data and the user sees a broken page
|
||||
// with no console hint about the auth-config mismatch.
|
||||
//
|
||||
// Pre-fix the matched-pair contract was descriptive only (a comment in
|
||||
// .env): future devs/agents could re-misconfigure with one of the two
|
||||
// unset and silently 401. Closes the post-PR-#174 self-review gap.
|
||||
//
|
||||
// Warn-only (not exit) — production canvas Docker images bake these
|
||||
// vars into the build at image-build time, and a missed pair there
|
||||
// would still emit the warning at runtime via the standalone server's
|
||||
// startup. Killing the process on misconfiguration would turn a
|
||||
// recoverable auth issue into a hard crashloop.
|
||||
checkAdminTokenPair();
|
||||
|
||||
const nextConfig: NextConfig = {
|
||||
output: "standalone",
|
||||
@ -75,43 +57,6 @@ function loadMonorepoEnv() {
|
||||
);
|
||||
}
|
||||
|
||||
// Boot-time matched-pair guard. Runs after .env has been loaded so the
|
||||
// check sees the post-load state. The two env vars must be set or
|
||||
// unset together; one-without-the-other is the silent-401 footgun.
|
||||
//
|
||||
// Treats empty string ("") as unset. An explicitly-empty `KEY=` in
|
||||
// .env counts as set-to-empty in `process.env`, but for auth purposes
|
||||
// an empty bearer token is equivalent to no token — so both
|
||||
// `ADMIN_TOKEN=` and an unset ADMIN_TOKEN are equivalent relative to
|
||||
// the matched-pair invariant.
|
||||
//
|
||||
// Returns void; side effect is the console.error warning. Kept as a
|
||||
// separate function (exported) so a future test can reset env, call
|
||||
// this, and assert on captured stderr.
|
||||
export function checkAdminTokenPair(): void {
|
||||
const serverSet = !!process.env.ADMIN_TOKEN;
|
||||
const clientSet = !!process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
if (serverSet === clientSet) return;
|
||||
// Distinct messages so the operator can tell which half is missing
|
||||
// — the fix is symmetric (set the other one) but the diagnostic
|
||||
// mentions which side is currently set so they don't have to grep.
|
||||
if (serverSet && !clientSet) {
|
||||
// eslint-disable-next-line no-console
|
||||
console.error(
|
||||
"[next.config] ADMIN_TOKEN is set but NEXT_PUBLIC_ADMIN_TOKEN is not — " +
|
||||
"canvas will 401 against workspace-server because the bearer header " +
|
||||
"is never attached. Set both to the same value, or unset both.",
|
||||
);
|
||||
} else {
|
||||
// eslint-disable-next-line no-console
|
||||
console.error(
|
||||
"[next.config] NEXT_PUBLIC_ADMIN_TOKEN is set but ADMIN_TOKEN is not — " +
|
||||
"workspace-server will reject the bearer because no AdminAuth gate " +
|
||||
"is configured. Set both to the same value, or unset both.",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
function findMonorepoRoot(start: string): string | null {
|
||||
let dir = start;
|
||||
for (let i = 0; i < 6; i++) {
|
||||
|
||||
@ -1,130 +0,0 @@
|
||||
// @vitest-environment node
|
||||
import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
|
||||
|
||||
// Tests for the boot-time matched-pair guard added to next.config.ts.
|
||||
//
|
||||
// Why this lives in src/lib/__tests__ even though the function is in
|
||||
// canvas/next.config.ts:
|
||||
// - next.config.ts runs as ESM-but-also-CJS depending on which
|
||||
// consumer loads it (Next.js dev server vs Next.js build); we
|
||||
// want the test to be a plain ESM module Vitest already handles.
|
||||
// - Importing from "../../../next.config" pulls in the rest of the
|
||||
// file (loadMonorepoEnv, the default export, etc.) which has
|
||||
// side effects on module load (it runs loadMonorepoEnv()
|
||||
// immediately). To keep the test hermetic we don't import — we
|
||||
// duplicate the function under test.
|
||||
//
|
||||
// Sourcing the function from a shared module would be cleaner, but
|
||||
// next.config.ts is required to be a single self-contained file by
|
||||
// Next.js's loader on some host configurations. Pin invariant: the
|
||||
// duplicated function below MUST stay byte-identical to the one in
|
||||
// next.config.ts. If you change one, change the other and bump this
|
||||
// comment.
|
||||
|
||||
function checkAdminTokenPair(): void {
|
||||
const serverSet = !!process.env.ADMIN_TOKEN;
|
||||
const clientSet = !!process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
if (serverSet === clientSet) return;
|
||||
if (serverSet && !clientSet) {
|
||||
// eslint-disable-next-line no-console
|
||||
console.error(
|
||||
"[next.config] ADMIN_TOKEN is set but NEXT_PUBLIC_ADMIN_TOKEN is not — " +
|
||||
"canvas will 401 against workspace-server because the bearer header " +
|
||||
"is never attached. Set both to the same value, or unset both.",
|
||||
);
|
||||
} else {
|
||||
// eslint-disable-next-line no-console
|
||||
console.error(
|
||||
"[next.config] NEXT_PUBLIC_ADMIN_TOKEN is set but ADMIN_TOKEN is not — " +
|
||||
"workspace-server will reject the bearer because no AdminAuth gate " +
|
||||
"is configured. Set both to the same value, or unset both.",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
describe("checkAdminTokenPair", () => {
|
||||
// Snapshot env so individual tests can stomp on it without leaking.
|
||||
// Rebuild from snapshot in afterEach so the next test sees a known
|
||||
// baseline regardless of mutation pattern.
|
||||
let originalEnv: Record<string, string | undefined>;
|
||||
let errorSpy: ReturnType<typeof vi.spyOn>;
|
||||
|
||||
beforeEach(() => {
|
||||
originalEnv = {
|
||||
ADMIN_TOKEN: process.env.ADMIN_TOKEN,
|
||||
NEXT_PUBLIC_ADMIN_TOKEN: process.env.NEXT_PUBLIC_ADMIN_TOKEN,
|
||||
};
|
||||
delete process.env.ADMIN_TOKEN;
|
||||
delete process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
errorSpy = vi.spyOn(console, "error").mockImplementation(() => {});
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
if (originalEnv.ADMIN_TOKEN === undefined) delete process.env.ADMIN_TOKEN;
|
||||
else process.env.ADMIN_TOKEN = originalEnv.ADMIN_TOKEN;
|
||||
if (originalEnv.NEXT_PUBLIC_ADMIN_TOKEN === undefined) delete process.env.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
else process.env.NEXT_PUBLIC_ADMIN_TOKEN = originalEnv.NEXT_PUBLIC_ADMIN_TOKEN;
|
||||
errorSpy.mockRestore();
|
||||
});
|
||||
|
||||
it("emits no warning when both are unset", () => {
|
||||
checkAdminTokenPair();
|
||||
expect(errorSpy).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("emits no warning when both are set (matched pair, the happy path)", () => {
|
||||
process.env.ADMIN_TOKEN = "local-dev-admin";
|
||||
process.env.NEXT_PUBLIC_ADMIN_TOKEN = "local-dev-admin";
|
||||
checkAdminTokenPair();
|
||||
expect(errorSpy).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("warns when ADMIN_TOKEN is set but NEXT_PUBLIC_ADMIN_TOKEN is not", () => {
|
||||
process.env.ADMIN_TOKEN = "local-dev-admin";
|
||||
checkAdminTokenPair();
|
||||
expect(errorSpy).toHaveBeenCalledTimes(1);
|
||||
// Exact-string assertion — substring would also pass when the
|
||||
// function's branch logic is broken (e.g. emits both messages, or
|
||||
// emits the wrong one). Pin the exact message that operators will
|
||||
// see in their dev console so regressions are visible.
|
||||
expect(errorSpy).toHaveBeenCalledWith(
|
||||
"[next.config] ADMIN_TOKEN is set but NEXT_PUBLIC_ADMIN_TOKEN is not — " +
|
||||
"canvas will 401 against workspace-server because the bearer header " +
|
||||
"is never attached. Set both to the same value, or unset both.",
|
||||
);
|
||||
});
|
||||
|
||||
it("warns when NEXT_PUBLIC_ADMIN_TOKEN is set but ADMIN_TOKEN is not", () => {
|
||||
process.env.NEXT_PUBLIC_ADMIN_TOKEN = "local-dev-admin";
|
||||
checkAdminTokenPair();
|
||||
expect(errorSpy).toHaveBeenCalledTimes(1);
|
||||
expect(errorSpy).toHaveBeenCalledWith(
|
||||
"[next.config] NEXT_PUBLIC_ADMIN_TOKEN is set but ADMIN_TOKEN is not — " +
|
||||
"workspace-server will reject the bearer because no AdminAuth gate " +
|
||||
"is configured. Set both to the same value, or unset both.",
|
||||
);
|
||||
});
|
||||
|
||||
// Empty string in process.env is the JS-side representation of `KEY=`
|
||||
// (no value) in a .env file. Treating "" as unset makes the pair
|
||||
// invariant symmetric: `KEY=` and `unset KEY` produce the same
|
||||
// verdict. Without this branch, an operator who comments out the
|
||||
// value but leaves the line would get a false-positive warning.
|
||||
it("treats empty string as unset (so KEY= and unset KEY are equivalent)", () => {
|
||||
process.env.ADMIN_TOKEN = "";
|
||||
process.env.NEXT_PUBLIC_ADMIN_TOKEN = "";
|
||||
checkAdminTokenPair();
|
||||
expect(errorSpy).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("warns when ADMIN_TOKEN is set and NEXT_PUBLIC_ADMIN_TOKEN is empty string", () => {
|
||||
process.env.ADMIN_TOKEN = "local-dev-admin";
|
||||
process.env.NEXT_PUBLIC_ADMIN_TOKEN = "";
|
||||
checkAdminTokenPair();
|
||||
expect(errorSpy).toHaveBeenCalledTimes(1);
|
||||
// First branch — server set, client unset.
|
||||
expect(errorSpy).toHaveBeenCalledWith(
|
||||
expect.stringContaining("ADMIN_TOKEN is set but NEXT_PUBLIC_ADMIN_TOKEN is not"),
|
||||
);
|
||||
});
|
||||
});
|
||||
@ -26,14 +26,6 @@ func TestExtended_WorkspaceDelete(t *testing.T) {
|
||||
WithArgs(wsDelID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "name"}))
|
||||
|
||||
// CascadeDelete walks descendants unconditionally (the 0-children
|
||||
// optimization in the old inline path was dropped during the
|
||||
// CascadeDelete extraction — descendant CTE returns 0 rows here,
|
||||
// same end state, one extra cheap query).
|
||||
mock.ExpectQuery("WITH RECURSIVE descendants").
|
||||
WithArgs(wsDelID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}))
|
||||
|
||||
// #73: batch UPDATE happens BEFORE any container teardown.
|
||||
// Uses ANY($1::uuid[]) even with a single ID for consistency.
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
|
||||
@ -589,6 +589,12 @@ func (h *OrgHandler) Import(c *gin.Context) {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request body"})
|
||||
return
|
||||
}
|
||||
importStart := time.Now()
|
||||
emitOrgEvent(c.Request.Context(), "org.import.started", map[string]any{
|
||||
"name": body.Template.Name,
|
||||
"dir": body.Dir,
|
||||
"mode": body.Mode,
|
||||
})
|
||||
|
||||
var tmpl OrgTemplate
|
||||
var orgBaseDir string // base directory for files_dir resolution
|
||||
@ -629,19 +635,6 @@ func (h *OrgHandler) Import(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// Emit started AFTER the YAML is loaded so payload.name carries the
|
||||
// resolved template name (was: empty when caller passed `dir` instead
|
||||
// of inline `template`). Pre-parse error paths above return without
|
||||
// emitting — semantically "we couldn't even start an import" — so
|
||||
// every started event is guaranteed a paired completed/failed below
|
||||
// (no orphan started rows in structure_events).
|
||||
importStart := time.Now()
|
||||
emitOrgEvent(c.Request.Context(), "org.import.started", map[string]any{
|
||||
"name": tmpl.Name,
|
||||
"dir": body.Dir,
|
||||
"mode": body.Mode,
|
||||
})
|
||||
|
||||
// Required-env preflight — refuses import when any required_env is
|
||||
// missing from global_secrets. No bypass: the prior `force: true`
|
||||
// escape hatch was removed (issue #2290) because it was the silent
|
||||
@ -794,14 +787,14 @@ func (h *OrgHandler) Import(c *gin.Context) {
|
||||
rows.Close()
|
||||
|
||||
for _, oid := range orphanIDs {
|
||||
descendantIDs, stopErrs, err := h.workspace.CascadeDelete(ctx, oid)
|
||||
cascadeCount, stopErrs, err := h.workspace.CascadeDelete(ctx, oid)
|
||||
if err != nil {
|
||||
log.Printf("Org import reconcile: CascadeDelete(%s) failed: %v", oid, err)
|
||||
reconcileErrs = append(reconcileErrs, fmt.Sprintf("delete %s: %v", oid, err))
|
||||
reconcileSkipped++
|
||||
continue
|
||||
}
|
||||
reconcileRemovedCount += 1 + len(descendantIDs)
|
||||
reconcileRemovedCount += 1 + cascadeCount
|
||||
if len(stopErrs) > 0 {
|
||||
log.Printf("Org import reconcile: %s had %d stop errors (orphan sweeper will retry)", oid, len(stopErrs))
|
||||
}
|
||||
|
||||
@ -323,19 +323,161 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// Delegate the cascade to CascadeDelete so the HTTP path and the
|
||||
// OrgImport reconcile path share one teardown sequence (#73 race
|
||||
// guard, container stop, volume removal, token revocation, schedule
|
||||
// disable, broadcast). The HTTP-specific bits — direct-children 409
|
||||
// gate above, ?purge=true hard-delete below, response shaping —
|
||||
// stay in this handler.
|
||||
descendantIDs, stopErrs, err := h.CascadeDelete(ctx, id)
|
||||
if err != nil {
|
||||
log.Printf("Delete: CascadeDelete(%s) failed: %v", id, err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
// Cascade delete: collect ALL descendants (not just direct children) via
|
||||
// recursive CTE, then stop each container and remove each volume.
|
||||
// Previous bug: only direct children's containers were stopped, leaving
|
||||
// grandchildren as orphan running containers after a cascade delete.
|
||||
descendantIDs := []string{}
|
||||
if len(children) > 0 {
|
||||
descRows, err := db.DB.QueryContext(ctx, `
|
||||
WITH RECURSIVE descendants AS (
|
||||
SELECT id FROM workspaces WHERE parent_id = $1 AND status != 'removed'
|
||||
UNION ALL
|
||||
SELECT w.id FROM workspaces w JOIN descendants d ON w.parent_id = d.id WHERE w.status != 'removed'
|
||||
)
|
||||
SELECT id FROM descendants
|
||||
`, id)
|
||||
if err != nil {
|
||||
log.Printf("Delete: descendant query error for %s: %v", id, err)
|
||||
} else {
|
||||
for descRows.Next() {
|
||||
var descID string
|
||||
if descRows.Scan(&descID) == nil {
|
||||
descendantIDs = append(descendantIDs, descID)
|
||||
}
|
||||
}
|
||||
descRows.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// #73 fix: mark rows 'removed' in the DB FIRST, BEFORE stopping containers
|
||||
// or removing volumes. Previously the sequence was stop → update-status,
|
||||
// which left a gap where:
|
||||
// - the container's last pre-teardown heartbeat could resurrect the row
|
||||
// via the register-handler UPSERT (now also guarded in #73)
|
||||
// - the liveness monitor could observe 'online' status + expired Redis
|
||||
// TTL and trigger RestartByID, recreating a container we're trying
|
||||
// to destroy
|
||||
// Marking 'removed' first makes both of those paths no-op via their
|
||||
// existing `status NOT IN ('removed', ...)` guards.
|
||||
allIDs := append([]string{id}, descendantIDs...)
|
||||
if _, err := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status = $1, updated_at = now() WHERE id = ANY($2::uuid[])`,
|
||||
models.StatusRemoved, pq.Array(allIDs)); err != nil {
|
||||
log.Printf("Delete status update error for %s: %v", id, err)
|
||||
}
|
||||
if _, err := db.DB.ExecContext(ctx,
|
||||
`DELETE FROM canvas_layouts WHERE workspace_id = ANY($1::uuid[])`,
|
||||
pq.Array(allIDs)); err != nil {
|
||||
log.Printf("Delete canvas_layouts error for %s: %v", id, err)
|
||||
}
|
||||
// Revoke all auth tokens for the deleted workspaces. Once the workspace is
|
||||
// gone its tokens are meaningless; leaving them alive would keep
|
||||
// HasAnyLiveTokenGlobal = true even after the platform is otherwise empty,
|
||||
// which prevents AdminAuth from returning to fail-open and breaks the E2E
|
||||
// test's count-zero assertion (and local re-run cleanup).
|
||||
if _, err := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspace_auth_tokens SET revoked_at = now()
|
||||
WHERE workspace_id = ANY($1::uuid[]) AND revoked_at IS NULL`,
|
||||
pq.Array(allIDs)); err != nil {
|
||||
log.Printf("Delete token revocation error for %s: %v", id, err)
|
||||
}
|
||||
// #1027: cascade-disable all schedules for the deleted workspaces so
|
||||
// the scheduler never fires a cron into a removed container.
|
||||
if _, err := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspace_schedules SET enabled = false, updated_at = now()
|
||||
WHERE workspace_id = ANY($1::uuid[]) AND enabled = true`,
|
||||
pq.Array(allIDs)); err != nil {
|
||||
log.Printf("Delete schedule disable error for %s: %v", id, err)
|
||||
}
|
||||
|
||||
// Now stop containers + remove volumes for all descendants (any depth).
|
||||
// Any concurrent heartbeat / registration / liveness-triggered restart
|
||||
// will see status='removed' and bail out early.
|
||||
//
|
||||
// Combines two concerns:
|
||||
//
|
||||
// 1. Detach cleanup from the request ctx via WithoutCancel + a 30s
|
||||
// timeout, so when the canvas's `api.del` resolves on our 200
|
||||
// (and gin cancels c.Request.Context()), in-flight Docker
|
||||
// stop/remove calls don't get cancelled mid-operation. The
|
||||
// previous shape leaked containers every time the canvas hung
|
||||
// up promptly: Stop returned "context canceled", the container
|
||||
// stayed up, and the next RemoveVolume failed with
|
||||
// "volume in use". 30s is generous for Docker daemon round-
|
||||
// trips (typical: <2s) and bounds a stuck daemon.
|
||||
//
|
||||
// 2. #1843: aggregate Stop() failures into stopErrs so the
|
||||
// post-deletion block surfaces them as 500. On the CP/EC2
|
||||
// backend, Stop() calls control plane's DELETE endpoint to
|
||||
// terminate the EC2; if that errors (transient 5xx, network),
|
||||
// the EC2 stays running with no DB row to track it (the
|
||||
// "orphan EC2 on a 0-customer account" scenario). Loud-fail
|
||||
// instead of silent-leak — clients retry, Stop's instance_id
|
||||
// lookup is idempotent against status='removed'. RemoveVolume
|
||||
// errors stay log-and-continue (local cleanup, not infra-leak).
|
||||
cleanupCtx, cleanupCancel := context.WithTimeout(
|
||||
context.WithoutCancel(ctx), 30*time.Second)
|
||||
defer cleanupCancel()
|
||||
|
||||
var stopErrs []error
|
||||
stopAndRemove := func(wsID string) {
|
||||
// Stop the workload first via the backend dispatcher (CP for
|
||||
// SaaS, Docker for self-hosted). Pre-2026-05-05 this gate was
|
||||
// `if h.provisioner == nil { return }` — early-returning on
|
||||
// every SaaS tenant left the EC2 running with no DB row to
|
||||
// track it (issue #2814; the comment below claimed "loud-fail
|
||||
// instead of silent-leak" but the early-return made it the
|
||||
// silent path on SaaS).
|
||||
//
|
||||
// Check Stop's error before any volume cleanup — the previous
|
||||
// code discarded it and immediately tried RemoveVolume, which
|
||||
// always fails with "volume in use" when Stop didn't actually
|
||||
// kill the container. The orphan sweeper
|
||||
// (registry/orphan_sweeper.go) catches what we skip here on
|
||||
// the next reconcile pass.
|
||||
if err := h.StopWorkspaceAuto(cleanupCtx, wsID); err != nil {
|
||||
log.Printf("Delete %s stop failed: %v — leaving cleanup for orphan sweeper", wsID, err)
|
||||
stopErrs = append(stopErrs, fmt.Errorf("stop %s: %w", wsID, err))
|
||||
return
|
||||
}
|
||||
// Volume cleanup is Docker-only — CP-managed workspaces have
|
||||
// no host-bind volumes to remove. Skip silently when no Docker
|
||||
// provisioner is wired (the SaaS path already terminated the
|
||||
// EC2 above; nothing left to do).
|
||||
if h.provisioner != nil {
|
||||
if err := h.provisioner.RemoveVolume(cleanupCtx, wsID); err != nil {
|
||||
log.Printf("Delete %s volume removal warning: %v", wsID, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, descID := range descendantIDs {
|
||||
stopAndRemove(descID)
|
||||
db.ClearWorkspaceKeys(cleanupCtx, descID)
|
||||
// #2269: drop the per-workspace restartState entry so it
|
||||
// doesn't accumulate across the platform's lifetime. The
|
||||
// LoadOrStore that creates the entry (workspace_restart.go)
|
||||
// has no companion remove path; without this Delete, every
|
||||
// short-lived workspace leaks ~16 bytes forever.
|
||||
restartStates.Delete(descID)
|
||||
// Detach broadcaster ctx for the same reason as the cleanup
|
||||
// above — RecordAndBroadcast does an INSERT INTO
|
||||
// structure_events + Redis Publish. If the canvas hangs up,
|
||||
// a request-ctx-bound INSERT can be cancelled mid-write,
|
||||
// leaving other WS clients ignorant of the cascade. The DB
|
||||
// row is already 'removed' so it's recoverable, but the
|
||||
// inconsistency is avoidable.
|
||||
h.broadcaster.RecordAndBroadcast(cleanupCtx, string(events.EventWorkspaceRemoved), descID, map[string]interface{}{})
|
||||
}
|
||||
|
||||
stopAndRemove(id)
|
||||
db.ClearWorkspaceKeys(cleanupCtx, id)
|
||||
restartStates.Delete(id) // #2269: same as descendants above
|
||||
|
||||
h.broadcaster.RecordAndBroadcast(cleanupCtx, string(events.EventWorkspaceRemoved), id, map[string]interface{}{
|
||||
"cascade_deleted": len(descendantIDs),
|
||||
})
|
||||
|
||||
// If any Stop call failed, surface 500 so the client retries. The DB
|
||||
// row is already 'removed' (idempotent), and Stop's instance_id
|
||||
@ -407,17 +549,16 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
|
||||
// remove volumes, revoke tokens, disable schedules, broadcast events.
|
||||
//
|
||||
// Idempotent against already-removed rows (the descendant CTE and all UPDATE
|
||||
// guards skip status='removed'). Returns the descendant id list so the HTTP
|
||||
// caller can drive the optional `?purge=true` hard-delete path against the
|
||||
// same set the cascade just touched, plus any per-workspace stop errors so
|
||||
// callers can surface a retryable failure instead of a silent-leak.
|
||||
// guards skip status='removed'). Returns the number of cascaded descendants
|
||||
// (not including id itself) and any per-workspace stop errors so callers can
|
||||
// surface a retryable failure instead of a silent-leak.
|
||||
//
|
||||
// Caller is responsible for the children-confirmation gate (the HTTP handler
|
||||
// returns 409 when children exist + ?confirm=true is missing); this helper
|
||||
// always cascades.
|
||||
func (h *WorkspaceHandler) CascadeDelete(ctx context.Context, id string) ([]string, []error, error) {
|
||||
func (h *WorkspaceHandler) CascadeDelete(ctx context.Context, id string) (int, []error, error) {
|
||||
if err := validateWorkspaceID(id); err != nil {
|
||||
return nil, nil, err
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
descendantIDs := []string{}
|
||||
@ -430,7 +571,7 @@ func (h *WorkspaceHandler) CascadeDelete(ctx context.Context, id string) ([]stri
|
||||
SELECT id FROM descendants
|
||||
`, id)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("descendant query: %w", err)
|
||||
return 0, nil, fmt.Errorf("descendant query: %w", err)
|
||||
}
|
||||
for descRows.Next() {
|
||||
var descID string
|
||||
@ -496,7 +637,7 @@ func (h *WorkspaceHandler) CascadeDelete(ctx context.Context, id string) ([]stri
|
||||
"cascade_deleted": len(descendantIDs),
|
||||
})
|
||||
|
||||
return descendantIDs, stopErrs, nil
|
||||
return len(descendantIDs), stopErrs, nil
|
||||
}
|
||||
|
||||
// validateWorkspaceID returns an error when id is not a valid UUID.
|
||||
|
||||
@ -813,12 +813,6 @@ func TestWorkspaceDelete_DisablesSchedules(t *testing.T) {
|
||||
WithArgs(wsID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "name"}))
|
||||
|
||||
// CascadeDelete walks descendants unconditionally — 0-children case
|
||||
// returns 0 rows here.
|
||||
mock.ExpectQuery("WITH RECURSIVE descendants").
|
||||
WithArgs(wsID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}))
|
||||
|
||||
// Mark workspace as removed
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
@ -941,12 +935,6 @@ func TestWorkspaceDelete_ScheduleDisableOnlyTargetsDeletedWorkspace(t *testing.T
|
||||
WithArgs(wsA).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "name"}))
|
||||
|
||||
// CascadeDelete walks descendants unconditionally — 0-children case
|
||||
// returns 0 rows here.
|
||||
mock.ExpectQuery("WITH RECURSIVE descendants").
|
||||
WithArgs(wsA).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}))
|
||||
|
||||
// Mark only workspace A as removed
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
@ -46,8 +46,10 @@
|
||||
# 2. Fetch fresh token from platform API.
|
||||
# 3. If platform is unreachable, fall back to GITHUB_TOKEN / GH_TOKEN
|
||||
# env var (set at container start, valid for up to 60 min).
|
||||
# 4. If all fail, exit 1 so git falls through to the next credential
|
||||
# helper in the chain (if any).
|
||||
# 4. If all fail, fall back to a static PAT written by the infra operator
|
||||
# at ${CONFIGS_DIR}/.github-token (helps when platform
|
||||
# /github-installation-token returns 500 due to GitHub App misconfiguration).
|
||||
# Cache is NEVER written for static tokens — recovery is always fresh.
|
||||
#
|
||||
# # gh CLI integration
|
||||
#
|
||||
@ -222,6 +224,17 @@ _fetch_token() {
|
||||
return 0
|
||||
fi
|
||||
|
||||
# 4. Fall back to static token file (written by infra operator).
|
||||
static_token_file="${CONFIGS_DIR:-/configs}/.github-token"
|
||||
if [ -f "${static_token_file}" ]; then
|
||||
static_token=$(cat "${static_token_file}" | tr -d '[:space:]')
|
||||
if [ -n "${static_token}" ]; then
|
||||
echo "[molecule-git-token-helper] API unreachable, falling back to static token file" >&2
|
||||
echo "${static_token}"
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "[molecule-git-token-helper] all token sources exhausted" >&2
|
||||
return 1
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user