fix(ci): cut scheduler fan-out + stop all-required poll-gate squatting a slot #2094

Merged
devops-engineer merged 1 commits from fix/ci-scheduler-fanout into main 2026-06-01 16:05:25 +00:00
5 changed files with 326 additions and 311 deletions
@@ -11,21 +11,100 @@ def load_workflow(name: str) -> dict:
return yaml.safe_load(f)
def _all_required(workflow: dict) -> dict:
return workflow["jobs"]["all-required"]
def test_all_required_uses_dedicated_meta_runner_lane():
workflow = load_workflow("ci.yml")
all_required = workflow["jobs"]["all-required"]
all_required = _all_required(workflow)
# Stays on the dedicated `ci-meta` lane (the sentinel does no docker
# work, so it must NOT occupy the general docker-host pool).
assert all_required["runs-on"] == "ci-meta"
assert "needs" not in all_required
def test_all_required_reuses_path_filter_before_polling():
def test_all_required_is_needs_aggregator_not_a_polling_gate():
"""fix/ci-scheduler-fanout (2026-06-01): the sentinel was converted
from a status-polling loop (which squatted a ci-meta executor slot for
up to 40 min per PR) into a plain `needs:` aggregator that frees the
slot immediately. Pin the new shape so a regression to the poller is
caught.
"""
workflow = load_workflow("ci.yml")
all_required = workflow["jobs"]["all-required"]
all_required = _all_required(workflow)
rendered = str(all_required)
assert "--profile ci" in rendered
assert ".gitea/scripts/detect-changes.py" in rendered
assert "REQUIRE_PLATFORM" in rendered
assert "REQUIRE_CANVAS" in rendered
assert "REQUIRE_SCRIPTS" in rendered
# The job MUST aggregate via `needs:` (the slot-freeing design).
assert "needs" in all_required, "all-required must be a needs: aggregator"
# It MUST NOT reintroduce the polling loop / per-SHA status fetch that
# was the throughput sink.
assert "detect-changes.py" not in rendered, (
"all-required must not run the detect-changes poller path"
)
assert "commits/" not in rendered and "statuses" not in rendered, (
"all-required must not poll commit statuses (the slot-squat path)"
)
def test_all_required_does_not_use_if_always():
"""Plain `needs:` works on Gitea 1.22.6 / act_runner v0.6.1; `needs:` +
`if: always()` is BROKEN (feedback_gitea_needs_works_only_ifalways_broken)
and would let a non-success need pass the gate. The sentinel must use
plain `needs:` WITHOUT a job-level `if: always()`.
"""
workflow = load_workflow("ci.yml")
all_required = _all_required(workflow)
job_if = all_required.get("if")
assert not (isinstance(job_if, str) and "always()" in job_if), (
"all-required must not combine needs: with if: always()"
)
def test_all_required_needs_matches_ci_required_drift_f1_set():
"""The sentinel `needs:` list MUST equal ci-required-drift.py's
`ci_job_names()` set: every job MINUS the sentinel itself MINUS jobs
whose `if:` gates on github.event_name/github.ref (event-gated jobs
skip on PRs and a `needs:` on a skipped job would never let the
sentinel run). If they diverge, ci-required-drift F1 fires.
"""
workflow = load_workflow("ci.yml")
jobs = workflow["jobs"]
sentinel = "all-required"
expected = set()
for key, body in jobs.items():
if key == sentinel:
continue
gate = body.get("if") if isinstance(body, dict) else None
if isinstance(gate, str) and (
"github.event_name" in gate or "github.ref" in gate
):
# event-gated → legitimately skips on some triggers; excluded
# from both `needs:` and the F1 set.
continue
expected.add(key)
needs = jobs[sentinel].get("needs", [])
if isinstance(needs, str):
needs = [needs]
actual = set(needs)
assert actual == expected, (
f"all-required needs: {sorted(actual)} != ci_job_names() "
f"{sorted(expected)} — ci-required-drift F1 would fire"
)
def test_all_required_needs_reference_real_jobs():
"""F1b guard: every entry in `needs:` must name an existing job."""
workflow = load_workflow("ci.yml")
jobs = workflow["jobs"]
needs = jobs["all-required"].get("needs", [])
if isinstance(needs, str):
needs = [needs]
job_keys = set(jobs)
for dep in needs:
assert dep in job_keys, f"all-required needs unknown job {dep!r}"
+82 -120
View File
@@ -475,10 +475,10 @@ jobs:
#
# Emits `CI / all-required (<event>)` where <event> is the workflow trigger
# (e.g. `CI / all-required (pull_request)`, `CI / all-required (push)`).
# Branch protection MUST be updated to require the event-suffixed name —
# Branch protection requires the event-suffixed name —
# requiring `CI / all-required` (bare, no suffix) silently blocks all merges
# because Gitea treats absent status contexts as pending (not skipped), and
# no workflow emits the bare name. Fixed: BP now requires
# no workflow emits the bare name. BP requires
# `CI / all-required (pull_request)` per issue #1473.
#
# Closes the failure mode where status_check_contexts on molecule-core/main
@@ -487,129 +487,91 @@ jobs:
# red silently merged through. See internal#286 for the three concrete
# tonight-of-2026-05-11 incidents that prompted the emergency bump.
#
# This job deliberately has no `needs:`. Gitea 1.22/act_runner can mark a
# job-level `if: always()` + `needs:` sentinel as skipped before upstream
# jobs settle, leaving branch protection with a permanent pending
# `CI / all-required` context. Instead, this independent sentinel polls the
# required commit-status contexts for this SHA and fails if any fail, skip,
# or never emit. It runs the same path detector as `changes` and only waits
# for path-relevant jobs; Gitea can otherwise leave needs/output-skipped
# jobs permanently pending with "Blocked by required conditions". It runs on
# the dedicated `ci-meta` lane so the poller does not occupy the same
# general runner pool as the jobs it is waiting for.
# ── 2026-06-01 CI-scheduler-overload fix (fix/ci-scheduler-fanout) ──
# PREVIOUS shape: a poll-gate that ran detect-changes then LOOPED on
# `GET /commits/{sha}/statuses` every 15s for up to 40 min, occupying a
# `ci-meta` executor slot the entire time it waited for upstream jobs.
# With only 2 ci-meta runners, that poll-loop squatted half the lane on
# every PR — a confirmed throughput sink in the live RCA (two concurrent
# `JOB-all-required` containers observed pinning the lane). The polling
# design existed only to dodge the Gitea `needs:` + `if: always()` bug,
# where an always()-guarded sentinel could be marked skipped before
# upstream jobs settled (leaving BP pending forever).
#
# canvas-deploy-reminder is intentionally NOT included in all-required.needs.
# It is an informational main-push reminder, not a PR quality gate. Keeping
# it in this dependency list lets a skipped reminder skip the required
# sentinel before the `always()` guard can emit a branch-protection status.
# NEW shape: a plain `needs:` aggregator with NO polling loop. This is
# safe here — and was NOT safe at the time the poller was written —
# because every aggregated CI job now gates its real work PER-STEP
# (`if: needs.changes.outputs.* != 'true'`) rather than at the JOB level.
# A per-step-gated job always reaches a terminal SUCCESS (it no-ops its
# expensive steps but the job itself still completes), so it is never
# `skipped`. Plain `needs:` (WITHOUT `if: always()`) works correctly on
# Gitea 1.22.6 / act_runner v0.6.1 — only `needs:` + `if: always()` is
# broken (feedback_gitea_needs_works_only_ifalways_broken). We therefore
# use plain `needs:` + an explicit per-need result check (NOT
# `if: always()`); if any need fails/errors, Gitea never starts this job
# and BP sees `CI / all-required` go red via the failed dependency
# propagation — exactly the gate we want, with zero runner-squat.
#
# The `needs:` list MUST stay in lockstep with ci-required-drift.py's
# F1 check (`ci_job_names()` = every job MINUS the sentinel MINUS jobs
# whose `if:` gates on github.event_name/github.ref). canvas-deploy-
# reminder is event-gated (`if: github.ref == refs/heads/{main,staging}`)
# so it is intentionally EXCLUDED — it skips on PRs and a `needs:` on a
# skipped job would never let the sentinel run. If a new always-running
# CI job is added, add it here too or ci-required-drift F1 will flag it.
#
# Stays on the dedicated `ci-meta` lane (no docker work, so the
# docker-host-pin lint does not apply), but now the job is sub-second:
# it only inspects already-settled `needs.*.result` values, so it frees
# the slot immediately instead of holding it for the whole CI duration.
#
needs:
- changes
- platform-build
- canvas-build
- shellcheck
- python-lint
continue-on-error: false
runs-on: ci-meta
timeout-minutes: 45
timeout-minutes: 5
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
- id: check
- name: Verify all aggregated CI jobs succeeded
# NO polling, NO API call, NO checkout. Because this job lists the
# aggregated jobs under `needs:` (without `if: always()`), Gitea only
# starts it once every need has reached SUCCESS — a failed/errored
# need short-circuits the job and propagates red to the
# `CI / all-required` context. This explicit check is a
# belt-and-suspenders assertion + a readable run summary; the real
# gating is the `needs:` edge itself.
env:
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
PR_BASE_REF: ${{ github.event.pull_request.base.ref }}
PUSH_BEFORE: ${{ github.event.before }}
run: |
python3 .gitea/scripts/detect-changes.py \
--profile ci \
--event-name "${{ github.event_name }}" \
--pr-base-sha "$PR_BASE_SHA" \
--base-ref "$PR_BASE_REF" \
--push-before "${GITHUB_EVENT_BEFORE:-$PUSH_BEFORE}"
- name: Wait for required CI contexts
env:
GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }}
API_ROOT: ${{ github.server_url }}/api/v1
REPOSITORY: ${{ github.repository }}
COMMIT_SHA: ${{ github.sha }}
EVENT_NAME: ${{ github.event_name }}
REQUIRE_PLATFORM: ${{ steps.check.outputs.platform }}
REQUIRE_CANVAS: ${{ steps.check.outputs.canvas }}
REQUIRE_SCRIPTS: ${{ steps.check.outputs.scripts }}
CHANGES_RESULT: ${{ needs.changes.result }}
PLATFORM_RESULT: ${{ needs.platform-build.result }}
CANVAS_RESULT: ${{ needs.canvas-build.result }}
SHELLCHECK_RESULT: ${{ needs.shellcheck.result }}
PYTHON_LINT_RESULT: ${{ needs.python-lint.result }}
run: |
set -euo pipefail
python3 - <<'PY'
import json
import os
import sys
import time
import urllib.error
import urllib.request
token = os.environ["GITEA_TOKEN"]
api_root = os.environ["API_ROOT"].rstrip("/")
repo = os.environ["REPOSITORY"]
sha = os.environ["COMMIT_SHA"]
event = os.environ["EVENT_NAME"]
required = [
f"CI / Detect changes ({event})",
f"CI / Python Lint & Test ({event})",
]
if os.environ.get("REQUIRE_PLATFORM") == "true":
required.append(f"CI / Platform (Go) ({event})")
if os.environ.get("REQUIRE_CANVAS") == "true":
required.append(f"CI / Canvas (Next.js) ({event})")
if os.environ.get("REQUIRE_SCRIPTS") == "true":
required.append(f"CI / Shellcheck (E2E scripts) ({event})")
terminal_bad = {"failure", "error"}
deadline = time.time() + 40 * 60
last_summary = None
def fetch_statuses():
statuses = []
for page in range(1, 6):
url = f"{api_root}/repos/{repo}/commits/{sha}/statuses?page={page}&limit=100"
req = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
with urllib.request.urlopen(req, timeout=10) as resp:
chunk = json.load(resp)
if not chunk:
break
statuses.extend(chunk)
latest = {}
for item in statuses:
ctx = item.get("context")
if not ctx:
continue
prev = latest.get(ctx)
if prev is None or (item.get("updated_at") or item.get("created_at") or "") >= (prev.get("updated_at") or prev.get("created_at") or ""):
latest[ctx] = item
return latest
while True:
try:
latest = fetch_statuses()
except (TimeoutError, OSError, urllib.error.URLError) as exc:
if time.time() >= deadline:
print(f"FAIL: status polling did not recover before deadline: {exc}", file=sys.stderr)
sys.exit(1)
print(f"WARN: status poll failed, retrying: {exc}", flush=True)
time.sleep(15)
continue
states = {ctx: (latest.get(ctx) or {}).get("status") or (latest.get(ctx) or {}).get("state") or "missing" for ctx in required}
summary = ", ".join(f"{ctx}={state}" for ctx, state in states.items())
if summary != last_summary:
print(summary, flush=True)
last_summary = summary
bad = {ctx: state for ctx, state in states.items() if state in terminal_bad}
if bad:
print("FAIL: required CI context failed:", file=sys.stderr)
for ctx, state in bad.items():
desc = (latest.get(ctx) or {}).get("description") or ""
print(f" - {ctx}: {state} {desc}", file=sys.stderr)
sys.exit(1)
if all(state == "success" for state in states.values()):
print(f"OK: all {len(required)} required CI contexts succeeded")
sys.exit(0)
if time.time() >= deadline:
print("FAIL: timed out waiting for required CI contexts:", file=sys.stderr)
for ctx, state in states.items():
print(f" - {ctx}: {state}", file=sys.stderr)
sys.exit(1)
time.sleep(15)
PY
fail=0
check() {
name="$1"; result="$2"
printf 'CI / %s = %s\n' "$name" "$result"
# `success` is the only green terminal state we accept. A plain
# `needs:` job is only started when all needs succeed, so reaching
# this step already implies success — but assert explicitly so a
# future `if: always()` reintroduction (which WOULD let non-success
# through) fails loudly instead of silently passing the gate.
if [ "$result" != "success" ]; then
echo "::error::aggregated CI job '${name}' did not succeed (result=${result})"
fail=1
fi
}
check "Detect changes" "$CHANGES_RESULT"
check "Platform (Go)" "$PLATFORM_RESULT"
check "Canvas (Next.js)" "$CANVAS_RESULT"
check "Shellcheck (E2E scripts)" "$SHELLCHECK_RESULT"
check "Python Lint & Test" "$PYTHON_LINT_RESULT"
if [ "$fail" -ne 0 ]; then
echo "::error::all-required: one or more aggregated CI jobs did not succeed"
exit 1
fi
echo "OK: all aggregated CI jobs succeeded — CI / all-required green."
@@ -25,6 +25,21 @@ name: Lint forbidden tenant-env keys
# feedback_path_filtered_workflow_cant_be_required). The scan itself
# targets workspace_secrets-writer paths via grep -r; it's fast
# (sub-second) so unconditional run is fine.
#
# ── 2026-06-01 CI-scheduler-fanout consolidation (fix/ci-scheduler-fanout) ──
# The RFC#523 sibling lint formerly in its own file
# `lint-no-tenant-gitea-token.yml` (the broader "no repo-host token into
# any tenant-writer surface" scan) is now a SECOND job in THIS workflow
# (`scan-tenant-token-write`). Both are sub-second Go-source greps that
# fired as two separate workflow runs on every PR — pure scheduler
# fan-out. Folding the sibling in here drops one workflow run + one
# checkout per PR while keeping BOTH scans firing unconditionally on
# every PR (the no-paths discipline above is preserved — neither job is
# paths-filtered). The moved job keeps its exact `name:` so its emitted
# status context is unchanged in substance; its `# bp-exempt:` directive
# moves with it (Tier 2g). The old `Lint no tenant GITEA or GITHUB token
# write / …` context is retired (a disappearing context needs no
# directive; only NEW emitters do).
on:
pull_request:
@@ -166,3 +181,126 @@ jobs:
fi
echo "OK No forbidden operator-scope env key names hardcoded in writer paths."
# bp-exempt: advisory RFC#523 lint; PR review gate is review-driven, not BP-driven.
# (Carried with the workflow-name rename in PR mc#1593 so the renamed
# context emission satisfies lint_required_context_exists_in_bp Tier 2g.)
scan-tenant-token-write:
name: Scan for repo-host token write into tenant workspace surface
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 1
- name: Find Go files referencing a tenant-writer surface AND a repo-host token
run: |
set -euo pipefail
# Repo-host token NAMES — the threat-model subset. Operator-fleet
# tokens (CP_ADMIN_API_TOKEN, RAILWAY_TOKEN, INFISICAL_*) are
# caught by lint-forbidden-env-keys.yml's broader deny set; this
# lint focuses on the git-host class so a single co-occurrence
# match has a low false-positive rate.
FORBIDDEN_KEYS=(
"GITEA_TOKEN"
"GITEA_PAT"
"GITHUB_TOKEN"
"GITHUB_PAT"
"GH_TOKEN"
)
# Tenant-writer surface markers. A file matches the surface set
# if it references ANY of these strings. This is the "is this
# code path writing into a tenant workspace?" heuristic.
# Curated to catch the actual code shapes used in this repo
# (verified by grep against current main 2026-05-19):
# - "workspace_secrets" / "global_secrets" → DB table writes
# - "seedAllowList" → CP-side seed table
# - "/settings/secrets" → tenant HTTP API write
# - "envVars[" → in-memory env map write
# - "containerEnv" → docker-run env-set
# - "userData" → EC2 user-data script
# - "provisionPayload" / "provisionContext" → provision-request shape
SURFACE_PATTERN='workspace_secrets|global_secrets|seedAllowList|/settings/secrets|envVars\[|containerEnv|userData|provisionPayload|provisionContext'
# Files that legitimately reference these names AND a surface
# marker, but do so for guard / strip / test / doc-comment
# reasons. New entries require reviewer signoff and a one-line
# justification in the diff.
EXEMPT_FILES=(
# RFC#523 L1 deny-set source-of-truth + tests
"workspace-server/internal/handlers/workspace_provision_forbidden_env.go"
"workspace-server/internal/handlers/workspace_provision_forbidden_env_test.go"
# Forensic-#145 silent-strip denylist (defense-in-depth, by design lists the names)
"workspace-server/internal/provisioner/provisioner.go"
"workspace-server/internal/provisioner/provisioner_test.go"
# Pre-RFC#523 persona-fallback / org-helper paths. The L1
# fail-closed runs BEFORE these writers; downstream silent-strip
# also covers them. See applyAgentGitHTTPCreds doc-comment.
"workspace-server/internal/handlers/agent_git_identity.go"
"workspace-server/internal/handlers/org_helpers.go"
"workspace-server/internal/handlers/org.go"
# CP→platform admin auth (NOT a tenant env write).
"workspace-server/internal/provisioner/cp_provisioner.go"
)
# Build an extended-regex alternation of forbidden keys.
KEY_ALT="$(IFS='|'; echo "${FORBIDDEN_KEYS[*]}")"
# Find candidate files: Go non-test sources that contain a
# tenant-writer surface marker.
mapfile -t CANDIDATES < <(
grep -rlE --include='*.go' --exclude='*_test.go' \
"${SURFACE_PATTERN}" . 2>/dev/null \
| sed 's|^\./||' \
| sort -u
)
if [ "${#CANDIDATES[@]}" -eq 0 ]; then
echo "OK No tenant-writer-surface files found in tree (unexpected, but not a lint failure)."
exit 0
fi
HITS=""
for f in "${CANDIDATES[@]}"; do
# Skip exempt files.
skip=0
for ex in "${EXEMPT_FILES[@]}"; do
if [ "$f" = "$ex" ]; then skip=1; break; fi
done
[ "$skip" = "1" ] && continue
# File contains a surface marker; now grep for a forbidden
# key NAME. We require a QUOTED-literal match to avoid
# firing on a comment like "// also handle GITEA_TOKEN".
#
# The literal form catches:
# - os.Getenv("GITEA_TOKEN")
# - envVars["GITEA_TOKEN"] = ...
# - {envKey: "GITEA_TOKEN", tenantKey: "GITEA_TOKEN"}
# but not:
# - // see GITEA_TOKEN below (no quotes)
found=$(grep -nE "\"(${KEY_ALT})\"" "$f" 2>/dev/null || true)
if [ -n "$found" ]; then
HITS="${HITS}--- ${f} ---\n${found}\n"
fi
done
if [ -n "$HITS" ]; then
echo "::error::Task #146 lint: repo-host token name(s) quoted in a tenant-writer-surface file:"
printf "$HITS"
echo ""
echo "These files reference a tenant-writer surface (workspace_secrets,"
echo "seedAllowList, /settings/secrets, containerEnv, userData, etc.)"
echo "AND quote a repo-host token name (GITEA_TOKEN/GITHUB_TOKEN/…)."
echo "Per RFC#523 threat model, tenant workspaces MUST NOT receive"
echo "operator-scope repo-host tokens. If your code legitimately needs"
echo "to reference one of these names in a tenant-writer file (e.g."
echo "a deny-set definition or silent-strip list), add the file to"
echo "EXEMPT_FILES with a one-line justification — reviewer signoff"
echo "required."
exit 1
fi
echo "OK No tenant-writer-surface file co-mentions a repo-host token literal."
@@ -1,182 +0,0 @@
name: Lint no tenant GITEA or GITHUB token write
# Task #146 — CI guardrail companion to RFC#523's `lint-forbidden-env-keys.yml`.
#
# `lint-forbidden-env-keys.yml` (Layer 3) catches code that hardcodes a
# forbidden env-var key NAME as a quoted literal in workspace_secrets
# writer paths under workspace-server/internal/.
#
# This workflow catches a BROADER class: any code path that reads a
# repo-host token (GITEA_TOKEN / GITHUB_TOKEN / GH_TOKEN) and then writes
# it into a TENANT WORKSPACE's env, secret store, user-data, or
# provision payload. This is the actual RFC#523 threat-model statement —
# the goal is "no tenant workspace ever receives an operator-scope repo
# token," not just "no _quoted_ literal `GITEA_TOKEN`." A future writer
# could route the value via a variable, a struct field, or a config key
# and slip past the existing literal scan; this lint catches those
# routing patterns at PR review time.
#
# Scope
# Scans the WHOLE repo's Go sources (not just workspace-server/) for
# co-occurrences of:
# - a repo-host token NAME (GITEA_TOKEN / GITHUB_TOKEN / GH_TOKEN /
# GITEA_PAT / GITHUB_PAT) used as os.Getenv argument or string
# literal
# - within a file that ALSO references a tenant-writer surface
# (`tenant`, `workspace_secrets`, `global_secrets`, `seedAllowList`,
# `/settings/secrets`, `userData`, `provisionPayload`,
# `envVars[`, `containerEnv`).
#
# Co-occurrence (not single-line) is the false-positive control: a
# file that just LOGS the variable name (e.g. "missing GITEA_TOKEN")
# without touching any tenant surface won't fire.
#
# Drift contract with lint-forbidden-env-keys.yml
# Both lints share the same FORBIDDEN_KEYS list (a subset — only the
# repo-host tokens, since this lint's threat model is "tenant gets
# write access to operator's git host"). If RFC#523's deny set grows,
# update BOTH this file AND lint-forbidden-env-keys.yml AND the Go
# source-of-truth in
# workspace-server/internal/handlers/workspace_provision_forbidden_env.go.
#
# Open-source-template-friendly
# The patterns scanned are generic (no MOLECULE_-prefix literals).
# A fork can copy this workflow as-is and adjust FORBIDDEN_KEYS.
#
# Path-filter discipline
# No `paths:` filter — required-status workflows must run on every PR
# per `feedback_path_filtered_workflow_cant_be_required`. Scan is
# sub-second.
on:
pull_request:
types: [opened, synchronize, reopened]
push:
branches: [main, staging]
env:
GITHUB_SERVER_URL: https://git.moleculesai.app
jobs:
# bp-exempt: advisory RFC#523 lint; PR review gate is review-driven, not BP-driven.
# (Carried with the workflow-name rename in PR mc#1593 so the renamed
# context emission satisfies lint_required_context_exists_in_bp Tier 2g.)
scan:
name: Scan for repo-host token write into tenant workspace surface
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 1
- name: Find Go files referencing a tenant-writer surface AND a repo-host token
run: |
set -euo pipefail
# Repo-host token NAMES — the threat-model subset. Operator-fleet
# tokens (CP_ADMIN_API_TOKEN, RAILWAY_TOKEN, INFISICAL_*) are
# caught by lint-forbidden-env-keys.yml's broader deny set; this
# lint focuses on the git-host class so a single co-occurrence
# match has a low false-positive rate.
FORBIDDEN_KEYS=(
"GITEA_TOKEN"
"GITEA_PAT"
"GITHUB_TOKEN"
"GITHUB_PAT"
"GH_TOKEN"
)
# Tenant-writer surface markers. A file matches the surface set
# if it references ANY of these strings. This is the "is this
# code path writing into a tenant workspace?" heuristic.
# Curated to catch the actual code shapes used in this repo
# (verified by grep against current main 2026-05-19):
# - "workspace_secrets" / "global_secrets" → DB table writes
# - "seedAllowList" → CP-side seed table
# - "/settings/secrets" → tenant HTTP API write
# - "envVars[" → in-memory env map write
# - "containerEnv" → docker-run env-set
# - "userData" → EC2 user-data script
# - "provisionPayload" / "provisionContext" → provision-request shape
SURFACE_PATTERN='workspace_secrets|global_secrets|seedAllowList|/settings/secrets|envVars\[|containerEnv|userData|provisionPayload|provisionContext'
# Files that legitimately reference these names AND a surface
# marker, but do so for guard / strip / test / doc-comment
# reasons. New entries require reviewer signoff and a one-line
# justification in the diff.
EXEMPT_FILES=(
# RFC#523 L1 deny-set source-of-truth + tests
"workspace-server/internal/handlers/workspace_provision_forbidden_env.go"
"workspace-server/internal/handlers/workspace_provision_forbidden_env_test.go"
# Forensic-#145 silent-strip denylist (defense-in-depth, by design lists the names)
"workspace-server/internal/provisioner/provisioner.go"
"workspace-server/internal/provisioner/provisioner_test.go"
# Pre-RFC#523 persona-fallback / org-helper paths. The L1
# fail-closed runs BEFORE these writers; downstream silent-strip
# also covers them. See applyAgentGitHTTPCreds doc-comment.
"workspace-server/internal/handlers/agent_git_identity.go"
"workspace-server/internal/handlers/org_helpers.go"
"workspace-server/internal/handlers/org.go"
# CP→platform admin auth (NOT a tenant env write).
"workspace-server/internal/provisioner/cp_provisioner.go"
)
# Build an extended-regex alternation of forbidden keys.
KEY_ALT="$(IFS='|'; echo "${FORBIDDEN_KEYS[*]}")"
# Find candidate files: Go non-test sources that contain a
# tenant-writer surface marker.
mapfile -t CANDIDATES < <(
grep -rlE --include='*.go' --exclude='*_test.go' \
"${SURFACE_PATTERN}" . 2>/dev/null \
| sed 's|^\./||' \
| sort -u
)
if [ "${#CANDIDATES[@]}" -eq 0 ]; then
echo "OK No tenant-writer-surface files found in tree (unexpected, but not a lint failure)."
exit 0
fi
HITS=""
for f in "${CANDIDATES[@]}"; do
# Skip exempt files.
skip=0
for ex in "${EXEMPT_FILES[@]}"; do
if [ "$f" = "$ex" ]; then skip=1; break; fi
done
[ "$skip" = "1" ] && continue
# File contains a surface marker; now grep for a forbidden
# key NAME. We require a QUOTED-literal match to avoid
# firing on a comment like "// also handle GITEA_TOKEN".
#
# The literal form catches:
# - os.Getenv("GITEA_TOKEN")
# - envVars["GITEA_TOKEN"] = ...
# - {envKey: "GITEA_TOKEN", tenantKey: "GITEA_TOKEN"}
# but not:
# - // see GITEA_TOKEN below (no quotes)
found=$(grep -nE "\"(${KEY_ALT})\"" "$f" 2>/dev/null || true)
if [ -n "$found" ]; then
HITS="${HITS}--- ${f} ---\n${found}\n"
fi
done
if [ -n "$HITS" ]; then
echo "::error::Task #146 lint: repo-host token name(s) quoted in a tenant-writer-surface file:"
printf "$HITS"
echo ""
echo "These files reference a tenant-writer surface (workspace_secrets,"
echo "seedAllowList, /settings/secrets, containerEnv, userData, etc.)"
echo "AND quote a repo-host token name (GITEA_TOKEN/GITHUB_TOKEN/…)."
echo "Per RFC#523 threat model, tenant workspaces MUST NOT receive"
echo "operator-scope repo-host tokens. If your code legitimately needs"
echo "to reference one of these names in a tenant-writer file (e.g."
echo "a deny-set definition or silent-strip list), add the file to"
echo "EXEMPT_FILES with a one-line justification — reviewer signoff"
echo "required."
exit 1
fi
echo "OK No tenant-writer-surface file co-mentions a repo-host token literal."
+18
View File
@@ -35,8 +35,26 @@ name: verify-providers-gen
on:
pull_request:
types: [opened, synchronize, reopened]
# CI-scheduler-overload fix (fix/ci-scheduler-fanout, 2026-06-01):
# this gate only verifies that the generated providers artifact is in
# sync with the schema SSOT. Its verdict can ONLY change when one of
# the codegen inputs/outputs changes, so firing the Go toolchain on
# every unrelated PR (docs, canvas, scripts) is pure fan-out cost.
# Scoped to the codegen surface. SAFE because this workflow is NOT a
# branch-protection status_check_context (see header §ENFORCEMENT
# GATING) — lint-required-no-paths only forbids paths filters on
# REQUIRED workflows; this is advisory, so a paths filter is allowed.
# Mirrors the sibling sync-providers-yaml.yml scoping convention.
paths:
- 'workspace-server/internal/providers/**'
- 'workspace-server/cmd/gen-providers/**'
- '.gitea/workflows/verify-providers-gen.yml'
push:
branches: [main, staging]
paths:
- 'workspace-server/internal/providers/**'
- 'workspace-server/cmd/gen-providers/**'
- '.gitea/workflows/verify-providers-gen.yml'
env:
GITHUB_SERVER_URL: https://git.moleculesai.app