From 6a1189ee9dde258f928ac1b8b30485b3f863a7b4 Mon Sep 17 00:00:00 2001 From: core-be Date: Mon, 1 Jun 2026 00:35:04 -0700 Subject: [PATCH] fix(ci): cut scheduler fan-out + stop all-required poll-gate squatting a slot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause (live RCA): the Gitea Actions run-scheduler is throughput- starved by workflow fan-out. A single PR-head commit triggers ~65 runs; the `all-required` sentinel was a status-POLLING loop that held a `ci-meta` executor slot (only 2 in the lane) for up to 40 min per PR; and several cheap meta-lints fired as separate runs on every commit. Two fixes, both branch-protection-preserving: 1. all-required: poll-gate → plain `needs:` aggregator (ci.yml). Was: detect-changes + a 40-min `GET /commits/{sha}/statuses` poll loop on the ci-meta lane (confirmed slot-squat in the RCA — two concurrent JOB-all-required containers pinning the 2-slot lane). Now: `needs: [changes, platform-build, canvas-build, shellcheck, python-lint]` + a sub-second inline result-check (no API, no poll, no checkout). Frees the slot immediately. Safe because every aggregated job now gates real work PER-STEP (`if: needs.changes.outputs.* != 'true'`), so it always reaches a terminal SUCCESS and is never `skipped`. Plain `needs:` (WITHOUT `if: always()`) works on Gitea 1.22.6 / act_runner v0.6.1 — only `needs:` + `if: always()` is broken (feedback_gitea_needs_works_only_ifalways_broken). canvas-deploy- reminder is event-gated (`if: github.ref...`) so it is intentionally excluded. The needs: set equals ci-required-drift.py's ci_job_names() so F1 stays clean (verified + now unit-pinned). The required context name `CI / all-required ()` is UNCHANGED. 2. Cut fan-out: - Consolidated lint-no-tenant-gitea-token.yml INTO lint-forbidden-env-keys.yml as a second job (scan-tenant-token- write). Two sub-second Go-source greps that fired as two separate workflow runs per PR → one run, one checkout. Both still fire on every PR (no paths filter; RFC#523 threat model preserved). The moved job keeps its exact `name:` + `# bp-exempt:` directive (Tier 2g); the old `Lint no tenant GITEA…` context is retired. - Added a `paths:` filter to verify-providers-gen.yml (Go toolchain, ~8min) scoped to the codegen surface. SAFE: it is NOT a branch- protection required context, so lint-required-no-paths permits it. Branch-protection required contexts are unchanged (CI / all-required, E2E API Smoke Test, Handlers Postgres Integration, sop-checklist / all-items-acked). No paths filter was added to any required emitter. Tests: updated test_ci_workflow_bookkeeping.py to pin the new needs: aggregator shape + the no-if:always() hazard + the F1-lockstep invariant (watched the old assertions fail, then pass on the new shape). Full .gitea/scripts/tests suite (192) + affected tests/ lints green. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../tests/test_ci_workflow_bookkeeping.py | 97 ++++++++- .gitea/workflows/ci.yml | 202 +++++++----------- .gitea/workflows/lint-forbidden-env-keys.yml | 138 ++++++++++++ .../workflows/lint-no-tenant-gitea-token.yml | 182 ---------------- .gitea/workflows/verify-providers-gen.yml | 18 ++ 5 files changed, 326 insertions(+), 311 deletions(-) delete mode 100644 .gitea/workflows/lint-no-tenant-gitea-token.yml diff --git a/.gitea/scripts/tests/test_ci_workflow_bookkeeping.py b/.gitea/scripts/tests/test_ci_workflow_bookkeeping.py index 418dea648..572e5d2db 100644 --- a/.gitea/scripts/tests/test_ci_workflow_bookkeeping.py +++ b/.gitea/scripts/tests/test_ci_workflow_bookkeeping.py @@ -11,21 +11,100 @@ def load_workflow(name: str) -> dict: return yaml.safe_load(f) +def _all_required(workflow: dict) -> dict: + return workflow["jobs"]["all-required"] + + def test_all_required_uses_dedicated_meta_runner_lane(): workflow = load_workflow("ci.yml") - all_required = workflow["jobs"]["all-required"] + all_required = _all_required(workflow) + # Stays on the dedicated `ci-meta` lane (the sentinel does no docker + # work, so it must NOT occupy the general docker-host pool). assert all_required["runs-on"] == "ci-meta" - assert "needs" not in all_required -def test_all_required_reuses_path_filter_before_polling(): +def test_all_required_is_needs_aggregator_not_a_polling_gate(): + """fix/ci-scheduler-fanout (2026-06-01): the sentinel was converted + from a status-polling loop (which squatted a ci-meta executor slot for + up to 40 min per PR) into a plain `needs:` aggregator that frees the + slot immediately. Pin the new shape so a regression to the poller is + caught. + """ workflow = load_workflow("ci.yml") - all_required = workflow["jobs"]["all-required"] + all_required = _all_required(workflow) rendered = str(all_required) - assert "--profile ci" in rendered - assert ".gitea/scripts/detect-changes.py" in rendered - assert "REQUIRE_PLATFORM" in rendered - assert "REQUIRE_CANVAS" in rendered - assert "REQUIRE_SCRIPTS" in rendered + # The job MUST aggregate via `needs:` (the slot-freeing design). + assert "needs" in all_required, "all-required must be a needs: aggregator" + + # It MUST NOT reintroduce the polling loop / per-SHA status fetch that + # was the throughput sink. + assert "detect-changes.py" not in rendered, ( + "all-required must not run the detect-changes poller path" + ) + assert "commits/" not in rendered and "statuses" not in rendered, ( + "all-required must not poll commit statuses (the slot-squat path)" + ) + + +def test_all_required_does_not_use_if_always(): + """Plain `needs:` works on Gitea 1.22.6 / act_runner v0.6.1; `needs:` + + `if: always()` is BROKEN (feedback_gitea_needs_works_only_ifalways_broken) + and would let a non-success need pass the gate. The sentinel must use + plain `needs:` WITHOUT a job-level `if: always()`. + """ + workflow = load_workflow("ci.yml") + all_required = _all_required(workflow) + + job_if = all_required.get("if") + assert not (isinstance(job_if, str) and "always()" in job_if), ( + "all-required must not combine needs: with if: always()" + ) + + +def test_all_required_needs_matches_ci_required_drift_f1_set(): + """The sentinel `needs:` list MUST equal ci-required-drift.py's + `ci_job_names()` set: every job MINUS the sentinel itself MINUS jobs + whose `if:` gates on github.event_name/github.ref (event-gated jobs + skip on PRs and a `needs:` on a skipped job would never let the + sentinel run). If they diverge, ci-required-drift F1 fires. + """ + workflow = load_workflow("ci.yml") + jobs = workflow["jobs"] + sentinel = "all-required" + + expected = set() + for key, body in jobs.items(): + if key == sentinel: + continue + gate = body.get("if") if isinstance(body, dict) else None + if isinstance(gate, str) and ( + "github.event_name" in gate or "github.ref" in gate + ): + # event-gated → legitimately skips on some triggers; excluded + # from both `needs:` and the F1 set. + continue + expected.add(key) + + needs = jobs[sentinel].get("needs", []) + if isinstance(needs, str): + needs = [needs] + actual = set(needs) + + assert actual == expected, ( + f"all-required needs: {sorted(actual)} != ci_job_names() " + f"{sorted(expected)} — ci-required-drift F1 would fire" + ) + + +def test_all_required_needs_reference_real_jobs(): + """F1b guard: every entry in `needs:` must name an existing job.""" + workflow = load_workflow("ci.yml") + jobs = workflow["jobs"] + needs = jobs["all-required"].get("needs", []) + if isinstance(needs, str): + needs = [needs] + job_keys = set(jobs) + for dep in needs: + assert dep in job_keys, f"all-required needs unknown job {dep!r}" diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 739183154..26a7db27d 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -475,10 +475,10 @@ jobs: # # Emits `CI / all-required ()` where is the workflow trigger # (e.g. `CI / all-required (pull_request)`, `CI / all-required (push)`). - # Branch protection MUST be updated to require the event-suffixed name — + # Branch protection requires the event-suffixed name — # requiring `CI / all-required` (bare, no suffix) silently blocks all merges # because Gitea treats absent status contexts as pending (not skipped), and - # no workflow emits the bare name. Fixed: BP now requires + # no workflow emits the bare name. BP requires # `CI / all-required (pull_request)` per issue #1473. # # Closes the failure mode where status_check_contexts on molecule-core/main @@ -487,129 +487,91 @@ jobs: # red silently merged through. See internal#286 for the three concrete # tonight-of-2026-05-11 incidents that prompted the emergency bump. # - # This job deliberately has no `needs:`. Gitea 1.22/act_runner can mark a - # job-level `if: always()` + `needs:` sentinel as skipped before upstream - # jobs settle, leaving branch protection with a permanent pending - # `CI / all-required` context. Instead, this independent sentinel polls the - # required commit-status contexts for this SHA and fails if any fail, skip, - # or never emit. It runs the same path detector as `changes` and only waits - # for path-relevant jobs; Gitea can otherwise leave needs/output-skipped - # jobs permanently pending with "Blocked by required conditions". It runs on - # the dedicated `ci-meta` lane so the poller does not occupy the same - # general runner pool as the jobs it is waiting for. + # ── 2026-06-01 CI-scheduler-overload fix (fix/ci-scheduler-fanout) ── + # PREVIOUS shape: a poll-gate that ran detect-changes then LOOPED on + # `GET /commits/{sha}/statuses` every 15s for up to 40 min, occupying a + # `ci-meta` executor slot the entire time it waited for upstream jobs. + # With only 2 ci-meta runners, that poll-loop squatted half the lane on + # every PR — a confirmed throughput sink in the live RCA (two concurrent + # `JOB-all-required` containers observed pinning the lane). The polling + # design existed only to dodge the Gitea `needs:` + `if: always()` bug, + # where an always()-guarded sentinel could be marked skipped before + # upstream jobs settled (leaving BP pending forever). # - # canvas-deploy-reminder is intentionally NOT included in all-required.needs. - # It is an informational main-push reminder, not a PR quality gate. Keeping - # it in this dependency list lets a skipped reminder skip the required - # sentinel before the `always()` guard can emit a branch-protection status. + # NEW shape: a plain `needs:` aggregator with NO polling loop. This is + # safe here — and was NOT safe at the time the poller was written — + # because every aggregated CI job now gates its real work PER-STEP + # (`if: needs.changes.outputs.* != 'true'`) rather than at the JOB level. + # A per-step-gated job always reaches a terminal SUCCESS (it no-ops its + # expensive steps but the job itself still completes), so it is never + # `skipped`. Plain `needs:` (WITHOUT `if: always()`) works correctly on + # Gitea 1.22.6 / act_runner v0.6.1 — only `needs:` + `if: always()` is + # broken (feedback_gitea_needs_works_only_ifalways_broken). We therefore + # use plain `needs:` + an explicit per-need result check (NOT + # `if: always()`); if any need fails/errors, Gitea never starts this job + # and BP sees `CI / all-required` go red via the failed dependency + # propagation — exactly the gate we want, with zero runner-squat. # + # The `needs:` list MUST stay in lockstep with ci-required-drift.py's + # F1 check (`ci_job_names()` = every job MINUS the sentinel MINUS jobs + # whose `if:` gates on github.event_name/github.ref). canvas-deploy- + # reminder is event-gated (`if: github.ref == refs/heads/{main,staging}`) + # so it is intentionally EXCLUDED — it skips on PRs and a `needs:` on a + # skipped job would never let the sentinel run. If a new always-running + # CI job is added, add it here too or ci-required-drift F1 will flag it. + # + # Stays on the dedicated `ci-meta` lane (no docker work, so the + # docker-host-pin lint does not apply), but now the job is sub-second: + # it only inspects already-settled `needs.*.result` values, so it frees + # the slot immediately instead of holding it for the whole CI duration. + # + needs: + - changes + - platform-build + - canvas-build + - shellcheck + - python-lint continue-on-error: false runs-on: ci-meta - timeout-minutes: 45 + timeout-minutes: 5 steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - fetch-depth: 0 - - id: check + - name: Verify all aggregated CI jobs succeeded + # NO polling, NO API call, NO checkout. Because this job lists the + # aggregated jobs under `needs:` (without `if: always()`), Gitea only + # starts it once every need has reached SUCCESS — a failed/errored + # need short-circuits the job and propagates red to the + # `CI / all-required` context. This explicit check is a + # belt-and-suspenders assertion + a readable run summary; the real + # gating is the `needs:` edge itself. env: - PR_BASE_SHA: ${{ github.event.pull_request.base.sha }} - PR_BASE_REF: ${{ github.event.pull_request.base.ref }} - PUSH_BEFORE: ${{ github.event.before }} - run: | - python3 .gitea/scripts/detect-changes.py \ - --profile ci \ - --event-name "${{ github.event_name }}" \ - --pr-base-sha "$PR_BASE_SHA" \ - --base-ref "$PR_BASE_REF" \ - --push-before "${GITHUB_EVENT_BEFORE:-$PUSH_BEFORE}" - - name: Wait for required CI contexts - env: - GITEA_TOKEN: ${{ secrets.GITHUB_TOKEN }} - API_ROOT: ${{ github.server_url }}/api/v1 - REPOSITORY: ${{ github.repository }} - COMMIT_SHA: ${{ github.sha }} - EVENT_NAME: ${{ github.event_name }} - REQUIRE_PLATFORM: ${{ steps.check.outputs.platform }} - REQUIRE_CANVAS: ${{ steps.check.outputs.canvas }} - REQUIRE_SCRIPTS: ${{ steps.check.outputs.scripts }} + CHANGES_RESULT: ${{ needs.changes.result }} + PLATFORM_RESULT: ${{ needs.platform-build.result }} + CANVAS_RESULT: ${{ needs.canvas-build.result }} + SHELLCHECK_RESULT: ${{ needs.shellcheck.result }} + PYTHON_LINT_RESULT: ${{ needs.python-lint.result }} run: | set -euo pipefail - python3 - <<'PY' - import json - import os - import sys - import time - import urllib.error - import urllib.request - - token = os.environ["GITEA_TOKEN"] - api_root = os.environ["API_ROOT"].rstrip("/") - repo = os.environ["REPOSITORY"] - sha = os.environ["COMMIT_SHA"] - event = os.environ["EVENT_NAME"] - required = [ - f"CI / Detect changes ({event})", - f"CI / Python Lint & Test ({event})", - ] - if os.environ.get("REQUIRE_PLATFORM") == "true": - required.append(f"CI / Platform (Go) ({event})") - if os.environ.get("REQUIRE_CANVAS") == "true": - required.append(f"CI / Canvas (Next.js) ({event})") - if os.environ.get("REQUIRE_SCRIPTS") == "true": - required.append(f"CI / Shellcheck (E2E scripts) ({event})") - terminal_bad = {"failure", "error"} - deadline = time.time() + 40 * 60 - last_summary = None - - def fetch_statuses(): - statuses = [] - for page in range(1, 6): - url = f"{api_root}/repos/{repo}/commits/{sha}/statuses?page={page}&limit=100" - req = urllib.request.Request(url, headers={"Authorization": f"token {token}"}) - with urllib.request.urlopen(req, timeout=10) as resp: - chunk = json.load(resp) - if not chunk: - break - statuses.extend(chunk) - latest = {} - for item in statuses: - ctx = item.get("context") - if not ctx: - continue - prev = latest.get(ctx) - if prev is None or (item.get("updated_at") or item.get("created_at") or "") >= (prev.get("updated_at") or prev.get("created_at") or ""): - latest[ctx] = item - return latest - - while True: - try: - latest = fetch_statuses() - except (TimeoutError, OSError, urllib.error.URLError) as exc: - if time.time() >= deadline: - print(f"FAIL: status polling did not recover before deadline: {exc}", file=sys.stderr) - sys.exit(1) - print(f"WARN: status poll failed, retrying: {exc}", flush=True) - time.sleep(15) - continue - states = {ctx: (latest.get(ctx) or {}).get("status") or (latest.get(ctx) or {}).get("state") or "missing" for ctx in required} - summary = ", ".join(f"{ctx}={state}" for ctx, state in states.items()) - if summary != last_summary: - print(summary, flush=True) - last_summary = summary - bad = {ctx: state for ctx, state in states.items() if state in terminal_bad} - if bad: - print("FAIL: required CI context failed:", file=sys.stderr) - for ctx, state in bad.items(): - desc = (latest.get(ctx) or {}).get("description") or "" - print(f" - {ctx}: {state} {desc}", file=sys.stderr) - sys.exit(1) - if all(state == "success" for state in states.values()): - print(f"OK: all {len(required)} required CI contexts succeeded") - sys.exit(0) - if time.time() >= deadline: - print("FAIL: timed out waiting for required CI contexts:", file=sys.stderr) - for ctx, state in states.items(): - print(f" - {ctx}: {state}", file=sys.stderr) - sys.exit(1) - time.sleep(15) - PY + fail=0 + check() { + name="$1"; result="$2" + printf 'CI / %s = %s\n' "$name" "$result" + # `success` is the only green terminal state we accept. A plain + # `needs:` job is only started when all needs succeed, so reaching + # this step already implies success — but assert explicitly so a + # future `if: always()` reintroduction (which WOULD let non-success + # through) fails loudly instead of silently passing the gate. + if [ "$result" != "success" ]; then + echo "::error::aggregated CI job '${name}' did not succeed (result=${result})" + fail=1 + fi + } + check "Detect changes" "$CHANGES_RESULT" + check "Platform (Go)" "$PLATFORM_RESULT" + check "Canvas (Next.js)" "$CANVAS_RESULT" + check "Shellcheck (E2E scripts)" "$SHELLCHECK_RESULT" + check "Python Lint & Test" "$PYTHON_LINT_RESULT" + if [ "$fail" -ne 0 ]; then + echo "::error::all-required: one or more aggregated CI jobs did not succeed" + exit 1 + fi + echo "OK: all aggregated CI jobs succeeded — CI / all-required green." diff --git a/.gitea/workflows/lint-forbidden-env-keys.yml b/.gitea/workflows/lint-forbidden-env-keys.yml index e70e449ab..d805c0678 100644 --- a/.gitea/workflows/lint-forbidden-env-keys.yml +++ b/.gitea/workflows/lint-forbidden-env-keys.yml @@ -25,6 +25,21 @@ name: Lint forbidden tenant-env keys # feedback_path_filtered_workflow_cant_be_required). The scan itself # targets workspace_secrets-writer paths via grep -r; it's fast # (sub-second) so unconditional run is fine. +# +# ── 2026-06-01 CI-scheduler-fanout consolidation (fix/ci-scheduler-fanout) ── +# The RFC#523 sibling lint formerly in its own file +# `lint-no-tenant-gitea-token.yml` (the broader "no repo-host token into +# any tenant-writer surface" scan) is now a SECOND job in THIS workflow +# (`scan-tenant-token-write`). Both are sub-second Go-source greps that +# fired as two separate workflow runs on every PR — pure scheduler +# fan-out. Folding the sibling in here drops one workflow run + one +# checkout per PR while keeping BOTH scans firing unconditionally on +# every PR (the no-paths discipline above is preserved — neither job is +# paths-filtered). The moved job keeps its exact `name:` so its emitted +# status context is unchanged in substance; its `# bp-exempt:` directive +# moves with it (Tier 2g). The old `Lint no tenant GITEA or GITHUB token +# write / …` context is retired (a disappearing context needs no +# directive; only NEW emitters do). on: pull_request: @@ -166,3 +181,126 @@ jobs: fi echo "OK No forbidden operator-scope env key names hardcoded in writer paths." + + # bp-exempt: advisory RFC#523 lint; PR review gate is review-driven, not BP-driven. + # (Carried with the workflow-name rename in PR mc#1593 so the renamed + # context emission satisfies lint_required_context_exists_in_bp Tier 2g.) + scan-tenant-token-write: + name: Scan for repo-host token write into tenant workspace surface + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 1 + + - name: Find Go files referencing a tenant-writer surface AND a repo-host token + run: | + set -euo pipefail + + # Repo-host token NAMES — the threat-model subset. Operator-fleet + # tokens (CP_ADMIN_API_TOKEN, RAILWAY_TOKEN, INFISICAL_*) are + # caught by lint-forbidden-env-keys.yml's broader deny set; this + # lint focuses on the git-host class so a single co-occurrence + # match has a low false-positive rate. + FORBIDDEN_KEYS=( + "GITEA_TOKEN" + "GITEA_PAT" + "GITHUB_TOKEN" + "GITHUB_PAT" + "GH_TOKEN" + ) + + # Tenant-writer surface markers. A file matches the surface set + # if it references ANY of these strings. This is the "is this + # code path writing into a tenant workspace?" heuristic. + # Curated to catch the actual code shapes used in this repo + # (verified by grep against current main 2026-05-19): + # - "workspace_secrets" / "global_secrets" → DB table writes + # - "seedAllowList" → CP-side seed table + # - "/settings/secrets" → tenant HTTP API write + # - "envVars[" → in-memory env map write + # - "containerEnv" → docker-run env-set + # - "userData" → EC2 user-data script + # - "provisionPayload" / "provisionContext" → provision-request shape + SURFACE_PATTERN='workspace_secrets|global_secrets|seedAllowList|/settings/secrets|envVars\[|containerEnv|userData|provisionPayload|provisionContext' + + # Files that legitimately reference these names AND a surface + # marker, but do so for guard / strip / test / doc-comment + # reasons. New entries require reviewer signoff and a one-line + # justification in the diff. + EXEMPT_FILES=( + # RFC#523 L1 deny-set source-of-truth + tests + "workspace-server/internal/handlers/workspace_provision_forbidden_env.go" + "workspace-server/internal/handlers/workspace_provision_forbidden_env_test.go" + # Forensic-#145 silent-strip denylist (defense-in-depth, by design lists the names) + "workspace-server/internal/provisioner/provisioner.go" + "workspace-server/internal/provisioner/provisioner_test.go" + # Pre-RFC#523 persona-fallback / org-helper paths. The L1 + # fail-closed runs BEFORE these writers; downstream silent-strip + # also covers them. See applyAgentGitHTTPCreds doc-comment. + "workspace-server/internal/handlers/agent_git_identity.go" + "workspace-server/internal/handlers/org_helpers.go" + "workspace-server/internal/handlers/org.go" + # CP→platform admin auth (NOT a tenant env write). + "workspace-server/internal/provisioner/cp_provisioner.go" + ) + + # Build an extended-regex alternation of forbidden keys. + KEY_ALT="$(IFS='|'; echo "${FORBIDDEN_KEYS[*]}")" + + # Find candidate files: Go non-test sources that contain a + # tenant-writer surface marker. + mapfile -t CANDIDATES < <( + grep -rlE --include='*.go' --exclude='*_test.go' \ + "${SURFACE_PATTERN}" . 2>/dev/null \ + | sed 's|^\./||' \ + | sort -u + ) + + if [ "${#CANDIDATES[@]}" -eq 0 ]; then + echo "OK No tenant-writer-surface files found in tree (unexpected, but not a lint failure)." + exit 0 + fi + + HITS="" + for f in "${CANDIDATES[@]}"; do + # Skip exempt files. + skip=0 + for ex in "${EXEMPT_FILES[@]}"; do + if [ "$f" = "$ex" ]; then skip=1; break; fi + done + [ "$skip" = "1" ] && continue + + # File contains a surface marker; now grep for a forbidden + # key NAME. We require a QUOTED-literal match to avoid + # firing on a comment like "// also handle GITEA_TOKEN". + # + # The literal form catches: + # - os.Getenv("GITEA_TOKEN") + # - envVars["GITEA_TOKEN"] = ... + # - {envKey: "GITEA_TOKEN", tenantKey: "GITEA_TOKEN"} + # but not: + # - // see GITEA_TOKEN below (no quotes) + found=$(grep -nE "\"(${KEY_ALT})\"" "$f" 2>/dev/null || true) + if [ -n "$found" ]; then + HITS="${HITS}--- ${f} ---\n${found}\n" + fi + done + + if [ -n "$HITS" ]; then + echo "::error::Task #146 lint: repo-host token name(s) quoted in a tenant-writer-surface file:" + printf "$HITS" + echo "" + echo "These files reference a tenant-writer surface (workspace_secrets," + echo "seedAllowList, /settings/secrets, containerEnv, userData, etc.)" + echo "AND quote a repo-host token name (GITEA_TOKEN/GITHUB_TOKEN/…)." + echo "Per RFC#523 threat model, tenant workspaces MUST NOT receive" + echo "operator-scope repo-host tokens. If your code legitimately needs" + echo "to reference one of these names in a tenant-writer file (e.g." + echo "a deny-set definition or silent-strip list), add the file to" + echo "EXEMPT_FILES with a one-line justification — reviewer signoff" + echo "required." + exit 1 + fi + + echo "OK No tenant-writer-surface file co-mentions a repo-host token literal." diff --git a/.gitea/workflows/lint-no-tenant-gitea-token.yml b/.gitea/workflows/lint-no-tenant-gitea-token.yml deleted file mode 100644 index 7a542ba53..000000000 --- a/.gitea/workflows/lint-no-tenant-gitea-token.yml +++ /dev/null @@ -1,182 +0,0 @@ -name: Lint no tenant GITEA or GITHUB token write - -# Task #146 — CI guardrail companion to RFC#523's `lint-forbidden-env-keys.yml`. -# -# `lint-forbidden-env-keys.yml` (Layer 3) catches code that hardcodes a -# forbidden env-var key NAME as a quoted literal in workspace_secrets -# writer paths under workspace-server/internal/. -# -# This workflow catches a BROADER class: any code path that reads a -# repo-host token (GITEA_TOKEN / GITHUB_TOKEN / GH_TOKEN) and then writes -# it into a TENANT WORKSPACE's env, secret store, user-data, or -# provision payload. This is the actual RFC#523 threat-model statement — -# the goal is "no tenant workspace ever receives an operator-scope repo -# token," not just "no _quoted_ literal `GITEA_TOKEN`." A future writer -# could route the value via a variable, a struct field, or a config key -# and slip past the existing literal scan; this lint catches those -# routing patterns at PR review time. -# -# Scope -# Scans the WHOLE repo's Go sources (not just workspace-server/) for -# co-occurrences of: -# - a repo-host token NAME (GITEA_TOKEN / GITHUB_TOKEN / GH_TOKEN / -# GITEA_PAT / GITHUB_PAT) used as os.Getenv argument or string -# literal -# - within a file that ALSO references a tenant-writer surface -# (`tenant`, `workspace_secrets`, `global_secrets`, `seedAllowList`, -# `/settings/secrets`, `userData`, `provisionPayload`, -# `envVars[`, `containerEnv`). -# -# Co-occurrence (not single-line) is the false-positive control: a -# file that just LOGS the variable name (e.g. "missing GITEA_TOKEN") -# without touching any tenant surface won't fire. -# -# Drift contract with lint-forbidden-env-keys.yml -# Both lints share the same FORBIDDEN_KEYS list (a subset — only the -# repo-host tokens, since this lint's threat model is "tenant gets -# write access to operator's git host"). If RFC#523's deny set grows, -# update BOTH this file AND lint-forbidden-env-keys.yml AND the Go -# source-of-truth in -# workspace-server/internal/handlers/workspace_provision_forbidden_env.go. -# -# Open-source-template-friendly -# The patterns scanned are generic (no MOLECULE_-prefix literals). -# A fork can copy this workflow as-is and adjust FORBIDDEN_KEYS. -# -# Path-filter discipline -# No `paths:` filter — required-status workflows must run on every PR -# per `feedback_path_filtered_workflow_cant_be_required`. Scan is -# sub-second. - -on: - pull_request: - types: [opened, synchronize, reopened] - push: - branches: [main, staging] - -env: - GITHUB_SERVER_URL: https://git.moleculesai.app - -jobs: - # bp-exempt: advisory RFC#523 lint; PR review gate is review-driven, not BP-driven. - # (Carried with the workflow-name rename in PR mc#1593 so the renamed - # context emission satisfies lint_required_context_exists_in_bp Tier 2g.) - scan: - name: Scan for repo-host token write into tenant workspace surface - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - with: - fetch-depth: 1 - - - name: Find Go files referencing a tenant-writer surface AND a repo-host token - run: | - set -euo pipefail - - # Repo-host token NAMES — the threat-model subset. Operator-fleet - # tokens (CP_ADMIN_API_TOKEN, RAILWAY_TOKEN, INFISICAL_*) are - # caught by lint-forbidden-env-keys.yml's broader deny set; this - # lint focuses on the git-host class so a single co-occurrence - # match has a low false-positive rate. - FORBIDDEN_KEYS=( - "GITEA_TOKEN" - "GITEA_PAT" - "GITHUB_TOKEN" - "GITHUB_PAT" - "GH_TOKEN" - ) - - # Tenant-writer surface markers. A file matches the surface set - # if it references ANY of these strings. This is the "is this - # code path writing into a tenant workspace?" heuristic. - # Curated to catch the actual code shapes used in this repo - # (verified by grep against current main 2026-05-19): - # - "workspace_secrets" / "global_secrets" → DB table writes - # - "seedAllowList" → CP-side seed table - # - "/settings/secrets" → tenant HTTP API write - # - "envVars[" → in-memory env map write - # - "containerEnv" → docker-run env-set - # - "userData" → EC2 user-data script - # - "provisionPayload" / "provisionContext" → provision-request shape - SURFACE_PATTERN='workspace_secrets|global_secrets|seedAllowList|/settings/secrets|envVars\[|containerEnv|userData|provisionPayload|provisionContext' - - # Files that legitimately reference these names AND a surface - # marker, but do so for guard / strip / test / doc-comment - # reasons. New entries require reviewer signoff and a one-line - # justification in the diff. - EXEMPT_FILES=( - # RFC#523 L1 deny-set source-of-truth + tests - "workspace-server/internal/handlers/workspace_provision_forbidden_env.go" - "workspace-server/internal/handlers/workspace_provision_forbidden_env_test.go" - # Forensic-#145 silent-strip denylist (defense-in-depth, by design lists the names) - "workspace-server/internal/provisioner/provisioner.go" - "workspace-server/internal/provisioner/provisioner_test.go" - # Pre-RFC#523 persona-fallback / org-helper paths. The L1 - # fail-closed runs BEFORE these writers; downstream silent-strip - # also covers them. See applyAgentGitHTTPCreds doc-comment. - "workspace-server/internal/handlers/agent_git_identity.go" - "workspace-server/internal/handlers/org_helpers.go" - "workspace-server/internal/handlers/org.go" - # CP→platform admin auth (NOT a tenant env write). - "workspace-server/internal/provisioner/cp_provisioner.go" - ) - - # Build an extended-regex alternation of forbidden keys. - KEY_ALT="$(IFS='|'; echo "${FORBIDDEN_KEYS[*]}")" - - # Find candidate files: Go non-test sources that contain a - # tenant-writer surface marker. - mapfile -t CANDIDATES < <( - grep -rlE --include='*.go' --exclude='*_test.go' \ - "${SURFACE_PATTERN}" . 2>/dev/null \ - | sed 's|^\./||' \ - | sort -u - ) - - if [ "${#CANDIDATES[@]}" -eq 0 ]; then - echo "OK No tenant-writer-surface files found in tree (unexpected, but not a lint failure)." - exit 0 - fi - - HITS="" - for f in "${CANDIDATES[@]}"; do - # Skip exempt files. - skip=0 - for ex in "${EXEMPT_FILES[@]}"; do - if [ "$f" = "$ex" ]; then skip=1; break; fi - done - [ "$skip" = "1" ] && continue - - # File contains a surface marker; now grep for a forbidden - # key NAME. We require a QUOTED-literal match to avoid - # firing on a comment like "// also handle GITEA_TOKEN". - # - # The literal form catches: - # - os.Getenv("GITEA_TOKEN") - # - envVars["GITEA_TOKEN"] = ... - # - {envKey: "GITEA_TOKEN", tenantKey: "GITEA_TOKEN"} - # but not: - # - // see GITEA_TOKEN below (no quotes) - found=$(grep -nE "\"(${KEY_ALT})\"" "$f" 2>/dev/null || true) - if [ -n "$found" ]; then - HITS="${HITS}--- ${f} ---\n${found}\n" - fi - done - - if [ -n "$HITS" ]; then - echo "::error::Task #146 lint: repo-host token name(s) quoted in a tenant-writer-surface file:" - printf "$HITS" - echo "" - echo "These files reference a tenant-writer surface (workspace_secrets," - echo "seedAllowList, /settings/secrets, containerEnv, userData, etc.)" - echo "AND quote a repo-host token name (GITEA_TOKEN/GITHUB_TOKEN/…)." - echo "Per RFC#523 threat model, tenant workspaces MUST NOT receive" - echo "operator-scope repo-host tokens. If your code legitimately needs" - echo "to reference one of these names in a tenant-writer file (e.g." - echo "a deny-set definition or silent-strip list), add the file to" - echo "EXEMPT_FILES with a one-line justification — reviewer signoff" - echo "required." - exit 1 - fi - - echo "OK No tenant-writer-surface file co-mentions a repo-host token literal." diff --git a/.gitea/workflows/verify-providers-gen.yml b/.gitea/workflows/verify-providers-gen.yml index 5b7bc8063..b9f9978e6 100644 --- a/.gitea/workflows/verify-providers-gen.yml +++ b/.gitea/workflows/verify-providers-gen.yml @@ -35,8 +35,26 @@ name: verify-providers-gen on: pull_request: types: [opened, synchronize, reopened] + # CI-scheduler-overload fix (fix/ci-scheduler-fanout, 2026-06-01): + # this gate only verifies that the generated providers artifact is in + # sync with the schema SSOT. Its verdict can ONLY change when one of + # the codegen inputs/outputs changes, so firing the Go toolchain on + # every unrelated PR (docs, canvas, scripts) is pure fan-out cost. + # Scoped to the codegen surface. SAFE because this workflow is NOT a + # branch-protection status_check_context (see header §ENFORCEMENT + # GATING) — lint-required-no-paths only forbids paths filters on + # REQUIRED workflows; this is advisory, so a paths filter is allowed. + # Mirrors the sibling sync-providers-yaml.yml scoping convention. + paths: + - 'workspace-server/internal/providers/**' + - 'workspace-server/cmd/gen-providers/**' + - '.gitea/workflows/verify-providers-gen.yml' push: branches: [main, staging] + paths: + - 'workspace-server/internal/providers/**' + - 'workspace-server/cmd/gen-providers/**' + - '.gitea/workflows/verify-providers-gen.yml' env: GITHUB_SERVER_URL: https://git.moleculesai.app -- 2.52.0