From c9175c071ceb021d851b32f0530888fda94d7b24 Mon Sep 17 00:00:00 2001 From: core-be Date: Sat, 16 May 2026 11:56:22 -0700 Subject: [PATCH] ci(provisioner-parity): enforce the fast local prod-mimic parity test as a fail-closed merge gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The token-injection/ownership bug class — platform delivers /configs/.auth_token root:root AFTER the entrypoint chown, so the uid-1000 agent's save_token O_WRONLY|O_TRUNC is denied -> list_peers / heartbeat 401 forever — shipped to the fleet (Hermes #1877/#418) and again on template-hermes #162 precisely because nothing ENFORCED the local check. The dev-SOP only referenced feedback_mandatory_local_e2e_before_ship as prose; prose does not stop a PR. This wires the //go:build local provisioner-parity test (added in this PR) into CI as a real gate: - new provisioner-parity job runs `go test -tags local -run TestTokenOwnership` against the runner's Docker daemon. The test self-skips Docker-less (keeps `make test` / Platform (Go) green on dev machines); this job runs on a Docker-capable runner and treats a SKIP or empty run as a FAILURE (fail-closed). - outcomes parsed from the test2json stream as real JSON (Package sits between Action and Test; a grep adjacency match counts zero — a vacuous-green trap caught and fixed in verification). - requires BOTH the headline parity test AND its fail-direction proof control (TestTokenOwnership_FailPre_ProvesCatch) to pass. - joins the `CI / all-required` aggregator (RFC internal#219 §2) so branch protection fail-closes on it with NO branch-protection edit. Verified locally: PASS-case exit 0; Hermes-bug-present FAIL-case exit 1; no-daemon SKIP-case exit 1. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitea/workflows/ci.yml | 134 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 6c98159e..cc4bc704 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -294,6 +294,132 @@ jobs: exit 1 fi + # Provisioner Parity — fast local prod-mimic gate. REQUIRED, always runs. + # + # WHY THIS IS A GATE, NOT A DOC LINE (feedback_checkpointed_workflow_over + # _good_practice_doc): the dev-SOP already *referenced* + # feedback_mandatory_local_e2e_before_ship as prose, but prose does not + # stop a PR. The token-injection/ownership bug class — platform writes + # /configs/.auth_token root:root AFTER the entrypoint chown, so the + # uid-1000 agent's save_token O_WRONLY|O_TRUNC is denied → list_peers / + # heartbeat 401 forever — shipped to the fleet (Hermes #1877/#418) and + # again on template-hermes #162 (the bearer-401 being landed right now) + # precisely because nothing *enforced* the local check. The parity test + # (workspace-server/internal/provisioner/provisioner_token_ownership_ + # local_test.go, `//go:build local`) reproduces that exact class against + # a LOCAL Docker daemon in <1s — versus an ~1h EC2 fresh-provision. This + # job makes it fail-closed on every workspace-server PR. + # + # FAIL-CLOSED CONTRACT: the test self-skips when no Docker daemon is + # reachable (so `make test` / `go test ./...` stay green on Docker-less + # dev machines and the standard Platform (Go) job). A *gate* that + # silently skips is not a gate. This job runs on a Docker-capable runner + # and treats "0 parity tests ran" as a FAILURE — a skipped daemon here + # means the gate did not execute, which must block merge, not pass. + # + # Always-run + per-step gating shape mirrors platform-build so the + # `CI / Provisioner Parity (<event>)` required-check name is always + # emitted (SKIPPED != passed under branch protection — PR #2314). 
+ provisioner-parity: + name: Provisioner Parity + runs-on: ubuntu-latest + needs: [changes] + continue-on-error: false + # Test is seconds-local; generous ceiling absorbs a cold alpine pull + # on a slow runner link plus Go module/build cold cache. + timeout-minutes: 15 + defaults: + run: + working-directory: workspace-server + steps: + - if: ${{ needs.changes.outputs.platform != 'true' }} + working-directory: . + run: echo "No workspace-server/** changes — parity gate is a no-op for this PR; this job always runs to satisfy the required-check name on branch protection." + - if: ${{ needs.changes.outputs.platform == 'true' }} + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - if: ${{ needs.changes.outputs.platform == 'true' }} + uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 + with: + go-version: 'stable' + - if: ${{ needs.changes.outputs.platform == 'true' }} + run: go mod download + - if: ${{ needs.changes.outputs.platform == 'true' }} + name: Fast local prod-mimic provisioner-parity (fail-closed) + # Run the `//go:build local` parity suite against the runner's + # Docker daemon. -json lets us assert tests actually RAN: a skip + # (no daemon) or zero-test run must fail this gate, never pass it. + run: | + set -euo pipefail + echo "Docker daemon check (gate requires a reachable daemon):" + docker version --format '{{.Server.Version}}' \ + || { echo "::error::Provisioner-parity gate could not reach a Docker daemon. This gate is fail-closed: a missing daemon means the token-ownership class was NOT checked. Failing the PR rather than passing un-tested."; exit 1; } + set +e + go test -tags local -json -run 'TestTokenOwnership' \ + -timeout 12m ./internal/provisioner/ | tee /tmp/parity.json + gotest_exit=${PIPESTATUS[0]} + set -e + # Parse the test2json stream as real JSON. 
test2json emits + # objects as {"Action":..,"Package":..,"Test":..} — field ORDER + # is not guaranteed and `Package` sits between `Action` and + # `Test`, so a grep adjacency match silently counts ZERO (a + # vacuous-green trap that nearly shipped here — caught in + # verification). Per-test terminal action is the source of truth. + GOTEST_EXIT="$gotest_exit" python3 - <<'PY' + import json, os, sys + headline = "TestTokenOwnership_LocalProvisionerParity" + proof = "TestTokenOwnership_FailPre_ProvesCatch" + outcome = {} # test name -> last terminal action + with open("/tmp/parity.json") as fh: + for line in fh: + line = line.strip() + if not line or not line.startswith("{"): + continue + try: + ev = json.loads(line) + except json.JSONDecodeError: + continue + t = ev.get("Test") + a = ev.get("Action") + if t and a in ("pass", "fail", "skip"): + outcome[t] = a + passed = sum(1 for v in outcome.values() if v == "pass") + failed = sum(1 for v in outcome.values() if v == "fail") + skipped = sum(1 for v in outcome.values() if v == "skip") + go_exit = int(os.environ["GOTEST_EXIT"]) + print(f"parity outcomes: passed={passed} failed={failed} " + f"skipped={skipped} go_exit={go_exit} " + f"per-test={outcome}") + if go_exit != 0 or failed > 0: + print("::error::Provisioner-parity FAILED — the " + "token-injection/ownership bug class (Hermes " + "#1877/#418, template-hermes #162: /configs token " + "files delivered root:root, uid-1000 agent save_token " + "denied -> list_peers/heartbeat 401) is present. Fix " + "the provisioner injection to deliver AgentUID-owned " + "files before merge.") + sys.exit(1) + # The headline parity test AND its fail-direction proof control + # MUST have run and passed. If either was skipped (no daemon) or + # never collected, the gate did not actually execute its job — + # fail-closed, never pass un-checked. 
+ if outcome.get(headline) != "pass": + print(f"::error::Provisioner-parity gate did NOT execute " + f"the headline test ({headline}={outcome.get(headline)}" + f"). Fail-closed: a skipped/absent parity run means " + f"the token-ownership class was never checked on this " + f"PR — treated as a gate failure.") + sys.exit(1) + if outcome.get(proof) != "pass": + print(f"::error::Provisioner-parity fail-direction proof " + f"control did NOT pass ({proof}=" + f"{outcome.get(proof)}). Without it the headline " + f"assertion is not proven load-bearing — fail-closed.") + sys.exit(1) + print(f"Provisioner-parity gate PASSED: token-ownership class " + f"checked locally and the fail-direction proof control " + f"confirms the assertion is load-bearing (passed={passed}).") + PY + # Canvas (Next.js) — required check, always runs. Same always-run + # per-step gating shape as platform-build. The two-job-sharing-name # pattern attempted in PR #2321 doesn't satisfy branch protection @@ -591,6 +717,14 @@ jobs: required = [ f"CI / Detect changes ({event})", f"CI / Platform (Go) ({event})", + # Fast local prod-mimic provisioner-parity gate (this PR). + # Wired here — not into branch-protection's + # status_check_contexts — by RFC internal#219 §2 design: + # the single stable `CI / all-required` context is what BP + # points at, and new fail-closed gates join by extending + # this list. Makes the token-ownership class (Hermes + # #1877/#418, template-hermes #162) a hard merge gate. + f"CI / Provisioner Parity ({event})", f"CI / Canvas (Next.js) ({event})", f"CI / Shellcheck (E2E scripts) ({event})", f"CI / Python Lint & Test ({event})",