test(provisioner): fast local-Docker parity test for the token-injection ownership bug class #1332

Open
core-be wants to merge 4 commits from test/local-provisioner-token-ownership-parity into main
3 changed files with 608 additions and 1 deletions

View File

@ -294,6 +294,132 @@ jobs:
exit 1
fi
# Provisioner Parity — fast local prod-mimic gate. REQUIRED, always runs.
#
# WHY THIS IS A GATE, NOT A DOC LINE (feedback_checkpointed_workflow_over
# _good_practice_doc): the dev-SOP already *referenced*
# feedback_mandatory_local_e2e_before_ship as prose, but prose does not
# stop a PR. The token-injection/ownership bug class — platform writes
# /configs/.auth_token root:root AFTER the entrypoint chown, so the
# uid-1000 agent's save_token O_WRONLY|O_TRUNC is denied → list_peers /
# heartbeat 401 forever — shipped to the fleet (Hermes #1877/#418) and
# again on template-hermes #162 (the bearer-401 being landed right now)
# precisely because nothing *enforced* the local check. The parity test
# (workspace-server/internal/provisioner/provisioner_token_ownership_
# local_test.go, `//go:build local`) reproduces that exact class against
# a LOCAL Docker daemon in <1s — versus an ~1h EC2 fresh-provision. This
# job makes it fail-closed on every workspace-server PR.
#
# FAIL-CLOSED CONTRACT: the test self-skips when no Docker daemon is
# reachable (so `make test` / `go test ./...` stay green on Docker-less
# dev machines and the standard Platform (Go) job). A *gate* that
# silently skips is not a gate. This job runs on a Docker-capable runner
# and treats "0 parity tests ran" as a FAILURE — a skipped daemon here
# means the gate did not execute, which must block merge, not pass.
#
# Always-run + per-step gating shape mirrors platform-build so the
# `CI / Provisioner Parity (<event>)` required-check name is always
# emitted (SKIPPED != passed under branch protection — PR #2314).
provisioner-parity:
name: Provisioner Parity
runs-on: ubuntu-latest
needs: [changes]
continue-on-error: false
# Test is seconds-local; generous ceiling absorbs a cold alpine pull
# on a slow runner link plus Go module/build cold cache.
timeout-minutes: 15
defaults:
run:
working-directory: workspace-server
steps:
- if: ${{ needs.changes.outputs.platform != 'true' }}
working-directory: .
run: echo "No workspace-server/** changes — parity gate is a no-op for this PR; this job always runs to satisfy the required-check name on branch protection."
- if: ${{ needs.changes.outputs.platform == 'true' }}
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- if: ${{ needs.changes.outputs.platform == 'true' }}
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
with:
go-version: 'stable'
- if: ${{ needs.changes.outputs.platform == 'true' }}
run: go mod download
- if: ${{ needs.changes.outputs.platform == 'true' }}
name: Fast local prod-mimic provisioner-parity (fail-closed)
# Run the `//go:build local` parity suite against the runner's
# Docker daemon. -json lets us assert tests actually RAN: a skip
# (no daemon) or zero-test run must fail this gate, never pass it.
run: |
set -euo pipefail
echo "Docker daemon check (gate requires a reachable daemon):"
docker version --format '{{.Server.Version}}' \
|| { echo "::error::Provisioner-parity gate could not reach a Docker daemon. This gate is fail-closed: a missing daemon means the token-ownership class was NOT checked. Failing the PR rather than passing un-tested."; exit 1; }
set +e
go test -tags local -json -run 'TestTokenOwnership' \
-timeout 12m ./internal/provisioner/ | tee /tmp/parity.json
gotest_exit=${PIPESTATUS[0]}
set -e
# Parse the test2json stream as real JSON. test2json emits
# objects as {"Action":..,"Package":..,"Test":..} — field ORDER
# is not guaranteed and `Package` sits between `Action` and
# `Test`, so a grep adjacency match silently counts ZERO (a
# vacuous-green trap that nearly shipped here — caught in
# verification). Per-test terminal action is the source of truth.
GOTEST_EXIT="$gotest_exit" python3 - <<'PY'
import json, os, sys
headline = "TestTokenOwnership_LocalProvisionerParity"
proof = "TestTokenOwnership_FailPre_ProvesCatch"
outcome = {} # test name -> last terminal action
with open("/tmp/parity.json") as fh:
for line in fh:
line = line.strip()
if not line or not line.startswith("{"):
continue
try:
ev = json.loads(line)
except json.JSONDecodeError:
continue
t = ev.get("Test")
a = ev.get("Action")
if t and a in ("pass", "fail", "skip"):
outcome[t] = a
passed = sum(1 for v in outcome.values() if v == "pass")
failed = sum(1 for v in outcome.values() if v == "fail")
skipped = sum(1 for v in outcome.values() if v == "skip")
go_exit = int(os.environ["GOTEST_EXIT"])
print(f"parity outcomes: passed={passed} failed={failed} "
f"skipped={skipped} go_exit={go_exit} "
f"per-test={outcome}")
if go_exit != 0 or failed > 0:
print("::error::Provisioner-parity FAILED — the "
"token-injection/ownership bug class (Hermes "
"#1877/#418, template-hermes #162: /configs token "
"files delivered root:root, uid-1000 agent save_token "
"denied -> list_peers/heartbeat 401) is present. Fix "
"the provisioner injection to deliver AgentUID-owned "
"files before merge.")
sys.exit(1)
# The headline parity test AND its fail-direction proof control
# MUST have run and passed. If either was skipped (no daemon) or
# never collected, the gate did not actually execute its job —
# fail-closed, never pass un-checked.
if outcome.get(headline) != "pass":
print(f"::error::Provisioner-parity gate did NOT execute "
f"the headline test ({headline}={outcome.get(headline)}"
f"). Fail-closed: a skipped/absent parity run means "
f"the token-ownership class was never checked on this "
f"PR — treated as a gate failure.")
sys.exit(1)
if outcome.get(proof) != "pass":
print(f"::error::Provisioner-parity fail-direction proof "
f"control did NOT pass ({proof}="
f"{outcome.get(proof)}). Without it the headline "
f"assertion is not proven load-bearing — fail-closed.")
sys.exit(1)
print(f"Provisioner-parity gate PASSED: token-ownership class "
f"checked locally and the fail-direction proof control "
f"confirms the assertion is load-bearing (passed={passed}).")
PY
# Canvas (Next.js) — required check, always runs. Same always-run +
# per-step gating shape as platform-build. The two-job-sharing-name
# pattern attempted in PR #2321 doesn't satisfy branch protection
@ -591,6 +717,14 @@ jobs:
required = [
f"CI / Detect changes ({event})",
f"CI / Platform (Go) ({event})",
# Fast local prod-mimic provisioner-parity gate (this PR).
# Wired here — not into branch-protection's
# status_check_contexts — by RFC internal#219 §2 design:
# the single stable `CI / all-required` context is what BP
# points at, and new fail-closed gates join by extending
# this list. Makes the token-ownership class (Hermes
# #1877/#418, template-hermes #162) a hard merge gate.
f"CI / Provisioner Parity ({event})",
f"CI / Canvas (Next.js) ({event})",
f"CI / Shellcheck (E2E scripts) ({event})",
f"CI / Python Lint & Test ({event})",

View File

@ -4,7 +4,7 @@
# use this Makefile; CI calls docker compose / go test directly so the
# Makefile can evolve without breaking the build.
.PHONY: help dev up down logs build test
.PHONY: help dev up down logs build test test-local-e2e
help: ## Show this help.
@grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-12s\033[0m %s\n", $$1, $$2}'
@ -26,3 +26,17 @@ build: ## Force a fresh build of the platform image (no cache).
test: ## Run Go unit tests in workspace-server/.
cd workspace-server && go test -race ./...
# Mandatory local-E2E gate (feedback_mandatory_local_e2e_before_ship,
# feedback_local_must_mimic_production). The provisioner-parity tests
# invoke the REAL WriteAuthTokenToVolume + WriteFilesToContainer against
# the LOCAL Docker daemon and assert /configs token files are
# AgentUID-owned — the formerly-prod-only token-injection ownership bug
# class (Hermes list_peers 401, #1877/#418), now caught in SECONDS
# locally instead of an ~1h EC2 round-trip. These self-skip when no
# Docker daemon is reachable (so `make test`/CI stays green on
# Docker-less runners); this target requires a daemon and is the local
# fast counterpart to the staging-required gate. Run before pushing any
# workspace-server provisioner change.
test-local-e2e: ## Run Docker-gated local-E2E parity tests (requires a local Docker daemon).
cd workspace-server && go test -run 'TestTokenOwnership' -v ./internal/provisioner/

View File

@ -0,0 +1,459 @@
//go:build local
// +build local
package provisioner
// Fast local provisioner-parity test — the proof-of-pattern that the
// formerly "prod-only / slow" token-injection ownership bug class is
// reproducible against a LOCAL Docker daemon in SECONDS, not an
// ~hour-round-trip through EC2.
//
// WHY THIS EXISTS
//
// The fleet-wide list_peers 401 incident (Hermes et al; #1877 / #418)
// came from THIS package's WriteAuthTokenToVolume +
// WriteFilesToContainer running their Docker API operations as root and
// never chowning the delivered files to the in-container agent uid
// (AgentUID, 1000). The agent runs as AgentUID (every workspace
// template `useradd -u 1000 agent`; workspace/entrypoint.sh `gosu
// agent`), so the platform injecting /configs/.auth_token as root:root
// AFTER the entrypoint's `chown -R agent /configs` means the agent-uid
// a2a_mcp_server gets EACCES → empty bearer → platform 401 on
// /registry/{id}/peers (the literal list_peers path).
//
// Those are Docker API operations, NOT AWS operations. They run
// IDENTICALLY against a local Docker daemon. They were only "prod-only"
// because the handler forks local-vs-EC2 at
// handlers/workspace_dispatchers.go (cpProv-vs-provisioner) and the
// local stack historically didn't drive the same post-start
// re-injection sequence — NOT because the bug needs AWS. This test
// invokes the REAL functions against REAL local Docker and asserts the
// REAL file ownership semantics the agent depends on. NO mock — a mock
// here would bypass the exact ownership semantics that caused the
// incident (the proxy-trap).
//
// EXPECTED BEHAVIOUR ACROSS CODE STATES (both proven in review):
//
// - pre-fix code (WriteAuthTokenToVolume: `printf > .auth_token &&
// chmod 0600`, no chown; WriteFilesToContainer: tar.Header with
// Uid/Gid unset → 0/root): the post-start re-injection lands
// root:root → the AgentUID write probe FAILS. Measured: the
// headline test fails in ~0.9s. This test would have caught the
// Hermes incident LOCALLY, in seconds, instead of an ~1h prod
// round-trip.
// - post-fix code (this PR's base — the agent-owned-injection fix:
// chown 1000:1000 in the volume writer + tar Uid/Gid=AgentUID):
// files land AgentUID-owned → the probe PASSES. Measured: ~0.87s.
//
// The probe asserts WRITABILITY by AgentUID, not mere readability: the
// token file is delivered world-readable (mode 0644 via the tar
// header), so a naive `cat` probe stays green even on root:root and the
// assertion would be vacuously useless. The real failure is on the
// RECOVERY path — heartbeat 401 → platform issues a fresh token →
// platform_auth.save_token does os.open(O_WRONLY|O_CREAT|O_TRUNC,
// 0o600) to persist it — and a root:root file denies the AgentUID
// write, so the agent can never persist a rotated token and 401s
// forever. TestTokenOwnership_FailPre_ProvesCatch pins the pre-fix
// root:root delivery shape independently of the production source and
// asserts the SAME probe catches it, so the catch stays demonstrable
// even on this fix-based branch where the headline test is green.
//
// RUNTIME: seconds. One-time alpine pull (cached), throwaway
// containers, no EC2, no provision wait.
import (
"archive/tar"
"bytes"
"context"
"fmt"
"io"
"os/exec"
"strings"
"testing"
"time"
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/volume"
)
// agentUID is the in-container uid the workspace agent runs as. Bound to
// the production contract via the package's exported AgentUID constant
// (the agent-owned-injection fix defines it; every template
// `useradd -u 1000 agent` and entrypoint.sh `gosu agent` honour it). We
// reference AgentUID rather than a private literal so this test and the
// production fix share ONE source of truth — if the runtime ever moves
// the agent off 1000, AgentUID and this test move together.
const agentUID = AgentUID
// buildRootOwnedTar builds a tar stream with each entry's Mode set but
// Uid/Gid LEFT UNSET (Go tar.Header zero value → 0 → root). This is a
// faithful reproduction of the PRE-fix WriteFilesToContainer header
// (`&tar.Header{Name, Mode, Size}` — no Uid/Gid). Used by the FailPre
// control to pin the OLD delivery shape independently of the production
// source, so the catch stays demonstrable after the fix lands.
func buildRootOwnedTar(t *testing.T, files map[string][]byte) io.Reader {
t.Helper()
var buf bytes.Buffer
tw := tar.NewWriter(&buf)
for name, data := range files {
if err := tw.WriteHeader(&tar.Header{
Name: name,
Mode: 0644,
Size: int64(len(data)),
// Uid/Gid intentionally unset == 0 == root (the bug).
}); err != nil {
t.Fatalf("tar header: %v", err)
}
if _, err := tw.Write(data); err != nil {
t.Fatalf("tar write: %v", err)
}
}
if err := tw.Close(); err != nil {
t.Fatalf("tar close: %v", err)
}
return &buf
}
// dockerOrSkip returns a Provisioner bound to the local Docker daemon,
// or skips if no daemon is reachable. This is a real-Docker integration
// test by design (per feedback_real_subprocess_test_for_boot_path:
// in-process / mock misses the ownership bug entirely). Skipping on
// no-daemon keeps `go test ./...` green on Docker-less machines while
// the local-E2E lane (which always has a daemon) still gates on it.
func dockerOrSkip(t *testing.T) *Provisioner {
t.Helper()
if testing.Short() {
t.Skip("skipping real-Docker provisioner-parity test in -short mode")
}
p, err := New()
if err != nil {
t.Skipf("no local Docker daemon (New: %v) — this lane requires one", err)
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if _, err := p.cli.Ping(ctx); err != nil {
t.Skipf("local Docker daemon unreachable (Ping: %v) — this lane requires one", err)
}
return p
}
// ensureAlpine pulls alpine once if absent. WriteAuthTokenToVolume
// itself uses the `alpine` image, so the daemon needs it regardless;
// pulling here makes the one-time cost explicit and the timed
// assertions honest (pull is one-time, not per-run).
func ensureAlpine(t *testing.T, p *Provisioner) {
t.Helper()
ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second)
defer cancel()
if _, err := p.cli.ImageInspect(ctx, "alpine"); err == nil {
return
}
if err := pullImageAndDrain(ctx, p.cli, "alpine", ""); err != nil {
t.Fatalf("failed to pull alpine (needed by WriteAuthTokenToVolume too): %v", err)
}
}
// uniqueWorkspaceID returns a short, test-scoped workspace id. Volume +
// container names derive from the first 12 chars (ConfigVolumeName /
// ContainerName truncate), so the entropy must live in the prefix.
func uniqueWorkspaceID() string {
return fmt.Sprintf("t%011d", time.Now().UnixNano()%1e11)
}
// cleanupWorkspace force-removes the container + config volume the test
// created. Best-effort: honours feedback_cleanup_after_each_test —
// leave the daemon as we found it.
func cleanupWorkspace(p *Provisioner, workspaceID string) {
ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
defer cancel()
_ = p.cli.ContainerRemove(ctx, ContainerName(workspaceID),
container.RemoveOptions{Force: true})
_ = p.cli.VolumeRemove(ctx, ConfigVolumeName(workspaceID), true)
}
// rewritableByAgentUID returns nil iff uid agentUID can re-open `path`
// inside `containerID` for WRITE+TRUNCATE — the EXACT operation the
// runtime's platform_auth.save_token performs:
//
// os.open(path, O_WRONLY|O_CREAT|O_TRUNC, 0o600)
//
// This is the operation that 401'd Hermes. Modelled with `: > path`
// (shell O_WRONLY|O_CREAT|O_TRUNC), the minimal faithful equivalent. A
// non-nil error is the Hermes failure: a rotated token cannot be
// persisted over a root:root file → 401 forever.
func rewritableByAgentUID(ctx context.Context, p *Provisioner, containerID, path string) error {
execCfg := container.ExecOptions{
User: fmt.Sprintf("%d", agentUID),
Cmd: []string{"sh", "-c", ": > " + path},
AttachStdout: true,
AttachStderr: true,
}
ex, err := p.cli.ContainerExecCreate(ctx, containerID, execCfg)
if err != nil {
return fmt.Errorf("exec create: %w", err)
}
att, err := p.cli.ContainerExecAttach(ctx, ex.ID, container.ExecAttachOptions{})
if err != nil {
return fmt.Errorf("exec attach: %w", err)
}
// Drain the multiplexed stream to completion. This BLOCKS until the
// exec process exits — without it, ContainerExecInspect races the
// process and reports ExitCode 0 / Running=true, silently inverting
// the assertion (the proxy-trap that almost shipped this test green
// against the real bug — caught and fixed in review).
_, _ = io.Copy(io.Discard, att.Reader)
att.Close()
var insp container.ExecInspect
deadline := time.Now().Add(10 * time.Second)
for {
insp, err = p.cli.ContainerExecInspect(ctx, ex.ID)
if err != nil {
return fmt.Errorf("exec inspect: %w", err)
}
if !insp.Running {
break
}
if time.Now().After(deadline) {
return fmt.Errorf("exec %s still running after 10s", ex.ID)
}
time.Sleep(50 * time.Millisecond)
}
if insp.ExitCode != 0 {
return fmt.Errorf("uid %d cannot O_WRONLY|O_TRUNC %s (exit %d) — "+
"this is the save_token write-denial that 401'd Hermes: the "+
"agent receives a rotated token but cannot persist it over a "+
"root:root file, so it 401s forever",
agentUID, path, insp.ExitCode)
}
return nil
}
// startConfigsContainer creates and starts a long-lived container that
// mounts the workspace's REAL config volume at /configs and models the
// production ownership lifecycle: runs as root, `chown -R agent
// /configs` (mirrors workspace/entrypoint.sh), then idles. The agent is
// created at uid agentUID exactly like the workspace templates. This is
// the container into which WriteFilesToContainer re-injects (#418)
// post-start — the precise moment the prod bug manifests.
func startConfigsContainer(t *testing.T, p *Provisioner, workspaceID string) string {
t.Helper()
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
volName := ConfigVolumeName(workspaceID)
entry := fmt.Sprintf(
"adduser -D -u %d agent 2>/dev/null || true; "+
"mkdir -p /configs; chown -R agent /configs; "+
"exec sleep 600", agentUID)
resp, err := p.cli.ContainerCreate(ctx, &container.Config{
Image: "alpine",
Cmd: []string{"sh", "-c", entry},
User: "0", // entrypoint starts as root, like the runtime
}, &container.HostConfig{
Binds: []string{volName + ":/configs"},
}, nil, nil, ContainerName(workspaceID))
if err != nil {
t.Fatalf("create configs container: %v", err)
}
if err := p.cli.ContainerStart(ctx, resp.ID, container.StartOptions{}); err != nil {
t.Fatalf("start configs container: %v", err)
}
if err := waitForChownedConfigs(ctx, p, resp.ID); err != nil {
t.Fatalf("configs container never became ready: %v", err)
}
return resp.ID
}
// waitForChownedConfigs blocks until /configs is present AND owned by
// the agent uid — i.e. the entrypoint's `chown -R agent /configs` has
// completed. Polling on ownership (not mere existence) is the correct
// readiness signal: the bug is the post-start re-injection landing
// root:root *after* this chown, so the test must not re-inject until
// the chown is definitively done. Each probe drains the exec stream to
// completion before trusting ExitCode (same race fix as
// rewritableByAgentUID).
func waitForChownedConfigs(ctx context.Context, p *Provisioner, containerID string) error {
deadline := time.Now().Add(20 * time.Second)
for time.Now().Before(deadline) {
ex, err := p.cli.ContainerExecCreate(ctx, containerID, container.ExecOptions{
Cmd: []string{"sh", "-c",
fmt.Sprintf("[ \"$(stat -c %%u /configs)\" = \"%d\" ]", agentUID)},
AttachStdout: true,
AttachStderr: true,
})
if err == nil {
if att, aerr := p.cli.ContainerExecAttach(ctx, ex.ID,
container.ExecAttachOptions{}); aerr == nil {
_, _ = io.Copy(io.Discard, att.Reader)
att.Close()
if insp, ierr := p.cli.ContainerExecInspect(ctx, ex.ID); ierr == nil &&
!insp.Running && insp.ExitCode == 0 {
return nil
}
}
}
time.Sleep(150 * time.Millisecond)
}
return fmt.Errorf("/configs not chowned to uid %d after wait", agentUID)
}
// TestTokenOwnership_LocalProvisionerParity is the headline test. It
// drives the EXACT production sequence against local Docker:
//
// 1. WriteAuthTokenToVolume (#1877): pre-start write of .auth_token
// into the named config volume via the real throwaway alpine writer.
// 2. start a container modelling the runtime (root → chown /configs →
// would gosu agent), mounting that same real volume.
// 3. WriteFilesToContainer (#418): post-start re-injection of
// .auth_token + .platform_inbound_secret into the running container
// via the real tar/CopyToContainer path.
// 4. assert uid agentUID can RE-WRITE both files (the save_token
// recovery path) — the exact property the Hermes 401 violated.
//
// FAILS in ~0.9s on pre-fix code (files root:root). PASSES on this PR's
// fix-based branch (files agent-owned).
func TestTokenOwnership_LocalProvisionerParity(t *testing.T) {
p := dockerOrSkip(t)
ensureAlpine(t, p)
workspaceID := uniqueWorkspaceID()
t.Cleanup(func() { cleanupWorkspace(p, workspaceID) })
start := time.Now()
// --- Step 1: REAL WriteAuthTokenToVolume (#1877, pre-start) ------
const tokenPlain = "wsauth-fasttest-tok-abc123"
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
if err := p.WriteAuthTokenToVolume(ctx, workspaceID, tokenPlain); err != nil {
t.Fatalf("WriteAuthTokenToVolume (real #1877 path) failed: %v", err)
}
// --- Step 2: start the runtime-modelling container --------------
containerID := startConfigsContainer(t, p, workspaceID)
// --- Step 3: REAL WriteFilesToContainer (#418 post-start) -------
files := map[string][]byte{
".auth_token": []byte(tokenPlain),
".platform_inbound_secret": []byte("inbound-secret-xyz789"),
}
if err := p.WriteFilesToContainer(ctx, containerID, files); err != nil {
t.Fatalf("WriteFilesToContainer (real #418 path) failed: %v", err)
}
elapsed := time.Since(start)
// --- Step 4: assert the property Hermes violated ----------------
for _, f := range []string{
"/configs/.auth_token",
"/configs/.platform_inbound_secret",
} {
if err := rewritableByAgentUID(ctx, p, containerID, f); err != nil {
t.Errorf("PROVISIONER-PARITY BUG (Hermes class): %v\n"+
" This is the formerly-prod-only token-injection "+
"ownership bug, caught LOCALLY in %s. The real "+
"WriteFilesToContainer (#418 post-start re-inject) "+
"delivered %s owned root:root; the workspace agent "+
"runs as uid %d and its save_token O_WRONLY|O_TRUNC "+
"is denied → it cannot persist a rotated token → "+
"heartbeat / list_peers 401 forever. Fix: inject "+
"delivered files owned by AgentUID (this PR's base). "+
"See incident notes #1877 / #418.",
err, elapsed, f, agentUID)
}
}
t.Logf("provisioner-parity sequence (real WriteAuthTokenToVolume + "+
"WriteFilesToContainer vs local Docker) completed in %s — "+
"the SECONDS-local replacement for the ~1h EC2 round-trip "+
"that previously gated this bug class", elapsed)
// Hard guard on the headline claim: this MUST be seconds. 90s
// ceiling absorbs a cold alpine pull on a slow link; steady-state
// (image cached) is sub-2s.
if elapsed > 90*time.Second {
t.Errorf("provisioner-parity test took %s — claimed to be "+
"SECONDS-local; investigate (image pull? daemon load?)",
elapsed)
}
}
// TestTokenOwnership_FailPre_ProvesCatch pins the PRE-fix delivery shape
// (post-start re-injection via a tar with Uid/Gid unset → root) and
// asserts the SAME AgentUID write probe the headline test uses FAILS on
// it. This proves the assertion genuinely catches the bug — it is not
// vacuously green — and keeps the fail-direction demonstrable on this
// fix-based branch where the headline test is green. Per
// feedback_assert_exact_not_substring: verify the tightened assertion
// FAILS on old behaviour.
func TestTokenOwnership_FailPre_ProvesCatch(t *testing.T) {
p := dockerOrSkip(t)
ensureAlpine(t, p)
workspaceID := uniqueWorkspaceID()
t.Cleanup(func() { cleanupWorkspace(p, workspaceID) })
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
defer cancel()
if _, err := p.cli.VolumeCreate(ctx, volume.CreateOptions{
Name: ConfigVolumeName(workspaceID), Labels: managedLabels(),
}); err != nil {
t.Fatalf("volume create: %v", err)
}
// Start a runtime-model container (root → chown -R agent /configs →
// idle) so the entrypoint chown has ALREADY run, exactly as in
// production. The bug is the POST-start re-injection landing
// root:root *after* that chown — so pinning the pre-fix
// WriteFilesToContainer tar shape and copying it in post-start is
// the faithful pre-fix reproduction.
containerID := startConfigsContainer(t, p, workspaceID)
preFixTar := buildRootOwnedTar(t, map[string][]byte{
".auth_token": []byte("pre-fix-token"),
})
if err := p.cli.CopyToContainer(ctx, containerID, "/configs",
preFixTar, container.CopyToContainerOptions{}); err != nil {
t.Fatalf("pre-fix CopyToContainer: %v", err)
}
probeErr := rewritableByAgentUID(ctx, p, containerID, "/configs/.auth_token")
if probeErr == nil {
t.Fatalf("REGRESSION IN THE TEST ITSELF: the uid-%d write "+
"probe PASSED against the known-pre-fix root:root "+
"post-start re-injection. The headline assertion would "+
"be vacuously green and would NOT have caught Hermes. "+
"Investigate rewritableByAgentUID before trusting the "+
"parity test.", agentUID)
}
t.Logf("confirmed: pre-fix root:root post-start re-injection is "+
"correctly caught by the uid-%d write probe (%v) — the "+
"parity assertion is load-bearing, not vacuous", agentUID, probeErr)
}
// TestTokenOwnership_DockerIsLocalNotAWS asserts the core insight:
// WriteAuthTokenToVolume + WriteFilesToContainer reach ONLY the Docker
// client — no AWS SDK, no EC2/EBS/SG calls. That is why the bug class
// is locally reproducible at all. Static guard: the provisioner package
// must not import the AWS SDK (belt-and-braces with architecture_test.go
// and the driver-seam RFC, internal #184).
func TestTokenOwnership_DockerIsLocalNotAWS(t *testing.T) {
t.Parallel()
out, err := exec.Command("go", "list", "-deps",
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner").
Output()
if err != nil {
t.Skipf("go list unavailable in this env: %v", err)
}
for _, line := range strings.Split(string(out), "\n") {
if strings.Contains(line, "aws-sdk-go") {
t.Fatalf("provisioner pulls in the AWS SDK (%s) — the "+
"token-injection paths are supposed to be "+
"driver-agnostic Docker-only ops; an AWS dep here "+
"would undermine the local-testability this test "+
"proves. See the driver-seam RFC (internal #184).",
strings.TrimSpace(line))
}
}
}