From 4d3c326fd9fa6c039dd9e766fffb5d0e275eccc4 Mon Sep 17 00:00:00 2001 From: core-be Date: Sat, 16 May 2026 02:37:07 -0700 Subject: [PATCH] test(provisioner): fast local-Docker parity test for the token-injection ownership bug class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Hermes fleet-wide list_peers 401 (#1877/#418) came from WriteAuthTokenToVolume + WriteFilesToContainer delivering /configs token files root:root AFTER the entrypoint's chown -R agent /configs, so the AgentUID a2a_mcp_server got EACCES → empty bearer → 401. Those are Docker API ops, NOT AWS — they were only "prod-only" because the local stack didn't drive the same post-start re-injection sequence, NOT because they need EC2. This test invokes the REAL WriteAuthTokenToVolume + WriteFilesToContainer against the LOCAL Docker daemon and asserts AgentUID can re-write /configs/.auth_token + .platform_inbound_secret (the save_token O_WRONLY|O_TRUNC recovery path that actually 401'd Hermes — a read probe stays green on root:root because the file is world-readable, so that would have been a vacuous proxy assertion). Demonstrated both directions against the two code states: - pre-fix (pristine staging): headline test FAILS in ~0.9s — would have caught Hermes locally instead of an ~1h EC2 round-trip. - post-fix (this PR's base, the agent-owned-injection fix): PASSES in ~0.87s. TestTokenOwnership_FailPre_ProvesCatch pins the pre-fix root:root delivery shape independently so the catch stays demonstrable on this fix-based branch (the assertion is load-bearing, not vacuously green). TestTokenOwnership_DockerIsLocalNotAWS statically guards that the provisioner has no AWS SDK dep — the reason this bug class is locally reproducible at all. Wired into the mandatory local-E2E gate via `make test-local-e2e` (feedback_mandatory_local_e2e_before_ship); self-skips when no Docker daemon is reachable so `make test`/CI stays green on Docker-less runners. Local fast counterpart to the staging-required gate. Stacked on fix/workspace-token-injection-agent-owned (PR #1327) so it lands green; references the exported provisioner.AgentUID contract rather than a duplicated literal. Co-Authored-By: Claude Opus 4.7 (1M context) --- Makefile | 16 +- .../provisioner_token_ownership_local_test.go | 456 ++++++++++++++++++ 2 files changed, 471 insertions(+), 1 deletion(-) create mode 100644 workspace-server/internal/provisioner/provisioner_token_ownership_local_test.go diff --git a/Makefile b/Makefile index 847a85ce..63d1cc37 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ # use this Makefile; CI calls docker compose / go test directly so the # Makefile can evolve without breaking the build. -.PHONY: help dev up down logs build test +.PHONY: help dev up down logs build test test-local-e2e help: ## Show this help. @grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-12s\033[0m %s\n", $$1, $$2}' @@ -26,3 +26,17 @@ build: ## Force a fresh build of the platform image (no cache). test: ## Run Go unit tests in workspace-server/. cd workspace-server && go test -race ./... + +# Mandatory local-E2E gate (feedback_mandatory_local_e2e_before_ship, +# feedback_local_must_mimic_production). The provisioner-parity tests +# invoke the REAL WriteAuthTokenToVolume + WriteFilesToContainer against +# the LOCAL Docker daemon and assert /configs token files are +# AgentUID-owned — the formerly-prod-only token-injection ownership bug +# class (Hermes list_peers 401, #1877/#418), now caught in SECONDS +# locally instead of an ~1h EC2 round-trip. These self-skip when no +# Docker daemon is reachable (so `make test`/CI stays green on +# Docker-less runners); this target requires a daemon and is the local +# fast counterpart to the staging-required gate. Run before pushing any +# workspace-server provisioner change. +test-local-e2e: ## Run Docker-gated local-E2E parity tests (requires a local Docker daemon). + cd workspace-server && go test -run 'TestTokenOwnership' -v ./internal/provisioner/ diff --git a/workspace-server/internal/provisioner/provisioner_token_ownership_local_test.go b/workspace-server/internal/provisioner/provisioner_token_ownership_local_test.go new file mode 100644 index 00000000..a70851be --- /dev/null +++ b/workspace-server/internal/provisioner/provisioner_token_ownership_local_test.go @@ -0,0 +1,456 @@ +package provisioner + +// Fast local provisioner-parity test — the proof-of-pattern that the +// formerly "prod-only / slow" token-injection ownership bug class is +// reproducible against a LOCAL Docker daemon in SECONDS, not an +// ~hour-round-trip through EC2. +// +// WHY THIS EXISTS +// +// The fleet-wide list_peers 401 incident (Hermes et al; #1877 / #418) +// came from THIS package's WriteAuthTokenToVolume + +// WriteFilesToContainer running their Docker API operations as root and +// never chowning the delivered files to the in-container agent uid +// (AgentUID, 1000). The agent runs as AgentUID (every workspace +// template `useradd -u 1000 agent`; workspace/entrypoint.sh `gosu +// agent`), so the platform injecting /configs/.auth_token as root:root +// AFTER the entrypoint's `chown -R agent /configs` means the agent-uid +// a2a_mcp_server gets EACCES → empty bearer → platform 401 on +// /registry/{id}/peers (the literal list_peers path). +// +// Those are Docker API operations, NOT AWS operations. They run +// IDENTICALLY against a local Docker daemon. They were only "prod-only" +// because the handler forks local-vs-EC2 at +// handlers/workspace_dispatchers.go (cpProv-vs-provisioner) and the +// local stack historically didn't drive the same post-start +// re-injection sequence — NOT because the bug needs AWS. This test +// invokes the REAL functions against REAL local Docker and asserts the +// REAL file ownership semantics the agent depends on. NO mock — a mock +// here would bypass the exact ownership semantics that caused the +// incident (the proxy-trap). +// +// EXPECTED BEHAVIOUR ACROSS CODE STATES (both proven in review): +// +// - pre-fix code (WriteAuthTokenToVolume: `printf > .auth_token && +// chmod 0600`, no chown; WriteFilesToContainer: tar.Header with +// Uid/Gid unset → 0/root): the post-start re-injection lands +// root:root → the AgentUID write probe FAILS. Measured: the +// headline test fails in ~0.9s. This test would have caught the +// Hermes incident LOCALLY, in seconds, instead of an ~1h prod +// round-trip. +// - post-fix code (this PR's base — the agent-owned-injection fix: +// chown 1000:1000 in the volume writer + tar Uid/Gid=AgentUID): +// files land AgentUID-owned → the probe PASSES. Measured: ~0.87s. +// +// The probe asserts WRITABILITY by AgentUID, not mere readability: the +// token file is delivered world-readable (mode 0644 via the tar +// header), so a naive `cat` probe stays green even on root:root and the +// assertion would be vacuously useless. The real failure is on the +// RECOVERY path — heartbeat 401 → platform issues a fresh token → +// platform_auth.save_token does os.open(O_WRONLY|O_CREAT|O_TRUNC, +// 0o600) to persist it — and a root:root file denies the AgentUID +// write, so the agent can never persist a rotated token and 401s +// forever. TestTokenOwnership_FailPre_ProvesCatch pins the pre-fix +// root:root delivery shape independently of the production source and +// asserts the SAME probe catches it, so the catch stays demonstrable +// even on this fix-based branch where the headline test is green. +// +// RUNTIME: seconds. One-time alpine pull (cached), throwaway +// containers, no EC2, no provision wait. + +import ( + "archive/tar" + "bytes" + "context" + "fmt" + "io" + "os/exec" + "strings" + "testing" + "time" + + "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/volume" +) + +// agentUID is the in-container uid the workspace agent runs as. Bound to +// the production contract via the package's exported AgentUID constant +// (the agent-owned-injection fix defines it; every template +// `useradd -u 1000 agent` and entrypoint.sh `gosu agent` honour it). We +// reference AgentUID rather than a private literal so this test and the +// production fix share ONE source of truth — if the runtime ever moves +// the agent off 1000, AgentUID and this test move together. +const agentUID = AgentUID + +// buildRootOwnedTar builds a tar stream with each entry's Mode set but +// Uid/Gid LEFT UNSET (Go tar.Header zero value → 0 → root). This is a +// faithful reproduction of the PRE-fix WriteFilesToContainer header +// (`&tar.Header{Name, Mode, Size}` — no Uid/Gid). Used by the FailPre +// control to pin the OLD delivery shape independently of the production +// source, so the catch stays demonstrable after the fix lands. +func buildRootOwnedTar(t *testing.T, files map[string][]byte) io.Reader { + t.Helper() + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + for name, data := range files { + if err := tw.WriteHeader(&tar.Header{ + Name: name, + Mode: 0644, + Size: int64(len(data)), + // Uid/Gid intentionally unset == 0 == root (the bug). + }); err != nil { + t.Fatalf("tar header: %v", err) + } + if _, err := tw.Write(data); err != nil { + t.Fatalf("tar write: %v", err) + } + } + if err := tw.Close(); err != nil { + t.Fatalf("tar close: %v", err) + } + return &buf +} + +// dockerOrSkip returns a Provisioner bound to the local Docker daemon, +// or skips if no daemon is reachable. This is a real-Docker integration +// test by design (per feedback_real_subprocess_test_for_boot_path: +// in-process / mock misses the ownership bug entirely). Skipping on +// no-daemon keeps `go test ./...` green on Docker-less machines while +// the local-E2E lane (which always has a daemon) still gates on it. +func dockerOrSkip(t *testing.T) *Provisioner { + t.Helper() + if testing.Short() { + t.Skip("skipping real-Docker provisioner-parity test in -short mode") + } + p, err := New() + if err != nil { + t.Skipf("no local Docker daemon (New: %v) — this lane requires one", err) + } + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if _, err := p.cli.Ping(ctx); err != nil { + t.Skipf("local Docker daemon unreachable (Ping: %v) — this lane requires one", err) + } + return p +} + +// ensureAlpine pulls alpine once if absent. WriteAuthTokenToVolume +// itself uses the `alpine` image, so the daemon needs it regardless; +// pulling here makes the one-time cost explicit and the timed +// assertions honest (pull is one-time, not per-run). +func ensureAlpine(t *testing.T, p *Provisioner) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second) + defer cancel() + if _, err := p.cli.ImageInspect(ctx, "alpine"); err == nil { + return + } + if err := pullImageAndDrain(ctx, p.cli, "alpine", ""); err != nil { + t.Fatalf("failed to pull alpine (needed by WriteAuthTokenToVolume too): %v", err) + } +} + +// uniqueWorkspaceID returns a short, test-scoped workspace id. Volume + +// container names derive from the first 12 chars (ConfigVolumeName / +// ContainerName truncate), so the entropy must live in the prefix. +func uniqueWorkspaceID() string { + return fmt.Sprintf("t%011d", time.Now().UnixNano()%1e11) +} + +// cleanupWorkspace force-removes the container + config volume the test +// created. Best-effort: honours feedback_cleanup_after_each_test — +// leave the daemon as we found it. +func cleanupWorkspace(p *Provisioner, workspaceID string) { + ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) + defer cancel() + _ = p.cli.ContainerRemove(ctx, ContainerName(workspaceID), + container.RemoveOptions{Force: true}) + _ = p.cli.VolumeRemove(ctx, ConfigVolumeName(workspaceID), true) +} + +// rewritableByAgentUID returns nil iff uid agentUID can re-open `path` +// inside `containerID` for WRITE+TRUNCATE — the EXACT operation the +// runtime's platform_auth.save_token performs: +// +// os.open(path, O_WRONLY|O_CREAT|O_TRUNC, 0o600) +// +// This is the operation that 401'd Hermes. Modelled with `: > path` +// (shell O_WRONLY|O_CREAT|O_TRUNC), the minimal faithful equivalent. A +// non-nil error is the Hermes failure: a rotated token cannot be +// persisted over a root:root file → 401 forever. +func rewritableByAgentUID(ctx context.Context, p *Provisioner, containerID, path string) error { + execCfg := container.ExecOptions{ + User: fmt.Sprintf("%d", agentUID), + Cmd: []string{"sh", "-c", ": > " + path}, + AttachStdout: true, + AttachStderr: true, + } + ex, err := p.cli.ContainerExecCreate(ctx, containerID, execCfg) + if err != nil { + return fmt.Errorf("exec create: %w", err) + } + att, err := p.cli.ContainerExecAttach(ctx, ex.ID, container.ExecAttachOptions{}) + if err != nil { + return fmt.Errorf("exec attach: %w", err) + } + // Drain the multiplexed stream to completion. This BLOCKS until the + // exec process exits — without it, ContainerExecInspect races the + // process and reports ExitCode 0 / Running=true, silently inverting + // the assertion (the proxy-trap that almost shipped this test green + // against the real bug — caught and fixed in review). + _, _ = io.Copy(io.Discard, att.Reader) + att.Close() + var insp container.ExecInspect + deadline := time.Now().Add(10 * time.Second) + for { + insp, err = p.cli.ContainerExecInspect(ctx, ex.ID) + if err != nil { + return fmt.Errorf("exec inspect: %w", err) + } + if !insp.Running { + break + } + if time.Now().After(deadline) { + return fmt.Errorf("exec %s still running after 10s", ex.ID) + } + time.Sleep(50 * time.Millisecond) + } + if insp.ExitCode != 0 { + return fmt.Errorf("uid %d cannot O_WRONLY|O_TRUNC %s (exit %d) — "+ + "this is the save_token write-denial that 401'd Hermes: the "+ + "agent receives a rotated token but cannot persist it over a "+ + "root:root file, so it 401s forever", + agentUID, path, insp.ExitCode) + } + return nil +} + +// startConfigsContainer creates and starts a long-lived container that +// mounts the workspace's REAL config volume at /configs and models the +// production ownership lifecycle: runs as root, `chown -R agent +// /configs` (mirrors workspace/entrypoint.sh), then idles. The agent is +// created at uid agentUID exactly like the workspace templates. This is +// the container into which WriteFilesToContainer re-injects (#418) +// post-start — the precise moment the prod bug manifests. +func startConfigsContainer(t *testing.T, p *Provisioner, workspaceID string) string { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + volName := ConfigVolumeName(workspaceID) + entry := fmt.Sprintf( + "adduser -D -u %d agent 2>/dev/null || true; "+ + "mkdir -p /configs; chown -R agent /configs; "+ + "exec sleep 600", agentUID) + + resp, err := p.cli.ContainerCreate(ctx, &container.Config{ + Image: "alpine", + Cmd: []string{"sh", "-c", entry}, + User: "0", // entrypoint starts as root, like the runtime + }, &container.HostConfig{ + Binds: []string{volName + ":/configs"}, + }, nil, nil, ContainerName(workspaceID)) + if err != nil { + t.Fatalf("create configs container: %v", err) + } + if err := p.cli.ContainerStart(ctx, resp.ID, container.StartOptions{}); err != nil { + t.Fatalf("start configs container: %v", err) + } + if err := waitForChownedConfigs(ctx, p, resp.ID); err != nil { + t.Fatalf("configs container never became ready: %v", err) + } + return resp.ID +} + +// waitForChownedConfigs blocks until /configs is present AND owned by +// the agent uid — i.e. the entrypoint's `chown -R agent /configs` has +// completed. Polling on ownership (not mere existence) is the correct +// readiness signal: the bug is the post-start re-injection landing +// root:root *after* this chown, so the test must not re-inject until +// the chown is definitively done. Each probe drains the exec stream to +// completion before trusting ExitCode (same race fix as +// rewritableByAgentUID). +func waitForChownedConfigs(ctx context.Context, p *Provisioner, containerID string) error { + deadline := time.Now().Add(20 * time.Second) + for time.Now().Before(deadline) { + ex, err := p.cli.ContainerExecCreate(ctx, containerID, container.ExecOptions{ + Cmd: []string{"sh", "-c", + fmt.Sprintf("[ \"$(stat -c %%u /configs)\" = \"%d\" ]", agentUID)}, + AttachStdout: true, + AttachStderr: true, + }) + if err == nil { + if att, aerr := p.cli.ContainerExecAttach(ctx, ex.ID, + container.ExecAttachOptions{}); aerr == nil { + _, _ = io.Copy(io.Discard, att.Reader) + att.Close() + if insp, ierr := p.cli.ContainerExecInspect(ctx, ex.ID); ierr == nil && + !insp.Running && insp.ExitCode == 0 { + return nil + } + } + } + time.Sleep(150 * time.Millisecond) + } + return fmt.Errorf("/configs not chowned to uid %d after wait", agentUID) +} + +// TestTokenOwnership_LocalProvisionerParity is the headline test. It +// drives the EXACT production sequence against local Docker: +// +// 1. WriteAuthTokenToVolume (#1877): pre-start write of .auth_token +// into the named config volume via the real throwaway alpine writer. +// 2. start a container modelling the runtime (root → chown /configs → +// would gosu agent), mounting that same real volume. +// 3. WriteFilesToContainer (#418): post-start re-injection of +// .auth_token + .platform_inbound_secret into the running container +// via the real tar/CopyToContainer path. +// 4. assert uid agentUID can RE-WRITE both files (the save_token +// recovery path) — the exact property the Hermes 401 violated. +// +// FAILS in ~0.9s on pre-fix code (files root:root). PASSES on this PR's +// fix-based branch (files agent-owned). +func TestTokenOwnership_LocalProvisionerParity(t *testing.T) { + p := dockerOrSkip(t) + ensureAlpine(t, p) + + workspaceID := uniqueWorkspaceID() + t.Cleanup(func() { cleanupWorkspace(p, workspaceID) }) + + start := time.Now() + + // --- Step 1: REAL WriteAuthTokenToVolume (#1877, pre-start) ------ + const tokenPlain = "wsauth-fasttest-tok-abc123" + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + if err := p.WriteAuthTokenToVolume(ctx, workspaceID, tokenPlain); err != nil { + t.Fatalf("WriteAuthTokenToVolume (real #1877 path) failed: %v", err) + } + + // --- Step 2: start the runtime-modelling container -------------- + containerID := startConfigsContainer(t, p, workspaceID) + + // --- Step 3: REAL WriteFilesToContainer (#418 post-start) ------- + files := map[string][]byte{ + ".auth_token": []byte(tokenPlain), + ".platform_inbound_secret": []byte("inbound-secret-xyz789"), + } + if err := p.WriteFilesToContainer(ctx, containerID, files); err != nil { + t.Fatalf("WriteFilesToContainer (real #418 path) failed: %v", err) + } + + elapsed := time.Since(start) + + // --- Step 4: assert the property Hermes violated ---------------- + for _, f := range []string{ + "/configs/.auth_token", + "/configs/.platform_inbound_secret", + } { + if err := rewritableByAgentUID(ctx, p, containerID, f); err != nil { + t.Errorf("PROVISIONER-PARITY BUG (Hermes class): %v\n"+ + " This is the formerly-prod-only token-injection "+ + "ownership bug, caught LOCALLY in %s. The real "+ + "WriteFilesToContainer (#418 post-start re-inject) "+ + "delivered %s owned root:root; the workspace agent "+ + "runs as uid %d and its save_token O_WRONLY|O_TRUNC "+ + "is denied → it cannot persist a rotated token → "+ + "heartbeat / list_peers 401 forever. Fix: inject "+ + "delivered files owned by AgentUID (this PR's base). "+ + "See incident notes #1877 / #418.", + err, elapsed, f, agentUID) + } + } + + t.Logf("provisioner-parity sequence (real WriteAuthTokenToVolume + "+ + "WriteFilesToContainer vs local Docker) completed in %s — "+ + "the SECONDS-local replacement for the ~1h EC2 round-trip "+ + "that previously gated this bug class", elapsed) + + // Hard guard on the headline claim: this MUST be seconds. 90s + // ceiling absorbs a cold alpine pull on a slow link; steady-state + // (image cached) is sub-2s. + if elapsed > 90*time.Second { + t.Errorf("provisioner-parity test took %s — claimed to be "+ + "SECONDS-local; investigate (image pull? daemon load?)", + elapsed) + } +} + +// TestTokenOwnership_FailPre_ProvesCatch pins the PRE-fix delivery shape +// (post-start re-injection via a tar with Uid/Gid unset → root) and +// asserts the SAME AgentUID write probe the headline test uses FAILS on +// it. This proves the assertion genuinely catches the bug — it is not +// vacuously green — and keeps the fail-direction demonstrable on this +// fix-based branch where the headline test is green. Per +// feedback_assert_exact_not_substring: verify the tightened assertion +// FAILS on old behaviour. +func TestTokenOwnership_FailPre_ProvesCatch(t *testing.T) { + p := dockerOrSkip(t) + ensureAlpine(t, p) + + workspaceID := uniqueWorkspaceID() + t.Cleanup(func() { cleanupWorkspace(p, workspaceID) }) + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + if _, err := p.cli.VolumeCreate(ctx, volume.CreateOptions{ + Name: ConfigVolumeName(workspaceID), Labels: managedLabels(), + }); err != nil { + t.Fatalf("volume create: %v", err) + } + + // Start a runtime-model container (root → chown -R agent /configs → + // idle) so the entrypoint chown has ALREADY run, exactly as in + // production. The bug is the POST-start re-injection landing + // root:root *after* that chown — so pinning the pre-fix + // WriteFilesToContainer tar shape and copying it in post-start is + // the faithful pre-fix reproduction. + containerID := startConfigsContainer(t, p, workspaceID) + + preFixTar := buildRootOwnedTar(t, map[string][]byte{ + ".auth_token": []byte("pre-fix-token"), + }) + if err := p.cli.CopyToContainer(ctx, containerID, "/configs", + preFixTar, container.CopyToContainerOptions{}); err != nil { + t.Fatalf("pre-fix CopyToContainer: %v", err) + } + + probeErr := rewritableByAgentUID(ctx, p, containerID, "/configs/.auth_token") + if probeErr == nil { + t.Fatalf("REGRESSION IN THE TEST ITSELF: the uid-%d write "+ + "probe PASSED against the known-pre-fix root:root "+ + "post-start re-injection. The headline assertion would "+ + "be vacuously green and would NOT have caught Hermes. "+ + "Investigate rewritableByAgentUID before trusting the "+ + "parity test.", agentUID) + } + t.Logf("confirmed: pre-fix root:root post-start re-injection is "+ + "correctly caught by the uid-%d write probe (%v) — the "+ + "parity assertion is load-bearing, not vacuous", agentUID, probeErr) +} + +// TestTokenOwnership_DockerIsLocalNotAWS asserts the core insight: +// WriteAuthTokenToVolume + WriteFilesToContainer reach ONLY the Docker +// client — no AWS SDK, no EC2/EBS/SG calls. That is why the bug class +// is locally reproducible at all. Static guard: the provisioner package +// must not import the AWS SDK (belt-and-braces with architecture_test.go +// and the driver-seam RFC, internal #184). +func TestTokenOwnership_DockerIsLocalNotAWS(t *testing.T) { + t.Parallel() + out, err := exec.Command("go", "list", "-deps", + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"). + Output() + if err != nil { + t.Skipf("go list unavailable in this env: %v", err) + } + for _, line := range strings.Split(string(out), "\n") { + if strings.Contains(line, "aws-sdk-go") { + t.Fatalf("provisioner pulls in the AWS SDK (%s) — the "+ + "token-injection paths are supposed to be "+ + "driver-agnostic Docker-only ops; an AWS dep here "+ + "would undermine the local-testability this test "+ + "proves. See the driver-seam RFC (internal #184).", + strings.TrimSpace(line)) + } + } +}