test(provisioner): fast local-Docker parity test for the token-injection ownership bug class #1332
@ -294,6 +294,132 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Provisioner Parity — fast local prod-mimic gate. REQUIRED, always runs.
|
||||
#
|
||||
# WHY THIS IS A GATE, NOT A DOC LINE (feedback_checkpointed_workflow_over
|
||||
# _good_practice_doc): the dev-SOP already *referenced*
|
||||
# feedback_mandatory_local_e2e_before_ship as prose, but prose does not
|
||||
# stop a PR. The token-injection/ownership bug class — platform writes
|
||||
# /configs/.auth_token root:root AFTER the entrypoint chown, so the
|
||||
# uid-1000 agent's save_token O_WRONLY|O_TRUNC is denied → list_peers /
|
||||
# heartbeat 401 forever — shipped to the fleet (Hermes #1877/#418) and
|
||||
# again on template-hermes #162 (the bearer-401 being landed right now)
|
||||
# precisely because nothing *enforced* the local check. The parity test
|
||||
# (workspace-server/internal/provisioner/provisioner_token_ownership_
|
||||
# local_test.go, `//go:build local`) reproduces that exact class against
|
||||
# a LOCAL Docker daemon in <1s — versus an ~1h EC2 fresh-provision. This
|
||||
# job makes it fail-closed on every workspace-server PR.
|
||||
#
|
||||
# FAIL-CLOSED CONTRACT: the test self-skips when no Docker daemon is
|
||||
# reachable (so `make test` / `go test ./...` stay green on Docker-less
|
||||
# dev machines and the standard Platform (Go) job). A *gate* that
|
||||
# silently skips is not a gate. This job runs on a Docker-capable runner
|
||||
# and treats "0 parity tests ran" as a FAILURE — a skipped daemon here
|
||||
# means the gate did not execute, which must block merge, not pass.
|
||||
#
|
||||
# Always-run + per-step gating shape mirrors platform-build so the
|
||||
# `CI / Provisioner Parity (<event>)` required-check name is always
|
||||
# emitted (SKIPPED != passed under branch protection — PR #2314).
|
||||
provisioner-parity:
|
||||
name: Provisioner Parity
|
||||
runs-on: ubuntu-latest
|
||||
needs: [changes]
|
||||
continue-on-error: false
|
||||
# Test is seconds-local; generous ceiling absorbs a cold alpine pull
|
||||
# on a slow runner link plus Go module/build cold cache.
|
||||
timeout-minutes: 15
|
||||
defaults:
|
||||
run:
|
||||
working-directory: workspace-server
|
||||
steps:
|
||||
- if: ${{ needs.changes.outputs.platform != 'true' }}
|
||||
working-directory: .
|
||||
run: echo "No workspace-server/** changes — parity gate is a no-op for this PR; this job always runs to satisfy the required-check name on branch protection."
|
||||
- if: ${{ needs.changes.outputs.platform == 'true' }}
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- if: ${{ needs.changes.outputs.platform == 'true' }}
|
||||
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
||||
with:
|
||||
go-version: 'stable'
|
||||
- if: ${{ needs.changes.outputs.platform == 'true' }}
|
||||
run: go mod download
|
||||
- if: ${{ needs.changes.outputs.platform == 'true' }}
|
||||
name: Fast local prod-mimic provisioner-parity (fail-closed)
|
||||
# Run the `//go:build local` parity suite against the runner's
|
||||
# Docker daemon. -json lets us assert tests actually RAN: a skip
|
||||
# (no daemon) or zero-test run must fail this gate, never pass it.
|
||||
run: |
|
||||
set -euo pipefail
|
||||
echo "Docker daemon check (gate requires a reachable daemon):"
|
||||
docker version --format '{{.Server.Version}}' \
|
||||
|| { echo "::error::Provisioner-parity gate could not reach a Docker daemon. This gate is fail-closed: a missing daemon means the token-ownership class was NOT checked. Failing the PR rather than passing un-tested."; exit 1; }
|
||||
set +e
|
||||
go test -tags local -json -run 'TestTokenOwnership' \
|
||||
-timeout 12m ./internal/provisioner/ | tee /tmp/parity.json
|
||||
gotest_exit=${PIPESTATUS[0]}
|
||||
set -e
|
||||
# Parse the test2json stream as real JSON. test2json emits
|
||||
# objects as {"Action":..,"Package":..,"Test":..} — field ORDER
|
||||
# is not guaranteed and `Package` sits between `Action` and
|
||||
# `Test`, so a grep adjacency match silently counts ZERO (a
|
||||
# vacuous-green trap that nearly shipped here — caught in
|
||||
# verification). Per-test terminal action is the source of truth.
|
||||
GOTEST_EXIT="$gotest_exit" python3 - <<'PY'
|
||||
import json, os, sys
|
||||
headline = "TestTokenOwnership_LocalProvisionerParity"
|
||||
proof = "TestTokenOwnership_FailPre_ProvesCatch"
|
||||
outcome = {} # test name -> last terminal action
|
||||
with open("/tmp/parity.json") as fh:
|
||||
for line in fh:
|
||||
line = line.strip()
|
||||
if not line or not line.startswith("{"):
|
||||
continue
|
||||
try:
|
||||
ev = json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
t = ev.get("Test")
|
||||
a = ev.get("Action")
|
||||
if t and a in ("pass", "fail", "skip"):
|
||||
outcome[t] = a
|
||||
passed = sum(1 for v in outcome.values() if v == "pass")
|
||||
failed = sum(1 for v in outcome.values() if v == "fail")
|
||||
skipped = sum(1 for v in outcome.values() if v == "skip")
|
||||
go_exit = int(os.environ["GOTEST_EXIT"])
|
||||
print(f"parity outcomes: passed={passed} failed={failed} "
|
||||
f"skipped={skipped} go_exit={go_exit} "
|
||||
f"per-test={outcome}")
|
||||
if go_exit != 0 or failed > 0:
|
||||
print("::error::Provisioner-parity FAILED — the "
|
||||
"token-injection/ownership bug class (Hermes "
|
||||
"#1877/#418, template-hermes #162: /configs token "
|
||||
"files delivered root:root, uid-1000 agent save_token "
|
||||
"denied -> list_peers/heartbeat 401) is present. Fix "
|
||||
"the provisioner injection to deliver AgentUID-owned "
|
||||
"files before merge.")
|
||||
sys.exit(1)
|
||||
# The headline parity test AND its fail-direction proof control
|
||||
# MUST have run and passed. If either was skipped (no daemon) or
|
||||
# never collected, the gate did not actually execute its job —
|
||||
# fail-closed, never pass un-checked.
|
||||
if outcome.get(headline) != "pass":
|
||||
print(f"::error::Provisioner-parity gate did NOT execute "
|
||||
f"the headline test ({headline}={outcome.get(headline)}"
|
||||
f"). Fail-closed: a skipped/absent parity run means "
|
||||
f"the token-ownership class was never checked on this "
|
||||
f"PR — treated as a gate failure.")
|
||||
sys.exit(1)
|
||||
if outcome.get(proof) != "pass":
|
||||
print(f"::error::Provisioner-parity fail-direction proof "
|
||||
f"control did NOT pass ({proof}="
|
||||
f"{outcome.get(proof)}). Without it the headline "
|
||||
f"assertion is not proven load-bearing — fail-closed.")
|
||||
sys.exit(1)
|
||||
print(f"Provisioner-parity gate PASSED: token-ownership class "
|
||||
f"checked locally and the fail-direction proof control "
|
||||
f"confirms the assertion is load-bearing (passed={passed}).")
|
||||
PY
|
||||
|
||||
# Canvas (Next.js) — required check, always runs. Same always-run +
|
||||
# per-step gating shape as platform-build. The two-job-sharing-name
|
||||
# pattern attempted in PR #2321 doesn't satisfy branch protection
|
||||
@ -591,6 +717,14 @@ jobs:
|
||||
required = [
|
||||
f"CI / Detect changes ({event})",
|
||||
f"CI / Platform (Go) ({event})",
|
||||
# Fast local prod-mimic provisioner-parity gate (this PR).
|
||||
# Wired here — not into branch-protection's
|
||||
# status_check_contexts — by RFC internal#219 §2 design:
|
||||
# the single stable `CI / all-required` context is what BP
|
||||
# points at, and new fail-closed gates join by extending
|
||||
# this list. Makes the token-ownership class (Hermes
|
||||
# #1877/#418, template-hermes #162) a hard merge gate.
|
||||
f"CI / Provisioner Parity ({event})",
|
||||
f"CI / Canvas (Next.js) ({event})",
|
||||
f"CI / Shellcheck (E2E scripts) ({event})",
|
||||
f"CI / Python Lint & Test ({event})",
|
||||
|
||||
16
Makefile
16
Makefile
@ -4,7 +4,7 @@
|
||||
# use this Makefile; CI calls docker compose / go test directly so the
|
||||
# Makefile can evolve without breaking the build.
|
||||
|
||||
.PHONY: help dev up down logs build test
|
||||
.PHONY: help dev up down logs build test test-local-e2e
|
||||
|
||||
help: ## Show this help.
|
||||
@grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-12s\033[0m %s\n", $$1, $$2}'
|
||||
@ -26,3 +26,17 @@ build: ## Force a fresh build of the platform image (no cache).
|
||||
|
||||
test: ## Run Go unit tests in workspace-server/.
|
||||
cd workspace-server && go test -race ./...
|
||||
|
||||
# Mandatory local-E2E gate (feedback_mandatory_local_e2e_before_ship,
|
||||
# feedback_local_must_mimic_production). The provisioner-parity tests
|
||||
# invoke the REAL WriteAuthTokenToVolume + WriteFilesToContainer against
|
||||
# the LOCAL Docker daemon and assert /configs token files are
|
||||
# AgentUID-owned — the formerly-prod-only token-injection ownership bug
|
||||
# class (Hermes list_peers 401, #1877/#418), now caught in SECONDS
|
||||
# locally instead of an ~1h EC2 round-trip. These self-skip when no
|
||||
# Docker daemon is reachable (so `make test`/CI stays green on
|
||||
# Docker-less runners); this target requires a daemon and is the local
|
||||
# fast counterpart to the staging-required gate. Run before pushing any
|
||||
# workspace-server provisioner change.
|
||||
test-local-e2e: ## Run Docker-gated local-E2E parity tests (requires a local Docker daemon).
|
||||
cd workspace-server && go test -run 'TestTokenOwnership' -v ./internal/provisioner/
|
||||
|
||||
@ -0,0 +1,459 @@
|
||||
//go:build local
|
||||
// +build local
|
||||
|
||||
package provisioner
|
||||
|
||||
// Fast local provisioner-parity test — the proof-of-pattern that the
|
||||
// formerly "prod-only / slow" token-injection ownership bug class is
|
||||
// reproducible against a LOCAL Docker daemon in SECONDS, not an
|
||||
// ~hour-round-trip through EC2.
|
||||
//
|
||||
// WHY THIS EXISTS
|
||||
//
|
||||
// The fleet-wide list_peers 401 incident (Hermes et al; #1877 / #418)
|
||||
// came from THIS package's WriteAuthTokenToVolume +
|
||||
// WriteFilesToContainer running their Docker API operations as root and
|
||||
// never chowning the delivered files to the in-container agent uid
|
||||
// (AgentUID, 1000). The agent runs as AgentUID (every workspace
|
||||
// template `useradd -u 1000 agent`; workspace/entrypoint.sh `gosu
|
||||
// agent`), so the platform injecting /configs/.auth_token as root:root
|
||||
// AFTER the entrypoint's `chown -R agent /configs` means the agent-uid
|
||||
// a2a_mcp_server gets EACCES → empty bearer → platform 401 on
|
||||
// /registry/{id}/peers (the literal list_peers path).
|
||||
//
|
||||
// Those are Docker API operations, NOT AWS operations. They run
|
||||
// IDENTICALLY against a local Docker daemon. They were only "prod-only"
|
||||
// because the handler forks local-vs-EC2 at
|
||||
// handlers/workspace_dispatchers.go (cpProv-vs-provisioner) and the
|
||||
// local stack historically didn't drive the same post-start
|
||||
// re-injection sequence — NOT because the bug needs AWS. This test
|
||||
// invokes the REAL functions against REAL local Docker and asserts the
|
||||
// REAL file ownership semantics the agent depends on. NO mock — a mock
|
||||
// here would bypass the exact ownership semantics that caused the
|
||||
// incident (the proxy-trap).
|
||||
//
|
||||
// EXPECTED BEHAVIOUR ACROSS CODE STATES (both proven in review):
|
||||
//
|
||||
// - pre-fix code (WriteAuthTokenToVolume: `printf > .auth_token &&
|
||||
// chmod 0600`, no chown; WriteFilesToContainer: tar.Header with
|
||||
// Uid/Gid unset → 0/root): the post-start re-injection lands
|
||||
// root:root → the AgentUID write probe FAILS. Measured: the
|
||||
// headline test fails in ~0.9s. This test would have caught the
|
||||
// Hermes incident LOCALLY, in seconds, instead of an ~1h prod
|
||||
// round-trip.
|
||||
// - post-fix code (this PR's base — the agent-owned-injection fix:
|
||||
// chown 1000:1000 in the volume writer + tar Uid/Gid=AgentUID):
|
||||
// files land AgentUID-owned → the probe PASSES. Measured: ~0.87s.
|
||||
//
|
||||
// The probe asserts WRITABILITY by AgentUID, not mere readability: the
|
||||
// token file is delivered world-readable (mode 0644 via the tar
|
||||
// header), so a naive `cat` probe stays green even on root:root and the
|
||||
// assertion would be vacuously useless. The real failure is on the
|
||||
// RECOVERY path — heartbeat 401 → platform issues a fresh token →
|
||||
// platform_auth.save_token does os.open(O_WRONLY|O_CREAT|O_TRUNC,
|
||||
// 0o600) to persist it — and a root:root file denies the AgentUID
|
||||
// write, so the agent can never persist a rotated token and 401s
|
||||
// forever. TestTokenOwnership_FailPre_ProvesCatch pins the pre-fix
|
||||
// root:root delivery shape independently of the production source and
|
||||
// asserts the SAME probe catches it, so the catch stays demonstrable
|
||||
// even on this fix-based branch where the headline test is green.
|
||||
//
|
||||
// RUNTIME: seconds. One-time alpine pull (cached), throwaway
|
||||
// containers, no EC2, no provision wait.
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/docker/docker/api/types/container"
|
||||
"github.com/docker/docker/api/types/volume"
|
||||
)
|
||||
|
||||
// agentUID is the in-container uid the workspace agent runs as. Bound to
|
||||
// the production contract via the package's exported AgentUID constant
|
||||
// (the agent-owned-injection fix defines it; every template
|
||||
// `useradd -u 1000 agent` and entrypoint.sh `gosu agent` honour it). We
|
||||
// reference AgentUID rather than a private literal so this test and the
|
||||
// production fix share ONE source of truth — if the runtime ever moves
|
||||
// the agent off 1000, AgentUID and this test move together.
|
||||
const agentUID = AgentUID
|
||||
|
||||
// buildRootOwnedTar builds a tar stream with each entry's Mode set but
|
||||
// Uid/Gid LEFT UNSET (Go tar.Header zero value → 0 → root). This is a
|
||||
// faithful reproduction of the PRE-fix WriteFilesToContainer header
|
||||
// (`&tar.Header{Name, Mode, Size}` — no Uid/Gid). Used by the FailPre
|
||||
// control to pin the OLD delivery shape independently of the production
|
||||
// source, so the catch stays demonstrable after the fix lands.
|
||||
func buildRootOwnedTar(t *testing.T, files map[string][]byte) io.Reader {
|
||||
t.Helper()
|
||||
var buf bytes.Buffer
|
||||
tw := tar.NewWriter(&buf)
|
||||
for name, data := range files {
|
||||
if err := tw.WriteHeader(&tar.Header{
|
||||
Name: name,
|
||||
Mode: 0644,
|
||||
Size: int64(len(data)),
|
||||
// Uid/Gid intentionally unset == 0 == root (the bug).
|
||||
}); err != nil {
|
||||
t.Fatalf("tar header: %v", err)
|
||||
}
|
||||
if _, err := tw.Write(data); err != nil {
|
||||
t.Fatalf("tar write: %v", err)
|
||||
}
|
||||
}
|
||||
if err := tw.Close(); err != nil {
|
||||
t.Fatalf("tar close: %v", err)
|
||||
}
|
||||
return &buf
|
||||
}
|
||||
|
||||
// dockerOrSkip returns a Provisioner bound to the local Docker daemon,
|
||||
// or skips if no daemon is reachable. This is a real-Docker integration
|
||||
// test by design (per feedback_real_subprocess_test_for_boot_path:
|
||||
// in-process / mock misses the ownership bug entirely). Skipping on
|
||||
// no-daemon keeps `go test ./...` green on Docker-less machines while
|
||||
// the local-E2E lane (which always has a daemon) still gates on it.
|
||||
func dockerOrSkip(t *testing.T) *Provisioner {
|
||||
t.Helper()
|
||||
if testing.Short() {
|
||||
t.Skip("skipping real-Docker provisioner-parity test in -short mode")
|
||||
}
|
||||
p, err := New()
|
||||
if err != nil {
|
||||
t.Skipf("no local Docker daemon (New: %v) — this lane requires one", err)
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
if _, err := p.cli.Ping(ctx); err != nil {
|
||||
t.Skipf("local Docker daemon unreachable (Ping: %v) — this lane requires one", err)
|
||||
}
|
||||
return p
|
||||
}
|
||||
|
||||
// ensureAlpine pulls alpine once if absent. WriteAuthTokenToVolume
|
||||
// itself uses the `alpine` image, so the daemon needs it regardless;
|
||||
// pulling here makes the one-time cost explicit and the timed
|
||||
// assertions honest (pull is one-time, not per-run).
|
||||
func ensureAlpine(t *testing.T, p *Provisioner) {
|
||||
t.Helper()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 90*time.Second)
|
||||
defer cancel()
|
||||
if _, err := p.cli.ImageInspect(ctx, "alpine"); err == nil {
|
||||
return
|
||||
}
|
||||
if err := pullImageAndDrain(ctx, p.cli, "alpine", ""); err != nil {
|
||||
t.Fatalf("failed to pull alpine (needed by WriteAuthTokenToVolume too): %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// uniqueWorkspaceID returns a short, test-scoped workspace id. Volume +
|
||||
// container names derive from the first 12 chars (ConfigVolumeName /
|
||||
// ContainerName truncate), so the entropy must live in the prefix.
|
||||
func uniqueWorkspaceID() string {
|
||||
return fmt.Sprintf("t%011d", time.Now().UnixNano()%1e11)
|
||||
}
|
||||
|
||||
// cleanupWorkspace force-removes the container + config volume the test
|
||||
// created. Best-effort: honours feedback_cleanup_after_each_test —
|
||||
// leave the daemon as we found it.
|
||||
func cleanupWorkspace(p *Provisioner, workspaceID string) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
|
||||
defer cancel()
|
||||
_ = p.cli.ContainerRemove(ctx, ContainerName(workspaceID),
|
||||
container.RemoveOptions{Force: true})
|
||||
_ = p.cli.VolumeRemove(ctx, ConfigVolumeName(workspaceID), true)
|
||||
}
|
||||
|
||||
// rewritableByAgentUID returns nil iff uid agentUID can re-open `path`
|
||||
// inside `containerID` for WRITE+TRUNCATE — the EXACT operation the
|
||||
// runtime's platform_auth.save_token performs:
|
||||
//
|
||||
// os.open(path, O_WRONLY|O_CREAT|O_TRUNC, 0o600)
|
||||
//
|
||||
// This is the operation that 401'd Hermes. Modelled with `: > path`
|
||||
// (shell O_WRONLY|O_CREAT|O_TRUNC), the minimal faithful equivalent. A
|
||||
// non-nil error is the Hermes failure: a rotated token cannot be
|
||||
// persisted over a root:root file → 401 forever.
|
||||
func rewritableByAgentUID(ctx context.Context, p *Provisioner, containerID, path string) error {
|
||||
execCfg := container.ExecOptions{
|
||||
User: fmt.Sprintf("%d", agentUID),
|
||||
Cmd: []string{"sh", "-c", ": > " + path},
|
||||
AttachStdout: true,
|
||||
AttachStderr: true,
|
||||
}
|
||||
ex, err := p.cli.ContainerExecCreate(ctx, containerID, execCfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("exec create: %w", err)
|
||||
}
|
||||
att, err := p.cli.ContainerExecAttach(ctx, ex.ID, container.ExecAttachOptions{})
|
||||
if err != nil {
|
||||
return fmt.Errorf("exec attach: %w", err)
|
||||
}
|
||||
// Drain the multiplexed stream to completion. This BLOCKS until the
|
||||
// exec process exits — without it, ContainerExecInspect races the
|
||||
// process and reports ExitCode 0 / Running=true, silently inverting
|
||||
// the assertion (the proxy-trap that almost shipped this test green
|
||||
// against the real bug — caught and fixed in review).
|
||||
_, _ = io.Copy(io.Discard, att.Reader)
|
||||
att.Close()
|
||||
var insp container.ExecInspect
|
||||
deadline := time.Now().Add(10 * time.Second)
|
||||
for {
|
||||
insp, err = p.cli.ContainerExecInspect(ctx, ex.ID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("exec inspect: %w", err)
|
||||
}
|
||||
if !insp.Running {
|
||||
break
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
return fmt.Errorf("exec %s still running after 10s", ex.ID)
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
if insp.ExitCode != 0 {
|
||||
return fmt.Errorf("uid %d cannot O_WRONLY|O_TRUNC %s (exit %d) — "+
|
||||
"this is the save_token write-denial that 401'd Hermes: the "+
|
||||
"agent receives a rotated token but cannot persist it over a "+
|
||||
"root:root file, so it 401s forever",
|
||||
agentUID, path, insp.ExitCode)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// startConfigsContainer creates and starts a long-lived container that
|
||||
// mounts the workspace's REAL config volume at /configs and models the
|
||||
// production ownership lifecycle: runs as root, `chown -R agent
|
||||
// /configs` (mirrors workspace/entrypoint.sh), then idles. The agent is
|
||||
// created at uid agentUID exactly like the workspace templates. This is
|
||||
// the container into which WriteFilesToContainer re-injects (#418)
|
||||
// post-start — the precise moment the prod bug manifests.
|
||||
func startConfigsContainer(t *testing.T, p *Provisioner, workspaceID string) string {
|
||||
t.Helper()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
volName := ConfigVolumeName(workspaceID)
|
||||
entry := fmt.Sprintf(
|
||||
"adduser -D -u %d agent 2>/dev/null || true; "+
|
||||
"mkdir -p /configs; chown -R agent /configs; "+
|
||||
"exec sleep 600", agentUID)
|
||||
|
||||
resp, err := p.cli.ContainerCreate(ctx, &container.Config{
|
||||
Image: "alpine",
|
||||
Cmd: []string{"sh", "-c", entry},
|
||||
User: "0", // entrypoint starts as root, like the runtime
|
||||
}, &container.HostConfig{
|
||||
Binds: []string{volName + ":/configs"},
|
||||
}, nil, nil, ContainerName(workspaceID))
|
||||
if err != nil {
|
||||
t.Fatalf("create configs container: %v", err)
|
||||
}
|
||||
if err := p.cli.ContainerStart(ctx, resp.ID, container.StartOptions{}); err != nil {
|
||||
t.Fatalf("start configs container: %v", err)
|
||||
}
|
||||
if err := waitForChownedConfigs(ctx, p, resp.ID); err != nil {
|
||||
t.Fatalf("configs container never became ready: %v", err)
|
||||
}
|
||||
return resp.ID
|
||||
}
|
||||
|
||||
// waitForChownedConfigs blocks until /configs is present AND owned by
|
||||
// the agent uid — i.e. the entrypoint's `chown -R agent /configs` has
|
||||
// completed. Polling on ownership (not mere existence) is the correct
|
||||
// readiness signal: the bug is the post-start re-injection landing
|
||||
// root:root *after* this chown, so the test must not re-inject until
|
||||
// the chown is definitively done. Each probe drains the exec stream to
|
||||
// completion before trusting ExitCode (same race fix as
|
||||
// rewritableByAgentUID).
|
||||
func waitForChownedConfigs(ctx context.Context, p *Provisioner, containerID string) error {
|
||||
deadline := time.Now().Add(20 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
ex, err := p.cli.ContainerExecCreate(ctx, containerID, container.ExecOptions{
|
||||
Cmd: []string{"sh", "-c",
|
||||
fmt.Sprintf("[ \"$(stat -c %%u /configs)\" = \"%d\" ]", agentUID)},
|
||||
AttachStdout: true,
|
||||
AttachStderr: true,
|
||||
})
|
||||
if err == nil {
|
||||
if att, aerr := p.cli.ContainerExecAttach(ctx, ex.ID,
|
||||
container.ExecAttachOptions{}); aerr == nil {
|
||||
_, _ = io.Copy(io.Discard, att.Reader)
|
||||
att.Close()
|
||||
if insp, ierr := p.cli.ContainerExecInspect(ctx, ex.ID); ierr == nil &&
|
||||
!insp.Running && insp.ExitCode == 0 {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
}
|
||||
return fmt.Errorf("/configs not chowned to uid %d after wait", agentUID)
|
||||
}
|
||||
|
||||
// TestTokenOwnership_LocalProvisionerParity is the headline test. It
|
||||
// drives the EXACT production sequence against local Docker:
|
||||
//
|
||||
// 1. WriteAuthTokenToVolume (#1877): pre-start write of .auth_token
|
||||
// into the named config volume via the real throwaway alpine writer.
|
||||
// 2. start a container modelling the runtime (root → chown /configs →
|
||||
// would gosu agent), mounting that same real volume.
|
||||
// 3. WriteFilesToContainer (#418): post-start re-injection of
|
||||
// .auth_token + .platform_inbound_secret into the running container
|
||||
// via the real tar/CopyToContainer path.
|
||||
// 4. assert uid agentUID can RE-WRITE both files (the save_token
|
||||
// recovery path) — the exact property the Hermes 401 violated.
|
||||
//
|
||||
// FAILS in ~0.9s on pre-fix code (files root:root). PASSES on this PR's
|
||||
// fix-based branch (files agent-owned).
|
||||
func TestTokenOwnership_LocalProvisionerParity(t *testing.T) {
|
||||
p := dockerOrSkip(t)
|
||||
ensureAlpine(t, p)
|
||||
|
||||
workspaceID := uniqueWorkspaceID()
|
||||
t.Cleanup(func() { cleanupWorkspace(p, workspaceID) })
|
||||
|
||||
start := time.Now()
|
||||
|
||||
// --- Step 1: REAL WriteAuthTokenToVolume (#1877, pre-start) ------
|
||||
const tokenPlain = "wsauth-fasttest-tok-abc123"
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
if err := p.WriteAuthTokenToVolume(ctx, workspaceID, tokenPlain); err != nil {
|
||||
t.Fatalf("WriteAuthTokenToVolume (real #1877 path) failed: %v", err)
|
||||
}
|
||||
|
||||
// --- Step 2: start the runtime-modelling container --------------
|
||||
containerID := startConfigsContainer(t, p, workspaceID)
|
||||
|
||||
// --- Step 3: REAL WriteFilesToContainer (#418 post-start) -------
|
||||
files := map[string][]byte{
|
||||
".auth_token": []byte(tokenPlain),
|
||||
".platform_inbound_secret": []byte("inbound-secret-xyz789"),
|
||||
}
|
||||
if err := p.WriteFilesToContainer(ctx, containerID, files); err != nil {
|
||||
t.Fatalf("WriteFilesToContainer (real #418 path) failed: %v", err)
|
||||
}
|
||||
|
||||
elapsed := time.Since(start)
|
||||
|
||||
// --- Step 4: assert the property Hermes violated ----------------
|
||||
for _, f := range []string{
|
||||
"/configs/.auth_token",
|
||||
"/configs/.platform_inbound_secret",
|
||||
} {
|
||||
if err := rewritableByAgentUID(ctx, p, containerID, f); err != nil {
|
||||
t.Errorf("PROVISIONER-PARITY BUG (Hermes class): %v\n"+
|
||||
" This is the formerly-prod-only token-injection "+
|
||||
"ownership bug, caught LOCALLY in %s. The real "+
|
||||
"WriteFilesToContainer (#418 post-start re-inject) "+
|
||||
"delivered %s owned root:root; the workspace agent "+
|
||||
"runs as uid %d and its save_token O_WRONLY|O_TRUNC "+
|
||||
"is denied → it cannot persist a rotated token → "+
|
||||
"heartbeat / list_peers 401 forever. Fix: inject "+
|
||||
"delivered files owned by AgentUID (this PR's base). "+
|
||||
"See incident notes #1877 / #418.",
|
||||
err, elapsed, f, agentUID)
|
||||
}
|
||||
}
|
||||
|
||||
t.Logf("provisioner-parity sequence (real WriteAuthTokenToVolume + "+
|
||||
"WriteFilesToContainer vs local Docker) completed in %s — "+
|
||||
"the SECONDS-local replacement for the ~1h EC2 round-trip "+
|
||||
"that previously gated this bug class", elapsed)
|
||||
|
||||
// Hard guard on the headline claim: this MUST be seconds. 90s
|
||||
// ceiling absorbs a cold alpine pull on a slow link; steady-state
|
||||
// (image cached) is sub-2s.
|
||||
if elapsed > 90*time.Second {
|
||||
t.Errorf("provisioner-parity test took %s — claimed to be "+
|
||||
"SECONDS-local; investigate (image pull? daemon load?)",
|
||||
elapsed)
|
||||
}
|
||||
}
|
||||
|
||||
// TestTokenOwnership_FailPre_ProvesCatch pins the PRE-fix delivery shape
|
||||
// (post-start re-injection via a tar with Uid/Gid unset → root) and
|
||||
// asserts the SAME AgentUID write probe the headline test uses FAILS on
|
||||
// it. This proves the assertion genuinely catches the bug — it is not
|
||||
// vacuously green — and keeps the fail-direction demonstrable on this
|
||||
// fix-based branch where the headline test is green. Per
|
||||
// feedback_assert_exact_not_substring: verify the tightened assertion
|
||||
// FAILS on old behaviour.
|
||||
func TestTokenOwnership_FailPre_ProvesCatch(t *testing.T) {
|
||||
p := dockerOrSkip(t)
|
||||
ensureAlpine(t, p)
|
||||
|
||||
workspaceID := uniqueWorkspaceID()
|
||||
t.Cleanup(func() { cleanupWorkspace(p, workspaceID) })
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
if _, err := p.cli.VolumeCreate(ctx, volume.CreateOptions{
|
||||
Name: ConfigVolumeName(workspaceID), Labels: managedLabels(),
|
||||
}); err != nil {
|
||||
t.Fatalf("volume create: %v", err)
|
||||
}
|
||||
|
||||
// Start a runtime-model container (root → chown -R agent /configs →
|
||||
// idle) so the entrypoint chown has ALREADY run, exactly as in
|
||||
// production. The bug is the POST-start re-injection landing
|
||||
// root:root *after* that chown — so pinning the pre-fix
|
||||
// WriteFilesToContainer tar shape and copying it in post-start is
|
||||
// the faithful pre-fix reproduction.
|
||||
containerID := startConfigsContainer(t, p, workspaceID)
|
||||
|
||||
preFixTar := buildRootOwnedTar(t, map[string][]byte{
|
||||
".auth_token": []byte("pre-fix-token"),
|
||||
})
|
||||
if err := p.cli.CopyToContainer(ctx, containerID, "/configs",
|
||||
preFixTar, container.CopyToContainerOptions{}); err != nil {
|
||||
t.Fatalf("pre-fix CopyToContainer: %v", err)
|
||||
}
|
||||
|
||||
probeErr := rewritableByAgentUID(ctx, p, containerID, "/configs/.auth_token")
|
||||
if probeErr == nil {
|
||||
t.Fatalf("REGRESSION IN THE TEST ITSELF: the uid-%d write "+
|
||||
"probe PASSED against the known-pre-fix root:root "+
|
||||
"post-start re-injection. The headline assertion would "+
|
||||
"be vacuously green and would NOT have caught Hermes. "+
|
||||
"Investigate rewritableByAgentUID before trusting the "+
|
||||
"parity test.", agentUID)
|
||||
}
|
||||
t.Logf("confirmed: pre-fix root:root post-start re-injection is "+
|
||||
"correctly caught by the uid-%d write probe (%v) — the "+
|
||||
"parity assertion is load-bearing, not vacuous", agentUID, probeErr)
|
||||
}
|
||||
|
||||
// TestTokenOwnership_DockerIsLocalNotAWS asserts the core insight:
|
||||
// WriteAuthTokenToVolume + WriteFilesToContainer reach ONLY the Docker
|
||||
// client — no AWS SDK, no EC2/EBS/SG calls. That is why the bug class
|
||||
// is locally reproducible at all. Static guard: the provisioner package
|
||||
// must not import the AWS SDK (belt-and-braces with architecture_test.go
|
||||
// and the driver-seam RFC, internal #184).
|
||||
func TestTokenOwnership_DockerIsLocalNotAWS(t *testing.T) {
|
||||
t.Parallel()
|
||||
out, err := exec.Command("go", "list", "-deps",
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner").
|
||||
Output()
|
||||
if err != nil {
|
||||
t.Skipf("go list unavailable in this env: %v", err)
|
||||
}
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
if strings.Contains(line, "aws-sdk-go") {
|
||||
t.Fatalf("provisioner pulls in the AWS SDK (%s) — the "+
|
||||
"token-injection paths are supposed to be "+
|
||||
"driver-agnostic Docker-only ops; an AWS dep here "+
|
||||
"would undermine the local-testability this test "+
|
||||
"proves. See the driver-seam RFC (internal #184).",
|
||||
strings.TrimSpace(line))
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user