From 533502da35c56a9382282d245ce7d608e673535e Mon Sep 17 00:00:00 2001 From: infra-runtime-be Date: Mon, 18 May 2026 14:38:58 -0700 Subject: [PATCH] feat(provisioner): uniform T4 privilege contract + YAML emitter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds workspace-server/internal/provisioner/t4_privilege_contract.go as the single source of truth for the T4 ("full machine access") capability set that template-repo CI workflows currently re-implement as bespoke shell. Today's t4-conformance gates in template-claude-code / template-hermes / template-codex each hand-assert agent-uid + token-ownership + host-root reach. The shell drifts (the very Hermes 401 class bug came from drift), and there's no way to add a new capability fleet-wide without N template PRs. This contract: * Defines T4Capability as code (Name/Description/Probe/Severity/Source) * Lists the closure: agent_uid_1000, auth_token_agent_owned, host_root_reach_via_nsenter, host_fs_write_readback, docker_socket_reachable, list_peers_http_200, agent_home_writable, network_egress_https, privileged_flag_observable, pid_host_visible * Renders to YAML via AsYAML() and cmd/t4-contract-dump so any template CI can do: go run ./workspace-server/cmd/t4-contract-dump > t4_capabilities.yaml and iterate capabilities — new capabilities propagate without per-template PRs. * Pure stdlib + no Molecule-AI-internal deps so fork users can adopt the same contract. Anti-drift unit tests (7, all green): - all caps have required fields - names unique - core closure (RFC#456 + task #128/#174) is present - hard-severity is strict majority - YAML is deterministic + escapes double quotes - YAML header cites internal#456 - AgentUID const consistent with probes Does NOT change Docker/Dockerfile or any existing emit-side behavior; this is purely additive. The provisioner.go T4 branch is unchanged. Templates adopt the YAML in a separate PR (pilot: template-claude-code). 
Refs: RFC internal#456, task #174, memory reference_per_template_privilege_contract_class_audit_2026_05_16, memory feedback_hermes_listpeers_401_token_root600_unreadable_by_uid1000. Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace-server/cmd/t4-contract-dump/main.go | 35 +++ .../provisioner/t4_privilege_contract.go | 229 ++++++++++++++++++ .../provisioner/t4_privilege_contract_test.go | 153 ++++++++++++ 3 files changed, 417 insertions(+) create mode 100644 workspace-server/cmd/t4-contract-dump/main.go create mode 100644 workspace-server/internal/provisioner/t4_privilege_contract.go create mode 100644 workspace-server/internal/provisioner/t4_privilege_contract_test.go diff --git a/workspace-server/cmd/t4-contract-dump/main.go b/workspace-server/cmd/t4-contract-dump/main.go new file mode 100644 index 000000000..86f85695b --- /dev/null +++ b/workspace-server/cmd/t4-contract-dump/main.go @@ -0,0 +1,35 @@ +// Command t4-contract-dump prints the T4 privilege contract as YAML. +// +// Usage: +// +// go run ./workspace-server/cmd/t4-contract-dump > t4_capabilities.yaml +// +// This is the seam that template-repo CI workflows consume: +// +// - Template CI fetches molecule-core at pinned ref +// - Runs `go run ./workspace-server/cmd/t4-contract-dump` to produce +// t4_capabilities.yaml +// - Iterates capabilities and runs each Probe inside a freshly-built +// privileged container +// - Aggregates structured pass/fail; fails the gate on any hard miss. +// +// Keeping this trivial and pure-stdlib means a fork user does not need +// a Molecule-AI Gitea token or any internal infrastructure to consume +// the contract — `go run` against molecule-core's public source is +// enough. 
+package main + +import ( + "fmt" + "os" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" +) + +// main writes the T4 privilege contract to stdout as YAML. A failed +// write is reported on stderr and the process exits with status 1. +func main() { + caps := provisioner.T4PrivilegeContract() + if _, err := os.Stdout.WriteString(provisioner.AsYAML(caps)); err != nil { + fmt.Fprintln(os.Stderr, "t4-contract-dump: write failed:", err) + os.Exit(1) + } +} diff --git a/workspace-server/internal/provisioner/t4_privilege_contract.go b/workspace-server/internal/provisioner/t4_privilege_contract.go new file mode 100644 index 000000000..52477fd4a --- /dev/null +++ b/workspace-server/internal/provisioner/t4_privilege_contract.go @@ -0,0 +1,229 @@ +// Package provisioner — T4 privilege contract. +// +// This file is the single source of truth for what a Tier-4 ("full +// machine access") workspace runtime MUST guarantee, expressed as code +// templates can reference and CI can verify. +// +// RFC: molecule-ai/internal#456 (per-template privilege-contract class). +// Task: molecule-ai/internal #174. +// +// Background +// ---------- +// Prior art is RFC#456's three layers: +// +// (1) molecule-runtime self-enforces uid-1000 + fchown safety net, +// (2) a platform-owned wrapper entrypoint from a shared base image, +// (3) a REQUIRED CI conformance gate wired into the fresh-provision +// harness that asserts the post-condition, not the mechanism. +// +// This file is the *data shape* for layer (3): the gate's tests have +// been hand-written per-template (template-claude-code, template-hermes, +// template-codex). Hand-writing drifts; the Hermes 401 class came from +// drift. We need the capability list itself to be code so that: +// +// - The provisioner can dump it as `t4_capabilities.yaml` for any +// fork user or non-Molecule-AI template runner to consume directly +// (no hardcoded internal org).
+// - The t4-conformance gate is driven by this one file: CI runs each +// capability's `Probe`, so when a capability is added the templates +// pick it up by reading the YAML — they do not silently lag. +// - The provisioner-emit side (provisioner.go applyTierResources / T4 +// branch) and the verifier side share the same constants for the +// uid + mount paths, eliminating "string-match" drift between +// emitter and gate. +// +// Non-goals +// --------- +// - This is NOT a substitute for layer (1)/(2). Templates still must +// `exec gosu agent` and write /configs/.auth_token under uid 1000; +// this file describes *what to check*, not how to achieve it. +// - This file does not run tests. It is the spec. CI workflows call +// `AsYAML(T4PrivilegeContract())` once at the start of the gate +// and assert each capability's `Probe` returns ok. +package provisioner + +import ( + "fmt" + "sort" + "strings" +) + +// T4Capability is one assertion the T4 runtime MUST satisfy. +// +// Each capability declares: +// - Name: stable id (used as the test name in CI output). +// - Description: human-readable why-this-matters; goes in failure logs. +// - Probe: a shell snippet that exits 0 on pass, non-zero on fail. +// The probe MUST be deterministic, MUST be runnable inside the +// running container under uid 1000, and MUST NOT depend on outside +// network beyond what `RequiredEgress` declares. +// - Severity: "hard" capabilities fail the gate; "advisory" emit a +// warning. T4 contract minimum = all hard pass. +// - Source: RFC section or memory reference that motivated this +// capability — keeps the audit trail in-tree. +// - RequiredEgress: optional list of URLs the probe is allowed to +// contact; left out of the YAML when empty. +type T4Capability struct { + Name string `yaml:"name"` + Description string `yaml:"description"` + Probe string `yaml:"probe"` + Severity string `yaml:"severity"` + Source string `yaml:"source"` + RequiredEgress []string `yaml:"required_egress,omitempty"` +} + +// SeverityHard / SeverityAdvisory enumerate the only allowed Severity +// values.
+// We do not use Go enums because the YAML consumer is shell.
+const (
+	SeverityHard     = "hard"
+	SeverityAdvisory = "advisory"
+)
+
+// T4PrivilegeContract returns the full T4 capability set.
+//
+// Add new capabilities here. Each one is automatically picked up by
+// any template whose CI consumes `t4_capabilities.yaml` (no per-template
+// PR needed for new checks — this is the anti-drift property).
+//
+// Capability ordering matters for human-readable CI output but is not
+// load-bearing for correctness; AsYAML() emits them sorted by Name.
+//
+// Every probe is written to pass when executed by the uid-1000 agent:
+// steps that need root go through `sudo -n`, and no probe relies on
+// /proc entries or capability sets that only a root process has.
+func T4PrivilegeContract() []T4Capability {
+	// Default egress targets: public, no auth, tiny responses. They are
+	// published as RequiredEgress AND baked into the probe as the
+	// fallback for an unset MOLECULE_T4_EGRESS_TARGETS, so the declared
+	// targets and the probed targets cannot drift apart.
+	defaultEgress := []string{
+		"https://api.github.com/zen",
+		"https://www.google.com/generate_204",
+	}
+
+	return []T4Capability{
+		{
+			Name:        "agent_uid_1000",
+			Description: "The container's primary process (the runtime, post-gosu) runs as uid 1000, not root. T4 grants full machine access via privileged + host PID + Docker socket — the WORKLOAD inside that privileged container must still be unprivileged to prevent every untrusted code execution from being trivially root-on-host.",
+			Probe:       `[ "$(id -u)" = "1000" ]`,
+			Severity:    SeverityHard,
+			Source:      "RFC internal#456 §2.1.2; memory feedback_hermes_listpeers_401_token_root600_unreadable_by_uid1000",
+		},
+		{
+			Name:        "auth_token_agent_owned",
+			Description: "/configs/.auth_token is owned by uid 1000 (== AgentUID) so the a2a_mcp_server can read its bearer. In SaaS mode molecule-runtime itself writes the token via save_token() — the ownership equals the runtime's exec uid. If the runtime ever runs as root, this fails and list_peers 401s (the Hermes class bug).",
+			Probe:       `[ -e /configs/.auth_token ] && [ "$(stat -c '%u' /configs/.auth_token)" = "1000" ]`,
+			Severity:    SeverityHard,
+			Source:      "RFC internal#456 §10; memory feedback_hermes_listpeers_401_token_root600_unreadable_by_uid1000",
+		},
+		{
+			Name:        "host_root_reach_via_nsenter",
+			Description: "The uid-1000 agent can attain host root via `sudo -n nsenter --target 1 --mount --pid -- id -u` returning 0. This is the T4 escalation leg: full machine access means the agent CAN escalate to host root deliberately, even though it does not run as root by default.",
+			Probe:       `[ "$(sudo -n nsenter --target 1 --mount --pid -- id -u)" = "0" ]`,
+			Severity:    SeverityHard,
+			Source:      "RFC internal#456 §11; memory reference_per_template_privilege_contract_class_audit_2026_05_16",
+		},
+		{
+			Name:        "host_fs_write_readback",
+			Description: "Host filesystem is mounted at /host and the agent can write+read+remove a file there via sudo. Proves real host reach (not just a PID-1 namespace trick on an isolated init).",
+			Probe: `MARKER="t4cap-$(date +%s)-$RANDOM"; PROBE_FILE="/host/tmp/.t4-cap-probe-${MOLECULE_T4_PROBE_ID:-$$}"; ` +
+				`sudo -n sh -c "echo $MARKER > $PROBE_FILE" && ` +
+				`[ "$(sudo -n cat $PROBE_FILE)" = "$MARKER" ] && ` +
+				`sudo -n rm -f $PROBE_FILE`,
+			Severity: SeverityHard,
+			Source:   "RFC internal#456 §11",
+		},
+		{
+			Name:        "docker_socket_reachable",
+			Description: "/var/run/docker.sock is bind-mounted into the container so the agent can manage other containers (T4 use case: agent-as-orchestrator). Proven by 'docker version' returning a server section, which requires the daemon to answer over the socket.",
+			Probe:       `sudo -n docker version --format '{{.Server.Version}}' >/dev/null 2>&1`,
+			Severity:    SeverityHard,
+			Source:      "provisioner.go applyHostConfig T4 branch (case 4)",
+		},
+		{
+			Name:        "list_peers_http_200",
+			Description: "The platform list_peers HTTP endpoint (served by the in-container a2a_mcp_server) returns HTTP 200 when called from uid 1000 with the bearer from /configs/.auth_token. This proves the WHOLE token-ownership chain end-to-end: token written under correct uid → reader uid matches → bearer non-empty → platform accepts. A self-contained empirical test for the Hermes class bug.",
+			Probe: `BEARER=$(cat /configs/.auth_token 2>/dev/null || echo ""); ` +
+				`[ -n "$BEARER" ] || exit 1; ` +
+				`PORT=$(cat /configs/.platform_port 2>/dev/null || echo "8080"); ` +
+				`STATUS=$(curl -sS -o /dev/null -w '%{http_code}' -H "Authorization: Bearer $BEARER" "http://127.0.0.1:${PORT}/list_peers"); ` +
+				`[ "$STATUS" = "200" ]`,
+			Severity: SeverityHard,
+			Source:   "memory reference_openclaw_fresh_provision_nonfunctional_anthropic_default_unroutable; memory reference_openclaw_mcp_peer_wiring_rootcause",
+		},
+		{
+			Name:        "agent_home_writable",
+			Description: "/agent-home is writable by the agent (Files API split per task #128). The Files API redesign uses /agent-home as the user-writable root; the agent must be able to create files there without sudo.",
+			Probe:       `TF=/agent-home/.t4-cap-write-probe-${MOLECULE_T4_PROBE_ID:-$$}; echo ok > "$TF" && [ "$(cat "$TF")" = "ok" ] && rm -f "$TF"`,
+			Severity:    SeverityHard,
+			Source:      "task #128 Files API redesign; memory reference_post_suspension_pipeline",
+		},
+		{
+			Name:        "network_egress_https",
+			Description: "Generic HTTPS egress works. T4 is unconstrained network; the default targets are public, unauthenticated endpoints that any fork user can also resolve, and adopters may substitute their own via MOLECULE_T4_EGRESS_TARGETS. Any reachable HTTPS endpoint satisfies it — the YAML carries the recommended targets and the probe accepts any 2xx or 3xx response.",
+			// ${VAR:-defaults} falls back to the RequiredEgress list when
+			// the override is unset or empty. Without the fallback the
+			// loop body never runs and this hard capability always fails.
+			Probe: `for U in ${MOLECULE_T4_EGRESS_TARGETS:-` + strings.Join(defaultEgress, " ") + `}; do ` +
+				` C=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 8 "$U"); ` +
+				` case "$C" in 2*|3*) exit 0;; esac; ` +
+				`done; exit 1`,
+			Severity:       SeverityHard,
+			Source:         "task #174 brief",
+			RequiredEgress: defaultEgress,
+		},
+		{
+			// CapBnd, not CapEff: the probe runs as uid 1000, and a
+			// process that dropped from root to a non-root uid has an
+			// empty effective set even inside a --privileged container.
+			// The bounding set survives the uid change, so it is the
+			// value that still reflects --privileged.
+			Name:        "privileged_flag_observable",
+			Description: "Container is started with --privileged. Observable from inside via the capability bounding set (CapBnd in /proc/self/status) being the full set, which includes CAP_SYS_ADMIN. Defense-in-depth for the provisioner emission side.",
+			Probe:       `grep -q '^CapBnd:.*ffffffffff' /proc/self/status`,
+			Severity:    SeverityAdvisory, // Imperfect — some CAP filters trim the bounding set; advisory only.
+			Source:      "provisioner.go applyHostConfig T4 branch (case 4)",
+		},
+		{
+			// PID 2 is kthreadd only when the host PID namespace is
+			// visible. Comparing /proc/1/ns/pid with /proc/self/ns/pid
+			// cannot detect --pid=host, because the two links are equal
+			// inside ANY single PID namespace. /proc/1/root is also not
+			// dereferenceable by uid 1000 when PID 1 belongs to host root.
+			Name:        "pid_host_visible",
+			Description: "Host PID namespace is shared (--pid=host). The container can see host process 1 (systemd or pid-1 on the EC2 instance). Detected by PID 2 being kthreadd, the kernel thread parent that is only visible from the host PID namespace. Required for nsenter into host mount/pid namespaces.",
+			Probe:       `[ "$(cat /proc/2/comm 2>/dev/null)" = "kthreadd" ]`,
+			Severity:    SeverityHard,
+			Source:      "provisioner.go applyHostConfig T4 branch (case 4): hostCfg.PidMode = 'host'",
+		},
+	}
+}
+
+// AsYAML renders the contract as a single YAML document templates can
+// fetch at CI time. Sorted by Name for deterministic diffs.
+//
+// We deliberately do not depend on a YAML library here — the format is
+// trivial, and one-file pure-stdlib means this can be vendored or
+// dumped from any Go context (including a `go run` script in CI).
+//
+// The format is stable; downstream consumers must treat unknown fields
+// as warnings, not errors.
+func AsYAML(caps []T4Capability) string { + // Sort a copy so the caller's slice keeps its original order. + sorted := make([]T4Capability, len(caps)) + copy(sorted, caps) + sort.Slice(sorted, func(i, j int) bool { return sorted[i].Name < sorted[j].Name }) + + var b strings.Builder + // Fixed header: provenance comments, then schema version and agent uid. + // agent_uid is written as a literal here, not derived from a constant. + b.WriteString("# T4 privilege contract — generated from\n") + b.WriteString("# molecule-ai/molecule-core workspace-server/internal/provisioner/t4_privilege_contract.go\n") + b.WriteString("# RFC: molecule-ai/internal#456\n") + b.WriteString("# Do NOT edit this file by hand; regenerate via `go run ./cmd/t4-contract-dump > t4_capabilities.yaml`.\n") + b.WriteString("version: 1\n") + b.WriteString("agent_uid: 1000\n") + b.WriteString("capabilities:\n") + for _, c := range sorted { + fmt.Fprintf(&b, " - name: %s\n", yamlEscape(c.Name)) + fmt.Fprintf(&b, " description: %s\n", yamlEscape(c.Description)) + // NOTE(review): Severity is written bare, unlike the other scalars — + // this assumes it is SeverityHard or SeverityAdvisory. + fmt.Fprintf(&b, " severity: %s\n", c.Severity) + fmt.Fprintf(&b, " source: %s\n", yamlEscape(c.Source)) + fmt.Fprintf(&b, " probe: %s\n", yamlEscape(c.Probe)) + // required_egress is emitted only for capabilities that declare targets. + if len(c.RequiredEgress) > 0 { + b.WriteString(" required_egress:\n") + for _, u := range c.RequiredEgress { + fmt.Fprintf(&b, " - %s\n", yamlEscape(u)) + } + } + } + return b.String() +} + +// yamlEscape is a minimal YAML scalar escaper. We always quote with +// double quotes and backslash-escape internal quotes + backslashes — +// safe for the subset of strings we emit (no control chars except \n +// and \t, both of which we replace with literal escapes). +// Any other control character (for example \r) passes through unescaped.
+func yamlEscape(s string) string { + // One Replacer pass: each input character is rewritten at most once, + // so a backslash introduced by an escape is never escaped again. + r := strings.NewReplacer( + "\\", "\\\\", + "\"", "\\\"", + "\n", "\\n", + "\t", "\\t", + ) + return "\"" + r.Replace(s) + "\"" +} diff --git a/workspace-server/internal/provisioner/t4_privilege_contract_test.go b/workspace-server/internal/provisioner/t4_privilege_contract_test.go new file mode 100644 index 000000000..9fb369e04 --- /dev/null +++ b/workspace-server/internal/provisioner/t4_privilege_contract_test.go @@ -0,0 +1,153 @@ +package provisioner + +import ( + "strings" + "testing" +) + +// TestT4PrivilegeContract_AllCapabilitiesHaveRequiredFields enforces +// the invariant that every entry in the contract has at minimum a +// Name, Description, Probe, Severity, and Source — so the YAML the +// templates consume is never partially-filled (a quiet way to drift). +func TestT4PrivilegeContract_AllCapabilitiesHaveRequiredFields(t *testing.T) { + caps := T4PrivilegeContract() + if len(caps) == 0 { + t.Fatal("T4PrivilegeContract returned zero capabilities — the gate would have nothing to assert") + } + for _, c := range caps { + if c.Name == "" { + t.Errorf("capability missing Name: %+v", c) + } + if c.Description == "" { + t.Errorf("capability %q missing Description", c.Name) + } + if c.Probe == "" { + t.Errorf("capability %q missing Probe", c.Name) + } + if c.Severity != SeverityHard && c.Severity != SeverityAdvisory { + t.Errorf("capability %q has invalid Severity %q (allowed: hard, advisory)", c.Name, c.Severity) + } + if c.Source == "" { + t.Errorf("capability %q missing Source — every capability must cite the RFC section or memory that motivates it", c.Name) + } + } +} + +// TestT4PrivilegeContract_NamesAreUnique catches a silent +// dup-by-rename: if two capabilities share a name, AsYAML still emits +// both entries, but any consumer that keys capabilities by name keeps +// only one of them, and CI output becomes ambiguous.
+func TestT4PrivilegeContract_NamesAreUnique(t *testing.T) {
+	caps := T4PrivilegeContract()
+	seen := make(map[string]bool, len(caps))
+	for _, c := range caps {
+		if seen[c.Name] {
+			t.Errorf("capability name %q appears more than once", c.Name)
+		}
+		seen[c.Name] = true
+	}
+}
+
+// TestT4PrivilegeContract_CoreCapabilitiesPresent pins the minimum
+// closure of capabilities the gate guarantees. Adding capabilities
+// is fine; removing one of these requires updating this test
+// (which the reviewer will see and challenge).
+//
+// These are the post-conditions cited in RFC internal#456 §10–§11
+// + task #128 (Files API) + task #174 (this task). That includes
+// host_fs_write_readback, whose Source is RFC internal#456 §11: it is
+// pinned here so the host-filesystem leg cannot be dropped silently.
+func TestT4PrivilegeContract_CoreCapabilitiesPresent(t *testing.T) {
+	required := []string{
+		"agent_uid_1000",
+		"auth_token_agent_owned",
+		"host_root_reach_via_nsenter",
+		"host_fs_write_readback",
+		"docker_socket_reachable",
+		"list_peers_http_200",
+		"agent_home_writable",
+		"network_egress_https",
+	}
+	caps := T4PrivilegeContract()
+	have := make(map[string]bool, len(caps))
+	for _, c := range caps {
+		have[c.Name] = true
+	}
+	for _, r := range required {
+		if !have[r] {
+			t.Errorf("required capability %q missing from contract — RFC internal#456 / task #174 says this MUST be in the closure", r)
+		}
+	}
+}
+
+// TestT4PrivilegeContract_HardCapabilitiesMajority sanity-checks that
+// the contract is not silently advisory-only. If someone marks
+// everything as "advisory" the gate becomes a no-op without anyone
+// noticing — fail the test if hard capabilities are not the majority.
+func TestT4PrivilegeContract_HardCapabilitiesMajority(t *testing.T) { + caps := T4PrivilegeContract() + hard := 0 + for _, c := range caps { + if c.Severity == SeverityHard { + hard++ + } + } + // Strict majority: the hard count must exceed half of all capabilities. + if hard*2 <= len(caps) { + t.Errorf("hard capabilities (%d) must be the strict majority of %d total — otherwise the gate is a no-op", hard, len(caps)) + } +} + +// TestAsYAML_IsParseableAndStable asserts the AsYAML output is +// identical across two invocations on the same input and contains +// every capability's name. Despite the name it does not parse the +// output, and because both calls receive the same slice it does not +// by itself prove the sort-by-name. We do not depend on a YAML +// parser here — presence of `- name: "..."` lines is sufficient and +// the format is deliberately the trivially-greppable subset. +func TestAsYAML_IsParseableAndStable(t *testing.T) { + caps := T4PrivilegeContract() + y1 := AsYAML(caps) + y2 := AsYAML(caps) + if y1 != y2 { + t.Error("AsYAML output is not deterministic across calls — sort/format must be stable for CI diff sanity") + } + for _, c := range caps { + needle := "- name: \"" + c.Name + "\"" + if !strings.Contains(y1, needle) { + t.Errorf("AsYAML output missing %q", needle) + } + } + // Header must cite the RFC so adopters can find the source of truth. + if !strings.Contains(y1, "internal#456") { + t.Error("AsYAML header must reference RFC internal#456 — that is the design-of-record") + } + if !strings.Contains(y1, "version: 1") { + t.Error("AsYAML must declare schema version (templates parse-check on this)") + } +} + +// TestAsYAML_EscapesEmbeddedQuotes catches a regression in +// yamlEscape: a probe shell string containing a double-quote would +// produce an unparseable YAML scalar. +func TestAsYAML_EscapesEmbeddedQuotes(t *testing.T) { + caps := []T4Capability{{ + Name: "embedded_quote", + Description: `says "hi"`, + Probe: `echo "ok"`, + Severity: SeverityHard, + Source: "test", + }} + y := AsYAML(caps) + // We expect the embedded `"` to be backslash-escaped.
+ if !strings.Contains(y, `\"hi\"`) { + t.Errorf("AsYAML did not escape embedded double quotes; got:\n%s", y) + } + if !strings.Contains(y, `\"ok\"`) { + t.Errorf("AsYAML did not escape embedded double quotes in Probe; got:\n%s", y) + } +} + +// TestAgentUIDConsistency ties the contract to the existing +// provisioner-side AgentUID const. The probe for "agent_uid_1000" +// hard-codes `id -u == 1000`; if AgentUID ever changes (no one +// expects it to, but a CI guard is free), the probe must change too. +// AsYAML likewise writes a literal `agent_uid: 1000`. +func TestAgentUIDConsistency(t *testing.T) { + if AgentUID != 1000 { + t.Fatalf("AgentUID is %d but the T4 contract's probes assume 1000; update t4_privilege_contract.go probes before changing AgentUID", AgentUID) + } +} -- 2.52.0