2026-06-06 06:45:08 +00:00
1 changed files with 322 additions and 0 deletions
@@ -0,0 +1,322 @@
+//go:build staging_e2e
+
+package staginge2e
+
+import (
+	"fmt"
+	"net/http"
+	"testing"
+	"time"
+)
+
+// TestDataVolumeSurvivesRecreate_Staging is the e2e gate for the durable
+// data-volume invariant: a file written to /workspace of a "persist" workspace
+// must still be there, byte-for-byte, after the workspace container is
+// recreated.
+//
+// Background: core#2332 (P0.5) flagged that "data-volume survives recreate" and
+// "snapshot-before-container-swap (/home/agent not wiped)" had no e2e coverage,
+// even though both map to a real past incident
+// (feedback_workspace_container_swap_wipes_home_agent). On a container swap only
+// the /configs + /workspace binds (the durable data volume, cp#326) survive; the
+// container's own $HOME (/home/agent) is ephemeral and is wiped unless a
+// snapshot is taken before docker stop+rm+run.
+//
+// This test covers the forward half of that incident. A regression that drops
+// the data-volume reattach, or that flips a "persist" workspace to ephemeral,
+// fails loudly here instead of silently eating a customer's /workspace state.
+//
+// Steps, run against a real staging tenant:
+//  0. Provision a throwaway org + tenant via the CP admin API and acquire the
+//     tenant admin token (shared harness — mirrors workspace_lifecycle_test.go).
+//  1. Create a workspace with compute.data_persistence="persist"
+//     (internal#734) and wait for it to report online.
+//  2. Write a unique sentinel under ?root=/workspace via the tenant Files API
+//     and read it straight back.
+//  3. Probe the /home/agent (container-$HOME) surface; see
+//     assertAgentHomeContract for the contract that is pinned.
+//  4. Trigger a recreate / container swap via POST /workspaces/:id/restart and
+//     wait for the workspace to report online again.
+//  5. Read the sentinel again and require an exact content match. This is the
+//     load-bearing assertion.
+//
+// Guarded by the staging_e2e build tag and the STAGING_E2E=1 env gate. Teardown
+// is t.Cleanup-driven (admin DELETE /cp/admin/tenants + DELETE /workspaces/:id);
+// cleanups run last-registered-first, so the workspace is deleted before the
+// tenant. Promote-to-required is a CTO call (infra-bound; see doc.go).
+func TestDataVolumeSurvivesRecreate_Staging(t *testing.T) {
+	// The durable root under test (the data volume per cp#326).
+	const dataRoot = "/workspace"
+
+	cfg := requireStagingEnv(t)
+
+	// A nanosecond stamp makes both the sentinel path and its content unique to
+	// this run, so leftovers from an earlier run can never satisfy the
+	// exact-content comparison at the end.
+	runStamp := time.Now().UnixNano()
+	sentinelPath := fmt.Sprintf("e2e-persist/%d.sentinel", runStamp)
+
+	tenantSlug := fmt.Sprintf("e2e-persist-%d", time.Now().Unix()%100000000)
+	t.Logf("data-persistence: slug=%s", tenantSlug)
+
+	// --- Step 0: provision org + tenant, acquire token + wait TLS ready ---
+	orgID := adminCreateOrg(t, cfg, tenantSlug)
+	t.Cleanup(func() { adminDeleteTenant(t, cfg, tenantSlug) })
+	t.Logf("org created: org_id=%s", orgID)
+
+	adminToken := tenantAdminToken(t, cfg, tenantSlug)
+	host := tenantSlug + "." + cfg.subdomainSuffix
+	waitForHTTP(t, host, http.StatusOK, 10*time.Minute, "tenant /health ready")
+	t.Logf("tenant TLS ready: %s", host)
+
+	want := fmt.Sprintf("data-volume-survives-recreate stamp=%d host=%s", runStamp, host)
+
+	// --- Step 1: create workspace with durable data persistence ---
+	workspaceID := createPersistWorkspace(t, host, adminToken, orgID, runStamp)
+	t.Cleanup(func() { deletePersistWorkspace(t, host, adminToken, orgID, workspaceID) })
+	t.Logf("workspace created: id=%s (data_persistence=persist)", workspaceID)
+
+	waitForWorkspaceOnline(t, host, adminToken, orgID, workspaceID, 20*time.Minute)
+	t.Logf("workspace %s ONLINE", workspaceID)
+
+	// --- Step 2: write the /workspace sentinel (data volume, cp#326) ---
+	writeWorkspaceFile(t, host, adminToken, orgID, workspaceID, dataRoot, sentinelPath, want)
+	t.Logf("wrote /workspace sentinel: root=/workspace path=%s", sentinelPath)
+
+	// Immediate readback: a write that silently did nothing must fail here,
+	// before the swap, rather than be misread later as a wiped volume.
+	before := readWorkspaceFile(t, host, adminToken, orgID, workspaceID, dataRoot, sentinelPath)
+	if before != want {
+		t.Fatalf("pre-recreate readback mismatch: wrote %q, read %q", want, before)
+	}
+	t.Logf("pre-recreate readback OK")
+
+	// --- Step 3: encode the /home/agent (ephemeral container-$HOME) contract ---
+	assertAgentHomeContract(t, host, adminToken, orgID, workspaceID, runStamp)
+
+	// A successful Files write can itself debounce-trigger an auto-restart
+	// (internal#624). Let that window pass so the explicit restart below is the
+	// swap being measured, not one coalesced with an implicit restart.
+	settleAutoRestart(t, host, adminToken, orgID, workspaceID)
+
+	// --- Step 4: recreate / container-swap on the SAME data volume ---
+	// POST /restart is the recreate path: Stop (prune=false ALWAYS for restart,
+	// so the data volume is NEVER erased) -> re-provision on the same volume,
+	// templates NOT re-applied. See workspace_restart.go runRestartCycle.
+	triggerRecreate(t, host, adminToken, orgID, workspaceID)
+	t.Logf("recreate (container swap) triggered via POST /restart")
+
+	// The swap flips status to 'provisioning'; wait for it to come back ONLINE.
+	waitForRecreateThenOnline(t, host, adminToken, orgID, workspaceID, 20*time.Minute)
+	t.Logf("workspace %s back ONLINE after recreate", workspaceID)
+
+	// --- Step 5: LOAD-BEARING — the /workspace sentinel must SURVIVE ---
+	if after := readWorkspaceFile(t, host, adminToken, orgID, workspaceID, dataRoot, sentinelPath); after != want {
+		t.Fatalf("DATA-VOLUME REGRESSION: /workspace sentinel did NOT survive recreate.\n"+
+			"  wrote: %q\n  read:  %q\n"+
+			"  This is the cp#326 durable-data-volume invariant: a 'persist' workspace's\n"+
+			"  /workspace MUST survive a container swap. A wipe here means the data volume\n"+
+			"  was not reattached (or a persist→ephemeral regression). See\n"+
+			"  feedback_workspace_container_swap_wipes_home_agent.", want, after)
+	}
+	t.Logf("PASS: /workspace sentinel SURVIVED recreate — data-volume invariant holds (cp#326)")
+}
+
+// assertAgentHomeContract encodes the CORRECT, documented expectation for the
+// /home/agent (container-$HOME) side of the incident.
+//
+// The Files API exposes the container's own $HOME via ?root=/agent-home (the
+// docker-exec backend, internal#425 RFC). That backend is intentionally STUBBED
+// today: every verb returns 501 Not Implemented. So there is NO supported
+// platform write path into the container's /home/agent — which is precisely
+// because that directory is EPHEMERAL: it lives inside the container, not on the
+// durable data volume, and is WIPED on every container swap unless a snapshot is
+// taken first (the incident's snapshot-before-stop+rm+run rule, which is a
+// CP-side provisioner concern, not a tenant ws-server file-API surface).
+//
+// This assertion is the regression tripwire for that contract: if a future
+// change wires /agent-home to a path WITHOUT also making it data-volume-backed,
+// the 501 flips to a success status and the test fails LOUD — forcing whoever
+// lit up the surface to first answer "is /home/agent now durable, and was the
+// snapshot hook added?" rather than silently shipping a wipe-on-recreate surface.
+//
+// We do NOT write-then-recreate-then-expect-wipe on /home/agent: asserting a
+// WIPE as a pass would be fail-open (a no-op write would also "pass"). Pinning
+// the 501 contract is the fail-closed encoding.
+//
+// Status handling for the probe PUT (a one-byte payload at
+// e2e-persist/<stamp>.home.sentinel under ?root=/agent-home):
+//   - 501: the documented contract; logged as OK.
+//   - any 2xx: the write was accepted, so the surface is live; fails the test.
+//     Every success code counts, not just 200, so a handler that answers
+//     201 Created or 204 No Content cannot slip past the tripwire.
+//   - any 4xx: the write was rejected, so there is still no durable write
+//     surface; logged as acceptable.
+//   - anything else (other 5xx, 1xx/3xx, or a non-HTTP code): fails the test.
+func assertAgentHomeContract(t *testing.T, host, token, orgID, wsID string, stamp int64) {
+	t.Helper()
+	rel := fmt.Sprintf("e2e-persist/%d.home.sentinel", stamp)
+	url := fmt.Sprintf("https://%s/workspaces/%s/files/%s?root=%s",
+		host, wsID, rel, "/agent-home")
+	status, body := doTenantJSON(t, "PUT", url, token, orgID, fmt.Sprintf(`{"content":%q}`, "x"))
+
+	switch {
+	case status == http.StatusNotImplemented:
+		// Documented contract: container-$HOME browse/write is stubbed BECAUSE
+		// it is ephemeral. No durable surface to assert survival on. Good.
+		t.Logf("/home/agent contract OK: /agent-home is 501 (ephemeral container-$HOME, no durable write surface — snapshot-before-swap is a CP-side concern)")
+	case status >= 200 && status < 300:
+		// The stub was lit up. This is a contract change that MUST be paired
+		// with data-volume backing + a snapshot-before-swap hook; until this
+		// test is extended to prove BOTH, treat the bare flip as a regression
+		// of the documented ephemeral contract.
+		t.Fatalf("CONTRACT DRIFT: PUT ?root=/agent-home returned %d — the container-$HOME surface was wired up.\n"+
+			"  Per feedback_workspace_container_swap_wipes_home_agent, /home/agent is EPHEMERAL and wiped on\n"+
+			"  container swap unless snapshotted first. If this surface is now durable, EXTEND this test to\n"+
+			"  write→recreate→assert-survival on /home/agent AND assert the snapshot-before-swap hook fired.\n"+
+			"  Do not leave a write-able-but-ephemeral surface uncovered. body=%s", status, body)
+	case status >= 400 && status < 500:
+		// A client-error rejection (e.g. 400/404) still means "not a durable
+		// write surface", which is all this probe needs to establish.
+		t.Logf("/home/agent contract: ?root=/agent-home returned %d (non-durable surface) — acceptable", status)
+	default:
+		// A 5xx other than 501 is a real server bug; a 1xx/3xx or a non-HTTP
+		// code means the probe never got a usable answer. Neither may pass.
+		t.Fatalf("/home/agent contract probe: unexpected %d (want 501 or a 4xx): %s", status, body)
+	}
+}
+
+// --- workspace lifecycle over the tenant API ------------------------------
+
+// createPersistWorkspace POSTs /workspaces on the tenant host to create a
+// throwaway workspace and returns the "id" field of the response.
+//
+// The request pins compute.data_persistence="persist" (internal#734) instead of
+// leaning on the auto/org-flag default, so the invariant under test — that
+// /workspace survives a recreate — is unambiguous. The workspace name is
+// "e2e-persist-" followed by stamp modulo 1e8; runtime is "claude-code", tier 1.
+//
+// The test is failed immediately if the response is neither 201 nor 200, or if
+// no id can be extracted from it.
+func createPersistWorkspace(t *testing.T, host, token, orgID string, stamp int64) string {
+	t.Helper()
+
+	wsName := fmt.Sprintf("e2e-persist-%d", stamp%100000000)
+	payload := fmt.Sprintf(
+		`{"name":%q,"runtime":"claude-code","tier":1,"compute":{"data_persistence":"persist"}}`,
+		wsName,
+	)
+
+	status, resp := doTenantJSON(t, "POST", fmt.Sprintf("https://%s/workspaces", host), token, orgID, payload)
+	switch status {
+	case http.StatusCreated, http.StatusOK:
+		// Created; fall through to id extraction.
+	default:
+		t.Fatalf("create workspace: HTTP %d: %s", status, resp)
+	}
+
+	wsID := jsonField(resp, "id")
+	if wsID == "" {
+		t.Fatalf("create workspace: no id in response: %s", resp)
+	}
+	return wsID
+}
+
+// deletePersistWorkspace is the t.Cleanup teardown for the workspace. It is
+// best-effort: an unexpected response is logged as a WARNING and never fails
+// the test. The DELETE is sent without a prune parameter so a hung delete
+// cannot strand the test; the staging sweep reclaims leftover compute. The
+// org/tenant itself is torn down separately via adminDeleteTenant.
+//
+// 200, 202, 204 and 404 all count as "gone".
+func deletePersistWorkspace(t *testing.T, host, token, orgID, wsID string) {
+	t.Helper()
+	endpoint := fmt.Sprintf("https://%s/workspaces/%s", host, wsID)
+	status, resp := doTenantJSON(t, "DELETE", endpoint, token, orgID, "")
+
+	switch status {
+	case http.StatusOK, http.StatusAccepted, http.StatusNoContent, http.StatusNotFound:
+		t.Logf("teardown: deleted workspace %s (HTTP %d)", wsID, status)
+	default:
+		t.Logf("WARNING: teardown DELETE workspace %s returned HTTP %d: %s (manual cleanup may be needed)", wsID, status, resp)
+	}
+}
+
+// waitForWorkspaceOnline polls GET /workspaces/:id every 10 seconds until the
+// response is HTTP 200 with .status == "online", and fails the test if that
+// has not happened by the time timeout elapses.
+//
+// Non-200 polls are retried rather than treated as fatal. The HTTP code of the
+// most recent poll is remembered and included in the timeout failure, so a run
+// where the endpoint never answered 200 is distinguishable from one where the
+// workspace was reachable but stuck in a non-online status.
+func waitForWorkspaceOnline(t *testing.T, host, token, orgID, wsID string, timeout time.Duration) {
+	t.Helper()
+	url := "https://" + host + "/workspaces/" + wsID
+	deadline := time.Now().Add(timeout)
+
+	last := ""             // last .status seen on a 200 response
+	lastPoll := "no polls" // HTTP outcome of the most recent poll, for diagnostics
+	for time.Now().Before(deadline) {
+		status, body := doTenantJSON(t, "GET", url, token, orgID, "")
+		lastPoll = fmt.Sprintf("HTTP %d", status)
+		if status == http.StatusOK {
+			last = jsonField(body, "status")
+			if last == "online" {
+				return
+			}
+		}
+		time.Sleep(10 * time.Second)
+	}
+	t.Fatalf("workspace %s did not reach status=online within %s (last=%q, last poll=%s)", wsID, timeout, last, lastPoll)
+}
+
+// triggerRecreate requests a recreate / container swap by POSTing an empty
+// body to /workspaces/:id/restart, and fails the test unless the response is
+// 200 or 202.
+//
+// Per workspace_restart.go (cpStopWithRetryErr), the restart handler stops the
+// container with prune=false and re-provisions on the SAME data volume, so a
+// recreate is never supposed to erase that volume.
+func triggerRecreate(t *testing.T, host, token, orgID, wsID string) {
+	t.Helper()
+	endpoint := fmt.Sprintf("https://%s/workspaces/%s/restart", host, wsID)
+	status, body := doTenantJSON(t, "POST", endpoint, token, orgID, "")
+
+	switch status {
+	case http.StatusOK, http.StatusAccepted:
+		return
+	default:
+		t.Fatalf("trigger recreate (POST /restart): HTTP %d: %s", status, body)
+	}
+}
+
+// waitForRecreateThenOnline waits out the swap in two phases.
+//
+// Phase 1 (best-effort, up to 90s, polling every 3s): watch for the workspace
+// to LEAVE online — the recreate flips status to 'provisioning' — so that
+// phase 2 does not accept a stale "still online" read from before the swap
+// started. Whether the dip was seen is logged either way; a run that never saw
+// it (a fast swap, or a swap that had not started within the window) still
+// proceeds, but the log makes clear that the later sentinel read may not have
+// been preceded by an observed swap.
+//
+// Phase 2 (polling every 10s until timeout, measured from function entry):
+// wait for HTTP 200 with .status == "online". On timeout the test is failed
+// with the last status seen and the HTTP code of the most recent poll.
+//
+// The load-bearing assertion remains the caller's sentinel read, not this
+// transient state machine.
+func waitForRecreateThenOnline(t *testing.T, host, token, orgID, wsID string, timeout time.Duration) {
+	t.Helper()
+	url := "https://" + host + "/workspaces/" + wsID
+	deadline := time.Now().Add(timeout)
+
+	// Brief window to catch the provisioning dip (best-effort; not required).
+	sawDip := false
+	dipDeadline := time.Now().Add(90 * time.Second)
+	for time.Now().Before(dipDeadline) {
+		status, body := doTenantJSON(t, "GET", url, token, orgID, "")
+		if status == http.StatusOK {
+			if s := jsonField(body, "status"); s != "online" {
+				sawDip = true
+				t.Logf("recreate: workspace %s left online (status=%q)", wsID, s)
+				break
+			}
+		}
+		time.Sleep(3 * time.Second)
+	}
+	if !sawDip {
+		// Not fatal (a fast swap can legitimately be missed), but surface it:
+		// without an observed dip, "online" below may be pre-swap state.
+		t.Logf("WARNING: workspace %s was never observed leaving online within 90s of the restart request; the swap was not directly observed", wsID)
+	}
+
+	last := ""             // last .status seen on a 200 response
+	lastPoll := "no polls" // HTTP outcome of the most recent poll, for diagnostics
+	for time.Now().Before(deadline) {
+		status, body := doTenantJSON(t, "GET", url, token, orgID, "")
+		lastPoll = fmt.Sprintf("HTTP %d", status)
+		if status == http.StatusOK {
+			last = jsonField(body, "status")
+			if last == "online" {
+				return
+			}
+		}
+		time.Sleep(10 * time.Second)
+	}
+	t.Fatalf("workspace %s did not return to status=online after recreate within %s (last=%q, last poll=%s)", wsID, timeout, last, lastPoll)
+}
+
+// settleAutoRestart lets any file-write-triggered auto-restart (internal#624)
+// play out before the caller issues its own explicit recreate, so the swap
+// being measured is not coalesced with an implicit one. The debounce window is
+// 15s plus a restart cycle: this sleeps 20s to let the debounce fire (or not),
+// then polls until the workspace reports online, allowing up to 10 minutes.
+func settleAutoRestart(t *testing.T, host, token, orgID, wsID string) {
+	t.Helper()
+
+	const (
+		debounceGrace = 20 * time.Second // covers the 15s debounce with margin
+		settleTimeout = 10 * time.Minute // budget for an implicit restart cycle
+	)
+
+	time.Sleep(debounceGrace)
+	waitForWorkspaceOnline(t, host, token, orgID, wsID, settleTimeout)
+}
+
+// --- tenant Files API ------------------------------------------------------
+
+// writeWorkspaceFile PUTs content to relPath under the given root via the
+// tenant Files API (PUT /workspaces/:id/files/<relPath>?root=<root>) and fails
+// the test on any response other than HTTP 200.
+// root="/workspace" is the literal data-volume path (cp#326).
+//
+// relPath and root are placed into the URL verbatim, and the JSON body is built
+// with %q, which is Go string quoting rather than strict JSON: it matches JSON
+// for plain printable text but can emit escapes (e.g. \x.., \a) that JSON does
+// not accept. Callers should pass URL-safe paths and plain-text content.
+func writeWorkspaceFile(t *testing.T, host, token, orgID, wsID, root, relPath, content string) {
+	t.Helper()
+	url := fmt.Sprintf("https://%s/workspaces/%s/files/%s?root=%s",
+		host, wsID, relPath, root)
+	status, body := doTenantJSON(t, "PUT", url, token, orgID, fmt.Sprintf(`{"content":%q}`, content))
+	if status != http.StatusOK {
+		// relPath has no leading slash, so join it to root explicitly; "%s%s"
+		// would print "/workspacee2e-persist/...".
+		t.Fatalf("write %s/%s: HTTP %d: %s", root, relPath, status, body)
+	}
+}
+
+// readWorkspaceFile GETs relPath under the given root via the tenant Files API
+// (GET /workspaces/:id/files/<relPath>?root=<root>) and returns the "content"
+// field of the response.
+//
+// Status handling:
+//   - 200: returns the file content.
+//   - 404: returns "" without failing. A not-found after a recreate is exactly
+//     the wipe being gated, so it is left to the caller's exact-content compare
+//     to raise the regression message. An empty file and a missing file are
+//     therefore indistinguishable to callers.
+//   - anything else (transport/auth/server failure): fails the test here.
+func readWorkspaceFile(t *testing.T, host, token, orgID, wsID, root, relPath string) string {
+	t.Helper()
+	url := fmt.Sprintf("https://%s/workspaces/%s/files/%s?root=%s",
+		host, wsID, relPath, root)
+	status, body := doTenantJSON(t, "GET", url, token, orgID, "")
+	if status == http.StatusNotFound {
+		// Surface the not-found as empty content; the caller's exact-content
+		// compare turns this into the DATA-VOLUME REGRESSION message.
+		return ""
+	}
+	if status != http.StatusOK {
+		// relPath has no leading slash, so join it to root explicitly; "%s%s"
+		// would print "/workspacee2e-persist/...".
+		t.Fatalf("read %s/%s: HTTP %d: %s", root, relPath, status, body)
+	}
+	return jsonField(body, "content")
+}