test(staginge2e): data-volume survives recreate e2e (core#2332 P0.5) #2336
@@ -0,0 +1,322 @@
|
||||
//go:build staging_e2e
|
||||
|
||||
package staginge2e
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestDataVolumeSurvivesRecreate_Staging closes the data-persistence coverage
|
||||
// gap flagged in core#2332 (P0.5): "data-volume survives recreate" and
|
||||
// "snapshot-before-container-swap (/home/agent not wiped)" had NO e2e, and both
|
||||
// map to a real past incident — feedback_workspace_container_swap_wipes_home_agent:
|
||||
// on a container swap, only the /configs + /workspace binds (the durable data
|
||||
// volume, cp#326) survive; the container's own $HOME (/home/agent) is ephemeral
|
||||
// and is WIPED unless a snapshot is taken BEFORE docker stop+rm+run.
|
||||
//
|
||||
// This is the FORWARD half of that incident: prove the durable-data invariant
|
||||
// holds across a recreate so a future regression that drops the data-volume
|
||||
// reattach (or that flips a "persist" workspace to ephemeral) fails LOUD here
|
||||
// instead of silently eating a customer's /workspace state.
|
||||
//
|
||||
// What it does, end-to-end, against a real staging tenant:
|
||||
// 0. Provision a throwaway org + tenant via the CP admin API and acquire the
|
||||
// tenant admin token (shared harness — mirrors workspace_lifecycle_test.go).
|
||||
// 1. Create a workspace with compute.data_persistence="persist" (the durable
|
||||
// data-volume choice, internal#734) and wait for it to come ONLINE.
|
||||
// 2. Write a unique sentinel into /workspace (?root=/workspace) — the data
|
||||
// volume per cp#326 — via the tenant Files API.
|
||||
// 3. Probe the /home/agent (container-$HOME) surface to encode the documented
|
||||
// contract for the ephemeral side (see assertAgentHomeContract).
|
||||
// 4. Trigger a recreate / container-swap on the SAME data volume via
|
||||
// POST /workspaces/:id/restart, and wait for ONLINE again.
|
||||
// 5. Assert the /workspace sentinel SURVIVES (data volume reattached +
|
||||
// persisted). This is the load-bearing assertion — a wipe here is the
|
||||
// regression we are gating.
|
||||
//
|
||||
// Guarded by the staging_e2e build tag and STAGING_E2E=1 env gate. Teardown is
|
||||
// t.Cleanup-driven (admin DELETE /cp/admin/tenants + DELETE /workspaces/:id).
|
||||
// Promote-to-required is a CTO call (infra-bound; see doc.go).
|
||||
func TestDataVolumeSurvivesRecreate_Staging(t *testing.T) {
|
||||
cfg := requireStagingEnv(t)
|
||||
|
||||
// Unique-per-run sentinel so a stale prior run can never make a wiped
|
||||
// volume look "survived" (we compare exact content, not mere existence).
|
||||
stamp := time.Now().UnixNano()
|
||||
relPath := fmt.Sprintf("e2e-persist/%d.sentinel", stamp)
|
||||
|
||||
slug := fmt.Sprintf("e2e-persist-%d", time.Now().Unix()%100000000)
|
||||
t.Logf("data-persistence: slug=%s", slug)
|
||||
|
||||
// --- Step 0: provision org + tenant, acquire token + wait TLS ready ---
|
||||
orgID := adminCreateOrg(t, cfg, slug)
|
||||
t.Cleanup(func() { adminDeleteTenant(t, cfg, slug) })
|
||||
t.Logf("org created: org_id=%s", orgID)
|
||||
|
||||
token := tenantAdminToken(t, cfg, slug)
|
||||
tenantHost := slug + "." + cfg.subdomainSuffix
|
||||
waitForHTTP(t, tenantHost, http.StatusOK, 10*time.Minute, "tenant /health ready")
|
||||
t.Logf("tenant TLS ready: %s", tenantHost)
|
||||
|
||||
sentinel := fmt.Sprintf("data-volume-survives-recreate stamp=%d host=%s", stamp, tenantHost)
|
||||
|
||||
// --- Step 1: create workspace with durable data persistence ---
|
||||
wsID := createPersistWorkspace(t, tenantHost, token, orgID, stamp)
|
||||
t.Cleanup(func() { deletePersistWorkspace(t, tenantHost, token, orgID, wsID) })
|
||||
t.Logf("workspace created: id=%s (data_persistence=persist)", wsID)
|
||||
|
||||
waitForWorkspaceOnline(t, tenantHost, token, orgID, wsID, 20*time.Minute)
|
||||
t.Logf("workspace %s ONLINE", wsID)
|
||||
|
||||
// --- Step 2: write the /workspace sentinel (data volume, cp#326) ---
|
||||
writeWorkspaceFile(t, tenantHost, token, orgID, wsID, "/workspace", relPath, sentinel)
|
||||
t.Logf("wrote /workspace sentinel: root=/workspace path=%s", relPath)
|
||||
|
||||
// Read it straight back so a write that silently no-op'd can't masquerade
|
||||
// as a survived-recreate later. This also confirms the EIC write landed on
|
||||
// the host data volume before we swap the container out from under it.
|
||||
if got := readWorkspaceFile(t, tenantHost, token, orgID, wsID, "/workspace", relPath); got != sentinel {
|
||||
t.Fatalf("pre-recreate readback mismatch: wrote %q, read %q", sentinel, got)
|
||||
}
|
||||
t.Logf("pre-recreate readback OK")
|
||||
|
||||
// --- Step 3: encode the /home/agent (ephemeral container-$HOME) contract ---
|
||||
assertAgentHomeContract(t, tenantHost, token, orgID, wsID, stamp)
|
||||
|
||||
// A successful Files write to a SaaS workspace can itself debounce-trigger
|
||||
// an auto-restart (internal#624). Settle that window first so our explicit
|
||||
// recreate below is the swap we actually measure, not a coalesced one that
|
||||
// races our readback.
|
||||
settleAutoRestart(t, tenantHost, token, orgID, wsID)
|
||||
|
||||
// --- Step 4: recreate / container-swap on the SAME data volume ---
|
||||
// POST /restart is the recreate path: Stop (prune=false ALWAYS for restart,
|
||||
// so the data volume is NEVER erased) -> re-provision on the same volume,
|
||||
// templates NOT re-applied. See workspace_restart.go runRestartCycle.
|
||||
triggerRecreate(t, tenantHost, token, orgID, wsID)
|
||||
t.Logf("recreate (container swap) triggered via POST /restart")
|
||||
|
||||
// The swap flips status to 'provisioning'; wait for it to come back ONLINE.
|
||||
waitForRecreateThenOnline(t, tenantHost, token, orgID, wsID, 20*time.Minute)
|
||||
t.Logf("workspace %s back ONLINE after recreate", wsID)
|
||||
|
||||
// --- Step 5: LOAD-BEARING — the /workspace sentinel must SURVIVE ---
|
||||
got := readWorkspaceFile(t, tenantHost, token, orgID, wsID, "/workspace", relPath)
|
||||
if got != sentinel {
|
||||
t.Fatalf("DATA-VOLUME REGRESSION: /workspace sentinel did NOT survive recreate.\n"+
|
||||
" wrote: %q\n read: %q\n"+
|
||||
" This is the cp#326 durable-data-volume invariant: a 'persist' workspace's\n"+
|
||||
" /workspace MUST survive a container swap. A wipe here means the data volume\n"+
|
||||
" was not reattached (or a persist→ephemeral regression). See\n"+
|
||||
" feedback_workspace_container_swap_wipes_home_agent.", sentinel, got)
|
||||
}
|
||||
t.Logf("PASS: /workspace sentinel SURVIVED recreate — data-volume invariant holds (cp#326)")
|
||||
}
|
||||
|
||||
// assertAgentHomeContract encodes the CORRECT, documented expectation for the
|
||||
// /home/agent (container-$HOME) side of the incident.
|
||||
//
|
||||
// The Files API exposes the container's own $HOME via ?root=/agent-home (the
|
||||
// docker-exec backend, internal#425 RFC). That backend is intentionally STUBBED
|
||||
// today: every verb returns 501 Not Implemented. So there is NO supported
|
||||
// platform write path into the container's /home/agent — which is precisely
|
||||
// because that directory is EPHEMERAL: it lives inside the container, not on the
|
||||
// durable data volume, and is WIPED on every container swap unless a snapshot is
|
||||
// taken first (the incident's snapshot-before-stop+rm+run rule, which is a
|
||||
// CP-side provisioner concern, not a tenant ws-server file-API surface).
|
||||
//
|
||||
// This assertion is the regression tripwire for that contract: if a future
|
||||
// change wires /agent-home to a path WITHOUT also making it data-volume-backed,
|
||||
// this 501 flips to 200 and the test fails LOUD — forcing whoever lit up the
|
||||
// surface to first answer "is /home/agent now durable, and was the snapshot
|
||||
// hook added?" rather than silently shipping a wipe-on-recreate surface.
|
||||
//
|
||||
// We do NOT write-then-recreate-then-expect-wipe on /home/agent: asserting a
|
||||
// WIPE as a pass would be fail-open (a no-op write would also "pass"). Pinning
|
||||
// the 501 contract is the fail-closed encoding.
|
||||
func assertAgentHomeContract(t *testing.T, host, token, orgID, wsID string, stamp int64) {
|
||||
t.Helper()
|
||||
rel := fmt.Sprintf("e2e-persist/%d.home.sentinel", stamp)
|
||||
url := fmt.Sprintf("https://%s/workspaces/%s/files/%s?root=%s",
|
||||
host, wsID, rel, "/agent-home")
|
||||
status, body := doTenantJSON(t, "PUT", url, token, orgID, fmt.Sprintf(`{"content":%q}`, "x"))
|
||||
|
||||
switch status {
|
||||
case http.StatusNotImplemented:
|
||||
// Documented contract: container-$HOME browse/write is stubbed BECAUSE
|
||||
// it is ephemeral. No durable surface to assert survival on. Good.
|
||||
t.Logf("/home/agent contract OK: /agent-home is 501 (ephemeral container-$HOME, no durable write surface — snapshot-before-swap is a CP-side concern)")
|
||||
case http.StatusOK:
|
||||
// The stub was lit up. This is a contract change that MUST be paired
|
||||
// with data-volume backing + a snapshot-before-swap hook; until this
|
||||
// test is extended to prove BOTH, treat the bare flip as a regression
|
||||
// of the documented ephemeral contract.
|
||||
t.Fatalf("CONTRACT DRIFT: PUT ?root=/agent-home returned 200 — the container-$HOME surface was wired up.\n"+
|
||||
" Per feedback_workspace_container_swap_wipes_home_agent, /home/agent is EPHEMERAL and wiped on\n"+
|
||||
" container swap unless snapshotted first. If this surface is now durable, EXTEND this test to\n"+
|
||||
" write→recreate→assert-survival on /home/agent AND assert the snapshot-before-swap hook fired.\n"+
|
||||
" Do not leave a write-able-but-ephemeral surface uncovered. body=%s", body)
|
||||
default:
|
||||
// 4xx other than 501 (e.g. 400/404) is acceptable — still "not a
|
||||
// durable write surface". Anything 5xx that ISN'T 501 is a real bug.
|
||||
if status >= 500 {
|
||||
t.Fatalf("/home/agent contract probe: unexpected %d (want 501 or a 4xx): %s", status, body)
|
||||
}
|
||||
t.Logf("/home/agent contract: ?root=/agent-home returned %d (non-durable surface) — acceptable", status)
|
||||
}
|
||||
}
|
||||
|
||||
// --- workspace lifecycle over the tenant API ------------------------------
|
||||
|
||||
// createPersistWorkspace creates a throwaway workspace with the durable
|
||||
// data-volume choice (compute.data_persistence="persist", internal#734). The
|
||||
// "persist" choice is what makes /workspace survive a recreate; we set it
|
||||
// explicitly rather than relying on the auto/org-flag default so the invariant
|
||||
// under test is unambiguous.
|
||||
func createPersistWorkspace(t *testing.T, host, token, orgID string, stamp int64) string {
|
||||
t.Helper()
|
||||
url := "https://" + host + "/workspaces"
|
||||
body := fmt.Sprintf(
|
||||
`{"name":%q,"runtime":%q,"tier":%d,"compute":{"data_persistence":%q}}`,
|
||||
fmt.Sprintf("e2e-persist-%d", stamp%100000000), "claude-code", 1, "persist",
|
||||
)
|
||||
status, resp := doTenantJSON(t, "POST", url, token, orgID, body)
|
||||
if status != http.StatusCreated && status != http.StatusOK {
|
||||
t.Fatalf("create workspace: HTTP %d: %s", status, resp)
|
||||
}
|
||||
id := jsonField(resp, "id")
|
||||
if id == "" {
|
||||
t.Fatalf("create workspace: no id in response: %s", resp)
|
||||
}
|
||||
return id
|
||||
}
|
||||
|
||||
// deletePersistWorkspace is the t.Cleanup teardown — best-effort, never fails
|
||||
// the test. DELETE without prune so a hung delete doesn't strand the test;
|
||||
// staging sweep reclaims any leftover compute. (The org/tenant itself is torn
|
||||
// down separately via adminDeleteTenant.)
|
||||
func deletePersistWorkspace(t *testing.T, host, token, orgID, wsID string) {
|
||||
t.Helper()
|
||||
url := "https://" + host + "/workspaces/" + wsID
|
||||
status, resp := doTenantJSON(t, "DELETE", url, token, orgID, "")
|
||||
if status != http.StatusOK && status != http.StatusAccepted && status != http.StatusNoContent && status != http.StatusNotFound {
|
||||
t.Logf("WARNING: teardown DELETE workspace %s returned HTTP %d: %s (manual cleanup may be needed)", wsID, status, resp)
|
||||
return
|
||||
}
|
||||
t.Logf("teardown: deleted workspace %s (HTTP %d)", wsID, status)
|
||||
}
|
||||
|
||||
// waitForWorkspaceOnline polls GET /workspaces/:id until .status == "online".
|
||||
func waitForWorkspaceOnline(t *testing.T, host, token, orgID, wsID string, timeout time.Duration) {
|
||||
t.Helper()
|
||||
url := "https://" + host + "/workspaces/" + wsID
|
||||
deadline := time.Now().Add(timeout)
|
||||
var last string
|
||||
for time.Now().Before(deadline) {
|
||||
status, body := doTenantJSON(t, "GET", url, token, orgID, "")
|
||||
if status == http.StatusOK {
|
||||
last = jsonField(body, "status")
|
||||
if last == "online" {
|
||||
return
|
||||
}
|
||||
}
|
||||
time.Sleep(10 * time.Second)
|
||||
}
|
||||
t.Fatalf("workspace %s did not reach status=online within %s (last=%q)", wsID, timeout, last)
|
||||
}
|
||||
|
||||
// triggerRecreate POSTs /restart, the recreate / container-swap path. The
|
||||
// handler tears down the container and re-provisions on the SAME data volume
|
||||
// (Stop is called with prune=false for restart — see workspace_restart.go's
|
||||
// cpStopWithRetryErr — so a recreate can NEVER erase the data volume).
|
||||
func triggerRecreate(t *testing.T, host, token, orgID, wsID string) {
|
||||
t.Helper()
|
||||
url := "https://" + host + "/workspaces/" + wsID + "/restart"
|
||||
status, body := doTenantJSON(t, "POST", url, token, orgID, "")
|
||||
if status != http.StatusOK && status != http.StatusAccepted {
|
||||
t.Fatalf("trigger recreate (POST /restart): HTTP %d: %s", status, body)
|
||||
}
|
||||
}
|
||||
|
||||
// waitForRecreateThenOnline waits out the swap. The recreate flips status to
|
||||
// 'provisioning'; we first observe it LEAVE online (so we don't read a stale
|
||||
// "still online" before the swap starts), then wait for it to return to online.
|
||||
// If we never catch the provisioning dip (fast swap), the subsequent online
|
||||
// poll still proves liveness — the load-bearing assertion is the sentinel read,
|
||||
// not the transient state machine.
|
||||
func waitForRecreateThenOnline(t *testing.T, host, token, orgID, wsID string, timeout time.Duration) {
|
||||
t.Helper()
|
||||
url := "https://" + host + "/workspaces/" + wsID
|
||||
deadline := time.Now().Add(timeout)
|
||||
|
||||
// Brief window to catch the provisioning dip (best-effort; not required).
|
||||
dipDeadline := time.Now().Add(90 * time.Second)
|
||||
for time.Now().Before(dipDeadline) {
|
||||
status, body := doTenantJSON(t, "GET", url, token, orgID, "")
|
||||
if status == http.StatusOK && jsonField(body, "status") != "online" {
|
||||
break
|
||||
}
|
||||
time.Sleep(3 * time.Second)
|
||||
}
|
||||
|
||||
var last string
|
||||
for time.Now().Before(deadline) {
|
||||
status, body := doTenantJSON(t, "GET", url, token, orgID, "")
|
||||
if status == http.StatusOK {
|
||||
last = jsonField(body, "status")
|
||||
if last == "online" {
|
||||
return
|
||||
}
|
||||
}
|
||||
time.Sleep(10 * time.Second)
|
||||
}
|
||||
t.Fatalf("workspace %s did not return to status=online after recreate within %s (last=%q)", wsID, timeout, last)
|
||||
}
|
||||
|
||||
// settleAutoRestart absorbs the internal#624 file-write→restart debounce so the
|
||||
// explicit recreate we measure isn't coalesced with an implicit one. The
|
||||
// debounce window is 15s + a restart cycle; we poll back to a stable online.
|
||||
func settleAutoRestart(t *testing.T, host, token, orgID, wsID string) {
|
||||
t.Helper()
|
||||
// Give the debounce window time to fire (or not) ...
|
||||
time.Sleep(20 * time.Second)
|
||||
// ... then ensure we're back to a stable online before the measured swap.
|
||||
waitForWorkspaceOnline(t, host, token, orgID, wsID, 10*time.Minute)
|
||||
}
|
||||
|
||||
// --- tenant Files API ------------------------------------------------------
|
||||
|
||||
// writeWorkspaceFile PUTs a file via the tenant Files API into the given root.
|
||||
// root="/workspace" is the literal data-volume path (cp#326).
|
||||
func writeWorkspaceFile(t *testing.T, host, token, orgID, wsID, root, relPath, content string) {
|
||||
t.Helper()
|
||||
url := fmt.Sprintf("https://%s/workspaces/%s/files/%s?root=%s",
|
||||
host, wsID, relPath, root)
|
||||
status, body := doTenantJSON(t, "PUT", url, token, orgID, fmt.Sprintf(`{"content":%q}`, content))
|
||||
if status != http.StatusOK {
|
||||
t.Fatalf("write %s%s: HTTP %d: %s", root, relPath, status, body)
|
||||
}
|
||||
}
|
||||
|
||||
// readWorkspaceFile GETs a file via the tenant Files API and returns its
|
||||
// content. Fails the test on any non-200 (a not-found after a recreate is the
|
||||
// wipe we are gating, so the caller compares content and emits the regression
|
||||
// message — but a transport/auth failure should still fail loud here).
|
||||
func readWorkspaceFile(t *testing.T, host, token, orgID, wsID, root, relPath string) string {
|
||||
t.Helper()
|
||||
url := fmt.Sprintf("https://%s/workspaces/%s/files/%s?root=%s",
|
||||
host, wsID, relPath, root)
|
||||
status, body := doTenantJSON(t, "GET", url, token, orgID, "")
|
||||
if status == http.StatusNotFound {
|
||||
// Surface the not-found as empty content; the caller's exact-content
|
||||
// compare turns this into the DATA-VOLUME REGRESSION message.
|
||||
return ""
|
||||
}
|
||||
if status != http.StatusOK {
|
||||
t.Fatalf("read %s%s: HTTP %d: %s", root, relPath, status, body)
|
||||
}
|
||||
return jsonField(body, "content")
|
||||
}
|
||||
Reference in New Issue
Block a user