From 3cbe7d880d9e1c7e4adb3c2b905604687d91afcd Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 7 Jun 2026 19:46:04 -0700 Subject: [PATCH 1/6] feat(ws-server): POST /workspaces/:id/switch-provider (ordered, leak-safe) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch an EXISTING workspace to a different cloud (compute.provider). The VM is cloud-specific so this reprovisions the box on the new cloud; the agent's durable identity (agent_card, config, secrets, tokens, memory) lives in the tenant DB and is preserved (the row is never deleted). CRITICAL ordering (RFC #622 Hazard 1): stop the OLD box with the OLD provider BEFORE writing the new provider — cpStopWithRetry resolves provider+instance_id from the row at call time, so writing the new provider first would deprovision the old box against the new backend and leak it. Sequence: stop-old -> clear instance_id + jsonb_set provider -> provision-new (withStoredCompute re-reads the new provider). Clearing instance_id makes a retried switch safe. Guards: SaaS-only (cpProv), external/mock no-op, paused-parent, same- provider no-op, and a confirm_data_loss gate for persistent volumes (the filesystem cross-cloud migration is RFC #622 PR3/PR4). Tests: source-level ordering pin (stop-before-write), bad/missing-provider 400s, route wiring. PR2 of the switch-existing-workspace-provider series. Stacked on PR1 (#2420, the compute.provider validation it reuses). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../handlers/workspace_switch_provider.go | 158 ++++++++++++++++++ .../workspace_switch_provider_test.go | 81 +++++++++ workspace-server/internal/router/router.go | 1 + 3 files changed, 240 insertions(+) create mode 100644 workspace-server/internal/handlers/workspace_switch_provider.go create mode 100644 workspace-server/internal/handlers/workspace_switch_provider_test.go diff --git a/workspace-server/internal/handlers/workspace_switch_provider.go b/workspace-server/internal/handlers/workspace_switch_provider.go new file mode 100644 index 000000000..be37a379c --- /dev/null +++ b/workspace-server/internal/handlers/workspace_switch_provider.go @@ -0,0 +1,158 @@ +package handlers + +import ( + "context" + "database/sql" + "io" + "log" + "net/http" + "strings" + + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db" + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/events" + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models" + "github.com/gin-gonic/gin" +) + +// SwitchProvider handles POST /workspaces/:id/switch-provider — move an EXISTING +// workspace to a different cloud (compute.provider). The VM is cloud-specific, so +// this reprovisions the box on the new cloud; the agent's durable identity +// (agent_card, config, secrets, tokens, memory) lives in the tenant DB and is +// preserved because the row is never deleted. +// +// CRITICAL ORDERING (the wrong-backend leak, RFC #622 Hazard 1): the stop must +// run with the OLD provider BEFORE the DB provider is changed. cpStopWithRetry +// resolves provider + instance_id from the workspaces row at call time; if we +// wrote the new provider first, the stop would issue +// DELETE …?instance_id=&provider= → CP routes teardown to the NEW +// backend → the old box is never terminated and leaks (billed forever, and not +// covered by the status='removed' orphan sweeper). So the sequence is strictly: +// +// 1. stop OLD box (DB still has old provider + old instance_id) +// 2. clear instance_id + write new provider (jsonb_set, preserving the rest) +// 3. provision NEW box (withStoredCompute now reads the new provider) +// +// Clearing instance_id in step 2 also makes a retried switch safe: a second call +// finds no stale instance to (mis-)deprovision against the new backend. +func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { + id := c.Param("id") + ctx := c.Request.Context() + + var body struct { + Provider string `json:"provider"` + ConfirmDataLoss bool `json:"confirm_data_loss"` + // MigrateData is accepted for forward-compat (RFC #622 PR4 wires the + // filesystem migration). Until then it is a no-op and a persistent + // volume still requires confirm_data_loss. + MigrateData bool `json:"migrate_data"` + } + if err := c.ShouldBindJSON(&body); err != nil && err != io.EOF { + c.JSON(http.StatusBadRequest, gin.H{"error": "invalid JSON body"}) + return + } + newProvider := strings.ToLower(strings.TrimSpace(body.Provider)) + if newProvider == "" { + c.JSON(http.StatusBadRequest, gin.H{"error": "provider is required"}) + return + } + if _, ok := workspaceComputeProviderAllowlist[newProvider]; !ok { + c.JSON(http.StatusBadRequest, gin.H{"error": "unsupported provider (want aws|gcp|hetzner)"}) + return + } + + var status, wsName, dbRuntime, oldProvider, dataPersistence string + var tier int + err := db.DB.QueryRowContext(ctx, ` + SELECT status, name, tier, COALESCE(runtime, 'claude-code'), + COALESCE(compute->>'provider', ''), COALESCE(compute->>'data_persistence', '') + FROM workspaces WHERE id = $1`, id, + ).Scan(&status, &wsName, &tier, &dbRuntime, &oldProvider, &dataPersistence) + if err == sql.ErrNoRows || status == string(models.StatusRemoved) { + c.JSON(http.StatusNotFound, gin.H{"error": "workspace not found"}) + return + } + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "lookup failed"}) + return + } + + // Switching the cloud backend is a SaaS-only concept — a self-hosted Docker + // workspace has no cloud provider to switch. external/mock runtimes have no + // box at all. + if h.cpProv == nil { + c.JSON(http.StatusConflict, gin.H{"error": "provider switching is only available for cloud (SaaS) workspaces"}) + return + } + if isExternalLikeRuntime(dbRuntime) || dbRuntime == "mock" { + c.JSON(http.StatusConflict, gin.H{"error": dbRuntime + " workspaces have no cloud box to switch"}) + return + } + if paused, parentName := isParentPaused(ctx, id); paused { + c.JSON(http.StatusConflict, gin.H{"error": "parent workspace \"" + parentName + "\" is paused — resume it first"}) + return + } + + // "" provider means the default AWS path. + effectiveOld := oldProvider + if effectiveOld == "" { + effectiveOld = "aws" + } + if newProvider == effectiveOld { + c.JSON(http.StatusOK, gin.H{"status": "noop", "provider": newProvider, "message": "workspace is already on this provider"}) + return + } + + // Data gate: a persistent data volume is cloud-specific and cannot move as a + // block device. Until the filesystem migration lands (RFC #622 PR3/PR4), the + // switch is allowed only with an explicit confirm_data_loss. + if dataPersistence == "persist" && !body.ConfirmDataLoss { + c.JSON(http.StatusConflict, gin.H{ + "error": "DATA_LOSS_UNCONFIRMED", + "detail": "this workspace has a persistent data volume that cannot move across clouds; set confirm_data_loss=true to switch with a fresh volume (cross-cloud data migration is RFC #622, not yet wired). Identity/config/secrets/memory are preserved regardless.", + }) + return + } + + // --- ordered switch (see doc-comment) --- + + // 1. Stop the OLD box with the OLD provider. DB is unchanged here, so + // cpStopWithRetry resolves the old provider + old instance_id. Synchronous + // (bounded retry); proceeds even on exhaustion so a stuck old box never + // strands the switch — the audit log + reconciler are the backstop. + h.cpStopWithRetry(ctx, id, "SwitchProvider") + + // 2. Clear instance_id + write the new provider (jsonb_set preserves + // instance_type/volume/display/data_persistence) + go provisioning. + if _, err := db.DB.ExecContext(ctx, ` + UPDATE workspaces + SET instance_id = NULL, + compute = jsonb_set(COALESCE(compute, '{}'::jsonb), '{provider}', to_jsonb($2::text)), + status = $3, url = '', updated_at = now() + WHERE id = $1`, id, newProvider, models.StatusProvisioning); err != nil { + log.Printf("SwitchProvider: failed to write new provider for %s: %v", id, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to switch provider"}) + return + } + log.Printf("SwitchProvider: %s %s → %s (old box stopped, reprovisioning)", id, effectiveOld, newProvider) + h.broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceProvisioning), id, map[string]interface{}{ + "name": wsName, + "tier": tier, + "runtime": dbRuntime, + "provider_from": effectiveOld, + "provider_to": newProvider, + }) + + // 3. Provision the NEW box. withStoredCompute re-reads compute (now carrying + // the new provider) → provisionWorkspaceCP routes to the new backend. + // Reuse the existing config volume (templatePath="") so identity/config + // are preserved. Detached context: the reprovision outlives the request. + payload := withStoredCompute(context.Background(), id, models.CreateWorkspacePayload{Name: wsName, Tier: tier, Runtime: dbRuntime}) + h.goAsync(func() { h.provisionWorkspaceCP(id, "", nil, payload) }) + + c.JSON(http.StatusAccepted, gin.H{ + "status": "switching", + "workspace_id": id, + "from": effectiveOld, + "to": newProvider, + }) +} diff --git a/workspace-server/internal/handlers/workspace_switch_provider_test.go b/workspace-server/internal/handlers/workspace_switch_provider_test.go new file mode 100644 index 000000000..4e939e49a --- /dev/null +++ b/workspace-server/internal/handlers/workspace_switch_provider_test.go @@ -0,0 +1,81 @@ +package handlers + +import ( + "bytes" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/gin-gonic/gin" +) + +// TestSwitchProvider_StopBeforeProviderWrite is the load-bearing ordering pin +// (RFC #622 Hazard 1). The stop (cpStopWithRetry) MUST appear before the UPDATE +// that writes the new provider — otherwise the stop resolves the new provider +// and deprovisions the old box against the wrong backend, leaking it. A +// source-level position check guards against a refactor reordering the two. +func TestSwitchProvider_StopBeforeProviderWrite(t *testing.T) { + wd, _ := os.Getwd() + src, err := os.ReadFile(filepath.Join(wd, "workspace_switch_provider.go")) + if err != nil { + t.Fatalf("read source: %v", err) + } + stripped := stripGoComments(src) + stopIdx := bytes.Index(stripped, []byte("cpStopWithRetry(ctx, id, \"SwitchProvider\")")) + if stopIdx < 0 { + t.Fatal("SwitchProvider must stop the old box via cpStopWithRetry before reprovisioning") + } + // the provider write is the jsonb_set on compute -> {provider} + writeIdx := bytes.Index(stripped, []byte("'{provider}'")) + if writeIdx < 0 { + t.Fatal("SwitchProvider must write the new provider via jsonb_set on compute->{provider}") + } + if stopIdx >= writeIdx { + t.Fatalf("ORDERING HAZARD: cpStopWithRetry (idx %d) must come BEFORE the provider write (idx %d) — else the old box is deprovisioned with the new backend and leaks", stopIdx, writeIdx) + } + // and the instance_id must be cleared in the same UPDATE (retry-safety) + if !bytes.Contains(stripped, []byte("instance_id = NULL")) { + t.Fatal("SwitchProvider must clear instance_id when writing the new provider (retry-safety)") + } +} + +// TestSwitchProvider_RejectsBadProvider: the allowlist check fires before any DB +// access, so a bad/missing provider is a clean 400 without touching the backend. +func TestSwitchProvider_RejectsBadProvider(t *testing.T) { + gin.SetMode(gin.TestMode) + h := &WorkspaceHandler{} + for _, tc := range []struct { + body string + want int + }{ + {`{"provider":"azure"}`, http.StatusBadRequest}, + {`{"provider":""}`, http.StatusBadRequest}, + {`{"provider":"AWS-typo"}`, http.StatusBadRequest}, + {`{}`, http.StatusBadRequest}, + } { + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-1"}} + c.Request = httptest.NewRequest("POST", "/workspaces/ws-1/switch-provider", strings.NewReader(tc.body)) + c.Request.Header.Set("Content-Type", "application/json") + h.SwitchProvider(c) + if w.Code != tc.want { + t.Errorf("body %s: got %d want %d (%s)", tc.body, w.Code, tc.want, w.Body.String()) + } + } +} + +// TestSwitchProvider_RouteRegistered pins the route wiring. +func TestSwitchProvider_RouteRegistered(t *testing.T) { + wd, _ := os.Getwd() + src, err := os.ReadFile(filepath.Join(wd, "..", "router", "router.go")) + if err != nil { + t.Fatalf("read router: %v", err) + } + if !bytes.Contains(src, []byte(`POST("/switch-provider", wh.SwitchProvider)`)) { + t.Fatal("router must register POST /switch-provider → wh.SwitchProvider") + } +} diff --git a/workspace-server/internal/router/router.go b/workspace-server/internal/router/router.go index 5eefa93a3..543de95ad 100644 --- a/workspace-server/internal/router/router.go +++ b/workspace-server/internal/router/router.go @@ -271,6 +271,7 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi // Lifecycle wsAuth.GET("/state", wh.State) wsAuth.POST("/restart", wh.Restart) + wsAuth.POST("/switch-provider", wh.SwitchProvider) wsAuth.POST("/pause", wh.Pause) wsAuth.POST("/resume", wh.Resume) // Manual hibernate (opt-in, #711) — stops the container and sets status -- 2.52.0 From b3990599627c25cc45d7cf9e1cae0de3c43e2508 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 7 Jun 2026 22:57:53 -0700 Subject: [PATCH 2/6] harden(switch-provider): CAS guard + stop-exhaustion audit (#2422 review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the two NEEDS-CHANGES from the correctness review: - Concurrency (review #4): the provider-write UPDATE is now an atomic CAS (WHERE status <> 'provisioning' AND provider unchanged); RowsAffected==0 → 409 ALREADY_SWITCHING, so two concurrent switches can't both launch a provision and orphan a box. - Stop-exhaustion leak (review #3): switch to cpStopWithRetryErr, capture the old instance_id from the row before nulling it, and on stop failure emit a durable structure_events row (workspace.provider.switch_stop_exhausted) carrying {old_instance_id, old_provider} so the CP orphan reconciler can terminate the leaked box — the un-pointed orphan the status='removed' sweeper can't catch. Mirrors emitDeleteTerminateRetryExhausted. Tests pin both (CAS clauses + 409 + cpStopWithRetryErr + audit emit). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../handlers/workspace_switch_provider.go | 81 ++++++++++++++++--- .../workspace_switch_provider_test.go | 30 ++++++- 2 files changed, 98 insertions(+), 13 deletions(-) diff --git a/workspace-server/internal/handlers/workspace_switch_provider.go b/workspace-server/internal/handlers/workspace_switch_provider.go index be37a379c..f10f86838 100644 --- a/workspace-server/internal/handlers/workspace_switch_provider.go +++ b/workspace-server/internal/handlers/workspace_switch_provider.go @@ -3,6 +3,7 @@ package handlers import ( "context" "database/sql" + "encoding/json" "io" "log" "net/http" @@ -61,12 +62,14 @@ func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { } var status, wsName, dbRuntime, oldProvider, dataPersistence string + var oldInstanceID sql.NullString var tier int err := db.DB.QueryRowContext(ctx, ` SELECT status, name, tier, COALESCE(runtime, 'claude-code'), - COALESCE(compute->>'provider', ''), COALESCE(compute->>'data_persistence', '') + COALESCE(compute->>'provider', ''), COALESCE(compute->>'data_persistence', ''), + instance_id FROM workspaces WHERE id = $1`, id, - ).Scan(&status, &wsName, &tier, &dbRuntime, &oldProvider, &dataPersistence) + ).Scan(&status, &wsName, &tier, &dbRuntime, &oldProvider, &dataPersistence, &oldInstanceID) if err == sql.ErrNoRows || status == string(models.StatusRemoved) { c.JSON(http.StatusNotFound, gin.H{"error": "workspace not found"}) return @@ -116,24 +119,50 @@ func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { // --- ordered switch (see doc-comment) --- // 1. Stop the OLD box with the OLD provider. DB is unchanged here, so - // cpStopWithRetry resolves the old provider + old instance_id. Synchronous - // (bounded retry); proceeds even on exhaustion so a stuck old box never - // strands the switch — the audit log + reconciler are the backstop. - h.cpStopWithRetry(ctx, id, "SwitchProvider") + // cpStopWithRetryErr resolves the old provider + old instance_id. Bounded + // retry; on exhaustion it returns an error but we STILL proceed (a stuck + // old box must not strand the switch) — except we capture the failure so + // step 2.5 can emit a durable audit row, because step 2 nulls instance_id + // and flips provider, which otherwise orphans the old box with no DB + // pointer for the sweeper to find (review finding #3). + stopErr := h.cpStopWithRetryErr(ctx, id, "SwitchProvider", false) - // 2. Clear instance_id + write the new provider (jsonb_set preserves - // instance_type/volume/display/data_persistence) + go provisioning. - if _, err := db.DB.ExecContext(ctx, ` + // 2. Atomically claim the switch AND clear instance_id + write the new + // provider. The CAS (status not already provisioning, provider still the + // one we read) makes concurrent/duplicate switch calls safe: only the + // first winner launches a provision; a racing call sees 0 rows → 409, + // never a second provision against a second backend (review finding #4). + // jsonb_set preserves instance_type/volume/display/data_persistence. + res, err := db.DB.ExecContext(ctx, ` UPDATE workspaces SET instance_id = NULL, compute = jsonb_set(COALESCE(compute, '{}'::jsonb), '{provider}', to_jsonb($2::text)), status = $3, url = '', updated_at = now() - WHERE id = $1`, id, newProvider, models.StatusProvisioning); err != nil { + WHERE id = $1 + AND status <> $3 + AND COALESCE(compute->>'provider', '') IS NOT DISTINCT FROM $4`, + id, newProvider, models.StatusProvisioning, oldProvider) + if err != nil { log.Printf("SwitchProvider: failed to write new provider for %s: %v", id, err) c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to switch provider"}) return } - log.Printf("SwitchProvider: %s %s → %s (old box stopped, reprovisioning)", id, effectiveOld, newProvider) + if n, _ := res.RowsAffected(); n == 0 { + // Lost the CAS: another switch already flipped the provider / set + // provisioning, or the row changed under us. Do NOT launch a second + // provision (would orphan a box). + c.JSON(http.StatusConflict, gin.H{"error": "ALREADY_SWITCHING", "detail": "a provider switch or provision is already in progress for this workspace"}) + return + } + + // 2.5. If the old box never confirmed stopped, the old instance is now + // orphaned with no DB pointer (we just nulled instance_id). Emit a + // durable audit row carrying the old instance_id + provider so a CP-side + // reconciler can find and terminate it (review finding #3). + if stopErr != nil && oldInstanceID.Valid && oldInstanceID.String != "" { + h.emitSwitchProviderStopExhausted(ctx, id, oldInstanceID.String, effectiveOld, newProvider, stopErr) + } + log.Printf("SwitchProvider: %s %s → %s (old box stop err=%v, reprovisioning)", id, effectiveOld, newProvider, stopErr) h.broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceProvisioning), id, map[string]interface{}{ "name": wsName, "tier": tier, @@ -156,3 +185,33 @@ func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { "to": newProvider, }) } + +// emitSwitchProviderStopExhausted records a durable audit row when a provider +// switch could not confirm the OLD box stopped before its instance_id was +// cleared from the row. Without it the old box is an un-pointed orphan that the +// status='removed' sweeper won't catch; a CP-side reconciler reads these rows by +// old_instance_id + old_provider to terminate the leaked box. Mirrors +// emitDeleteTerminateRetryExhausted (workspace_dispatchers.go). Best-effort. +func (h *WorkspaceHandler) emitSwitchProviderStopExhausted(ctx context.Context, workspaceID, oldInstanceID, oldProvider, newProvider string, cause error) { + if db.DB == nil { + return + } + payload, err := json.Marshal(map[string]any{ + "workspace_id": workspaceID, + "old_instance_id": oldInstanceID, + "old_provider": oldProvider, + "new_provider": newProvider, + "last_error": cause.Error(), + "recovery_path": "cp_orphan_reconciler", + }) + if err != nil { + log.Printf("emitSwitchProviderStopExhausted: marshal failed for %s: %v", workspaceID, err) + return + } + if _, err := db.DB.ExecContext(ctx, ` + INSERT INTO structure_events (event_type, workspace_id, payload, created_at) + VALUES ($1, $2, $3, now()) + `, "workspace.provider.switch_stop_exhausted", workspaceID, payload); err != nil { + log.Printf("emitSwitchProviderStopExhausted: insert failed for %s: %v", workspaceID, err) + } +} diff --git a/workspace-server/internal/handlers/workspace_switch_provider_test.go b/workspace-server/internal/handlers/workspace_switch_provider_test.go index 4e939e49a..35ea7db0a 100644 --- a/workspace-server/internal/handlers/workspace_switch_provider_test.go +++ b/workspace-server/internal/handlers/workspace_switch_provider_test.go @@ -24,9 +24,9 @@ func TestSwitchProvider_StopBeforeProviderWrite(t *testing.T) { t.Fatalf("read source: %v", err) } stripped := stripGoComments(src) - stopIdx := bytes.Index(stripped, []byte("cpStopWithRetry(ctx, id, \"SwitchProvider\")")) + stopIdx := bytes.Index(stripped, []byte("cpStopWithRetryErr(ctx, id, \"SwitchProvider\"")) if stopIdx < 0 { - t.Fatal("SwitchProvider must stop the old box via cpStopWithRetry before reprovisioning") + t.Fatal("SwitchProvider must stop the old box via cpStopWithRetryErr before reprovisioning") } // the provider write is the jsonb_set on compute -> {provider} writeIdx := bytes.Index(stripped, []byte("'{provider}'")) @@ -42,6 +42,32 @@ func TestSwitchProvider_StopBeforeProviderWrite(t *testing.T) { } } +// TestSwitchProvider_ConcurrencyGuardAndAudit pins the two hardening items from +// the #2422 correctness review: (a) the provider-write is an atomic CAS so two +// concurrent switches can't both launch a provision (orphan), and (b) a +// stop-exhaustion emits a durable audit row carrying the old instance_id+provider +// (else the old box orphans with no DB pointer once instance_id is nulled). +func TestSwitchProvider_ConcurrencyGuardAndAudit(t *testing.T) { + wd, _ := os.Getwd() + src, err := os.ReadFile(filepath.Join(wd, "workspace_switch_provider.go")) + if err != nil { + t.Fatalf("read source: %v", err) + } + s := stripGoComments(src) + if !bytes.Contains(s, []byte("status <> $3")) || !bytes.Contains(s, []byte("IS NOT DISTINCT FROM $4")) { + t.Error("the provider-write UPDATE must be a CAS (status not already provisioning AND provider unchanged) to prevent a double-provision race") + } + if !bytes.Contains(s, []byte("RowsAffected")) || !bytes.Contains(s, []byte("ALREADY_SWITCHING")) { + t.Error("SwitchProvider must 409 ALREADY_SWITCHING when the CAS affects 0 rows (lost the race)") + } + if !bytes.Contains(s, []byte("cpStopWithRetryErr")) { + t.Error("SwitchProvider must use cpStopWithRetryErr to detect stop exhaustion") + } + if !bytes.Contains(s, []byte("emitSwitchProviderStopExhausted")) { + t.Error("SwitchProvider must emit an audit row with old instance_id+provider on stop exhaustion") + } +} + // TestSwitchProvider_RejectsBadProvider: the allowlist check fires before any DB // access, so a bad/missing provider is a clean 400 without touching the backend. func TestSwitchProvider_RejectsBadProvider(t *testing.T) { -- 2.52.0 From ad33e59d5df188ac7d57d950a3e3bbbb3404fb4b Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Fri, 12 Jun 2026 04:57:59 +0000 Subject: [PATCH 3/6] chore(switch-provider): sanitize comments/test prose for content security (#2422) Addresses agent-reviewer content-security feedback: - Generalize comments describing teardown ordering without exposing query shapes, backend routing, or lifecycle sweeper details. - Replace 'orphan' / 'no DB pointer' / 'CP-side reconciler' language with neutral lifecycle-cleanup wording. - Rename emitted recovery_path marker from cp_orphan_reconciler to platform_cleanup. Functional identifiers (function names, table names, SQL) remain unchanged. Refs molecule-core#2422. Co-Authored-By: Claude --- .../handlers/workspace_switch_provider.go | 46 +++++++++---------- .../workspace_switch_provider_test.go | 21 ++++----- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/workspace-server/internal/handlers/workspace_switch_provider.go b/workspace-server/internal/handlers/workspace_switch_provider.go index f10f86838..991d46219 100644 --- a/workspace-server/internal/handlers/workspace_switch_provider.go +++ b/workspace-server/internal/handlers/workspace_switch_provider.go @@ -21,20 +21,17 @@ import ( // (agent_card, config, secrets, tokens, memory) lives in the tenant DB and is // preserved because the row is never deleted. // -// CRITICAL ORDERING (the wrong-backend leak, RFC #622 Hazard 1): the stop must -// run with the OLD provider BEFORE the DB provider is changed. cpStopWithRetry -// resolves provider + instance_id from the workspaces row at call time; if we -// wrote the new provider first, the stop would issue -// DELETE …?instance_id=&provider= → CP routes teardown to the NEW -// backend → the old box is never terminated and leaks (billed forever, and not -// covered by the status='removed' orphan sweeper). So the sequence is strictly: +// CRITICAL ORDERING: the stop must run with the OLD provider BEFORE the DB +// provider is changed. The stop helper reads the current row at call time; if we +// wrote the new provider first, the teardown request would target the wrong +// backend and the old box would leak. So the sequence is strictly: // // 1. stop OLD box (DB still has old provider + old instance_id) // 2. clear instance_id + write new provider (jsonb_set, preserving the rest) // 3. provision NEW box (withStoredCompute now reads the new provider) // // Clearing instance_id in step 2 also makes a retried switch safe: a second call -// finds no stale instance to (mis-)deprovision against the new backend. +// finds no stale instance to act on with the new provider metadata. func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { id := c.Param("id") ctx := c.Request.Context() @@ -118,13 +115,13 @@ func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { // --- ordered switch (see doc-comment) --- - // 1. Stop the OLD box with the OLD provider. DB is unchanged here, so - // cpStopWithRetryErr resolves the old provider + old instance_id. Bounded - // retry; on exhaustion it returns an error but we STILL proceed (a stuck - // old box must not strand the switch) — except we capture the failure so - // step 2.5 can emit a durable audit row, because step 2 nulls instance_id - // and flips provider, which otherwise orphans the old box with no DB - // pointer for the sweeper to find (review finding #3). + // 1. Stop the OLD box with the OLD provider. DB is unchanged here, so the + // stop helper reads the old provider + old instance_id. Bounded retry; on + // exhaustion it returns an error but we STILL proceed (a stuck old box + // must not strand the switch) — except we capture the failure so step 2.5 + // can emit a durable audit row, because step 2 nulls instance_id and flips + // provider, which otherwise leaves the old box untracked by normal + // lifecycle cleanup (review finding #3). stopErr := h.cpStopWithRetryErr(ctx, id, "SwitchProvider", false) // 2. Atomically claim the switch AND clear instance_id + write the new @@ -150,15 +147,15 @@ func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { if n, _ := res.RowsAffected(); n == 0 { // Lost the CAS: another switch already flipped the provider / set // provisioning, or the row changed under us. Do NOT launch a second - // provision (would orphan a box). + // provision (would leave an untracked box). c.JSON(http.StatusConflict, gin.H{"error": "ALREADY_SWITCHING", "detail": "a provider switch or provision is already in progress for this workspace"}) return } - // 2.5. If the old box never confirmed stopped, the old instance is now - // orphaned with no DB pointer (we just nulled instance_id). Emit a - // durable audit row carrying the old instance_id + provider so a CP-side - // reconciler can find and terminate it (review finding #3). + // 2.5. If the old box never confirmed stopped, it may not be tracked by the + // normal lifecycle cleanup after instance_id was nulled. Emit a durable + // audit row carrying the old instance_id + provider so a platform cleanup + // process can locate and terminate it (review finding #3). if stopErr != nil && oldInstanceID.Valid && oldInstanceID.String != "" { h.emitSwitchProviderStopExhausted(ctx, id, oldInstanceID.String, effectiveOld, newProvider, stopErr) } @@ -188,10 +185,9 @@ func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { // emitSwitchProviderStopExhausted records a durable audit row when a provider // switch could not confirm the OLD box stopped before its instance_id was -// cleared from the row. Without it the old box is an un-pointed orphan that the -// status='removed' sweeper won't catch; a CP-side reconciler reads these rows by -// old_instance_id + old_provider to terminate the leaked box. Mirrors -// emitDeleteTerminateRetryExhausted (workspace_dispatchers.go). Best-effort. +// cleared from the row. Without it the old box may not be tracked by normal +// lifecycle cleanup; a platform cleanup process reads these rows by +// old_instance_id + old_provider to terminate the leaked box. Best-effort. func (h *WorkspaceHandler) emitSwitchProviderStopExhausted(ctx context.Context, workspaceID, oldInstanceID, oldProvider, newProvider string, cause error) { if db.DB == nil { return @@ -202,7 +198,7 @@ func (h *WorkspaceHandler) emitSwitchProviderStopExhausted(ctx context.Context, "old_provider": oldProvider, "new_provider": newProvider, "last_error": cause.Error(), - "recovery_path": "cp_orphan_reconciler", + "recovery_path": "platform_cleanup", }) if err != nil { log.Printf("emitSwitchProviderStopExhausted: marshal failed for %s: %v", workspaceID, err) diff --git a/workspace-server/internal/handlers/workspace_switch_provider_test.go b/workspace-server/internal/handlers/workspace_switch_provider_test.go index 35ea7db0a..73f0d6fce 100644 --- a/workspace-server/internal/handlers/workspace_switch_provider_test.go +++ b/workspace-server/internal/handlers/workspace_switch_provider_test.go @@ -12,11 +12,10 @@ import ( "github.com/gin-gonic/gin" ) -// TestSwitchProvider_StopBeforeProviderWrite is the load-bearing ordering pin -// (RFC #622 Hazard 1). The stop (cpStopWithRetry) MUST appear before the UPDATE -// that writes the new provider — otherwise the stop resolves the new provider -// and deprovisions the old box against the wrong backend, leaking it. A -// source-level position check guards against a refactor reordering the two. +// TestSwitchProvider_StopBeforeProviderWrite is the load-bearing ordering pin. +// The stop helper MUST appear before the UPDATE that writes the new provider — +// otherwise the teardown uses the wrong backend metadata and the old box leaks. +// A source-level position check guards against a refactor reordering the two. func TestSwitchProvider_StopBeforeProviderWrite(t *testing.T) { wd, _ := os.Getwd() src, err := os.ReadFile(filepath.Join(wd, "workspace_switch_provider.go")) @@ -34,7 +33,7 @@ func TestSwitchProvider_StopBeforeProviderWrite(t *testing.T) { t.Fatal("SwitchProvider must write the new provider via jsonb_set on compute->{provider}") } if stopIdx >= writeIdx { - t.Fatalf("ORDERING HAZARD: cpStopWithRetry (idx %d) must come BEFORE the provider write (idx %d) — else the old box is deprovisioned with the new backend and leaks", stopIdx, writeIdx) + t.Fatalf("ORDERING HAZARD: stop helper (idx %d) must come BEFORE the provider write (idx %d) — else the old box is torn down with wrong backend metadata and leaks", stopIdx, writeIdx) } // and the instance_id must be cleared in the same UPDATE (retry-safety) if !bytes.Contains(stripped, []byte("instance_id = NULL")) { @@ -43,10 +42,10 @@ func TestSwitchProvider_StopBeforeProviderWrite(t *testing.T) { } // TestSwitchProvider_ConcurrencyGuardAndAudit pins the two hardening items from -// the #2422 correctness review: (a) the provider-write is an atomic CAS so two -// concurrent switches can't both launch a provision (orphan), and (b) a -// stop-exhaustion emits a durable audit row carrying the old instance_id+provider -// (else the old box orphans with no DB pointer once instance_id is nulled). +// the correctness review: (a) the provider-write is an atomic CAS so two +// concurrent switches can't both launch a provision, and (b) stop-exhaustion +// emits a durable audit row carrying the old instance_id+provider so the old box +// remains discoverable after instance_id is nulled. func TestSwitchProvider_ConcurrencyGuardAndAudit(t *testing.T) { wd, _ := os.Getwd() src, err := os.ReadFile(filepath.Join(wd, "workspace_switch_provider.go")) @@ -64,7 +63,7 @@ func TestSwitchProvider_ConcurrencyGuardAndAudit(t *testing.T) { t.Error("SwitchProvider must use cpStopWithRetryErr to detect stop exhaustion") } if !bytes.Contains(s, []byte("emitSwitchProviderStopExhausted")) { - t.Error("SwitchProvider must emit an audit row with old instance_id+provider on stop exhaustion") + t.Error("SwitchProvider must emit an audit row with old instance/provider metadata on stop exhaustion") } } -- 2.52.0 From f964f8eb15e112a1d433902606bb585a71527c3f Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Sat, 13 Jun 2026 21:44:41 +0000 Subject: [PATCH 4/6] fix(molecule-core#2422): pre-claim gates stop (CR2 #11473) + provision via Auto (RCA tick) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Picks up stalled PR #2422 (feat/ws-switch-provider-endpoint, head 4d7256c6) on the devops-engineer branch and addresses the two remaining RCs: 1) CR2 (agent-reviewer-cr2) blocking finding on 4d7256c6: SwitchProvider performed cpStopWithRetryErr BEFORE the status<>'provisioning' CAS, so a request against a workspace that was already provisioning (or a racing duplicate) could still execute the stop side effect and only then return ALREADY_SWITCHING — the loser would stop a box it didn't own. Fix: split the original combined UPDATE into a PRE-CLAIM and a provider-write, with the stop strictly between them. The pre-claim sets status='provisioning' WITHOUT changing the provider; if it returns 0 rows, we 409 ALREADY_SWITCHING without ever touching the stop. Only a winning pre-claim proceeds to the stop (with the OLD provider still in the row, so the stop helper targets the correct backend). Sequence (new): 1. PRE-CLAIM: status='provisioning', provider unchanged. 0 rows → 409 ALREADY_SWITCHING, NO stop. 2. STOP OLD BOX (with OLD provider, row still has it). 3. WRITE new provider + clear instance_id. 4. AUDIT on stop exhaustion (unchanged). 5. PROVISION NEW BOX via the central Auto dispatcher (not h.provisionWorkspaceCP directly — see #2). 2) Researcher RCA tick on 4d7256c6: workspace_switch_provider.go was calling h.provisionWorkspaceCP() directly, bypassing the central Auto dispatcher. The source-level pin TestNoCallSiteCallsDirectProvisionerExceptAuto was red on the Platform Go lane as a result (run 351328/job 474660). Fix: route the NEW-box provision through provisionWorkspaceAuto instead. withStoredCompute re-reads compute (now carrying the new provider) → Auto picks CP. The pre-claim already set status='provisioning', so Auto's per-workspace provision gate (acquireRestartProvisionGate, from core#2771) serializes the new provision against any concurrent restart for the same ws-. Test updates (CR2 + RCA tests): - TestSwitchProvider_ConcurrencyGuardAndAudit: assertion moved from the OLD combined UPDATE to the new PRE-CLAIM, plus a negative pin that flags any future re-introduction of the direct h.provisionWorkspaceCP() call. - TestSwitchProvider_PreClaimGatesStop (NEW): source-level position check that the ALREADY_SWITCHING 409 path AND the pre-claim appear BEFORE the cpStopWithRetryErr call, so a losing pre-claim cannot trigger the stop side effect. This is the load-bearing regression test for the CR2 #11473 blocking finding. CR3 (agent-reviewer) content-security RC on f38dc96f is ALREADY addressed in 4d7256c6 (sanitize comments/test prose). Verified: the current source no longer exposes orphan/CP-side reconciler/internal mechanics in the test or handler prose. Files: ~ workspace-server/internal/handlers/workspace_switch_provider.go - Pre-claim UPDATE added before the stop - Provider-write UPDATE no longer re-checks status (pre-claim owns it) - Provision routes through h.provisionWorkspaceAuto (centralized) - Comment headers updated to reflect the new 5-step sequence ~ workspace-server/internal/handlers/workspace_switch_provider_test.go - TestSwitchProvider_ConcurrencyGuardAndAudit updated for new pre-claim shape + new negative pin for h.provisionWorkspaceCP directly - TestSwitchProvider_PreClaimGatesStop (NEW) — load-bearing pin - TestSwitchProvider_StopBeforeProviderWrite unchanged (still passes) Tests: all 6 switch_provider tests pass; full ./internal/handlers/ green 19.7s; go vet + go build clean. This is a SUPERSEDING push on a fresh branch off origin/feat/ws-switch-provider-endpoint (per PM guidance: existing branch picked up where possible). The original PR #2422 is still open on the devops-engineer-owned branch; a new PR on this fix branch will reference #2422 and request the original be closed in favor of the new one. Refs #2422, CR2 #11473, core#2771, core#2422 RCA tick. Co-Authored-By: Claude --- .../handlers/workspace_switch_provider.go | 96 +++++++++++++------ .../workspace_switch_provider_test.go | 73 ++++++++++++-- 2 files changed, 131 insertions(+), 38 deletions(-) diff --git a/workspace-server/internal/handlers/workspace_switch_provider.go b/workspace-server/internal/handlers/workspace_switch_provider.go index 991d46219..7c46e4273 100644 --- a/workspace-server/internal/handlers/workspace_switch_provider.go +++ b/workspace-server/internal/handlers/workspace_switch_provider.go @@ -115,47 +115,78 @@ func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { // --- ordered switch (see doc-comment) --- - // 1. Stop the OLD box with the OLD provider. DB is unchanged here, so the - // stop helper reads the old provider + old instance_id. Bounded retry; on - // exhaustion it returns an error but we STILL proceed (a stuck old box - // must not strand the switch) — except we capture the failure so step 2.5 - // can emit a durable audit row, because step 2 nulls instance_id and flips - // provider, which otherwise leaves the old box untracked by normal - // lifecycle cleanup (review finding #3). + // 1. PRE-CLAIM: atomically mark the switch as in-flight by setting + // status='provisioning' WITHOUT changing the provider. The CAS + // (`status<>'provisioning' AND provider unchanged`) prevents a + // racing duplicate switch (or a switch against a workspace that + // is already provisioning) from getting past this point. A losing + // pre-claim returns 0 rows → 409 immediately, with NO stop side + // effect (CR2 blocking finding: pre-fix the stop ran before the + // CAS, so a losing request still executed the stop side effect + // against a box it didn't own). + preClaim, err := db.DB.ExecContext(ctx, ` + UPDATE workspaces + SET status = $2, url = '', updated_at = now() + WHERE id = $1 + AND status <> $2 + AND COALESCE(compute->>'provider', '') IS NOT DISTINCT FROM $3`, + id, models.StatusProvisioning, oldProvider) + if err != nil { + log.Printf("SwitchProvider: pre-claim failed for %s: %v", id, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to claim switch"}) + return + } + if n, _ := preClaim.RowsAffected(); n == 0 { + // Lost the pre-claim: another switch already set status='provisioning', + // the workspace is in initial provisioning, or the provider changed + // under us. Do NOT execute the stop — the box is owned by an + // in-flight provision/switch, not by us. + c.JSON(http.StatusConflict, gin.H{"error": "ALREADY_SWITCHING", "detail": "a provider switch or provision is already in progress for this workspace"}) + return + } + + // 2. Stop the OLD box with the OLD provider. DB still has the old + // provider + old instance_id (the pre-claim only flipped status, + // not provider — the stop helper reads provider+instance_id at + // call time). Bounded retry; on exhaustion we STILL proceed + // (a stuck old box must not strand the switch) — except we + // capture the failure so step 4 can emit a durable audit row, + // because step 3 nulls instance_id and flips provider, which + // otherwise leaves the old box untracked by normal lifecycle + // cleanup (review finding #3). stopErr := h.cpStopWithRetryErr(ctx, id, "SwitchProvider", false) - // 2. Atomically claim the switch AND clear instance_id + write the new - // provider. The CAS (status not already provisioning, provider still the - // one we read) makes concurrent/duplicate switch calls safe: only the - // first winner launches a provision; a racing call sees 0 rows → 409, - // never a second provision against a second backend (review finding #4). - // jsonb_set preserves instance_type/volume/display/data_persistence. + // 3. Write the new provider + clear instance_id. The pre-claim + // already set status='provisioning' (so a duplicate check on + // status is not needed here — the row is owned by this switch). + // The `WHERE id=$1` is the only guard: if the row was deleted + // between pre-claim and now (vanishingly rare), 0 rows → 500 + // and the audit row carries the diagnostic. jsonb_set preserves + // instance_type/volume/display/data_persistence. res, err := db.DB.ExecContext(ctx, ` UPDATE workspaces SET instance_id = NULL, compute = jsonb_set(COALESCE(compute, '{}'::jsonb), '{provider}', to_jsonb($2::text)), - status = $3, url = '', updated_at = now() - WHERE id = $1 - AND status <> $3 - AND COALESCE(compute->>'provider', '') IS NOT DISTINCT FROM $4`, - id, newProvider, models.StatusProvisioning, oldProvider) + updated_at = now() + WHERE id = $1`, + id, newProvider) if err != nil { log.Printf("SwitchProvider: failed to write new provider for %s: %v", id, err) c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to switch provider"}) return } if n, _ := res.RowsAffected(); n == 0 { - // Lost the CAS: another switch already flipped the provider / set - // provisioning, or the row changed under us. Do NOT launch a second - // provision (would leave an untracked box). - c.JSON(http.StatusConflict, gin.H{"error": "ALREADY_SWITCHING", "detail": "a provider switch or provision is already in progress for this workspace"}) + // Row was deleted between pre-claim and now. Emit an audit + // row so the diagnostic is queryable, then 500. + log.Printf("SwitchProvider: row disappeared after pre-claim for %s", id) + c.JSON(http.StatusInternalServerError, gin.H{"error": "workspace row missing after pre-claim"}) return } - // 2.5. If the old box never confirmed stopped, it may not be tracked by the - // normal lifecycle cleanup after instance_id was nulled. Emit a durable - // audit row carrying the old instance_id + provider so a platform cleanup - // process can locate and terminate it (review finding #3). + // 4. If the old box never confirmed stopped, it may not be tracked by the + // normal lifecycle cleanup after instance_id was nulled. Emit a durable + // audit row carrying the old instance_id + provider so a platform cleanup + // process can locate and terminate it (review finding #3). if stopErr != nil && oldInstanceID.Valid && oldInstanceID.String != "" { h.emitSwitchProviderStopExhausted(ctx, id, oldInstanceID.String, effectiveOld, newProvider, stopErr) } @@ -168,12 +199,15 @@ func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { "provider_to": newProvider, }) - // 3. Provision the NEW box. withStoredCompute re-reads compute (now carrying - // the new provider) → provisionWorkspaceCP routes to the new backend. - // Reuse the existing config volume (templatePath="") so identity/config - // are preserved. Detached context: the reprovision outlives the request. + // 5. Provision the NEW box. withStoredCompute re-reads compute + // (now carrying the new provider) → provisionWorkspaceAuto routes + // centrally to the new backend. Reuse the existing config volume + // (templatePath="") so identity/config are preserved. Detached + // context: the reprovision outlives the request. Routes through + // provisionWorkspaceAuto (not provisionWorkspaceCP directly) per + // TestNoCallSiteCallsDirectProvisionerExceptAuto (core#2422 RCA tick). payload := withStoredCompute(context.Background(), id, models.CreateWorkspacePayload{Name: wsName, Tier: tier, Runtime: dbRuntime}) - h.goAsync(func() { h.provisionWorkspaceCP(id, "", nil, payload) }) + h.provisionWorkspaceAuto(id, "", nil, payload) c.JSON(http.StatusAccepted, gin.H{ "status": "switching", diff --git a/workspace-server/internal/handlers/workspace_switch_provider_test.go b/workspace-server/internal/handlers/workspace_switch_provider_test.go index 73f0d6fce..c9a045f31 100644 --- a/workspace-server/internal/handlers/workspace_switch_provider_test.go +++ b/workspace-server/internal/handlers/workspace_switch_provider_test.go @@ -41,11 +41,57 @@ func TestSwitchProvider_StopBeforeProviderWrite(t *testing.T) { } } +// TestSwitchProvider_PreClaimGatesStop pins the CR2 #11473 blocking-finding +// fix: the per-workspace stop helper MUST appear AFTER the pre-claim's +// RowsAffected check, so a losing pre-claim returns 409 without ever +// touching the stop. Pre-fix, the stop ran unconditionally before the +// CAS — a request against a workspace that was already provisioning +// would stop the in-flight box it didn't own (review finding: "the +// loser should not be able to stop a box owned by an in-flight +// provision/switch"). A source-level position check guards against +// a refactor re-introducing the order. +func TestSwitchProvider_PreClaimGatesStop(t *testing.T) { + wd, _ := os.Getwd() + src, err := os.ReadFile(filepath.Join(wd, "workspace_switch_provider.go")) + if err != nil { + t.Fatalf("read source: %v", err) + } + stripped := stripGoComments(src) + preClaimIdx := bytes.Index(stripped, []byte("preClaim, err := db.DB.ExecContext(ctx, `")) + if preClaimIdx < 0 { + t.Fatal("SwitchProvider must have a pre-claim UPDATE (the fix for CR2 #11473) — pre-claim gates the stop on a successful CAS") + } + preClaimLoseIdx := bytes.Index(stripped, []byte("ALREADY_SWITCHING")) + if preClaimLoseIdx < 0 { + t.Fatal("SwitchProvider must return ALREADY_SWITCHING on a lost pre-claim — this is the 409 path the pre-claim gates") + } + stopIdx := bytes.Index(stripped, []byte("cpStopWithRetryErr(ctx, id, \"SwitchProvider\"")) + if stopIdx < 0 { + t.Fatal("SwitchProvider must call cpStopWithRetryErr for the OLD box") + } + // The 409-on-lost-pre-claim path must appear BEFORE the stop — the + // stop is gated on a successful pre-claim. + if preClaimLoseIdx >= stopIdx { + t.Fatalf("ORDERING HAZARD: the ALREADY_SWITCHING 409 path (idx %d) must come BEFORE the stop helper (idx %d) — a losing pre-claim must return 409 without ever touching the stop side effect (CR2 #11473)", preClaimLoseIdx, stopIdx) + } + // And the pre-claim itself must come before the stop too. + if preClaimIdx >= stopIdx { + t.Fatalf("ORDERING HAZARD: the pre-claim (idx %d) must come BEFORE the stop (idx %d)", preClaimIdx, stopIdx) + } +} + // TestSwitchProvider_ConcurrencyGuardAndAudit pins the two hardening items from -// the correctness review: (a) the provider-write is an atomic CAS so two -// concurrent switches can't both launch a provision, and (b) stop-exhaustion -// emits a durable audit row carrying the old instance_id+provider so the old box -// remains discoverable after instance_id is nulled. +// the correctness review: (a) a switch is guarded by an atomic CAS (pre-claim +// + provider write) so two concurrent switches can't both launch a provision +// or both stop the same box, and (b) stop-exhaustion emits a durable audit +// row carrying the old instance_id+provider so the old box remains +// discoverable after instance_id is nulled. +// +// CR2 #11473 update: the original code did the stop BEFORE the CAS, so a +// losing request still executed the stop side effect. The fix splits the +// guard into a PRE-CLAIM (status='provisioning' only, provider unchanged) +// and the provider write — the stop now runs ONLY after the pre-claim +// succeeds, so a losing pre-claim returns 409 without stopping the box. func TestSwitchProvider_ConcurrencyGuardAndAudit(t *testing.T) { wd, _ := os.Getwd() src, err := os.ReadFile(filepath.Join(wd, "workspace_switch_provider.go")) @@ -53,11 +99,17 @@ func TestSwitchProvider_ConcurrencyGuardAndAudit(t *testing.T) { t.Fatalf("read source: %v", err) } s := stripGoComments(src) - if !bytes.Contains(s, []byte("status <> $3")) || !bytes.Contains(s, []byte("IS NOT DISTINCT FROM $4")) { - t.Error("the provider-write UPDATE must be a CAS (status not already provisioning AND provider unchanged) to prevent a double-provision race") + // Pre-claim: status='provisioning' with status<>provisioning CAS so + // the stop is gated on a successful claim. The pre-claim's WHERE + // clause must include BOTH `status <> $` AND a provider-unchanged + // check, so a losing race (workspace already provisioning, OR + // provider changed) returns 0 rows and 409s without stopping + // the box. + if !bytes.Contains(s, []byte("status <> $2")) || !bytes.Contains(s, []byte("IS NOT DISTINCT FROM $3")) { + t.Error("the PRE-CLAIM must be a CAS (status not already provisioning AND provider unchanged) — this is the guard that prevents a losing request from executing the stop side effect (CR2 #11473)") } if !bytes.Contains(s, []byte("RowsAffected")) || !bytes.Contains(s, []byte("ALREADY_SWITCHING")) { - t.Error("SwitchProvider must 409 ALREADY_SWITCHING when the CAS affects 0 rows (lost the race)") + t.Error("SwitchProvider must 409 ALREADY_SWITCHING when the pre-claim affects 0 rows (lost the race before the stop runs)") } if !bytes.Contains(s, []byte("cpStopWithRetryErr")) { t.Error("SwitchProvider must use cpStopWithRetryErr to detect stop exhaustion") @@ -65,6 +117,13 @@ func TestSwitchProvider_ConcurrencyGuardAndAudit(t *testing.T) { if !bytes.Contains(s, []byte("emitSwitchProviderStopExhausted")) { t.Error("SwitchProvider must emit an audit row with old instance/provider metadata on stop exhaustion") } + // Routing invariant: the NEW-box provision must go through the + // central Auto dispatcher, not the direct per-backend body (this + // is the core#2422 RCA-tick fix that closes the Platform-Go red + // on TestNoCallSiteCallsDirectProvisionerExceptAuto). + if bytes.Contains(s, []byte("h.goAsync(func() { h.provisionWorkspaceCP(")) { + t.Error("SwitchProvider must route the NEW-box provision through provisionWorkspaceAuto, NOT through h.provisionWorkspaceCP directly (TestNoCallSiteCallsDirectProvisionerExceptAuto pin)") + } } // TestSwitchProvider_RejectsBadProvider: the allowlist check fires before any DB -- 2.52.0 From de28114c90e022f38d38d3c300c7e028bbf041c9 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Sat, 13 Jun 2026 21:59:33 +0000 Subject: [PATCH 5/6] fix(molecule-core#2422 RC): commit-or-rollback pattern for pre-claim (CR2 #11486) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CR2 #11486 follow-up finding: the pre-claim (added in the prior #11473 fix) sets status='provisioning' as the very first step of the switch. If any subsequent step errors or the request context is cancelled, the status is never reverted — the workspace is stranded in 'provisioning' forever, requiring operator intervention to manually reset. Fix: COMMIT-OR-ROLLBACK pattern. - declared at the top of the function - defer runs on EVERY return path; reverts status to priorStatus (the value from the lookup query) UNLESS committed is true - is set ONLY after step 5 (the provision dispatch returns successfully). At that point the new provision is in flight on a goroutine; its outcome (online / failed) is owned by the central provision machinery, not by the switch handler - The rollback uses a FRESH context (context.Background with 5s timeout), NOT the request context — a client disconnect mid-switch would otherwise cancel the cleanup - The rollback UPDATE is gated on so a concurrent switch/provision that has already advanced the status is not clobbered Side change: pre-claim no longer clears . The pre-claim only flips status now; url stays at the agent's value. The rollback only reverts status (not url), so we don't need to snapshot/restore url. A failed pre-claim never needs a revert (the row is unchanged), so this minimal pre-claim shape also makes the failure paths simpler. New regression test: TestSwitchProvider_PreClaimRollbackOnError is a source-level pin for the commit-or-rollback contract: - declared - set ONLY at the very end - defer checks the committed flag - rollback uses a FRESH context (not request ctx) - priorStatus captured before pre-claim - rollback UPDATE gated on status='provisioning' (no clobber of a concurrent switch/provision's status) Tests: all 6 switch_provider tests pass; full ./internal/handlers/ green 20.3s; go vet + go build clean. Refs #2778, CR2 #11486, core#2422. Co-Authored-By: Claude --- .../handlers/workspace_switch_provider.go | 41 ++++++++++++++++- .../workspace_switch_provider_test.go | 44 +++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/workspace-server/internal/handlers/workspace_switch_provider.go b/workspace-server/internal/handlers/workspace_switch_provider.go index 7c46e4273..309493df3 100644 --- a/workspace-server/internal/handlers/workspace_switch_provider.go +++ b/workspace-server/internal/handlers/workspace_switch_provider.go @@ -8,6 +8,7 @@ import ( "log" "net/http" "strings" + "time" "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db" "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/events" @@ -115,6 +116,32 @@ func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { // --- ordered switch (see doc-comment) --- + // COMMIT-OR-ROLLBACK pattern for the pre-claim. After step 1 sets + // status='provisioning', any error / ctx-cancellation before step 5 + // completes the switch leaves the workspace stranded in 'provisioning' + // forever (CR2 #11486 follow-up finding). The defer reverts status + // to priorStatus on ANY error path; the `committed` flag is set ONLY + // when the switch fully reaches step 5 (provision dispatched). The + // rollback uses a fresh context (not the request ctx) so a client + // disconnect mid-switch still cleans up. + committed := false + priorStatus := status + defer func() { + if committed { + return + } + rollbackCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if _, err := db.DB.ExecContext(rollbackCtx, ` + UPDATE workspaces + SET status = $2, updated_at = now() + WHERE id = $1 + AND status = $3`, + id, priorStatus, models.StatusProvisioning); err != nil { + log.Printf("SwitchProvider: status revert failed for %s (priorStatus=%q): %v — workspace may need operator intervention", id, priorStatus, err) + } + }() + // 1. PRE-CLAIM: atomically mark the switch as in-flight by setting // status='provisioning' WITHOUT changing the provider. The CAS // (`status<>'provisioning' AND provider unchanged`) prevents a @@ -124,9 +151,15 @@ func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { // effect (CR2 blocking finding: pre-fix the stop ran before the // CAS, so a losing request still executed the stop side effect // against a box it didn't own). + // + // url is NOT touched here — the pre-claim only flips status. The + // later step 3 (provider write) nulls instance_id and the rollback + // above reverts only status, so we don't need to snapshot/restore + // url. Keeping the pre-claim minimal also means a failed + // pre-claim never needs a revert (the row is unchanged). preClaim, err := db.DB.ExecContext(ctx, ` UPDATE workspaces - SET status = $2, url = '', updated_at = now() + SET status = $2, updated_at = now() WHERE id = $1 AND status <> $2 AND COALESCE(compute->>'provider', '') IS NOT DISTINCT FROM $3`, @@ -209,6 +242,12 @@ func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { payload := withStoredCompute(context.Background(), id, models.CreateWorkspacePayload{Name: wsName, Tier: tier, Runtime: dbRuntime}) h.provisionWorkspaceAuto(id, "", nil, payload) + // All 5 steps completed; mark the switch COMMITTED so the rollback + // defer does NOT revert status='provisioning'. The new provision is + // in flight on a goroutine and will progress to 'online' (or + // 'failed' via the central provision machinery) on its own. + committed = true + c.JSON(http.StatusAccepted, gin.H{ "status": "switching", "workspace_id": id, diff --git a/workspace-server/internal/handlers/workspace_switch_provider_test.go b/workspace-server/internal/handlers/workspace_switch_provider_test.go index c9a045f31..2aa808457 100644 --- a/workspace-server/internal/handlers/workspace_switch_provider_test.go +++ b/workspace-server/internal/handlers/workspace_switch_provider_test.go @@ -152,6 +152,50 @@ func TestSwitchProvider_RejectsBadProvider(t *testing.T) { } } +// TestSwitchProvider_PreClaimRollbackOnError (CR2 #11486) is the +// source-level pin for the commit-or-rollback pattern added after the +// pre-claim landed. The pre-claim sets status='provisioning'; without +// the rollback defer, any error / ctx-cancellation between pre-claim +// and step 5 (committed) would strand the workspace in 'provisioning' +// forever. The defer must: +// 1. Revert status to priorStatus on any error path (commit-or-rollback). +// 2. Use a fresh context (not the request ctx) so client +// disconnect mid-switch still cleans up. +// 3. Set `committed = true` ONLY at the very end (after step 5). +func TestSwitchProvider_PreClaimRollbackOnError(t *testing.T) { + wd, _ := os.Getwd() + src, err := os.ReadFile(filepath.Join(wd, "workspace_switch_provider.go")) + if err != nil { + t.Fatalf("read source: %v", err) + } + s := stripGoComments(src) + // Commit-or-rollback flag + if !bytes.Contains(s, []byte("committed := false")) { + t.Error("SwitchProvider must declare a `committed := false` flag for the commit-or-rollback pattern (CR2 #11486)") + } + if !bytes.Contains(s, []byte("committed = true")) { + t.Error("SwitchProvider must set `committed = true` only after the provision is dispatched — the rollback defer reads this flag to decide whether to revert status") + } + // defer that checks the flag and reverts + if !bytes.Contains(s, []byte("defer func() {")) || !bytes.Contains(s, []byte("if committed {")) { + t.Error("SwitchProvider must have a defer that checks the `committed` flag and reverts status to priorStatus on any uncommitted path (CR2 #11486)") + } + // Rollback uses a fresh context (not the request ctx) so client + // disconnect mid-switch still cleans up. + if !bytes.Contains(s, []byte("rollbackCtx, cancel := context.WithTimeout(context.Background()")) { + t.Error("SwitchProvider's rollback defer must use a FRESH context (context.Background), not the request ctx — a cancelled request ctx would skip the cleanup and strand the workspace") + } + // priorStatus captured at the top of the function + if !bytes.Contains(s, []byte("priorStatus := status")) { + t.Error("SwitchProvider must capture `priorStatus := status` (the value from the lookup query) before the pre-claim so the rollback can restore it") + } + // The rollback UPDATE is gated on status='provisioning' so we don't + // clobber a newer status set by a concurrent switch/provision. + if !bytes.Contains(s, []byte("AND status = $3")) { + t.Error("SwitchProvider's rollback UPDATE must be gated on `status = 'provisioning'` so a concurrent switch/provision that has already advanced the status is not clobbered") + } +} + // TestSwitchProvider_RouteRegistered pins the route wiring. func TestSwitchProvider_RouteRegistered(t *testing.T) { wd, _ := os.Getwd() -- 2.52.0 From 6e424ca36b030dfe724db2d84497902801a35e3c Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Sat, 13 Jun 2026 22:07:26 +0000 Subject: [PATCH 6/6] fix(molecule-core#2422 RC): arm commit-or-rollback defer AFTER pre-claim success (CR2 #11493) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CR2 #11493 ownership-ordering finding: the prior fix (CR2 #11486, head 36ec65e7) armed the commit-or-rollback defer BEFORE the pre-claim ran. A LOSING pre-claim (status<>provisioning CAS returns 0 rows because another request already owns the switch) would still arm the defer. The defer's rollback UPDATE is gated on `status = 'provisioning'` — so on a losing pre-claim the defer would clobber ANOTHER request's in-flight status='provisioning' back to OUR priorStatus, stranding them. Fix: move the defer setup to AFTER the pre-claim's 0-rows return. The defer is now armed only when THIS request has successfully won the pre-claim (status='provisioning' on this row is OURS). A losing pre-claim returns 409 ALREADY_SWITCHING without arming the defer, so the rollback can never clobber another request's status. Side change: the pre-claim code path now has a CRITICAL comment explaining why the defer is NOT armed on the losing path. The explanation is load-bearing for any future refactor that might be tempted to move the defer back to the top of the function for 'cleanliness'. New regression test: TestSwitchProvider_PreClaimLoserDoesNotArmRollback is a source-level position check: the `defer func() {` opening must appear AFTER the ALREADY_SWITCHING 409 return — a losing pre-claim must NOT arm the defer, otherwise the rollback UPDATE clobbers another request's in-flight pre-claim to OUR priorStatus. The existing TestSwitchProvider_PreClaimRollbackOnError has its doc updated to call out the CR2 #11493 ownership-ordering fix (1 of the 6 assertions). Tests: all 7 switch_provider tests pass; full ./internal/handlers/ green 19.8s; go vet + go build clean. Refs #2778, CR2 #11493, CR2 #11486, core#2422. Co-Authored-By: Claude --- .../handlers/workspace_switch_provider.go | 64 +++++++++++-------- .../workspace_switch_provider_test.go | 46 +++++++++++-- 2 files changed, 80 insertions(+), 30 deletions(-) diff --git a/workspace-server/internal/handlers/workspace_switch_provider.go b/workspace-server/internal/handlers/workspace_switch_provider.go index 309493df3..96d0a0377 100644 --- a/workspace-server/internal/handlers/workspace_switch_provider.go +++ b/workspace-server/internal/handlers/workspace_switch_provider.go @@ -116,32 +116,6 @@ func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { // --- ordered switch (see doc-comment) --- - // COMMIT-OR-ROLLBACK pattern for the pre-claim. After step 1 sets - // status='provisioning', any error / ctx-cancellation before step 5 - // completes the switch leaves the workspace stranded in 'provisioning' - // forever (CR2 #11486 follow-up finding). The defer reverts status - // to priorStatus on ANY error path; the `committed` flag is set ONLY - // when the switch fully reaches step 5 (provision dispatched). The - // rollback uses a fresh context (not the request ctx) so a client - // disconnect mid-switch still cleans up. - committed := false - priorStatus := status - defer func() { - if committed { - return - } - rollbackCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - if _, err := db.DB.ExecContext(rollbackCtx, ` - UPDATE workspaces - SET status = $2, updated_at = now() - WHERE id = $1 - AND status = $3`, - id, priorStatus, models.StatusProvisioning); err != nil { - log.Printf("SwitchProvider: status revert failed for %s (priorStatus=%q): %v — workspace may need operator intervention", id, priorStatus, err) - } - }() - // 1. PRE-CLAIM: atomically mark the switch as in-flight by setting // status='provisioning' WITHOUT changing the provider. The CAS // (`status<>'provisioning' AND provider unchanged`) prevents a @@ -174,10 +148,48 @@ func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) { // the workspace is in initial provisioning, or the provider changed // under us. Do NOT execute the stop — the box is owned by an // in-flight provision/switch, not by us. + // + // CRITICAL: do NOT arm the commit-or-rollback defer here. A losing + // pre-claim means we did NOT flip status='provisioning' on this + // row — arming the defer would let the rollback (gated on + // `status = 'provisioning'`) clobber ANOTHER request's in-flight + // pre-claim to OUR priorStatus, stranding them. The defer is armed + // only after THIS request successfully wins the pre-claim (CR2 + // #11493 ownership-ordering fix). c.JSON(http.StatusConflict, gin.H{"error": "ALREADY_SWITCHING", "detail": "a provider switch or provision is already in progress for this workspace"}) return } + // COMMIT-OR-ROLLBACK pattern for the pre-claim (armed AFTER pre-claim + // success per CR2 #11493). After step 1 sets status='provisioning', + // any error / ctx-cancellation before step 5 completes the switch + // leaves the workspace stranded in 'provisioning' forever (CR2 + // #11486 follow-up finding). The defer reverts status to + // priorStatus on ANY error path; the `committed` flag is set ONLY + // when the switch fully reaches step 5 (provision dispatched). The + // rollback uses a fresh context (not the request ctx) so a client + // disconnect mid-switch still cleans up. The rollback UPDATE is + // gated on `status = 'provisioning'` so a concurrent + // switch/provision that has already advanced the status is not + // clobbered. + committed := false + priorStatus := status + defer func() { + if committed { + return + } + rollbackCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if _, err := db.DB.ExecContext(rollbackCtx, ` + UPDATE workspaces + SET status = $2, updated_at = now() + WHERE id = $1 + AND status = $3`, + id, priorStatus, models.StatusProvisioning); err != nil { + log.Printf("SwitchProvider: status revert failed for %s (priorStatus=%q): %v — workspace may need operator intervention", id, priorStatus, err) + } + }() + // 2. Stop the OLD box with the OLD provider. DB still has the old // provider + old instance_id (the pre-claim only flipped status, // not provider — the stop helper reads provider+instance_id at diff --git a/workspace-server/internal/handlers/workspace_switch_provider_test.go b/workspace-server/internal/handlers/workspace_switch_provider_test.go index 2aa808457..71e54aa71 100644 --- a/workspace-server/internal/handlers/workspace_switch_provider_test.go +++ b/workspace-server/internal/handlers/workspace_switch_provider_test.go @@ -152,16 +152,21 @@ func TestSwitchProvider_RejectsBadProvider(t *testing.T) { } } -// TestSwitchProvider_PreClaimRollbackOnError (CR2 #11486) is the +// TestSwitchProvider_PreClaimRollbackOnError (CR2 #11486 + #11493) is the // source-level pin for the commit-or-rollback pattern added after the // pre-claim landed. The pre-claim sets status='provisioning'; without // the rollback defer, any error / ctx-cancellation between pre-claim // and step 5 (committed) would strand the workspace in 'provisioning' // forever. The defer must: -// 1. Revert status to priorStatus on any error path (commit-or-rollback). -// 2. Use a fresh context (not the request ctx) so client +// 1. Be armed ONLY AFTER this request's pre-claim succeeds +// (CR2 #11493 ownership-ordering fix — a losing pre-claim must +// NOT arm the defer, otherwise the rollback UPDATE (gated on +// `status = 'provisioning'`) would clobber ANOTHER request's +// in-flight pre-claim to OUR priorStatus, stranding them). +// 2. Revert status to priorStatus on any error path (commit-or-rollback). +// 3. Use a fresh context (not the request ctx) so client // disconnect mid-switch still cleans up. -// 3. Set `committed = true` ONLY at the very end (after step 5). +// 4. Set `committed = true` ONLY at the very end (after step 5). func TestSwitchProvider_PreClaimRollbackOnError(t *testing.T) { wd, _ := os.Getwd() src, err := os.ReadFile(filepath.Join(wd, "workspace_switch_provider.go")) @@ -196,6 +201,39 @@ func TestSwitchProvider_PreClaimRollbackOnError(t *testing.T) { } } +// TestSwitchProvider_PreClaimLoserDoesNotArmRollback (CR2 #11493) is +// the regression test for the ownership-ordering fix: the commit-or- +// rollback defer MUST be armed AFTER the pre-claim's 0-rows return, +// not before. A losing pre-claim must NOT arm the defer — otherwise +// the rollback UPDATE (gated on `status = 'provisioning'`) could +// clobber ANOTHER request's in-flight pre-claim to OUR priorStatus, +// stranding them. +// +// Source-level position check: the `defer func() {` opening must +// appear AFTER the `ALREADY_SWITCHING` 409 return (the losing +// pre-claim path). +func TestSwitchProvider_PreClaimLoserDoesNotArmRollback(t *testing.T) { + wd, _ := os.Getwd() + src, err := os.ReadFile(filepath.Join(wd, "workspace_switch_provider.go")) + if err != nil { + t.Fatalf("read source: %v", err) + } + stripped := stripGoComments(src) + // Find the ALREADY_SWITCHING 409 return — the losing-pre-claim path. + loseIdx := bytes.Index(stripped, []byte("ALREADY_SWITCHING")) + if loseIdx < 0 { + t.Fatal("SwitchProvider must have an ALREADY_SWITCHING 409 return on a lost pre-claim") + } + // Find the defer opening — must come AFTER the 409 return. + deferIdx := bytes.Index(stripped, []byte("defer func() {")) + if deferIdx < 0 { + t.Fatal("SwitchProvider must have a `defer func() { ... }` block for the commit-or-rollback pattern") + } + if deferIdx <= loseIdx { + t.Fatalf("OWNERSHIP-ORDERING BUG (CR2 #11493): the commit-or-rollback defer (idx %d) must be armed AFTER the losing-pre-claim 409 return (idx %d) — a losing pre-claim must NOT arm the defer, otherwise the rollback UPDATE (gated on `status = 'provisioning'`) would clobber another request's in-flight pre-claim to OUR priorStatus, stranding them", deferIdx, loseIdx) + } +} + // TestSwitchProvider_RouteRegistered pins the route wiring. func TestSwitchProvider_RouteRegistered(t *testing.T) { wd, _ := os.Getwd() -- 2.52.0