fix(molecule-core#2422): pre-claim gates stop (CR2 #11473) + provision via Auto (RCA tick) #2778
@@ -0,0 +1,298 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/events"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// SwitchProvider handles POST /workspaces/:id/switch-provider — move an EXISTING
|
||||
// workspace to a different cloud (compute.provider). The VM is cloud-specific, so
|
||||
// this reprovisions the box on the new cloud; the agent's durable identity
|
||||
// (agent_card, config, secrets, tokens, memory) lives in the tenant DB and is
|
||||
// preserved because the row is never deleted.
|
||||
//
|
||||
// CRITICAL ORDERING: the stop must run with the OLD provider BEFORE the DB
|
||||
// provider is changed. The stop helper reads the current row at call time; if we
|
||||
// wrote the new provider first, the teardown request would target the wrong
|
||||
// backend and the old box would leak. So the sequence is strictly:
|
||||
//
|
||||
// 1. stop OLD box (DB still has old provider + old instance_id)
|
||||
// 2. clear instance_id + write new provider (jsonb_set, preserving the rest)
|
||||
// 3. provision NEW box (withStoredCompute now reads the new provider)
|
||||
//
|
||||
// Clearing instance_id in step 2 also makes a retried switch safe: a second call
|
||||
// finds no stale instance to act on with the new provider metadata.
|
||||
func (h *WorkspaceHandler) SwitchProvider(c *gin.Context) {
|
||||
id := c.Param("id")
|
||||
ctx := c.Request.Context()
|
||||
|
||||
var body struct {
|
||||
Provider string `json:"provider"`
|
||||
ConfirmDataLoss bool `json:"confirm_data_loss"`
|
||||
// MigrateData is accepted for forward-compat (RFC #622 PR4 wires the
|
||||
// filesystem migration). Until then it is a no-op and a persistent
|
||||
// volume still requires confirm_data_loss.
|
||||
MigrateData bool `json:"migrate_data"`
|
||||
}
|
||||
if err := c.ShouldBindJSON(&body); err != nil && err != io.EOF {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "invalid JSON body"})
|
||||
return
|
||||
}
|
||||
newProvider := strings.ToLower(strings.TrimSpace(body.Provider))
|
||||
if newProvider == "" {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "provider is required"})
|
||||
return
|
||||
}
|
||||
if _, ok := workspaceComputeProviderAllowlist[newProvider]; !ok {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "unsupported provider (want aws|gcp|hetzner)"})
|
||||
return
|
||||
}
|
||||
|
||||
var status, wsName, dbRuntime, oldProvider, dataPersistence string
|
||||
var oldInstanceID sql.NullString
|
||||
var tier int
|
||||
err := db.DB.QueryRowContext(ctx, `
|
||||
SELECT status, name, tier, COALESCE(runtime, 'claude-code'),
|
||||
COALESCE(compute->>'provider', ''), COALESCE(compute->>'data_persistence', ''),
|
||||
instance_id
|
||||
FROM workspaces WHERE id = $1`, id,
|
||||
).Scan(&status, &wsName, &tier, &dbRuntime, &oldProvider, &dataPersistence, &oldInstanceID)
|
||||
if err == sql.ErrNoRows || status == string(models.StatusRemoved) {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "workspace not found"})
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "lookup failed"})
|
||||
return
|
||||
}
|
||||
|
||||
// Switching the cloud backend is a SaaS-only concept — a self-hosted Docker
|
||||
// workspace has no cloud provider to switch. external/mock runtimes have no
|
||||
// box at all.
|
||||
if h.cpProv == nil {
|
||||
c.JSON(http.StatusConflict, gin.H{"error": "provider switching is only available for cloud (SaaS) workspaces"})
|
||||
return
|
||||
}
|
||||
if isExternalLikeRuntime(dbRuntime) || dbRuntime == "mock" {
|
||||
c.JSON(http.StatusConflict, gin.H{"error": dbRuntime + " workspaces have no cloud box to switch"})
|
||||
return
|
||||
}
|
||||
if paused, parentName := isParentPaused(ctx, id); paused {
|
||||
c.JSON(http.StatusConflict, gin.H{"error": "parent workspace \"" + parentName + "\" is paused — resume it first"})
|
||||
return
|
||||
}
|
||||
|
||||
// "" provider means the default AWS path.
|
||||
effectiveOld := oldProvider
|
||||
if effectiveOld == "" {
|
||||
effectiveOld = "aws"
|
||||
}
|
||||
if newProvider == effectiveOld {
|
||||
c.JSON(http.StatusOK, gin.H{"status": "noop", "provider": newProvider, "message": "workspace is already on this provider"})
|
||||
return
|
||||
}
|
||||
|
||||
// Data gate: a persistent data volume is cloud-specific and cannot move as a
|
||||
// block device. Until the filesystem migration lands (RFC #622 PR3/PR4), the
|
||||
// switch is allowed only with an explicit confirm_data_loss.
|
||||
if dataPersistence == "persist" && !body.ConfirmDataLoss {
|
||||
c.JSON(http.StatusConflict, gin.H{
|
||||
"error": "DATA_LOSS_UNCONFIRMED",
|
||||
"detail": "this workspace has a persistent data volume that cannot move across clouds; set confirm_data_loss=true to switch with a fresh volume (cross-cloud data migration is RFC #622, not yet wired). Identity/config/secrets/memory are preserved regardless.",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// --- ordered switch (see doc-comment) ---
|
||||
|
||||
// 1. PRE-CLAIM: atomically mark the switch as in-flight by setting
|
||||
// status='provisioning' WITHOUT changing the provider. The CAS
|
||||
// (`status<>'provisioning' AND provider unchanged`) prevents a
|
||||
// racing duplicate switch (or a switch against a workspace that
|
||||
// is already provisioning) from getting past this point. A losing
|
||||
// pre-claim returns 0 rows → 409 immediately, with NO stop side
|
||||
// effect (CR2 blocking finding: pre-fix the stop ran before the
|
||||
// CAS, so a losing request still executed the stop side effect
|
||||
// against a box it didn't own).
|
||||
//
|
||||
// url is NOT touched here — the pre-claim only flips status. The
|
||||
// later step 3 (provider write) nulls instance_id and the rollback
|
||||
// above reverts only status, so we don't need to snapshot/restore
|
||||
// url. Keeping the pre-claim minimal also means a failed
|
||||
// pre-claim never needs a revert (the row is unchanged).
|
||||
preClaim, err := db.DB.ExecContext(ctx, `
|
||||
UPDATE workspaces
|
||||
SET status = $2, updated_at = now()
|
||||
WHERE id = $1
|
||||
AND status <> $2
|
||||
AND COALESCE(compute->>'provider', '') IS NOT DISTINCT FROM $3`,
|
||||
id, models.StatusProvisioning, oldProvider)
|
||||
if err != nil {
|
||||
log.Printf("SwitchProvider: pre-claim failed for %s: %v", id, err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to claim switch"})
|
||||
return
|
||||
}
|
||||
if n, _ := preClaim.RowsAffected(); n == 0 {
|
||||
// Lost the pre-claim: another switch already set status='provisioning',
|
||||
// the workspace is in initial provisioning, or the provider changed
|
||||
// under us. Do NOT execute the stop — the box is owned by an
|
||||
// in-flight provision/switch, not by us.
|
||||
//
|
||||
// CRITICAL: do NOT arm the commit-or-rollback defer here. A losing
|
||||
// pre-claim means we did NOT flip status='provisioning' on this
|
||||
// row — arming the defer would let the rollback (gated on
|
||||
// `status = 'provisioning'`) clobber ANOTHER request's in-flight
|
||||
// pre-claim to OUR priorStatus, stranding them. The defer is armed
|
||||
// only after THIS request successfully wins the pre-claim (CR2
|
||||
// #11493 ownership-ordering fix).
|
||||
c.JSON(http.StatusConflict, gin.H{"error": "ALREADY_SWITCHING", "detail": "a provider switch or provision is already in progress for this workspace"})
|
||||
return
|
||||
}
|
||||
|
||||
// COMMIT-OR-ROLLBACK pattern for the pre-claim (armed AFTER pre-claim
|
||||
// success per CR2 #11493). After step 1 sets status='provisioning',
|
||||
// any error / ctx-cancellation before step 5 completes the switch
|
||||
// leaves the workspace stranded in 'provisioning' forever (CR2
|
||||
// #11486 follow-up finding). The defer reverts status to
|
||||
// priorStatus on ANY error path; the `committed` flag is set ONLY
|
||||
// when the switch fully reaches step 5 (provision dispatched). The
|
||||
// rollback uses a fresh context (not the request ctx) so a client
|
||||
// disconnect mid-switch still cleans up. The rollback UPDATE is
|
||||
// gated on `status = 'provisioning'` so a concurrent
|
||||
// switch/provision that has already advanced the status is not
|
||||
// clobbered.
|
||||
committed := false
|
||||
priorStatus := status
|
||||
defer func() {
|
||||
if committed {
|
||||
return
|
||||
}
|
||||
rollbackCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
if _, err := db.DB.ExecContext(rollbackCtx, `
|
||||
UPDATE workspaces
|
||||
SET status = $2, updated_at = now()
|
||||
WHERE id = $1
|
||||
AND status = $3`,
|
||||
id, priorStatus, models.StatusProvisioning); err != nil {
|
||||
log.Printf("SwitchProvider: status revert failed for %s (priorStatus=%q): %v — workspace may need operator intervention", id, priorStatus, err)
|
||||
}
|
||||
}()
|
||||
|
||||
// 2. Stop the OLD box with the OLD provider. DB still has the old
|
||||
// provider + old instance_id (the pre-claim only flipped status,
|
||||
// not provider — the stop helper reads provider+instance_id at
|
||||
// call time). Bounded retry; on exhaustion we STILL proceed
|
||||
// (a stuck old box must not strand the switch) — except we
|
||||
// capture the failure so step 4 can emit a durable audit row,
|
||||
// because step 3 nulls instance_id and flips provider, which
|
||||
// otherwise leaves the old box untracked by normal lifecycle
|
||||
// cleanup (review finding #3).
|
||||
stopErr := h.cpStopWithRetryErr(ctx, id, "SwitchProvider", false)
|
||||
|
||||
// 3. Write the new provider + clear instance_id. The pre-claim
|
||||
// already set status='provisioning' (so a duplicate check on
|
||||
// status is not needed here — the row is owned by this switch).
|
||||
// The `WHERE id=$1` is the only guard: if the row was deleted
|
||||
// between pre-claim and now (vanishingly rare), 0 rows → 500
|
||||
// and the audit row carries the diagnostic. jsonb_set preserves
|
||||
// instance_type/volume/display/data_persistence.
|
||||
res, err := db.DB.ExecContext(ctx, `
|
||||
UPDATE workspaces
|
||||
SET instance_id = NULL,
|
||||
compute = jsonb_set(COALESCE(compute, '{}'::jsonb), '{provider}', to_jsonb($2::text)),
|
||||
updated_at = now()
|
||||
WHERE id = $1`,
|
||||
id, newProvider)
|
||||
if err != nil {
|
||||
log.Printf("SwitchProvider: failed to write new provider for %s: %v", id, err)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to switch provider"})
|
||||
return
|
||||
}
|
||||
if n, _ := res.RowsAffected(); n == 0 {
|
||||
// Row was deleted between pre-claim and now. Emit an audit
|
||||
// row so the diagnostic is queryable, then 500.
|
||||
log.Printf("SwitchProvider: row disappeared after pre-claim for %s", id)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "workspace row missing after pre-claim"})
|
||||
return
|
||||
}
|
||||
|
||||
// 4. If the old box never confirmed stopped, it may not be tracked by the
|
||||
// normal lifecycle cleanup after instance_id was nulled. Emit a durable
|
||||
// audit row carrying the old instance_id + provider so a platform cleanup
|
||||
// process can locate and terminate it (review finding #3).
|
||||
if stopErr != nil && oldInstanceID.Valid && oldInstanceID.String != "" {
|
||||
h.emitSwitchProviderStopExhausted(ctx, id, oldInstanceID.String, effectiveOld, newProvider, stopErr)
|
||||
}
|
||||
log.Printf("SwitchProvider: %s %s → %s (old box stop err=%v, reprovisioning)", id, effectiveOld, newProvider, stopErr)
|
||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceProvisioning), id, map[string]interface{}{
|
||||
"name": wsName,
|
||||
"tier": tier,
|
||||
"runtime": dbRuntime,
|
||||
"provider_from": effectiveOld,
|
||||
"provider_to": newProvider,
|
||||
})
|
||||
|
||||
// 5. Provision the NEW box. withStoredCompute re-reads compute
|
||||
// (now carrying the new provider) → provisionWorkspaceAuto routes
|
||||
// centrally to the new backend. Reuse the existing config volume
|
||||
// (templatePath="") so identity/config are preserved. Detached
|
||||
// context: the reprovision outlives the request. Routes through
|
||||
// provisionWorkspaceAuto (not provisionWorkspaceCP directly) per
|
||||
// TestNoCallSiteCallsDirectProvisionerExceptAuto (core#2422 RCA tick).
|
||||
payload := withStoredCompute(context.Background(), id, models.CreateWorkspacePayload{Name: wsName, Tier: tier, Runtime: dbRuntime})
|
||||
h.provisionWorkspaceAuto(id, "", nil, payload)
|
||||
|
||||
// All 5 steps completed; mark the switch COMMITTED so the rollback
|
||||
// defer does NOT revert status='provisioning'. The new provision is
|
||||
// in flight on a goroutine and will progress to 'online' (or
|
||||
// 'failed' via the central provision machinery) on its own.
|
||||
committed = true
|
||||
|
||||
c.JSON(http.StatusAccepted, gin.H{
|
||||
"status": "switching",
|
||||
"workspace_id": id,
|
||||
"from": effectiveOld,
|
||||
"to": newProvider,
|
||||
})
|
||||
}
|
||||
|
||||
// emitSwitchProviderStopExhausted records a durable audit row when a provider
|
||||
// switch could not confirm the OLD box stopped before its instance_id was
|
||||
// cleared from the row. Without it the old box may not be tracked by normal
|
||||
// lifecycle cleanup; a platform cleanup process reads these rows by
|
||||
// old_instance_id + old_provider to terminate the leaked box. Best-effort.
|
||||
func (h *WorkspaceHandler) emitSwitchProviderStopExhausted(ctx context.Context, workspaceID, oldInstanceID, oldProvider, newProvider string, cause error) {
|
||||
if db.DB == nil {
|
||||
return
|
||||
}
|
||||
payload, err := json.Marshal(map[string]any{
|
||||
"workspace_id": workspaceID,
|
||||
"old_instance_id": oldInstanceID,
|
||||
"old_provider": oldProvider,
|
||||
"new_provider": newProvider,
|
||||
"last_error": cause.Error(),
|
||||
"recovery_path": "platform_cleanup",
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("emitSwitchProviderStopExhausted: marshal failed for %s: %v", workspaceID, err)
|
||||
return
|
||||
}
|
||||
if _, err := db.DB.ExecContext(ctx, `
|
||||
INSERT INTO structure_events (event_type, workspace_id, payload, created_at)
|
||||
VALUES ($1, $2, $3, now())
|
||||
`, "workspace.provider.switch_stop_exhausted", workspaceID, payload); err != nil {
|
||||
log.Printf("emitSwitchProviderStopExhausted: insert failed for %s: %v", workspaceID, err)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,247 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// TestSwitchProvider_StopBeforeProviderWrite is the load-bearing ordering pin.
|
||||
// The stop helper MUST appear before the UPDATE that writes the new provider —
|
||||
// otherwise the teardown uses the wrong backend metadata and the old box leaks.
|
||||
// A source-level position check guards against a refactor reordering the two.
|
||||
func TestSwitchProvider_StopBeforeProviderWrite(t *testing.T) {
|
||||
wd, _ := os.Getwd()
|
||||
src, err := os.ReadFile(filepath.Join(wd, "workspace_switch_provider.go"))
|
||||
if err != nil {
|
||||
t.Fatalf("read source: %v", err)
|
||||
}
|
||||
stripped := stripGoComments(src)
|
||||
stopIdx := bytes.Index(stripped, []byte("cpStopWithRetryErr(ctx, id, \"SwitchProvider\""))
|
||||
if stopIdx < 0 {
|
||||
t.Fatal("SwitchProvider must stop the old box via cpStopWithRetryErr before reprovisioning")
|
||||
}
|
||||
// the provider write is the jsonb_set on compute -> {provider}
|
||||
writeIdx := bytes.Index(stripped, []byte("'{provider}'"))
|
||||
if writeIdx < 0 {
|
||||
t.Fatal("SwitchProvider must write the new provider via jsonb_set on compute->{provider}")
|
||||
}
|
||||
if stopIdx >= writeIdx {
|
||||
t.Fatalf("ORDERING HAZARD: stop helper (idx %d) must come BEFORE the provider write (idx %d) — else the old box is torn down with wrong backend metadata and leaks", stopIdx, writeIdx)
|
||||
}
|
||||
// and the instance_id must be cleared in the same UPDATE (retry-safety)
|
||||
if !bytes.Contains(stripped, []byte("instance_id = NULL")) {
|
||||
t.Fatal("SwitchProvider must clear instance_id when writing the new provider (retry-safety)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestSwitchProvider_PreClaimGatesStop pins the CR2 #11473 blocking-finding
|
||||
// fix: the per-workspace stop helper MUST appear AFTER the pre-claim's
|
||||
// RowsAffected check, so a losing pre-claim returns 409 without ever
|
||||
// touching the stop. Pre-fix, the stop ran unconditionally before the
|
||||
// CAS — a request against a workspace that was already provisioning
|
||||
// would stop the in-flight box it didn't own (review finding: "the
|
||||
// loser should not be able to stop a box owned by an in-flight
|
||||
// provision/switch"). A source-level position check guards against
|
||||
// a refactor re-introducing the order.
|
||||
func TestSwitchProvider_PreClaimGatesStop(t *testing.T) {
|
||||
wd, _ := os.Getwd()
|
||||
src, err := os.ReadFile(filepath.Join(wd, "workspace_switch_provider.go"))
|
||||
if err != nil {
|
||||
t.Fatalf("read source: %v", err)
|
||||
}
|
||||
stripped := stripGoComments(src)
|
||||
preClaimIdx := bytes.Index(stripped, []byte("preClaim, err := db.DB.ExecContext(ctx, `"))
|
||||
if preClaimIdx < 0 {
|
||||
t.Fatal("SwitchProvider must have a pre-claim UPDATE (the fix for CR2 #11473) — pre-claim gates the stop on a successful CAS")
|
||||
}
|
||||
preClaimLoseIdx := bytes.Index(stripped, []byte("ALREADY_SWITCHING"))
|
||||
if preClaimLoseIdx < 0 {
|
||||
t.Fatal("SwitchProvider must return ALREADY_SWITCHING on a lost pre-claim — this is the 409 path the pre-claim gates")
|
||||
}
|
||||
stopIdx := bytes.Index(stripped, []byte("cpStopWithRetryErr(ctx, id, \"SwitchProvider\""))
|
||||
if stopIdx < 0 {
|
||||
t.Fatal("SwitchProvider must call cpStopWithRetryErr for the OLD box")
|
||||
}
|
||||
// The 409-on-lost-pre-claim path must appear BEFORE the stop — the
|
||||
// stop is gated on a successful pre-claim.
|
||||
if preClaimLoseIdx >= stopIdx {
|
||||
t.Fatalf("ORDERING HAZARD: the ALREADY_SWITCHING 409 path (idx %d) must come BEFORE the stop helper (idx %d) — a losing pre-claim must return 409 without ever touching the stop side effect (CR2 #11473)", preClaimLoseIdx, stopIdx)
|
||||
}
|
||||
// And the pre-claim itself must come before the stop too.
|
||||
if preClaimIdx >= stopIdx {
|
||||
t.Fatalf("ORDERING HAZARD: the pre-claim (idx %d) must come BEFORE the stop (idx %d)", preClaimIdx, stopIdx)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSwitchProvider_ConcurrencyGuardAndAudit pins the two hardening items from
|
||||
// the correctness review: (a) a switch is guarded by an atomic CAS (pre-claim
|
||||
// + provider write) so two concurrent switches can't both launch a provision
|
||||
// or both stop the same box, and (b) stop-exhaustion emits a durable audit
|
||||
// row carrying the old instance_id+provider so the old box remains
|
||||
// discoverable after instance_id is nulled.
|
||||
//
|
||||
// CR2 #11473 update: the original code did the stop BEFORE the CAS, so a
|
||||
// losing request still executed the stop side effect. The fix splits the
|
||||
// guard into a PRE-CLAIM (status='provisioning' only, provider unchanged)
|
||||
// and the provider write — the stop now runs ONLY after the pre-claim
|
||||
// succeeds, so a losing pre-claim returns 409 without stopping the box.
|
||||
func TestSwitchProvider_ConcurrencyGuardAndAudit(t *testing.T) {
|
||||
wd, _ := os.Getwd()
|
||||
src, err := os.ReadFile(filepath.Join(wd, "workspace_switch_provider.go"))
|
||||
if err != nil {
|
||||
t.Fatalf("read source: %v", err)
|
||||
}
|
||||
s := stripGoComments(src)
|
||||
// Pre-claim: status='provisioning' with status<>provisioning CAS so
|
||||
// the stop is gated on a successful claim. The pre-claim's WHERE
|
||||
// clause must include BOTH `status <> $` AND a provider-unchanged
|
||||
// check, so a losing race (workspace already provisioning, OR
|
||||
// provider changed) returns 0 rows and 409s without stopping
|
||||
// the box.
|
||||
if !bytes.Contains(s, []byte("status <> $2")) || !bytes.Contains(s, []byte("IS NOT DISTINCT FROM $3")) {
|
||||
t.Error("the PRE-CLAIM must be a CAS (status not already provisioning AND provider unchanged) — this is the guard that prevents a losing request from executing the stop side effect (CR2 #11473)")
|
||||
}
|
||||
if !bytes.Contains(s, []byte("RowsAffected")) || !bytes.Contains(s, []byte("ALREADY_SWITCHING")) {
|
||||
t.Error("SwitchProvider must 409 ALREADY_SWITCHING when the pre-claim affects 0 rows (lost the race before the stop runs)")
|
||||
}
|
||||
if !bytes.Contains(s, []byte("cpStopWithRetryErr")) {
|
||||
t.Error("SwitchProvider must use cpStopWithRetryErr to detect stop exhaustion")
|
||||
}
|
||||
if !bytes.Contains(s, []byte("emitSwitchProviderStopExhausted")) {
|
||||
t.Error("SwitchProvider must emit an audit row with old instance/provider metadata on stop exhaustion")
|
||||
}
|
||||
// Routing invariant: the NEW-box provision must go through the
|
||||
// central Auto dispatcher, not the direct per-backend body (this
|
||||
// is the core#2422 RCA-tick fix that closes the Platform-Go red
|
||||
// on TestNoCallSiteCallsDirectProvisionerExceptAuto).
|
||||
if bytes.Contains(s, []byte("h.goAsync(func() { h.provisionWorkspaceCP(")) {
|
||||
t.Error("SwitchProvider must route the NEW-box provision through provisionWorkspaceAuto, NOT through h.provisionWorkspaceCP directly (TestNoCallSiteCallsDirectProvisionerExceptAuto pin)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestSwitchProvider_RejectsBadProvider: the allowlist check fires before any DB
|
||||
// access, so a bad/missing provider is a clean 400 without touching the backend.
|
||||
func TestSwitchProvider_RejectsBadProvider(t *testing.T) {
|
||||
gin.SetMode(gin.TestMode)
|
||||
h := &WorkspaceHandler{}
|
||||
for _, tc := range []struct {
|
||||
body string
|
||||
want int
|
||||
}{
|
||||
{`{"provider":"azure"}`, http.StatusBadRequest},
|
||||
{`{"provider":""}`, http.StatusBadRequest},
|
||||
{`{"provider":"AWS-typo"}`, http.StatusBadRequest},
|
||||
{`{}`, http.StatusBadRequest},
|
||||
} {
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces/ws-1/switch-provider", strings.NewReader(tc.body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
h.SwitchProvider(c)
|
||||
if w.Code != tc.want {
|
||||
t.Errorf("body %s: got %d want %d (%s)", tc.body, w.Code, tc.want, w.Body.String())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestSwitchProvider_PreClaimRollbackOnError (CR2 #11486 + #11493) is the
|
||||
// source-level pin for the commit-or-rollback pattern added after the
|
||||
// pre-claim landed. The pre-claim sets status='provisioning'; without
|
||||
// the rollback defer, any error / ctx-cancellation between pre-claim
|
||||
// and step 5 (committed) would strand the workspace in 'provisioning'
|
||||
// forever. The defer must:
|
||||
// 1. Be armed ONLY AFTER this request's pre-claim succeeds
|
||||
// (CR2 #11493 ownership-ordering fix — a losing pre-claim must
|
||||
// NOT arm the defer, otherwise the rollback UPDATE (gated on
|
||||
// `status = 'provisioning'`) would clobber ANOTHER request's
|
||||
// in-flight pre-claim to OUR priorStatus, stranding them).
|
||||
// 2. Revert status to priorStatus on any error path (commit-or-rollback).
|
||||
// 3. Use a fresh context (not the request ctx) so client
|
||||
// disconnect mid-switch still cleans up.
|
||||
// 4. Set `committed = true` ONLY at the very end (after step 5).
|
||||
func TestSwitchProvider_PreClaimRollbackOnError(t *testing.T) {
|
||||
wd, _ := os.Getwd()
|
||||
src, err := os.ReadFile(filepath.Join(wd, "workspace_switch_provider.go"))
|
||||
if err != nil {
|
||||
t.Fatalf("read source: %v", err)
|
||||
}
|
||||
s := stripGoComments(src)
|
||||
// Commit-or-rollback flag
|
||||
if !bytes.Contains(s, []byte("committed := false")) {
|
||||
t.Error("SwitchProvider must declare a `committed := false` flag for the commit-or-rollback pattern (CR2 #11486)")
|
||||
}
|
||||
if !bytes.Contains(s, []byte("committed = true")) {
|
||||
t.Error("SwitchProvider must set `committed = true` only after the provision is dispatched — the rollback defer reads this flag to decide whether to revert status")
|
||||
}
|
||||
// defer that checks the flag and reverts
|
||||
if !bytes.Contains(s, []byte("defer func() {")) || !bytes.Contains(s, []byte("if committed {")) {
|
||||
t.Error("SwitchProvider must have a defer that checks the `committed` flag and reverts status to priorStatus on any uncommitted path (CR2 #11486)")
|
||||
}
|
||||
// Rollback uses a fresh context (not the request ctx) so client
|
||||
// disconnect mid-switch still cleans up.
|
||||
if !bytes.Contains(s, []byte("rollbackCtx, cancel := context.WithTimeout(context.Background()")) {
|
||||
t.Error("SwitchProvider's rollback defer must use a FRESH context (context.Background), not the request ctx — a cancelled request ctx would skip the cleanup and strand the workspace")
|
||||
}
|
||||
// priorStatus captured at the top of the function
|
||||
if !bytes.Contains(s, []byte("priorStatus := status")) {
|
||||
t.Error("SwitchProvider must capture `priorStatus := status` (the value from the lookup query) before the pre-claim so the rollback can restore it")
|
||||
}
|
||||
// The rollback UPDATE is gated on status='provisioning' so we don't
|
||||
// clobber a newer status set by a concurrent switch/provision.
|
||||
if !bytes.Contains(s, []byte("AND status = $3")) {
|
||||
t.Error("SwitchProvider's rollback UPDATE must be gated on `status = 'provisioning'` so a concurrent switch/provision that has already advanced the status is not clobbered")
|
||||
}
|
||||
}
|
||||
|
||||
// TestSwitchProvider_PreClaimLoserDoesNotArmRollback (CR2 #11493) is
|
||||
// the regression test for the ownership-ordering fix: the commit-or-
|
||||
// rollback defer MUST be armed AFTER the pre-claim's 0-rows return,
|
||||
// not before. A losing pre-claim must NOT arm the defer — otherwise
|
||||
// the rollback UPDATE (gated on `status = 'provisioning'`) could
|
||||
// clobber ANOTHER request's in-flight pre-claim to OUR priorStatus,
|
||||
// stranding them.
|
||||
//
|
||||
// Source-level position check: the `defer func() {` opening must
|
||||
// appear AFTER the `ALREADY_SWITCHING` 409 return (the losing
|
||||
// pre-claim path).
|
||||
func TestSwitchProvider_PreClaimLoserDoesNotArmRollback(t *testing.T) {
|
||||
wd, _ := os.Getwd()
|
||||
src, err := os.ReadFile(filepath.Join(wd, "workspace_switch_provider.go"))
|
||||
if err != nil {
|
||||
t.Fatalf("read source: %v", err)
|
||||
}
|
||||
stripped := stripGoComments(src)
|
||||
// Find the ALREADY_SWITCHING 409 return — the losing-pre-claim path.
|
||||
loseIdx := bytes.Index(stripped, []byte("ALREADY_SWITCHING"))
|
||||
if loseIdx < 0 {
|
||||
t.Fatal("SwitchProvider must have an ALREADY_SWITCHING 409 return on a lost pre-claim")
|
||||
}
|
||||
// Find the defer opening — must come AFTER the 409 return.
|
||||
deferIdx := bytes.Index(stripped, []byte("defer func() {"))
|
||||
if deferIdx < 0 {
|
||||
t.Fatal("SwitchProvider must have a `defer func() { ... }` block for the commit-or-rollback pattern")
|
||||
}
|
||||
if deferIdx <= loseIdx {
|
||||
t.Fatalf("OWNERSHIP-ORDERING BUG (CR2 #11493): the commit-or-rollback defer (idx %d) must be armed AFTER the losing-pre-claim 409 return (idx %d) — a losing pre-claim must NOT arm the defer, otherwise the rollback UPDATE (gated on `status = 'provisioning'`) would clobber another request's in-flight pre-claim to OUR priorStatus, stranding them", deferIdx, loseIdx)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSwitchProvider_RouteRegistered pins the route wiring.
|
||||
func TestSwitchProvider_RouteRegistered(t *testing.T) {
|
||||
wd, _ := os.Getwd()
|
||||
src, err := os.ReadFile(filepath.Join(wd, "..", "router", "router.go"))
|
||||
if err != nil {
|
||||
t.Fatalf("read router: %v", err)
|
||||
}
|
||||
if !bytes.Contains(src, []byte(`POST("/switch-provider", wh.SwitchProvider)`)) {
|
||||
t.Fatal("router must register POST /switch-provider → wh.SwitchProvider")
|
||||
}
|
||||
}
|
||||
@@ -271,6 +271,7 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
|
||||
// Lifecycle
|
||||
wsAuth.GET("/state", wh.State)
|
||||
wsAuth.POST("/restart", wh.Restart)
|
||||
wsAuth.POST("/switch-provider", wh.SwitchProvider)
|
||||
wsAuth.POST("/pause", wh.Pause)
|
||||
wsAuth.POST("/resume", wh.Resume)
|
||||
// Manual hibernate (opt-in, #711) — stops the container and sets status
|
||||
|
||||
Reference in New Issue
Block a user