feat(workspace): in-place cloud-provider switch in Container Config #2465

Merged
devops-engineer merged 4 commits from feat/in-place-provider-switch into main 2026-06-09 02:39:55 +00:00
6 changed files with 487 additions and 44 deletions
@@ -3,13 +3,36 @@
import { useEffect, useMemo, useState } from "react";
import { api } from "@/lib/api";
import { runtimeDisplayName } from "@/lib/runtime-names";
import { isSaaSTenant } from "@/lib/tenant";
import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
import type { WorkspaceCompute } from "@/store/socket";
const INSTANCE_TYPES = ["t3.medium", "t3.large", "t3.xlarge", "t3.2xlarge", "m6i.large", "m6i.xlarge", "c6i.xlarge"];
// Machine sizes keyed by cloud provider — an AWS t3.* is meaningless on Hetzner,
// etc. MUST mirror the workspace-server workspaceComputeInstanceAllowlist (which
// mirrors the CP provider configs); the PATCH validation rejects a mismatch 400.
const INSTANCE_TYPES_BY_PROVIDER: Record<string, string[]> = {
aws: ["t3.medium", "t3.large", "t3.xlarge", "t3.2xlarge", "m6i.large", "m6i.xlarge", "c6i.xlarge"],
hetzner: ["cpx11", "cpx21", "cpx31", "cpx41", "cpx51", "cax11", "cax21", "cax31", "cax41"],
gcp: ["e2-small", "e2-medium", "e2-standard-2", "e2-standard-4", "e2-standard-8"],
};
const DEFAULT_INSTANCE_BY_PROVIDER: Record<string, string> = {
aws: "t3.medium", hetzner: "cpx31", gcp: "e2-standard-2",
};
const normalizeProvider = (p?: string): string => (p === "gcp" || p === "hetzner" ? p : "aws");
const instanceTypesForProvider = (p?: string): string[] =>
INSTANCE_TYPES_BY_PROVIDER[normalizeProvider(p)] ?? INSTANCE_TYPES_BY_PROVIDER.aws;
const defaultInstanceForProvider = (p?: string): string =>
DEFAULT_INSTANCE_BY_PROVIDER[normalizeProvider(p)] ?? "t3.medium";
// Editable cloud-provider options (multi-provider RFC) — mirrors CreateWorkspaceDialog.
const CLOUD_PROVIDER_OPTIONS = [
{ value: "aws", label: "AWS (default)" },
{ value: "gcp", label: "GCP" },
{ value: "hetzner", label: "Hetzner" },
];
const RUNTIME_OPTIONS = ["claude-code", "codex", "hermes", "openclaw", "kimi", "kimi-cli", "external"];
const RESOLUTIONS = ["1280x720", "1440x900", "1920x1080", "2560x1440"];
const DEFAULT_HEADLESS_INSTANCE_TYPE = "t3.medium";
const DEFAULT_HEADLESS_ROOT_GB = 30;
type Props = {
@@ -23,6 +46,7 @@ type Props = {
type FormState = {
runtime: string;
provider: string; // cloud backend; editable in SaaS (in-place switch recreates the box)
instanceType: string;
rootGB: string;
displayEnabled: boolean;
@@ -38,16 +62,16 @@ const DATA_PERSISTENCE_OPTIONS = ["", "persist", "ephemeral"];
const dataPersistenceLabel = (v: string): string =>
v === "persist" ? "Always keep (persist)" : v === "ephemeral" ? "Don't keep (ephemeral)" : "Auto";
// Cloud/compute backend display name. The provider is chosen at create time and
// is NOT editable here (changing a workspace's cloud requires a recreate), so
// it renders as a read-only badge — but we must preserve it across Save (the
// compute payload is rebuilt below, and dropping it would wipe the column).
// Cloud/compute backend display name (read-only fallback for non-SaaS / legacy).
const cloudProviderLabel = (v: string | undefined): string =>
v === "gcp" ? "GCP" : v === "hetzner" ? "Hetzner" : "AWS";
export function ContainerConfigTab({ workspaceId, data }: Props) {
// Provider is editable only in SaaS (CP-provisioned boxes). Local/Docker has no
// cloud-provider concept, so we keep the read-only badge there.
const isSaaS = useMemo(() => isSaaSTenant(), []);
const runtime = data.runtime;
const provider = data.compute?.provider; // read-only; set at create time
const provider = data.compute?.provider;
const instanceType = data.compute?.instance_type;
const rootGB = data.compute?.volume?.root_gb;
const displayMode = data.compute?.display?.mode;
@@ -56,8 +80,8 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
const displayHeight = data.compute?.display?.height;
const dataPersistence = data.compute?.data_persistence;
const initial = useMemo(
() => formFromData({ runtime, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence }),
[runtime, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence],
() => formFromData({ runtime, provider, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence }),
[runtime, provider, instanceType, rootGB, displayMode, displayProtocol, displayWidth, displayHeight, dataPersistence],
);
const [form, setForm] = useState<FormState>(initial);
const [saving, setSaving] = useState(false);
@@ -87,6 +111,21 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
try {
let applyTemplateOnRestart = data.applyTemplateOnRestart ?? false;
if (dirty) {
// In-place cloud switch is DESTRUCTIVE: changing the provider recreates the
// box on the new cloud (the workspace-server deprovisions the old box on
// its old cloud first, then the restart provisions on the new one). Confirm
// before doing it — the current box and any non-persisted state are lost.
const providerChanged = normalizeProvider(form.provider) !== normalizeProvider(initial.provider);
if (providerChanged && typeof window !== "undefined") {
const ok = window.confirm(
`Switch this workspace to ${cloudProviderLabel(form.provider)}? This RECREATES the box on the new cloud — the current box and any non-persisted state are replaced.`,
);
if (!ok) {
setSaving(false);
return;
}
}
const rootGB = parseInt(form.rootGB, 10);
if (!Number.isFinite(rootGB)) {
setError("Root volume must be a number");
@@ -102,10 +141,11 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
: { mode: "none" },
// internal#734: omit when "auto" so the wire/default behavior is unchanged.
...(form.dataPersistence ? { data_persistence: form.dataPersistence } : {}),
// Preserve the create-time cloud provider — it's not editable here, but
// this PATCH rebuilds the whole compute object, so omitting it would
// wipe the persisted provider (and mislead the badge after a Save).
...(provider ? { provider } : {}),
// Cloud backend: send the (possibly switched) provider. Omit for the
// default (aws) so a non-switching AWS save keeps the wire unchanged;
// a switch TO aws (omit) vs FROM aws (explicit) both register correctly
// because the workspace-server normalizes ""→aws when diffing.
...(normalizeProvider(form.provider) !== "aws" ? { provider: normalizeProvider(form.provider) } : {}),
};
const resp = await api.patch<{ needs_restart?: boolean }>(`/workspaces/${workspaceId}`, {
@@ -140,15 +180,16 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
<div className="mb-3 flex items-center justify-between gap-3">
<div className="flex items-center gap-2">
<h3 className="text-sm font-semibold text-ink">Container Config</h3>
{/* Read-only cloud-provider badge — which cloud this workspace's box
runs on (AWS/GCP/Hetzner). Defaults to AWS when unset (legacy
rows). Set at create time in the Create Workspace dialog. */}
<span
title="Cloud provider for this workspace's compute (set at create time)"
className="rounded-full border border-line/60 bg-surface-sunken px-2 py-0.5 font-mono text-[10px] uppercase tracking-wide text-ink-mid"
>
{cloudProviderLabel(provider)}
</span>
{/* Non-SaaS (local/Docker) has no cloud-provider concept → read-only
badge. In SaaS the provider is an editable selector in the form. */}
{!isSaaS && (
<span
title="Cloud provider for this workspace's compute"
className="rounded-full border border-line/60 bg-surface-sunken px-2 py-0.5 font-mono text-[10px] uppercase tracking-wide text-ink-mid"
>
{cloudProviderLabel(provider)}
</span>
)}
</div>
{data.needsRestart && <span className="text-[11px] text-warm">Restart required</span>}
</div>
@@ -162,11 +203,32 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
optionLabel={runtimeDisplayName}
onChange={(runtime) => setForm((s) => ({ ...s, runtime }))}
/>
{isSaaS && (
<SelectField
id="cloud-provider"
label="Cloud provider"
value={normalizeProvider(form.provider)}
options={CLOUD_PROVIDER_OPTIONS.map((p) => p.value)}
optionLabel={(v) => CLOUD_PROVIDER_OPTIONS.find((p) => p.value === v)?.label ?? v}
// Switching cloud resets the instance type to the new provider's
// default (an AWS t3.* is invalid on Hetzner, etc.) — also keeps the
// instance-type dropdown below in sync with the provider's sizes.
onChange={(provider) =>
setForm((s) => ({
...s,
provider,
instanceType: instanceTypesForProvider(provider).includes(s.instanceType)
? s.instanceType
: defaultInstanceForProvider(provider),
}))
}
/>
)}
<SelectField
id="instance-type"
label="Instance type"
value={form.instanceType}
options={INSTANCE_TYPES}
options={instanceTypesForProvider(form.provider)}
onChange={(instanceType) => setForm((s) => ({ ...s, instanceType }))}
/>
<label className="grid gap-1" htmlFor="root-volume-gb">
@@ -270,6 +332,7 @@ export function ContainerConfigTab({ workspaceId, data }: Props) {
function formFromData(data: {
runtime?: string;
provider?: string;
instanceType?: string;
rootGB?: number;
displayMode?: string;
@@ -281,9 +344,11 @@ function formFromData(data: {
const width = data.displayWidth ?? 1920;
const height = data.displayHeight ?? 1080;
const resolution = `${width}x${height}`;
const provider = normalizeProvider(data.provider);
return {
runtime: data.runtime || "claude-code",
instanceType: data.instanceType || DEFAULT_HEADLESS_INSTANCE_TYPE,
provider,
instanceType: data.instanceType || defaultInstanceForProvider(provider),
rootGB: String(data.rootGB || DEFAULT_HEADLESS_ROOT_GB),
displayEnabled: !!data.displayMode && data.displayMode !== "none",
displayMode: data.displayMode && data.displayMode !== "none" ? data.displayMode : "desktop-control",
@@ -23,6 +23,13 @@ vi.mock("@/store/canvas", () => ({
),
}));
// SaaS so the editable cloud-provider selector renders (non-SaaS shows a read-only
// badge). Existing tests keep provider=aws (default), which is omitted from the
// PATCH payload, so their assertions are unaffected.
vi.mock("@/lib/tenant", () => ({
isSaaSTenant: () => true,
}));
import { ContainerConfigTab } from "../ContainerConfigTab";
afterEach(() => {
@@ -314,4 +321,67 @@ describe("ContainerConfigTab", () => {
await waitFor(() => expect(restartWorkspace).toHaveBeenCalledWith("ws-compute", { applyTemplate: true }));
expect(apiPatch).not.toHaveBeenCalled();
});
it("switches cloud provider — keys the instance-type list to the provider, confirms the recreate, and PATCHes the new provider", async () => {
const confirmSpy = vi.spyOn(window, "confirm").mockReturnValue(true);
render(
<ContainerConfigTab
workspaceId="ws-switch"
data={{
runtime: "claude-code",
status: "online",
needsRestart: false,
activeTasks: 0,
maxConcurrentTasks: null,
workspaceAccess: "read-write",
deliveryMode: "push",
compute: { instance_type: "t3.large", provider: "aws", volume: { root_gb: 30 } },
}}
/>,
);
const providerSel = screen.getByLabelText("Cloud provider");
expect(providerSel).toHaveProperty("value", "aws");
expect(screen.getByLabelText("Instance type")).toHaveProperty("value", "t3.large");
// Switch to Hetzner → the instance type resets to the Hetzner default (an AWS
// t3.* is invalid on Hetzner) and the options become Hetzner sizes.
fireEvent.change(providerSel, { target: { value: "hetzner" } });
expect(screen.getByLabelText("Instance type")).toHaveProperty("value", "cpx31");
fireEvent.click(screen.getByRole("button", { name: "Save" }));
await waitFor(() => expect(apiPatch).toHaveBeenCalledTimes(1));
expect(confirmSpy).toHaveBeenCalled(); // destructive recreate confirmed
const body = apiPatch.mock.calls[0][1] as { compute: { provider?: string; instance_type?: string } };
expect(body.compute.provider).toBe("hetzner");
expect(body.compute.instance_type).toBe("cpx31");
confirmSpy.mockRestore();
});
it("does not treat a non-provider edit as a recreate (no confirm; aws default omitted)", async () => {
const confirmSpy = vi.spyOn(window, "confirm").mockReturnValue(true);
render(
<ContainerConfigTab
workspaceId="ws-noswitch"
data={{
runtime: "claude-code",
status: "online",
needsRestart: false,
activeTasks: 0,
maxConcurrentTasks: null,
workspaceAccess: "read-write",
deliveryMode: "push",
compute: { instance_type: "t3.large", provider: "aws", volume: { root_gb: 30 } },
}}
/>,
);
fireEvent.change(screen.getByLabelText("Root volume"), { target: { value: "60" } });
fireEvent.click(screen.getByRole("button", { name: "Save" }));
await waitFor(() => expect(apiPatch).toHaveBeenCalledTimes(1));
expect(confirmSpy).not.toHaveBeenCalled();
const body = apiPatch.mock.calls[0][1] as { compute: { provider?: string } };
expect(body.compute.provider).toBeUndefined(); // aws default omitted (wire unchanged)
confirmSpy.mockRestore();
});
});
@@ -31,14 +31,53 @@ type workspaceDisplayResponse struct {
Status string `json:"status,omitempty"`
}
var workspaceComputeInstanceAllowlist = map[string]struct{}{
"t3.medium": {},
"t3.large": {},
"t3.xlarge": {},
"t3.2xlarge": {},
"m6i.large": {},
"m6i.xlarge": {},
"c6i.xlarge": {},
// workspaceComputeInstanceAllowlist is keyed by cloud provider (multi-provider /
// in-place switch): each provider's box accepts only that provider's machine
// sizes (an AWS t3.* is meaningless on Hetzner, and vice-versa). Mirrors the CP
// provider SSOT — keep in lock-step with the controlplane provider configs
// (Hetzner ServerType cpx*/cax*, GCP MachineType e2-*, AWS EC2 t3*/m6i*/c6i*).
// TestValidateWorkspaceCompute_Provider / _InstanceTypePerProvider pin the sets.
// "" provider = AWS default.
var workspaceComputeInstanceAllowlist = map[string]map[string]struct{}{
"aws": {
"t3.medium": {}, "t3.large": {}, "t3.xlarge": {}, "t3.2xlarge": {},
"m6i.large": {}, "m6i.xlarge": {}, "c6i.xlarge": {},
},
"hetzner": {
"cpx11": {}, "cpx21": {}, "cpx31": {}, "cpx41": {}, "cpx51": {},
"cax11": {}, "cax21": {}, "cax31": {}, "cax41": {},
},
"gcp": {
"e2-small": {}, "e2-medium": {},
"e2-standard-2": {}, "e2-standard-4": {}, "e2-standard-8": {},
},
}
// normalizeCloudProvider maps "" → "aws" so the in-place switch comparison
// treats the default and an explicit "aws" as the same cloud (no spurious switch).
func normalizeCloudProvider(p string) string {
if p == "" {
return "aws"
}
return p
}
// instanceTypeAllowedForProvider reports whether instanceType is valid for the
// given provider ("" → aws). Empty instanceType is always allowed (CP defaults).
func instanceTypeAllowedForProvider(provider, instanceType string) bool {
if instanceType == "" {
return true
}
p := provider
if p == "" {
p = "aws"
}
set, ok := workspaceComputeInstanceAllowlist[p]
if !ok {
return false
}
_, ok = set[instanceType]
return ok
}
// workspaceComputeProviderAllowlist mirrors the controlplane cloud-provider SSOT
@@ -56,11 +95,24 @@ var workspaceComputeProviderAllowlist = map[string]struct{}{
}
func validateWorkspaceCompute(compute models.WorkspaceCompute) error {
if compute.InstanceType != "" {
if _, ok := workspaceComputeInstanceAllowlist[compute.InstanceType]; !ok {
return fmt.Errorf("unsupported compute.instance_type")
// Provider first (so the instance-type check below can be provider-scoped).
// "" = default (AWS). CP fail-closes an unwired provider with a 422; validating
// here gives a clean 400 before the round-trip and is the gate reused by the
// switch-provider flow. Mirrors the controlplane cloudprovider SSOT.
if compute.Provider != "" {
if _, ok := workspaceComputeProviderAllowlist[compute.Provider]; !ok {
return fmt.Errorf("unsupported compute.provider (want aws|gcp|hetzner)")
}
}
// Instance type must belong to the chosen provider (an AWS t3.* is invalid on
// Hetzner, etc.). Empty = CP default for the provider.
if !instanceTypeAllowedForProvider(compute.Provider, compute.InstanceType) {
prov := compute.Provider
if prov == "" {
prov = "aws"
}
return fmt.Errorf("unsupported compute.instance_type %q for provider %q", compute.InstanceType, prov)
}
if compute.Volume.RootGB != 0 {
if compute.Volume.RootGB < workspaceComputeDiskFloorGB || compute.Volume.RootGB > workspaceComputeDiskCeilingGB {
return fmt.Errorf("compute.volume.root_gb must be between %d and %d", workspaceComputeDiskFloorGB, workspaceComputeDiskCeilingGB)
@@ -87,15 +139,6 @@ func validateWorkspaceCompute(compute models.WorkspaceCompute) error {
default:
return fmt.Errorf("unsupported compute.data_persistence (want persist|ephemeral)")
}
// Cloud backend for the box (multi-provider). "" = default (AWS). CP fail-
// closes an unwired provider with a 422 (PROVIDER_UNAVAILABLE); validating
// here gives a clean 400 before the round-trip and is the gate reused by the
// switch-provider flow. Mirrors the controlplane cloudprovider SSOT.
if compute.Provider != "" {
if _, ok := workspaceComputeProviderAllowlist[compute.Provider]; !ok {
return fmt.Errorf("unsupported compute.provider (want aws|gcp|hetzner)")
}
}
return nil
}
@@ -64,6 +64,40 @@ func TestValidateWorkspaceCompute_Provider(t *testing.T) {
}
}
// Multi-provider / in-place switch: an instance type must belong to the chosen
// provider — an AWS t3.* is meaningless on Hetzner, a cpx* on AWS, etc. Pins the
// provider-keyed allowlist (mirrors the CP provider configs).
func TestValidateWorkspaceCompute_InstanceTypePerProvider(t *testing.T) {
good := []struct{ provider, instance string }{
{"", "t3.medium"}, {"aws", "t3.2xlarge"}, {"aws", "c6i.xlarge"},
{"hetzner", "cpx31"}, {"hetzner", "cax41"},
{"gcp", "e2-standard-2"}, {"gcp", "e2-small"},
{"hetzner", ""}, {"gcp", ""}, // empty instance = CP default, always ok
}
for _, g := range good {
c := models.WorkspaceCompute{Provider: g.provider, InstanceType: g.instance}
if err := validateWorkspaceCompute(c); err != nil {
t.Errorf("provider=%q instance=%q must be accepted: %v", g.provider, g.instance, err)
}
}
bad := []struct{ provider, instance string }{
{"hetzner", "t3.medium"}, // AWS type on Hetzner
{"aws", "cpx31"}, // Hetzner type on AWS
{"gcp", "t3.large"}, // AWS type on GCP
{"hetzner", "e2-small"}, // GCP type on Hetzner
{"", "cpx31"}, // default(aws) + Hetzner type
}
for _, b := range bad {
c := models.WorkspaceCompute{Provider: b.provider, InstanceType: b.instance}
if err := validateWorkspaceCompute(c); err == nil {
t.Errorf("provider=%q instance=%q must be rejected (cross-provider instance type)", b.provider, b.instance)
}
}
if normalizeCloudProvider("") != "aws" || normalizeCloudProvider("hetzner") != "hetzner" {
t.Fatal("normalizeCloudProvider: \"\" must map to aws; explicit providers unchanged")
}
}
// internal#734: data_persistence enum. "" (auto), "persist", "ephemeral" are
// the only accepted values; anything else is a clear 400 before the CP call.
func TestValidateWorkspaceCompute_DataPersistence(t *testing.T) {
@@ -164,6 +164,7 @@ func (h *WorkspaceHandler) Update(c *gin.Context) {
}
}
var computeJSON string
var newComputeProvider string // hoisted: drives the cloud-provider switch detection below
computePatch := false
if rawCompute, ok := body["compute"]; ok {
computePatch = true
@@ -184,6 +185,7 @@ func (h *WorkspaceHandler) Update(c *gin.Context) {
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
return
}
newComputeProvider = compute.Provider
encoded, err := workspaceComputeJSON(compute)
if err != nil {
log.Printf("Update compute encode error for %s: %v", id, err)
@@ -262,6 +264,55 @@ func (h *WorkspaceHandler) Update(c *gin.Context) {
needsRestart = true
}
if computePatch {
// Cloud-provider SWITCH (in-place): if the incoming provider differs from
// the one currently stored, the existing box lives on the OLD cloud. We
// MUST deprovision it on the OLD provider BEFORE overwriting compute —
// otherwise the subsequent "Save & Restart" restart's provider-aware
// deprovision (cpProv.Stop → resolveProvider reads compute->>'provider')
// would target the NEW cloud and ORPHAN the old box (a silently-billing
// leak). Cloud mode only (the local Docker provisioner has no cross-cloud
// concept; provider stays "" there so this never fires). After this, the
// canvas's restart provisions the box on the new cloud; its own Stop is a
// safe no-op (the box is already gone).
if h.cpProv != nil {
var oldProvider sql.NullString
err := db.DB.QueryRowContext(ctx, `SELECT compute->>'provider' FROM workspaces WHERE id = $1`, id).Scan(&oldProvider)
// FAIL-CLOSED on the read. The earlier `err == nil` gate was fail-OPEN:
// a transient/unexpected DB error here skipped the whole switch block and
// fell through to the compute UPDATE — so during a real switch the later
// provider-aware restart deprovision would target the NEW cloud and ORPHAN
// the old box (silent billing, unrecoverable). We cannot tell whether this
// is a cross-cloud switch without the old provider, so on any error other
// than "no such row" we abort exactly like a failed deprovision: compute
// untouched, old box still recoverable, user retries. (sql.ErrNoRows means
// there is genuinely no prior box — nothing to orphan — so it's safe to
// skip the switch and let the UPDATE proceed.)
if err != nil && !errors.Is(err, sql.ErrNoRows) {
log.Printf("Update: provider-switch precheck for %s ABORTED — could not read current cloud provider (provider left unchanged): %v", id, err)
c.JSON(http.StatusBadGateway, gin.H{"error": "could not read the current cloud provider; provider unchanged — please retry"})
return
}
if err == nil && normalizeCloudProvider(oldProvider.String) != normalizeCloudProvider(newComputeProvider) {
log.Printf("Update: cloud-provider switch for %s: %q -> %q; deprovisioning old box on old provider before overwriting compute",
id, normalizeCloudProvider(oldProvider.String), normalizeCloudProvider(newComputeProvider))
// Use the ERROR-returning variant and ABORT before overwriting
// compute if the old-box deprovision fails. If we proceeded, the
// old box would keep running on the OLD cloud while the row now
// records the NEW provider+instance — stranding it with no DB
// pointer (an UNRECOVERABLE cross-cloud orphan that no reconciler
// can map back). Aborting leaves the row pointing at the
// still-recoverable old box; the user can retry the switch. (The
// restart paths' void cpStopWithRetry is fine there because the
// box stays on the SAME cloud, so the provider record is unchanged
// and a provider-scoped sweep can still find it.)
if err := h.cpStopWithRetryErr(ctx, id, "provider-switch", false); err != nil {
log.Printf("Update: provider-switch for %s ABORTED — could not deprovision old box on %q (provider left unchanged, old box recoverable): %v",
id, normalizeCloudProvider(oldProvider.String), err)
c.JSON(http.StatusBadGateway, gin.H{"error": "could not deprovision the current cloud box; provider unchanged — please retry"})
return
}
}
}
if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET compute = $2::jsonb, updated_at = now() WHERE id = $1`, id, computeJSON); err != nil {
log.Printf("Update compute error for %s: %v", id, err)
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save compute config"})
@@ -0,0 +1,180 @@
package handlers
// workspace_provider_switch_test.go — deterministic coverage for the in-place
// cloud-provider switch in the Update (PATCH /workspaces/:id) handler.
//
// The switch is DESTRUCTIVE (it recreates the box on a new cloud) and its
// safety hinges on ORDER + ABORT, which these tests pin without touching a real
// cloud (sqlmock DB + the scriptedCPStop fake from workspace_restart_stop_retry_test):
//
// 1. On a provider change, the OLD box is deprovisioned (cpProv.Stop) BEFORE
// the compute row is overwritten — otherwise the later restart's
// provider-aware deprovision would target the NEW cloud and ORPHAN the old
// (still-billing) box. The sqlmock query ORDER pins "read old provider →
// [Stop] → UPDATE compute".
// 2. If the old-box deprovision FAILS, the handler ABORTS (502) and does NOT
// overwrite compute — leaving the row pointed at the recoverable old box
// (an unexpected UPDATE would fail sqlmock's expectations).
// 3. A non-switch compute edit (same provider) does NOT deprovision anything.
// 4. If the old-provider READ errors (transient DB fault, not sql.ErrNoRows),
// the handler FAILS CLOSED: aborts (502), deprovisions nothing, and does NOT
// overwrite compute — closing the fail-open read path that would otherwise
// orphan the old box on a real switch (security review RC 9895).
import (
"bytes"
"fmt"
"net/http"
"net/http/httptest"
"testing"
"github.com/DATA-DOG/go-sqlmock"
"github.com/gin-gonic/gin"
)
func newPatchContext(t *testing.T, id, body string) (*gin.Context, *httptest.ResponseRecorder) {
t.Helper()
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: id}}
req := httptest.NewRequest("PATCH", "/workspaces/"+id, bytes.NewBufferString(body))
req.Header.Set("Content-Type", "application/json")
c.Request = req
return c, w
}
const switchTestWSID = "cccccccc-0001-0000-0000-000000000000"
func newSwitchTestHandler(t *testing.T, cp *scriptedCPStop) *WorkspaceHandler {
t.Helper()
h := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
h.cpProv = cp
return h
}
// 1. aws → hetzner: deprovision the OLD box, THEN overwrite compute (200).
func TestWorkspaceUpdate_ProviderSwitch_DeprovisionsOldBeforeUpdate(t *testing.T) {
mock := setupTestDB(t)
cp := &scriptedCPStop{} // Stop succeeds
h := newSwitchTestHandler(t, cp)
// Ordered expectations pin: EXISTS → read OLD provider (aws) → UPDATE compute.
// The cpProv.Stop deprovision runs (in code) AFTER the provider read and
// BEFORE the UPDATE — exactly the orphan-safe order.
mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
WillReturnRows(sqlmock.NewRows([]string{"provider"}).AddRow("aws"))
mock.ExpectExec("UPDATE workspaces SET compute").
WillReturnResult(sqlmock.NewResult(0, 1))
c, w := newPatchContext(t, switchTestWSID,
`{"compute":{"instance_type":"cpx31","provider":"hetzner","volume":{"root_gb":30}}}`)
h.Update(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200 on a successful switch, got %d: %s", w.Code, w.Body.String())
}
if cp.calls != 1 {
t.Fatalf("expected the OLD box to be deprovisioned exactly once on a provider switch; got %d Stop calls", cp.calls)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet/unexpected DB queries (ordering broken?): %v", err)
}
}
// 2. Deprovision FAILS → abort (502) + compute NOT overwritten (no UPDATE).
func TestWorkspaceUpdate_ProviderSwitch_AbortsWhenDeprovisionFails(t *testing.T) {
shrinkRetryBackoff(t) // don't burn the 1s/2s/4s retry backoff
mock := setupTestDB(t)
// All retry attempts fail → cpStopWithRetryErr returns an error → abort.
cp := &scriptedCPStop{errs: []error{
fmt.Errorf("cp 503"), fmt.Errorf("cp 503"), fmt.Errorf("cp 503"),
}}
h := newSwitchTestHandler(t, cp)
mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
WillReturnRows(sqlmock.NewRows([]string{"provider"}).AddRow("aws"))
// NO UPDATE expectation: if the handler overwrote compute after a failed
// deprovision (the orphan bug), sqlmock would flag the unexpected query.
c, w := newPatchContext(t, switchTestWSID,
`{"compute":{"instance_type":"cpx31","provider":"hetzner","volume":{"root_gb":30}}}`)
h.Update(c)
if w.Code != http.StatusBadGateway {
t.Fatalf("expected 502 when the old-box deprovision fails, got %d: %s", w.Code, w.Body.String())
}
if cp.calls == 0 {
t.Fatal("expected at least one Stop attempt before aborting")
}
if err := mock.ExpectationsWereMet(); err != nil {
// A failure here means an UNEXPECTED UPDATE ran — i.e. compute was
// overwritten after a failed deprovision → the orphan bug is back.
t.Fatalf("compute must NOT be overwritten when deprovision fails (orphan-prevention): %v", err)
}
}
// 3. Same provider (no switch): no deprovision; compute is updated normally.
func TestWorkspaceUpdate_NoProviderSwitch_DoesNotDeprovision(t *testing.T) {
mock := setupTestDB(t)
cp := &scriptedCPStop{}
h := newSwitchTestHandler(t, cp)
mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
WillReturnRows(sqlmock.NewRows([]string{"provider"}).AddRow("aws"))
mock.ExpectExec("UPDATE workspaces SET compute").
WillReturnResult(sqlmock.NewResult(0, 1))
// provider stays aws (only the instance size changes) → no switch, no Stop.
c, w := newPatchContext(t, switchTestWSID,
`{"compute":{"instance_type":"t3.large","provider":"aws","volume":{"root_gb":60}}}`)
h.Update(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
}
if cp.calls != 0 {
t.Fatalf("a non-switching compute edit must NOT deprovision the box; got %d Stop calls", cp.calls)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Fatalf("unmet/unexpected DB queries: %v", err)
}
}
// 4. Provider READ errors (transient DB fault) → fail-CLOSED: abort 502, no
// deprovision, no compute overwrite. A fail-open read (the old `err == nil`
// gate) would skip switch detection and overwrite compute → orphan the old
// cloud box. sqlmock has NO UPDATE/Stop expectations, so either an overwrite
// or a stray deprovision trips it.
func TestWorkspaceUpdate_ProviderSwitch_AbortsOnProviderReadError(t *testing.T) {
mock := setupTestDB(t)
cp := &scriptedCPStop{}
h := newSwitchTestHandler(t, cp)
mock.ExpectQuery("SELECT EXISTS").WithArgs(switchTestWSID).
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
// The old-provider read hits a transient error (NOT sql.ErrNoRows).
mock.ExpectQuery("compute->>'provider'").WithArgs(switchTestWSID).
WillReturnError(fmt.Errorf("connection reset by peer"))
c, w := newPatchContext(t, switchTestWSID,
`{"compute":{"instance_type":"cpx31","provider":"hetzner","volume":{"root_gb":30}}}`)
h.Update(c)
if w.Code != http.StatusBadGateway {
t.Fatalf("expected 502 when the provider read fails (fail-closed), got %d: %s", w.Code, w.Body.String())
}
if cp.calls != 0 {
t.Fatalf("must NOT deprovision when the current provider can't be read; got %d Stop calls", cp.calls)
}
if err := mock.ExpectationsWereMet(); err != nil {
// An unexpected UPDATE here = compute was overwritten despite an unreadable
// provider → the fail-open orphan path is back.
t.Fatalf("compute must NOT be overwritten on a provider read error (fail-closed): %v", err)
}
}