diff --git a/canvas/src/components/Canvas.tsx b/canvas/src/components/Canvas.tsx
index b3bae917..16335608 100644
--- a/canvas/src/components/Canvas.tsx
+++ b/canvas/src/components/Canvas.tsx
@@ -30,6 +30,8 @@ import { SearchDialog } from "./SearchDialog";
 import { Toaster } from "./Toaster";
 import { Toolbar } from "./Toolbar";
 import { ConfirmDialog } from "./ConfirmDialog";
+import { api } from "@/lib/api";
+import { showToast } from "./Toaster";
 // Phase 20 components
 import { SettingsPanel, DeleteConfirmDialog } from "./settings";
 // Phase 20.3 batch operations
@@ -96,6 +98,24 @@ function CanvasInner() {
   // Confirmation dialog state for structure changes
   const [pendingNest, setPendingNest] = useState<{ nodeId: string; targetId: string | null; nodeName: string; targetName: string } | null>(null);
 
+  // Delete-confirmation lives in the store so the dialog survives ContextMenu
+  // unmounting — the prior local-in-ContextMenu state raced with the menu's
+  // outside-click handler (the portal-rendered Confirm button counted as
+  // "outside" and closed the menu, killing the dialog mid-click).
+  const pendingDelete = useCanvasStore((s) => s.pendingDelete);
+  const setPendingDelete = useCanvasStore((s) => s.setPendingDelete);
+  const removeNode = useCanvasStore((s) => s.removeNode);
+  const confirmDelete = useCallback(async () => {
+    if (!pendingDelete) return;
+    const { id } = pendingDelete;
+    setPendingDelete(null);
+    try {
+      await api.del(`/workspaces/${id}?confirm=true`);
+      removeNode(id);
+    } catch (e) {
+      showToast(e instanceof Error ? e.message : "Delete failed", "error");
+    }
+  }, [pendingDelete, setPendingDelete, removeNode]);
 
   const onNodeDragStop: OnNodeDrag> = useCallback(
     (_event, node) => {
@@ -358,6 +378,17 @@ function CanvasInner() {
         onCancel={cancelNest}
       />
 
+      {/* Confirmation dialog for workspace delete — driven by store */}
+       setPendingDelete(null)}
+      />
+
       {/* Settings Panel — global secrets management drawer */}
diff --git a/canvas/src/components/ContextMenu.tsx b/canvas/src/components/ContextMenu.tsx
index c03fb8fa..4211b7b4 100644
--- a/canvas/src/components/ContextMenu.tsx
+++ b/canvas/src/components/ContextMenu.tsx
@@ -4,7 +4,6 @@ import { useCallback, useEffect, useRef, useState } from "react";
 import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
 import { api } from "@/lib/api";
 import { showToast } from "./Toaster";
-import { ConfirmDialog } from "./ConfirmDialog";
 import { statusDotClass } from "@/lib/design-tokens";
 
 interface MenuItem {
@@ -26,15 +25,10 @@ export function ContextMenu() {
   const nestNode = useCanvasStore((s) => s.nestNode);
   const contextNodeId = contextMenu?.nodeId ?? null;
   const hasChildren = useCanvasStore((s) => contextNodeId ? s.nodes.some((n) => n.data.parentId === contextNodeId) : false);
+  const setPendingDelete = useCanvasStore((s) => s.setPendingDelete);
   const ref = useRef(null);
-  const [deleteConfirm, setDeleteConfirm] = useState<{ id: string; name: string } | null>(null);
   const [actionLoading, setActionLoading] = useState(false);
 
-  // Clear orphaned dialog state when context menu closes
-  useEffect(() => {
-    if (!contextMenu) setDeleteConfirm(null);
-  }, [contextMenu]);
-
   // Auto-focus first enabled item when menu opens
   useEffect(() => {
     if (!contextMenu) return;
@@ -167,21 +161,13 @@ export function ContextMenu() {
   const handleDelete = useCallback(() => {
     if (!contextMenu) return;
-    // Don't close context menu yet — keep it mounted so ConfirmDialog renders
-    setDeleteConfirm({ id: contextMenu.nodeId, name: contextMenu.nodeData.name });
-  }, [contextMenu]);
-
-  const confirmDelete = useCallback(async () => {
-    if (!deleteConfirm) return;
-    try {
-      await api.del(`/workspaces/${deleteConfirm.id}`);
-      removeNode(deleteConfirm.id);
-    } catch {
-      showToast("Delete failed", "error");
-    }
-    setDeleteConfirm(null);
+    // Hoist delete confirmation to the Canvas-level dialog (via store) so
+    // it survives ContextMenu unmount. Closing the menu here avoids the
+    // prior race where the portal dialog's Confirm click was treated as
+    // "outside" by the menu's outside-click handler.
+    setPendingDelete({ id: contextMenu.nodeId, name: contextMenu.nodeData.name });
     closeContextMenu();
-  }, [deleteConfirm, removeNode, closeContextMenu]);
+  }, [contextMenu, setPendingDelete, closeContextMenu]);
 
   const handleViewDetails = useCallback(() => {
     if (!contextMenu) return;
@@ -317,17 +303,6 @@ export function ContextMenu() {
         );
       })}
-
-      {/* Delete confirmation dialog */}
-        { setDeleteConfirm(null); closeContextMenu(); }}
-      />
   );
 }
diff --git a/canvas/src/components/WorkspaceUsage.tsx b/canvas/src/components/WorkspaceUsage.tsx
index 5ef629d4..e8625542 100644
--- a/canvas/src/components/WorkspaceUsage.tsx
+++ b/canvas/src/components/WorkspaceUsage.tsx
@@ -78,17 +78,17 @@ export function WorkspaceUsage({ workspaceId }: WorkspaceUsageProps) {
     <>
diff --git a/canvas/src/components/__tests__/BudgetSection.test.tsx b/canvas/src/components/__tests__/BudgetSection.test.tsx
index b0094829..5818972f 100644
--- a/canvas/src/components/__tests__/BudgetSection.test.tsx
+++ b/canvas/src/components/__tests__/BudgetSection.test.tsx
@@ -154,6 +154,16 @@ describe("BudgetSection — stats row", () => {
     await renderLoaded(budgetResponse({ budget_remaining: null }));
     expect(screen.queryByTestId("budget-remaining")).toBeNull();
   });
+
+  it("does not crash when budget_used is missing from the response", async () => {
+    // Backend for a provisioning-stuck workspace may return a partial
+    // shape. Regression: previously this threw
+    // "Cannot read properties of undefined (reading 'toLocaleString')"
+    // and crashed the whole Details tab.
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    await renderLoaded({ budget_limit: 1000, budget_remaining: null } as any);
+    expect(screen.getByTestId("budget-used-value").textContent).toBe("0");
+  });
 });
 
 // ── Progress bar ──────────────────────────────────────────────────────────────
diff --git a/canvas/src/components/__tests__/ContextMenu.keyboard.test.tsx b/canvas/src/components/__tests__/ContextMenu.keyboard.test.tsx
index 8e878de2..b37a0e4f 100644
--- a/canvas/src/components/__tests__/ContextMenu.keyboard.test.tsx
+++ b/canvas/src/components/__tests__/ContextMenu.keyboard.test.tsx
@@ -45,6 +45,7 @@ const mockStore = {
   selectNode: vi.fn(),
   setPanelTab: vi.fn(),
   nestNode: vi.fn(),
+  setPendingDelete: vi.fn(),
   nodes: [] as Array<{ id: string; data: { parentId: string | null } }>,
 };
 
@@ -209,4 +210,23 @@ describe("ContextMenu — keyboard accessibility", () => {
     fireEvent.click(zoomItem);
     expect(closeContextMenu).toHaveBeenCalled();
   });
+
+  // Regression: the old flow kept ConfirmDialog inside ContextMenu's local
+  // state and rendered it via a portal. The portal-rendered Confirm button
+  // counted as "outside" by the menu's outside-click handler, closing the
+  // menu mid-click and making Delete appear to do nothing. The fix hoists
+  // the dialog state to the canvas store via `setPendingDelete` AND closes
+  // the context menu on click, so the dialog is owned by a component that
+  // outlives the menu.
+  it("clicking 'Delete' hoists state to the store and closes the menu", () => {
+    render(<ContextMenu />);
+    const items = screen.getAllByRole("menuitem");
+    const deleteItem = items.find((el) => el.textContent?.includes("Delete"))!;
+    fireEvent.click(deleteItem);
+    expect(mockStore.setPendingDelete).toHaveBeenCalledWith({
+      id: "ws-1",
+      name: "Alpha Workspace",
+    });
+    expect(closeContextMenu).toHaveBeenCalled();
+  });
 });
diff --git a/canvas/src/components/__tests__/WorkspaceUsage.test.tsx b/canvas/src/components/__tests__/WorkspaceUsage.test.tsx
index d40deac8..36f21333 100644
--- a/canvas/src/components/__tests__/WorkspaceUsage.test.tsx
+++ b/canvas/src/components/__tests__/WorkspaceUsage.test.tsx
@@ -84,6 +84,20 @@ describe("WorkspaceUsage", () => {
     });
   });
 
+  it("does not crash when token/cost fields are missing", async () => {
+    // Regression: Details tab crashed with
+    // "Cannot read properties of undefined (reading 'toLocaleString')"
+    // when a workspace stuck in provisioning returned partial metrics.
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    mockGet.mockResolvedValue({} as any);
+    render();
+    await waitFor(() => {
+      expect(screen.getByTestId("usage-input-tokens").textContent).toContain("0");
+      expect(screen.getByTestId("usage-output-tokens").textContent).toContain("0");
+      expect(screen.getByTestId("usage-estimated-cost").textContent).toBe("Estimated cost$0.000000");
+    });
+  });
+
   it("displays estimated cost formatted as $X.XXXXXX after load", async () => {
     // eslint-disable-next-line @typescript-eslint/no-explicit-any
     mockGet.mockResolvedValue(METRICS_RESPONSE as any);
diff --git a/canvas/src/components/tabs/BudgetSection.tsx b/canvas/src/components/tabs/BudgetSection.tsx
index 24fbe404..db2d7040 100644
--- a/canvas/src/components/tabs/BudgetSection.tsx
+++ b/canvas/src/components/tabs/BudgetSection.tsx
@@ -171,7 +171,7 @@ export function BudgetSection({ workspaceId }: Props) {
           Credits used
-          {budget.budget_used.toLocaleString()}
+          {(budget.budget_used ?? 0).toLocaleString()}
           /
           {budget.budget_limit != null
diff --git a/canvas/src/store/canvas.ts b/canvas/src/store/canvas.ts
index c19e1011..f0c73b6d 100644
--- a/canvas/src/store/canvas.ts
+++ b/canvas/src/store/canvas.ts
@@ -66,6 +66,14 @@ interface CanvasState {
   isDescendant: (ancestorId: string, nodeId: string) => boolean;
   openContextMenu: (menu: ContextMenuState) => void;
   closeContextMenu: () => void;
+  // Pending delete confirmation — lives in the store (not inside ContextMenu's
+  // local state) so the confirm dialog survives ContextMenu unmounting. The
+  // ContextMenu's portal-rendered dialog used to race with its outside-click
+  // handler: clicking Confirm registered as "outside", closed the menu, and
+  // unmounted the dialog before its onClick fired. Hoisting the state fixes
+  // that — see fix/context-menu-delete-race.
+  pendingDelete: { id: string; name: string } | null;
+  setPendingDelete: (v: { id: string; name: string } | null) => void;
   searchOpen: boolean;
   setSearchOpen: (open: boolean) => void;
   viewport: { x: number; y: number; zoom: number };
@@ -158,6 +166,8 @@ export const useCanvasStore = create((set, get) => ({
   selectNode: (id) => set({ selectedNodeId: id }),
   openContextMenu: (menu) => set({ contextMenu: menu }),
   closeContextMenu: () => set({ contextMenu: null }),
+  pendingDelete: null,
+  setPendingDelete: (v) => set({ pendingDelete: v }),
   searchOpen: false,
   setSearchOpen: (open) => set({ searchOpen: open }),
   agentMessages: {},
diff --git a/workspace-server/cmd/server/main.go b/workspace-server/cmd/server/main.go
index 3855a859..6ab47cc4 100644
--- a/workspace-server/cmd/server/main.go
+++ b/workspace-server/cmd/server/main.go
@@ -191,6 +191,15 @@ func main() {
 		})
 	}
 
+	// Provision-timeout sweep — flips workspaces that have been stuck in
+	// status='provisioning' past the timeout window to 'failed' and emits
+	// WORKSPACE_PROVISION_TIMEOUT. Without this the UI banner is cosmetic
+	// and the state is incoherent (e.g. user sees "Retry" after 15min but
+	// backend still thinks provisioning is in progress).
+	go supervised.RunWithRecover(ctx, "provision-timeout-sweep", func(c context.Context) {
+		registry.StartProvisioningTimeoutSweep(c, broadcaster, registry.DefaultProvisionSweepInterval)
+	})
+
 	// Cron Scheduler — fires A2A messages to workspaces on user-defined schedules
 	cronSched := scheduler.New(wh, broadcaster)
 	go supervised.RunWithRecover(ctx, "scheduler", cronSched.Start)
diff --git a/workspace-server/internal/handlers/workspace_preflight.go b/workspace-server/internal/handlers/workspace_preflight.go
new file mode 100644
index 00000000..837786c7
--- /dev/null
+++ b/workspace-server/internal/handlers/workspace_preflight.go
@@ -0,0 +1,70 @@
+package handlers
+
+import (
+	"fmt"
+
+	"gopkg.in/yaml.v3"
+)
+
+// requiredEnvSchema is the subset of config.yaml we read to decide which env
+// vars must be present before a container launch. It maps the YAML path
+// `runtime_config.required_env: [...]` which is the same shape the workspace
+// adapter's preflight reads inside the container (workspace/preflight.py).
+//
+// Mirroring the check server-side lets us fail fast with a readable error
+// instead of letting the container crash-loop and the workspace sit in
+// `provisioning` until a sweeper or the user intervenes.
+type requiredEnvSchema struct {
+	RuntimeConfig struct {
+		RequiredEnv []string `yaml:"required_env"`
+	} `yaml:"runtime_config"`
+}
+
+// missingRequiredEnv returns the list of env var names declared in the
+// workspace's config.yaml under `runtime_config.required_env` that are NOT
+// present (or are empty) in the assembled envVars map. Returns an empty
+// slice when the config declares no requirements or when all are satisfied.
+//
+// A parse failure returns no missing vars — config.yaml shape is enforced by
+// the in-container preflight, and the server's job here is only to catch the
+// common "forgot to add the OAuth token secret" footgun, not to be a second
+// config validator.
+func missingRequiredEnv(configFiles map[string][]byte, envVars map[string]string) []string {
+	if len(configFiles) == 0 {
+		return nil
+	}
+	raw, ok := configFiles["config.yaml"]
+	if !ok || len(raw) == 0 {
+		return nil
+	}
+	var schema requiredEnvSchema
+	if err := yaml.Unmarshal(raw, &schema); err != nil {
+		return nil
+	}
+	if len(schema.RuntimeConfig.RequiredEnv) == 0 {
+		return nil
+	}
+	var missing []string
+	for _, name := range schema.RuntimeConfig.RequiredEnv {
+		if v, ok := envVars[name]; !ok || v == "" {
+			missing = append(missing, name)
+		}
+	}
+	return missing
+}
+
+// formatMissingEnvError builds the user-facing message for a provision
+// failure caused by unset required env vars. Kept stable because it's
+// rendered verbatim in the canvas Events tab and Details banner.
+func formatMissingEnvError(missing []string) string {
+	if len(missing) == 1 {
+		return fmt.Sprintf(
+			"missing required env var %q — add it under Config → Env Vars (or as a Global secret) and retry",
+			missing[0],
+		)
+	}
+	return fmt.Sprintf(
+		"missing required env vars %q — add them under Config → Env Vars (or as Global secrets) and retry",
+		missing,
+	)
+}
diff --git a/workspace-server/internal/handlers/workspace_preflight_test.go b/workspace-server/internal/handlers/workspace_preflight_test.go
new file mode 100644
index 00000000..fcb865af
--- /dev/null
+++ b/workspace-server/internal/handlers/workspace_preflight_test.go
@@ -0,0 +1,124 @@
+package handlers
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestMissingRequiredEnv_NoConfig(t *testing.T) {
+	// Zero configFiles → nothing to validate → no missing.
+	if got := missingRequiredEnv(nil, map[string]string{}); got != nil {
+		t.Errorf("nil configFiles: got %v, want nil", got)
+	}
+	if got := missingRequiredEnv(map[string][]byte{}, map[string]string{}); got != nil {
+		t.Errorf("empty configFiles: got %v, want nil", got)
+	}
+}
+
+func TestMissingRequiredEnv_NoConfigYaml(t *testing.T) {
+	// A map without config.yaml → no schema → no missing.
+	files := map[string][]byte{
+		"other.txt": []byte("irrelevant"),
+	}
+	if got := missingRequiredEnv(files, map[string]string{}); got != nil {
+		t.Errorf("no config.yaml: got %v, want nil", got)
+	}
+}
+
+func TestMissingRequiredEnv_NoRequiredEnvInYaml(t *testing.T) {
+	// config.yaml without runtime_config.required_env → no missing.
+	// Mirrors the default config emitted by ensureDefaultConfig (see the
+	// #1028 comment in workspace_provision.go about why required_env is
+	// intentionally omitted for auto-generated configs).
+	yml := `
+name: example
+runtime: langgraph
+runtime_config:
+  timeout: 0
+`
+	files := map[string][]byte{"config.yaml": []byte(yml)}
+	if got := missingRequiredEnv(files, map[string]string{}); got != nil {
+		t.Errorf("no required_env in YAML: got %v, want nil", got)
+	}
+}
+
+func TestMissingRequiredEnv_AllSatisfied(t *testing.T) {
+	yml := `
+runtime: claude-code
+runtime_config:
+  required_env:
+    - CLAUDE_CODE_OAUTH_TOKEN
+    - ANTHROPIC_API_KEY
+`
+	files := map[string][]byte{"config.yaml": []byte(yml)}
+	env := map[string]string{
+		"CLAUDE_CODE_OAUTH_TOKEN": "sk-ant-oat01-...",
+		"ANTHROPIC_API_KEY":       "sk-ant-api-...",
+	}
+	if got := missingRequiredEnv(files, env); got != nil {
+		t.Errorf("all set: got %v, want nil", got)
+	}
+}
+
+func TestMissingRequiredEnv_OneMissing(t *testing.T) {
+	// Reproduces the reported issue: Claude Code Agent config declares
+	// CLAUDE_CODE_OAUTH_TOKEN required but the tenant only has
+	// ANTHROPIC_API_KEY set globally.
+	yml := `
+runtime: claude-code
+runtime_config:
+  required_env:
+    - CLAUDE_CODE_OAUTH_TOKEN
+    - ANTHROPIC_API_KEY
+`
+	files := map[string][]byte{"config.yaml": []byte(yml)}
+	env := map[string]string{
+		"ANTHROPIC_API_KEY": "sk-ant-...",
+	}
+	got := missingRequiredEnv(files, env)
+	if len(got) != 1 || got[0] != "CLAUDE_CODE_OAUTH_TOKEN" {
+		t.Errorf("expected [CLAUDE_CODE_OAUTH_TOKEN], got %v", got)
+	}
+}
+
+func TestMissingRequiredEnv_EmptyStringCountsAsMissing(t *testing.T) {
+	// A secret row with empty value is effectively unset; the in-container
+	// preflight treats empty string as missing, so the server must match.
+	yml := `
+runtime_config:
+  required_env: [FOO]
+`
+	files := map[string][]byte{"config.yaml": []byte(yml)}
+	env := map[string]string{"FOO": ""}
+	got := missingRequiredEnv(files, env)
+	if len(got) != 1 || got[0] != "FOO" {
+		t.Errorf("expected [FOO], got %v", got)
+	}
+}
+
+func TestMissingRequiredEnv_MalformedYamlReturnsNil(t *testing.T) {
+	// Malformed YAML should not panic and should not block provisioning —
+	// the in-container preflight is the source of truth for config.yaml
+	// shape, and we don't want to double-fail on parse quirks.
+	files := map[string][]byte{"config.yaml": []byte("{ not: valid: yaml: [[")}
+	if got := missingRequiredEnv(files, map[string]string{}); got != nil {
+		t.Errorf("malformed YAML: got %v, want nil", got)
+	}
+}
+
+func TestFormatMissingEnvError_Single(t *testing.T) {
+	msg := formatMissingEnvError([]string{"CLAUDE_CODE_OAUTH_TOKEN"})
+	if !strings.Contains(msg, "CLAUDE_CODE_OAUTH_TOKEN") {
+		t.Errorf("message should name the var: %q", msg)
+	}
+	if !strings.Contains(msg, "retry") {
+		t.Errorf("message should tell user how to fix it: %q", msg)
+	}
+}
+
+func TestFormatMissingEnvError_Multiple(t *testing.T) {
+	msg := formatMissingEnvError([]string{"A", "B"})
+	if !strings.Contains(msg, "A") || !strings.Contains(msg, "B") {
+		t.Errorf("message should name both vars: %q", msg)
+	}
+}
diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go
index 92b9c55f..58ca1403 100644
--- a/workspace-server/internal/handlers/workspace_provision.go
+++ b/workspace-server/internal/handlers/workspace_provision.go
@@ -114,6 +114,27 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
 		return
 	}
 
+	// Preflight: refuse to launch when config.yaml declares required env vars
+	// that are not set. Without this, a missing CLAUDE_CODE_OAUTH_TOKEN (or
+	// similar) crashes the in-container preflight, the container never calls
+	// /registry/register, and the workspace sits in `provisioning` until a
+	// sweeper flips it or the user retries. Failing fast here gives the user
+	// an immediate, actionable error in the Events tab.
+	if missing := missingRequiredEnv(configFiles, envVars); len(missing) > 0 {
+		msg := formatMissingEnvError(missing)
+		log.Printf("Provisioner: %s (workspace=%s)", msg, workspaceID)
+		if _, dbErr := db.DB.ExecContext(ctx,
+			`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
+			workspaceID, msg); dbErr != nil {
+			log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
+		}
+		h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
+			"error":   msg,
+			"missing": missing,
+		})
+		return
+	}
+
 	cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
 	cfg.ResetClaudeSession = resetClaudeSession // #12
diff --git a/workspace-server/internal/registry/provisiontimeout.go b/workspace-server/internal/registry/provisiontimeout.go
new file mode 100644
index 00000000..32e96142
--- /dev/null
+++ b/workspace-server/internal/registry/provisiontimeout.go
@@ -0,0 +1,137 @@
+package registry
+
+import (
+	"context"
+	"log"
+	"os"
+	"strconv"
+	"time"
+
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
+)
+
+// ProvisionTimeoutEmitter is the narrow broadcaster dependency the sweeper
+// needs. Defined locally so the registry package stays event-bus agnostic
+// (same pattern as OfflineHandler in healthsweep.go).
+type ProvisionTimeoutEmitter interface {
+	RecordAndBroadcast(ctx context.Context, eventType string, workspaceID string, payload interface{}) error
+}
+
+// DefaultProvisioningTimeout is how long a workspace may sit in
+// status='provisioning' before the sweeper flips it to 'failed'. The
+// container-launch path has its own 3-minute context timeout
+// (provisioner.ProvisionTimeout) but that only bounds the docker API call —
+// a container that started but crashes before /registry/register never
+// triggers that path and would sit in provisioning forever. 10 minutes
+// covers pathological image-pull + user-data execution on a cold EC2 worker
+// while still getting well ahead of the "15+ minute" stuck state users see
+// in production.
+const DefaultProvisioningTimeout = 10 * time.Minute
+
+// DefaultProvisionSweepInterval is how often the sweeper polls. Same cadence
+// as the hibernation monitor — cheap and bounded by the provisioning-state
+// query which hits the primary key / status partial index.
+const DefaultProvisionSweepInterval = 30 * time.Second
+
+// provisioningTimeout reads the override from env, falling back to the
+// default. Env var expressed in seconds so operators can tune via a normal
+// container restart without a code change.
+func provisioningTimeout() time.Duration {
+	if v := os.Getenv("PROVISION_TIMEOUT_SECONDS"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n > 0 {
+			return time.Duration(n) * time.Second
+		}
+	}
+	return DefaultProvisioningTimeout
+}
+
+// StartProvisioningTimeoutSweep periodically scans for workspaces stuck in
+// `status='provisioning'` past the timeout window, flips them to `failed`,
+// and broadcasts a WORKSPACE_PROVISION_TIMEOUT event so the canvas can
+// render a fail-state instead of the indefinite cosmetic "Provisioning
+// Timeout" banner.
+//
+// The sweep is idempotent: the UPDATE's WHERE clause re-checks both status
+// and age under the same row lock, so a workspace that raced to `online` or
+// was restarted while the sweep was scanning will not get flipped.
+func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeoutEmitter, interval time.Duration) {
+	if emitter == nil {
+		log.Println("Provision-timeout sweep: emitter is nil — skipping (no one to broadcast to)")
+		return
+	}
+	if interval <= 0 {
+		interval = DefaultProvisionSweepInterval
+	}
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+
+	log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s)", interval, provisioningTimeout())
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			sweepStuckProvisioning(ctx, emitter)
+		}
+	}
+}
+
+// sweepStuckProvisioning is one tick of the sweeper, split out so package
+// tests can drive a single tick directly. All age comparisons happen in SQL
+// (now()), so tests exercise it deterministically by seeding updated_at
+// rather than manipulating time.
+func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter) {
+	timeout := provisioningTimeout()
+	timeoutSec := int(timeout / time.Second)
+
+	// Read candidates first so the event broadcast can include each id. The
+	// subsequent UPDATE re-checks the predicate to stay race-safe against
+	// concurrent restart / register paths that write updated_at.
+	rows, err := db.DB.QueryContext(ctx, `
+		SELECT id FROM workspaces
+		WHERE status = 'provisioning'
+		  AND updated_at < now() - ($1 || ' seconds')::interval
+	`, timeoutSec)
+	if err != nil {
+		log.Printf("Provision-timeout sweep: query error: %v", err)
+		return
+	}
+	defer rows.Close()
+
+	var ids []string
+	for rows.Next() {
+		var id string
+		if err := rows.Scan(&id); err == nil {
+			ids = append(ids, id)
+		}
+	}
+
+	for _, id := range ids {
+		msg := "provisioning timed out — container never reported online. Check the workspace's required env vars and retry."
+		res, err := db.DB.ExecContext(ctx, `
+			UPDATE workspaces
+			SET status = 'failed',
+			    last_sample_error = $2,
+			    updated_at = now()
+			WHERE id = $1
+			  AND status = 'provisioning'
+			  AND updated_at < now() - ($3 || ' seconds')::interval
+		`, id, msg, timeoutSec)
+		if err != nil {
+			log.Printf("Provision-timeout sweep: failed to flip %s to failed: %v", id, err)
+			continue
+		}
+		affected, _ := res.RowsAffected()
+		if affected == 0 {
+			// Raced with restart / register — no harm, just skip.
+			continue
+		}
+		log.Printf("Provision-timeout sweep: %s stuck in provisioning > %s — marked failed", id, timeout)
+		if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_TIMEOUT", id, map[string]interface{}{
+			"error":        msg,
+			"timeout_secs": timeoutSec,
+		}); emitErr != nil {
+			log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", id, emitErr)
+		}
+	}
+}
diff --git a/workspace-server/internal/registry/provisiontimeout_test.go b/workspace-server/internal/registry/provisiontimeout_test.go
new file mode 100644
index 00000000..07a04a4e
--- /dev/null
+++ b/workspace-server/internal/registry/provisiontimeout_test.go
@@ -0,0 +1,175 @@
+package registry
+
+import (
+	"context"
+	"errors"
+	"sync"
+	"testing"
+
+	"github.com/DATA-DOG/go-sqlmock"
+)
+
+// fakeEmitter records every RecordAndBroadcast call so tests can assert
+// payload shape + emission count. Safe for concurrent use: the sweeper
+// itself runs in a single goroutine, but keeping the lock lets the suite
+// fan out.
+type fakeEmitter struct {
+	mu     sync.Mutex
+	events []emittedEvent
+	fail   bool
+}
+
+type emittedEvent struct {
+	Type        string
+	WorkspaceID string
+	Payload     interface{}
+}
+
+func (f *fakeEmitter) RecordAndBroadcast(_ context.Context, eventType string, workspaceID string, payload interface{}) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.events = append(f.events, emittedEvent{eventType, workspaceID, payload})
+	if f.fail {
+		return errors.New("broadcast boom")
+	}
+	return nil
+}
+
+func (f *fakeEmitter) count() int {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	return len(f.events)
+}
+
+// TestSweepStuckProvisioning_FlipsOverdue verifies the happy path: a stuck
+// provisioning workspace gets flipped to failed AND an event is broadcast.
+func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
+	mock := setupTestDB(t)
+
+	mock.ExpectQuery(`SELECT id FROM workspaces`).
+		WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-stuck"))
+
+	mock.ExpectExec(`UPDATE workspaces`).
+		WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	emit := &fakeEmitter{}
+	sweepStuckProvisioning(context.Background(), emit)
+
+	if emit.count() != 1 {
+		t.Fatalf("expected 1 event, got %d", emit.count())
+	}
+	if emit.events[0].Type != "WORKSPACE_PROVISION_TIMEOUT" {
+		t.Errorf("event type = %q, want WORKSPACE_PROVISION_TIMEOUT", emit.events[0].Type)
+	}
+	if emit.events[0].WorkspaceID != "ws-stuck" {
+		t.Errorf("workspace id = %q, want ws-stuck", emit.events[0].WorkspaceID)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet expectations: %v", err)
+	}
+}
+
+// TestSweepStuckProvisioning_RaceSafe covers the case where UPDATE affects
+// 0 rows because the workspace flipped to online (or got restarted) between
+// the SELECT and the UPDATE. We should skip the event, not emit a false
+// WORKSPACE_PROVISION_TIMEOUT.
+func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
+	mock := setupTestDB(t)
+
+	mock.ExpectQuery(`SELECT id FROM workspaces`).
+		WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-raced"))
+
+	mock.ExpectExec(`UPDATE workspaces`).
+		WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 0)) // 0 rows — raced
+
+	emit := &fakeEmitter{}
+	sweepStuckProvisioning(context.Background(), emit)
+
+	if emit.count() != 0 {
+		t.Errorf("expected 0 events on race, got %d", emit.count())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet expectations: %v", err)
+	}
+}
+
+// TestSweepStuckProvisioning_NoStuck verifies that an empty candidate list
+// produces no events and no update queries.
+func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
+	mock := setupTestDB(t)
+
+	mock.ExpectQuery(`SELECT id FROM workspaces`).
+		WillReturnRows(sqlmock.NewRows([]string{"id"}))
+
+	emit := &fakeEmitter{}
+	sweepStuckProvisioning(context.Background(), emit)
+
+	if emit.count() != 0 {
+		t.Errorf("expected 0 events when nothing stuck, got %d", emit.count())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet expectations: %v", err)
+	}
+}
+
+// TestSweepStuckProvisioning_MultipleStuck covers the realistic case where
+// both agents (claude-code + hermes) are stuck — both should get flipped
+// and both should get events.
+func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
+	mock := setupTestDB(t)
+
+	mock.ExpectQuery(`SELECT id FROM workspaces`).
+		WillReturnRows(sqlmock.NewRows([]string{"id"}).
+			AddRow("ws-claude-code").
+			AddRow("ws-hermes"))
+
+	mock.ExpectExec(`UPDATE workspaces`).
+		WithArgs("ws-claude-code", sqlmock.AnyArg(), sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+	mock.ExpectExec(`UPDATE workspaces`).
+		WithArgs("ws-hermes", sqlmock.AnyArg(), sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	emit := &fakeEmitter{}
+	sweepStuckProvisioning(context.Background(), emit)
+
+	if emit.count() != 2 {
+		t.Fatalf("expected 2 events, got %d", emit.count())
+	}
+}
+
+// TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash ensures the
+// sweeper tolerates a broadcast error (Redis hiccup) — the DB row is
+// already flipped so the state stays coherent.
+func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
+	mock := setupTestDB(t)
+
+	mock.ExpectQuery(`SELECT id FROM workspaces`).
+		WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-stuck"))
+	mock.ExpectExec(`UPDATE workspaces`).
+		WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	emit := &fakeEmitter{fail: true}
+	// Must not panic.
+	sweepStuckProvisioning(context.Background(), emit)
+}
+
+// TestProvisioningTimeout_EnvOverride verifies PROVISION_TIMEOUT_SECONDS
+// env var takes effect when set to a positive integer, and falls back to
+// default otherwise.
+func TestProvisioningTimeout_EnvOverride(t *testing.T) {
+	t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
+	if got := provisioningTimeout(); got.Seconds() != 60 {
+		t.Errorf("override: got %v, want 60s", got)
+	}
+	t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
+	if got := provisioningTimeout(); got != DefaultProvisioningTimeout {
+		t.Errorf("default: got %v, want %v", got, DefaultProvisioningTimeout)
+	}
+	t.Setenv("PROVISION_TIMEOUT_SECONDS", "not-a-number")
+	if got := provisioningTimeout(); got != DefaultProvisioningTimeout {
+		t.Errorf("bad override: got %v, want default %v", got, DefaultProvisioningTimeout)
+	}
+}