Merge remote-tracking branch 'origin/staging' into fix/canvas-orgs-page-tests
This commit is contained in:
commit
223584c66d
@ -30,6 +30,8 @@ import { SearchDialog } from "./SearchDialog";
|
||||
import { Toaster } from "./Toaster";
|
||||
import { Toolbar } from "./Toolbar";
|
||||
import { ConfirmDialog } from "./ConfirmDialog";
|
||||
import { api } from "@/lib/api";
|
||||
import { showToast } from "./Toaster";
|
||||
// Phase 20 components
|
||||
import { SettingsPanel, DeleteConfirmDialog } from "./settings";
|
||||
// Phase 20.3 batch operations
|
||||
@ -96,6 +98,24 @@ function CanvasInner() {
|
||||
|
||||
// Confirmation dialog state for structure changes
|
||||
const [pendingNest, setPendingNest] = useState<{ nodeId: string; targetId: string | null; nodeName: string; targetName: string } | null>(null);
|
||||
// Delete-confirmation lives in the store so the dialog survives ContextMenu
|
||||
// unmounting — the prior local-in-ContextMenu state raced with the menu's
|
||||
// outside-click handler (the portal-rendered Confirm button counted as
|
||||
// "outside" and closed the menu, killing the dialog mid-click).
|
||||
const pendingDelete = useCanvasStore((s) => s.pendingDelete);
|
||||
const setPendingDelete = useCanvasStore((s) => s.setPendingDelete);
|
||||
const removeNode = useCanvasStore((s) => s.removeNode);
|
||||
const confirmDelete = useCallback(async () => {
|
||||
if (!pendingDelete) return;
|
||||
const { id } = pendingDelete;
|
||||
setPendingDelete(null);
|
||||
try {
|
||||
await api.del(`/workspaces/${id}?confirm=true`);
|
||||
removeNode(id);
|
||||
} catch (e) {
|
||||
showToast(e instanceof Error ? e.message : "Delete failed", "error");
|
||||
}
|
||||
}, [pendingDelete, setPendingDelete, removeNode]);
|
||||
|
||||
const onNodeDragStop: OnNodeDrag<Node<WorkspaceNodeData>> = useCallback(
|
||||
(_event, node) => {
|
||||
@ -358,6 +378,17 @@ function CanvasInner() {
|
||||
onCancel={cancelNest}
|
||||
/>
|
||||
|
||||
{/* Confirmation dialog for workspace delete — driven by store */}
|
||||
<ConfirmDialog
|
||||
open={!!pendingDelete}
|
||||
title="Delete Workspace"
|
||||
message={`Permanently delete "${pendingDelete?.name}"? This will stop the container and remove all configuration. This action cannot be undone.`}
|
||||
confirmLabel="Delete"
|
||||
confirmVariant="danger"
|
||||
onConfirm={confirmDelete}
|
||||
onCancel={() => setPendingDelete(null)}
|
||||
/>
|
||||
|
||||
{/* Settings Panel — global secrets management drawer */}
|
||||
<SettingsPanel workspaceId={settingsWorkspaceId} />
|
||||
<DeleteConfirmDialog workspaceId={settingsWorkspaceId} />
|
||||
|
||||
@ -4,7 +4,6 @@ import { useCallback, useEffect, useRef, useState } from "react";
|
||||
import { useCanvasStore, type WorkspaceNodeData } from "@/store/canvas";
|
||||
import { api } from "@/lib/api";
|
||||
import { showToast } from "./Toaster";
|
||||
import { ConfirmDialog } from "./ConfirmDialog";
|
||||
import { statusDotClass } from "@/lib/design-tokens";
|
||||
|
||||
interface MenuItem {
|
||||
@ -26,15 +25,10 @@ export function ContextMenu() {
|
||||
const nestNode = useCanvasStore((s) => s.nestNode);
|
||||
const contextNodeId = contextMenu?.nodeId ?? null;
|
||||
const hasChildren = useCanvasStore((s) => contextNodeId ? s.nodes.some((n) => n.data.parentId === contextNodeId) : false);
|
||||
const setPendingDelete = useCanvasStore((s) => s.setPendingDelete);
|
||||
const ref = useRef<HTMLDivElement>(null);
|
||||
const [deleteConfirm, setDeleteConfirm] = useState<{ id: string; name: string } | null>(null);
|
||||
const [actionLoading, setActionLoading] = useState(false);
|
||||
|
||||
// Clear orphaned dialog state when context menu closes
|
||||
useEffect(() => {
|
||||
if (!contextMenu) setDeleteConfirm(null);
|
||||
}, [contextMenu]);
|
||||
|
||||
// Auto-focus first enabled item when menu opens
|
||||
useEffect(() => {
|
||||
if (!contextMenu) return;
|
||||
@ -167,21 +161,13 @@ export function ContextMenu() {
|
||||
|
||||
const handleDelete = useCallback(() => {
|
||||
if (!contextMenu) return;
|
||||
// Don't close context menu yet — keep it mounted so ConfirmDialog renders
|
||||
setDeleteConfirm({ id: contextMenu.nodeId, name: contextMenu.nodeData.name });
|
||||
}, [contextMenu]);
|
||||
|
||||
const confirmDelete = useCallback(async () => {
|
||||
if (!deleteConfirm) return;
|
||||
try {
|
||||
await api.del(`/workspaces/${deleteConfirm.id}`);
|
||||
removeNode(deleteConfirm.id);
|
||||
} catch {
|
||||
showToast("Delete failed", "error");
|
||||
}
|
||||
setDeleteConfirm(null);
|
||||
// Hoist delete confirmation to the Canvas-level dialog (via store) so
|
||||
// it survives ContextMenu unmount. Closing the menu here avoids the
|
||||
// prior race where the portal dialog's Confirm click was treated as
|
||||
// "outside" by the menu's outside-click handler.
|
||||
setPendingDelete({ id: contextMenu.nodeId, name: contextMenu.nodeData.name });
|
||||
closeContextMenu();
|
||||
}, [deleteConfirm, removeNode, closeContextMenu]);
|
||||
}, [contextMenu, setPendingDelete, closeContextMenu]);
|
||||
|
||||
const handleViewDetails = useCallback(() => {
|
||||
if (!contextMenu) return;
|
||||
@ -317,17 +303,6 @@ export function ContextMenu() {
|
||||
</button>
|
||||
);
|
||||
})}
|
||||
|
||||
{/* Delete confirmation dialog */}
|
||||
<ConfirmDialog
|
||||
open={!!deleteConfirm}
|
||||
title="Delete Workspace"
|
||||
message={`Permanently delete "${deleteConfirm?.name}"? This will stop the container and remove all configuration. This action cannot be undone.`}
|
||||
confirmLabel="Delete"
|
||||
confirmVariant="danger"
|
||||
onConfirm={confirmDelete}
|
||||
onCancel={() => { setDeleteConfirm(null); closeContextMenu(); }}
|
||||
/>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
@ -78,17 +78,17 @@ export function WorkspaceUsage({ workspaceId }: WorkspaceUsageProps) {
|
||||
<>
|
||||
<StatRow
|
||||
label="Input tokens"
|
||||
value={`${metrics.input_tokens.toLocaleString()} tokens`}
|
||||
value={`${(metrics.input_tokens ?? 0).toLocaleString()} tokens`}
|
||||
testId="usage-input-tokens"
|
||||
/>
|
||||
<StatRow
|
||||
label="Output tokens"
|
||||
value={`${metrics.output_tokens.toLocaleString()} tokens`}
|
||||
value={`${(metrics.output_tokens ?? 0).toLocaleString()} tokens`}
|
||||
testId="usage-output-tokens"
|
||||
/>
|
||||
<StatRow
|
||||
label="Estimated cost"
|
||||
value={`$${parseFloat(metrics.estimated_cost_usd).toFixed(6)}`}
|
||||
value={`$${parseFloat(metrics.estimated_cost_usd ?? "0").toFixed(6)}`}
|
||||
testId="usage-estimated-cost"
|
||||
/>
|
||||
</>
|
||||
|
||||
@ -154,6 +154,16 @@ describe("BudgetSection — stats row", () => {
|
||||
await renderLoaded(budgetResponse({ budget_remaining: null }));
|
||||
expect(screen.queryByTestId("budget-remaining")).toBeNull();
|
||||
});
|
||||
|
||||
it("does not crash when budget_used is missing from the response", async () => {
|
||||
// Backend for a provisioning-stuck workspace may return a partial
|
||||
// shape. Regression: previously this threw
|
||||
// "Cannot read properties of undefined (reading 'toLocaleString')"
|
||||
// and crashed the whole Details tab.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
await renderLoaded({ budget_limit: 1000, budget_remaining: null } as any);
|
||||
expect(screen.getByTestId("budget-used-value").textContent).toBe("0");
|
||||
});
|
||||
});
|
||||
|
||||
// ── Progress bar ──────────────────────────────────────────────────────────────
|
||||
|
||||
@ -45,6 +45,7 @@ const mockStore = {
|
||||
selectNode: vi.fn(),
|
||||
setPanelTab: vi.fn(),
|
||||
nestNode: vi.fn(),
|
||||
setPendingDelete: vi.fn(),
|
||||
nodes: [] as Array<{ id: string; data: { parentId: string | null } }>,
|
||||
};
|
||||
|
||||
@ -209,4 +210,23 @@ describe("ContextMenu — keyboard accessibility", () => {
|
||||
fireEvent.click(zoomItem);
|
||||
expect(closeContextMenu).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
// Regression: the old flow kept ConfirmDialog inside ContextMenu's local
|
||||
// state and rendered it via a portal. The portal-rendered Confirm button
|
||||
// counted as "outside" by the menu's outside-click handler, closing the
|
||||
// menu mid-click and making Delete appear to do nothing. The fix hoists
|
||||
// the dialog state to the canvas store via `setPendingDelete` AND closes
|
||||
// the context menu on click, so the dialog is owned by a component that
|
||||
// outlives the menu.
|
||||
it("clicking 'Delete' hoists state to the store and closes the menu", () => {
|
||||
render(<ContextMenu />);
|
||||
const items = screen.getAllByRole("menuitem");
|
||||
const deleteItem = items.find((el) => el.textContent?.includes("Delete"))!;
|
||||
fireEvent.click(deleteItem);
|
||||
expect(mockStore.setPendingDelete).toHaveBeenCalledWith({
|
||||
id: "ws-1",
|
||||
name: "Alpha Workspace",
|
||||
});
|
||||
expect(closeContextMenu).toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
@ -84,6 +84,20 @@ describe("WorkspaceUsage", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("does not crash when token/cost fields are missing", async () => {
|
||||
// Regression: Details tab crashed with
|
||||
// "Cannot read properties of undefined (reading 'toLocaleString')"
|
||||
// when a workspace stuck in provisioning returned partial metrics.
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
mockGet.mockResolvedValue({} as any);
|
||||
render(<WorkspaceUsage workspaceId="ws-1" />);
|
||||
await waitFor(() => {
|
||||
expect(screen.getByTestId("usage-input-tokens").textContent).toContain("0");
|
||||
expect(screen.getByTestId("usage-output-tokens").textContent).toContain("0");
|
||||
expect(screen.getByTestId("usage-estimated-cost").textContent).toBe("Estimated cost$0.000000");
|
||||
});
|
||||
});
|
||||
|
||||
it("displays estimated cost formatted as $X.XXXXXX after load", async () => {
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
mockGet.mockResolvedValue(METRICS_RESPONSE as any);
|
||||
|
||||
@ -171,7 +171,7 @@ export function BudgetSection({ workspaceId }: Props) {
|
||||
<div className="flex items-baseline justify-between" data-testid="budget-stats-row">
|
||||
<span className="text-xs text-zinc-400">Credits used</span>
|
||||
<span className="text-xs font-mono text-zinc-300">
|
||||
<span data-testid="budget-used-value">{budget.budget_used.toLocaleString()}</span>
|
||||
<span data-testid="budget-used-value">{(budget.budget_used ?? 0).toLocaleString()}</span>
|
||||
<span className="text-zinc-500 mx-1">/</span>
|
||||
<span data-testid="budget-limit-value">
|
||||
{budget.budget_limit != null
|
||||
|
||||
@ -66,6 +66,14 @@ interface CanvasState {
|
||||
isDescendant: (ancestorId: string, nodeId: string) => boolean;
|
||||
openContextMenu: (menu: ContextMenuState) => void;
|
||||
closeContextMenu: () => void;
|
||||
// Pending delete confirmation — lives in the store (not inside ContextMenu's
|
||||
// local state) so the confirm dialog survives ContextMenu unmounting. The
|
||||
// ContextMenu's portal-rendered dialog used to race with its outside-click
|
||||
// handler: clicking Confirm registered as "outside", closed the menu, and
|
||||
// unmounted the dialog before its onClick fired. Hoisting the state fixes
|
||||
// that — see fix/context-menu-delete-race.
|
||||
pendingDelete: { id: string; name: string } | null;
|
||||
setPendingDelete: (v: { id: string; name: string } | null) => void;
|
||||
searchOpen: boolean;
|
||||
setSearchOpen: (open: boolean) => void;
|
||||
viewport: { x: number; y: number; zoom: number };
|
||||
@ -158,6 +166,8 @@ export const useCanvasStore = create<CanvasState>((set, get) => ({
|
||||
selectNode: (id) => set({ selectedNodeId: id }),
|
||||
openContextMenu: (menu) => set({ contextMenu: menu }),
|
||||
closeContextMenu: () => set({ contextMenu: null }),
|
||||
pendingDelete: null,
|
||||
setPendingDelete: (v) => set({ pendingDelete: v }),
|
||||
searchOpen: false,
|
||||
setSearchOpen: (open) => set({ searchOpen: open }),
|
||||
agentMessages: {},
|
||||
|
||||
@ -191,6 +191,15 @@ func main() {
|
||||
})
|
||||
}
|
||||
|
||||
// Provision-timeout sweep — flips workspaces that have been stuck in
|
||||
// status='provisioning' past the timeout window to 'failed' and emits
|
||||
// WORKSPACE_PROVISION_TIMEOUT. Without this the UI banner is cosmetic
|
||||
// and the state is incoherent (e.g. user sees "Retry" after 15min but
|
||||
// backend still thinks provisioning is in progress).
|
||||
go supervised.RunWithRecover(ctx, "provision-timeout-sweep", func(c context.Context) {
|
||||
registry.StartProvisioningTimeoutSweep(c, broadcaster, registry.DefaultProvisionSweepInterval)
|
||||
})
|
||||
|
||||
// Cron Scheduler — fires A2A messages to workspaces on user-defined schedules
|
||||
cronSched := scheduler.New(wh, broadcaster)
|
||||
go supervised.RunWithRecover(ctx, "scheduler", cronSched.Start)
|
||||
|
||||
70
workspace-server/internal/handlers/workspace_preflight.go
Normal file
70
workspace-server/internal/handlers/workspace_preflight.go
Normal file
@ -0,0 +1,70 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// requiredEnvSchema is the subset of config.yaml we read to decide which env
|
||||
// vars must be present before a container launch. It maps the YAML path
|
||||
// `runtime_config.required_env: [...]` which is the same shape the workspace
|
||||
// adapter's preflight reads inside the container (workspace/preflight.py).
|
||||
//
|
||||
// Mirroring the check server-side lets us fail fast with a readable error
|
||||
// instead of letting the container crash-loop and the workspace sit in
|
||||
// `provisioning` until a sweeper or the user intervenes.
|
||||
type requiredEnvSchema struct {
|
||||
RuntimeConfig struct {
|
||||
RequiredEnv []string `yaml:"required_env"`
|
||||
} `yaml:"runtime_config"`
|
||||
}
|
||||
|
||||
// missingRequiredEnv returns the list of env var names declared in the
|
||||
// workspace's config.yaml under `runtime_config.required_env` that are NOT
|
||||
// present (or are empty) in the assembled envVars map. Returns an empty
|
||||
// slice when the config declares no requirements or when all are satisfied.
|
||||
//
|
||||
// A parse failure returns no missing vars — config.yaml shape is enforced by
|
||||
// the in-container preflight, and the server's job here is only to catch the
|
||||
// common "forgot to add the OAuth token secret" footgun, not to be a second
|
||||
// config validator.
|
||||
func missingRequiredEnv(configFiles map[string][]byte, envVars map[string]string) []string {
|
||||
if len(configFiles) == 0 {
|
||||
return nil
|
||||
}
|
||||
raw, ok := configFiles["config.yaml"]
|
||||
if !ok || len(raw) == 0 {
|
||||
return nil
|
||||
}
|
||||
var schema requiredEnvSchema
|
||||
if err := yaml.Unmarshal(raw, &schema); err != nil {
|
||||
return nil
|
||||
}
|
||||
if len(schema.RuntimeConfig.RequiredEnv) == 0 {
|
||||
return nil
|
||||
}
|
||||
var missing []string
|
||||
for _, name := range schema.RuntimeConfig.RequiredEnv {
|
||||
if v, ok := envVars[name]; !ok || v == "" {
|
||||
missing = append(missing, name)
|
||||
}
|
||||
}
|
||||
return missing
|
||||
}
|
||||
|
||||
// formatMissingEnvError builds the user-facing message for a provision
|
||||
// failure caused by unset required env vars. Kept stable because it's
|
||||
// rendered verbatim in the canvas Events tab and Details banner.
|
||||
func formatMissingEnvError(missing []string) string {
|
||||
if len(missing) == 1 {
|
||||
return fmt.Sprintf(
|
||||
"missing required env var %q — add it under Config → Env Vars (or as a Global secret) and retry",
|
||||
missing[0],
|
||||
)
|
||||
}
|
||||
return fmt.Sprintf(
|
||||
"missing required env vars %q — add them under Config → Env Vars (or as Global secrets) and retry",
|
||||
missing,
|
||||
)
|
||||
}
|
||||
124
workspace-server/internal/handlers/workspace_preflight_test.go
Normal file
124
workspace-server/internal/handlers/workspace_preflight_test.go
Normal file
@ -0,0 +1,124 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMissingRequiredEnv_NoConfig(t *testing.T) {
|
||||
// Zero configFiles → nothing to validate → no missing.
|
||||
if got := missingRequiredEnv(nil, map[string]string{}); got != nil {
|
||||
t.Errorf("nil configFiles: got %v, want nil", got)
|
||||
}
|
||||
if got := missingRequiredEnv(map[string][]byte{}, map[string]string{}); got != nil {
|
||||
t.Errorf("empty configFiles: got %v, want nil", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMissingRequiredEnv_NoConfigYaml(t *testing.T) {
|
||||
// A map without config.yaml → no schema → no missing.
|
||||
files := map[string][]byte{
|
||||
"other.txt": []byte("irrelevant"),
|
||||
}
|
||||
if got := missingRequiredEnv(files, map[string]string{}); got != nil {
|
||||
t.Errorf("no config.yaml: got %v, want nil", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMissingRequiredEnv_NoRequiredEnvInYaml(t *testing.T) {
|
||||
// config.yaml without runtime_config.required_env → no missing.
|
||||
// Mirrors the default config emitted by ensureDefaultConfig (see the
|
||||
// #1028 comment in workspace_provision.go about why required_env is
|
||||
// intentionally omitted for auto-generated configs).
|
||||
yml := `
|
||||
name: example
|
||||
runtime: langgraph
|
||||
runtime_config:
|
||||
timeout: 0
|
||||
`
|
||||
files := map[string][]byte{"config.yaml": []byte(yml)}
|
||||
if got := missingRequiredEnv(files, map[string]string{}); got != nil {
|
||||
t.Errorf("no required_env in YAML: got %v, want nil", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMissingRequiredEnv_AllSatisfied(t *testing.T) {
|
||||
yml := `
|
||||
runtime: claude-code
|
||||
runtime_config:
|
||||
required_env:
|
||||
- CLAUDE_CODE_OAUTH_TOKEN
|
||||
- ANTHROPIC_API_KEY
|
||||
`
|
||||
files := map[string][]byte{"config.yaml": []byte(yml)}
|
||||
env := map[string]string{
|
||||
"CLAUDE_CODE_OAUTH_TOKEN": "sk-ant-oat01-...",
|
||||
"ANTHROPIC_API_KEY": "sk-ant-api-...",
|
||||
}
|
||||
if got := missingRequiredEnv(files, env); got != nil {
|
||||
t.Errorf("all set: got %v, want nil", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMissingRequiredEnv_OneMissing(t *testing.T) {
|
||||
// Reproduces the reported issue: Claude Code Agent config declares
|
||||
// CLAUDE_CODE_OAUTH_TOKEN required but the tenant only has
|
||||
// ANTHROPIC_API_KEY set globally.
|
||||
yml := `
|
||||
runtime: claude-code
|
||||
runtime_config:
|
||||
required_env:
|
||||
- CLAUDE_CODE_OAUTH_TOKEN
|
||||
- ANTHROPIC_API_KEY
|
||||
`
|
||||
files := map[string][]byte{"config.yaml": []byte(yml)}
|
||||
env := map[string]string{
|
||||
"ANTHROPIC_API_KEY": "sk-ant-...",
|
||||
}
|
||||
got := missingRequiredEnv(files, env)
|
||||
if len(got) != 1 || got[0] != "CLAUDE_CODE_OAUTH_TOKEN" {
|
||||
t.Errorf("expected [CLAUDE_CODE_OAUTH_TOKEN], got %v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMissingRequiredEnv_EmptyStringCountsAsMissing(t *testing.T) {
|
||||
// A secret row with empty value is effectively unset; the in-container
|
||||
// preflight treats empty string as missing, so the server must match.
|
||||
yml := `
|
||||
runtime_config:
|
||||
required_env: [FOO]
|
||||
`
|
||||
files := map[string][]byte{"config.yaml": []byte(yml)}
|
||||
env := map[string]string{"FOO": ""}
|
||||
got := missingRequiredEnv(files, env)
|
||||
if len(got) != 1 || got[0] != "FOO" {
|
||||
t.Errorf("expected [FOO], got %v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMissingRequiredEnv_MalformedYamlReturnsNil(t *testing.T) {
|
||||
// Malformed YAML should not panic and should not block provisioning —
|
||||
// the in-container preflight is the source of truth for config.yaml
|
||||
// shape, and we don't want to double-fail on parse quirks.
|
||||
files := map[string][]byte{"config.yaml": []byte("{ not: valid: yaml: [[")}
|
||||
if got := missingRequiredEnv(files, map[string]string{}); got != nil {
|
||||
t.Errorf("malformed YAML: got %v, want nil", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFormatMissingEnvError_Single(t *testing.T) {
|
||||
msg := formatMissingEnvError([]string{"CLAUDE_CODE_OAUTH_TOKEN"})
|
||||
if !strings.Contains(msg, "CLAUDE_CODE_OAUTH_TOKEN") {
|
||||
t.Errorf("message should name the var: %q", msg)
|
||||
}
|
||||
if !strings.Contains(msg, "retry") {
|
||||
t.Errorf("message should tell user how to fix it: %q", msg)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFormatMissingEnvError_Multiple(t *testing.T) {
|
||||
msg := formatMissingEnvError([]string{"A", "B"})
|
||||
if !strings.Contains(msg, "A") || !strings.Contains(msg, "B") {
|
||||
t.Errorf("message should name both vars: %q", msg)
|
||||
}
|
||||
}
|
||||
@ -114,6 +114,27 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
|
||||
return
|
||||
}
|
||||
|
||||
// Preflight: refuse to launch when config.yaml declares required env vars
|
||||
// that are not set. Without this, a missing CLAUDE_CODE_OAUTH_TOKEN (or
|
||||
// similar) crashes the in-container preflight, the container never calls
|
||||
// /registry/register, and the workspace sits in `provisioning` until a
|
||||
// sweeper flips it or the user retries. Failing fast here gives the user
|
||||
// an immediate, actionable error in the Events tab.
|
||||
if missing := missingRequiredEnv(configFiles, envVars); len(missing) > 0 {
|
||||
msg := formatMissingEnvError(missing)
|
||||
log.Printf("Provisioner: %s (workspace=%s)", msg, workspaceID)
|
||||
if _, dbErr := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status = 'failed', last_sample_error = $2, updated_at = now() WHERE id = $1`,
|
||||
workspaceID, msg); dbErr != nil {
|
||||
log.Printf("Provisioner: failed to mark workspace %s as failed: %v", workspaceID, dbErr)
|
||||
}
|
||||
h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", workspaceID, map[string]interface{}{
|
||||
"error": msg,
|
||||
"missing": missing,
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
cfg := h.buildProvisionerConfig(workspaceID, templatePath, configFiles, payload, envVars, pluginsPath, awarenessNamespace)
|
||||
cfg.ResetClaudeSession = resetClaudeSession // #12
|
||||
|
||||
|
||||
137
workspace-server/internal/registry/provisiontimeout.go
Normal file
137
workspace-server/internal/registry/provisiontimeout.go
Normal file
@ -0,0 +1,137 @@
|
||||
package registry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"os"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
)
|
||||
|
||||
// ProvisionTimeoutEmitter is the narrow broadcaster dependency the sweeper
|
||||
// needs. Defined locally so the registry package stays event-bus agnostic
|
||||
// (same pattern as OfflineHandler in healthsweep.go).
|
||||
type ProvisionTimeoutEmitter interface {
|
||||
RecordAndBroadcast(ctx context.Context, eventType string, workspaceID string, payload interface{}) error
|
||||
}
|
||||
|
||||
// DefaultProvisioningTimeout is how long a workspace may sit in
|
||||
// status='provisioning' before the sweeper flips it to 'failed'. The
|
||||
// container-launch path has its own 3-minute context timeout
|
||||
// (provisioner.ProvisionTimeout) but that only bounds the docker API call —
|
||||
// a container that started but crashes before /registry/register never
|
||||
// triggers that path and would sit in provisioning forever. 10 minutes
|
||||
// covers pathological image-pull + user-data execution on a cold EC2 worker
|
||||
// while still getting well ahead of the "15+ minute" stuck state users see
|
||||
// in production.
|
||||
const DefaultProvisioningTimeout = 10 * time.Minute
|
||||
|
||||
// DefaultProvisionSweepInterval is how often the sweeper polls. Same cadence
|
||||
// as the hibernation monitor — cheap and bounded by the provisioning-state
|
||||
// query which hits the primary key / status partial index.
|
||||
const DefaultProvisionSweepInterval = 30 * time.Second
|
||||
|
||||
// provisioningTimeout reads the override from env, falling back to the
|
||||
// default. Env var expressed in seconds so operators can tune via a normal
|
||||
// container restart without a code change.
|
||||
func provisioningTimeout() time.Duration {
|
||||
if v := os.Getenv("PROVISION_TIMEOUT_SECONDS"); v != "" {
|
||||
if n, err := strconv.Atoi(v); err == nil && n > 0 {
|
||||
return time.Duration(n) * time.Second
|
||||
}
|
||||
}
|
||||
return DefaultProvisioningTimeout
|
||||
}
|
||||
|
||||
// StartProvisioningTimeoutSweep periodically scans for workspaces stuck in
|
||||
// `status='provisioning'` past the timeout window, flips them to `failed`,
|
||||
// and broadcasts a WORKSPACE_PROVISION_TIMEOUT event so the canvas can
|
||||
// render a fail-state instead of the indefinite cosmetic "Provisioning
|
||||
// Timeout" banner.
|
||||
//
|
||||
// The sweep is idempotent: the UPDATE's WHERE clause re-checks both status
|
||||
// and age under the same row lock, so a workspace that raced to `online` or
|
||||
// was restarted while the sweep was scanning will not get flipped.
|
||||
func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeoutEmitter, interval time.Duration) {
|
||||
if emitter == nil {
|
||||
log.Println("Provision-timeout sweep: emitter is nil — skipping (no one to broadcast to)")
|
||||
return
|
||||
}
|
||||
if interval <= 0 {
|
||||
interval = DefaultProvisionSweepInterval
|
||||
}
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s)", interval, provisioningTimeout())
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
sweepStuckProvisioning(ctx, emitter)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sweepStuckProvisioning is one tick of the sweeper. Exported-for-test via
|
||||
// the package boundary: keep all time.Now reads inside so tests can drive it
|
||||
// deterministically by seeding updated_at rather than manipulating time.
|
||||
func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter) {
|
||||
timeout := provisioningTimeout()
|
||||
timeoutSec := int(timeout / time.Second)
|
||||
|
||||
// Read candidates first so the event broadcast can include each id. The
|
||||
// subsequent UPDATE re-checks the predicate to stay race-safe against
|
||||
// concurrent restart / register paths that write updated_at.
|
||||
rows, err := db.DB.QueryContext(ctx, `
|
||||
SELECT id FROM workspaces
|
||||
WHERE status = 'provisioning'
|
||||
AND updated_at < now() - ($1 || ' seconds')::interval
|
||||
`, timeoutSec)
|
||||
if err != nil {
|
||||
log.Printf("Provision-timeout sweep: query error: %v", err)
|
||||
return
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var ids []string
|
||||
for rows.Next() {
|
||||
var id string
|
||||
if err := rows.Scan(&id); err == nil {
|
||||
ids = append(ids, id)
|
||||
}
|
||||
}
|
||||
|
||||
for _, id := range ids {
|
||||
msg := "provisioning timed out — container never reported online. Check the workspace's required env vars and retry."
|
||||
res, err := db.DB.ExecContext(ctx, `
|
||||
UPDATE workspaces
|
||||
SET status = 'failed',
|
||||
last_sample_error = $2,
|
||||
updated_at = now()
|
||||
WHERE id = $1
|
||||
AND status = 'provisioning'
|
||||
AND updated_at < now() - ($3 || ' seconds')::interval
|
||||
`, id, msg, timeoutSec)
|
||||
if err != nil {
|
||||
log.Printf("Provision-timeout sweep: failed to flip %s to failed: %v", id, err)
|
||||
continue
|
||||
}
|
||||
affected, _ := res.RowsAffected()
|
||||
if affected == 0 {
|
||||
// Raced with restart / register — no harm, just skip.
|
||||
continue
|
||||
}
|
||||
log.Printf("Provision-timeout sweep: %s stuck in provisioning > %s — marked failed", id, timeout)
|
||||
if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_TIMEOUT", id, map[string]interface{}{
|
||||
"error": msg,
|
||||
"timeout_secs": timeoutSec,
|
||||
}); emitErr != nil {
|
||||
log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", id, emitErr)
|
||||
}
|
||||
}
|
||||
}
|
||||
175
workspace-server/internal/registry/provisiontimeout_test.go
Normal file
175
workspace-server/internal/registry/provisiontimeout_test.go
Normal file
@ -0,0 +1,175 @@
|
||||
package registry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync"
|
||||
"testing"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
)
|
||||
|
||||
// fakeEmitter records every RecordAndBroadcast call so tests can assert
|
||||
// payload shape + emission count. Safe for concurrent use since the sweeper
|
||||
// itself is single-goroutine but keeping the lock lets the suite fan out.
|
||||
type fakeEmitter struct {
|
||||
mu sync.Mutex
|
||||
events []emittedEvent
|
||||
fail bool
|
||||
}
|
||||
|
||||
type emittedEvent struct {
|
||||
Type string
|
||||
WorkspaceID string
|
||||
Payload interface{}
|
||||
}
|
||||
|
||||
func (f *fakeEmitter) RecordAndBroadcast(_ context.Context, eventType string, workspaceID string, payload interface{}) error {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
f.events = append(f.events, emittedEvent{eventType, workspaceID, payload})
|
||||
if f.fail {
|
||||
return errors.New("broadcast boom")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *fakeEmitter) count() int {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
return len(f.events)
|
||||
}
|
||||
|
||||
// TestSweepStuckProvisioning_FlipsOverdue verifies the happy path: a stuck
|
||||
// provisioning workspace gets flipped to failed AND an event is broadcast.
|
||||
func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-stuck"))
|
||||
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
emit := &fakeEmitter{}
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
|
||||
if emit.count() != 1 {
|
||||
t.Fatalf("expected 1 event, got %d", emit.count())
|
||||
}
|
||||
if emit.events[0].Type != "WORKSPACE_PROVISION_TIMEOUT" {
|
||||
t.Errorf("event type = %q, want WORKSPACE_PROVISION_TIMEOUT", emit.events[0].Type)
|
||||
}
|
||||
if emit.events[0].WorkspaceID != "ws-stuck" {
|
||||
t.Errorf("workspace id = %q, want ws-stuck", emit.events[0].WorkspaceID)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSweepStuckProvisioning_RaceSafe covers the case where UPDATE affects
|
||||
// 0 rows because the workspace flipped to online (or got restarted) between
|
||||
// the SELECT and the UPDATE. We should skip the event, not emit a false
|
||||
// WORKSPACE_PROVISION_TIMEOUT.
|
||||
func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-raced"))
|
||||
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-raced", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 0)) // 0 rows — raced
|
||||
|
||||
emit := &fakeEmitter{}
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
|
||||
if emit.count() != 0 {
|
||||
t.Errorf("expected 0 events on race, got %d", emit.count())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSweepStuckProvisioning_NoStuck verifies that an empty candidate list
|
||||
// produces no events and no update queries.
|
||||
func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}))
|
||||
|
||||
emit := &fakeEmitter{}
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
|
||||
if emit.count() != 0 {
|
||||
t.Errorf("expected 0 events when nothing stuck, got %d", emit.count())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSweepStuckProvisioning_MultipleStuck covers the realistic case where
|
||||
// both agents (claude-code + hermes) are stuck — both should get flipped
|
||||
// and both should get events.
|
||||
func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}).
|
||||
AddRow("ws-claude-code").
|
||||
AddRow("ws-hermes"))
|
||||
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-claude-code", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-hermes", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
emit := &fakeEmitter{}
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
|
||||
if emit.count() != 2 {
|
||||
t.Fatalf("expected 2 events, got %d", emit.count())
|
||||
}
|
||||
}
|
||||
|
||||
// TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash ensures the
|
||||
// sweeper tolerates a broadcast error (Redis hiccup) — the DB row is
|
||||
// already flipped so the state stays coherent.
|
||||
func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
|
||||
mock.ExpectQuery(`SELECT id FROM workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-stuck"))
|
||||
mock.ExpectExec(`UPDATE workspaces`).
|
||||
WithArgs("ws-stuck", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
emit := &fakeEmitter{fail: true}
|
||||
// Must not panic.
|
||||
sweepStuckProvisioning(context.Background(), emit)
|
||||
}
|
||||
|
||||
// TestProvisioningTimeout_EnvOverride verifies PROVISION_TIMEOUT_SECONDS
|
||||
// env var takes effect when set to a positive integer, and falls back to
|
||||
// default otherwise.
|
||||
func TestProvisioningTimeout_EnvOverride(t *testing.T) {
|
||||
t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
|
||||
if got := provisioningTimeout(); got.Seconds() != 60 {
|
||||
t.Errorf("override: got %v, want 60s", got)
|
||||
}
|
||||
t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
|
||||
if got := provisioningTimeout(); got != DefaultProvisioningTimeout {
|
||||
t.Errorf("default: got %v, want %v", got, DefaultProvisioningTimeout)
|
||||
}
|
||||
t.Setenv("PROVISION_TIMEOUT_SECONDS", "not-a-number")
|
||||
if got := provisioningTimeout(); got != DefaultProvisioningTimeout {
|
||||
t.Errorf("bad override: got %v, want default %v", got, DefaultProvisioningTimeout)
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user