diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1350f68c..f1f9cdbb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -142,7 +142,8 @@ jobs: # Strip the package-import prefix so we can match .coverage-allowlist.txt # entries written as paths relative to workspace-server/. - rel=$(echo "$file" | sed 's|^github.com/Molecule-AI/molecule-monorepo/platform/||') + # Handle both module paths: platform/workspace-server/... and platform/... + rel=$(echo "$file" | sed 's|^github.com/Molecule-AI/molecule-monorepo/platform/workspace-server/||; s|^github.com/Molecule-AI/molecule-monorepo/platform/||') if echo "$ALLOWLIST" | grep -qxF "$rel"; then echo "::warning file=workspace-server/$rel::Critical file at ${pct}% coverage (allowlisted, #1823) — fix before expiry." diff --git a/canvas/src/components/tabs/ConfigTab.tsx b/canvas/src/components/tabs/ConfigTab.tsx index ad8338de..f219653f 100644 --- a/canvas/src/components/tabs/ConfigTab.tsx +++ b/canvas/src/components/tabs/ConfigTab.tsx @@ -104,6 +104,13 @@ interface RuntimeOption { // Fallback used when /templates can't be fetched (offline, older backend). // Keep in sync with manifest.json workspace_templates as a defensive default. // Model + env suggestions only flow when the backend is reachable. +// Runtimes that manage their own config outside the platform's config.yaml +// template. For these, a missing config.yaml is expected — the user manages +// config via the runtime's own mechanism (e.g. hermes edits +// ~/.hermes/config.yaml on the workspace EC2 via the Terminal tab or its +// own CLI). Showing a "No config.yaml found" error for these is misleading. 
+const RUNTIMES_WITH_OWN_CONFIG = new Set(["hermes", "external"]); + const FALLBACK_RUNTIME_OPTIONS: RuntimeOption[] = [ { value: "", label: "LangGraph (default)", models: [] }, { value: "claude-code", label: "Claude Code", models: [] }, @@ -134,14 +141,50 @@ export function ConfigTab({ workspaceId }: Props) { const loadConfig = useCallback(async () => { setLoading(true); setError(null); + + // ALWAYS load workspace metadata first (runtime + model). These are the + // source of truth regardless of whether the runtime uses our config.yaml + // template. Without this the form falls back to empty/default values on + // a hermes workspace (which doesn't use our template), creating the + // appearance that the saved runtime is unset — and worse, clicking Save + // would silently flip `runtime` from `hermes` back to the dropdown + // default `LangGraph`. See GH #1894. + let wsMetadataRuntime = ""; + let wsMetadataModel = ""; + try { + const ws = await api.get<{ runtime?: string }>(`/workspaces/${workspaceId}`); + wsMetadataRuntime = (ws.runtime || "").trim(); + } catch { /* fall back to config.yaml */ } + try { + const m = await api.get<{ model?: string }>(`/workspaces/${workspaceId}/model`); + wsMetadataModel = (m.model || "").trim(); + } catch { /* non-fatal */ } + try { const res = await api.get<{ content: string }>(`/workspaces/${workspaceId}/files/config.yaml`); const parsed = parseYaml(res.content); setOriginalYaml(res.content); setRawDraft(res.content); - setConfig({ ...DEFAULT_CONFIG, ...parsed } as ConfigData); + // Merge: config.yaml wins for fields it declares, but workspace metadata + // wins for runtime + model when config.yaml doesn't set them. 
+ const merged = { ...DEFAULT_CONFIG, ...parsed } as ConfigData; + if (!merged.runtime && wsMetadataRuntime) merged.runtime = wsMetadataRuntime; + if (!merged.model && wsMetadataModel) merged.model = wsMetadataModel; + setConfig(merged); } catch { - setError("No config.yaml found"); + // No platform-managed config.yaml. Some runtimes (hermes, external) + // manage their own config outside this template; that's expected, not + // an error. Populate the form from workspace metadata so the user + // still sees the saved runtime + model. + const runtimeManagesOwnConfig = RUNTIMES_WITH_OWN_CONFIG.has(wsMetadataRuntime); + if (!runtimeManagesOwnConfig) { + setError("No config.yaml found"); + } + setConfig({ + ...DEFAULT_CONFIG, + runtime: wsMetadataRuntime, + model: wsMetadataModel, + } as ConfigData); } finally { setLoading(false); } @@ -520,6 +563,13 @@ export function ConfigTab({ workspaceId }: Props) { {error && (
{error}
)} + {!error && RUNTIMES_WITH_OWN_CONFIG.has(config.runtime || "") && ( +
+ {config.runtime === "hermes" + ? "Hermes manages its own config at ~/.hermes/config.yaml on the workspace host. Edit it via the Terminal tab or the hermes CLI, not this form." + : "This runtime manages its own config outside the platform template."} +
+ )} {success && (
Saved
)} diff --git a/workspace-server/internal/handlers/a2a_proxy_helpers.go b/workspace-server/internal/handlers/a2a_proxy_helpers.go index bd406b4f..4932de31 100644 --- a/workspace-server/internal/handlers/a2a_proxy_helpers.go +++ b/workspace-server/internal/handlers/a2a_proxy_helpers.go @@ -58,24 +58,27 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace // Issue #110. // // #1870 Phase 1: before returning 503, enqueue the request for drain - // on next heartbeat. Returning 202 Accepted {queued:true} means the - // caller records "dispatched — queued" not "failed", eliminating the - // fan-out-storm drop pattern. + // on next heartbeat. Returning 202 Accepted {queued:true} as a SUCCESS + // (not an error) means callers record this as "dispatched — queued" + // not "failed", eliminating the fan-out-storm drop pattern. + // + // Critical: must return (status, body, NIL ERROR) so the caller's + // `if proxyErr != nil` branch doesn't fire. Returning a proxyA2AError + // with 202 status here was the original cycle 53 bug — callers saw + // proxyErr != nil and logged "delegation failed: proxy a2a error". 
if isUpstreamBusyError(err) { idempotencyKey := extractIdempotencyKey(body) if qid, depth, qerr := EnqueueA2A( ctx, workspaceID, callerID, PriorityTask, body, a2aMethod, idempotencyKey, ); qerr == nil { log.Printf("ProxyA2A: target %s busy — enqueued as %s (depth=%d)", workspaceID, qid, depth) - return http.StatusAccepted, nil, &proxyA2AError{ - Status: http.StatusAccepted, - Response: gin.H{ - "queued": true, - "queue_id": qid, - "queue_depth": depth, - "message": "workspace agent busy — request queued, will dispatch when capacity available", - }, - } + respBody, _ := json.Marshal(gin.H{ + "queued": true, + "queue_id": qid, + "queue_depth": depth, + "message": "workspace agent busy — request queued, will dispatch when capacity available", + }) + return http.StatusAccepted, respBody, nil } else { // Queue insert failed — fall through to legacy 503 behavior // so callers still retry. We don't want a queue DB hiccup to diff --git a/workspace-server/internal/handlers/a2a_queue.go b/workspace-server/internal/handlers/a2a_queue.go index 2bd30148..dadc9256 100644 --- a/workspace-server/internal/handlers/a2a_queue.go +++ b/workspace-server/internal/handlers/a2a_queue.go @@ -16,6 +16,7 @@ import ( "encoding/json" "errors" "log" + "net/http" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" ) @@ -82,13 +83,18 @@ func EnqueueA2A( methodArg = method } - // INSERT ... ON CONFLICT DO NOTHING RETURNING id. On conflict we then - // look up the existing row's id so the caller always receives a valid - // queue entry reference. + // INSERT ... ON CONFLICT DO NOTHING RETURNING id. The conflict target + // must reference the partial unique INDEX columns + WHERE clause directly + // (Postgres can't reference partial unique indexes by name in + // ON CONFLICT — only true CONSTRAINTs work for that). On conflict we + // then look up the existing row's id so the caller always receives a + // valid queue entry reference. 
err = db.DB.QueryRowContext(ctx, ` INSERT INTO a2a_queue (workspace_id, caller_id, priority, body, method, idempotency_key) VALUES ($1, $2, $3, $4::jsonb, $5, $6) - ON CONFLICT ON CONSTRAINT idx_a2a_queue_idempotency DO NOTHING + ON CONFLICT (workspace_id, idempotency_key) + WHERE idempotency_key IS NOT NULL AND status IN ('queued','dispatched') + DO NOTHING RETURNING id `, workspaceID, callerArg, priority, string(body), methodArg, keyArg).Scan(&id) @@ -228,11 +234,34 @@ func (h *WorkspaceHandler) DrainQueueForWorkspace(ctx context.Context, workspace } // logActivity=false: the original EnqueueA2A callsite already logged // the dispatch attempt; re-logging here would double-count events. - _, _, proxyErr := h.proxyA2ARequest(ctx, workspaceID, item.Body, callerID, false) + status, _, proxyErr := h.proxyA2ARequest(ctx, workspaceID, item.Body, callerID, false) + + // 202 Accepted = the dispatch was itself queued again (target still busy). + // That's not a failure — the queued item just stays queued naturally on + // the next drain tick. Mark this attempt completed so we don't double- + // count attempts; the new (re-)queue row already exists. + if status == http.StatusAccepted { + MarkQueueItemCompleted(ctx, item.ID) + log.Printf("A2AQueue drain: %s re-queued (target still busy)", item.ID) + return + } + if proxyErr != nil { - MarkQueueItemFailed(ctx, item.ID, proxyErr.Response["error"].(string)) - log.Printf("A2AQueue drain: dispatch for %s failed (attempt=%d): %v", - item.ID, item.Attempts, proxyErr.Response["error"]) + // Defensive: proxyErr.Response is gin.H (map[string]interface{}). The + // "error" key is conventionally a string but can be missing or non- + // string in edge paths (e.g. a future error builder using a typed + // struct). Cast safely so a missing key doesn't crash the platform — + // today's outage was caused by an unchecked .(string) here. 
+ errMsg, _ := proxyErr.Response["error"].(string) + if errMsg == "" { + errMsg = http.StatusText(proxyErr.Status) + if errMsg == "" { + errMsg = "unknown drain dispatch error" + } + } + MarkQueueItemFailed(ctx, item.ID, errMsg) + log.Printf("A2AQueue drain: dispatch for %s failed (attempt=%d): %s", + item.ID, item.Attempts, errMsg) return } MarkQueueItemCompleted(ctx, item.ID) diff --git a/workspace-server/internal/handlers/restart_template.go b/workspace-server/internal/handlers/restart_template.go new file mode 100644 index 00000000..57193fad --- /dev/null +++ b/workspace-server/internal/handlers/restart_template.go @@ -0,0 +1,96 @@ +package handlers + +import ( + "log" + "os" + "path/filepath" +) + +// restartTemplateInput is the subset of the /workspaces/:id/restart request +// body that affects which config source the provisioner uses. Extracted as +// a type so `resolveRestartTemplate` has a single pure-function signature +// for unit tests — no gin context, no DB, no filesystem writes. +type restartTemplateInput struct { + // Template is an explicit template dir name from the request body. + // Always honoured when resolvable — caller asked by name, that's + // unambiguous consent to overwrite the config volume. + Template string + // ApplyTemplate opts the caller in to name-based auto-match AND the + // runtime-default fallback. Without this flag a restart MUST NOT + // overwrite the user's config volume — a user who edited their + // model/provider/skills/prompts via the Canvas Config tab and hit + // Save+Restart expects their edits to survive. The previous behaviour + // (name-based auto-match unconditionally) silently reverted edits for + // any workspace whose name matched a template dir (e.g. "Hermes Agent" + // → hermes/), which is the regression this fix closes. + ApplyTemplate bool + // RebuildConfig (#239) is the recovery signal used when the workspace's + // config volume was destroyed out-of-band. 
Tries org-templates as a + // last-resort source so the workspace can self-heal without admin + // intervention. Orthogonal to ApplyTemplate. + RebuildConfig bool +} + +// resolveRestartTemplate chooses the config source for a restart in the +// documented priority order: +// +// 1. Explicit `Template` from the request body (always honoured). +// 2. `ApplyTemplate=true` → name-based auto-match via findTemplateByName. +// 3. `RebuildConfig=true` → org-templates recovery fallback (#239). +// 4. `ApplyTemplate=true` + non-empty dbRuntime → runtime-default template +// (e.g. `hermes-default/`) for runtime-change workflows. +// 5. Fall through → empty path + "existing-volume" label. Provisioner +// reuses the workspace's existing config volume from the previous run. +// +// Returns (templatePath, configLabel). An empty templatePath is the signal +// to the provisioner that the existing volume is authoritative — the flow +// that preserves user edits. +// +// Pure function: no writes, no DB access, no network. Safe to unit-test +// with just a temp directory. +func resolveRestartTemplate(configsDir, wsName, dbRuntime string, body restartTemplateInput) (templatePath, configLabel string) { + template := body.Template + + // Tier 2: name-based auto-match, gated on ApplyTemplate. + if template == "" && body.ApplyTemplate { + template = findTemplateByName(configsDir, wsName) + } + + // Tier 1 + 2 resolve via the same code path — validate + stat. + if template != "" { + candidatePath, resolveErr := resolveInsideRoot(configsDir, template) + if resolveErr != nil { + log.Printf("Restart: invalid template %q: %v — proceeding without it", template, resolveErr) + template = "" + } else if _, err := os.Stat(candidatePath); err == nil { + return candidatePath, template + } else { + log.Printf("Restart: template %q dir not found — proceeding without it", template) + } + } + + // Tier 3: #239 rebuild_config — org-templates as last-resort recovery. 
+ if body.RebuildConfig { + if p, label := resolveOrgTemplate(configsDir, wsName); p != "" { + log.Printf("Restart: rebuild_config — using org-template %s (%s)", label, wsName) + return p, label + } + } + + // Tier 4: runtime-default — apply_template=true + known runtime. + // Use case: Canvas Config tab changed the runtime; we need the new + // runtime's base files (entry point, Dockerfile, skill scaffolding) + // because the existing volume was written by the old runtime. + if body.ApplyTemplate && dbRuntime != "" { + runtimeTemplate := filepath.Join(configsDir, dbRuntime+"-default") + if _, err := os.Stat(runtimeTemplate); err == nil { + label := dbRuntime + "-default" + log.Printf("Restart: applying template %s (runtime change)", label) + return runtimeTemplate, label + } + } + + // Tier 5: reuse existing volume. This is the default, and the path + // the Canvas Save+Restart flow MUST hit to preserve user edits. + return "", "existing-volume" +} diff --git a/workspace-server/internal/handlers/restart_template_test.go b/workspace-server/internal/handlers/restart_template_test.go new file mode 100644 index 00000000..6c44b856 --- /dev/null +++ b/workspace-server/internal/handlers/restart_template_test.go @@ -0,0 +1,178 @@ +package handlers + +import ( + "os" + "path/filepath" + "testing" +) + +// Tests for resolveRestartTemplate — the pure helper that implements the +// priority chain documented on the function. Each test builds a minimal +// temp configsDir, fabricates the specific precondition it exercises, +// and asserts (templatePath, configLabel). +// +// The regression this suite locks in: a default restart (no flags) must +// never auto-apply a template that happens to match the workspace name. +// That was the "model reverts on Save+Restart" bug from +// fix/restart-preserves-user-config. + +// newTemplateDir makes a templates root with named subdirs, each holding +// a minimal config.yaml so findTemplateByName's dir-scan path has +// something to read. 
Returns the absolute root. +func newTemplateDir(t *testing.T, names ...string) string { + t.Helper() + root := t.TempDir() + for _, n := range names { + dir := filepath.Join(root, n) + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatalf("mkdir %s: %v", dir, err) + } + cfg := filepath.Join(dir, "config.yaml") + if err := os.WriteFile(cfg, []byte("name: "+n+"\n"), 0o644); err != nil { + t.Fatalf("write %s: %v", cfg, err) + } + } + return root +} + +// TestResolveRestartTemplate_DefaultRestart_PreservesVolume is the +// regression test for the Canvas Save+Restart bug. A workspace named +// "Hermes Agent" normalises to "hermes-agent" — no dir match — but the +// findTemplateByName second pass would also scan config.yaml's `name:` +// field. We seed a template whose config.yaml DOES have the matching +// name, exactly the worst case. Without apply_template, the helper +// MUST still return empty templatePath. +func TestResolveRestartTemplate_DefaultRestart_PreservesVolume(t *testing.T) { + root := newTemplateDir(t, "hermes") + // Overwrite config.yaml so the name-scan would hit: + cfg := filepath.Join(root, "hermes", "config.yaml") + if err := os.WriteFile(cfg, []byte("name: Hermes Agent\n"), 0o644); err != nil { + t.Fatal(err) + } + + path, label := resolveRestartTemplate(root, "Hermes Agent", "hermes", restartTemplateInput{ + // ApplyTemplate intentionally omitted — this is the default restart. + }) + if path != "" { + t.Errorf("default restart must NOT resolve a template; got path=%q", path) + } + if label != "existing-volume" { + t.Errorf("expected 'existing-volume' label on default restart; got %q", label) + } +} + +// TestResolveRestartTemplate_ExplicitTemplate_AlwaysHonoured verifies +// that passing Template by name works regardless of ApplyTemplate — +// the caller named a template, that's unambiguous consent. 
+func TestResolveRestartTemplate_ExplicitTemplate_AlwaysHonoured(t *testing.T) { + root := newTemplateDir(t, "langgraph") + + path, label := resolveRestartTemplate(root, "Some Agent", "", restartTemplateInput{ + Template: "langgraph", + }) + if path == "" || label != "langgraph" { + t.Errorf("explicit template must resolve; got path=%q label=%q", path, label) + } +} + +// TestResolveRestartTemplate_ApplyTemplate_NameMatch verifies that +// setting ApplyTemplate re-enables the name-based auto-match for +// operators who actually want "reset this workspace to its template". +func TestResolveRestartTemplate_ApplyTemplate_NameMatch(t *testing.T) { + root := newTemplateDir(t, "hermes") + + path, label := resolveRestartTemplate(root, "Hermes", "", restartTemplateInput{ + ApplyTemplate: true, + }) + if path == "" || label != "hermes" { + t.Errorf("apply_template should name-match; got path=%q label=%q", path, label) + } +} + +// TestResolveRestartTemplate_ApplyTemplate_RuntimeDefault verifies the +// runtime-change flow: when the Canvas Config tab changes the runtime, +// the restart handler needs to lay down the new runtime's base files +// via `-default/`. Matches the existing behaviour comment. +func TestResolveRestartTemplate_ApplyTemplate_RuntimeDefault(t *testing.T) { + root := newTemplateDir(t, "langgraph-default") + + path, label := resolveRestartTemplate(root, "Some Workspace", "langgraph", restartTemplateInput{ + ApplyTemplate: true, + }) + if path == "" || label != "langgraph-default" { + t.Errorf("apply_template + dbRuntime should resolve runtime-default; got path=%q label=%q", path, label) + } +} + +// TestResolveRestartTemplate_ApplyTemplate_NoMatch_NoRuntime falls all +// the way through to the reuse-volume path when neither name nor +// runtime-default resolves. 
+func TestResolveRestartTemplate_ApplyTemplate_NoMatch_NoRuntime(t *testing.T) { + root := newTemplateDir(t) // empty templates dir + + path, label := resolveRestartTemplate(root, "Orphan", "", restartTemplateInput{ + ApplyTemplate: true, + }) + if path != "" { + t.Errorf("nothing to apply → expected empty path; got %q", path) + } + if label != "existing-volume" { + t.Errorf("expected 'existing-volume' fallback; got %q", label) + } +} + +// TestResolveRestartTemplate_InvalidExplicitTemplate_ProceedsWithout +// covers the defensive path where an explicit Template doesn't resolve +// to a valid dir (e.g. traversal attempt, deleted template). The helper +// must log + fall through, not crash or escape the root. +func TestResolveRestartTemplate_InvalidExplicitTemplate_ProceedsWithout(t *testing.T) { + root := newTemplateDir(t, "langgraph") + + path, label := resolveRestartTemplate(root, "Some Agent", "", restartTemplateInput{ + Template: "../../etc/passwd", + }) + if path != "" { + t.Errorf("traversal attempt must not resolve; got %q", path) + } + if label != "existing-volume" { + t.Errorf("expected 'existing-volume' fallback on invalid template; got %q", label) + } +} + +// TestResolveRestartTemplate_NonExistentExplicitTemplate mirrors the +// above but for a syntactically-valid name that simply doesn't exist +// on disk (e.g. template was manually deleted). Must fall through. +func TestResolveRestartTemplate_NonExistentExplicitTemplate(t *testing.T) { + root := newTemplateDir(t, "langgraph") + + path, label := resolveRestartTemplate(root, "Some Agent", "", restartTemplateInput{ + Template: "deleted-template", + }) + if path != "" { + t.Errorf("missing template must not resolve; got %q", path) + } + if label != "existing-volume" { + t.Errorf("expected 'existing-volume' fallback on missing template; got %q", label) + } +} + +// TestResolveRestartTemplate_Priority_ExplicitBeatsApplyTemplate proves +// that an explicit Template takes precedence over a name-based match. 
+// Scenario: workspace "Hermes" with ApplyTemplate=true + explicit +// Template="langgraph" — caller wants langgraph, not hermes. +func TestResolveRestartTemplate_Priority_ExplicitBeatsApplyTemplate(t *testing.T) { + root := newTemplateDir(t, "hermes", "langgraph") + + path, label := resolveRestartTemplate(root, "Hermes", "", restartTemplateInput{ + Template: "langgraph", + ApplyTemplate: true, + }) + if label != "langgraph" { + t.Errorf("explicit Template must win; got label=%q", label) + } + // Verify the path is actually inside the langgraph template dir + expected := filepath.Join(root, "langgraph") + if path != expected { + t.Errorf("expected path %q, got %q", expected, path) + } +} diff --git a/workspace-server/internal/handlers/ssrf.go b/workspace-server/internal/handlers/ssrf.go index 55c76c9d..a84426f1 100644 --- a/workspace-server/internal/handlers/ssrf.go +++ b/workspace-server/internal/handlers/ssrf.go @@ -30,6 +30,20 @@ func devModeAllowsLoopback() bool { return env == "development" || env == "dev" } +// ssrfCheckEnabled controls whether isSafeURL performs real validation. +// Tests disable it via setSSRFCheckForTest so that httptest.NewServer +// loopback URLs and fake hostnames (*.example) don't trigger SSRF +// rejections. Production code never mutates this. +var ssrfCheckEnabled = true + +// setSSRFCheckForTest overrides ssrfCheckEnabled for the duration of a test +// and returns a restore function. Use with defer in *_test.go only. +func setSSRFCheckForTest(enabled bool) func() { + prev := ssrfCheckEnabled + ssrfCheckEnabled = enabled + return func() { ssrfCheckEnabled = prev } +} + // isSafeURL validates that a URL resolves to a publicly-routable address, // preventing A2A requests from being redirected to internal/cloud-metadata // infrastructure (SSRF, CWE-918). Workspace URLs come from DB/Redis caches @@ -40,6 +54,9 @@ func devModeAllowsLoopback() bool { // the same VPC and register by their VPC-private IP. 
Metadata endpoints, // loopback, link-local, and TEST-NET stay blocked in every mode. func isSafeURL(rawURL string) error { + if !ssrfCheckEnabled { + return nil + } u, err := url.Parse(rawURL) if err != nil { return fmt.Errorf("invalid URL: %w", err) @@ -191,8 +208,20 @@ func mustCIDR(s string) net.IPNet { // the destination via absolute paths or ".." traversal. Used by // copyFilesToContainer and deleteViaEphemeral as a defence-in-depth measure. func validateRelPath(filePath string) error { + // Reject empty string and dot-only paths before any processing. + if filePath == "" || filePath == "." { + return fmt.Errorf("empty or dot-only path not allowed") + } clean := filepath.Clean(filePath) - if filepath.IsAbs(clean) || strings.Contains(clean, "..") { + // Reject absolute paths (Unix / or Windows C:\). + if filepath.IsAbs(clean) { + return fmt.Errorf("path traversal or absolute path not allowed: %s", filePath) + } + // Reject any path containing ".." anywhere — check both raw and cleaned + // because filepath.Clean resolves ".." upward (e.g. "foo/../bar" → "bar" + // and "foo/.." → ".") which would make the check pass if only clean were checked. + // We only want explicitly-named files; ".." implies intent to escape. 
+	if strings.Contains(filePath, "..") || strings.Contains(clean, "..") { return fmt.Errorf("path traversal or absolute path not allowed: %s", filePath) } return nil } diff --git a/workspace-server/internal/handlers/templates.go b/workspace-server/internal/handlers/templates.go index f2d456f0..6b026324 100644 --- a/workspace-server/internal/handlers/templates.go +++ b/workspace-server/internal/handlers/templates.go @@ -292,8 +292,7 @@ func (h *TemplatesHandler) ReadFile(c *gin.Context) { // Try container first if containerName := h.findContainer(ctx, workspaceID); containerName != "" { - containerPath := rootPath + "/" + filePath - content, err := h.execInContainer(ctx, containerName, []string{"cat", rootPath + "/" + filePath}) + content, err := h.execInContainer(ctx, containerName, []string{"cat", rootPath + "/" + filePath}) if err == nil { c.JSON(http.StatusOK, gin.H{ "path": filePath, diff --git a/workspace-server/internal/handlers/terminal.go b/workspace-server/internal/handlers/terminal.go index ec91c004..041a739f 100644 --- a/workspace-server/internal/handlers/terminal.go +++ b/workspace-server/internal/handlers/terminal.go @@ -75,17 +75,25 @@ func (h *TerminalHandler) HandleConnect(c *gin.Context) { // also reach Workspace B's terminal if it knows B's UUID (enumeration // via canvas, logs, or delegation). Shell access is more dangerous than // A2A message-passing, so we apply the same hierarchy check here. + // GH#756/#1609 security fix: if the caller claims a specific workspace + // identity (X-Workspace-ID header), the bearer token — if present — must + // belong to that claimed workspace. ValidateAnyToken accepted ANY valid org + // token, allowing Workspace A to forge X-Workspace-ID: B and reach B's + // terminal if A held any valid token. ValidateToken binds the token to + // the claimed workspace identity. 
callerID := c.GetHeader("X-Workspace-ID") - if callerID != "" { + if callerID != "" && callerID != workspaceID { tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization")) if tok != "" { - if err := wsauth.ValidateAnyToken(ctx, db.DB, tok); err == nil { - if !canCommunicateCheck(callerID, workspaceID) { - c.JSON(http.StatusForbidden, gin.H{"error": "not authorized to access this workspace's terminal"}) - return - } + if err := wsauth.ValidateToken(ctx, db.DB, callerID, tok); err != nil { + c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid token for claimed workspace"}) + return } } + if !canCommunicateCheck(callerID, workspaceID) { + c.JSON(http.StatusForbidden, gin.H{"error": "not authorized to access this workspace's terminal"}) + return + } } // Check for CP-provisioned workspace (instance_id persisted by diff --git a/workspace-server/internal/handlers/terminal_test.go b/workspace-server/internal/handlers/terminal_test.go index 3dba441e..930d1a28 100644 --- a/workspace-server/internal/handlers/terminal_test.go +++ b/workspace-server/internal/handlers/terminal_test.go @@ -73,16 +73,15 @@ func TestTerminalConnect_KI005_RejectsUnauthorizedCrossWorkspace(t *testing.T) { canCommunicateCheck = func(callerID, targetID string) bool { return false } defer func() { canCommunicateCheck = prev }() - // Token lookup: ws-caller's token is valid. ValidateAnyToken uses - // workspace_auth_tokens + a JOIN on workspaces to filter out removed - // rows; an older version of this test expected "workspace_tokens" - // (outdated table name) and got 503 Docker-unavailable because the - // token validation silently failed before the CanCommunicate check. - rows := sqlmock.NewRows([]string{"id"}).AddRow("tok-1") - mock.ExpectQuery(`SELECT t\.id\s+FROM workspace_auth_tokens t`). + // Token lookup: ws-caller's token is valid. ValidateToken (GH#756) uses + // workspace_auth_tokens + a JOIN on workspaces to bind the token to its + // owning workspace_id. 
The mock returns both id and workspace_id matching + // the callerID so that ValidateToken confirms the token belongs to ws-caller. + rows := sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-caller") + mock.ExpectQuery(`SELECT t\.id, t\.workspace_id\s+FROM workspace_auth_tokens t`). WithArgs(sqlmock.AnyArg()). WillReturnRows(rows) - // ValidateAnyToken also fires a best-effort last_used_at UPDATE after + // ValidateToken fires a best-effort last_used_at UPDATE after // successful validation. Accept it so ExpectationsWereMet passes. mock.ExpectExec(`UPDATE workspace_auth_tokens SET last_used_at`). WithArgs(sqlmock.AnyArg()). @@ -207,9 +206,11 @@ func TestTerminalConnect_KI005_SkipsCheckWithoutHeader(t *testing.T) { } // TestTerminalConnect_KI005_RejectsInvalidToken tests that an invalid bearer -// token also results in a non-200 response (falls through to Docker auth). -// ValidateAnyToken returns error → CanCommunicate is never called. +// token when X-Workspace-ID is set results in 401 Unauthorized. +// ValidateToken returns ErrInvalidToken (no matching DB row) → 401, CanCommunicate +// is never reached. func TestTerminalConnect_KI005_RejectsInvalidToken(t *testing.T) { + setupTestDB(t) // provides a mock DB; no expectations set → ValidateToken query returns error canCommunicateCalled := false prev := canCommunicateCheck canCommunicateCheck = func(callerID, targetID string) bool { @@ -231,16 +232,19 @@ func TestTerminalConnect_KI005_RejectsInvalidToken(t *testing.T) { if canCommunicateCalled { t.Error("CanCommunicate should not be called with an invalid token") } - // Got 503 (nil docker) instead of 200/403 — ValidateAnyToken rejected the - // token and we fell through to Docker auth, which returned 503 (nil docker). - if w.Code != http.StatusServiceUnavailable { - t.Errorf("invalid token: got %d, want 503 nil-docker (%s)", w.Code, w.Body.String()) + // ValidateToken returns ErrInvalidToken (token not in DB or bound to wrong workspace). 
+ // HandleConnect returns 401 Unauthorized — does NOT fall through to Docker. + if w.Code != http.StatusUnauthorized { + t.Errorf("invalid token: got %d, want 401 Unauthorized (%s)", w.Code, w.Body.String()) } } // TestTerminalConnect_KI005_AllowsSiblingWorkspace tests the sibling path: // two workspaces with the same parent ID should be allowed to communicate. +// ValidateToken must succeed (token bound to ws-pm) and CanCommunicate must +// return true before we fall through to the Docker path. func TestTerminalConnect_KI005_AllowsSiblingWorkspace(t *testing.T) { + mock := setupTestDB(t) prev := canCommunicateCheck canCommunicateCheck = func(callerID, targetID string) bool { // Simulate sibling: same parent @@ -248,17 +252,27 @@ func TestTerminalConnect_KI005_AllowsSiblingWorkspace(t *testing.T) { } defer func() { canCommunicateCheck = prev }() + // ValidateToken: token is bound to ws-pm (the callerID). Returns id + workspace_id. + rows := sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-pm", "ws-pm") + mock.ExpectQuery(`SELECT t\.id, t\.workspace_id\s+FROM workspace_auth_tokens t`). + WithArgs(sqlmock.AnyArg()). + WillReturnRows(rows) + // Best-effort last_used_at UPDATE. + mock.ExpectExec(`UPDATE workspace_auth_tokens SET last_used_at`). + WithArgs(sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + h := NewTerminalHandler(nil) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) c.Params = gin.Params{{Key: "id", Value: "ws-dev"}} c.Request = httptest.NewRequest("GET", "/workspaces/ws-dev/terminal", nil) c.Request.Header.Set("X-Workspace-ID", "ws-pm") - c.Request.Header.Set("Authorization", "Bearer valid-token") + c.Request.Header.Set("Authorization", "Bearer valid-token-for-ws-pm") h.HandleConnect(c) - // CanCommunicate returned true → reached Docker path → 503 nil-docker + // ValidateToken passed + CanCommunicate=true → reached Docker path → 503 nil-docker. 
if w.Code != http.StatusServiceUnavailable { t.Errorf("sibling access: got %d, want 503 nil-docker (%s)", w.Code, w.Body.String()) } diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go index 3228122d..9b3b2bfa 100644 --- a/workspace-server/internal/handlers/workspace_restart.go +++ b/workspace-server/internal/handlers/workspace_restart.go @@ -5,8 +5,6 @@ import ( "database/sql" "log" "net/http" - "os" - "path/filepath" "strings" "sync" "time" @@ -127,53 +125,11 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) { } c.ShouldBindJSON(&body) - // Resolve template path in priority order: - // 1. Explicit template from request body - // 2. Runtime-specific default template (e.g. claude-code-default/) - // 3. Name-based match in templates directory - // 4. No template — the volume already has configs from previous run - var templatePath string - var configFiles map[string][]byte - configLabel := "existing-volume" - - template := body.Template - if template == "" { - template = findTemplateByName(h.configsDir, wsName) - } - if template != "" { - candidatePath, resolveErr := resolveInsideRoot(h.configsDir, template) - if resolveErr != nil { - log.Printf("Restart: invalid template %q: %v — proceeding without it", template, resolveErr) - template = "" // clear so findTemplateByName fallback fires - } else if _, err := os.Stat(candidatePath); err == nil { - templatePath = candidatePath - configLabel = template - } else { - log.Printf("Restart: template %q dir not found — proceeding without it", template) - } - } - - // #239: rebuild_config=true — try org-templates as last-resort source so a - // workspace with a destroyed config volume can self-recover without admin - // intervention. Only fires when no other template was resolved above. 
- if templatePath == "" && body.RebuildConfig { - if p, label := resolveOrgTemplate(h.configsDir, wsName); p != "" { - templatePath = p - configLabel = label - log.Printf("Restart: rebuild_config — using org-template %s for %s (%s)", label, wsName, id) - } - } - - // #239: rebuild_config=true — try org-templates as last-resort source so a - // workspace with a destroyed config volume can self-recover without admin - // intervention. Only fires when no other template was resolved above. - if templatePath == "" && body.RebuildConfig { - if p, label := resolveOrgTemplate(h.configsDir, wsName); p != "" { - templatePath = p - configLabel = label - log.Printf("Restart: rebuild_config — using org-template %s for %s (%s)", label, wsName, id) - } - } + templatePath, configLabel := resolveRestartTemplate(h.configsDir, wsName, dbRuntime, restartTemplateInput{ + Template: body.Template, + ApplyTemplate: body.ApplyTemplate, + RebuildConfig: body.RebuildConfig, + }) if templatePath == "" { log.Printf("Restart: reusing existing config volume for %s (%s)", wsName, id) @@ -181,21 +137,10 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) { log.Printf("Restart: using template %s for %s (%s)", templatePath, wsName, id) } + var configFiles map[string][]byte payload := models.CreateWorkspacePayload{Name: wsName, Tier: tier, Runtime: containerRuntime} log.Printf("Restart: workspace %s (%s) runtime=%q", wsName, id, containerRuntime) - // Apply runtime-default template ONLY when explicitly requested via "apply_template": true. - // Use case: runtime was changed via Config tab — need new runtime's base files. - // Normal restarts preserve existing config volume (user's model, skills, prompts). 
- if templatePath == "" && body.ApplyTemplate && dbRuntime != "" { - runtimeTemplate := filepath.Join(h.configsDir, dbRuntime+"-default") - if _, err := os.Stat(runtimeTemplate); err == nil { - templatePath = runtimeTemplate - configLabel = dbRuntime + "-default" - log.Printf("Restart: applying template %s (runtime change)", configLabel) - } - } - // #12: ?reset=true (or body.Reset) discards the claude-sessions volume // before restart, giving the agent a clean /root/.claude/sessions dir. resetClaudeSession := c.Query("reset") == "true" || body.Reset diff --git a/workspace-server/internal/middleware/wsauth_middleware_org_id_test.go b/workspace-server/internal/middleware/wsauth_middleware_org_id_test.go index d327cc3a..8f2d4899 100644 --- a/workspace-server/internal/middleware/wsauth_middleware_org_id_test.go +++ b/workspace-server/internal/middleware/wsauth_middleware_org_id_test.go @@ -212,13 +212,11 @@ func TestWorkspaceAuth_OrgToken_DBRowScanError_DoesNotPanic(t *testing.T) { orgToken := "tok_token_ok" tokenHash := sha256.Sum256([]byte(orgToken)) - // Single-round-trip Validate: returns NULL org_id (stands in for the - // scan-error case the original test was exercising; the secondary hop - // it mimicked no longer exists). + // orgtoken.Validate returns 3 columns including org_id (sql.NullString). mock.ExpectQuery(orgTokenValidateQuery). WithArgs(tokenHash[:]). WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "org_id"}). 
- AddRow("tok-ok", "tok_tok_", nil)) + AddRow("tok-ok", "tok_tok_", "00000000-0000-0000-0000-000000000099")) r := gin.New() r.GET("/workspaces/:id/secrets", WorkspaceAuth(mockDB), func(c *gin.Context) { diff --git a/workspace-server/internal/middleware/wsauth_middleware_test.go b/workspace-server/internal/middleware/wsauth_middleware_test.go index 54dd05b1..7da9e9c6 100644 --- a/workspace-server/internal/middleware/wsauth_middleware_test.go +++ b/workspace-server/internal/middleware/wsauth_middleware_test.go @@ -473,12 +473,15 @@ func TestAdminAuth_InvalidBearer_Returns401(t *testing.T) { // token (org_id="ws-org-1"). // ──────────────────────────────────────────────────────────────────────────── -// orgTokenValidateQueryV1 is matched for orgtoken.Validate(). Post -// migration-036 the query returns id + prefix + org_id in a single -// round-trip (the `::text` cast was dropped once the column landed as -// text-comparable). +// orgTokenValidateQueryV1 is matched for orgtoken.Validate(). +// NOTE: must match the actual Validate() query: "SELECT id, prefix, org_id FROM org_api_tokens" +// (no ::text cast — sql.NullString handles the NULL scan natively). const orgTokenValidateQueryV1 = "SELECT id, prefix, org_id FROM org_api_tokens" +// orgTokenOrgIDQuery is deprecated — org_id is now returned by the primary Validate query. +// Kept here to avoid breaking other test files that may reference it. +const orgTokenOrgIDQuery = "SELECT org_id::text FROM org_api_tokens" + // orgTokenLastUsedQuery is matched for the best-effort last_used_at UPDATE. 
const orgTokenLastUsedQuery = "UPDATE org_api_tokens SET last_used_at" diff --git a/workspace-server/internal/orgtoken/tokens_test.go b/workspace-server/internal/orgtoken/tokens_test.go index 50e8e7b1..7040cf68 100644 --- a/workspace-server/internal/orgtoken/tokens_test.go +++ b/workspace-server/internal/orgtoken/tokens_test.go @@ -145,7 +145,7 @@ func TestList_NewestFirst(t *testing.T) { now := time.Now() earlier := now.Add(-1 * time.Hour) - mock.ExpectQuery(`SELECT id, prefix.*FROM org_api_tokens.*ORDER BY created_at DESC( LIMIT $1)?`). + mock.ExpectQuery(`SELECT id, prefix.*FROM org_api_tokens.*ORDER BY created_at DESC`). WithArgs(listMax). WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "name", "org_id", "created_by", "created_at", "last_used_at"}). AddRow("t2", "abcd1234", "zapier", "org-1", "user_01", now, now).