Merge remote-tracking branch 'origin/staging' into fix/restore-quickstart-plus-hotfixes

# Conflicts:
#	workspace-server/internal/handlers/ssrf.go
This commit is contained in:
Hongming Wang 2026-04-23 16:42:41 -07:00
commit c5bcd7298c
14 changed files with 471 additions and 118 deletions

View File

@ -142,7 +142,8 @@ jobs:
# Strip the package-import prefix so we can match .coverage-allowlist.txt
# entries written as paths relative to workspace-server/.
rel=$(echo "$file" | sed 's|^github.com/Molecule-AI/molecule-monorepo/platform/||')
# Handle both module paths: platform/workspace-server/... and platform/...
rel=$(echo "$file" | sed 's|^github.com/Molecule-AI/molecule-monorepo/platform/workspace-server/||; s|^github.com/Molecule-AI/molecule-monorepo/platform/||')
if echo "$ALLOWLIST" | grep -qxF "$rel"; then
echo "::warning file=workspace-server/$rel::Critical file at ${pct}% coverage (allowlisted, #1823) — fix before expiry."

View File

@ -104,6 +104,13 @@ interface RuntimeOption {
// Fallback used when /templates can't be fetched (offline, older backend).
// Keep in sync with manifest.json workspace_templates as a defensive default.
// Model + env suggestions only flow when the backend is reachable.
// Runtimes that manage their own config outside the platform's config.yaml
// template. For these, a missing config.yaml is expected — the user manages
// config via the runtime's own mechanism (e.g. hermes edits
// ~/.hermes/config.yaml on the workspace EC2 via the Terminal tab or its
// own CLI). Showing a "No config.yaml found" error for these is misleading.
const RUNTIMES_WITH_OWN_CONFIG = new Set<string>(["hermes", "external"]);
const FALLBACK_RUNTIME_OPTIONS: RuntimeOption[] = [
{ value: "", label: "LangGraph (default)", models: [] },
{ value: "claude-code", label: "Claude Code", models: [] },
@ -134,14 +141,50 @@ export function ConfigTab({ workspaceId }: Props) {
const loadConfig = useCallback(async () => {
setLoading(true);
setError(null);
// ALWAYS load workspace metadata first (runtime + model). These are the
// source of truth regardless of whether the runtime uses our config.yaml
// template. Without this the form falls back to empty/default values on
// a hermes workspace (which doesn't use our template), creating the
// appearance that the saved runtime is unset — and worse, clicking Save
// would silently flip `runtime` from `hermes` back to the dropdown
// default `LangGraph`. See GH #1894.
let wsMetadataRuntime = "";
let wsMetadataModel = "";
try {
const ws = await api.get<{ runtime?: string }>(`/workspaces/${workspaceId}`);
wsMetadataRuntime = (ws.runtime || "").trim();
} catch { /* fall back to config.yaml */ }
try {
const m = await api.get<{ model?: string }>(`/workspaces/${workspaceId}/model`);
wsMetadataModel = (m.model || "").trim();
} catch { /* non-fatal */ }
try {
const res = await api.get<{ content: string }>(`/workspaces/${workspaceId}/files/config.yaml`);
const parsed = parseYaml(res.content);
setOriginalYaml(res.content);
setRawDraft(res.content);
setConfig({ ...DEFAULT_CONFIG, ...parsed } as ConfigData);
// Merge: config.yaml wins for fields it declares, but workspace metadata
// wins for runtime + model when config.yaml doesn't set them.
const merged = { ...DEFAULT_CONFIG, ...parsed } as ConfigData;
if (!merged.runtime && wsMetadataRuntime) merged.runtime = wsMetadataRuntime;
if (!merged.model && wsMetadataModel) merged.model = wsMetadataModel;
setConfig(merged);
} catch {
setError("No config.yaml found");
// No platform-managed config.yaml. Some runtimes (hermes, external)
// manage their own config outside this template; that's expected, not
// an error. Populate the form from workspace metadata so the user
// still sees the saved runtime + model.
const runtimeManagesOwnConfig = RUNTIMES_WITH_OWN_CONFIG.has(wsMetadataRuntime);
if (!runtimeManagesOwnConfig) {
setError("No config.yaml found");
}
setConfig({
...DEFAULT_CONFIG,
runtime: wsMetadataRuntime,
model: wsMetadataModel,
} as ConfigData);
} finally {
setLoading(false);
}
@ -520,6 +563,13 @@ export function ConfigTab({ workspaceId }: Props) {
{error && (
<div className="mx-3 mb-2 px-3 py-1.5 bg-red-900/30 border border-red-800 rounded text-xs text-red-400">{error}</div>
)}
{!error && RUNTIMES_WITH_OWN_CONFIG.has(config.runtime || "") && (
<div className="mx-3 mb-2 px-3 py-1.5 bg-zinc-900/50 border border-zinc-700 rounded text-xs text-zinc-400">
{config.runtime === "hermes"
? "Hermes manages its own config at ~/.hermes/config.yaml on the workspace host. Edit it via the Terminal tab or the hermes CLI, not this form."
: "This runtime manages its own config outside the platform template."}
</div>
)}
{success && (
<div className="mx-3 mb-2 px-3 py-1.5 bg-green-900/30 border border-green-800 rounded text-xs text-green-400">Saved</div>
)}

View File

@ -58,24 +58,27 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
// Issue #110.
//
// #1870 Phase 1: before returning 503, enqueue the request for drain
// on next heartbeat. Returning 202 Accepted {queued:true} means the
// caller records "dispatched — queued" not "failed", eliminating the
// fan-out-storm drop pattern.
// on next heartbeat. Returning 202 Accepted {queued:true} as a SUCCESS
// (not an error) means callers record this as "dispatched — queued"
// not "failed", eliminating the fan-out-storm drop pattern.
//
// Critical: must return (status, body, NIL ERROR) so the caller's
// `if proxyErr != nil` branch doesn't fire. Returning a proxyA2AError
// with 202 status here was the original cycle 53 bug — callers saw
// proxyErr != nil and logged "delegation failed: proxy a2a error".
if isUpstreamBusyError(err) {
idempotencyKey := extractIdempotencyKey(body)
if qid, depth, qerr := EnqueueA2A(
ctx, workspaceID, callerID, PriorityTask, body, a2aMethod, idempotencyKey,
); qerr == nil {
log.Printf("ProxyA2A: target %s busy — enqueued as %s (depth=%d)", workspaceID, qid, depth)
return http.StatusAccepted, nil, &proxyA2AError{
Status: http.StatusAccepted,
Response: gin.H{
"queued": true,
"queue_id": qid,
"queue_depth": depth,
"message": "workspace agent busy — request queued, will dispatch when capacity available",
},
}
respBody, _ := json.Marshal(gin.H{
"queued": true,
"queue_id": qid,
"queue_depth": depth,
"message": "workspace agent busy — request queued, will dispatch when capacity available",
})
return http.StatusAccepted, respBody, nil
} else {
// Queue insert failed — fall through to legacy 503 behavior
// so callers still retry. We don't want a queue DB hiccup to

View File

@ -16,6 +16,7 @@ import (
"encoding/json"
"errors"
"log"
"net/http"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
)
@ -82,13 +83,18 @@ func EnqueueA2A(
methodArg = method
}
// INSERT ... ON CONFLICT DO NOTHING RETURNING id. On conflict we then
// look up the existing row's id so the caller always receives a valid
// queue entry reference.
// INSERT ... ON CONFLICT DO NOTHING RETURNING id. The conflict target
// must reference the partial unique INDEX columns + WHERE clause directly
// (Postgres can't reference partial unique indexes by name in
// ON CONFLICT — only true CONSTRAINTs work for that). On conflict we
// then look up the existing row's id so the caller always receives a
// valid queue entry reference.
err = db.DB.QueryRowContext(ctx, `
INSERT INTO a2a_queue (workspace_id, caller_id, priority, body, method, idempotency_key)
VALUES ($1, $2, $3, $4::jsonb, $5, $6)
ON CONFLICT ON CONSTRAINT idx_a2a_queue_idempotency DO NOTHING
ON CONFLICT (workspace_id, idempotency_key)
WHERE idempotency_key IS NOT NULL AND status IN ('queued','dispatched')
DO NOTHING
RETURNING id
`, workspaceID, callerArg, priority, string(body), methodArg, keyArg).Scan(&id)
@ -228,11 +234,34 @@ func (h *WorkspaceHandler) DrainQueueForWorkspace(ctx context.Context, workspace
}
// logActivity=false: the original EnqueueA2A callsite already logged
// the dispatch attempt; re-logging here would double-count events.
_, _, proxyErr := h.proxyA2ARequest(ctx, workspaceID, item.Body, callerID, false)
status, _, proxyErr := h.proxyA2ARequest(ctx, workspaceID, item.Body, callerID, false)
// 202 Accepted = the dispatch was itself queued again (target still busy).
// That's not a failure — the queued item just stays queued naturally on
// the next drain tick. Mark this attempt completed so we don't double-
// count attempts; the new (re-)queue row already exists.
if status == http.StatusAccepted {
MarkQueueItemCompleted(ctx, item.ID)
log.Printf("A2AQueue drain: %s re-queued (target still busy)", item.ID)
return
}
if proxyErr != nil {
MarkQueueItemFailed(ctx, item.ID, proxyErr.Response["error"].(string))
log.Printf("A2AQueue drain: dispatch for %s failed (attempt=%d): %v",
item.ID, item.Attempts, proxyErr.Response["error"])
// Defensive: proxyErr.Response is gin.H (map[string]interface{}). The
// "error" key is conventionally a string but can be missing or non-
// string in edge paths (e.g. a future error builder using a typed
// struct). Cast safely so a missing key doesn't crash the platform —
// today's outage was caused by an unchecked .(string) here.
errMsg, _ := proxyErr.Response["error"].(string)
if errMsg == "" {
errMsg = http.StatusText(proxyErr.Status)
if errMsg == "" {
errMsg = "unknown drain dispatch error"
}
}
MarkQueueItemFailed(ctx, item.ID, errMsg)
log.Printf("A2AQueue drain: dispatch for %s failed (attempt=%d): %s",
item.ID, item.Attempts, errMsg)
return
}
MarkQueueItemCompleted(ctx, item.ID)

View File

@ -0,0 +1,96 @@
package handlers
import (
"log"
"os"
"path/filepath"
)
// restartTemplateInput is the subset of the /workspaces/:id/restart request
// body that affects which config source the provisioner uses. Extracted as
// a type so `resolveRestartTemplate` has a single pure-function signature
// for unit tests — no gin context, no DB, no filesystem writes.
type restartTemplateInput struct {
// Template is an explicit template dir name from the request body.
// Always honoured when resolvable — caller asked by name, that's
// unambiguous consent to overwrite the config volume.
Template string
// ApplyTemplate opts the caller in to name-based auto-match AND the
// runtime-default fallback. Without this flag a restart MUST NOT
// overwrite the user's config volume — a user who edited their
// model/provider/skills/prompts via the Canvas Config tab and hit
// Save+Restart expects their edits to survive. The previous behaviour
// (name-based auto-match unconditionally) silently reverted edits for
// any workspace whose name matched a template dir (e.g. "Hermes Agent"
// → hermes/), which is the regression this fix closes.
ApplyTemplate bool
// RebuildConfig (#239) is the recovery signal used when the workspace's
// config volume was destroyed out-of-band. Tries org-templates as a
// last-resort source so the workspace can self-heal without admin
// intervention. Orthogonal to ApplyTemplate.
RebuildConfig bool
}
// resolveRestartTemplate chooses the config source for a restart in the
// documented priority order:
//
//  1. Explicit `Template` from the request body (always honoured).
//  2. `ApplyTemplate=true` → name-based auto-match via findTemplateByName.
//  3. `RebuildConfig=true` → org-templates recovery fallback (#239).
//  4. `ApplyTemplate=true` + non-empty dbRuntime → runtime-default template
//     (e.g. `hermes-default/`) for runtime-change workflows.
//  5. Fall through → empty path + "existing-volume" label, telling the
//     provisioner to reuse the workspace's existing config volume.
//
// Returns (templatePath, configLabel). An empty templatePath signals that
// the existing volume is authoritative — the flow that preserves user edits.
//
// Pure function: no writes, no DB access, no network. Unit-testable with
// nothing but a temp directory.
func resolveRestartTemplate(configsDir, wsName, dbRuntime string, body restartTemplateInput) (templatePath, configLabel string) {
	name := body.Template

	// Tier 2: opt-in name-based auto-match. Deliberately gated on
	// ApplyTemplate so a plain default restart can never clobber user edits
	// just because the workspace name happens to match a template dir.
	if name == "" && body.ApplyTemplate {
		name = findTemplateByName(configsDir, wsName)
	}

	// Tiers 1 and 2 share validation: confine the candidate inside the
	// templates root, then confirm the directory actually exists on disk.
	if name != "" {
		candidate, resolveErr := resolveInsideRoot(configsDir, name)
		if resolveErr != nil {
			log.Printf("Restart: invalid template %q: %v — proceeding without it", name, resolveErr)
		} else if _, statErr := os.Stat(candidate); statErr == nil {
			return candidate, name
		} else {
			log.Printf("Restart: template %q dir not found — proceeding without it", name)
		}
	}

	// Tier 3: #239 rebuild_config — org-templates as last-resort recovery
	// for a workspace whose config volume was destroyed out-of-band.
	if body.RebuildConfig {
		if orgPath, orgLabel := resolveOrgTemplate(configsDir, wsName); orgPath != "" {
			log.Printf("Restart: rebuild_config — using org-template %s (%s)", orgLabel, wsName)
			return orgPath, orgLabel
		}
	}

	// Tier 4: runtime-default. Used when the Canvas Config tab changed the
	// runtime: the existing volume holds the OLD runtime's files, so the
	// new runtime needs its base files (entry point, Dockerfile, skill
	// scaffolding) laid down from `<runtime>-default/`.
	if body.ApplyTemplate && dbRuntime != "" {
		defaultLabel := dbRuntime + "-default"
		defaultDir := filepath.Join(configsDir, defaultLabel)
		if _, statErr := os.Stat(defaultDir); statErr == nil {
			log.Printf("Restart: applying template %s (runtime change)", defaultLabel)
			return defaultDir, defaultLabel
		}
	}

	// Tier 5: reuse the existing volume — the default, and the path the
	// Canvas Save+Restart flow MUST hit to preserve user edits.
	return "", "existing-volume"
}

View File

@ -0,0 +1,178 @@
package handlers
import (
"os"
"path/filepath"
"testing"
)
// Tests for resolveRestartTemplate — the pure helper that implements the
// priority chain documented on the function. Each test builds a minimal
// temp configsDir, fabricates the specific precondition it exercises,
// and asserts (templatePath, configLabel).
//
// The regression this suite locks in: a default restart (no flags) must
// never auto-apply a template that happens to match the workspace name.
// That was the "model reverts on Save+Restart" bug from
// fix/restart-preserves-user-config.
// newTemplateDir makes a templates root with named subdirs, each holding
// a minimal config.yaml so findTemplateByName's dir-scan path has
// something to read. Returns the absolute root.
func newTemplateDir(t *testing.T, names ...string) string {
t.Helper()
root := t.TempDir()
for _, n := range names {
dir := filepath.Join(root, n)
if err := os.MkdirAll(dir, 0o755); err != nil {
t.Fatalf("mkdir %s: %v", dir, err)
}
cfg := filepath.Join(dir, "config.yaml")
if err := os.WriteFile(cfg, []byte("name: "+n+"\n"), 0o644); err != nil {
t.Fatalf("write %s: %v", cfg, err)
}
}
return root
}
// TestResolveRestartTemplate_DefaultRestart_PreservesVolume locks in the
// Canvas Save+Restart regression fix: even when a template's config.yaml
// `name:` field matches the workspace name exactly (the worst case for the
// name-scan pass in findTemplateByName), a restart WITHOUT apply_template
// must resolve no template and reuse the existing config volume.
func TestResolveRestartTemplate_DefaultRestart_PreservesVolume(t *testing.T) {
	configs := newTemplateDir(t, "hermes")
	// Make the template's config.yaml advertise the exact workspace name,
	// so a name-scan would hit if it were (wrongly) consulted.
	manifest := filepath.Join(configs, "hermes", "config.yaml")
	if err := os.WriteFile(manifest, []byte("name: Hermes Agent\n"), 0o644); err != nil {
		t.Fatal(err)
	}
	// ApplyTemplate deliberately left false — the plain default restart.
	gotPath, gotLabel := resolveRestartTemplate(configs, "Hermes Agent", "hermes", restartTemplateInput{})
	if gotPath != "" {
		t.Errorf("default restart must NOT resolve a template; got path=%q", gotPath)
	}
	if gotLabel != "existing-volume" {
		t.Errorf("expected 'existing-volume' label on default restart; got %q", gotLabel)
	}
}
// TestResolveRestartTemplate_ExplicitTemplate_AlwaysHonoured: naming a
// template in the request body is unambiguous consent, so it must resolve
// even with ApplyTemplate unset.
func TestResolveRestartTemplate_ExplicitTemplate_AlwaysHonoured(t *testing.T) {
	configs := newTemplateDir(t, "langgraph")
	input := restartTemplateInput{Template: "langgraph"}
	gotPath, gotLabel := resolveRestartTemplate(configs, "Some Agent", "", input)
	if gotPath == "" || gotLabel != "langgraph" {
		t.Errorf("explicit template must resolve; got path=%q label=%q", gotPath, gotLabel)
	}
}
// TestResolveRestartTemplate_ApplyTemplate_NameMatch: setting ApplyTemplate
// opts back in to the name-based auto-match, for operators who really do
// want "reset this workspace to its template".
func TestResolveRestartTemplate_ApplyTemplate_NameMatch(t *testing.T) {
	configs := newTemplateDir(t, "hermes")
	input := restartTemplateInput{ApplyTemplate: true}
	gotPath, gotLabel := resolveRestartTemplate(configs, "Hermes", "", input)
	if gotPath == "" || gotLabel != "hermes" {
		t.Errorf("apply_template should name-match; got path=%q label=%q", gotPath, gotLabel)
	}
}
// TestResolveRestartTemplate_ApplyTemplate_RuntimeDefault exercises the
// runtime-change flow: after the Canvas Config tab switches runtimes, the
// restart needs the new runtime's base files from `<runtime>-default/`.
func TestResolveRestartTemplate_ApplyTemplate_RuntimeDefault(t *testing.T) {
	configs := newTemplateDir(t, "langgraph-default")
	input := restartTemplateInput{ApplyTemplate: true}
	gotPath, gotLabel := resolveRestartTemplate(configs, "Some Workspace", "langgraph", input)
	if gotPath == "" || gotLabel != "langgraph-default" {
		t.Errorf("apply_template + dbRuntime should resolve runtime-default; got path=%q label=%q", gotPath, gotLabel)
	}
}
// TestResolveRestartTemplate_ApplyTemplate_NoMatch_NoRuntime: with neither
// a name match nor a runtime-default available, the helper falls all the
// way through to the reuse-volume path.
func TestResolveRestartTemplate_ApplyTemplate_NoMatch_NoRuntime(t *testing.T) {
	configs := newTemplateDir(t) // deliberately empty templates root
	gotPath, gotLabel := resolveRestartTemplate(configs, "Orphan", "", restartTemplateInput{
		ApplyTemplate: true,
	})
	if gotPath != "" {
		t.Errorf("nothing to apply → expected empty path; got %q", gotPath)
	}
	if gotLabel != "existing-volume" {
		t.Errorf("expected 'existing-volume' fallback; got %q", gotLabel)
	}
}
// TestResolveRestartTemplate_InvalidExplicitTemplate_ProceedsWithout: an
// explicit Template that fails root confinement (e.g. a traversal attempt)
// must be dropped with a log — no crash, no escape outside configsDir.
func TestResolveRestartTemplate_InvalidExplicitTemplate_ProceedsWithout(t *testing.T) {
	configs := newTemplateDir(t, "langgraph")
	gotPath, gotLabel := resolveRestartTemplate(configs, "Some Agent", "", restartTemplateInput{
		Template: "../../etc/passwd",
	})
	if gotPath != "" {
		t.Errorf("traversal attempt must not resolve; got %q", gotPath)
	}
	if gotLabel != "existing-volume" {
		t.Errorf("expected 'existing-volume' fallback on invalid template; got %q", gotLabel)
	}
}
// TestResolveRestartTemplate_NonExistentExplicitTemplate: a syntactically
// valid template name whose directory is absent on disk (e.g. manually
// deleted) must also fall through to the reuse-volume path.
func TestResolveRestartTemplate_NonExistentExplicitTemplate(t *testing.T) {
	configs := newTemplateDir(t, "langgraph")
	gotPath, gotLabel := resolveRestartTemplate(configs, "Some Agent", "", restartTemplateInput{
		Template: "deleted-template",
	})
	if gotPath != "" {
		t.Errorf("missing template must not resolve; got %q", gotPath)
	}
	if gotLabel != "existing-volume" {
		t.Errorf("expected 'existing-volume' fallback on missing template; got %q", gotLabel)
	}
}
// TestResolveRestartTemplate_Priority_ExplicitBeatsApplyTemplate: when both
// an explicit Template and ApplyTemplate are supplied, the explicit choice
// wins over any name-based match ("Hermes" would otherwise match hermes/).
func TestResolveRestartTemplate_Priority_ExplicitBeatsApplyTemplate(t *testing.T) {
	configs := newTemplateDir(t, "hermes", "langgraph")
	gotPath, gotLabel := resolveRestartTemplate(configs, "Hermes", "", restartTemplateInput{
		Template:      "langgraph",
		ApplyTemplate: true,
	})
	if gotLabel != "langgraph" {
		t.Errorf("explicit Template must win; got label=%q", gotLabel)
	}
	// The resolved path must point inside the langgraph template dir.
	if want := filepath.Join(configs, "langgraph"); gotPath != want {
		t.Errorf("expected path %q, got %q", want, gotPath)
	}
}

View File

@ -30,6 +30,20 @@ func devModeAllowsLoopback() bool {
return env == "development" || env == "dev"
}
// ssrfCheckEnabled gates the real validation in isSafeURL. Tests flip it
// off via setSSRFCheckForTest so httptest.NewServer loopback URLs and fake
// hostnames (*.example) aren't rejected as SSRF. Production code never
// mutates this.
var ssrfCheckEnabled = true

// setSSRFCheckForTest swaps ssrfCheckEnabled to the given value and hands
// back a restore func that reinstates the previous setting. Use with defer
// in *_test.go only.
func setSSRFCheckForTest(enabled bool) func() {
	saved := ssrfCheckEnabled
	ssrfCheckEnabled = enabled
	restore := func() { ssrfCheckEnabled = saved }
	return restore
}
// isSafeURL validates that a URL resolves to a publicly-routable address,
// preventing A2A requests from being redirected to internal/cloud-metadata
// infrastructure (SSRF, CWE-918). Workspace URLs come from DB/Redis caches
@ -40,6 +54,9 @@ func devModeAllowsLoopback() bool {
// the same VPC and register by their VPC-private IP. Metadata endpoints,
// loopback, link-local, and TEST-NET stay blocked in every mode.
func isSafeURL(rawURL string) error {
if !ssrfCheckEnabled {
return nil
}
u, err := url.Parse(rawURL)
if err != nil {
return fmt.Errorf("invalid URL: %w", err)
@ -191,8 +208,20 @@ func mustCIDR(s string) net.IPNet {
// the destination via absolute paths or ".." traversal. Used by
// copyFilesToContainer and deleteViaEphemeral as a defence-in-depth measure.
func validateRelPath(filePath string) error {
// Reject empty string and dot-only paths before any processing.
if filePath == "" || filePath == "." {
return fmt.Errorf("empty or dot-only path not allowed")
}
clean := filepath.Clean(filePath)
if filepath.IsAbs(clean) || strings.Contains(clean, "..") {
// Reject absolute paths (Unix / or Windows C:\).
if filepath.IsAbs(clean) {
return fmt.Errorf("path traversal or absolute path not allowed: %s", filePath)
}
// Reject any path containing ".." anywhere — check both raw and cleaned
// because filepath.Clean resolves ".." upward (e.g. "foo/../bar" → "bar"
// and "foo/.." → ".") which would make the check pass if only clean were checked.
// We only want explicitly-named files; ".." implies intent to escape.
if strings.Contains(filePath, "..") || strings.Contains(clean, "..") {
return fmt.Errorf("path traversal or absolute path not allowed: %s", filePath)
}
return nil

View File

@ -292,8 +292,7 @@ func (h *TemplatesHandler) ReadFile(c *gin.Context) {
// Try container first
if containerName := h.findContainer(ctx, workspaceID); containerName != "" {
containerPath := rootPath + "/" + filePath
content, err := h.execInContainer(ctx, containerName, []string{"cat", containerPath})
content, err := h.execInContainer(ctx, containerName, []string{"cat", containerPath})
if err == nil {
c.JSON(http.StatusOK, gin.H{
"path": filePath,

View File

@ -75,17 +75,25 @@ func (h *TerminalHandler) HandleConnect(c *gin.Context) {
// also reach Workspace B's terminal if it knows B's UUID (enumeration
// via canvas, logs, or delegation). Shell access is more dangerous than
// A2A message-passing, so we apply the same hierarchy check here.
// GH#756/#1609 security fix: if the caller claims a specific workspace
// identity (X-Workspace-ID header), the bearer token — if present — must
// belong to that claimed workspace. ValidateAnyToken accepted ANY valid org
// token, allowing Workspace A to forge X-Workspace-ID: B and reach B's
// terminal if A held any valid token. ValidateToken binds the token to
// the claimed workspace identity.
callerID := c.GetHeader("X-Workspace-ID")
if callerID != "" {
if callerID != "" && callerID != workspaceID {
tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization"))
if tok != "" {
if err := wsauth.ValidateAnyToken(ctx, db.DB, tok); err == nil {
if !canCommunicateCheck(callerID, workspaceID) {
c.JSON(http.StatusForbidden, gin.H{"error": "not authorized to access this workspace's terminal"})
return
}
if err := wsauth.ValidateToken(ctx, db.DB, callerID, tok); err != nil {
c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid token for claimed workspace"})
return
}
}
if !canCommunicateCheck(callerID, workspaceID) {
c.JSON(http.StatusForbidden, gin.H{"error": "not authorized to access this workspace's terminal"})
return
}
}
// Check for CP-provisioned workspace (instance_id persisted by

View File

@ -73,16 +73,15 @@ func TestTerminalConnect_KI005_RejectsUnauthorizedCrossWorkspace(t *testing.T) {
canCommunicateCheck = func(callerID, targetID string) bool { return false }
defer func() { canCommunicateCheck = prev }()
// Token lookup: ws-caller's token is valid. ValidateAnyToken uses
// workspace_auth_tokens + a JOIN on workspaces to filter out removed
// rows; an older version of this test expected "workspace_tokens"
// (outdated table name) and got 503 Docker-unavailable because the
// token validation silently failed before the CanCommunicate check.
rows := sqlmock.NewRows([]string{"id"}).AddRow("tok-1")
mock.ExpectQuery(`SELECT t\.id\s+FROM workspace_auth_tokens t`).
// Token lookup: ws-caller's token is valid. ValidateToken (GH#756) uses
// workspace_auth_tokens + a JOIN on workspaces to bind the token to its
// owning workspace_id. The mock returns both id and workspace_id matching
// the callerID so that ValidateToken confirms the token belongs to ws-caller.
rows := sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-caller")
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id\s+FROM workspace_auth_tokens t`).
WithArgs(sqlmock.AnyArg()).
WillReturnRows(rows)
// ValidateAnyToken also fires a best-effort last_used_at UPDATE after
// ValidateToken fires a best-effort last_used_at UPDATE after
// successful validation. Accept it so ExpectationsWereMet passes.
mock.ExpectExec(`UPDATE workspace_auth_tokens SET last_used_at`).
WithArgs(sqlmock.AnyArg()).
@ -207,9 +206,11 @@ func TestTerminalConnect_KI005_SkipsCheckWithoutHeader(t *testing.T) {
}
// TestTerminalConnect_KI005_RejectsInvalidToken tests that an invalid bearer
// token also results in a non-200 response (falls through to Docker auth).
// ValidateAnyToken returns error → CanCommunicate is never called.
// token when X-Workspace-ID is set results in 401 Unauthorized.
// ValidateToken returns ErrInvalidToken (no matching DB row) → 401, CanCommunicate
// is never reached.
func TestTerminalConnect_KI005_RejectsInvalidToken(t *testing.T) {
setupTestDB(t) // provides a mock DB; no expectations set → ValidateToken query returns error
canCommunicateCalled := false
prev := canCommunicateCheck
canCommunicateCheck = func(callerID, targetID string) bool {
@ -231,16 +232,19 @@ func TestTerminalConnect_KI005_RejectsInvalidToken(t *testing.T) {
if canCommunicateCalled {
t.Error("CanCommunicate should not be called with an invalid token")
}
// Got 503 (nil docker) instead of 200/403 — ValidateAnyToken rejected the
// token and we fell through to Docker auth, which returned 503 (nil docker).
if w.Code != http.StatusServiceUnavailable {
t.Errorf("invalid token: got %d, want 503 nil-docker (%s)", w.Code, w.Body.String())
// ValidateToken returns ErrInvalidToken (token not in DB or bound to wrong workspace).
// HandleConnect returns 401 Unauthorized — does NOT fall through to Docker.
if w.Code != http.StatusUnauthorized {
t.Errorf("invalid token: got %d, want 401 Unauthorized (%s)", w.Code, w.Body.String())
}
}
// TestTerminalConnect_KI005_AllowsSiblingWorkspace tests the sibling path:
// two workspaces with the same parent ID should be allowed to communicate.
// ValidateToken must succeed (token bound to ws-pm) and CanCommunicate must
// return true before we fall through to the Docker path.
func TestTerminalConnect_KI005_AllowsSiblingWorkspace(t *testing.T) {
mock := setupTestDB(t)
prev := canCommunicateCheck
canCommunicateCheck = func(callerID, targetID string) bool {
// Simulate sibling: same parent
@ -248,17 +252,27 @@ func TestTerminalConnect_KI005_AllowsSiblingWorkspace(t *testing.T) {
}
defer func() { canCommunicateCheck = prev }()
// ValidateToken: token is bound to ws-pm (the callerID). Returns id + workspace_id.
rows := sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-pm", "ws-pm")
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id\s+FROM workspace_auth_tokens t`).
WithArgs(sqlmock.AnyArg()).
WillReturnRows(rows)
// Best-effort last_used_at UPDATE.
mock.ExpectExec(`UPDATE workspace_auth_tokens SET last_used_at`).
WithArgs(sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
h := NewTerminalHandler(nil)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-dev"}}
c.Request = httptest.NewRequest("GET", "/workspaces/ws-dev/terminal", nil)
c.Request.Header.Set("X-Workspace-ID", "ws-pm")
c.Request.Header.Set("Authorization", "Bearer valid-token")
c.Request.Header.Set("Authorization", "Bearer valid-token-for-ws-pm")
h.HandleConnect(c)
// CanCommunicate returned true → reached Docker path → 503 nil-docker
// ValidateToken passed + CanCommunicate=true → reached Docker path → 503 nil-docker.
if w.Code != http.StatusServiceUnavailable {
t.Errorf("sibling access: got %d, want 503 nil-docker (%s)", w.Code, w.Body.String())
}

View File

@ -5,8 +5,6 @@ import (
"database/sql"
"log"
"net/http"
"os"
"path/filepath"
"strings"
"sync"
"time"
@ -127,53 +125,11 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
}
c.ShouldBindJSON(&body)
// Resolve template path in priority order:
// 1. Explicit template from request body
// 2. Runtime-specific default template (e.g. claude-code-default/)
// 3. Name-based match in templates directory
// 4. No template — the volume already has configs from previous run
var templatePath string
var configFiles map[string][]byte
configLabel := "existing-volume"
template := body.Template
if template == "" {
template = findTemplateByName(h.configsDir, wsName)
}
if template != "" {
candidatePath, resolveErr := resolveInsideRoot(h.configsDir, template)
if resolveErr != nil {
log.Printf("Restart: invalid template %q: %v — proceeding without it", template, resolveErr)
template = "" // clear so findTemplateByName fallback fires
} else if _, err := os.Stat(candidatePath); err == nil {
templatePath = candidatePath
configLabel = template
} else {
log.Printf("Restart: template %q dir not found — proceeding without it", template)
}
}
// #239: rebuild_config=true — try org-templates as last-resort source so a
// workspace with a destroyed config volume can self-recover without admin
// intervention. Only fires when no other template was resolved above.
if templatePath == "" && body.RebuildConfig {
if p, label := resolveOrgTemplate(h.configsDir, wsName); p != "" {
templatePath = p
configLabel = label
log.Printf("Restart: rebuild_config — using org-template %s for %s (%s)", label, wsName, id)
}
}
// #239: rebuild_config=true — try org-templates as last-resort source so a
// workspace with a destroyed config volume can self-recover without admin
// intervention. Only fires when no other template was resolved above.
if templatePath == "" && body.RebuildConfig {
if p, label := resolveOrgTemplate(h.configsDir, wsName); p != "" {
templatePath = p
configLabel = label
log.Printf("Restart: rebuild_config — using org-template %s for %s (%s)", label, wsName, id)
}
}
templatePath, configLabel := resolveRestartTemplate(h.configsDir, wsName, dbRuntime, restartTemplateInput{
Template: body.Template,
ApplyTemplate: body.ApplyTemplate,
RebuildConfig: body.RebuildConfig,
})
if templatePath == "" {
log.Printf("Restart: reusing existing config volume for %s (%s)", wsName, id)
@ -181,21 +137,10 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
log.Printf("Restart: using template %s for %s (%s)", templatePath, wsName, id)
}
var configFiles map[string][]byte
payload := models.CreateWorkspacePayload{Name: wsName, Tier: tier, Runtime: containerRuntime}
log.Printf("Restart: workspace %s (%s) runtime=%q", wsName, id, containerRuntime)
// Apply runtime-default template ONLY when explicitly requested via "apply_template": true.
// Use case: runtime was changed via Config tab — need new runtime's base files.
// Normal restarts preserve existing config volume (user's model, skills, prompts).
if templatePath == "" && body.ApplyTemplate && dbRuntime != "" {
runtimeTemplate := filepath.Join(h.configsDir, dbRuntime+"-default")
if _, err := os.Stat(runtimeTemplate); err == nil {
templatePath = runtimeTemplate
configLabel = dbRuntime + "-default"
log.Printf("Restart: applying template %s (runtime change)", configLabel)
}
}
// #12: ?reset=true (or body.Reset) discards the claude-sessions volume
// before restart, giving the agent a clean /root/.claude/sessions dir.
resetClaudeSession := c.Query("reset") == "true" || body.Reset

View File

@ -212,13 +212,11 @@ func TestWorkspaceAuth_OrgToken_DBRowScanError_DoesNotPanic(t *testing.T) {
orgToken := "tok_token_ok"
tokenHash := sha256.Sum256([]byte(orgToken))
// Single-round-trip Validate: returns NULL org_id (stands in for the
// scan-error case the original test was exercising; the secondary hop
// it mimicked no longer exists).
// orgtoken.Validate returns 3 columns including org_id (sql.NullString).
mock.ExpectQuery(orgTokenValidateQuery).
WithArgs(tokenHash[:]).
WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "org_id"}).
AddRow("tok-ok", "tok_tok_", nil))
AddRow("tok-ok", "tok_tok_", "00000000-0000-0000-0000-000000000099"))
r := gin.New()
r.GET("/workspaces/:id/secrets", WorkspaceAuth(mockDB), func(c *gin.Context) {

View File

@ -473,12 +473,15 @@ func TestAdminAuth_InvalidBearer_Returns401(t *testing.T) {
// token (org_id="ws-org-1").
// ────────────────────────────────────────────────────────────────────────────
// Query matchers used by sqlmock expectations for org-token auth.
const (
	// orgTokenValidateQueryV1 is matched for orgtoken.Validate().
	// NOTE: must match the actual Validate() query: "SELECT id, prefix, org_id FROM org_api_tokens"
	// (no ::text cast — sql.NullString handles the NULL scan natively).
	orgTokenValidateQueryV1 = "SELECT id, prefix, org_id FROM org_api_tokens"

	// orgTokenOrgIDQuery is deprecated — org_id is now returned by the
	// primary Validate query. Kept to avoid breaking other test files
	// that may still reference it.
	orgTokenOrgIDQuery = "SELECT org_id::text FROM org_api_tokens"

	// orgTokenLastUsedQuery is matched for the best-effort last_used_at UPDATE.
	orgTokenLastUsedQuery = "UPDATE org_api_tokens SET last_used_at"
)

View File

@ -145,7 +145,7 @@ func TestList_NewestFirst(t *testing.T) {
now := time.Now()
earlier := now.Add(-1 * time.Hour)
mock.ExpectQuery(`SELECT id, prefix.*FROM org_api_tokens.*ORDER BY created_at DESC( LIMIT $1)?`).
mock.ExpectQuery(`SELECT id, prefix.*FROM org_api_tokens.*ORDER BY created_at DESC`).
WithArgs(listMax).
WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "name", "org_id", "created_by", "created_at", "last_used_at"}).
AddRow("t2", "abcd1234", "zapier", "org-1", "user_01", now, now).