feat(workspace-server): surface provision_timeout_ms in workspace API (#2054 phase 2)

Phase 2 of #2054 — workspace-server reads runtime-level
provision_timeout_seconds from template config.yaml manifests and
includes provision_timeout_ms in the workspace List/Get response.
Phase 1 (canvas, #2092) already plumbs the field through socket →
node-data → ProvisioningTimeout's resolver, so the moment a
template declares the field the per-runtime banner threshold
adjusts without a canvas release.

Implementation:

- templates.go: parse runtime_config.provision_timeout_seconds in
  the templateSummary marshaller. The /templates API now surfaces
  the field too — useful for ops dashboards and future tooling.
- runtime_provision_timeouts.go (new): loadRuntimeProvisionTimeouts
  scans configsDir, parses every immediate subdir's config.yaml,
  returns runtime → seconds. Multiple templates with the same
  runtime: max wins (so a slow template's threshold doesn't get
  cut by a fast template's). Bad/empty inputs are silently
  skipped — workspace-server starts cleanly with no templates.
- runtimeProvisionTimeoutsCache: sync.Once-backed lazy cache.
  First workspace API request after process start pays the read
  cost (~few KB across ~50 templates); every subsequent request is
  a map lookup. Cache lifetime = process lifetime; invalidates on
  workspace-server restart, which is the normal template-change
  cadence.
- WorkspaceHandler gets a provisionTimeouts field (zero-value struct
  is valid — the cache lazy-inits on first get()).
- addProvisionTimeoutMs decorates the response map with
  provision_timeout_ms (seconds × 1000) when the runtime has a
  declared timeout. Absent = no key in the response, canvas falls
  through to its runtime-profile default. Wired into both List
  (per-row decoration in the loop) and Get.

Tests (5 new in runtime_provision_timeouts_test.go):
- happy path: hermes declares 720, claude-code doesn't, only
  hermes appears in the map
- max-on-duplicate: same runtime in two templates → max wins
- skip-bad-inputs: missing runtime, zero timeout, malformed yaml,
  loose top-level files all silently ignored
- missing-dir: returns empty map, no crash
- cache: lazy-init on first get; subsequent gets hit cache even
  after underlying file changes (sync.Once contract); unknown
  runtime returns zero

Phase 3 (separate template-repo PR): template-hermes config.yaml
declares provision_timeout_seconds: 720 under runtime_config.
canvas RUNTIME_PROFILES.hermes becomes redundant + removable.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
rabbitblood 2026-04-26 06:37:45 -07:00
parent f4cbb50ddf
commit 27396d992c
4 changed files with 261 additions and 13 deletions

View File

@ -0,0 +1,83 @@
package handlers
import (
"log"
"os"
"path/filepath"
"sync"
"gopkg.in/yaml.v3"
)
// runtimeProvisionTimeoutsCache memoizes the per-runtime provision-timeout
// values declared in template config.yaml manifests (#2054 phase 2). The
// zero value is ready to use: the map is built lazily, so the first
// workspace API request after process start pays the one-time read cost
// (a few KB of yaml across ~50 templates) and every later call is a plain
// map lookup.
//
// Cache lifetime = process lifetime. Templates only change on container
// rebuild + workspace-server restart, which already discards this process's
// in-memory state. A future template-hot-reload feature would need a
// refresh hook for this cache; none exists today.
type runtimeProvisionTimeoutsCache struct {
	loadOnce sync.Once
	timeouts map[string]int // runtime → seconds
}

// get returns the declared provision timeout in seconds for runtime, or 0
// when no template declares one. The first call scans configsDir; any
// configsDir passed on later calls is ignored (sync.Once contract).
func (c *runtimeProvisionTimeoutsCache) get(configsDir string, runtime string) int {
	c.loadOnce.Do(func() {
		c.timeouts = loadRuntimeProvisionTimeouts(configsDir)
	})
	return c.timeouts[runtime]
}
// loadRuntimeProvisionTimeouts reads every immediate subdirectory of
// configsDir, parses its config.yaml, and returns a runtime → seconds map
// for templates that declared runtime_config.provision_timeout_seconds.
//
// Templates without the field are simply absent from the map (a lookup
// yields zero, which callers treat as "fall through to the canvas runtime
// profile").
//
// When several templates share a runtime the MAX timeout wins — a slow
// template's threshold should beat a fast template's so users of either
// template see a true-positive timeout signal rather than a false alarm.
// Same-runtime divergence is rare in practice (typically one canonical
// template-{runtime} per runtime) but max is the safer default.
func loadRuntimeProvisionTimeouts(configsDir string) map[string]int {
	timeouts := map[string]int{}

	dirEntries, err := os.ReadDir(configsDir)
	if err != nil {
		// Logged but not fatal — workspace-server starts cleanly with
		// no templates (dev / fresh-clone). The result is an empty map
		// so every runtime falls through to canvas's profile default.
		log.Printf("loadRuntimeProvisionTimeouts: read configsDir %s: %v", configsDir, err)
		return timeouts
	}

	for _, entry := range dirEntries {
		if !entry.IsDir() {
			// Loose top-level files are not templates.
			continue
		}
		manifest, err := os.ReadFile(filepath.Join(configsDir, entry.Name(), "config.yaml"))
		if err != nil {
			// Subdir without a readable config.yaml — skip silently.
			continue
		}
		var cfg struct {
			Runtime       string `yaml:"runtime"`
			RuntimeConfig struct {
				ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"`
			} `yaml:"runtime_config"`
		}
		if err := yaml.Unmarshal(manifest, &cfg); err != nil {
			// Malformed yaml is skipped, never fatal.
			continue
		}
		secs := cfg.RuntimeConfig.ProvisionTimeoutSeconds
		if cfg.Runtime == "" || secs <= 0 {
			// No runtime to key on, or no positive declaration.
			continue
		}
		// Max-on-duplicate; the missing-key zero value makes the first
		// write for a runtime a plain comparison too.
		if secs > timeouts[cfg.Runtime] {
			timeouts[cfg.Runtime] = secs
		}
	}
	return timeouts
}

View File

@ -0,0 +1,128 @@
package handlers
import (
"os"
"path/filepath"
"testing"
)
// writeTemplate is a tiny test fixture: drop a config.yaml under
// tmp/<dir>/config.yaml with the given content. Mirrors the real
// configsDir layout (one subdir per template, each with its own
// config.yaml).
func writeTemplate(t *testing.T, dir, name, content string) {
t.Helper()
p := filepath.Join(dir, name)
if err := os.MkdirAll(p, 0o755); err != nil {
t.Fatalf("mkdir %s: %v", p, err)
}
if err := os.WriteFile(filepath.Join(p, "config.yaml"), []byte(content), 0o644); err != nil {
t.Fatalf("write config.yaml: %v", err)
}
}
// Happy path: a template that declares provision_timeout_seconds lands in
// the map under its runtime; a template without the field leaves no entry.
func TestLoadRuntimeProvisionTimeouts_HappyPath(t *testing.T) {
	dir := t.TempDir()
	writeTemplate(t, dir, "template-hermes", `
name: Hermes
runtime: hermes
runtime_config:
  provision_timeout_seconds: 720
`)
	writeTemplate(t, dir, "template-claude-code", `
name: Claude
runtime: claude-code
runtime_config:
  model: anthropic:claude-opus
`)

	timeouts := loadRuntimeProvisionTimeouts(dir)

	if timeouts["hermes"] != 720 {
		t.Errorf("hermes: got %d, want 720", timeouts["hermes"])
	}
	// claude-code didn't declare a timeout — must not appear in the map
	// (zero-value lookup is the no-override signal).
	if secs, ok := timeouts["claude-code"]; ok {
		t.Errorf("claude-code: present without declaration: %d", secs)
	}
}
// Two templates declaring the same runtime: the larger timeout must win so
// the slow template's threshold isn't cut down by the fast template's.
func TestLoadRuntimeProvisionTimeouts_MaxOnDuplicateRuntime(t *testing.T) {
	dir := t.TempDir()
	writeTemplate(t, dir, "template-hermes-fast", `
runtime: hermes
runtime_config:
  provision_timeout_seconds: 300
`)
	writeTemplate(t, dir, "template-hermes-slow", `
runtime: hermes
runtime_config:
  provision_timeout_seconds: 900
`)

	timeouts := loadRuntimeProvisionTimeouts(dir)

	// Max wins so the slowest template's threshold doesn't false-alarm
	// when both templates use the same runtime.
	if timeouts["hermes"] != 900 {
		t.Errorf("max-on-duplicate: got %d, want 900", timeouts["hermes"])
	}
}
// Every malformed or incomplete input must be silently ignored: no entry in
// the map, and — above all — no crash while scanning.
func TestLoadRuntimeProvisionTimeouts_SkipsBadInputs(t *testing.T) {
	dir := t.TempDir()

	// Missing runtime field — has timeout but no key to map under.
	writeTemplate(t, dir, "template-no-runtime", `
runtime_config:
  provision_timeout_seconds: 600
`)

	// Zero/negative timeout — same as no declaration.
	writeTemplate(t, dir, "template-zero", `
runtime: zero-runtime
runtime_config:
  provision_timeout_seconds: 0
`)

	// Malformed yaml — must not crash.
	writeTemplate(t, dir, "template-bad", "not: valid: yaml: at: all:")

	// Loose file at the top level (not a dir) — must be ignored.
	if err := os.WriteFile(filepath.Join(dir, "stray.txt"), []byte("ignore me"), 0o644); err != nil {
		t.Fatal(err)
	}

	timeouts := loadRuntimeProvisionTimeouts(dir)
	if len(timeouts) != 0 {
		t.Errorf("expected empty map for skip cases, got %v", timeouts)
	}
}
// A configsDir that doesn't exist yields an empty map rather than an error
// or a panic — workspace-server must start cleanly with no templates.
func TestLoadRuntimeProvisionTimeouts_MissingDirReturnsEmpty(t *testing.T) {
	timeouts := loadRuntimeProvisionTimeouts("/nonexistent/path/should/not/exist/12345")
	if len(timeouts) != 0 {
		t.Errorf("expected empty map on missing dir, got %v", timeouts)
	}
}
// The cache lazy-inits on first get() and never re-reads afterward: a file
// change after the first call is invisible (sync.Once contract), and an
// unknown runtime yields the zero value.
func TestRuntimeProvisionTimeoutsCache_LazyInitAndCached(t *testing.T) {
	dir := t.TempDir()
	writeTemplate(t, dir, "template-hermes", `
runtime: hermes
runtime_config:
  provision_timeout_seconds: 720
`)

	var cache runtimeProvisionTimeoutsCache

	// First call populates.
	if secs := cache.get(dir, "hermes"); secs != 720 {
		t.Errorf("first call: got %d, want 720", secs)
	}

	// Second call hits cache — even if the underlying file changed we
	// still see the original value (sync.Once contract).
	changed := "runtime: hermes\nruntime_config:\n  provision_timeout_seconds: 60\n"
	if err := os.WriteFile(filepath.Join(dir, "template-hermes", "config.yaml"),
		[]byte(changed), 0o644); err != nil {
		t.Fatal(err)
	}
	if secs := cache.get(dir, "hermes"); secs != 720 {
		t.Errorf("cached call: got %d, want 720 (cache must not re-read)", secs)
	}

	// Unknown runtime returns zero — caller's signal to fall through to
	// the canvas runtime profile default.
	if secs := cache.get(dir, "unknown"); secs != 0 {
		t.Errorf("unknown runtime: got %d, want 0", secs)
	}
}

View File

@ -61,6 +61,13 @@ type templateSummary struct {
RequiredEnv []string `json:"required_env,omitempty"`
Skills []string `json:"skills"`
SkillCount int `json:"skill_count"`
// ProvisionTimeoutSeconds lets a slow runtime declare its expected
// cold-boot duration in its template manifest. Canvas's
// ProvisioningTimeout banner respects this per-workspace via the
// `provision_timeout_ms` field in the workspace API response (#2054).
// 0 = template hasn't declared one, falls through to canvas's
// runtime-profile default.
ProvisionTimeoutSeconds int `json:"provision_timeout_seconds,omitempty"`
}
// resolveTemplateDir finds the template directory for a workspace on the host.
@ -106,9 +113,10 @@ func (h *TemplatesHandler) List(c *gin.Context) {
Model string `yaml:"model"`
Skills []string `yaml:"skills"`
RuntimeConfig struct {
Model string `yaml:"model"`
Models []modelSpec `yaml:"models"`
RequiredEnv []string `yaml:"required_env"`
Model string `yaml:"model"`
Models []modelSpec `yaml:"models"`
RequiredEnv []string `yaml:"required_env"`
ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"`
} `yaml:"runtime_config"`
}
if err := yaml.Unmarshal(data, &raw); err != nil {
@ -122,16 +130,17 @@ func (h *TemplatesHandler) List(c *gin.Context) {
}
templates = append(templates, templateSummary{
ID: entry.Name(),
Name: raw.Name,
Description: raw.Description,
Tier: raw.Tier,
Runtime: raw.Runtime,
Model: model,
Models: raw.RuntimeConfig.Models,
RequiredEnv: raw.RuntimeConfig.RequiredEnv,
Skills: raw.Skills,
SkillCount: len(raw.Skills),
ID: entry.Name(),
Name: raw.Name,
Description: raw.Description,
Tier: raw.Tier,
Runtime: raw.Runtime,
Model: model,
Models: raw.RuntimeConfig.Models,
RequiredEnv: raw.RuntimeConfig.RequiredEnv,
Skills: raw.Skills,
SkillCount: len(raw.Skills),
ProvisionTimeoutSeconds: raw.RuntimeConfig.ProvisionTimeoutSeconds,
})
}

View File

@ -40,6 +40,10 @@ type WorkspaceHandler struct {
// calls made by HibernateWorkspace without requiring a running Docker daemon.
// Always nil in production; the real provisioner path is used when nil.
stopFnOverride func(ctx context.Context, workspaceID string)
// provisionTimeouts caches per-runtime provision-timeout values from
// template manifests (#2054 phase 2). Lazy-init on first scan; see
// runtime_provision_timeouts.go for the loader contract.
provisionTimeouts runtimeProvisionTimeoutsCache
}
func NewWorkspaceHandler(b *events.Broadcaster, p *provisioner.Provisioner, platformURL, configsDir string) *WorkspaceHandler {
@ -343,6 +347,17 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
})
}
// addProvisionTimeoutMs decorates a workspace response map with the
// per-runtime provision-timeout override (#2054 phase 2) when the runtime's
// template manifest declares one. When no timeout is declared the map is
// left untouched and the canvas-side resolver falls through to its
// runtime-profile default.
func (h *WorkspaceHandler) addProvisionTimeoutMs(ws map[string]interface{}, runtime string) {
	secs := h.provisionTimeouts.get(h.configsDir, runtime)
	if secs <= 0 {
		return
	}
	// The manifest declares seconds; canvas consumes milliseconds.
	ws["provision_timeout_ms"] = secs * 1000
}
// scanWorkspaceRow is a helper to scan workspace+layout rows into a clean JSON map.
func scanWorkspaceRow(rows interface {
Scan(dest ...interface{}) error
@ -441,6 +456,13 @@ func (h *WorkspaceHandler) List(c *gin.Context) {
log.Printf("List scan error: %v", err)
continue
}
// #2054 phase 2: surface per-runtime provision-timeout for
// canvas's ProvisioningTimeout banner. Decorating per-row
// (vs map-once-and-reuse) keeps the helper self-contained;
// the cache hit is sub-microsecond.
if rt, _ := ws["runtime"].(string); rt != "" {
h.addProvisionTimeoutMs(ws, rt)
}
workspaces = append(workspaces, ws)
}
if err := rows.Err(); err != nil {
@ -508,5 +530,11 @@ func (h *WorkspaceHandler) Get(c *gin.Context) {
ws["last_outbound_at"] = nil
}
// #2054 phase 2: per-runtime provision-timeout for canvas's
// ProvisioningTimeout banner.
if rt, _ := ws["runtime"].(string); rt != "" {
h.addProvisionTimeoutMs(ws, rt)
}
c.JSON(http.StatusOK, ws)
}