diff --git a/workspace-server/internal/handlers/runtime_provision_timeouts.go b/workspace-server/internal/handlers/runtime_provision_timeouts.go
new file mode 100644
index 00000000..d27b1975
--- /dev/null
+++ b/workspace-server/internal/handlers/runtime_provision_timeouts.go
@@ -0,0 +1,83 @@
+package handlers
+
+import (
+	"log"
+	"os"
+	"path/filepath"
+	"sync"
+
+	"gopkg.in/yaml.v3"
+)
+
+// runtimeProvisionTimeouts caches the per-runtime provision-timeout values
+// declared in template config.yaml manifests (#2054 phase 2). Lazy-init so
+// the first workspace API request after process start pays the read cost
+// (a few KB of yaml across ~50 templates) and every subsequent one is a
+// map lookup.
+//
+// Cache lifetime = process lifetime. Templates only change on container
+// rebuild + workspace-server restart, which already invalidates the
+// in-memory state. A future template-hot-reload feature would need to
+// refresh this cache; today there is no such hook.
+type runtimeProvisionTimeoutsCache struct {
+	once sync.Once
+	m    map[string]int // runtime → seconds
+}
+
+func (c *runtimeProvisionTimeoutsCache) get(configsDir string, runtime string) int {
+	c.once.Do(func() {
+		c.m = loadRuntimeProvisionTimeouts(configsDir)
+	})
+	return c.m[runtime]
+}
+
+// loadRuntimeProvisionTimeouts walks `configsDir`, parses every immediate
+// subdirectory's `config.yaml`, and returns a map of runtime → seconds
+// for templates that declared `runtime_config.provision_timeout_seconds`.
+//
+// Templates without the field aren't represented (lookup returns zero,
+// which the caller treats as "fall through to canvas runtime profile").
+//
+// Multiple templates with the same runtime: take the MAX timeout — a
+// slow template's threshold should win over a fast template's so users
+// of either template see a true-positive timeout signal rather than a
+// false alarm. Same-runtime divergence is rare in practice (typically
+// one canonical template-{runtime} per runtime) but the rule is the
+// safer default.
+func loadRuntimeProvisionTimeouts(configsDir string) map[string]int {
+	out := map[string]int{}
+	entries, err := os.ReadDir(configsDir)
+	if err != nil {
+		// Logged but not fatal — workspace-server starts cleanly with
+		// no templates (dev / fresh-clone). The result is an empty map
+		// so every runtime falls through to canvas's profile default.
+		log.Printf("loadRuntimeProvisionTimeouts: read configsDir %s: %v", configsDir, err)
+		return out
+	}
+	for _, e := range entries {
+		if !e.IsDir() {
+			continue
+		}
+		data, err := os.ReadFile(filepath.Join(configsDir, e.Name(), "config.yaml"))
+		if err != nil {
+			continue
+		}
+		var raw struct {
+			Runtime       string `yaml:"runtime"`
+			RuntimeConfig struct {
+				ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"`
+			} `yaml:"runtime_config"`
+		}
+		if err := yaml.Unmarshal(data, &raw); err != nil {
+			continue
+		}
+		secs := raw.RuntimeConfig.ProvisionTimeoutSeconds
+		if secs <= 0 || raw.Runtime == "" {
+			continue
+		}
+		if existing, ok := out[raw.Runtime]; !ok || secs > existing {
+			out[raw.Runtime] = secs
+		}
+	}
+	return out
+}
diff --git a/workspace-server/internal/handlers/runtime_provision_timeouts_test.go b/workspace-server/internal/handlers/runtime_provision_timeouts_test.go
new file mode 100644
index 00000000..a6cd3b52
--- /dev/null
+++ b/workspace-server/internal/handlers/runtime_provision_timeouts_test.go
@@ -0,0 +1,128 @@
+package handlers
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// writeTemplate is a tiny test fixture: drop a config.yaml under
+// tmp/<name>/config.yaml with the given content. Mirrors the real
+// configsDir layout (one subdir per template, each with its own
+// config.yaml).
+func writeTemplate(t *testing.T, dir, name, content string) {
+	t.Helper()
+	p := filepath.Join(dir, name)
+	if err := os.MkdirAll(p, 0o755); err != nil {
+		t.Fatalf("mkdir %s: %v", p, err)
+	}
+	if err := os.WriteFile(filepath.Join(p, "config.yaml"), []byte(content), 0o644); err != nil {
+		t.Fatalf("write config.yaml: %v", err)
+	}
+}
+
+func TestLoadRuntimeProvisionTimeouts_HappyPath(t *testing.T) {
+	dir := t.TempDir()
+	writeTemplate(t, dir, "template-hermes", `
+name: Hermes
+runtime: hermes
+runtime_config:
+  provision_timeout_seconds: 720
+`)
+	writeTemplate(t, dir, "template-claude-code", `
+name: Claude
+runtime: claude-code
+runtime_config:
+  model: anthropic:claude-opus
+`)
+	got := loadRuntimeProvisionTimeouts(dir)
+	if got["hermes"] != 720 {
+		t.Errorf("hermes: got %d, want 720", got["hermes"])
+	}
+	// claude-code didn't declare a timeout — must not appear in the map
+	// (zero-value lookup is the no-override signal).
+	if _, ok := got["claude-code"]; ok {
+		t.Errorf("claude-code: present without declaration: %d", got["claude-code"])
+	}
+}
+
+func TestLoadRuntimeProvisionTimeouts_MaxOnDuplicateRuntime(t *testing.T) {
+	dir := t.TempDir()
+	writeTemplate(t, dir, "template-hermes-fast", `
+runtime: hermes
+runtime_config:
+  provision_timeout_seconds: 300
+`)
+	writeTemplate(t, dir, "template-hermes-slow", `
+runtime: hermes
+runtime_config:
+  provision_timeout_seconds: 900
+`)
+	got := loadRuntimeProvisionTimeouts(dir)
+	// Max wins so the slowest template's threshold doesn't false-alarm
+	// when both templates use the same runtime.
+	if got["hermes"] != 900 {
+		t.Errorf("max-on-duplicate: got %d, want 900", got["hermes"])
+	}
+}
+
+func TestLoadRuntimeProvisionTimeouts_SkipsBadInputs(t *testing.T) {
+	dir := t.TempDir()
+	// Missing runtime field — has timeout but no key to map under.
+	writeTemplate(t, dir, "template-no-runtime", `
+runtime_config:
+  provision_timeout_seconds: 600
+`)
+	// Zero/negative timeout — same as no declaration.
+	writeTemplate(t, dir, "template-zero", `
+runtime: zero-runtime
+runtime_config:
+  provision_timeout_seconds: 0
+`)
+	// Malformed yaml — must not crash.
+	writeTemplate(t, dir, "template-bad", "not: valid: yaml: at: all:")
+	// Loose file at the top level (not a dir) — must be ignored.
+	if err := os.WriteFile(filepath.Join(dir, "stray.txt"), []byte("ignore me"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	got := loadRuntimeProvisionTimeouts(dir)
+	if len(got) != 0 {
+		t.Errorf("expected empty map for skip cases, got %v", got)
+	}
+}
+
+func TestLoadRuntimeProvisionTimeouts_MissingDirReturnsEmpty(t *testing.T) {
+	got := loadRuntimeProvisionTimeouts("/nonexistent/path/should/not/exist/12345")
+	if len(got) != 0 {
+		t.Errorf("expected empty map on missing dir, got %v", got)
+	}
+}
+
+func TestRuntimeProvisionTimeoutsCache_LazyInitAndCached(t *testing.T) {
+	dir := t.TempDir()
+	writeTemplate(t, dir, "template-hermes", `
+runtime: hermes
+runtime_config:
+  provision_timeout_seconds: 720
+`)
+	c := runtimeProvisionTimeoutsCache{}
+
+	// First call populates.
+	if got := c.get(dir, "hermes"); got != 720 {
+		t.Errorf("first call: got %d, want 720", got)
+	}
+	// Second call hits cache — even if the underlying file changed we
+	// still see the original value (sync.Once contract).
+	if err := os.WriteFile(filepath.Join(dir, "template-hermes", "config.yaml"),
+		[]byte("runtime: hermes\nruntime_config:\n  provision_timeout_seconds: 60\n"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+	if got := c.get(dir, "hermes"); got != 720 {
+		t.Errorf("cached call: got %d, want 720 (cache must not re-read)", got)
+	}
+	// Unknown runtime returns zero — caller's signal to fall through to
+	// the canvas runtime profile default.
+	if got := c.get(dir, "unknown"); got != 0 {
+		t.Errorf("unknown runtime: got %d, want 0", got)
+	}
+}
diff --git a/workspace-server/internal/handlers/templates.go b/workspace-server/internal/handlers/templates.go
index 38735830..6c5f42f3 100644
--- a/workspace-server/internal/handlers/templates.go
+++ b/workspace-server/internal/handlers/templates.go
@@ -61,6 +61,13 @@ type templateSummary struct {
 	RequiredEnv []string `json:"required_env,omitempty"`
 	Skills      []string `json:"skills"`
 	SkillCount  int      `json:"skill_count"`
+	// ProvisionTimeoutSeconds lets a slow runtime declare its expected
+	// cold-boot duration in its template manifest. Canvas's
+	// ProvisioningTimeout banner respects this per-workspace via the
+	// `provision_timeout_ms` field in the workspace API response (#2054).
+	// 0 = template hasn't declared one, falls through to canvas's
+	// runtime-profile default.
+	ProvisionTimeoutSeconds int `json:"provision_timeout_seconds,omitempty"`
 }
 
 // resolveTemplateDir finds the template directory for a workspace on the host.
@@ -106,9 +113,10 @@ func (h *TemplatesHandler) List(c *gin.Context) {
 			Model         string   `yaml:"model"`
 			Skills        []string `yaml:"skills"`
 			RuntimeConfig struct {
-				Model       string      `yaml:"model"`
-				Models      []modelSpec `yaml:"models"`
-				RequiredEnv []string    `yaml:"required_env"`
+				Model                   string      `yaml:"model"`
+				Models                  []modelSpec `yaml:"models"`
+				RequiredEnv             []string    `yaml:"required_env"`
+				ProvisionTimeoutSeconds int         `yaml:"provision_timeout_seconds"`
 			} `yaml:"runtime_config"`
 		}
 		if err := yaml.Unmarshal(data, &raw); err != nil {
@@ -122,16 +130,17 @@ func (h *TemplatesHandler) List(c *gin.Context) {
 		}
 
 		templates = append(templates, templateSummary{
-			ID:          entry.Name(),
-			Name:        raw.Name,
-			Description: raw.Description,
-			Tier:        raw.Tier,
-			Runtime:     raw.Runtime,
-			Model:       model,
-			Models:      raw.RuntimeConfig.Models,
-			RequiredEnv: raw.RuntimeConfig.RequiredEnv,
-			Skills:      raw.Skills,
-			SkillCount:  len(raw.Skills),
+			ID:                      entry.Name(),
+			Name:                    raw.Name,
+			Description:             raw.Description,
+			Tier:                    raw.Tier,
+			Runtime:                 raw.Runtime,
+			Model:                   model,
+			Models:                  raw.RuntimeConfig.Models,
+			RequiredEnv:             raw.RuntimeConfig.RequiredEnv,
+			Skills:                  raw.Skills,
+			SkillCount:              len(raw.Skills),
+			ProvisionTimeoutSeconds: raw.RuntimeConfig.ProvisionTimeoutSeconds,
 		})
 	}
 
diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go
index 91ece238..43d6e877 100644
--- a/workspace-server/internal/handlers/workspace.go
+++ b/workspace-server/internal/handlers/workspace.go
@@ -40,6 +40,10 @@ type WorkspaceHandler struct {
 	// calls made by HibernateWorkspace without requiring a running Docker daemon.
 	// Always nil in production; the real provisioner path is used when nil.
 	stopFnOverride func(ctx context.Context, workspaceID string)
+	// provisionTimeouts caches per-runtime provision-timeout values from
+	// template manifests (#2054 phase 2). Lazy-init on first scan; see
+	// runtime_provision_timeouts.go for the loader contract.
+	provisionTimeouts runtimeProvisionTimeoutsCache
 }
 
 func NewWorkspaceHandler(b *events.Broadcaster, p *provisioner.Provisioner, platformURL, configsDir string) *WorkspaceHandler {
@@ -343,6 +347,17 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
 	})
 }
 
+// addProvisionTimeoutMs decorates a workspace response map with the
+// per-runtime provision-timeout override (#2054 phase 2) when one is
+// declared in the runtime's template manifest. No-op when the runtime
+// has no declared timeout — the canvas-side resolver falls through to
+// its runtime-profile default.
+func (h *WorkspaceHandler) addProvisionTimeoutMs(ws map[string]interface{}, runtime string) {
+	if secs := h.provisionTimeouts.get(h.configsDir, runtime); secs > 0 {
+		ws["provision_timeout_ms"] = secs * 1000
+	}
+}
+
 // scanWorkspaceRow is a helper to scan workspace+layout rows into a clean JSON map.
 func scanWorkspaceRow(rows interface {
 	Scan(dest ...interface{}) error
@@ -441,6 +456,13 @@ func (h *WorkspaceHandler) List(c *gin.Context) {
 			log.Printf("List scan error: %v", err)
 			continue
 		}
+		// #2054 phase 2: surface per-runtime provision-timeout for
+		// canvas's ProvisioningTimeout banner. Decorating per-row
+		// (vs map-once-and-reuse) keeps the helper self-contained;
+		// the cache hit is sub-microsecond.
+		if rt, _ := ws["runtime"].(string); rt != "" {
+			h.addProvisionTimeoutMs(ws, rt)
+		}
 		workspaces = append(workspaces, ws)
 	}
 	if err := rows.Err(); err != nil {
@@ -508,5 +530,11 @@ func (h *WorkspaceHandler) Get(c *gin.Context) {
 		ws["last_outbound_at"] = nil
 	}
 
+	// #2054 phase 2: per-runtime provision-timeout for canvas's
+	// ProvisioningTimeout banner.
+	if rt, _ := ws["runtime"].(string); rt != "" {
+		h.addProvisionTimeoutMs(ws, rt)
+	}
+
 	c.JSON(http.StatusOK, ws)
 }