feat(workspace-server): surface provision_timeout_ms in workspace API (#2054 phase 2)

Phase 2 of #2054 — workspace-server reads runtime-level
provision_timeout_seconds from template config.yaml manifests and
includes provision_timeout_ms in the workspace List/Get response.
Phase 1 (canvas, #2092) already plumbs the field through socket →
node-data → ProvisioningTimeout's resolver, so the moment a
template declares the field the per-runtime banner threshold
adjusts without a canvas release.

Implementation:

- templates.go: parse runtime_config.provision_timeout_seconds in
  the templateSummary marshaller. The /templates API now surfaces
  the field too — useful for ops dashboards and future tooling.
- runtime_provision_timeouts.go (new): loadRuntimeProvisionTimeouts
  scans configsDir, parses every immediate subdir's config.yaml,
  returns runtime → seconds. Multiple templates with the same
  runtime: max wins (so a slow template's threshold doesn't get
  cut by a fast template's). Bad/empty inputs are silently
  skipped — workspace-server starts cleanly with no templates.
- runtimeProvisionTimeoutsCache: sync.Once-backed lazy cache.
  First workspace API request after process start pays the read
  cost (~few KB across ~50 templates); every subsequent request is
  a map lookup. Cache lifetime = process lifetime; invalidates on
  workspace-server restart, which is the normal template-change
  cadence.
- WorkspaceHandler gets a provisionTimeouts field (zero-value struct
  is valid — the cache lazy-inits on first get()).
- addProvisionTimeoutMs decorates the response map with
  provision_timeout_ms (seconds × 1000) when the runtime has a
  declared timeout. Absent = no key in the response, canvas falls
  through to its runtime-profile default. Wired into both List
  (per-row decoration in the loop) and Get.

Tests (5 new in runtime_provision_timeouts_test.go):
- happy path: hermes declares 720, claude-code doesn't, only
  hermes appears in the map
- max-on-duplicate: same runtime in two templates → max wins
- skip-bad-inputs: missing runtime, zero timeout, malformed yaml,
  loose top-level files all silently ignored
- missing-dir: returns empty map, no crash
- cache: lazy-init on first get; subsequent gets hit cache even
  after underlying file changes (sync.Once contract); unknown
  runtime returns zero

Phase 3 (separate template-repo PR): template-hermes config.yaml
declares provision_timeout_seconds: 720 under runtime_config.
canvas RUNTIME_PROFILES.hermes becomes redundant + removable.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
rabbitblood 2026-04-26 06:37:45 -07:00
parent f4cbb50ddf
commit 27396d992c
4 changed files with 261 additions and 13 deletions

View File

@ -0,0 +1,83 @@
package handlers
import (
"log"
"os"
"path/filepath"
"sync"
"gopkg.in/yaml.v3"
)
// runtimeProvisionTimeoutsCache memoizes the per-runtime provision-timeout
// values declared in template config.yaml manifests (#2054 phase 2). The
// zero value is ready to use: the map is built lazily, so the first
// workspace API request after process start pays the one-time read cost
// (a few KB of yaml across ~50 templates) and every later call is a plain
// map lookup.
//
// Cache lifetime = process lifetime. Templates only change on container
// rebuild + workspace-server restart, which already discards this process's
// in-memory state. A future template-hot-reload feature would need a
// refresh hook for this cache; none exists today.
type runtimeProvisionTimeoutsCache struct {
	loadOnce sync.Once
	timeouts map[string]int // runtime → seconds
}

// get returns the declared provision timeout in seconds for runtime, or 0
// when no template declares one. The first call scans configsDir; any
// configsDir passed on later calls is ignored (sync.Once contract).
func (c *runtimeProvisionTimeoutsCache) get(configsDir string, runtime string) int {
	c.loadOnce.Do(func() {
		c.timeouts = loadRuntimeProvisionTimeouts(configsDir)
	})
	return c.timeouts[runtime]
}
// loadRuntimeProvisionTimeouts reads every immediate subdirectory of
// configsDir, parses its config.yaml, and returns a runtime → seconds map
// for templates that declared runtime_config.provision_timeout_seconds.
//
// Templates without the field are simply absent from the map (a lookup
// yields zero, which callers treat as "fall through to the canvas runtime
// profile").
//
// When several templates share a runtime the MAX timeout wins — a slow
// template's threshold should beat a fast template's so users of either
// template see a true-positive timeout signal rather than a false alarm.
// Same-runtime divergence is rare in practice (typically one canonical
// template-{runtime} per runtime) but max is the safer default.
func loadRuntimeProvisionTimeouts(configsDir string) map[string]int {
	timeouts := map[string]int{}

	dirEntries, err := os.ReadDir(configsDir)
	if err != nil {
		// Logged but not fatal — workspace-server starts cleanly with
		// no templates (dev / fresh-clone). The result is an empty map
		// so every runtime falls through to canvas's profile default.
		log.Printf("loadRuntimeProvisionTimeouts: read configsDir %s: %v", configsDir, err)
		return timeouts
	}

	for _, entry := range dirEntries {
		if !entry.IsDir() {
			// Loose top-level files are not templates.
			continue
		}
		manifest, err := os.ReadFile(filepath.Join(configsDir, entry.Name(), "config.yaml"))
		if err != nil {
			// Subdir without a readable config.yaml — skip silently.
			continue
		}
		var cfg struct {
			Runtime       string `yaml:"runtime"`
			RuntimeConfig struct {
				ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"`
			} `yaml:"runtime_config"`
		}
		if err := yaml.Unmarshal(manifest, &cfg); err != nil {
			// Malformed yaml is skipped, never fatal.
			continue
		}
		secs := cfg.RuntimeConfig.ProvisionTimeoutSeconds
		if cfg.Runtime == "" || secs <= 0 {
			// No runtime to key on, or no positive declaration.
			continue
		}
		// Max-on-duplicate; the missing-key zero value makes the first
		// write for a runtime a plain comparison too.
		if secs > timeouts[cfg.Runtime] {
			timeouts[cfg.Runtime] = secs
		}
	}
	return timeouts
}

View File

@ -0,0 +1,128 @@
package handlers
import (
"os"
"path/filepath"
"testing"
)
// writeTemplate is a tiny test fixture: drop a config.yaml under
// tmp/<dir>/config.yaml with the given content. Mirrors the real
// configsDir layout (one subdir per template, each with its own
// config.yaml).
func writeTemplate(t *testing.T, dir, name, content string) {
t.Helper()
p := filepath.Join(dir, name)
if err := os.MkdirAll(p, 0o755); err != nil {
t.Fatalf("mkdir %s: %v", p, err)
}
if err := os.WriteFile(filepath.Join(p, "config.yaml"), []byte(content), 0o644); err != nil {
t.Fatalf("write config.yaml: %v", err)
}
}
// Happy path: a template that declares provision_timeout_seconds lands in
// the map under its runtime; a template without the field leaves no entry.
func TestLoadRuntimeProvisionTimeouts_HappyPath(t *testing.T) {
	dir := t.TempDir()
	writeTemplate(t, dir, "template-hermes", `
name: Hermes
runtime: hermes
runtime_config:
  provision_timeout_seconds: 720
`)
	writeTemplate(t, dir, "template-claude-code", `
name: Claude
runtime: claude-code
runtime_config:
  model: anthropic:claude-opus
`)

	timeouts := loadRuntimeProvisionTimeouts(dir)

	if timeouts["hermes"] != 720 {
		t.Errorf("hermes: got %d, want 720", timeouts["hermes"])
	}
	// claude-code didn't declare a timeout — must not appear in the map
	// (zero-value lookup is the no-override signal).
	if secs, ok := timeouts["claude-code"]; ok {
		t.Errorf("claude-code: present without declaration: %d", secs)
	}
}
// Two templates declaring the same runtime: the larger timeout must win so
// the slow template's threshold isn't cut down by the fast template's.
func TestLoadRuntimeProvisionTimeouts_MaxOnDuplicateRuntime(t *testing.T) {
	dir := t.TempDir()
	writeTemplate(t, dir, "template-hermes-fast", `
runtime: hermes
runtime_config:
  provision_timeout_seconds: 300
`)
	writeTemplate(t, dir, "template-hermes-slow", `
runtime: hermes
runtime_config:
  provision_timeout_seconds: 900
`)

	timeouts := loadRuntimeProvisionTimeouts(dir)

	// Max wins so the slowest template's threshold doesn't false-alarm
	// when both templates use the same runtime.
	if timeouts["hermes"] != 900 {
		t.Errorf("max-on-duplicate: got %d, want 900", timeouts["hermes"])
	}
}
// Every malformed or incomplete input must be silently ignored: no entry in
// the map, and — above all — no crash while scanning.
func TestLoadRuntimeProvisionTimeouts_SkipsBadInputs(t *testing.T) {
	dir := t.TempDir()

	// Missing runtime field — has timeout but no key to map under.
	writeTemplate(t, dir, "template-no-runtime", `
runtime_config:
  provision_timeout_seconds: 600
`)

	// Zero/negative timeout — same as no declaration.
	writeTemplate(t, dir, "template-zero", `
runtime: zero-runtime
runtime_config:
  provision_timeout_seconds: 0
`)

	// Malformed yaml — must not crash.
	writeTemplate(t, dir, "template-bad", "not: valid: yaml: at: all:")

	// Loose file at the top level (not a dir) — must be ignored.
	if err := os.WriteFile(filepath.Join(dir, "stray.txt"), []byte("ignore me"), 0o644); err != nil {
		t.Fatal(err)
	}

	timeouts := loadRuntimeProvisionTimeouts(dir)
	if len(timeouts) != 0 {
		t.Errorf("expected empty map for skip cases, got %v", timeouts)
	}
}
// A configsDir that doesn't exist yields an empty map rather than an error
// or a panic — workspace-server must start cleanly with no templates.
func TestLoadRuntimeProvisionTimeouts_MissingDirReturnsEmpty(t *testing.T) {
	timeouts := loadRuntimeProvisionTimeouts("/nonexistent/path/should/not/exist/12345")
	if len(timeouts) != 0 {
		t.Errorf("expected empty map on missing dir, got %v", timeouts)
	}
}
// The cache lazy-inits on first get() and never re-reads afterward: a file
// change after the first call is invisible (sync.Once contract), and an
// unknown runtime yields the zero value.
func TestRuntimeProvisionTimeoutsCache_LazyInitAndCached(t *testing.T) {
	dir := t.TempDir()
	writeTemplate(t, dir, "template-hermes", `
runtime: hermes
runtime_config:
  provision_timeout_seconds: 720
`)

	var cache runtimeProvisionTimeoutsCache

	// First call populates.
	if secs := cache.get(dir, "hermes"); secs != 720 {
		t.Errorf("first call: got %d, want 720", secs)
	}

	// Second call hits cache — even if the underlying file changed we
	// still see the original value (sync.Once contract).
	changed := "runtime: hermes\nruntime_config:\n  provision_timeout_seconds: 60\n"
	if err := os.WriteFile(filepath.Join(dir, "template-hermes", "config.yaml"),
		[]byte(changed), 0o644); err != nil {
		t.Fatal(err)
	}
	if secs := cache.get(dir, "hermes"); secs != 720 {
		t.Errorf("cached call: got %d, want 720 (cache must not re-read)", secs)
	}

	// Unknown runtime returns zero — caller's signal to fall through to
	// the canvas runtime profile default.
	if secs := cache.get(dir, "unknown"); secs != 0 {
		t.Errorf("unknown runtime: got %d, want 0", secs)
	}
}

View File

@ -61,6 +61,13 @@ type templateSummary struct {
RequiredEnv []string `json:"required_env,omitempty"`
Skills []string `json:"skills"`
SkillCount int `json:"skill_count"`
// ProvisionTimeoutSeconds lets a slow runtime declare its expected
// cold-boot duration in its template manifest. Canvas's
// ProvisioningTimeout banner respects this per-workspace via the
// `provision_timeout_ms` field in the workspace API response (#2054).
// 0 = template hasn't declared one, falls through to canvas's
// runtime-profile default.
ProvisionTimeoutSeconds int `json:"provision_timeout_seconds,omitempty"`
}
// resolveTemplateDir finds the template directory for a workspace on the host.
@ -106,9 +113,10 @@ func (h *TemplatesHandler) List(c *gin.Context) {
Model string `yaml:"model"`
Skills []string `yaml:"skills"`
RuntimeConfig struct {
Model string `yaml:"model"`
Models []modelSpec `yaml:"models"`
RequiredEnv []string `yaml:"required_env"`
Model string `yaml:"model"`
Models []modelSpec `yaml:"models"`
RequiredEnv []string `yaml:"required_env"`
ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"`
} `yaml:"runtime_config"`
}
if err := yaml.Unmarshal(data, &raw); err != nil {
@ -122,16 +130,17 @@ func (h *TemplatesHandler) List(c *gin.Context) {
}
templates = append(templates, templateSummary{
ID: entry.Name(),
Name: raw.Name,
Description: raw.Description,
Tier: raw.Tier,
Runtime: raw.Runtime,
Model: model,
Models: raw.RuntimeConfig.Models,
RequiredEnv: raw.RuntimeConfig.RequiredEnv,
Skills: raw.Skills,
SkillCount: len(raw.Skills),
ID: entry.Name(),
Name: raw.Name,
Description: raw.Description,
Tier: raw.Tier,
Runtime: raw.Runtime,
Model: model,
Models: raw.RuntimeConfig.Models,
RequiredEnv: raw.RuntimeConfig.RequiredEnv,
Skills: raw.Skills,
SkillCount: len(raw.Skills),
ProvisionTimeoutSeconds: raw.RuntimeConfig.ProvisionTimeoutSeconds,
})
}

View File

@ -40,6 +40,10 @@ type WorkspaceHandler struct {
// calls made by HibernateWorkspace without requiring a running Docker daemon.
// Always nil in production; the real provisioner path is used when nil.
stopFnOverride func(ctx context.Context, workspaceID string)
// provisionTimeouts caches per-runtime provision-timeout values from
// template manifests (#2054 phase 2). Lazy-init on first scan; see
// runtime_provision_timeouts.go for the loader contract.
provisionTimeouts runtimeProvisionTimeoutsCache
}
func NewWorkspaceHandler(b *events.Broadcaster, p *provisioner.Provisioner, platformURL, configsDir string) *WorkspaceHandler {
@ -343,6 +347,17 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
})
}
// addProvisionTimeoutMs decorates a workspace response map with the
// per-runtime provision-timeout override (#2054 phase 2) when the runtime's
// template manifest declares one. When no timeout is declared the map is
// left untouched and the canvas-side resolver falls through to its
// runtime-profile default.
func (h *WorkspaceHandler) addProvisionTimeoutMs(ws map[string]interface{}, runtime string) {
	secs := h.provisionTimeouts.get(h.configsDir, runtime)
	if secs <= 0 {
		return
	}
	// The manifest declares seconds; canvas consumes milliseconds.
	ws["provision_timeout_ms"] = secs * 1000
}
// scanWorkspaceRow is a helper to scan workspace+layout rows into a clean JSON map.
func scanWorkspaceRow(rows interface {
Scan(dest ...interface{}) error
@ -441,6 +456,13 @@ func (h *WorkspaceHandler) List(c *gin.Context) {
log.Printf("List scan error: %v", err)
continue
}
// #2054 phase 2: surface per-runtime provision-timeout for
// canvas's ProvisioningTimeout banner. Decorating per-row
// (vs map-once-and-reuse) keeps the helper self-contained;
// the cache hit is sub-microsecond.
if rt, _ := ws["runtime"].(string); rt != "" {
h.addProvisionTimeoutMs(ws, rt)
}
workspaces = append(workspaces, ws)
}
if err := rows.Err(); err != nil {
@ -508,5 +530,11 @@ func (h *WorkspaceHandler) Get(c *gin.Context) {
ws["last_outbound_at"] = nil
}
// #2054 phase 2: per-runtime provision-timeout for canvas's
// ProvisioningTimeout banner.
if rt, _ := ws["runtime"].(string); rt != "" {
h.addProvisionTimeoutMs(ws, rt)
}
c.JSON(http.StatusOK, ws)
}