feat(workspace-server): surface provision_timeout_ms in workspace API (#2054 phase 2)
Phase 2 of #2054 — workspace-server reads runtime-level provision_timeout_seconds from template config.yaml manifests and includes provision_timeout_ms in the workspace List/Get response. Phase 1 (canvas, #2092) already plumbs the field through socket → node-data → ProvisioningTimeout's resolver, so the moment a template declares the field the per-runtime banner threshold adjusts without a canvas release. Implementation: - templates.go: parse runtime_config.provision_timeout_seconds in the templateSummary marshaller. The /templates API now surfaces the field too — useful for ops dashboards and future tooling. - runtime_provision_timeouts.go (new): loadRuntimeProvisionTimeouts scans configsDir, parses every immediate subdir's config.yaml, returns runtime → seconds. Multiple templates with the same runtime: max wins (so a slow template's threshold doesn't get cut by a fast template's). Bad/empty inputs are silently skipped — workspace-server starts cleanly with no templates. - runtimeProvisionTimeoutsCache: sync.Once-backed lazy cache. First workspace API request after process start pays the read cost (~few KB across ~50 templates); every subsequent request is a map lookup. Cache lifetime = process lifetime; invalidates on workspace-server restart, which is the normal template-change cadence. - WorkspaceHandler gets a provisionTimeouts field (zero-value struct is valid — the cache lazy-inits on first get()). - addProvisionTimeoutMs decorates the response map with provision_timeout_ms (seconds × 1000) when the runtime has a declared timeout. Absent = no key in the response, canvas falls through to its runtime-profile default. Wired into both List (per-row decoration in the loop) and Get. 
Tests (5 new in runtime_provision_timeouts_test.go): - happy path: hermes declares 720, claude-code doesn't, only hermes appears in the map - max-on-duplicate: same runtime in two templates → max wins - skip-bad-inputs: missing runtime, zero timeout, malformed yaml, loose top-level files all silently ignored - missing-dir: returns empty map, no crash - cache: lazy-init on first get; subsequent gets hit cache even after underlying file changes (sync.Once contract); unknown runtime returns zero Phase 3 (separate template-repo PR): template-hermes config.yaml declares provision_timeout_seconds: 720 under runtime_config. canvas RUNTIME_PROFILES.hermes becomes redundant + removable. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f4cbb50ddf
commit
27396d992c
@ -0,0 +1,83 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// runtimeProvisionTimeoutsCache caches the per-runtime provision-timeout
// values declared in template config.yaml manifests (#2054 phase 2).
// Lazy-init so the first workspace API request after process start pays
// the read cost (a few KB of yaml across ~50 templates) and every
// subsequent one is a map lookup.
//
// Cache lifetime = process lifetime. Templates only change on container
// rebuild + workspace-server restart, which already invalidates the
// in-memory state. A future template-hot-reload feature would need to
// refresh this cache; today there is no such hook.
//
// The zero value is valid: the map is populated on the first call to
// get, guarded by once.
type runtimeProvisionTimeoutsCache struct {
	once sync.Once
	m    map[string]int // runtime → seconds
}
|
||||
|
||||
func (c *runtimeProvisionTimeoutsCache) get(configsDir string, runtime string) int {
|
||||
c.once.Do(func() {
|
||||
c.m = loadRuntimeProvisionTimeouts(configsDir)
|
||||
})
|
||||
return c.m[runtime]
|
||||
}
|
||||
|
||||
// loadRuntimeProvisionTimeouts walks `configsDir`, parses every immediate
|
||||
// subdirectory's `config.yaml`, and returns a map of runtime → seconds
|
||||
// for templates that declared `runtime_config.provision_timeout_seconds`.
|
||||
//
|
||||
// Templates without the field aren't represented (lookup returns zero,
|
||||
// which the caller treats as "fall through to canvas runtime profile").
|
||||
//
|
||||
// Multiple templates with the same runtime: take the MAX timeout — a
|
||||
// slow template's threshold should win over a fast template's so users
|
||||
// of either template see a true-positive timeout signal rather than a
|
||||
// false alarm. Same-runtime divergence is rare in practice (typically
|
||||
// one canonical template-{runtime} per runtime) but the rule is the
|
||||
// safer default.
|
||||
func loadRuntimeProvisionTimeouts(configsDir string) map[string]int {
|
||||
out := map[string]int{}
|
||||
entries, err := os.ReadDir(configsDir)
|
||||
if err != nil {
|
||||
// Logged but not fatal — workspace-server starts cleanly with
|
||||
// no templates (dev / fresh-clone). The result is an empty map
|
||||
// so every runtime falls through to canvas's profile default.
|
||||
log.Printf("loadRuntimeProvisionTimeouts: read configsDir %s: %v", configsDir, err)
|
||||
return out
|
||||
}
|
||||
for _, e := range entries {
|
||||
if !e.IsDir() {
|
||||
continue
|
||||
}
|
||||
data, err := os.ReadFile(filepath.Join(configsDir, e.Name(), "config.yaml"))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
var raw struct {
|
||||
Runtime string `yaml:"runtime"`
|
||||
RuntimeConfig struct {
|
||||
ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"`
|
||||
} `yaml:"runtime_config"`
|
||||
}
|
||||
if err := yaml.Unmarshal(data, &raw); err != nil {
|
||||
continue
|
||||
}
|
||||
secs := raw.RuntimeConfig.ProvisionTimeoutSeconds
|
||||
if secs <= 0 || raw.Runtime == "" {
|
||||
continue
|
||||
}
|
||||
if existing, ok := out[raw.Runtime]; !ok || secs > existing {
|
||||
out[raw.Runtime] = secs
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
@ -0,0 +1,128 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// writeTemplate is a tiny test fixture: drop a config.yaml under
|
||||
// tmp/<dir>/config.yaml with the given content. Mirrors the real
|
||||
// configsDir layout (one subdir per template, each with its own
|
||||
// config.yaml).
|
||||
func writeTemplate(t *testing.T, dir, name, content string) {
|
||||
t.Helper()
|
||||
p := filepath.Join(dir, name)
|
||||
if err := os.MkdirAll(p, 0o755); err != nil {
|
||||
t.Fatalf("mkdir %s: %v", p, err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(p, "config.yaml"), []byte(content), 0o644); err != nil {
|
||||
t.Fatalf("write config.yaml: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadRuntimeProvisionTimeouts_HappyPath(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
writeTemplate(t, dir, "template-hermes", `
|
||||
name: Hermes
|
||||
runtime: hermes
|
||||
runtime_config:
|
||||
provision_timeout_seconds: 720
|
||||
`)
|
||||
writeTemplate(t, dir, "template-claude-code", `
|
||||
name: Claude
|
||||
runtime: claude-code
|
||||
runtime_config:
|
||||
model: anthropic:claude-opus
|
||||
`)
|
||||
got := loadRuntimeProvisionTimeouts(dir)
|
||||
if got["hermes"] != 720 {
|
||||
t.Errorf("hermes: got %d, want 720", got["hermes"])
|
||||
}
|
||||
// claude-code didn't declare a timeout — must not appear in the map
|
||||
// (zero-value lookup is the no-override signal).
|
||||
if _, ok := got["claude-code"]; ok {
|
||||
t.Errorf("claude-code: present without declaration: %d", got["claude-code"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadRuntimeProvisionTimeouts_MaxOnDuplicateRuntime(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
writeTemplate(t, dir, "template-hermes-fast", `
|
||||
runtime: hermes
|
||||
runtime_config:
|
||||
provision_timeout_seconds: 300
|
||||
`)
|
||||
writeTemplate(t, dir, "template-hermes-slow", `
|
||||
runtime: hermes
|
||||
runtime_config:
|
||||
provision_timeout_seconds: 900
|
||||
`)
|
||||
got := loadRuntimeProvisionTimeouts(dir)
|
||||
// Max wins so the slowest template's threshold doesn't false-alarm
|
||||
// when both templates use the same runtime.
|
||||
if got["hermes"] != 900 {
|
||||
t.Errorf("max-on-duplicate: got %d, want 900", got["hermes"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadRuntimeProvisionTimeouts_SkipsBadInputs(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
// Missing runtime field — has timeout but no key to map under.
|
||||
writeTemplate(t, dir, "template-no-runtime", `
|
||||
runtime_config:
|
||||
provision_timeout_seconds: 600
|
||||
`)
|
||||
// Zero/negative timeout — same as no declaration.
|
||||
writeTemplate(t, dir, "template-zero", `
|
||||
runtime: zero-runtime
|
||||
runtime_config:
|
||||
provision_timeout_seconds: 0
|
||||
`)
|
||||
// Malformed yaml — must not crash.
|
||||
writeTemplate(t, dir, "template-bad", "not: valid: yaml: at: all:")
|
||||
// Loose file at the top level (not a dir) — must be ignored.
|
||||
if err := os.WriteFile(filepath.Join(dir, "stray.txt"), []byte("ignore me"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
got := loadRuntimeProvisionTimeouts(dir)
|
||||
if len(got) != 0 {
|
||||
t.Errorf("expected empty map for skip cases, got %v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadRuntimeProvisionTimeouts_MissingDirReturnsEmpty(t *testing.T) {
|
||||
got := loadRuntimeProvisionTimeouts("/nonexistent/path/should/not/exist/12345")
|
||||
if len(got) != 0 {
|
||||
t.Errorf("expected empty map on missing dir, got %v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeProvisionTimeoutsCache_LazyInitAndCached(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
writeTemplate(t, dir, "template-hermes", `
|
||||
runtime: hermes
|
||||
runtime_config:
|
||||
provision_timeout_seconds: 720
|
||||
`)
|
||||
c := runtimeProvisionTimeoutsCache{}
|
||||
|
||||
// First call populates.
|
||||
if got := c.get(dir, "hermes"); got != 720 {
|
||||
t.Errorf("first call: got %d, want 720", got)
|
||||
}
|
||||
// Second call hits cache — even if the underlying file changed we
|
||||
// still see the original value (sync.Once contract).
|
||||
if err := os.WriteFile(filepath.Join(dir, "template-hermes", "config.yaml"),
|
||||
[]byte("runtime: hermes\nruntime_config:\n provision_timeout_seconds: 60\n"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if got := c.get(dir, "hermes"); got != 720 {
|
||||
t.Errorf("cached call: got %d, want 720 (cache must not re-read)", got)
|
||||
}
|
||||
// Unknown runtime returns zero — caller's signal to fall through to
|
||||
// the canvas runtime profile default.
|
||||
if got := c.get(dir, "unknown"); got != 0 {
|
||||
t.Errorf("unknown runtime: got %d, want 0", got)
|
||||
}
|
||||
}
|
||||
@ -61,6 +61,13 @@ type templateSummary struct {
|
||||
RequiredEnv []string `json:"required_env,omitempty"`
|
||||
Skills []string `json:"skills"`
|
||||
SkillCount int `json:"skill_count"`
|
||||
// ProvisionTimeoutSeconds lets a slow runtime declare its expected
|
||||
// cold-boot duration in its template manifest. Canvas's
|
||||
// ProvisioningTimeout banner respects this per-workspace via the
|
||||
// `provision_timeout_ms` field in the workspace API response (#2054).
|
||||
// 0 = template hasn't declared one, falls through to canvas's
|
||||
// runtime-profile default.
|
||||
ProvisionTimeoutSeconds int `json:"provision_timeout_seconds,omitempty"`
|
||||
}
|
||||
|
||||
// resolveTemplateDir finds the template directory for a workspace on the host.
|
||||
@ -106,9 +113,10 @@ func (h *TemplatesHandler) List(c *gin.Context) {
|
||||
Model string `yaml:"model"`
|
||||
Skills []string `yaml:"skills"`
|
||||
RuntimeConfig struct {
|
||||
Model string `yaml:"model"`
|
||||
Models []modelSpec `yaml:"models"`
|
||||
RequiredEnv []string `yaml:"required_env"`
|
||||
Model string `yaml:"model"`
|
||||
Models []modelSpec `yaml:"models"`
|
||||
RequiredEnv []string `yaml:"required_env"`
|
||||
ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"`
|
||||
} `yaml:"runtime_config"`
|
||||
}
|
||||
if err := yaml.Unmarshal(data, &raw); err != nil {
|
||||
@ -122,16 +130,17 @@ func (h *TemplatesHandler) List(c *gin.Context) {
|
||||
}
|
||||
|
||||
templates = append(templates, templateSummary{
|
||||
ID: entry.Name(),
|
||||
Name: raw.Name,
|
||||
Description: raw.Description,
|
||||
Tier: raw.Tier,
|
||||
Runtime: raw.Runtime,
|
||||
Model: model,
|
||||
Models: raw.RuntimeConfig.Models,
|
||||
RequiredEnv: raw.RuntimeConfig.RequiredEnv,
|
||||
Skills: raw.Skills,
|
||||
SkillCount: len(raw.Skills),
|
||||
ID: entry.Name(),
|
||||
Name: raw.Name,
|
||||
Description: raw.Description,
|
||||
Tier: raw.Tier,
|
||||
Runtime: raw.Runtime,
|
||||
Model: model,
|
||||
Models: raw.RuntimeConfig.Models,
|
||||
RequiredEnv: raw.RuntimeConfig.RequiredEnv,
|
||||
Skills: raw.Skills,
|
||||
SkillCount: len(raw.Skills),
|
||||
ProvisionTimeoutSeconds: raw.RuntimeConfig.ProvisionTimeoutSeconds,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@ -40,6 +40,10 @@ type WorkspaceHandler struct {
|
||||
// calls made by HibernateWorkspace without requiring a running Docker daemon.
|
||||
// Always nil in production; the real provisioner path is used when nil.
|
||||
stopFnOverride func(ctx context.Context, workspaceID string)
|
||||
// provisionTimeouts caches per-runtime provision-timeout values from
|
||||
// template manifests (#2054 phase 2). Lazy-init on first scan; see
|
||||
// runtime_provision_timeouts.go for the loader contract.
|
||||
provisionTimeouts runtimeProvisionTimeoutsCache
|
||||
}
|
||||
|
||||
func NewWorkspaceHandler(b *events.Broadcaster, p *provisioner.Provisioner, platformURL, configsDir string) *WorkspaceHandler {
|
||||
@ -343,6 +347,17 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
|
||||
})
|
||||
}
|
||||
|
||||
// addProvisionTimeoutMs decorates a workspace response map with the
|
||||
// per-runtime provision-timeout override (#2054 phase 2) when one is
|
||||
// declared in the runtime's template manifest. No-op when the runtime
|
||||
// has no declared timeout — the canvas-side resolver falls through to
|
||||
// its runtime-profile default.
|
||||
func (h *WorkspaceHandler) addProvisionTimeoutMs(ws map[string]interface{}, runtime string) {
|
||||
if secs := h.provisionTimeouts.get(h.configsDir, runtime); secs > 0 {
|
||||
ws["provision_timeout_ms"] = secs * 1000
|
||||
}
|
||||
}
|
||||
|
||||
// scanWorkspaceRow is a helper to scan workspace+layout rows into a clean JSON map.
|
||||
func scanWorkspaceRow(rows interface {
|
||||
Scan(dest ...interface{}) error
|
||||
@ -441,6 +456,13 @@ func (h *WorkspaceHandler) List(c *gin.Context) {
|
||||
log.Printf("List scan error: %v", err)
|
||||
continue
|
||||
}
|
||||
// #2054 phase 2: surface per-runtime provision-timeout for
|
||||
// canvas's ProvisioningTimeout banner. Decorating per-row
|
||||
// (vs map-once-and-reuse) keeps the helper self-contained;
|
||||
// the cache hit is sub-microsecond.
|
||||
if rt, _ := ws["runtime"].(string); rt != "" {
|
||||
h.addProvisionTimeoutMs(ws, rt)
|
||||
}
|
||||
workspaces = append(workspaces, ws)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
@ -508,5 +530,11 @@ func (h *WorkspaceHandler) Get(c *gin.Context) {
|
||||
ws["last_outbound_at"] = nil
|
||||
}
|
||||
|
||||
// #2054 phase 2: per-runtime provision-timeout for canvas's
|
||||
// ProvisioningTimeout banner.
|
||||
if rt, _ := ws["runtime"].(string); rt != "" {
|
||||
h.addProvisionTimeoutMs(ws, rt)
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, ws)
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user