Merge pull request #2139 from Molecule-AI/feat/idle-timeout-adapter-override
feat(runtime): adapter-declared idle_timeout_override — primitive #2 of 6
This commit is contained in:
commit
bc5b0f614f
@ -254,6 +254,12 @@ func main() {
|
||||
|
||||
// Cron Scheduler — fires A2A messages to workspaces on user-defined schedules
|
||||
cronSched := scheduler.New(wh, broadcaster)
|
||||
// Wire the native-scheduler skip — when an adapter's heartbeat
|
||||
// declares provides_native_scheduler=true, the platform's polling
|
||||
// loop drops that workspace's schedules to avoid double-fire (the
|
||||
// SDK runs them itself). See project memory
|
||||
// `project_runtime_native_pluggable.md` and capability primitive #3.
|
||||
cronSched.SetNativeSchedulerCheck(handlers.ProvidesNativeScheduler)
|
||||
go supervised.RunWithRecover(ctx, "scheduler", cronSched.Start)
|
||||
|
||||
// Hibernation Monitor — auto-pauses idle workspaces that have
|
||||
|
||||
@ -588,7 +588,18 @@ func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, workspaceID, agentUR
|
||||
if concrete, ok := h.broadcaster.(*events.Broadcaster); ok {
|
||||
b = concrete
|
||||
}
|
||||
forwardCtx, idleCancel := applyIdleTimeout(forwardCtx, b, workspaceID, idleTimeoutDuration)
|
||||
// Per-workspace idle-timeout override (capability primitive #2 —
|
||||
// see workspace/adapter_base.py:idle_timeout_override). The
|
||||
// adapter declares a longer/shorter window than the platform
|
||||
// default in its heartbeat; the heartbeat handler stashes it in
|
||||
// runtimeOverrides; we honor it here. Falls through to the global
|
||||
// default (env A2A_IDLE_TIMEOUT_SECONDS, default 5min) when no
|
||||
// override is registered for this workspace.
|
||||
idle := idleTimeoutDuration
|
||||
if perWorkspace, ok := runtimeOverrides.IdleTimeout(workspaceID); ok {
|
||||
idle = perWorkspace
|
||||
}
|
||||
forwardCtx, idleCancel := applyIdleTimeout(forwardCtx, b, workspaceID, idle)
|
||||
cancel := func() {
|
||||
idleCancel()
|
||||
if ceilingCancel != nil {
|
||||
|
||||
@ -11,6 +11,7 @@ import (
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
@ -461,6 +462,28 @@ func (h *RegistryHandler) Heartbeat(c *gin.Context) {
|
||||
"uptime_seconds": payload.UptimeSeconds,
|
||||
})
|
||||
|
||||
// Refresh per-workspace runtime overrides from the heartbeat's
|
||||
// runtime_metadata block (introduced for the native+pluggable
|
||||
// runtime principle — see project memory). Both idle_timeout_seconds
|
||||
// and capability flags are stored. Each consumer (a2a_proxy.dispatchA2A
|
||||
// for idle timeout, scheduler.tick for native scheduler, etc.) reads
|
||||
// what it needs from the cache. nil RuntimeMetadata or absent field
|
||||
// clears the corresponding override so the dispatch path uses the
|
||||
// global default.
|
||||
if payload.RuntimeMetadata != nil && payload.RuntimeMetadata.IdleTimeoutSeconds != nil {
|
||||
runtimeOverrides.SetIdleTimeout(
|
||||
payload.WorkspaceID,
|
||||
time.Duration(*payload.RuntimeMetadata.IdleTimeoutSeconds)*time.Second,
|
||||
)
|
||||
} else {
|
||||
runtimeOverrides.SetIdleTimeout(payload.WorkspaceID, 0) // clear
|
||||
}
|
||||
if payload.RuntimeMetadata != nil {
|
||||
runtimeOverrides.SetCapabilities(payload.WorkspaceID, payload.RuntimeMetadata.Capabilities)
|
||||
} else {
|
||||
runtimeOverrides.SetCapabilities(payload.WorkspaceID, nil) // clear
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"status": "ok"})
|
||||
}
|
||||
|
||||
|
||||
164
workspace-server/internal/handlers/runtime_overrides.go
Normal file
164
workspace-server/internal/handlers/runtime_overrides.go
Normal file
@ -0,0 +1,164 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// runtimeOverrides is the in-memory cache of per-workspace, adapter-
|
||||
// declared overrides for cross-cutting capabilities. Populated by the
|
||||
// heartbeat handler from HeartbeatPayload.RuntimeMetadata; consumed by
|
||||
// dispatch paths (a2a_proxy.dispatchA2A reads IdleTimeout) before
|
||||
// applying their own platform-default behavior.
|
||||
//
|
||||
// Why an in-memory cache and not a DB column:
|
||||
// - Heartbeats arrive every ~30s, so a fresh override propagates
|
||||
// within a heartbeat cycle of any change in adapter declarations.
|
||||
// - On platform restart the cache resets to empty until each
|
||||
// workspace's next heartbeat repopulates it. Worst-case window =
|
||||
// 30s of platform-default behavior. Acceptable; nothing about
|
||||
// these overrides is correctness-critical (they tune timeouts +
|
||||
// enable native ownership of fallback features, not state).
|
||||
// - DB-roundtripping every dispatch would add latency to a hot
|
||||
// path (a2a_proxy is on every agent → agent call). The cache is
|
||||
// a sync.Map — atomic ptr load per dispatch, zero lock contention
|
||||
// under steady load.
|
||||
//
|
||||
// Stale entries: a workspace that goes offline never sends another
// heartbeat, but the cache entry persists until the platform restarts.
// Acceptable because the readers only consult the cache on behalf of
// that same workspace: dispatchA2A when actually dispatching to it, and
// the scheduler (through ProvidesNativeScheduler) when one of its own
// schedules comes due. A stale entry for an offline workspace just
// means "use the override that was active when it was last alive"
// (correct behavior; the workspace will get the same timeouts when it
// comes back).
|
||||
//
|
||||
// See workspace/adapter_base.py:idle_timeout_override and project
|
||||
// memory `project_runtime_native_pluggable.md`.
|
||||
var runtimeOverrides runtimeOverrideCache
|
||||
|
||||
// runtimeOverrideEntry is the per-workspace value held in
// runtimeOverrideCache. Entries are replaced wholesale on every update
// and the capabilities map is never mutated after it has been stored,
// so a value obtained from loadEntry can be read without further
// synchronization.
type runtimeOverrideEntry struct {
	// idleTimeout is the adapter-declared idle window. 0 means "no
	// override; use global default".
	idleTimeout time.Duration

	// capabilities maps wire-name keys from RuntimeCapabilities.to_dict()
	// — "heartbeat", "scheduler", "session", "status_mgmt", "retry",
	// "activity_decoration", "channel_dispatch" — to whether the adapter
	// claims native ownership. Consumers (e.g. scheduler.tick) read this
	// to decide whether to fire their platform-fallback behavior for this
	// workspace.
	//
	// nil map means "no capability declarations received yet" → consumers
	// fall back to the platform default (today's behavior).
	capabilities map[string]bool
}

// runtimeOverrideCache stores one runtimeOverrideEntry per workspace.
// The zero value is ready to use. Reads (IdleTimeout, HasCapability) go
// straight to the sync.Map and take no lock; writes are serialized by mu.
type runtimeOverrideCache struct {
	// mu serializes the load-modify-store cycle of the Set* methods and
	// Reset. Without it, a concurrent SetIdleTimeout and SetCapabilities
	// for the same workspace could each load the old entry and the later
	// Store would silently discard the other field's update.
	mu sync.Mutex

	// m maps workspaceID (string) to runtimeOverrideEntry.
	m sync.Map
}

// loadEntry returns the entry for workspaceID (or a zero-value entry).
// Internal helper for readers and the partial-update Set methods;
// sync.Map's Load doesn't support "read or default" in one shot.
func (c *runtimeOverrideCache) loadEntry(workspaceID string) runtimeOverrideEntry {
	v, ok := c.m.Load(workspaceID)
	if !ok {
		return runtimeOverrideEntry{}
	}
	// A value of any other type cannot have been stored by this file;
	// treat it like a missing entry rather than panicking.
	e, _ := v.(runtimeOverrideEntry)
	return e
}

// deleteIfEmpty writes e back as the workspace's entry, except that an
// entry with neither an idle timeout nor any capabilities is dropped
// from the cache instead. Keeps the cache from retaining empty husks
// forever after a runtime stops sending overrides.
//
// Callers that derived e from a previously loaded entry should hold
// c.mu so the load and this write form one atomic update.
func (c *runtimeOverrideCache) deleteIfEmpty(workspaceID string, e runtimeOverrideEntry) {
	if e.idleTimeout <= 0 && len(e.capabilities) == 0 {
		c.m.Delete(workspaceID)
		return
	}
	c.m.Store(workspaceID, e)
}

// mutateEntry applies mutate to the workspace's current entry (or a
// zero-value entry) and writes the result back, all under c.mu so that
// concurrent partial updates to different fields cannot overwrite each
// other. mutate must not call back into the cache's Set* methods or
// Reset; doing so would deadlock on c.mu.
func (c *runtimeOverrideCache) mutateEntry(workspaceID string, mutate func(e *runtimeOverrideEntry)) {
	c.mu.Lock()
	defer c.mu.Unlock()

	e := c.loadEntry(workspaceID)
	mutate(&e)
	c.deleteIfEmpty(workspaceID, e)
}

// SetIdleTimeout records the per-workspace idle-timeout override sent
// in the most recent heartbeat. d <= 0 clears the override (falling
// back to the global default), so a runtime that previously declared
// an override and then dropped it cleanly returns to platform behavior.
// Capability flags on the same workspace are preserved. An empty
// workspaceID is ignored.
func (c *runtimeOverrideCache) SetIdleTimeout(workspaceID string, d time.Duration) {
	if workspaceID == "" {
		return
	}
	if d < 0 {
		d = 0 // normalize so the stored value is either 0 or positive
	}
	c.mutateEntry(workspaceID, func(e *runtimeOverrideEntry) {
		e.idleTimeout = d
	})
}

// IdleTimeout returns the per-workspace override and ok=true when one
// is in effect; ok=false means dispatchA2A should fall back to the
// global idleTimeoutDuration.
func (c *runtimeOverrideCache) IdleTimeout(workspaceID string) (time.Duration, bool) {
	if d := c.loadEntry(workspaceID).idleTimeout; d > 0 {
		return d, true
	}
	return 0, false
}

// SetCapabilities records the per-workspace capability declaration map
// (e.g. {"scheduler": true, "heartbeat": false, ...}) sent in the most
// recent heartbeat. Replaces any prior map; pass nil (or an empty map)
// to clear. IdleTimeout on the same workspace is preserved. An empty
// workspaceID is ignored.
//
// The wire-name keys (heartbeat, scheduler, session, status_mgmt, retry,
// activity_decoration, channel_dispatch) match RuntimeCapabilities.to_dict()
// in workspace/adapter_base.py — keep in sync there.
func (c *runtimeOverrideCache) SetCapabilities(workspaceID string, caps map[string]bool) {
	if workspaceID == "" {
		return
	}

	// Defensive copy, built before taking the lock: the caller may reuse
	// or mutate its map after the call, while the cache holds long-lived
	// refs. dup stays nil when there is nothing to record.
	var dup map[string]bool
	if len(caps) > 0 {
		dup = make(map[string]bool, len(caps))
		for k, v := range caps {
			dup[k] = v
		}
	}

	c.mutateEntry(workspaceID, func(e *runtimeOverrideEntry) {
		e.capabilities = dup
	})
}

// HasCapability returns true when the workspace's adapter has declared
// native ownership of the named capability. False when no entry exists,
// no capability map was ever sent, or the named capability is absent /
// false. Consumers (scheduler.tick, etc.) call this before firing their
// platform-fallback behavior.
func (c *runtimeOverrideCache) HasCapability(workspaceID, name string) bool {
	if workspaceID == "" || name == "" {
		return false
	}
	// Indexing a nil map yields false, which covers the "never declared"
	// case without a separate check.
	return c.loadEntry(workspaceID).capabilities[name]
}

// Reset clears the entire cache. Test-only; production code never
// needs this since heartbeats refresh entries naturally.
func (c *runtimeOverrideCache) Reset() {
	// Hold mu so an in-flight Set* cannot store a value derived from an
	// entry this call is removing.
	c.mu.Lock()
	defer c.mu.Unlock()

	c.m.Range(func(k, _ any) bool {
		c.m.Delete(k)
		return true
	})
}
|
||||
|
||||
// ProvidesNativeScheduler is the public adapter exposed to the scheduler
|
||||
// package — wraps HasCapability("scheduler") with the package-level
|
||||
// runtimeOverrides instance. Wired into Scheduler.New() at router setup
|
||||
// to keep scheduler/scheduler.go free of a handlers/ import.
|
||||
func ProvidesNativeScheduler(workspaceID string) bool {
|
||||
return runtimeOverrides.HasCapability(workspaceID, "scheduler")
|
||||
}
|
||||
241
workspace-server/internal/handlers/runtime_overrides_test.go
Normal file
241
workspace-server/internal/handlers/runtime_overrides_test.go
Normal file
@ -0,0 +1,241 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestRuntimeOverrideCache_SetAndGet(t *testing.T) {
|
||||
c := &runtimeOverrideCache{}
|
||||
|
||||
if _, ok := c.IdleTimeout("ws-a"); ok {
|
||||
t.Fatal("empty cache should not return any override")
|
||||
}
|
||||
|
||||
c.SetIdleTimeout("ws-a", 10*time.Minute)
|
||||
got, ok := c.IdleTimeout("ws-a")
|
||||
if !ok || got != 10*time.Minute {
|
||||
t.Fatalf("expected 10m override; got=%v ok=%v", got, ok)
|
||||
}
|
||||
|
||||
// Sibling workspace unaffected — pin against the trap where a
|
||||
// shared map without proper keying would leak overrides across
|
||||
// workspaces (a hard-to-debug "claude-code's longer timeout
|
||||
// somehow applied to langgraph too").
|
||||
if _, ok := c.IdleTimeout("ws-b"); ok {
|
||||
t.Fatal("override for ws-a leaked to ws-b")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeOverrideCache_ZeroOrNegativeClears(t *testing.T) {
|
||||
// Adapter dropping the override (returning None / 0 from
|
||||
// idle_timeout_override) must restore platform-default behavior.
|
||||
// If the cache held the previous value indefinitely, an adapter
|
||||
// downgrade would silently keep the longer timeout active.
|
||||
c := &runtimeOverrideCache{}
|
||||
c.SetIdleTimeout("ws-a", 10*time.Minute)
|
||||
if _, ok := c.IdleTimeout("ws-a"); !ok {
|
||||
t.Fatal("setup: override should be set")
|
||||
}
|
||||
|
||||
c.SetIdleTimeout("ws-a", 0)
|
||||
if _, ok := c.IdleTimeout("ws-a"); ok {
|
||||
t.Fatal("zero duration should clear override")
|
||||
}
|
||||
|
||||
c.SetIdleTimeout("ws-a", 5*time.Minute)
|
||||
c.SetIdleTimeout("ws-a", -1*time.Second)
|
||||
if _, ok := c.IdleTimeout("ws-a"); ok {
|
||||
t.Fatal("negative duration should clear override")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeOverrideCache_EmptyWorkspaceIDIgnored(t *testing.T) {
|
||||
// Defensive: a misrouted heartbeat with empty workspace_id
|
||||
// should NOT pollute the cache with a "" key. workspaceID == ""
|
||||
// is also the value dispatchA2A passes when the workspace is
|
||||
// indeterminate, and that path must not surface a stored value.
|
||||
c := &runtimeOverrideCache{}
|
||||
c.SetIdleTimeout("", 10*time.Minute)
|
||||
if _, ok := c.IdleTimeout(""); ok {
|
||||
t.Fatal("empty workspace_id must not store overrides")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeOverrideCache_SetReplaces(t *testing.T) {
|
||||
// A heartbeat with a new override value replaces, doesn't append.
|
||||
c := &runtimeOverrideCache{}
|
||||
c.SetIdleTimeout("ws-a", 10*time.Minute)
|
||||
c.SetIdleTimeout("ws-a", 20*time.Minute)
|
||||
got, _ := c.IdleTimeout("ws-a")
|
||||
if got != 20*time.Minute {
|
||||
t.Fatalf("expected 20m after replacement; got %v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeOverrideCache_Reset(t *testing.T) {
|
||||
c := &runtimeOverrideCache{}
|
||||
c.SetIdleTimeout("ws-a", 10*time.Minute)
|
||||
c.SetIdleTimeout("ws-b", 20*time.Minute)
|
||||
c.Reset()
|
||||
if _, ok := c.IdleTimeout("ws-a"); ok {
|
||||
t.Fatal("reset should clear ws-a")
|
||||
}
|
||||
if _, ok := c.IdleTimeout("ws-b"); ok {
|
||||
t.Fatal("reset should clear ws-b")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeOverrideCache_SetCapabilitiesAndHas(t *testing.T) {
|
||||
c := &runtimeOverrideCache{}
|
||||
if c.HasCapability("ws-a", "scheduler") {
|
||||
t.Fatal("empty cache must not return any capability")
|
||||
}
|
||||
|
||||
c.SetCapabilities("ws-a", map[string]bool{"scheduler": true, "session": false})
|
||||
if !c.HasCapability("ws-a", "scheduler") {
|
||||
t.Fatal("scheduler capability not stored")
|
||||
}
|
||||
if c.HasCapability("ws-a", "session") {
|
||||
t.Fatal("session=false should report as absent (False)")
|
||||
}
|
||||
if c.HasCapability("ws-a", "heartbeat") {
|
||||
t.Fatal("missing key must report as absent")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeOverrideCache_CapabilitiesIsolatedPerWorkspace(t *testing.T) {
|
||||
// Critical: ws-a declaring native scheduler must NOT make ws-b
|
||||
// also skip its schedules. The cache's per-key isolation is the
|
||||
// only thing standing between "claude-code adapter declares this"
|
||||
// and "every workspace silently inherits the declaration."
|
||||
c := &runtimeOverrideCache{}
|
||||
c.SetCapabilities("ws-a", map[string]bool{"scheduler": true})
|
||||
if c.HasCapability("ws-b", "scheduler") {
|
||||
t.Fatal("ws-a's scheduler capability leaked to ws-b")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeOverrideCache_NilOrEmptyCapabilitiesClears(t *testing.T) {
|
||||
// An adapter that previously declared native scheduler then
|
||||
// dropped the flag (e.g. SDK update) must restore platform
|
||||
// fallback. nil + empty-map both mean "clear".
|
||||
c := &runtimeOverrideCache{}
|
||||
c.SetCapabilities("ws-a", map[string]bool{"scheduler": true})
|
||||
if !c.HasCapability("ws-a", "scheduler") {
|
||||
t.Fatal("setup: scheduler should be set")
|
||||
}
|
||||
|
||||
c.SetCapabilities("ws-a", nil)
|
||||
if c.HasCapability("ws-a", "scheduler") {
|
||||
t.Fatal("nil should clear capabilities")
|
||||
}
|
||||
|
||||
c.SetCapabilities("ws-a", map[string]bool{"scheduler": true})
|
||||
c.SetCapabilities("ws-a", map[string]bool{})
|
||||
if c.HasCapability("ws-a", "scheduler") {
|
||||
t.Fatal("empty map should clear capabilities")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeOverrideCache_SetCapabilitiesIsDefensiveCopy(t *testing.T) {
|
||||
// The caller's map MUST NOT alias the cached one. A future careless
|
||||
// caller mutating the original map after the call should not
|
||||
// retroactively change cached capability declarations.
|
||||
c := &runtimeOverrideCache{}
|
||||
original := map[string]bool{"scheduler": true}
|
||||
c.SetCapabilities("ws-a", original)
|
||||
original["scheduler"] = false
|
||||
if !c.HasCapability("ws-a", "scheduler") {
|
||||
t.Fatal("cache aliased the caller's map; capability flipped via outside mutation")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeOverrideCache_SetIdleTimeoutPreservesCapabilities(t *testing.T) {
|
||||
// The two heartbeat fields are independent — updating one must
|
||||
// not stomp the other. Pre-fix, each Set replaced the entire
|
||||
// entry, which meant the second-arriving Set in the heartbeat
|
||||
// handler effectively erased the first.
|
||||
c := &runtimeOverrideCache{}
|
||||
c.SetCapabilities("ws-a", map[string]bool{"scheduler": true})
|
||||
c.SetIdleTimeout("ws-a", 600*time.Second)
|
||||
|
||||
if !c.HasCapability("ws-a", "scheduler") {
|
||||
t.Fatal("SetIdleTimeout erased prior capabilities")
|
||||
}
|
||||
got, ok := c.IdleTimeout("ws-a")
|
||||
if !ok || got != 600*time.Second {
|
||||
t.Fatalf("idle timeout lost; got=%v ok=%v", got, ok)
|
||||
}
|
||||
|
||||
// And the inverse: SetCapabilities must not erase IdleTimeout.
|
||||
c.SetCapabilities("ws-a", map[string]bool{"scheduler": true, "session": true})
|
||||
if got, ok := c.IdleTimeout("ws-a"); !ok || got != 600*time.Second {
|
||||
t.Fatal("SetCapabilities erased prior idle timeout")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeOverrideCache_EmptyEntryDeleted(t *testing.T) {
|
||||
// When both fields are cleared, the entry should drop out of the
|
||||
// cache entirely so a stale workspace doesn't accumulate empty
|
||||
// husks indefinitely.
|
||||
c := &runtimeOverrideCache{}
|
||||
c.SetIdleTimeout("ws-a", 60*time.Second)
|
||||
c.SetCapabilities("ws-a", map[string]bool{"scheduler": true})
|
||||
|
||||
c.SetIdleTimeout("ws-a", 0)
|
||||
c.SetCapabilities("ws-a", nil)
|
||||
|
||||
if _, ok := c.m.Load("ws-a"); ok {
|
||||
t.Fatal("entry should be deleted when both fields cleared")
|
||||
}
|
||||
}
|
||||
|
||||
func TestProvidesNativeScheduler_PackageLevel(t *testing.T) {
|
||||
// The package-level function the scheduler imports — pin that it
|
||||
// reads the same singleton the heartbeat handler writes to.
|
||||
runtimeOverrides.Reset()
|
||||
defer runtimeOverrides.Reset()
|
||||
|
||||
if ProvidesNativeScheduler("ws-a") {
|
||||
t.Fatal("empty cache should not declare native scheduler")
|
||||
}
|
||||
runtimeOverrides.SetCapabilities("ws-a", map[string]bool{"scheduler": true})
|
||||
if !ProvidesNativeScheduler("ws-a") {
|
||||
t.Fatal("ProvidesNativeScheduler did not see the declaration")
|
||||
}
|
||||
if ProvidesNativeScheduler("") {
|
||||
t.Fatal("empty workspace ID should never declare native scheduler")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeOverrideCache_ConcurrentSafe(t *testing.T) {
|
||||
// dispatchA2A reads the cache on every request; heartbeat handlers
|
||||
// write on every 30s. Different workspaces will be hot in different
|
||||
// goroutines. The sync.Map underlying the cache promises this; the
|
||||
// test pins it so a future "let me just use a regular map with a
|
||||
// mutex" change can't silently regress under load.
|
||||
c := &runtimeOverrideCache{}
|
||||
var wg sync.WaitGroup
|
||||
const N = 100
|
||||
|
||||
for i := 0; i < N; i++ {
|
||||
wg.Add(2)
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
c.SetIdleTimeout("ws", time.Duration(i+1)*time.Second)
|
||||
}(i)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
_, _ = c.IdleTimeout("ws")
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
// Final value must be SOME positive duration written by one of the
|
||||
// goroutines — not corrupted, not zero.
|
||||
got, ok := c.IdleTimeout("ws")
|
||||
if !ok || got <= 0 || got > time.Duration(N)*time.Second {
|
||||
t.Fatalf("expected a valid override after concurrent writes; got %v ok=%v", got, ok)
|
||||
}
|
||||
}
|
||||
@ -70,6 +70,40 @@ type HeartbeatPayload struct {
|
||||
// non-empty value is "wedged"; future values can extend this without
|
||||
// migration.
|
||||
RuntimeState string `json:"runtime_state"`
|
||||
|
||||
// RuntimeMetadata is the adapter-declared capability map + per-
|
||||
// capability override values. The Python runtime builds this from
|
||||
// BaseAdapter.capabilities() + per-hook methods (e.g.
|
||||
// idle_timeout_override()) — see workspace/heartbeat.py:
|
||||
// _runtime_metadata_payload. Optional: missing means "use platform
|
||||
// defaults for everything", matching pre-2026-04 behavior.
|
||||
//
|
||||
// Pointer (not value) so a missing JSON field is nil rather than a
|
||||
// zero-value RuntimeMetadata{} that would falsely claim "all caps =
|
||||
// false declared explicitly". Lets the platform distinguish "adapter
|
||||
// said no native ownership" from "old runtime version, didn't say".
|
||||
RuntimeMetadata *RuntimeMetadata `json:"runtime_metadata,omitempty"`
|
||||
}
|
||||
|
||||
// RuntimeMetadata is the optional "runtime_metadata" object of a
// heartbeat: what the adapter declares it owns natively, plus any
// per-capability override values. Both fields are tagged `omitempty`,
// and further fields can be added the same way without breaking older
// runtime versions that do not send them.
//
// See project memory `project_runtime_native_pluggable.md` for the
// principle and workspace/adapter_base.py:RuntimeCapabilities for the
// Python source of truth.
type RuntimeMetadata struct {
	// Capabilities maps capability name → "adapter owns it natively".
	// Keys (heartbeat, scheduler, session, status_mgmt, retry,
	// activity_decoration, channel_dispatch) match
	// RuntimeCapabilities.to_dict() in adapter_base.py — keep in sync.
	Capabilities map[string]bool `json:"capabilities,omitempty"`

	// IdleTimeoutSeconds, when set, overrides the per-dispatch silence
	// window for this workspace's A2A traffic. It is a pointer so that
	// an absent JSON field decodes to nil, meaning "no override; use
	// the global default". A zero or negative value ends up with the
	// same effect: the override cache's SetIdleTimeout clears the
	// override for any duration <= 0.
	IdleTimeoutSeconds *int `json:"idle_timeout_seconds,omitempty"`
}
|
||||
|
||||
type UpdateCardPayload struct {
|
||||
|
||||
52
workspace-server/internal/scheduler/native_scheduler_test.go
Normal file
52
workspace-server/internal/scheduler/native_scheduler_test.go
Normal file
@ -0,0 +1,52 @@
|
||||
package scheduler
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestSetNativeSchedulerCheck pins the wiring contract: New() leaves
|
||||
// providesNativeScheduler nil (= today's behavior, never skip);
|
||||
// SetNativeSchedulerCheck installs the override. The actual skip
|
||||
// behavior in tick() needs a DB and is exercised by the integration
|
||||
// tests in tests/e2e/.
|
||||
func TestSetNativeSchedulerCheck(t *testing.T) {
|
||||
s := New(nil, nil)
|
||||
if s.providesNativeScheduler != nil {
|
||||
t.Fatal("New() must leave providesNativeScheduler nil so untouched callers preserve today's behavior")
|
||||
}
|
||||
|
||||
called := false
|
||||
checker := NativeSchedulerCheck(func(workspaceID string) bool {
|
||||
called = true
|
||||
return workspaceID == "ws-native"
|
||||
})
|
||||
s.SetNativeSchedulerCheck(checker)
|
||||
if s.providesNativeScheduler == nil {
|
||||
t.Fatal("SetNativeSchedulerCheck did not install the function")
|
||||
}
|
||||
if !s.providesNativeScheduler("ws-native") {
|
||||
t.Fatal("installed checker not invoked / wrong return")
|
||||
}
|
||||
if !called {
|
||||
t.Fatal("installed checker not called")
|
||||
}
|
||||
if s.providesNativeScheduler("ws-other") {
|
||||
t.Fatal("checker should return false for non-native workspace")
|
||||
}
|
||||
}
|
||||
|
||||
// TestNativeSchedulerCheck_NilSafeInTick documents the contract used
|
||||
// by tick(): a nil providesNativeScheduler must mean "always fire" so
|
||||
// existing callers (test fixtures, prior to capability primitives)
|
||||
// preserve today's behavior unchanged. The conditional in tick reads
|
||||
// `s.providesNativeScheduler != nil && s.providesNativeScheduler(id)`
|
||||
// — neither branch can panic on a nil-checker scheduler.
|
||||
func TestNativeSchedulerCheck_NilSafeInTick(t *testing.T) {
|
||||
s := New(nil, nil)
|
||||
// We don't actually call tick() — that requires a live DB. We just
|
||||
// pin that the field is nil after New, which is the load-bearing
|
||||
// invariant tick() relies on.
|
||||
if s.providesNativeScheduler != nil {
|
||||
t.Fatal("nil-safety contract violated: providesNativeScheduler must be nil from New()")
|
||||
}
|
||||
}
|
||||
@ -70,6 +70,21 @@ type ChannelBroadcaster interface {
|
||||
FetchWorkspaceChannelContext(ctx context.Context, workspaceID string) string
|
||||
}
|
||||
|
||||
// NativeSchedulerCheck is a per-workspace predicate: it returns true
// when the workspace's adapter has declared
// `provides_native_scheduler=True` in its capabilities. For those
// workspaces the scheduler skips firing, because the SDK runs the
// schedule itself (Temporal, Durable Functions, etc.) and platform
// polling on top of that would fire it a second time.
//
// A checker is installed after construction through
// Scheduler.SetNativeSchedulerCheck (by production wiring or by tests).
// A nil checker is allowed and means "no workspace is native", which
// preserves the behavior from before this hook existed — the same
// default-false posture as BaseAdapter.capabilities() in
// workspace/adapter_base.py.
//
// See project memory `project_runtime_native_pluggable.md` and
// handlers.ProvidesNativeScheduler for the production wiring.
type NativeSchedulerCheck func(workspaceID string) bool
|
||||
|
||||
// Scheduler polls the workspace_schedules table and fires A2A messages
|
||||
// when a schedule's next_run_at has passed. Follows the same goroutine
|
||||
// pattern as registry.StartHealthSweep.
|
||||
@ -78,6 +93,11 @@ type Scheduler struct {
|
||||
broadcaster Broadcaster
|
||||
channels ChannelBroadcaster
|
||||
|
||||
// providesNativeScheduler, when non-nil and returning true, causes
|
||||
// tick() to skip firing for this workspace. nil = always-fire (the
|
||||
// pre-capability-primitive behavior). Constructor docs above.
|
||||
providesNativeScheduler NativeSchedulerCheck
|
||||
|
||||
// lastTickAt records the wall-clock time of the most recent tick
|
||||
// (whether it fired schedules or not). Read by Healthy() and the
|
||||
// /admin/scheduler/health endpoint to detect stuck-tick conditions.
|
||||
@ -102,6 +122,15 @@ func (s *Scheduler) SetChannels(ch ChannelBroadcaster) {
|
||||
s.channels = ch
|
||||
}
|
||||
|
||||
// SetNativeSchedulerCheck wires the per-workspace native-scheduler
|
||||
// override lookup. Wired by the router after the scheduler is
|
||||
// constructed (handlers package owns the cache). Pass nil to disable
|
||||
// the skip — every schedule fires regardless of adapter declaration,
|
||||
// matching pre-capability-primitive behavior.
|
||||
func (s *Scheduler) SetNativeSchedulerCheck(f NativeSchedulerCheck) {
|
||||
s.providesNativeScheduler = f
|
||||
}
|
||||
|
||||
// LastTickAt returns the wall-clock time of the most recently completed tick.
|
||||
// Returns a zero time.Time if the scheduler has never completed a tick.
|
||||
func (s *Scheduler) LastTickAt() time.Time {
|
||||
@ -231,6 +260,27 @@ func (s *Scheduler) tick(ctx context.Context) {
|
||||
log.Printf("Scheduler: scan error: %v", err)
|
||||
continue
|
||||
}
|
||||
// Skip workspaces whose adapter owns scheduling natively (e.g.
|
||||
// SDKs with built-in cron / Temporal-style workflows). Without
|
||||
// this skip, the platform's polling would fire the same
|
||||
// schedule twice — once natively in the SDK, once via this
|
||||
// loop. The skip drops only the FIRE; the schedule row stays
|
||||
// in the DB and the platform still records it, so observability
|
||||
// (next_run_at, last_run_at) is preserved per the principle.
|
||||
// Pre-fix this branch was unconditional; nil check preserves
|
||||
// behavior for callers that didn't wire the override.
|
||||
if s.providesNativeScheduler != nil && s.providesNativeScheduler(sched.WorkspaceID) {
|
||||
// Advance next_run_at so we don't tight-loop on the same
|
||||
// row every tick. A non-firing schedule is still scheduled.
|
||||
if nextTime, err := ComputeNextRun(sched.CronExpr, sched.Timezone, time.Now()); err == nil {
|
||||
if _, execErr := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspace_schedules SET next_run_at=$1, updated_at=now() WHERE id=$2`,
|
||||
nextTime, sched.ID); execErr != nil {
|
||||
log.Printf("Scheduler: native-skip next_run_at UPDATE failed for schedule %s: %v", sched.ID, execErr)
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
wg.Add(1)
|
||||
sem <- struct{}{}
|
||||
go func(s2 scheduleRow) {
|
||||
|
||||
@ -164,6 +164,29 @@ class BaseAdapter(ABC):
|
||||
project memory `project_runtime_native_pluggable.md`."""
|
||||
return RuntimeCapabilities()
|
||||
|
||||
def idle_timeout_override(self) -> int | None:
|
||||
"""Per-A2A-dispatch silence window override, in SECONDS.
|
||||
|
||||
Return None to use the platform default (env var
|
||||
A2A_IDLE_TIMEOUT_SECONDS, falling back to 5 minutes — see
|
||||
a2a_proxy.go:defaultIdleTimeoutDuration). Override when this
|
||||
runtime's SDK can legitimately go silent longer than the
|
||||
default before the dispatch should be considered wedged.
|
||||
|
||||
Why this is per-adapter, not just env: the env value is a
|
||||
cluster-wide knob set by ops. Different SDKs have different
|
||||
latency profiles — claude-code synthesis on Opus + tool use
|
||||
legitimately runs 8-10 min between broadcasts; hermes synth
|
||||
with custom providers can be even slower. Hardcoding 5min for
|
||||
everyone either cancels real work (claude-code synth) or
|
||||
leaves wedged runtimes (langgraph) hanging too long.
|
||||
|
||||
Platform reads this from the heartbeat payload and stashes
|
||||
it per-workspace; dispatchA2A consults it before applying the
|
||||
idle timer. None / unset / zero falls through to the global
|
||||
default — same behavior as before this hook landed."""
|
||||
return None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Plugin install hooks
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@ -43,6 +43,43 @@ def _runtime_state_payload() -> dict:
|
||||
"sample_error": wedge_reason(),
|
||||
}
|
||||
|
||||
|
||||
def _runtime_metadata_payload() -> dict:
|
||||
"""Build the {runtime_metadata} portion of the heartbeat body —
|
||||
adapter-declared capabilities + per-capability override values
|
||||
(idle timeout, etc.). The platform reads this to route capabilities
|
||||
to the right owner: native (adapter) vs fallback (platform).
|
||||
|
||||
Returns an empty dict if the adapter can't be loaded or introspected.
|
||||
Heartbeat must NEVER fail because of capability discovery — observability
|
||||
is more important than capability accuracy. The platform falls through
|
||||
to its own defaults when fields are missing.
|
||||
|
||||
See project memory `project_runtime_native_pluggable.md` and
|
||||
workspace/adapter_base.py:RuntimeCapabilities.
|
||||
"""
|
||||
try:
|
||||
from adapters import get_adapter
|
||||
# ADAPTER_MODULE wins over the runtime arg in get_adapter — pass
|
||||
# an empty string to force the env-var path.
|
||||
adapter_cls = get_adapter("")
|
||||
adapter = adapter_cls()
|
||||
caps = adapter.capabilities()
|
||||
meta: dict = {"capabilities": caps.to_dict()}
|
||||
idle = adapter.idle_timeout_override()
|
||||
# Only include the override when it's a positive integer. None /
|
||||
# zero / negative falls through to the platform's global default
|
||||
# (env A2A_IDLE_TIMEOUT_SECONDS, default 5min) — that "absent
|
||||
# field = use default" contract is what keeps the wire small.
|
||||
if isinstance(idle, int) and idle > 0:
|
||||
meta["idle_timeout_seconds"] = idle
|
||||
return {"runtime_metadata": meta}
|
||||
except Exception as e:
|
||||
# debug-level: missing ADAPTER_MODULE in dev / test envs is normal
|
||||
logger.debug("runtime_metadata: failed to read adapter caps: %s", e)
|
||||
return {}
|
||||
|
||||
|
||||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||||
|
||||
# NOTE(review): presumably the delay between two heartbeat posts — the
# code that consumes this constant is outside this view; confirm.
HEARTBEAT_INTERVAL = 30 # seconds
|
||||
@ -123,6 +160,7 @@ class HeartbeatLoop:
|
||||
# sample_error field. The platform reads
|
||||
# runtime_state to flip status → degraded.
|
||||
body.update(_runtime_state_payload())
|
||||
body.update(_runtime_metadata_payload())
|
||||
await client.post(
|
||||
f"{self.platform_url}/registry/heartbeat",
|
||||
json=body,
|
||||
|
||||
146
workspace/tests/test_heartbeat_runtime_metadata.py
Normal file
146
workspace/tests/test_heartbeat_runtime_metadata.py
Normal file
@ -0,0 +1,146 @@
|
||||
"""Tests for heartbeat._runtime_metadata_payload — the heartbeat-side
|
||||
producer that sends adapter capability declarations + the
|
||||
idle_timeout_override value to the platform every 30s. Capability
|
||||
primitive #2 (task #117) wires this into the platform's a2a_proxy.
|
||||
|
||||
Tests use sys.modules monkey-patching to stub the `adapters` module
|
||||
because workspace/heartbeat.py lazy-imports it inside the helper —
|
||||
keeping heartbeat resilient to a missing/broken adapter discovery
|
||||
path."""
|
||||
import sys
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
|
||||
from adapter_base import BaseAdapter, RuntimeCapabilities
|
||||
from heartbeat import _runtime_metadata_payload
|
||||
|
||||
|
||||
class _FakeAdapter(BaseAdapter):
    """Smallest concrete adapter the metadata tests can instantiate.

    It overrides neither capabilities() nor idle_timeout_override(), so
    it reports whatever BaseAdapter reports by default — the behavior of
    any runtime that has not opted in to a capability."""

    async def setup(self, config) -> None:
        # Nothing to prepare for a fake runtime.
        return None

    async def create_executor(self, config):  # pragma: no cover
        # The metadata helper never builds an executor.
        raise NotImplementedError

    @staticmethod
    def name() -> str:
        return "fake"

    @staticmethod
    def display_name() -> str:
        return "Fake"

    @staticmethod
    def description() -> str:
        return "Fake adapter for heartbeat metadata tests"
|
||||
|
||||
|
||||
class _NativeAdapter(_FakeAdapter):
    """Opt-in adapter: claims the native heartbeat capability and asks
    for a 600-second idle window — what claude-code's adapter is
    expected to declare once #87 lands."""

    def idle_timeout_override(self) -> int:
        # 600 s = 10 minutes.
        return 600

    def capabilities(self) -> RuntimeCapabilities:
        # Only the heartbeat flag is set; every other capability keeps
        # the RuntimeCapabilities default.
        declared = RuntimeCapabilities(provides_native_heartbeat=True)
        return declared
|
||||
|
||||
|
||||
@pytest.fixture
def stub_adapters_module(request):
    """Swap ``sys.modules["adapters"]`` for a stand-in whose
    get_adapter() always hands back the requested adapter class.

    The class comes from indirect parametrization (``request.param``)
    and defaults to _FakeAdapter. Whatever was registered under
    "adapters" before the test is put back afterwards; if nothing was
    registered, the stub entry is removed.

    Yields:
        The adapter class the stub module returns.
    """
    chosen_cls = getattr(request, "param", _FakeAdapter)
    previous = sys.modules.get("adapters")
    sys.modules["adapters"] = SimpleNamespace(  # type: ignore[assignment]
        get_adapter=lambda runtime: chosen_cls
    )
    try:
        yield chosen_cls
    finally:
        if previous is not None:
            sys.modules["adapters"] = previous
        else:
            sys.modules.pop("adapters", None)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("stub_adapters_module", [_FakeAdapter], indirect=True)
def test_default_adapter_emits_all_false_capabilities_no_idle_override(stub_adapters_module):
    """A default adapter still produces the runtime_metadata block: every
    capability False and no idle_timeout_seconds key. The mere presence
    of the block (even with zero information) is the wire signal that
    this runtime speaks the new protocol — older runtimes omit the
    field entirely."""
    capability_keys = (
        "heartbeat",
        "scheduler",
        "session",
        "status_mgmt",
        "retry",
        "activity_decoration",
        "channel_dispatch",
    )
    all_false = dict.fromkeys(capability_keys, False)

    result = _runtime_metadata_payload()

    assert "runtime_metadata" in result
    block = result["runtime_metadata"]
    assert block["capabilities"] == all_false
    # The override key must be absent altogether — this pins the
    # "absent field = use platform default" wire contract the Go side
    # relies on.
    assert "idle_timeout_seconds" not in block
|
||||
|
||||
|
||||
@pytest.mark.parametrize("stub_adapters_module", [_NativeAdapter], indirect=True)
def test_native_adapter_emits_capability_flag_and_idle_override(stub_adapters_module):
    """An adapter that opts in gets its heartbeat flag and its idle
    override onto the wire, and nothing more."""
    block = _runtime_metadata_payload()["runtime_metadata"]
    declared = block["capabilities"]

    assert declared["heartbeat"] is True
    # Declaring one capability must not accidentally claim ownership
    # of its siblings.
    assert declared["scheduler"] is False
    assert block["idle_timeout_seconds"] == 600
|
||||
|
||||
|
||||
def test_returns_empty_dict_when_adapter_module_missing(monkeypatch):
    """With no stub installed and ADAPTER_MODULE unset, adapter discovery
    is expected to fail (per the original author, get_adapter() raises
    KeyError in that situation).

    Heartbeat must NEVER fail — the metadata is optional, while the
    heartbeat itself (the alive signal) is load-bearing. This pins that
    the helper swallows the error and returns {}."""
    # Make sure ADAPTER_MODULE is unset so discovery has nothing to load.
    monkeypatch.delenv("ADAPTER_MODULE", raising=False)
    # Drop any stub module a previous test may have left behind.
    monkeypatch.delitem(sys.modules, "adapters", raising=False)

    assert _runtime_metadata_payload() == {}
|
||||
|
||||
|
||||
@pytest.mark.parametrize("stub_adapters_module", [_FakeAdapter], indirect=True)
def test_idle_timeout_override_zero_or_negative_omitted(stub_adapters_module, monkeypatch):
    """An adapter that returns 0 or a negative number from
    idle_timeout_override means 'use the platform default' — same as
    None. Don't ship a bogus value to the wire that the Go side would
    have to filter.

    Both halves of the name are exercised: zero and negative values."""

    def _adapter_returning(value):
        # Build a fresh adapter class whose override returns `value`.
        class _BadOverrideAdapter(_FakeAdapter):
            def idle_timeout_override(self) -> int:
                return value

        return _BadOverrideAdapter

    for bogus in (0, -1, -600):
        adapter_cls = _adapter_returning(bogus)
        # Bind the class as a default argument so the stub does not
        # depend on the loop variable.
        fake_mod = SimpleNamespace(get_adapter=lambda runtime, cls=adapter_cls: cls)
        monkeypatch.setitem(sys.modules, "adapters", fake_mod)

        payload = _runtime_metadata_payload()
        assert "idle_timeout_seconds" not in payload["runtime_metadata"], (
            f"override {bogus!r} leaked onto the wire"
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("stub_adapters_module", [_FakeAdapter], indirect=True)
def test_swallows_unexpected_exception_inside_adapter(stub_adapters_module, monkeypatch):
    """Adapter capabilities() / idle_timeout_override() throwing must
    NOT crash heartbeat. Returns {} so no field is sent and the
    platform falls through to defaults.

    Both hooks named above are exercised, one broken adapter each."""

    class _BrokenCapsAdapter(_FakeAdapter):
        def capabilities(self):
            raise RuntimeError("simulated broken adapter init")

    class _BrokenOverrideAdapter(_FakeAdapter):
        def idle_timeout_override(self):
            raise RuntimeError("simulated broken idle override")

    for broken_cls in (_BrokenCapsAdapter, _BrokenOverrideAdapter):
        # Bind the class as a default argument so the stub does not
        # depend on the loop variable.
        fake_mod = SimpleNamespace(get_adapter=lambda runtime, cls=broken_cls: cls)
        monkeypatch.setitem(sys.modules, "adapters", fake_mod)

        assert _runtime_metadata_payload() == {}, broken_cls.__name__
|
||||
@ -152,3 +152,35 @@ class TestBaseAdapterCapabilitiesDefault:
|
||||
native = _NativeHeartbeatAdapter().capabilities()
|
||||
assert minimal.provides_native_heartbeat is False
|
||||
assert native.provides_native_heartbeat is True
|
||||
|
||||
|
||||
class TestIdleTimeoutOverride:
    """Contract tests for the idle_timeout_override() hook — described
    by its author as the first capability primitive with an actual
    platform consumer (workspace-server's a2a_proxy.go consults it per
    workspace before applying its idle timer).

    The default MUST be a no-op: return None so the platform uses its
    global default. Subclasses override it to declare a longer or
    shorter window."""

    def test_default_returns_none(self):
        # Were this default ever to become a positive number, every
        # adapter would silently inherit that idle timeout. The
        # platform's global default (env A2A_IDLE_TIMEOUT_SECONDS,
        # default 5min) would stop being the floor — this hook would
        # take its place — and ops would lose the central knob.
        inherited = _MinimalAdapter().idle_timeout_override()
        assert inherited is None

    def test_subclass_can_override_to_positive_seconds(self):
        ten_minutes = 600  # typical for a slow synth runtime

        class _SlowAdapter(_MinimalAdapter):
            def idle_timeout_override(self) -> int:
                return ten_minutes

        assert _SlowAdapter().idle_timeout_override() == 600

    def test_subclass_can_explicitly_keep_default_via_none(self):
        # An adapter that carried an override in an older version and
        # later dropped it (back to None) should fall back cleanly to
        # the platform default. Pinning it here makes that round-trip
        # explicit.
        class _DroppedOverrideAdapter(_MinimalAdapter):
            def idle_timeout_override(self):
                return None

        declared = _DroppedOverrideAdapter().idle_timeout_override()
        assert declared is None
|
||||
|
||||
Loading…
Reference in New Issue
Block a user