fix(a2a): raise canvas idle watchdog 5m→30m for long blocking turns (core#2723) #2727
@@ -979,6 +979,18 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
|
||||
// now broadcasts unconditionally — either one alone closes the
|
||||
// gap, both together are defence in depth.
|
||||
//
|
||||
// 2026-06-13 (core#2723): raised 5min → 30min. The 30s WORKSPACE_HEARTBEAT
|
||||
// normally resets this long before 5min, so the window only matters when
|
||||
// the heartbeat STALLS — which happens when the runtime's asyncio
|
||||
// heartbeat task is starved by a long *blocking* tool call (e.g. a bulk
|
||||
// asset migration). A real long autonomous turn was getting cancelled at
|
||||
// ~300s mid-work ("tool chain lost"). The complete fix is runtime-side
|
||||
// (heartbeat on an independent thread, tracked in #2723); raising this
|
||||
// ceiling is the deployable safety margin so a multi-minute blocking step
|
||||
// survives. 30min matches the agent-to-agent absolute ceiling. The canvas
|
||||
// path has no separate ceiling, so this is its only deadline; a genuinely
|
||||
// dead agent is still surfaced by the reactive-health path, not this.
|
||||
//
|
||||
// Override via A2A_IDLE_TIMEOUT_SECONDS for ops who want to tune (e.g.
|
||||
// shorter for canary/test runners that want fail-fast on wedge, longer
|
||||
// for prod tenants running unusually slow plugins).
|
||||
@@ -987,7 +999,7 @@ var idleTimeoutDuration = parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECOND
|
||||
// defaultIdleTimeoutDuration is what parseIdleTimeoutEnv returns when
|
||||
// the env var is unset or invalid. Pulled out as a const so tests can
|
||||
// reference it without re-deriving the value.
|
||||
const defaultIdleTimeoutDuration = 5 * time.Minute
|
||||
const defaultIdleTimeoutDuration = 30 * time.Minute
|
||||
|
||||
// parseIdleTimeoutEnv parses the A2A_IDLE_TIMEOUT_SECONDS value, falling
|
||||
// back to defaultIdleTimeoutDuration on empty / non-numeric / non-positive
|
||||
|
||||
@@ -1060,6 +1060,11 @@ done:
|
||||
// (pre-fix behaviour) and the regression would slip in unnoticed.
|
||||
|
||||
func TestParseIdleTimeoutEnv(t *testing.T) {
|
||||
// core#2723: the default is the deployable safety margin for long
|
||||
// blocking tool calls that stall the runtime heartbeat (raised 5m→30m).
|
||||
if defaultIdleTimeoutDuration != 30*time.Minute {
|
||||
t.Errorf("default idle timeout = %v, want 30m (core#2723)", defaultIdleTimeoutDuration)
|
||||
}
|
||||
cases := []struct {
|
||||
name string
|
||||
in string
|
||||
@@ -1067,6 +1072,7 @@ func TestParseIdleTimeoutEnv(t *testing.T) {
|
||||
}{
|
||||
{"empty falls back to default", "", defaultIdleTimeoutDuration},
|
||||
{"valid positive integer parses to seconds", "120", 120 * time.Second},
|
||||
{"longer override honored (30m)", "1800", 1800 * time.Second},
|
||||
{"valid integer at minimum (1) is accepted", "1", 1 * time.Second},
|
||||
{"non-numeric falls back to default", "foo", defaultIdleTimeoutDuration},
|
||||
{"negative falls back to default", "-30", defaultIdleTimeoutDuration},
|
||||
|
||||
Reference in New Issue
Block a user