fix(a2a): raise canvas idle watchdog 5m→30m for long blocking turns (core#2723) #2727

Merged
devops-engineer merged 1 commit from fix/a2a-idle-timeout-raise into main 2026-06-13 07:56:12 +00:00
2 changed files with 19 additions and 1 deletion
@@ -979,6 +979,18 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
// now broadcasts unconditionally — either one alone closes the gap;
// both together are defence in depth.
//
// 2026-06-13 (core#2723): raised 5min → 30min. The 30s WORKSPACE_HEARTBEAT
// normally resets this long before 5min, so the window only matters when
// the heartbeat STALLS — which happens when the runtime's asyncio
// heartbeat task is starved by a long *blocking* tool call (e.g. a bulk
// asset migration). A real long autonomous turn was getting cancelled at
// ~300s mid-work ("tool chain lost"). The complete fix is runtime-side
// (heartbeat on an independent thread, tracked in #2723); raising this
// ceiling is the deployable safety margin so a multi-minute blocking step
// survives. 30min matches the agent-to-agent absolute ceiling. The canvas
// path has no separate ceiling, so this is its only deadline; a genuinely
// dead agent is still surfaced by the reactive-health path, not by this
// idle timeout.
//
// Override via A2A_IDLE_TIMEOUT_SECONDS for ops who want to tune (e.g.
// shorter for canary/test runners that want fail-fast on wedge, longer
// for prod tenants running unusually slow plugins).
@@ -987,7 +999,7 @@ var idleTimeoutDuration = parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECOND
// defaultIdleTimeoutDuration is what parseIdleTimeoutEnv returns when
// the env var is unset or invalid. Pulled out as a const so tests can
// reference it without re-deriving the value.
const defaultIdleTimeoutDuration = 5 * time.Minute
const defaultIdleTimeoutDuration = 30 * time.Minute
// parseIdleTimeoutEnv parses the A2A_IDLE_TIMEOUT_SECONDS value, falling
// back to defaultIdleTimeoutDuration on empty / non-numeric / non-positive
@@ -1060,6 +1060,11 @@ done:
// (pre-fix behaviour) and the regression would slip in unnoticed.
func TestParseIdleTimeoutEnv(t *testing.T) {
// core#2723: the default is the deployable safety margin for long
// blocking tool calls that stall the runtime heartbeat (raised 5m→30m).
if defaultIdleTimeoutDuration != 30*time.Minute {
t.Errorf("default idle timeout = %v, want 30m (core#2723)", defaultIdleTimeoutDuration)
}
cases := []struct {
name string
in string
@@ -1067,6 +1072,7 @@ func TestParseIdleTimeoutEnv(t *testing.T) {
}{
{"empty falls back to default", "", defaultIdleTimeoutDuration},
{"valid positive integer parses to seconds", "120", 120 * time.Second},
{"longer override honored (30m)", "1800", 1800 * time.Second},
{"valid integer at minimum (1) is accepted", "1", 1 * time.Second},
{"non-numeric falls back to default", "foo", defaultIdleTimeoutDuration},
{"negative falls back to default", "-30", defaultIdleTimeoutDuration},