fix(core#2723): raise canvas idle default 5min→15min + align chat-send timeout #2728

Closed
agent-dev-b wants to merge 1 commits from fix/core2723-canvas-idle-15min into main
3 changed files with 132 additions and 9 deletions
@@ -230,7 +230,7 @@ export function useChatSend(workspaceId: string, options: UseChatSendOptions) {
metadata: { history },
},
},
{ timeoutMs: 120_000 },
{ timeoutMs: 15 * 60 * 1000 }, // 15min — must exceed server canvas-idle (core#2723: 15min) so the AbortSignal doesn't fire first and surface "agent may be unreachable" mid-turn
)
.then((resp) => {
if (sendTokenRef.current !== myToken) return;
@@ -971,13 +971,24 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
// that's just thinking silently between tool calls keeps the connection
// alive without having to emit ACTIVITY_LOGGED noise.
//
// Pre-2026-04-26 this was 60s, picked when the platform only broadcast
// on TASK_UPDATED (which itself only fires when current_task CHANGES).
// A claude-code agent doing a long packaging step or a slow model thought
// kept the same current_task for >60s, fired no broadcast, got cancelled
// mid-flight. Bumped to 5min as a safety net AND the heartbeat handler
// now broadcasts unconditionally — together either one alone closes the
// gap, both together is defence in depth.
// History:
//
// - 60s: picked when the platform only broadcast on TASK_UPDATED
// (which itself only fires when current_task CHANGES). A claude-code
// agent doing a long packaging step or a slow model thought kept the
// same current_task for >60s, fired no broadcast, got cancelled
// mid-flight. Bumped to 5min as a safety net AND the heartbeat handler
// now broadcasts unconditionally — together either one alone closes the
// gap, both together is defence in depth.
// - 5min (2026-04-26 → 2026-06-13): the JRS SEO agent hit a 5-min wedge
// during a bulk asset migration (download + re-upload ~102 images).
// 5min was not enough for a long single step that emitted no
// broadcaster events (bulk download/upload, a long build) — see
// core#2723. Raised to 15min (900s) for the canvas default; the
// 30-min absolute ceiling for agent-to-agent dispatch is unchanged.
// The runtime heartbeat in claude-code (separate work, NOT in this
// PR) is the durable architecture for active-but-silent turns;
// 15min is the headroom until that lands.
//
// Override via A2A_IDLE_TIMEOUT_SECONDS for ops who want to tune (e.g.
// shorter for canary/test runners that want fail-fast on wedge, longer
@@ -987,7 +998,11 @@ var idleTimeoutDuration = parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECOND
// defaultIdleTimeoutDuration is what parseIdleTimeoutEnv returns when
// the env var is unset or invalid. Pulled out as a const so tests can
// reference it without re-deriving the value.
const defaultIdleTimeoutDuration = 5 * time.Minute
//
// core#2723: raised from 5*time.Minute → 15*time.Minute. The 30-min
// absolute ceiling on agent-to-agent dispatch (dispatchA2A:992) is
// independent and unchanged — only the canvas-side idle window grew.
const defaultIdleTimeoutDuration = 15 * time.Minute
// parseIdleTimeoutEnv parses the A2A_IDLE_TIMEOUT_SECONDS value, falling
// back to defaultIdleTimeoutDuration on empty / non-numeric / non-positive
@@ -2959,3 +2959,111 @@ func TestInjectCanvasUserIdentity_Nil(t *testing.T) {
}
}
// ==================== core#2723 — raised canvas idle default ====================
//
// The canvas-side idle default (defaultIdleTimeoutDuration) was raised from
// 5min to 15min in response to a CTO-reported 5-min wedge during a long
// asset-migration turn (JRS SEO agent, ~102 images, download + re-upload,
// no broadcaster events for 5min). These tests pin the new default, the
// env-override parsing (which the change must not regress), and the
// applyIdleTimeout behavior under the larger window.
// TestDefaultIdleTimeoutDuration_Is15Min pins the core#2723 raise. Any
// future change to this constant requires an explicit decision and a
// matching test update; the test fails fast so a silent revert can't
// land in main.
func TestDefaultIdleTimeoutDuration_Is15Min(t *testing.T) {
const want = 15 * time.Minute
if defaultIdleTimeoutDuration != want {
t.Errorf("defaultIdleTimeoutDuration = %v, want %v (core#2723: raised from 5min to 15min)", defaultIdleTimeoutDuration, want)
}
}
// TestParseIdleTimeoutEnv_OverrideBeatsDefault pins the A2A_IDLE_TIMEOUT_SECONDS
// override still wins over the (new) 15min default. Ops who set a longer
// window for unusually slow plugins must still see their override take
// effect — the raise is for the DEFAULT, not a hard ceiling.
func TestParseIdleTimeoutEnv_OverrideBeatsDefault(t *testing.T) {
t.Run("valid override is honored", func(t *testing.T) {
t.Setenv("A2A_IDLE_TIMEOUT_SECONDS", "1800") // 30min
got := parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECONDS"))
want := 1800 * time.Second
if got != want {
t.Errorf("parseIdleTimeoutEnv(1800) = %v, want %v (override must beat default)", got, want)
}
})
t.Run("empty falls back to 15min", func(t *testing.T) {
t.Setenv("A2A_IDLE_TIMEOUT_SECONDS", "")
got := parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECONDS"))
if got != defaultIdleTimeoutDuration {
t.Errorf("parseIdleTimeoutEnv(\"\") = %v, want %v (the new 15min default)", got, defaultIdleTimeoutDuration)
}
if got != 15*time.Minute {
t.Errorf("parseIdleTimeoutEnv(\"\") = %v, want 15min (core#2723 raise)", got)
}
})
t.Run("non-numeric falls back to 15min", func(t *testing.T) {
t.Setenv("A2A_IDLE_TIMEOUT_SECONDS", "not-a-number")
got := parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECONDS"))
if got != 15*time.Minute {
t.Errorf("parseIdleTimeoutEnv(\"not-a-number\") = %v, want 15min", got)
}
})
t.Run("zero falls back to 15min", func(t *testing.T) {
t.Setenv("A2A_IDLE_TIMEOUT_SECONDS", "0")
got := parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECONDS"))
if got != 15*time.Minute {
t.Errorf("parseIdleTimeoutEnv(\"0\") = %v, want 15min (zero treated as invalid)", got)
}
})
t.Run("negative falls back to 15min", func(t *testing.T) {
t.Setenv("A2A_IDLE_TIMEOUT_SECONDS", "-30")
got := parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECONDS"))
if got != 15*time.Minute {
t.Errorf("parseIdleTimeoutEnv(\"-30\") = %v, want 15min (negative treated as invalid)", got)
}
})
}
// TestApplyIdleTimeout_ResetsOnEvent_AtLargerWindow is the #2723 e2e proxy:
// even at a 15-min-scale idle window, an event mid-window must still reset
// the clock. We can't actually wait 15 min in a unit test, so we verify
// the property with a small (80ms) window + a single mid-window event
// (mirroring the existing TestApplyIdleTimeout_ResetsOnEvent but explicitly
// scoped to the #2723 raise — a future refactor that breaks the reset
// semantics MUST fail this test).
func TestApplyIdleTimeout_ResetsOnEvent_AtLargerWindow(t *testing.T) {
setupTestDB(t)
setupTestRedis(t)
b := newTestBroadcaster()
parent, parentCancel := context.WithTimeout(context.Background(), 5*time.Second)
defer parentCancel()
idle := 80 * time.Millisecond
idleCtx, idleCancel := applyIdleTimeout(parent, b, "ws-2723-active", idle)
defer idleCancel()
// Mid-window event: reset the clock.
time.Sleep(idle / 2)
b.BroadcastOnly("ws-2723-active", "ACTIVITY_LOGGED", map[string]interface{}{
"activity_type": "tool_call", // matches the JRS bulk-upload pattern
})
// Past the original deadline, ctx must still be alive.
select {
case <-idleCtx.Done():
t.Fatal("idleCtx cancelled despite mid-window event (core#2723 regression: reset semantics must survive the raise)")
case <-time.After(idle - (idle / 2) + 10*time.Millisecond):
// ok
}
// Second silence window fires.
select {
case <-idleCtx.Done():
// expected
case <-time.After(idle + 200*time.Millisecond):
t.Fatal("idleCtx never cancelled after the second silence window")
}
}