From d4bd802cdca97d45454173ff825fc0b85c01282e Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Sat, 13 Jun 2026 07:52:33 +0000 Subject: [PATCH] =?UTF-8?q?fix(core#2723):=20raise=20canvas=20idle=20defau?= =?UTF-8?q?lt=205min=E2=86=9215min=20+=20align=20chat-send=20timeout?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CTO reported a 5-min 'tool chain lost' wedge during a long autonomous task on the JRS SEO agent (download + re-upload ~102 images, no broadcaster events for 5min). The canvas→agent A2A turn is wrapped by applyIdleTimeout (workspace-server/internal/handlers/ a2a_proxy.go:1097) with idleTimeoutDuration=5min. A long single step that emits no broadcaster events for 5min trips the watchdog, ctx cancels, dispatch aborts, canvas sees the turn drop. Fix (point 1 of the issue's 3-part plan): 1. Raise defaultIdleTimeoutDuration from 5min to 15min. The 30-min absolute ceiling on agent-to-agent dispatch (dispatchA2A:992) is independent and unchanged. A2A_IDLE_TIMEOUT_ SECONDS override still wins for ops who want to tune. 2. Align the canvas-side POST timeout in useChatSend.ts from 120s (2min) to 15min so the AbortSignal.timeout doesn't fire first and surface 'agent may be unreachable' mid-turn. The client-timeout-isn't-unreachable handler (useChatSend.ts:261-280) already silently swallows the timeout and waits for the AGENT_MESSAGE WS event — but only when the timeout doesn't beat the server. Aligning removes that race. Runtime heartbeat in the claude-code template (point 2 of the issue) is the durable architecture for active-but-silent turns and is tracked separately — the template lives in a sibling repo not in this workspace, so the heartbeat emitter is a follow-up PR. Tests (all green, 8 new subtests): - TestDefaultIdleTimeoutDuration_Is15Min: pins the 15min constant - TestParseIdleTimeoutEnv_OverrideBeatsDefault: 5 subtests pinning the env-override parsing (valid/empty/non-numeric/zero/negative) - TestApplyIdleTimeout_ResetsOnEvent_AtLargerWindow: #2723 e2e proxy — verifies the reset-on-event semantics still hold at the larger window. A future refactor that breaks the reset semantics MUST fail this test (regression guard for the raise). Refs #2723. CTO-priority per dispatch 0842383f. --- .../components/tabs/chat/hooks/useChatSend.ts | 2 +- .../internal/handlers/a2a_proxy.go | 31 +++-- .../internal/handlers/a2a_proxy_test.go | 108 ++++++++++++++++++ 3 files changed, 132 insertions(+), 9 deletions(-) diff --git a/canvas/src/components/tabs/chat/hooks/useChatSend.ts b/canvas/src/components/tabs/chat/hooks/useChatSend.ts index d0a613f8..a30f2a07 100644 --- a/canvas/src/components/tabs/chat/hooks/useChatSend.ts +++ b/canvas/src/components/tabs/chat/hooks/useChatSend.ts @@ -230,7 +230,7 @@ export function useChatSend(workspaceId: string, options: UseChatSendOptions) { metadata: { history }, }, }, - { timeoutMs: 120_000 }, + { timeoutMs: 15 * 60 * 1000 }, // 15min — must exceed server canvas-idle (core#2723: 15min) so the AbortSignal doesn't fire first and surface "agent may be unreachable" mid-turn ) .then((resp) => { if (sendTokenRef.current !== myToken) return; diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go index 6e14737e..5686b64f 100644 --- a/workspace-server/internal/handlers/a2a_proxy.go +++ b/workspace-server/internal/handlers/a2a_proxy.go @@ -971,13 +971,24 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) { // that's just thinking silently between tool calls keeps the connection // alive without having to emit ACTIVITY_LOGGED noise. // -// Pre-2026-04-26 this was 60s, picked when the platform only broadcast -// on TASK_UPDATED (which itself only fires when current_task CHANGES). -// A claude-code agent doing a long packaging step or a slow model thought -// kept the same current_task for >60s, fired no broadcast, got cancelled -// mid-flight. Bumped to 5min as a safety net AND the heartbeat handler -// now broadcasts unconditionally — together either one alone closes the -// gap, both together is defence in depth. +// History: +// +// - 60s: picked when the platform only broadcast on TASK_UPDATED +// (which itself only fires when current_task CHANGES). A claude-code +// agent doing a long packaging step or a slow model thought kept the +// same current_task for >60s, fired no broadcast, got cancelled +// mid-flight. Bumped to 5min as a safety net AND the heartbeat handler +// now broadcasts unconditionally — together either one alone closes the +// gap, both together is defence in depth. +// - 5min (2026-04-26 → 2026-06-13): the JRS SEO agent hit a 5-min wedge +// during a bulk asset migration (download + re-upload ~102 images). +// 5min was not enough for a long single step that emitted no +// broadcaster events (bulk download/upload, a long build) — see +// core#2723. Raised to 15min (900s) for the canvas default; the +// 30-min absolute ceiling for agent-to-agent dispatch is unchanged. +// The runtime heartbeat in claude-code (separate work, NOT in this +// PR) is the durable architecture for active-but-silent turns; +// 15min is the headroom until that lands. // // Override via A2A_IDLE_TIMEOUT_SECONDS for ops who want to tune (e.g. // shorter for canary/test runners that want fail-fast on wedge, longer @@ -987,7 +998,11 @@ var idleTimeoutDuration = parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECOND // defaultIdleTimeoutDuration is what parseIdleTimeoutEnv returns when // the env var is unset or invalid. Pulled out as a const so tests can // reference it without re-deriving the value. -const defaultIdleTimeoutDuration = 5 * time.Minute +// +// core#2723: raised from 5*time.Minute → 15*time.Minute. The 30-min +// absolute ceiling on agent-to-agent dispatch (dispatchA2A:992) is +// independent and unchanged — only the canvas-side idle window grew. +const defaultIdleTimeoutDuration = 15 * time.Minute // parseIdleTimeoutEnv parses the A2A_IDLE_TIMEOUT_SECONDS value, falling // back to defaultIdleTimeoutDuration on empty / non-numeric / non-positive diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go index 6056806c..7c5d2054 100644 --- a/workspace-server/internal/handlers/a2a_proxy_test.go +++ b/workspace-server/internal/handlers/a2a_proxy_test.go @@ -2959,3 +2959,111 @@ func TestInjectCanvasUserIdentity_Nil(t *testing.T) { } } + +// ==================== core#2723 — raised canvas idle default ==================== +// +// The canvas-side idle default (defaultIdleTimeoutDuration) was raised from +// 5min to 15min in response to a CTO-reported 5-min wedge during a long +// asset-migration turn (JRS SEO agent, ~102 images, download + re-upload, +// no broadcaster events for 5min). These tests pin the new default, the +// env-override parsing (which the change must not regress), and the +// applyIdleTimeout behavior under the larger window. + +// TestDefaultIdleTimeoutDuration_Is15Min pins the core#2723 raise. Any +// future change to this constant requires an explicit decision and a +// matching test update; the test fails fast so a silent revert can't +// land in main. +func TestDefaultIdleTimeoutDuration_Is15Min(t *testing.T) { + const want = 15 * time.Minute + if defaultIdleTimeoutDuration != want { + t.Errorf("defaultIdleTimeoutDuration = %v, want %v (core#2723: raised from 5min to 15min)", defaultIdleTimeoutDuration, want) + } +} + +// TestParseIdleTimeoutEnv_OverrideBeatsDefault pins the A2A_IDLE_TIMEOUT_SECONDS +// override still wins over the (new) 15min default. Ops who set a longer +// window for unusually slow plugins must still see their override take +// effect — the raise is for the DEFAULT, not a hard ceiling. +func TestParseIdleTimeoutEnv_OverrideBeatsDefault(t *testing.T) { + t.Run("valid override is honored", func(t *testing.T) { + t.Setenv("A2A_IDLE_TIMEOUT_SECONDS", "1800") // 30min + got := parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECONDS")) + want := 1800 * time.Second + if got != want { + t.Errorf("parseIdleTimeoutEnv(1800) = %v, want %v (override must beat default)", got, want) + } + }) + t.Run("empty falls back to 15min", func(t *testing.T) { + t.Setenv("A2A_IDLE_TIMEOUT_SECONDS", "") + got := parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECONDS")) + if got != defaultIdleTimeoutDuration { + t.Errorf("parseIdleTimeoutEnv(\"\") = %v, want %v (the new 15min default)", got, defaultIdleTimeoutDuration) + } + if got != 15*time.Minute { + t.Errorf("parseIdleTimeoutEnv(\"\") = %v, want 15min (core#2723 raise)", got) + } + }) + t.Run("non-numeric falls back to 15min", func(t *testing.T) { + t.Setenv("A2A_IDLE_TIMEOUT_SECONDS", "not-a-number") + got := parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECONDS")) + if got != 15*time.Minute { + t.Errorf("parseIdleTimeoutEnv(\"not-a-number\") = %v, want 15min", got) + } + }) + t.Run("zero falls back to 15min", func(t *testing.T) { + t.Setenv("A2A_IDLE_TIMEOUT_SECONDS", "0") + got := parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECONDS")) + if got != 15*time.Minute { + t.Errorf("parseIdleTimeoutEnv(\"0\") = %v, want 15min (zero treated as invalid)", got) + } + }) + t.Run("negative falls back to 15min", func(t *testing.T) { + t.Setenv("A2A_IDLE_TIMEOUT_SECONDS", "-30") + got := parseIdleTimeoutEnv(os.Getenv("A2A_IDLE_TIMEOUT_SECONDS")) + if got != 15*time.Minute { + t.Errorf("parseIdleTimeoutEnv(\"-30\") = %v, want 15min (negative treated as invalid)", got) + } + }) +} + +// TestApplyIdleTimeout_ResetsOnEvent_AtLargerWindow is the #2723 e2e proxy: +// even at a 15-min-scale idle window, an event mid-window must still reset +// the clock. We can't actually wait 15 min in a unit test, so we verify +// the property with a small (80ms) window + a single mid-window event +// (mirroring the existing TestApplyIdleTimeout_ResetsOnEvent but explicitly +// scoped to the #2723 raise — a future refactor that breaks the reset +// semantics MUST fail this test). +func TestApplyIdleTimeout_ResetsOnEvent_AtLargerWindow(t *testing.T) { + setupTestDB(t) + setupTestRedis(t) + b := newTestBroadcaster() + + parent, parentCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer parentCancel() + + idle := 80 * time.Millisecond + idleCtx, idleCancel := applyIdleTimeout(parent, b, "ws-2723-active", idle) + defer idleCancel() + + // Mid-window event: reset the clock. + time.Sleep(idle / 2) + b.BroadcastOnly("ws-2723-active", "ACTIVITY_LOGGED", map[string]interface{}{ + "activity_type": "tool_call", // matches the JRS bulk-upload pattern + }) + + // Past the original deadline, ctx must still be alive. + select { + case <-idleCtx.Done(): + t.Fatal("idleCtx cancelled despite mid-window event (core#2723 regression: reset semantics must survive the raise)") + case <-time.After(idle - (idle / 2) + 10*time.Millisecond): + // ok + } + + // Second silence window fires. + select { + case <-idleCtx.Done(): + // expected + case <-time.After(idle + 200*time.Millisecond): + t.Fatal("idleCtx never cancelled after the second silence window") + } +} -- 2.52.0