diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go index 52d57f50..d40e68de 100644 --- a/workspace-server/internal/handlers/a2a_proxy.go +++ b/workspace-server/internal/handlers/a2a_proxy.go @@ -132,15 +132,20 @@ func readBodyWithLimit(r io.Reader, limit int, kind string) ([]byte, error) { // a generic 502 page to canvas. 10s is well above realistic intra-region // latencies and well below CF's edge timeout. // -// 3. Transport.ResponseHeaderTimeout — 5min default. From request-body-end +// 3. Transport.ResponseHeaderTimeout — 30min default. From request-body-end // to response-headers-start. Configurable via // A2A_PROXY_RESPONSE_HEADER_TIMEOUT (envx.Duration). Covers cold-start -// first-byte (30-60s OAuth flow above) with enough room for Opus agent -// turns and Codex scheduled tasks (big context + internal delegate_task -// round-trips routinely exceed the old 60s/180s ceilings). Body streaming -// after headers is governed by the -// per-request context deadline, NOT this timeout — so multi-minute agent -// responses still work fine. +// first-byte (30-60s OAuth flow above) AND long SYNCHRONOUS autonomous +// turns where the runtime computes the whole response before sending +// headers (it does NOT always stream a 200 early). A real "migrate from +// blob" turn ran 443s and was killed at the old 5min default with +// `timeout awaiting response headers` (core#2723 class — the SAME long-turn +// cut as the idle watchdog). Aligned to 30min = the agent-to-agent +// ceiling + the canvas idle default, so no LEGIT turn (none exceed that) +// trips it; a genuinely-stuck agent is surfaced by DialContext (fast, +// connection-level) + the reactive-health/heartbeat path, not by cutting +// a working turn short. Body streaming after headers is governed by the +// per-request context deadline, NOT this timeout. // // The point of (2) and (3) is to surface a *structured* 503 from // handleA2ADispatchError when the workspace agent is unreachable, so canvas @@ -153,7 +158,7 @@ var a2aClient = &http.Client{ Timeout: 10 * time.Second, KeepAlive: 30 * time.Second, }).DialContext, - ResponseHeaderTimeout: envx.Duration("A2A_PROXY_RESPONSE_HEADER_TIMEOUT", 5*time.Minute), + ResponseHeaderTimeout: envx.Duration("A2A_PROXY_RESPONSE_HEADER_TIMEOUT", 30*time.Minute), TLSHandshakeTimeout: 10 * time.Second, // MaxIdleConns / IdleConnTimeout: stdlib defaults are fine; agent // fan-in is bounded by the platform's broadcaster fan-out, not by diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go index 6056806c..b618b52e 100644 --- a/workspace-server/internal/handlers/a2a_proxy_test.go +++ b/workspace-server/internal/handlers/a2a_proxy_test.go @@ -2850,7 +2850,10 @@ func TestLookupDeliveryMode_ContextCanceled_FailsClosed(t *testing.T) { // ==================== a2aClient ResponseHeaderTimeout config ==================== func TestA2AClientResponseHeaderTimeout(t *testing.T) { - const defaultTimeout = 5 * time.Minute + // core#2723 class: raised 5min→30min so long synchronous autonomous turns + // (e.g. a 443s "migrate from blob" run that errored with "timeout awaiting + // response headers") aren't cut short. Aligned to the agent ceiling + idle. + const defaultTimeout = 30 * time.Minute // Default (unset env) — a2aClient was initialised at package load time. if a2aClient.Transport.(*http.Transport).ResponseHeaderTimeout != defaultTimeout {