diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go index f74790e4..3ad2c4fb 100644 --- a/workspace-server/internal/handlers/a2a_proxy.go +++ b/workspace-server/internal/handlers/a2a_proxy.go @@ -121,27 +121,26 @@ func isUpstreamBusyError(err error) bool { if err == nil { return false } + // Typed sentinels propagate cleanly through *url.Error.Unwrap + // since Go 1.13, so errors.Is is the primary check for both + // DeadlineExceeded and Canceled. The substring fallbacks below + // stay only for shapes net/http does NOT type — bare "EOF" / + // "connection reset" can arrive as plain *net.OpError with no + // errors.Is hook to the stdlib sentinels. if errors.Is(err, context.DeadlineExceeded) { return true } - // applyIdleTimeout cancels via context.WithCancel when the - // broadcaster silence window elapses — context.Canceled - // propagates cleanly through errors.Is, no substring fallback - // needed (and a substring on "context canceled" would also match - // healthy client-side aborts which we don't want to label busy). + // applyIdleTimeout uses context.WithCancel; surfaces here as + // Canceled, distinct from DeadlineExceeded but the same "upstream + // busy" class — caller produces a 503 + Retry-After. if errors.Is(err, context.Canceled) { return true } if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { return true } - // url.Error wraps "read tcp … EOF" and "Post …: context deadline - // exceeded" strings from the stdlib HTTP client without typing the - // inner cause. Fall back to substring match for those specific - // shapes only. msg := err.Error() - return strings.Contains(msg, "context deadline exceeded") || - strings.Contains(msg, "EOF") || + return strings.Contains(msg, "EOF") || strings.Contains(msg, "connection reset") } diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go index 8cdbf0d8..0f21b337 100644 --- a/workspace-server/internal/handlers/a2a_proxy_test.go +++ b/workspace-server/internal/handlers/a2a_proxy_test.go @@ -601,9 +601,21 @@ func TestIsUpstreamBusyError(t *testing.T) { }{ {"nil", nil, false}, {"context.DeadlineExceeded", context.DeadlineExceeded, true}, + // applyIdleTimeout cancels its child ctx via context.WithCancel + // when the broadcaster silence window elapses — surfaces here + // as context.Canceled. Same "upstream busy" classification. + {"context.Canceled", context.Canceled, true}, + {"wrapped context.Canceled", fmt.Errorf("dispatch wrapped: %w", context.Canceled), true}, {"io.EOF", io.EOF, true}, {"io.ErrUnexpectedEOF", io.ErrUnexpectedEOF, true}, - {"wrapped context deadline string", fmt.Errorf(`Post "http://ws-foo:8000": context deadline exceeded`), true}, + // Real net/http wraps context.DeadlineExceeded via *url.Error.Unwrap, + // so errors.Is(err, context.DeadlineExceeded) catches it. The + // pre-892de784 substring "context deadline exceeded" fallback + // also accepted a string-only error like + // `fmt.Errorf("Post: context deadline exceeded")`; that fallback + // was dropped because errors.Is handles the real shape and the + // substring was indistinguishable from a user-content match. + {"wrapped context deadline (errors.Is path)", fmt.Errorf("Post: %w", context.DeadlineExceeded), true}, {"wrapped EOF string", fmt.Errorf(`Post "http://ws-foo:8000": EOF`), true}, {"connection reset", fmt.Errorf("read tcp 127.0.0.1:8080->127.0.0.1:12345: connection reset by peer"), true}, {"generic dns error", fmt.Errorf("no such host"), false}, diff --git a/workspace-server/migrations/043_workspace_status_enum.down.sql b/workspace-server/migrations/043_workspace_status_enum.down.sql index 88ecfbc1..4df43519 100644 --- a/workspace-server/migrations/043_workspace_status_enum.down.sql +++ b/workspace-server/migrations/043_workspace_status_enum.down.sql @@ -5,6 +5,11 @@ BEGIN; +-- Symmetric with the up migration: a rollback under the same load +-- that motivated the up-file's 5s lock_timeout would otherwise stall +-- writers indefinitely. +SET LOCAL lock_timeout = '5s'; + ALTER TABLE workspaces ALTER COLUMN status DROP DEFAULT; diff --git a/workspace/claude_sdk_executor.py b/workspace/claude_sdk_executor.py index af3abc71..a0a89c5d 100644 --- a/workspace/claude_sdk_executor.py +++ b/workspace/claude_sdk_executor.py @@ -541,7 +541,15 @@ class ClaudeSDKExecutor(AgentExecutor): # cleanly, the SDK clearly works again. Clear so the next # heartbeat reports runtime_state empty and the platform flips # status degraded → online without a manual restart. - _clear_sdk_wedge_on_success() + # + # Gate on actual content from the stream so a degenerate + # "iterator returned without raising but emitted nothing" + # case (possible from a partial stream or a stub SDK) doesn't + # falsely advertise recovery. A real successful query yields + # at least a ResultMessage (sets result_text) or one + # AssistantMessage TextBlock (populates assistant_chunks). + if result_text is not None or assistant_chunks: + _clear_sdk_wedge_on_success() return QueryResult(text=text, session_id=session_id) # ------------------------------------------------------------------