From 3c4eef49aacd9a24de337d58053554aaef44dcb4 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sat, 25 Apr 2026 08:48:30 -0700 Subject: [PATCH] =?UTF-8?q?chore:=20second-pass=20review=20polish=20?= =?UTF-8?q?=E2=80=94=20symmetry=20+=20clearer=20test=20fixtures?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-2 review of the wedge/idle/progress bundle came back Approve with 4 optional polish items. All taken: 1. Migration 043 down file gained `SET LOCAL lock_timeout = '5s'` matching the up file. A rollback under the same load that motivated the up-file guard would otherwise stall writers. 2. _clear_sdk_wedge_on_success now gates on actual stream content (result_text or assistant_chunks). A degenerate "iterator returned without raising but emitted nothing" case (possible from a partial stream or stub SDK) no longer falsely advertises recovery — only a real successful query (≥1 ResultMessage or AssistantMessage TextBlock) clears the wedge. 3. isUpstreamBusyError dropped the redundant `strings.Contains(msg, "context deadline exceeded")` fallback. *url.Error.Unwrap propagates the typed sentinel since Go 1.13; errors.Is(err, context.DeadlineExceeded) catches the real net/http shape. The substring was a foot-gun (would also match user-content with that phrase). Test fixture updated to use `fmt.Errorf("Post: %w", context.DeadlineExceeded)` which reflects what net/http actually returns. 4. TestIsUpstreamBusyError added a context.Canceled case (both typed and wrapped via %w) — pins the new applyIdleTimeout classification. No critical/required findings on second pass; reviewer verdict was Approve. Items above are polish for symmetry and test clarity. 1010 canvas + 64 Python + full Go suites pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/a2a_proxy.go | 21 +++++++++---------- .../internal/handlers/a2a_proxy_test.go | 14 ++++++++++++- .../043_workspace_status_enum.down.sql | 5 +++++ workspace/claude_sdk_executor.py | 10 ++++++++- 4 files changed, 37 insertions(+), 13 deletions(-) diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go index f74790e4..3ad2c4fb 100644 --- a/workspace-server/internal/handlers/a2a_proxy.go +++ b/workspace-server/internal/handlers/a2a_proxy.go @@ -121,27 +121,26 @@ func isUpstreamBusyError(err error) bool { if err == nil { return false } + // Typed sentinels propagate cleanly through *url.Error.Unwrap + // since Go 1.13, so errors.Is is the primary check for both + // DeadlineExceeded and Canceled. The substring fallbacks below + // stay only for shapes net/http does NOT type — bare "EOF" / + // "connection reset" can arrive as plain *net.OpError with no + // errors.Is hook to the stdlib sentinels. if errors.Is(err, context.DeadlineExceeded) { return true } - // applyIdleTimeout cancels via context.WithCancel when the - // broadcaster silence window elapses — context.Canceled - // propagates cleanly through errors.Is, no substring fallback - // needed (and a substring on "context canceled" would also match - // healthy client-side aborts which we don't want to label busy). + // applyIdleTimeout uses context.WithCancel; surfaces here as + // Canceled, distinct from DeadlineExceeded but the same "upstream + // busy" class — caller produces a 503 + Retry-After. if errors.Is(err, context.Canceled) { return true } if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { return true } - // url.Error wraps "read tcp … EOF" and "Post …: context deadline - // exceeded" strings from the stdlib HTTP client without typing the - // inner cause. Fall back to substring match for those specific - // shapes only. msg := err.Error() - return strings.Contains(msg, "context deadline exceeded") || - strings.Contains(msg, "EOF") || + return strings.Contains(msg, "EOF") || strings.Contains(msg, "connection reset") } diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go index 8cdbf0d8..0f21b337 100644 --- a/workspace-server/internal/handlers/a2a_proxy_test.go +++ b/workspace-server/internal/handlers/a2a_proxy_test.go @@ -601,9 +601,21 @@ func TestIsUpstreamBusyError(t *testing.T) { }{ {"nil", nil, false}, {"context.DeadlineExceeded", context.DeadlineExceeded, true}, + // applyIdleTimeout cancels its child ctx via context.WithCancel + // when the broadcaster silence window elapses — surfaces here + // as context.Canceled. Same "upstream busy" classification. + {"context.Canceled", context.Canceled, true}, + {"wrapped context.Canceled", fmt.Errorf("dispatch wrapped: %w", context.Canceled), true}, {"io.EOF", io.EOF, true}, {"io.ErrUnexpectedEOF", io.ErrUnexpectedEOF, true}, - {"wrapped context deadline string", fmt.Errorf(`Post "http://ws-foo:8000": context deadline exceeded`), true}, + // Real net/http wraps context.DeadlineExceeded via *url.Error.Unwrap, + // so errors.Is(err, context.DeadlineExceeded) catches it. The + // pre-892de784 substring "context deadline exceeded" fallback + // also accepted a string-only error like + // `fmt.Errorf("Post: context deadline exceeded")`; that fallback + // was dropped because errors.Is handles the real shape and the + // substring was indistinguishable from a user-content match. + {"wrapped context deadline (errors.Is path)", fmt.Errorf("Post: %w", context.DeadlineExceeded), true}, {"wrapped EOF string", fmt.Errorf(`Post "http://ws-foo:8000": EOF`), true}, {"connection reset", fmt.Errorf("read tcp 127.0.0.1:8080->127.0.0.1:12345: connection reset by peer"), true}, {"generic dns error", fmt.Errorf("no such host"), false}, diff --git a/workspace-server/migrations/043_workspace_status_enum.down.sql b/workspace-server/migrations/043_workspace_status_enum.down.sql index 88ecfbc1..4df43519 100644 --- a/workspace-server/migrations/043_workspace_status_enum.down.sql +++ b/workspace-server/migrations/043_workspace_status_enum.down.sql @@ -5,6 +5,11 @@ BEGIN; +-- Symmetric with the up migration: a rollback under the same load +-- that motivated the up-file's 5s lock_timeout would otherwise stall +-- writers indefinitely. +SET LOCAL lock_timeout = '5s'; + ALTER TABLE workspaces ALTER COLUMN status DROP DEFAULT; diff --git a/workspace/claude_sdk_executor.py b/workspace/claude_sdk_executor.py index af3abc71..a0a89c5d 100644 --- a/workspace/claude_sdk_executor.py +++ b/workspace/claude_sdk_executor.py @@ -541,7 +541,15 @@ class ClaudeSDKExecutor(AgentExecutor): # cleanly, the SDK clearly works again. Clear so the next # heartbeat reports runtime_state empty and the platform flips # status degraded → online without a manual restart. - _clear_sdk_wedge_on_success() + # + # Gate on actual content from the stream so a degenerate + # "iterator returned without raising but emitted nothing" + # case (possible from a partial stream or a stub SDK) doesn't + # falsely advertise recovery. A real successful query yields + # at least a ResultMessage (sets result_text) or one + # AssistantMessage TextBlock (populates assistant_chunks). + if result_text is not None or assistant_chunks: + _clear_sdk_wedge_on_success() return QueryResult(text=text, session_id=session_id) # ------------------------------------------------------------------