chore: second-pass review polish — symmetry + clearer test fixtures

Round-2 review of the wedge/idle/progress bundle came back Approve
with 4 optional polish items. All taken:

1. Migration 043 down file gained `SET LOCAL lock_timeout = '5s'`
   matching the up file. A rollback under the same load that
   motivated the up-file guard would otherwise stall writers.

2. _clear_sdk_wedge_on_success now gates on actual stream content
   (result_text or assistant_chunks). A degenerate "iterator
   returned without raising but emitted nothing" case (possible
   from a partial stream or stub SDK) no longer falsely advertises
   recovery — only a real successful query (≥1 ResultMessage or
   AssistantMessage TextBlock) clears the wedge.

3. isUpstreamBusyError dropped the redundant
   `strings.Contains(msg, "context deadline exceeded")` fallback.
   *url.Error.Unwrap propagates the typed sentinel since Go 1.13;
   errors.Is(err, context.DeadlineExceeded) catches the real
   net/http shape. The substring was a foot-gun (would also match
   user-content with that phrase). Test fixture updated to use
   `fmt.Errorf("Post: %w", context.DeadlineExceeded)` which
   reflects what net/http actually returns.

4. TestIsUpstreamBusyError added a context.Canceled case (both
   typed and wrapped via %w) — pins the new applyIdleTimeout
   classification.

No critical/required findings on second pass; reviewer verdict was
Approve. Items above are polish for symmetry and test clarity.

1010 canvas + 64 Python + full Go suites pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-25 08:48:30 -07:00
parent 892de784b3
commit 3c4eef49aa
4 changed files with 37 additions and 13 deletions

View File

@ -121,27 +121,26 @@ func isUpstreamBusyError(err error) bool {
if err == nil {
return false
}
// Typed sentinels propagate cleanly through *url.Error.Unwrap
// since Go 1.13, so errors.Is is the primary check for both
// DeadlineExceeded and Canceled. The substring fallbacks below
// stay only for shapes net/http does NOT type — bare "EOF" /
// "connection reset" can arrive as plain *net.OpError with no
// errors.Is hook to the stdlib sentinels.
if errors.Is(err, context.DeadlineExceeded) {
return true
}
// applyIdleTimeout cancels via context.WithCancel when the
// broadcaster silence window elapses — context.Canceled
// propagates cleanly through errors.Is, no substring fallback
// needed (and a substring on "context canceled" would also match
// healthy client-side aborts which we don't want to label busy).
// applyIdleTimeout uses context.WithCancel; surfaces here as
// Canceled, distinct from DeadlineExceeded but the same "upstream
// busy" class — caller produces a 503 + Retry-After.
if errors.Is(err, context.Canceled) {
return true
}
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
return true
}
// url.Error wraps "read tcp … EOF" and "Post …: context deadline
// exceeded" strings from the stdlib HTTP client without typing the
// inner cause. Fall back to substring match for those specific
// shapes only.
msg := err.Error()
return strings.Contains(msg, "context deadline exceeded") ||
strings.Contains(msg, "EOF") ||
return strings.Contains(msg, "EOF") ||
strings.Contains(msg, "connection reset")
}

View File

@ -601,9 +601,21 @@ func TestIsUpstreamBusyError(t *testing.T) {
}{
{"nil", nil, false},
{"context.DeadlineExceeded", context.DeadlineExceeded, true},
// applyIdleTimeout cancels its child ctx via context.WithCancel
// when the broadcaster silence window elapses — surfaces here
// as context.Canceled. Same "upstream busy" classification.
{"context.Canceled", context.Canceled, true},
{"wrapped context.Canceled", fmt.Errorf("dispatch wrapped: %w", context.Canceled), true},
{"io.EOF", io.EOF, true},
{"io.ErrUnexpectedEOF", io.ErrUnexpectedEOF, true},
{"wrapped context deadline string", fmt.Errorf(`Post "http://ws-foo:8000": context deadline exceeded`), true},
// Real net/http wraps context.DeadlineExceeded via *url.Error.Unwrap,
// so errors.Is(err, context.DeadlineExceeded) catches it. The
// pre-892de784 substring "context deadline exceeded" fallback
// also accepted a string-only error like
// `fmt.Errorf("Post: context deadline exceeded")`; that fallback
// was dropped because errors.Is handles the real shape and the
// substring was indistinguishable from a user-content match.
{"wrapped context deadline (errors.Is path)", fmt.Errorf("Post: %w", context.DeadlineExceeded), true},
{"wrapped EOF string", fmt.Errorf(`Post "http://ws-foo:8000": EOF`), true},
{"connection reset", fmt.Errorf("read tcp 127.0.0.1:8080->127.0.0.1:12345: connection reset by peer"), true},
{"generic dns error", fmt.Errorf("no such host"), false},

View File

@ -5,6 +5,11 @@
BEGIN;
-- Symmetric with the up migration: a rollback under the same load
-- that motivated the up-file's 5s lock_timeout would otherwise stall
-- writers indefinitely.
SET LOCAL lock_timeout = '5s';
ALTER TABLE workspaces
ALTER COLUMN status DROP DEFAULT;

View File

@ -541,7 +541,15 @@ class ClaudeSDKExecutor(AgentExecutor):
# cleanly, the SDK clearly works again. Clear so the next
# heartbeat reports runtime_state empty and the platform flips
# status degraded → online without a manual restart.
_clear_sdk_wedge_on_success()
#
# Gate on actual content from the stream so a degenerate
# "iterator returned without raising but emitted nothing"
# case (possible from a partial stream or a stub SDK) doesn't
# falsely advertise recovery. A real successful query yields
# at least a ResultMessage (sets result_text) or one
# AssistantMessage TextBlock (populates assistant_chunks).
if result_text is not None or assistant_chunks:
_clear_sdk_wedge_on_success()
return QueryResult(text=text, session_id=session_id)
# ------------------------------------------------------------------