From 8e508a7a2fac4f689e1aa8f253d4d4e5c35d9122 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 01:39:04 -0700 Subject: [PATCH] fix(a2a): cover CF 521/522/523 in dead-origin status set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Independent review on PR #2362 caught: the dead-agent classifier at a2a_proxy.go included 502/503/504/524 but missed the rest of the CF origin-failure family (521/522/523), which are MORE indicative of a dead EC2 than 524: - 521 "Web server is down" — CF can't open TCP to origin (most direct dead-EC2 signal; fires when the workspace EC2 has been terminated and CF still has the CNAME pointing at it). - 522 "Connection timed out" — TCP didn't complete in ~15s (typical of SG/NACL flap or agent process hung on accept). - 523 "Origin is unreachable" — CF can't route to origin (DNS gone, network path broken). Pre-fix any of these would propagate as-is to the canvas and the user would see a 5xx without the reactive auto-restart firing — exactly the SaaS-blind class of failure PR #2362 was meant to close. Refactor: extracted isUpstreamDeadStatus(int) helper so the matrix is in one place, with TestIsUpstreamDeadStatus locking in 18 status codes (7 dead, 11 not-dead including 520 and 525 which look CF-shaped but indicate different failures). Also tightened TestStopForRestart_NoProvisioner_NoOp per the same review: now uses sqlmock.ExpectationsWereMet to assert the dispatcher doesn't touch the DB on the both-nil path. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/a2a_proxy.go | 39 ++++++++++--- .../internal/handlers/a2a_proxy_test.go | 55 +++++++++++++++++-- 2 files changed, 81 insertions(+), 13 deletions(-) diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go index f5668730..b1fd4fd7 100644 --- a/workspace-server/internal/handlers/a2a_proxy.go +++ b/workspace-server/internal/handlers/a2a_proxy.go @@ -181,6 +181,35 @@ func isUpstreamBusyError(err error) bool { strings.Contains(msg, "connection reset") } +// isUpstreamDeadStatus returns true when the upstream HTTP status indicates +// the workspace agent is unreachable / unresponsive at the network layer +// (vs an agent-authored 5xx with a real body). Used by the proxy to gate +// reactive container-dead detection + auto-restart. +// +// - 502 Bad Gateway, 503 Service Unavailable, 504 Gateway Timeout: standard +// proxy-layer "upstream is broken" codes (Cloudflare, ELB, agent tunnel). +// - 521 Web Server Is Down: Cloudflare can't open TCP to origin (most +// direct dead-EC2 signal). +// - 522 Connection Timed Out: Cloudflare couldn't complete the TCP handshake +// to origin within ~15s — typical of SG/NACL flap or agent hung on accept. +// - 523 Origin Is Unreachable: Cloudflare can't route to origin (DNS or +// network-path failure). +// - 524 A Timeout Occurred: TCP succeeded, but origin didn't return +// headers within ~100s — agent process alive but wedged. +// +// We always probe IsRunning before acting, so a transient false positive +// from this set just costs one CP API call. 
+func isUpstreamDeadStatus(status int) bool { + switch status { + case http.StatusBadGateway, // 502 + http.StatusServiceUnavailable, // 503 + http.StatusGatewayTimeout, // 504 + 521, 522, 523, 524: // CF dead-origin family + return true + } + return false +} + func (e *proxyA2AError) Error() string { if e == nil || e.Response == nil { return "proxy a2a error" @@ -484,15 +513,7 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri // error body. We probe IsRunning regardless (it's the // authoritative check) but the empty-body case is what makes // this fix necessary. - // 524 = Cloudflare "origin timed out" — origin accepted the - // connection but didn't return headers within ~100s. Same - // signal as 504/502 for our purposes: the upstream agent is - // not responsive. We probe IsRunning to confirm before - // triggering a restart. - if resp.StatusCode == http.StatusBadGateway || - resp.StatusCode == http.StatusServiceUnavailable || - resp.StatusCode == http.StatusGatewayTimeout || - resp.StatusCode == 524 { + if isUpstreamDeadStatus(resp.StatusCode) { if h.maybeMarkContainerDead(ctx, workspaceID) { return 0, nil, &proxyA2AError{ Status: http.StatusServiceUnavailable, diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go index 20d27e7f..d1334a00 100644 --- a/workspace-server/internal/handlers/a2a_proxy_test.go +++ b/workspace-server/internal/handlers/a2a_proxy_test.go @@ -919,6 +919,46 @@ func TestIsUpstreamBusyError(t *testing.T) { } } +// TestIsUpstreamDeadStatus locks in the status-code matrix that gates +// reactive container-dead detection. Order matters: the helper exists so +// the proxy + any future caller (e.g. a sweeper) classify CF dead-origin +// codes the same way. Drift here would re-introduce the SaaS-blind bug +// for whichever code we forgot. 
+func TestIsUpstreamDeadStatus(t *testing.T) { + cases := []struct { + name string + status int + want bool + }{ + // Standard proxy-layer dead-upstream codes + {"502 BadGateway", 502, true}, + {"503 ServiceUnavailable", 503, true}, + {"504 GatewayTimeout", 504, true}, + // Cloudflare dead-origin family + {"521 WebServerDown", 521, true}, + {"522 ConnectionTimedOut", 522, true}, + {"523 OriginUnreachable", 523, true}, + {"524 OriginTimedOut", 524, true}, + // Negative cases — must NOT trigger restart + {"200 OK", 200, false}, + {"400 BadRequest (agent rejected payload)", 400, false}, + {"401 Unauthorized", 401, false}, + {"404 NotFound (no such session)", 404, false}, + {"408 RequestTimeout (client-side)", 408, false}, + {"429 TooManyRequests (rate limited, agent alive)", 429, false}, + {"500 InternalServerError (agent crashed mid-request)", 500, false}, + {"501 NotImplemented", 501, false}, + {"505 HTTPVersionNotSupported", 505, false}, + {"520 WebServerReturnedUnknown (agent returned malformed)", 520, false}, + {"525 SSLHandshakeFailed (TLS misconfig, not dead origin)", 525, false}, + } + for _, tc := range cases { + if got := isUpstreamDeadStatus(tc.status); got != tc.want { + t.Errorf("%s: isUpstreamDeadStatus(%d) = %v, want %v", tc.name, tc.status, got, tc.want) + } + } +} + // ==================== ProxyA2A — upstream timeout returns 503 busy + Retry-After ==================== // Verifies the full error-shaping contract for the 503-busy path: @@ -1840,14 +1880,21 @@ func TestStopForRestart_SaaSPath_DispatchesViaCPProv(t *testing.T) { } } -// Both nil → no-op, no panic. Defensive guard against the dispatcher being -// invoked on a misconfigured handler. +// Both nil → no-op, no panic, no DB / broadcast side effects. Guards the +// dispatcher against being invoked on a misconfigured handler. 
Important +// because runRestartCycle's surrounding flow (status='provisioning' UPDATE +// + broadcast) MUST happen even when both provisioners are nil — but +// stopForRestart itself is a pure dispatcher and shouldn't touch state. func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) { + mock := setupTestDB(t) setupTestRedis(t) handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()) - // no provisioner, no cpProv + // no provisioner, no cpProv, no DB expectations set on mock — any + // unexpected query/exec will produce a sqlmock error. handler.stopForRestart(context.Background(), "ws-orphan") - // no panic, no calls — assertion is reaching this line + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("stopForRestart no-provisioner path should not touch DB: %v", err) + } } // fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise