fix(a2a): cover CF 521/522/523 in dead-origin status set

Independent review on PR #2362 caught that the dead-agent classifier in
a2a_proxy.go included 502/503/504/524 but missed the rest of the CF
origin-failure family (521/522/523), which are MORE indicative of a
dead EC2 than 524:

- 521 "Web server is down" — CF can't open TCP to origin (most direct
  dead-EC2 signal; fires when the workspace EC2 has been terminated
  and CF still has the CNAME pointing at it).
- 522 "Connection timed out" — TCP didn't complete in ~15s (typical
  of SG/NACL flap or agent process hung on accept).
- 523 "Origin is unreachable" — CF can't route to origin (DNS gone,
  network path broken).

Pre-fix, any of these would propagate as-is to the canvas and the user
would see a 5xx without the reactive auto-restart firing, which is
exactly the SaaS-blind class of failure that PR #2362 was meant to close.

Refactor: extracted an isUpstreamDeadStatus(int) helper so the matrix
is in one place, with TestIsUpstreamDeadStatus locking in 18 status
codes (7 dead, 11 not-dead, including 520 and 525, which look CF-shaped
but indicate different failures).

Also tightened TestStopForRestart_NoProvisioner_NoOp per the same
review: now uses sqlmock.ExpectationsWereMet to assert the dispatcher
doesn't touch the DB on the both-nil path.
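
For reference, a minimal standalone sketch of that sqlmock pattern
(not the repo's test, which is in the diff below; the package name and
the dispatchNoOp stand-in are made up for illustration): register zero
expectations, run the code path, and treat any recorded DB interaction
as a failure.

package handlers_test // assumed package name

import (
	"database/sql"
	"testing"

	"github.com/DATA-DOG/go-sqlmock"
)

// dispatchNoOp is a hypothetical stand-in for stopForRestart's both-nil
// path: it must return without issuing any query or exec against db.
func dispatchNoOp(db *sql.DB) {}

func TestDispatchNoOp_DoesNotTouchDB(t *testing.T) {
	db, mock, err := sqlmock.New()
	if err != nil {
		t.Fatalf("sqlmock.New: %v", err)
	}
	defer db.Close()

	// No ExpectQuery/ExpectExec registered: any statement the code under
	// test issues comes back from the mock driver as a "was not expected"
	// error, and ExpectationsWereMet reports any declared-but-unmet ones.
	dispatchNoOp(db)

	if err := mock.ExpectationsWereMet(); err != nil {
		t.Fatalf("no-op path should not touch the DB: %v", err)
	}
}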

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Hongming Wang 2026-04-30 01:39:04 -07:00
parent 2742c3c837
commit 8e508a7a2f
2 changed files with 81 additions and 13 deletions


@@ -181,6 +181,35 @@ func isUpstreamBusyError(err error) bool {
		strings.Contains(msg, "connection reset")
}
// isUpstreamDeadStatus returns true when the upstream HTTP status indicates
// the workspace agent is unreachable / unresponsive at the network layer
// (vs an agent-authored 5xx with a real body). Used by the proxy to gate
// reactive container-dead detection + auto-restart.
//
// - 502 Bad Gateway, 503 Service Unavailable, 504 Gateway Timeout: standard
// proxy-layer "upstream is broken" codes (Cloudflare, ELB, agent tunnel).
// - 521 Web Server Is Down: Cloudflare can't open TCP to origin (most
// direct dead-EC2 signal).
// - 522 Connection Timed Out: Cloudflare opened TCP but no response within
// ~15s — typical of SG/NACL flap or agent process hung.
// - 523 Origin Is Unreachable: Cloudflare can't route to origin (DNS or
// network-path failure).
// - 524 A Timeout Occurred: TCP succeeded, but origin didn't return
// headers within ~100s — agent process alive but wedged.
//
// We always probe IsRunning before acting, so a transient false positive
// from this set just costs one CP API call.
func isUpstreamDeadStatus(status int) bool {
	switch status {
	case http.StatusBadGateway, // 502
		http.StatusServiceUnavailable, // 503
		http.StatusGatewayTimeout, // 504
		521, 522, 523, 524: // CF dead-origin family
		return true
	}
	return false
}
func (e *proxyA2AError) Error() string {
	if e == nil || e.Response == nil {
		return "proxy a2a error"
@@ -484,15 +513,7 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
// error body. We probe IsRunning regardless (it's the
// authoritative check) but the empty-body case is what makes
// this fix necessary.
// 524 = Cloudflare "origin timed out" — origin accepted the
// connection but didn't return headers within ~100s. Same
// signal as 504/502 for our purposes: the upstream agent is
// not responsive. We probe IsRunning to confirm before
// triggering a restart.
if resp.StatusCode == http.StatusBadGateway ||
resp.StatusCode == http.StatusServiceUnavailable ||
resp.StatusCode == http.StatusGatewayTimeout ||
resp.StatusCode == 524 {
if isUpstreamDeadStatus(resp.StatusCode) {
if h.maybeMarkContainerDead(ctx, workspaceID) {
return 0, nil, &proxyA2AError{
Status: http.StatusServiceUnavailable,


@@ -919,6 +919,46 @@ func TestIsUpstreamBusyError(t *testing.T) {
	}
}
// TestIsUpstreamDeadStatus locks in the status-code matrix that gates
// reactive container-dead detection. The helper exists so the proxy and
// any future caller (e.g. a sweeper) classify CF dead-origin codes the
// same way; drift here would re-introduce the SaaS-blind bug for
// whichever code we forgot.
func TestIsUpstreamDeadStatus(t *testing.T) {
	cases := []struct {
		name   string
		status int
		want   bool
	}{
		// Standard proxy-layer dead-upstream codes
		{"502 BadGateway", 502, true},
		{"503 ServiceUnavailable", 503, true},
		{"504 GatewayTimeout", 504, true},
		// Cloudflare dead-origin family
		{"521 WebServerDown", 521, true},
		{"522 ConnectionTimedOut", 522, true},
		{"523 OriginUnreachable", 523, true},
		{"524 OriginTimedOut", 524, true},
		// Negative cases — must NOT trigger restart
		{"200 OK", 200, false},
		{"400 BadRequest (agent rejected payload)", 400, false},
		{"401 Unauthorized", 401, false},
		{"404 NotFound (no such session)", 404, false},
		{"408 RequestTimeout (client-side)", 408, false},
		{"429 TooManyRequests (rate limited, agent alive)", 429, false},
		{"500 InternalServerError (agent crashed mid-request)", 500, false},
		{"501 NotImplemented", 501, false},
		{"505 HTTPVersionNotSupported", 505, false},
		{"520 WebServerReturnedUnknown (agent returned malformed)", 520, false},
		{"525 SSLHandshakeFailed (TLS misconfig, not dead origin)", 525, false},
	}
	for _, tc := range cases {
		if got := isUpstreamDeadStatus(tc.status); got != tc.want {
			t.Errorf("%s: isUpstreamDeadStatus(%d) = %v, want %v", tc.name, tc.status, got, tc.want)
		}
	}
}
// ==================== ProxyA2A — upstream timeout returns 503 busy + Retry-After ====================
// Verifies the full error-shaping contract for the 503-busy path:
@@ -1840,14 +1880,21 @@ func TestStopForRestart_SaaSPath_DispatchesViaCPProv(t *testing.T) {
}
}
// Both nil → no-op, no panic. Defensive guard against the dispatcher being
// invoked on a misconfigured handler.
// Both nil → no-op, no panic, no DB / broadcast side effects. Guards the
// dispatcher against being invoked on a misconfigured handler. Important
// because runRestartCycle's surrounding flow (status='provisioning' UPDATE
// + broadcast) MUST happen even when both provisioners are nil — but
// stopForRestart itself is a pure dispatcher and shouldn't touch state.
func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)
	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
	// no provisioner, no cpProv
	// no provisioner, no cpProv, no DB expectations set on mock — any
	// unexpected query/exec will produce a sqlmock error.
	handler.stopForRestart(context.Background(), "ws-orphan")
	// no panic, no calls — assertion is reaching this line
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Fatalf("stopForRestart no-provisioner path should not touch DB: %v", err)
	}
}
// fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise