diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go
index f5668730..b1fd4fd7 100644
--- a/workspace-server/internal/handlers/a2a_proxy.go
+++ b/workspace-server/internal/handlers/a2a_proxy.go
@@ -181,6 +181,35 @@ func isUpstreamBusyError(err error) bool {
 		strings.Contains(msg, "connection reset")
 }
 
+// isUpstreamDeadStatus returns true when the upstream HTTP status indicates
+// the workspace agent is unreachable / unresponsive at the network layer
+// (vs an agent-authored 5xx with a real body). Used by the proxy to gate
+// reactive container-dead detection + auto-restart.
+//
+// - 502 Bad Gateway, 503 Service Unavailable, 504 Gateway Timeout: standard
+//   proxy-layer "upstream is broken" codes (Cloudflare, ELB, agent tunnel).
+// - 521 Web Server Is Down: Cloudflare can't open TCP to origin (most
+//   direct dead-EC2 signal).
+// - 522 Connection Timed Out: Cloudflare opened TCP but no response within
+//   ~15s — typical of SG/NACL flap or agent process hung.
+// - 523 Origin Is Unreachable: Cloudflare can't route to origin (DNS or
+//   network-path failure).
+// - 524 A Timeout Occurred: TCP succeeded, but origin didn't return
+//   headers within ~100s — agent process alive but wedged.
+//
+// We always probe IsRunning before acting, so a transient false positive
+// from this set just costs one CP API call.
+func isUpstreamDeadStatus(status int) bool {
+	switch status {
+	case http.StatusBadGateway, // 502
+		http.StatusServiceUnavailable, // 503
+		http.StatusGatewayTimeout, // 504
+		521, 522, 523, 524: // CF dead-origin family
+		return true
+	}
+	return false
+}
+
 func (e *proxyA2AError) Error() string {
 	if e == nil || e.Response == nil {
 		return "proxy a2a error"
@@ -484,15 +513,7 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
 			// error body. We probe IsRunning regardless (it's the
 			// authoritative check) but the empty-body case is what makes
 			// this fix necessary.
-			// 524 = Cloudflare "origin timed out" — origin accepted the
-			// connection but didn't return headers within ~100s. Same
-			// signal as 504/502 for our purposes: the upstream agent is
-			// not responsive. We probe IsRunning to confirm before
-			// triggering a restart.
-			if resp.StatusCode == http.StatusBadGateway ||
-				resp.StatusCode == http.StatusServiceUnavailable ||
-				resp.StatusCode == http.StatusGatewayTimeout ||
-				resp.StatusCode == 524 {
+			if isUpstreamDeadStatus(resp.StatusCode) {
 				if h.maybeMarkContainerDead(ctx, workspaceID) {
 					return 0, nil, &proxyA2AError{
 						Status: http.StatusServiceUnavailable,
diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go
index 20d27e7f..d1334a00 100644
--- a/workspace-server/internal/handlers/a2a_proxy_test.go
+++ b/workspace-server/internal/handlers/a2a_proxy_test.go
@@ -919,6 +919,46 @@ func TestIsUpstreamBusyError(t *testing.T) {
 	}
 }
 
+// TestIsUpstreamDeadStatus locks in the status-code matrix that gates
+// reactive container-dead detection. Parity matters: the helper exists so
+// the proxy + any future caller (e.g. a sweeper) classify CF dead-origin
+// codes the same way. Drift here would re-introduce the SaaS-blind bug
+// for whichever code we forgot.
+func TestIsUpstreamDeadStatus(t *testing.T) {
+	cases := []struct {
+		name   string
+		status int
+		want   bool
+	}{
+		// Standard proxy-layer dead-upstream codes
+		{"502 BadGateway", 502, true},
+		{"503 ServiceUnavailable", 503, true},
+		{"504 GatewayTimeout", 504, true},
+		// Cloudflare dead-origin family
+		{"521 WebServerDown", 521, true},
+		{"522 ConnectionTimedOut", 522, true},
+		{"523 OriginUnreachable", 523, true},
+		{"524 OriginTimedOut", 524, true},
+		// Negative cases — must NOT trigger restart
+		{"200 OK", 200, false},
+		{"400 BadRequest (agent rejected payload)", 400, false},
+		{"401 Unauthorized", 401, false},
+		{"404 NotFound (no such session)", 404, false},
+		{"408 RequestTimeout (client-side)", 408, false},
+		{"429 TooManyRequests (rate limited, agent alive)", 429, false},
+		{"500 InternalServerError (agent crashed mid-request)", 500, false},
+		{"501 NotImplemented", 501, false},
+		{"505 HTTPVersionNotSupported", 505, false},
+		{"520 WebServerReturnedUnknown (agent returned malformed)", 520, false},
+		{"525 SSLHandshakeFailed (TLS misconfig, not dead origin)", 525, false},
+	}
+	for _, tc := range cases {
+		if got := isUpstreamDeadStatus(tc.status); got != tc.want {
+			t.Errorf("%s: isUpstreamDeadStatus(%d) = %v, want %v", tc.name, tc.status, got, tc.want)
+		}
+	}
+}
+
 // ==================== ProxyA2A — upstream timeout returns 503 busy + Retry-After ====================
 
 // Verifies the full error-shaping contract for the 503-busy path:
@@ -1840,14 +1880,21 @@ func TestStopForRestart_SaaSPath_DispatchesViaCPProv(t *testing.T) {
 	}
 }
 
-// Both nil → no-op, no panic. Defensive guard against the dispatcher being
-// invoked on a misconfigured handler.
+// Both nil → no-op, no panic, no DB / broadcast side effects. Guards the
+// dispatcher against being invoked on a misconfigured handler. Important
+// because runRestartCycle's surrounding flow (status='provisioning' UPDATE
+// + broadcast) MUST happen even when both provisioners are nil — but
+// stopForRestart itself is a pure dispatcher and shouldn't touch state.
 func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) {
+	mock := setupTestDB(t)
 	setupTestRedis(t)
 	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
-	// no provisioner, no cpProv
+	// no provisioner, no cpProv, no DB expectations set on mock — any
+	// unexpected query/exec will produce a sqlmock error.
 	handler.stopForRestart(context.Background(), "ws-orphan")
-	// no panic, no calls — assertion is reaching this line
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("stopForRestart no-provisioner path should not touch DB: %v", err)
+	}
 }
 
 // fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise