forked from molecule-ai/molecule-core
fix(a2a): cover CF 521/522/523 in dead-origin status set
Independent review on PR #2362 caught that the dead-agent classifier in a2a_proxy.go included 502/503/504/524 but missed the rest of the CF origin-failure family (521/522/523), which are MORE indicative of a dead EC2 than 524:

- 521 "Web server is down" — CF can't open TCP to the origin (the most direct dead-EC2 signal; fires when the workspace EC2 has been terminated and CF still has the CNAME pointing at it).
- 522 "Connection timed out" — TCP didn't complete within ~15s (typical of an SG/NACL flap or the agent process hung on accept).
- 523 "Origin is unreachable" — CF can't route to the origin (DNS gone, network path broken).

Pre-fix, any of these would propagate as-is to the canvas and the user would see a 5xx without the reactive auto-restart firing — exactly the SaaS-blind class of failure PR #2362 was meant to close.

Refactor: extracted an isUpstreamDeadStatus(int) helper so the matrix lives in one place, with TestIsUpstreamDeadStatus locking in 18 status codes (7 dead, 11 not-dead, including 520 and 525, which look CF-shaped but indicate different failures).

Also tightened TestStopForRestart_NoProvisioner_NoOp per the same review: it now uses sqlmock.ExpectationsWereMet to assert the dispatcher doesn't touch the DB on the both-nil path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent 2742c3c837
commit 8e508a7a2f
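One note on the code below: net/http stops defining status constants at 511 (StatusNetworkAuthenticationRequired), so the Cloudflare origin-failure codes the message describes have to appear as bare integers with trailing comments. If named values were ever wanted they would have to be declared locally; a minimal sketch follows (the constant names are mine, nothing in this commit defines them):

// Hypothetical local names for the Cloudflare origin-failure statuses; the
// standard library defines nothing above 511, and this commit does not add
// these constants. The diff below uses the bare integers instead.
const (
	statusCFWebServerDown      = 521 // CF can't open TCP to the origin
	statusCFConnectionTimedOut = 522 // TCP didn't complete within ~15s
	statusCFOriginUnreachable  = 523 // CF can't route to the origin (DNS/path)
	statusCFOriginTimedOut     = 524 // TCP ok, no response headers within ~100s
)

The diff keeps the bare-int form, which reads fine given the one-line comment on the case arm.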
@@ -181,6 +181,35 @@ func isUpstreamBusyError(err error) bool {
		strings.Contains(msg, "connection reset")
}

// isUpstreamDeadStatus returns true when the upstream HTTP status indicates
// the workspace agent is unreachable / unresponsive at the network layer
// (vs an agent-authored 5xx with a real body). Used by the proxy to gate
// reactive container-dead detection + auto-restart.
//
//   - 502 Bad Gateway, 503 Service Unavailable, 504 Gateway Timeout: standard
//     proxy-layer "upstream is broken" codes (Cloudflare, ELB, agent tunnel).
//   - 521 Web Server Is Down: Cloudflare can't open TCP to origin (most
//     direct dead-EC2 signal).
//   - 522 Connection Timed Out: Cloudflare opened TCP but no response within
//     ~15s — typical of SG/NACL flap or agent process hung.
//   - 523 Origin Is Unreachable: Cloudflare can't route to origin (DNS or
//     network-path failure).
//   - 524 A Timeout Occurred: TCP succeeded, but origin didn't return
//     headers within ~100s — agent process alive but wedged.
//
// We always probe IsRunning before acting, so a transient false positive
// from this set just costs one CP API call.
func isUpstreamDeadStatus(status int) bool {
	switch status {
	case http.StatusBadGateway, // 502
		http.StatusServiceUnavailable, // 503
		http.StatusGatewayTimeout, // 504
		521, 522, 523, 524: // CF dead-origin family
		return true
	}
	return false
}

func (e *proxyA2AError) Error() string {
	if e == nil || e.Response == nil {
		return "proxy a2a error"
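The doc comment above promises that a dead-looking status only ever buys one IsRunning probe. As a consumption sketch only, here is roughly how a second caller, such as the future sweeper the tests below mention, could reuse the classifier with the same order of operations. The cpAPI interface, both method signatures, and classifyThenProbe are assumptions for illustration, not the repo's provisioner.CPProvisionerAPI, and the sketch assumes it sits in the same package as isUpstreamDeadStatus.

package handlers // assumed: same package as isUpstreamDeadStatus above

import "context"

// cpAPI is a stand-in for the control-plane client; both method signatures
// are assumptions for this sketch, not the repo's provisioner.CPProvisionerAPI.
type cpAPI interface {
	IsRunning(ctx context.Context, workspaceID string) (bool, error)
	Restart(ctx context.Context, workspaceID string) error
}

// classifyThenProbe (hypothetical) mirrors the proxy's order of operations:
// a dead-looking status never restarts anything by itself, it only spends
// one IsRunning call, and only a confirmed-not-running workspace is restarted.
func classifyThenProbe(ctx context.Context, cp cpAPI, workspaceID string, status int) error {
	if !isUpstreamDeadStatus(status) {
		return nil // success or agent-authored 5xx: leave it alone
	}
	running, err := cp.IsRunning(ctx, workspaceID)
	if err != nil || running {
		return err // transient false positive: cost was a single CP call
	}
	return cp.Restart(ctx, workspaceID)
}

The point of centralizing the matrix is exactly that such a caller cannot drift from the proxy's classification.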
@@ -484,15 +513,7 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
			// error body. We probe IsRunning regardless (it's the
			// authoritative check) but the empty-body case is what makes
			// this fix necessary.
			// 524 = Cloudflare "origin timed out" — origin accepted the
			// connection but didn't return headers within ~100s. Same
			// signal as 504/502 for our purposes: the upstream agent is
			// not responsive. We probe IsRunning to confirm before
			// triggering a restart.
			if resp.StatusCode == http.StatusBadGateway ||
				resp.StatusCode == http.StatusServiceUnavailable ||
				resp.StatusCode == http.StatusGatewayTimeout ||
				resp.StatusCode == 524 {
			if isUpstreamDeadStatus(resp.StatusCode) {
				if h.maybeMarkContainerDead(ctx, workspaceID) {
					return 0, nil, &proxyA2AError{
						Status: http.StatusServiceUnavailable,
@@ -919,6 +919,46 @@ func TestIsUpstreamBusyError(t *testing.T) {
	}
}

// TestIsUpstreamDeadStatus locks in the status-code matrix that gates
// reactive container-dead detection. Order matters: the helper exists so
// the proxy + any future caller (e.g. a sweeper) classify CF dead-origin
// codes the same way. Drift here would re-introduce the SaaS-blind bug
// for whichever code we forgot.
func TestIsUpstreamDeadStatus(t *testing.T) {
	cases := []struct {
		name   string
		status int
		want   bool
	}{
		// Standard proxy-layer dead-upstream codes
		{"502 BadGateway", 502, true},
		{"503 ServiceUnavailable", 503, true},
		{"504 GatewayTimeout", 504, true},
		// Cloudflare dead-origin family
		{"521 WebServerDown", 521, true},
		{"522 ConnectionTimedOut", 522, true},
		{"523 OriginUnreachable", 523, true},
		{"524 OriginTimedOut", 524, true},
		// Negative cases — must NOT trigger restart
		{"200 OK", 200, false},
		{"400 BadRequest (agent rejected payload)", 400, false},
		{"401 Unauthorized", 401, false},
		{"404 NotFound (no such session)", 404, false},
		{"408 RequestTimeout (client-side)", 408, false},
		{"429 TooManyRequests (rate limited, agent alive)", 429, false},
		{"500 InternalServerError (agent crashed mid-request)", 500, false},
		{"501 NotImplemented", 501, false},
		{"505 HTTPVersionNotSupported", 505, false},
		{"520 WebServerReturnedUnknown (agent returned malformed)", 520, false},
		{"525 SSLHandshakeFailed (TLS misconfig, not dead origin)", 525, false},
	}
	for _, tc := range cases {
		if got := isUpstreamDeadStatus(tc.status); got != tc.want {
			t.Errorf("%s: isUpstreamDeadStatus(%d) = %v, want %v", tc.name, tc.status, got, tc.want)
		}
	}
}

// ==================== ProxyA2A — upstream timeout returns 503 busy + Retry-After ====================

// Verifies the full error-shaping contract for the 503-busy path:
@@ -1840,14 +1880,21 @@ func TestStopForRestart_SaaSPath_DispatchesViaCPProv(t *testing.T) {
	}
}

// Both nil → no-op, no panic. Defensive guard against the dispatcher being
// invoked on a misconfigured handler.
// Both nil → no-op, no panic, no DB / broadcast side effects. Guards the
// dispatcher against being invoked on a misconfigured handler. Important
// because runRestartCycle's surrounding flow (status='provisioning' UPDATE
// + broadcast) MUST happen even when both provisioners are nil — but
// stopForRestart itself is a pure dispatcher and shouldn't touch state.
func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)
	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
	// no provisioner, no cpProv
	// no provisioner, no cpProv, no DB expectations set on mock — any
	// unexpected query/exec will produce a sqlmock error.
	handler.stopForRestart(context.Background(), "ws-orphan")
	// no panic, no calls — assertion is reaching this line
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Fatalf("stopForRestart no-provisioner path should not touch DB: %v", err)
	}
}
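For readers who haven't seen the pattern, here is a sketch of the shape a go-sqlmock-backed setupTestDB helper usually takes; the real helper isn't part of this diff, and how the *sql.DB reaches the handler is repo-specific, so the wiring line is a placeholder. With no expectations registered, go-sqlmock rejects any query or exec at call time, and ExpectationsWereMet reports declared expectations that were never satisfied, which is what gives the assertion above teeth.

package handlers // assumed test package; real file layout not shown in the diff

import (
	"testing"

	sqlmock "github.com/DATA-DOG/go-sqlmock"
)

// setupTestDB, sketched: hands tests a sqlmock handle whose every call must
// match a declared expectation. The wiring into the handler is repo-specific
// and replaced by a placeholder comment here.
func setupTestDB(t *testing.T) sqlmock.Sqlmock {
	t.Helper()
	db, mock, err := sqlmock.New()
	if err != nil {
		t.Fatalf("sqlmock.New: %v", err)
	}
	t.Cleanup(func() { db.Close() })
	// wireHandlerDB(db) // hypothetical: point the code under test at db
	return mock
}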

// fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise