Merge pull request #2365 from Molecule-AI/auto/cf-dead-origin-status-gap

fix(a2a): cover CF 521/522/523 in dead-origin status set
This commit is contained in:
Hongming Wang 2026-04-30 08:51:28 +00:00 committed by GitHub
commit 49c3433a70
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 81 additions and 13 deletions

View File

@ -181,6 +181,35 @@ func isUpstreamBusyError(err error) bool {
strings.Contains(msg, "connection reset")
}
// isUpstreamDeadStatus reports whether an upstream HTTP status code means
// the workspace agent is unreachable or unresponsive at the network layer,
// as opposed to an agent-authored 5xx carrying a real body. The proxy uses
// it to gate reactive container-dead detection and auto-restart.
//
// Matched codes:
//   - 502 Bad Gateway / 503 Service Unavailable / 504 Gateway Timeout:
//     standard proxy-layer "upstream is broken" responses (Cloudflare,
//     ELB, agent tunnel).
//   - 521 Web Server Is Down: Cloudflare can't open TCP to origin (most
//     direct dead-EC2 signal).
//   - 522 Connection Timed Out: TCP opened but no response within ~15s —
//     typical of SG/NACL flap or a hung agent process.
//   - 523 Origin Is Unreachable: Cloudflare can't route to origin (DNS or
//     network-path failure).
//   - 524 A Timeout Occurred: TCP succeeded, but origin returned no
//     headers within ~100s — agent process alive but wedged.
//
// Callers always probe IsRunning before acting, so a transient false
// positive from this set just costs one CP API call.
func isUpstreamDeadStatus(status int) bool {
	switch status {
	case http.StatusBadGateway, // 502
		http.StatusServiceUnavailable, // 503
		http.StatusGatewayTimeout: // 504
		return true
	case 521, 522, 523, 524:
		// Cloudflare dead-origin family.
		return true
	default:
		return false
	}
}
func (e *proxyA2AError) Error() string {
if e == nil || e.Response == nil {
return "proxy a2a error"
@ -484,15 +513,7 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
// error body. We probe IsRunning regardless (it's the
// authoritative check) but the empty-body case is what makes
// this fix necessary.
// 524 = Cloudflare "origin timed out" — origin accepted the
// connection but didn't return headers within ~100s. Same
// signal as 504/502 for our purposes: the upstream agent is
// not responsive. We probe IsRunning to confirm before
// triggering a restart.
if resp.StatusCode == http.StatusBadGateway ||
resp.StatusCode == http.StatusServiceUnavailable ||
resp.StatusCode == http.StatusGatewayTimeout ||
resp.StatusCode == 524 {
if isUpstreamDeadStatus(resp.StatusCode) {
if h.maybeMarkContainerDead(ctx, workspaceID) {
return 0, nil, &proxyA2AError{
Status: http.StatusServiceUnavailable,

View File

@ -919,6 +919,46 @@ func TestIsUpstreamBusyError(t *testing.T) {
}
}
// TestIsUpstreamDeadStatus pins the exact status-code matrix that gates
// reactive container-dead detection. The helper exists so the proxy and any
// future caller (e.g. a sweeper) classify CF dead-origin codes the same
// way; drift here would re-introduce the SaaS-blind bug for whichever code
// we forgot.
func TestIsUpstreamDeadStatus(t *testing.T) {
	// check asserts a single classification, reporting the case name on
	// mismatch so failures identify the drifted code at a glance.
	check := func(name string, status int, want bool) {
		if got := isUpstreamDeadStatus(status); got != want {
			t.Errorf("%s: isUpstreamDeadStatus(%d) = %v, want %v", name, status, got, want)
		}
	}

	// Standard proxy-layer dead-upstream codes
	check("502 BadGateway", 502, true)
	check("503 ServiceUnavailable", 503, true)
	check("504 GatewayTimeout", 504, true)

	// Cloudflare dead-origin family
	check("521 WebServerDown", 521, true)
	check("522 ConnectionTimedOut", 522, true)
	check("523 OriginUnreachable", 523, true)
	check("524 OriginTimedOut", 524, true)

	// Negative cases — must NOT trigger restart
	check("200 OK", 200, false)
	check("400 BadRequest (agent rejected payload)", 400, false)
	check("401 Unauthorized", 401, false)
	check("404 NotFound (no such session)", 404, false)
	check("408 RequestTimeout (client-side)", 408, false)
	check("429 TooManyRequests (rate limited, agent alive)", 429, false)
	check("500 InternalServerError (agent crashed mid-request)", 500, false)
	check("501 NotImplemented", 501, false)
	check("505 HTTPVersionNotSupported", 505, false)
	check("520 WebServerReturnedUnknown (agent returned malformed)", 520, false)
	check("525 SSLHandshakeFailed (TLS misconfig, not dead origin)", 525, false)
}
// ==================== ProxyA2A — upstream timeout returns 503 busy + Retry-After ====================
// Verifies the full error-shaping contract for the 503-busy path:
@ -1840,14 +1880,21 @@ func TestStopForRestart_SaaSPath_DispatchesViaCPProv(t *testing.T) {
}
}
// Both nil → no-op, no panic, no DB / broadcast side effects. Guards the
// dispatcher against being invoked on a misconfigured handler. Important
// because runRestartCycle's surrounding flow (status='provisioning' UPDATE
// + broadcast) MUST happen even when both provisioners are nil — but
// stopForRestart itself is a pure dispatcher and shouldn't touch state.
func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)
	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
	// no provisioner, no cpProv, no DB expectations set on mock — any
	// unexpected query/exec will produce a sqlmock error.
	handler.stopForRestart(context.Background(), "ws-orphan")
	// The no-op contract is verified two ways: reaching this line without
	// a panic, and the mock recording zero unexpected DB interactions.
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Fatalf("stopForRestart no-provisioner path should not touch DB: %v", err)
	}
}
// fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise