From 8e508a7a2fac4f689e1aa8f253d4d4e5c35d9122 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 01:39:04 -0700 Subject: [PATCH] fix(a2a): cover CF 521/522/523 in dead-origin status set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Independent review on PR #2362 caught: the dead-agent classifier at a2a_proxy.go included 502/503/504/524 but missed the rest of the CF origin-failure family (521/522/523), which are MORE indicative of a dead EC2 than 524: - 521 "Web server is down" — CF can't open TCP to origin (most direct dead-EC2 signal; fires when the workspace EC2 has been terminated and CF still has the CNAME pointing at it). - 522 "Connection timed out" — TCP didn't complete in ~15s (typical of SG/NACL flap or agent process hung on accept). - 523 "Origin is unreachable" — CF can't route to origin (DNS gone, network path broken). Pre-fix any of these would propagate as-is to the canvas and the user would see a 5xx without the reactive auto-restart firing — exactly the SaaS-blind class of failure PR #2362 was meant to close. Refactor: extracted isUpstreamDeadStatus(int) helper so the matrix is in one place, with TestIsUpstreamDeadStatus locking in 18 status codes (7 dead, 11 not-dead including 520 and 525 which look CF-shaped but indicate different failures). Also tightened TestStopForRestart_NoProvisioner_NoOp per the same review: now uses sqlmock.ExpectationsWereMet to assert the dispatcher doesn't touch the DB on the both-nil path. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/a2a_proxy.go | 39 ++++++++++--- .../internal/handlers/a2a_proxy_test.go | 55 +++++++++++++++++-- 2 files changed, 81 insertions(+), 13 deletions(-) diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go index f5668730..b1fd4fd7 100644 --- a/workspace-server/internal/handlers/a2a_proxy.go +++ b/workspace-server/internal/handlers/a2a_proxy.go @@ -181,6 +181,35 @@ func isUpstreamBusyError(err error) bool { strings.Contains(msg, "connection reset") } +// isUpstreamDeadStatus returns true when the upstream HTTP status indicates +// the workspace agent is unreachable / unresponsive at the network layer +// (vs an agent-authored 5xx with a real body). Used by the proxy to gate +// reactive container-dead detection + auto-restart. +// +// - 502 Bad Gateway, 503 Service Unavailable, 504 Gateway Timeout: standard +// proxy-layer "upstream is broken" codes (Cloudflare, ELB, agent tunnel). +// - 521 Web Server Is Down: Cloudflare can't open TCP to origin (most +// direct dead-EC2 signal). +// - 522 Connection Timed Out: Cloudflare couldn't complete the TCP handshake +// to origin within ~15s — typical of SG/NACL flap or agent hung on accept. +// - 523 Origin Is Unreachable: Cloudflare can't route to origin (DNS or +// network-path failure). +// - 524 A Timeout Occurred: TCP succeeded, but origin didn't return +// headers within ~100s — agent process alive but wedged. +// +// We always probe IsRunning before acting, so a transient false positive +// from this set just costs one CP API call. 
+func isUpstreamDeadStatus(status int) bool { + switch status { + case http.StatusBadGateway, // 502 + http.StatusServiceUnavailable, // 503 + http.StatusGatewayTimeout, // 504 + 521, 522, 523, 524: // CF dead-origin family + return true + } + return false +} + func (e *proxyA2AError) Error() string { if e == nil || e.Response == nil { return "proxy a2a error" @@ -484,15 +513,7 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri // error body. We probe IsRunning regardless (it's the // authoritative check) but the empty-body case is what makes // this fix necessary. - // 524 = Cloudflare "origin timed out" — origin accepted the - // connection but didn't return headers within ~100s. Same - // signal as 504/502 for our purposes: the upstream agent is - // not responsive. We probe IsRunning to confirm before - // triggering a restart. - if resp.StatusCode == http.StatusBadGateway || - resp.StatusCode == http.StatusServiceUnavailable || - resp.StatusCode == http.StatusGatewayTimeout || - resp.StatusCode == 524 { + if isUpstreamDeadStatus(resp.StatusCode) { if h.maybeMarkContainerDead(ctx, workspaceID) { return 0, nil, &proxyA2AError{ Status: http.StatusServiceUnavailable, diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go index 20d27e7f..d1334a00 100644 --- a/workspace-server/internal/handlers/a2a_proxy_test.go +++ b/workspace-server/internal/handlers/a2a_proxy_test.go @@ -919,6 +919,46 @@ func TestIsUpstreamBusyError(t *testing.T) { } } +// TestIsUpstreamDeadStatus locks in the status-code matrix that gates +// reactive container-dead detection. Order matters: the helper exists so +// the proxy + any future caller (e.g. a sweeper) classify CF dead-origin +// codes the same way. Drift here would re-introduce the SaaS-blind bug +// for whichever code we forgot. 
+func TestIsUpstreamDeadStatus(t *testing.T) { + cases := []struct { + name string + status int + want bool + }{ + // Standard proxy-layer dead-upstream codes + {"502 BadGateway", 502, true}, + {"503 ServiceUnavailable", 503, true}, + {"504 GatewayTimeout", 504, true}, + // Cloudflare dead-origin family + {"521 WebServerDown", 521, true}, + {"522 ConnectionTimedOut", 522, true}, + {"523 OriginUnreachable", 523, true}, + {"524 OriginTimedOut", 524, true}, + // Negative cases — must NOT trigger restart + {"200 OK", 200, false}, + {"400 BadRequest (agent rejected payload)", 400, false}, + {"401 Unauthorized", 401, false}, + {"404 NotFound (no such session)", 404, false}, + {"408 RequestTimeout (client-side)", 408, false}, + {"429 TooManyRequests (rate limited, agent alive)", 429, false}, + {"500 InternalServerError (agent crashed mid-request)", 500, false}, + {"501 NotImplemented", 501, false}, + {"505 HTTPVersionNotSupported", 505, false}, + {"520 WebServerReturnedUnknown (agent returned malformed)", 520, false}, + {"525 SSLHandshakeFailed (TLS misconfig, not dead origin)", 525, false}, + } + for _, tc := range cases { + if got := isUpstreamDeadStatus(tc.status); got != tc.want { + t.Errorf("%s: isUpstreamDeadStatus(%d) = %v, want %v", tc.name, tc.status, got, tc.want) + } + } +} + // ==================== ProxyA2A — upstream timeout returns 503 busy + Retry-After ==================== // Verifies the full error-shaping contract for the 503-busy path: @@ -1840,14 +1880,21 @@ func TestStopForRestart_SaaSPath_DispatchesViaCPProv(t *testing.T) { } } -// Both nil → no-op, no panic. Defensive guard against the dispatcher being -// invoked on a misconfigured handler. +// Both nil → no-op, no panic, no DB / broadcast side effects. Guards the +// dispatcher against being invoked on a misconfigured handler. 
Important +// because runRestartCycle's surrounding flow (status='provisioning' UPDATE +// + broadcast) MUST happen even when both provisioners are nil — but +// stopForRestart itself is a pure dispatcher and shouldn't touch state. func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) { + mock := setupTestDB(t) setupTestRedis(t) handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()) - // no provisioner, no cpProv + // no provisioner, no cpProv, no DB expectations set on mock — any + // unexpected query/exec will produce a sqlmock error. handler.stopForRestart(context.Background(), "ws-orphan") - // no panic, no calls — assertion is reaching this line + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("stopForRestart no-provisioner path should not touch DB: %v", err) + } } // fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise