From a27cf8f39fa18a6d1ae43a797bab3fa88faa9dbb Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Thu, 30 Apr 2026 00:58:22 -0700
Subject: [PATCH] fix(restart): extract stopForRestart helper + add 524 to
 dead-agent list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses code-review C1 (test goroutine race) and I2 (CF 524) on PR #2362.

C1: TestRunRestartCycle_SaaSPath_DispatchesViaCPProv invoked runRestartCycle
end-to-end, which spawns `go h.sendRestartContext(...)`. That goroutine
outlived the test, then read db.DB while the next test's setupTestDB wrote
to it — DATA RACE under -race, cascading 30+ failures across the handlers
suite. Refactored: extracted `stopForRestart(ctx, id)` from runRestartCycle
as a pure dispatcher, and rewrote the SaaS-path test to call it directly
(no async goroutine spawned). Added a no-provisioner no-op guard test.

I2: Cloudflare 524 ("origin timed out") now triggers maybeMarkContainerDead
alongside 502/503/504. Same upstream signal — origin agent unresponsive.

Verified `go test -race -count=1 ./internal/handlers/...` green locally.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../internal/handlers/a2a_proxy.go            |  8 +++-
 .../internal/handlers/a2a_proxy_test.go       | 46 +++++++++----------
 .../internal/handlers/workspace_restart.go    | 34 ++++++++------
 3 files changed, 48 insertions(+), 40 deletions(-)

diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go
index 3ce8f4f5..f5668730 100644
--- a/workspace-server/internal/handlers/a2a_proxy.go
+++ b/workspace-server/internal/handlers/a2a_proxy.go
@@ -484,9 +484,15 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
 		// error body. We probe IsRunning regardless (it's the
 		// authoritative check) but the empty-body case is what makes
 		// this fix necessary.
+		// 524 = Cloudflare "origin timed out" — origin accepted the
+		// connection but didn't return headers within ~100s. Same
+		// signal as 504/502 for our purposes: the upstream agent is
+		// not responsive. We probe IsRunning to confirm before
+		// triggering a restart.
 		if resp.StatusCode == http.StatusBadGateway ||
 			resp.StatusCode == http.StatusServiceUnavailable ||
-			resp.StatusCode == http.StatusGatewayTimeout {
+			resp.StatusCode == http.StatusGatewayTimeout ||
+			resp.StatusCode == 524 {
 			if h.maybeMarkContainerDead(ctx, workspaceID) {
 				return 0, nil, &proxyA2AError{
 					Status:   http.StatusServiceUnavailable,
diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go
index ef9bec05..20d27e7f 100644
--- a/workspace-server/internal/handlers/a2a_proxy_test.go
+++ b/workspace-server/internal/handlers/a2a_proxy_test.go
@@ -1819,39 +1819,35 @@ func TestMaybeMarkContainerDead_CPOnly_Running(t *testing.T) {
 // restart chain (provisionWorkspaceCP) needs its own mocked DB rows that
 // would explode the surface area of this test; what we care about here
 // is the dispatch decision, which is observable on cpProv.stopCalls.
-func TestRunRestartCycle_SaaSPath_DispatchesViaCPProv(t *testing.T) {
-	mock := setupTestDB(t)
+// stopForRestart is the dispatch helper extracted from runRestartCycle so the
+// branch logic can be tested without spawning the async sendRestartContext
+// goroutine that the full cycle fires. Pre-fix runRestartCycle's Stop dispatch
+// only called the Docker path, so on SaaS (h.provisioner=nil) the cycle NPE'd
+// silently and left the workspace stuck in status='provisioning'.
+func TestStopForRestart_SaaSPath_DispatchesViaCPProv(t *testing.T) {
 	setupTestRedis(t)
 	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
 	cp := &fakeCPProv{}
 	handler.SetCPProvisioner(cp)
 
-	// runRestartCycle's first query — workspace metadata.
-	mock.ExpectQuery(`SELECT name, status, tier, COALESCE\(runtime, 'langgraph'\) FROM workspaces`).
-		WithArgs("ws-saas-restart").
-		WillReturnRows(sqlmock.NewRows([]string{"name", "status", "tier", "runtime"}).
-			AddRow("Test Workspace", "online", 2, "hermes"))
-	// isParentPaused — return nil parent_id so the helper short-circuits.
-	mock.ExpectQuery(`SELECT parent_id FROM workspaces WHERE id =`).
-		WithArgs("ws-saas-restart").
-		WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(nil))
-	// After Stop, the cycle UPDATEs status to 'provisioning'. We allow
-	// that and don't mock everything provisionWorkspaceCP touches —
-	// provisionWorkspaceCP will fail at its first unmocked query, which
-	// is downstream of the dispatch decision under test.
-	mock.ExpectExec(`UPDATE workspaces SET status = 'provisioning'`).
-		WithArgs("ws-saas-restart").
-		WillReturnResult(sqlmock.NewResult(0, 1))
-	// loadRestartContextData query (best-effort lookups; let them not match
-	// and the function returns a zero-valued struct).
-	mock.MatchExpectationsInOrder(false)
+	handler.stopForRestart(context.Background(), "ws-saas-restart")
 
-	handler.runRestartCycle("ws-saas-restart")
-
-	// The dispatch we're testing: SaaS path → cpProv.Stop, never Docker.
 	if cp.stopCalls != 1 {
-		t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d calls — pre-fix the cycle NPE'd before reaching Stop on the (nil) Docker path", cp.stopCalls)
+		t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d", cp.stopCalls)
 	}
+	if cp.startCalls != 0 {
+		t.Fatalf("expected cpProv.Start NOT to be called by stopForRestart; got %d", cp.startCalls)
+	}
+}
+
+// Both nil → no-op, no panic. Defensive guard against the dispatcher being
+// invoked on a misconfigured handler.
+func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) {
+	setupTestRedis(t)
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+	// no provisioner, no cpProv
+	handler.stopForRestart(context.Background(), "ws-orphan")
+	// no panic, no calls — assertion is reaching this line
 }
 
 // fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise
diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go
index 7f3f7065..444cb791 100644
--- a/workspace-server/internal/handlers/workspace_restart.go
+++ b/workspace-server/internal/handlers/workspace_restart.go
@@ -396,6 +396,25 @@ func coalesceRestart(workspaceID string, cycle func()) {
 	}
 }
 
+// stopForRestart dispatches Stop to whichever provisioner is wired (Docker or
+// CP/EC2 — mutually exclusive in production). Docker provisioner.Stop kills
+// the local container; CP provisioner.Stop calls DELETE /cp/workspaces/:id
+// which terminates the EC2 instance. Pre-fix runRestartCycle only called the
+// Docker path, so on SaaS (h.provisioner=nil) the auto-restart cycle silently
+// NPE'd before reaching the reprovision step — which is why every SaaS dead-
+// agent incident pre-this-fix required manual restart from canvas.
+func (h *WorkspaceHandler) stopForRestart(ctx context.Context, workspaceID string) {
+	if h.provisioner != nil {
+		h.provisioner.Stop(ctx, workspaceID)
+		return
+	}
+	if h.cpProv != nil {
+		if err := h.cpProv.Stop(ctx, workspaceID); err != nil {
+			log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err)
+		}
+	}
+}
+
 // runRestartCycle does the actual stop+provision work for one restart
 // iteration. Synchronous (waits for provisionWorkspace to complete) so the
 // outer pending-flag loop in RestartByID can correctly coalesce — if this
@@ -431,20 +450,7 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
 
 	log.Printf("Auto-restart: restarting %s (%s) runtime=%q (was: %s)", wsName, workspaceID, dbRuntime, status)
 
-	// Stop existing compute. Branch on which provisioner is wired (mutually
-	// exclusive in production): Docker provisioner.Stop kills the local
-	// container; CP provisioner.Stop calls DELETE /cp/workspaces/:id which
-	// terminates the EC2 instance. Pre-fix this only called the Docker path,
-	// so on SaaS (h.provisioner=nil) the auto-restart cycle silently NPE'd
-	// before reaching the reprovision step — which is why every SaaS dead-
-	// agent incident pre-this-fix required manual restart from canvas.
-	if h.provisioner != nil {
-		h.provisioner.Stop(ctx, workspaceID)
-	} else if h.cpProv != nil {
-		if err := h.cpProv.Stop(ctx, workspaceID); err != nil {
-			log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err)
-		}
-	}
+	h.stopForRestart(ctx, workspaceID)
 
 	db.DB.ExecContext(ctx,
 		`UPDATE workspaces SET status = 'provisioning', url = '', updated_at = now() WHERE id = $1`, workspaceID)