fix(restart): extract stopForRestart helper + add 524 to dead-agent list

Addresses code-review C1 (test goroutine race) and I2 (CF 524) on PR #2362.

C1: TestRunRestartCycle_SaaSPath_DispatchesViaCPProv invoked runRestartCycle
end-to-end, which spawns `go h.sendRestartContext(...)`. That goroutine
outlived the test, then read db.DB while the next test's setupTestDB wrote
to it — DATA RACE under -race, cascading 30+ failures across the handlers
suite. Refactored: extracted `stopForRestart(ctx, id)` from runRestartCycle
as a pure dispatcher, and rewrote the SaaS-path test to call it directly
(no async goroutine spawned). Added a no-provisioner no-op guard test.
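
For the record, a minimal standalone reproduction of the race shape
(hypothetical names, not this repo's code; `db` stands in for the shared
package-level handle):

    package example

    import "testing"

    var db map[string]string // analogue of the package-level db.DB

    func TestA(t *testing.T) {
        db = map[string]string{"ws-1": "online"} // setupTestDB analogue
        go func() { // sendRestartContext analogue, not awaited by the test
            _ = db["ws-1"] // may run after TestA returns...
        }()
    }

    func TestB(t *testing.T) {
        db = map[string]string{"ws-2": "online"} // ...racing this write; -race reports it
    }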

I2: Cloudflare 524 ("origin timed out") now triggers maybeMarkContainerDead
alongside 502/503/504. Same upstream signal — origin agent unresponsive.

Verified `go test -race -count=1 ./internal/handlers/...` green locally.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Hongming Wang 2026-04-30 00:58:22 -07:00
parent 28b4e38002
commit a27cf8f39f
3 changed files with 48 additions and 40 deletions


@@ -484,9 +484,15 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
 	// error body. We probe IsRunning regardless (it's the
 	// authoritative check) but the empty-body case is what makes
 	// this fix necessary.
+	// 524 = Cloudflare "origin timed out" — origin accepted the
+	// connection but didn't return headers within ~100s. Same
+	// signal as 504/502 for our purposes: the upstream agent is
+	// not responsive. We probe IsRunning to confirm before
+	// triggering a restart.
 	if resp.StatusCode == http.StatusBadGateway ||
 		resp.StatusCode == http.StatusServiceUnavailable ||
-		resp.StatusCode == http.StatusGatewayTimeout {
+		resp.StatusCode == http.StatusGatewayTimeout ||
+		resp.StatusCode == 524 {
 		if h.maybeMarkContainerDead(ctx, workspaceID) {
 			return 0, nil, &proxyA2AError{
 				Status: http.StatusServiceUnavailable,
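
A self-contained way to see the literal in action (illustrative sketch only,
not part of this diff): stand up an httptest server that answers 524 and
confirm the client observes the same code the branch above matches on. Note
524 is Cloudflare-specific, so net/http defines no named constant for it,
hence the bare literal.

    package main

    import (
        "fmt"
        "log"
        "net/http"
        "net/http/httptest"
    )

    func main() {
        srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
            w.WriteHeader(524) // non-standard code; Go passes it through verbatim
        }))
        defer srv.Close()

        resp, err := http.Get(srv.URL)
        if err != nil {
            log.Fatal(err)
        }
        defer resp.Body.Close()
        fmt.Println(resp.StatusCode) // 524, the value the new condition checks
    }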


@@ -1819,39 +1819,35 @@ func TestMaybeMarkContainerDead_CPOnly_Running(t *testing.T) {
-// restart chain (provisionWorkspaceCP) needs its own mocked DB rows that
-// would explode the surface area of this test; what we care about here
-// is the dispatch decision, which is observable on cpProv.stopCalls.
-func TestRunRestartCycle_SaaSPath_DispatchesViaCPProv(t *testing.T) {
-	mock := setupTestDB(t)
+// stopForRestart is the dispatch helper extracted from runRestartCycle so the
+// branch logic can be tested without spawning the async sendRestartContext
+// goroutine that the full cycle fires. Pre-fix runRestartCycle's Stop dispatch
+// only called the Docker path, so on SaaS (h.provisioner=nil) the cycle NPE'd
+// silently and left the workspace stuck in status='provisioning'.
+func TestStopForRestart_SaaSPath_DispatchesViaCPProv(t *testing.T) {
 	setupTestRedis(t)
 	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
 	cp := &fakeCPProv{}
 	handler.SetCPProvisioner(cp)
-	// runRestartCycle's first query — workspace metadata.
-	mock.ExpectQuery(`SELECT name, status, tier, COALESCE\(runtime, 'langgraph'\) FROM workspaces`).
-		WithArgs("ws-saas-restart").
-		WillReturnRows(sqlmock.NewRows([]string{"name", "status", "tier", "runtime"}).
-			AddRow("Test Workspace", "online", 2, "hermes"))
-	// isParentPaused — return nil parent_id so the helper short-circuits.
-	mock.ExpectQuery(`SELECT parent_id FROM workspaces WHERE id =`).
-		WithArgs("ws-saas-restart").
-		WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(nil))
-	// After Stop, the cycle UPDATEs status to 'provisioning'. We allow
-	// that and don't mock everything provisionWorkspaceCP touches —
-	// provisionWorkspaceCP will fail at its first unmocked query, which
-	// is downstream of the dispatch decision under test.
-	mock.ExpectExec(`UPDATE workspaces SET status = 'provisioning'`).
-		WithArgs("ws-saas-restart").
-		WillReturnResult(sqlmock.NewResult(0, 1))
-	// loadRestartContextData query (best-effort lookups; let them not match
-	// and the function returns a zero-valued struct).
-	mock.MatchExpectationsInOrder(false)
-	handler.runRestartCycle("ws-saas-restart")
+	handler.stopForRestart(context.Background(), "ws-saas-restart")
 	// The dispatch we're testing: SaaS path → cpProv.Stop, never Docker.
 	if cp.stopCalls != 1 {
-		t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d calls — pre-fix the cycle NPE'd before reaching Stop on the (nil) Docker path", cp.stopCalls)
+		t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d", cp.stopCalls)
 	}
+	if cp.startCalls != 0 {
+		t.Fatalf("expected cpProv.Start NOT to be called by stopForRestart; got %d", cp.startCalls)
+	}
 }
+
+// Both nil → no-op, no panic. Defensive guard against the dispatcher being
+// invoked on a misconfigured handler.
+func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) {
+	setupTestRedis(t)
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+	// no provisioner, no cpProv
+	handler.stopForRestart(context.Background(), "ws-orphan")
+	// no panic, no calls — assertion is reaching this line
+}
+
 // fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise
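
Possible follow-up, sketched here only (not part of this change): the two
tests above could collapse into one table-driven test over the dispatch
matrix, using nothing beyond the helpers already visible in this diff:

    func TestStopForRestart_Dispatch(t *testing.T) {
        for _, tc := range []struct {
            name      string
            wireCP    bool
            wantStops int
        }{
            {"saas: cpProv wired", true, 1},
            {"misconfigured: nothing wired", false, 0},
        } {
            t.Run(tc.name, func(t *testing.T) {
                setupTestRedis(t)
                handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
                cp := &fakeCPProv{}
                if tc.wireCP {
                    handler.SetCPProvisioner(cp)
                }
                handler.stopForRestart(context.Background(), "ws-dispatch")
                if cp.stopCalls != tc.wantStops {
                    t.Fatalf("stopCalls = %d, want %d", cp.stopCalls, tc.wantStops)
                }
            })
        }
    }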


@@ -396,6 +396,25 @@ func coalesceRestart(workspaceID string, cycle func()) {
 	}
 }
 
+// stopForRestart dispatches Stop to whichever provisioner is wired (Docker or
+// CP/EC2 — mutually exclusive in production). Docker provisioner.Stop kills
+// the local container; CP provisioner.Stop calls DELETE /cp/workspaces/:id
+// which terminates the EC2 instance. Pre-fix runRestartCycle only called the
+// Docker path, so on SaaS (h.provisioner=nil) the auto-restart cycle silently
+// NPE'd before reaching the reprovision step — which is why every SaaS dead-
+// agent incident pre-this-fix required manual restart from canvas.
+func (h *WorkspaceHandler) stopForRestart(ctx context.Context, workspaceID string) {
+	if h.provisioner != nil {
+		h.provisioner.Stop(ctx, workspaceID)
+		return
+	}
+	if h.cpProv != nil {
+		if err := h.cpProv.Stop(ctx, workspaceID); err != nil {
+			log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err)
+		}
+	}
+}
+
 // runRestartCycle does the actual stop+provision work for one restart
 // iteration. Synchronous (waits for provisionWorkspace to complete) so the
 // outer pending-flag loop in RestartByID can correctly coalesce — if this
@@ -431,20 +450,7 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
 	log.Printf("Auto-restart: restarting %s (%s) runtime=%q (was: %s)", wsName, workspaceID, dbRuntime, status)
 
-	// Stop existing compute. Branch on which provisioner is wired (mutually
-	// exclusive in production): Docker provisioner.Stop kills the local
-	// container; CP provisioner.Stop calls DELETE /cp/workspaces/:id which
-	// terminates the EC2 instance. Pre-fix this only called the Docker path,
-	// so on SaaS (h.provisioner=nil) the auto-restart cycle silently NPE'd
-	// before reaching the reprovision step — which is why every SaaS dead-
-	// agent incident pre-this-fix required manual restart from canvas.
-	if h.provisioner != nil {
-		h.provisioner.Stop(ctx, workspaceID)
-	} else if h.cpProv != nil {
-		if err := h.cpProv.Stop(ctx, workspaceID); err != nil {
-			log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err)
-		}
-	}
+	h.stopForRestart(ctx, workspaceID)
 
 	db.DB.ExecContext(ctx,
 		`UPDATE workspaces SET status = 'provisioning', url = '', updated_at = now() WHERE id = $1`, workspaceID)