From a27cf8f39fa18a6d1ae43a797bab3fa88faa9dbb Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 00:58:22 -0700 Subject: [PATCH] fix(restart): extract stopForRestart helper + add 524 to dead-agent list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses code-review C1 (test goroutine race) and I2 (CF 524) on PR #2362. C1: TestRunRestartCycle_SaaSPath_DispatchesViaCPProv invoked runRestartCycle end-to-end, which spawns `go h.sendRestartContext(...)`. That goroutine outlived the test, then read db.DB while the next test's setupTestDB wrote to it — DATA RACE under -race, cascading 30+ failures across the handlers suite. Refactored: extracted `stopForRestart(ctx, id)` from runRestartCycle as a pure dispatcher, and rewrote the SaaS-path test to call it directly (no async goroutine spawned). Added a no-provisioner no-op guard test. I2: Cloudflare 524 ("origin timed out") now triggers maybeMarkContainerDead alongside 502/503/504. Same upstream signal — origin agent unresponsive. Verified `go test -race -count=1 ./internal/handlers/...` green locally. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/a2a_proxy.go | 8 +++- .../internal/handlers/a2a_proxy_test.go | 46 +++++++++---------- .../internal/handlers/workspace_restart.go | 34 ++++++++------ 3 files changed, 48 insertions(+), 40 deletions(-) diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go index 3ce8f4f5..f5668730 100644 --- a/workspace-server/internal/handlers/a2a_proxy.go +++ b/workspace-server/internal/handlers/a2a_proxy.go @@ -484,9 +484,15 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri // error body. We probe IsRunning regardless (it's the // authoritative check) but the empty-body case is what makes // this fix necessary. + // 524 = Cloudflare "origin timed out" — origin accepted the + // connection but didn't return headers within ~100s. Same + // signal as 504/502 for our purposes: the upstream agent is + // not responsive. We probe IsRunning to confirm before + // triggering a restart. if resp.StatusCode == http.StatusBadGateway || resp.StatusCode == http.StatusServiceUnavailable || - resp.StatusCode == http.StatusGatewayTimeout { + resp.StatusCode == http.StatusGatewayTimeout || + resp.StatusCode == 524 { if h.maybeMarkContainerDead(ctx, workspaceID) { return 0, nil, &proxyA2AError{ Status: http.StatusServiceUnavailable, diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go index ef9bec05..20d27e7f 100644 --- a/workspace-server/internal/handlers/a2a_proxy_test.go +++ b/workspace-server/internal/handlers/a2a_proxy_test.go @@ -1819,39 +1819,35 @@ func TestMaybeMarkContainerDead_CPOnly_Running(t *testing.T) { // restart chain (provisionWorkspaceCP) needs its own mocked DB rows that // would explode the surface area of this test; what we care about here // is the dispatch decision, which is observable on cpProv.stopCalls. -func TestRunRestartCycle_SaaSPath_DispatchesViaCPProv(t *testing.T) { - mock := setupTestDB(t) +// stopForRestart is the dispatch helper extracted from runRestartCycle so the +// branch logic can be tested without spawning the async sendRestartContext +// goroutine that the full cycle fires. Pre-fix runRestartCycle's Stop dispatch +// only called the Docker path, so on SaaS (h.provisioner=nil) the cycle NPE'd +// silently and left the workspace stuck in status='provisioning'. +func TestStopForRestart_SaaSPath_DispatchesViaCPProv(t *testing.T) { setupTestRedis(t) handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()) cp := &fakeCPProv{} handler.SetCPProvisioner(cp) - // runRestartCycle's first query — workspace metadata. - mock.ExpectQuery(`SELECT name, status, tier, COALESCE\(runtime, 'langgraph'\) FROM workspaces`). - WithArgs("ws-saas-restart"). - WillReturnRows(sqlmock.NewRows([]string{"name", "status", "tier", "runtime"}). - AddRow("Test Workspace", "online", 2, "hermes")) - // isParentPaused — return nil parent_id so the helper short-circuits. - mock.ExpectQuery(`SELECT parent_id FROM workspaces WHERE id =`). - WithArgs("ws-saas-restart"). - WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(nil)) - // After Stop, the cycle UPDATEs status to 'provisioning'. We allow - // that and don't mock everything provisionWorkspaceCP touches — - // provisionWorkspaceCP will fail at its first unmocked query, which - // is downstream of the dispatch decision under test. - mock.ExpectExec(`UPDATE workspaces SET status = 'provisioning'`). - WithArgs("ws-saas-restart"). - WillReturnResult(sqlmock.NewResult(0, 1)) - // loadRestartContextData query (best-effort lookups; let them not match - // and the function returns a zero-valued struct). - mock.MatchExpectationsInOrder(false) + handler.stopForRestart(context.Background(), "ws-saas-restart") - handler.runRestartCycle("ws-saas-restart") - - // The dispatch we're testing: SaaS path → cpProv.Stop, never Docker. if cp.stopCalls != 1 { - t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d calls — pre-fix the cycle NPE'd before reaching Stop on the (nil) Docker path", cp.stopCalls) + t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d", cp.stopCalls) } + if cp.startCalls != 0 { + t.Fatalf("expected cpProv.Start NOT to be called by stopForRestart; got %d", cp.startCalls) + } +} + +// Both nil → no-op, no panic. Defensive guard against the dispatcher being +// invoked on a misconfigured handler. +func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) { + setupTestRedis(t) + handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()) + // no provisioner, no cpProv + handler.stopForRestart(context.Background(), "ws-orphan") + // no panic, no calls — assertion is reaching this line } // fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go index 7f3f7065..444cb791 100644 --- a/workspace-server/internal/handlers/workspace_restart.go +++ b/workspace-server/internal/handlers/workspace_restart.go @@ -396,6 +396,25 @@ func coalesceRestart(workspaceID string, cycle func()) { } } +// stopForRestart dispatches Stop to whichever provisioner is wired (Docker or +// CP/EC2 — mutually exclusive in production). Docker provisioner.Stop kills +// the local container; CP provisioner.Stop calls DELETE /cp/workspaces/:id +// which terminates the EC2 instance. Pre-fix runRestartCycle only called the +// Docker path, so on SaaS (h.provisioner=nil) the auto-restart cycle silently +// NPE'd before reaching the reprovision step — which is why every SaaS dead- +// agent incident pre-this-fix required manual restart from canvas. +func (h *WorkspaceHandler) stopForRestart(ctx context.Context, workspaceID string) { + if h.provisioner != nil { + h.provisioner.Stop(ctx, workspaceID) + return + } + if h.cpProv != nil { + if err := h.cpProv.Stop(ctx, workspaceID); err != nil { + log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err) + } + } +} + // runRestartCycle does the actual stop+provision work for one restart // iteration. Synchronous (waits for provisionWorkspace to complete) so the // outer pending-flag loop in RestartByID can correctly coalesce — if this @@ -431,20 +450,7 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) { log.Printf("Auto-restart: restarting %s (%s) runtime=%q (was: %s)", wsName, workspaceID, dbRuntime, status) - // Stop existing compute. Branch on which provisioner is wired (mutually - // exclusive in production): Docker provisioner.Stop kills the local - // container; CP provisioner.Stop calls DELETE /cp/workspaces/:id which - // terminates the EC2 instance. Pre-fix this only called the Docker path, - // so on SaaS (h.provisioner=nil) the auto-restart cycle silently NPE'd - // before reaching the reprovision step — which is why every SaaS dead- - // agent incident pre-this-fix required manual restart from canvas. - if h.provisioner != nil { - h.provisioner.Stop(ctx, workspaceID) - } else if h.cpProv != nil { - if err := h.cpProv.Stop(ctx, workspaceID); err != nil { - log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err) - } - } + h.stopForRestart(ctx, workspaceID) db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'provisioning', url = '', updated_at = now() WHERE id = $1`, workspaceID)