diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go
index 3ce8f4f5..f5668730 100644
--- a/workspace-server/internal/handlers/a2a_proxy.go
+++ b/workspace-server/internal/handlers/a2a_proxy.go
@@ -484,9 +484,15 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
 	// error body. We probe IsRunning regardless (it's the
 	// authoritative check) but the empty-body case is what makes
 	// this fix necessary.
+	// 524 = Cloudflare "origin timed out" — origin accepted the
+	// connection but didn't return headers within ~100s. Same
+	// signal as 504/502 for our purposes: the upstream agent is
+	// not responsive. We probe IsRunning to confirm before
+	// triggering a restart.
 	if resp.StatusCode == http.StatusBadGateway ||
 		resp.StatusCode == http.StatusServiceUnavailable ||
-		resp.StatusCode == http.StatusGatewayTimeout {
+		resp.StatusCode == http.StatusGatewayTimeout ||
+		resp.StatusCode == 524 {
 		if h.maybeMarkContainerDead(ctx, workspaceID) {
 			return 0, nil, &proxyA2AError{
 				Status: http.StatusServiceUnavailable,
diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go
index ef9bec05..20d27e7f 100644
--- a/workspace-server/internal/handlers/a2a_proxy_test.go
+++ b/workspace-server/internal/handlers/a2a_proxy_test.go
@@ -1819,39 +1819,35 @@ func TestMaybeMarkContainerDead_CPOnly_Running(t *testing.T) {
 // restart chain (provisionWorkspaceCP) needs its own mocked DB rows that
 // would explode the surface area of this test; what we care about here
 // is the dispatch decision, which is observable on cpProv.stopCalls.
-func TestRunRestartCycle_SaaSPath_DispatchesViaCPProv(t *testing.T) {
-	mock := setupTestDB(t)
+// stopForRestart is the dispatch helper extracted from runRestartCycle so the
+// branch logic can be tested without spawning the async sendRestartContext
+// goroutine that the full cycle fires. Pre-fix runRestartCycle's Stop dispatch
+// only called the Docker path, so on SaaS (h.provisioner=nil) the cycle NPE'd
+// silently and left the workspace stuck in status='provisioning'.
+func TestStopForRestart_SaaSPath_DispatchesViaCPProv(t *testing.T) {
 	setupTestRedis(t)
 	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
 	cp := &fakeCPProv{}
 	handler.SetCPProvisioner(cp)
 
-	// runRestartCycle's first query — workspace metadata.
-	mock.ExpectQuery(`SELECT name, status, tier, COALESCE\(runtime, 'langgraph'\) FROM workspaces`).
-		WithArgs("ws-saas-restart").
-		WillReturnRows(sqlmock.NewRows([]string{"name", "status", "tier", "runtime"}).
-			AddRow("Test Workspace", "online", 2, "hermes"))
-	// isParentPaused — return nil parent_id so the helper short-circuits.
-	mock.ExpectQuery(`SELECT parent_id FROM workspaces WHERE id =`).
-		WithArgs("ws-saas-restart").
-		WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(nil))
-	// After Stop, the cycle UPDATEs status to 'provisioning'. We allow
-	// that and don't mock everything provisionWorkspaceCP touches —
-	// provisionWorkspaceCP will fail at its first unmocked query, which
-	// is downstream of the dispatch decision under test.
-	mock.ExpectExec(`UPDATE workspaces SET status = 'provisioning'`).
-		WithArgs("ws-saas-restart").
-		WillReturnResult(sqlmock.NewResult(0, 1))
-	// loadRestartContextData query (best-effort lookups; let them not match
-	// and the function returns a zero-valued struct).
-	mock.MatchExpectationsInOrder(false)
+	handler.stopForRestart(context.Background(), "ws-saas-restart")
 
-	handler.runRestartCycle("ws-saas-restart")
-
-	// The dispatch we're testing: SaaS path → cpProv.Stop, never Docker.
 	if cp.stopCalls != 1 {
-		t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d calls — pre-fix the cycle NPE'd before reaching Stop on the (nil) Docker path", cp.stopCalls)
+		t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d", cp.stopCalls)
 	}
+	if cp.startCalls != 0 {
+		t.Fatalf("expected cpProv.Start NOT to be called by stopForRestart; got %d", cp.startCalls)
+	}
+}
+
+// Both nil → no-op, no panic. Defensive guard against the dispatcher being
+// invoked on a misconfigured handler.
+func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) {
+	setupTestRedis(t)
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+	// no provisioner, no cpProv
+	handler.stopForRestart(context.Background(), "ws-orphan")
+	// no panic, no calls — assertion is reaching this line
+}
 
 // fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise
diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go
index 7f3f7065..444cb791 100644
--- a/workspace-server/internal/handlers/workspace_restart.go
+++ b/workspace-server/internal/handlers/workspace_restart.go
@@ -396,6 +396,25 @@ func coalesceRestart(workspaceID string, cycle func()) {
 	}
 }
 
+// stopForRestart dispatches Stop to whichever provisioner is wired (Docker or
+// CP/EC2 — mutually exclusive in production). Docker provisioner.Stop kills
+// the local container; CP provisioner.Stop calls DELETE /cp/workspaces/:id
+// which terminates the EC2 instance. Pre-fix runRestartCycle only called the
+// Docker path, so on SaaS (h.provisioner=nil) the auto-restart cycle silently
+// NPE'd before reaching the reprovision step — which is why every SaaS dead-
+// agent incident pre-this-fix required manual restart from canvas.
+func (h *WorkspaceHandler) stopForRestart(ctx context.Context, workspaceID string) {
+	if h.provisioner != nil {
+		h.provisioner.Stop(ctx, workspaceID)
+		return
+	}
+	if h.cpProv != nil {
+		if err := h.cpProv.Stop(ctx, workspaceID); err != nil {
+			log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err)
+		}
+	}
+}
+
 // runRestartCycle does the actual stop+provision work for one restart
 // iteration. Synchronous (waits for provisionWorkspace to complete) so the
 // outer pending-flag loop in RestartByID can correctly coalesce — if this
@@ -431,20 +450,7 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
 	log.Printf("Auto-restart: restarting %s (%s) runtime=%q (was: %s)",
 		wsName, workspaceID, dbRuntime, status)
 
-	// Stop existing compute. Branch on which provisioner is wired (mutually
-	// exclusive in production): Docker provisioner.Stop kills the local
-	// container; CP provisioner.Stop calls DELETE /cp/workspaces/:id which
-	// terminates the EC2 instance. Pre-fix this only called the Docker path,
-	// so on SaaS (h.provisioner=nil) the auto-restart cycle silently NPE'd
-	// before reaching the reprovision step — which is why every SaaS dead-
-	// agent incident pre-this-fix required manual restart from canvas.
-	if h.provisioner != nil {
-		h.provisioner.Stop(ctx, workspaceID)
-	} else if h.cpProv != nil {
-		if err := h.cpProv.Stop(ctx, workspaceID); err != nil {
-			log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err)
-		}
-	}
+	h.stopForRestart(ctx, workspaceID)
 
 	db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'provisioning',
 		url = '', updated_at = now() WHERE id = $1`, workspaceID)
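
Reviewer sketch (not part of the patch): the gateway check in a2a_proxy.go now tests four status codes inline, one of them the bare literal 524 since net/http defines no constant past 511. If the list grows, the condition could be pulled into a predicate in the same package; a minimal version under that assumption, with the helper name isUpstreamGatewayError being hypothetical:

	package handlers

	import "net/http"

	// isUpstreamGatewayError reports whether a proxied response status
	// indicates an unresponsive upstream agent: 502/503/504, plus
	// Cloudflare's 524 ("origin timed out"), which has no net/http constant.
	func isUpstreamGatewayError(code int) bool {
		switch code {
		case http.StatusBadGateway, // 502
			http.StatusServiceUnavailable, // 503
			http.StatusGatewayTimeout, // 504
			524: // Cloudflare-specific; net/http constants stop at 511
			return true
		}
		return false
	}

Keeping the check inline is reasonable at four codes; the sketch only shows where the 524 special case and its comment would live if extracted, and it keeps maybeMarkContainerDead as the authoritative liveness probe.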