fix(restart): extract stopForRestart helper + add 524 to dead-agent list

Addresses code-review C1 (test goroutine race) and I2 (CF 524) on PR #2362.

C1: TestRunRestartCycle_SaaSPath_DispatchesViaCPProv invoked runRestartCycle
end-to-end, which spawns `go h.sendRestartContext(...)`. That goroutine
outlived the test, then read db.DB while the next test's setupTestDB wrote
to it — DATA RACE under -race, cascading 30+ failures across the handlers
suite. Refactored: extracted `stopForRestart(ctx, id)` from runRestartCycle
as a pure dispatcher, and rewrote the SaaS-path test to call it directly
(no async goroutine spawned). Added a no-provisioner no-op guard test.
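
For the record, a minimal standalone reproduction of the race shape
(hypothetical names, not this repo's code; `db` stands in for the shared
package-level handle):

    package example

    import "testing"

    var db map[string]string // analogue of the package-level db.DB

    func TestA(t *testing.T) {
        db = map[string]string{"ws-1": "online"} // setupTestDB analogue
        go func() { // sendRestartContext analogue, not awaited by the test
            _ = db["ws-1"] // may run after TestA returns...
        }()
    }

    func TestB(t *testing.T) {
        db = map[string]string{"ws-2": "online"} // ...racing this write; -race reports it
    }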

I2: Cloudflare 524 ("origin timed out") now triggers maybeMarkContainerDead
alongside 502/503/504. Same upstream signal — origin agent unresponsive.

Verified `go test -race -count=1 ./internal/handlers/...` green locally.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Hongming Wang 2026-04-30 00:58:22 -07:00
parent 28b4e38002
commit a27cf8f39f
3 changed files with 48 additions and 40 deletions


@@ -484,9 +484,15 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
 	// error body. We probe IsRunning regardless (it's the
 	// authoritative check) but the empty-body case is what makes
 	// this fix necessary.
+	// 524 = Cloudflare "origin timed out" — origin accepted the
+	// connection but didn't return headers within ~100s. Same
+	// signal as 504/502 for our purposes: the upstream agent is
+	// not responsive. We probe IsRunning to confirm before
+	// triggering a restart.
 	if resp.StatusCode == http.StatusBadGateway ||
 		resp.StatusCode == http.StatusServiceUnavailable ||
-		resp.StatusCode == http.StatusGatewayTimeout {
+		resp.StatusCode == http.StatusGatewayTimeout ||
+		resp.StatusCode == 524 {
 		if h.maybeMarkContainerDead(ctx, workspaceID) {
 			return 0, nil, &proxyA2AError{
 				Status: http.StatusServiceUnavailable,
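
A self-contained way to see the literal in action (illustrative sketch only,
not part of this diff): stand up an httptest server that answers 524 and
confirm the client observes the same code the branch above matches on. Note
524 is Cloudflare-specific, so net/http defines no named constant for it,
hence the bare literal.

    package main

    import (
        "fmt"
        "log"
        "net/http"
        "net/http/httptest"
    )

    func main() {
        srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
            w.WriteHeader(524) // non-standard code; Go passes it through verbatim
        }))
        defer srv.Close()

        resp, err := http.Get(srv.URL)
        if err != nil {
            log.Fatal(err)
        }
        defer resp.Body.Close()
        fmt.Println(resp.StatusCode) // 524, the value the new condition checks
    }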


@@ -1819,39 +1819,35 @@ func TestMaybeMarkContainerDead_CPOnly_Running(t *testing.T) {
-// restart chain (provisionWorkspaceCP) needs its own mocked DB rows that
-// would explode the surface area of this test; what we care about here
-// is the dispatch decision, which is observable on cpProv.stopCalls.
-func TestRunRestartCycle_SaaSPath_DispatchesViaCPProv(t *testing.T) {
-	mock := setupTestDB(t)
+// stopForRestart is the dispatch helper extracted from runRestartCycle so the
+// branch logic can be tested without spawning the async sendRestartContext
+// goroutine that the full cycle fires. Pre-fix runRestartCycle's Stop dispatch
+// only called the Docker path, so on SaaS (h.provisioner=nil) the cycle NPE'd
+// silently and left the workspace stuck in status='provisioning'.
+func TestStopForRestart_SaaSPath_DispatchesViaCPProv(t *testing.T) {
 	setupTestRedis(t)
 	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
 	cp := &fakeCPProv{}
 	handler.SetCPProvisioner(cp)
-	// runRestartCycle's first query — workspace metadata.
-	mock.ExpectQuery(`SELECT name, status, tier, COALESCE\(runtime, 'langgraph'\) FROM workspaces`).
-		WithArgs("ws-saas-restart").
-		WillReturnRows(sqlmock.NewRows([]string{"name", "status", "tier", "runtime"}).
-			AddRow("Test Workspace", "online", 2, "hermes"))
-	// isParentPaused — return nil parent_id so the helper short-circuits.
-	mock.ExpectQuery(`SELECT parent_id FROM workspaces WHERE id =`).
-		WithArgs("ws-saas-restart").
-		WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(nil))
-	// After Stop, the cycle UPDATEs status to 'provisioning'. We allow
-	// that and don't mock everything provisionWorkspaceCP touches —
-	// provisionWorkspaceCP will fail at its first unmocked query, which
-	// is downstream of the dispatch decision under test.
-	mock.ExpectExec(`UPDATE workspaces SET status = 'provisioning'`).
-		WithArgs("ws-saas-restart").
-		WillReturnResult(sqlmock.NewResult(0, 1))
-	// loadRestartContextData query (best-effort lookups; let them not match
-	// and the function returns a zero-valued struct).
-	mock.MatchExpectationsInOrder(false)
-	handler.runRestartCycle("ws-saas-restart")
+	handler.stopForRestart(context.Background(), "ws-saas-restart")
 	// The dispatch we're testing: SaaS path → cpProv.Stop, never Docker.
 	if cp.stopCalls != 1 {
-		t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d calls — pre-fix the cycle NPE'd before reaching Stop on the (nil) Docker path", cp.stopCalls)
+		t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d", cp.stopCalls)
 	}
+	if cp.startCalls != 0 {
+		t.Fatalf("expected cpProv.Start NOT to be called by stopForRestart; got %d", cp.startCalls)
+	}
 }
+
+// Both nil → no-op, no panic. Defensive guard against the dispatcher being
+// invoked on a misconfigured handler.
+func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) {
+	setupTestRedis(t)
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+	// no provisioner, no cpProv
+	handler.stopForRestart(context.Background(), "ws-orphan")
+	// no panic, no calls — assertion is reaching this line
+}
+
 // fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise
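
Possible follow-up, sketched here only (not part of this change): the two
tests above could collapse into one table-driven test over the dispatch
matrix, using nothing beyond the helpers already visible in this diff:

    func TestStopForRestart_Dispatch(t *testing.T) {
        for _, tc := range []struct {
            name      string
            wireCP    bool
            wantStops int
        }{
            {"saas: cpProv wired", true, 1},
            {"misconfigured: nothing wired", false, 0},
        } {
            t.Run(tc.name, func(t *testing.T) {
                setupTestRedis(t)
                handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
                cp := &fakeCPProv{}
                if tc.wireCP {
                    handler.SetCPProvisioner(cp)
                }
                handler.stopForRestart(context.Background(), "ws-dispatch")
                if cp.stopCalls != tc.wantStops {
                    t.Fatalf("stopCalls = %d, want %d", cp.stopCalls, tc.wantStops)
                }
            })
        }
    }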


@@ -396,6 +396,25 @@ func coalesceRestart(workspaceID string, cycle func()) {
 	}
 }
 
+// stopForRestart dispatches Stop to whichever provisioner is wired (Docker or
+// CP/EC2 — mutually exclusive in production). Docker provisioner.Stop kills
+// the local container; CP provisioner.Stop calls DELETE /cp/workspaces/:id
+// which terminates the EC2 instance. Pre-fix runRestartCycle only called the
+// Docker path, so on SaaS (h.provisioner=nil) the auto-restart cycle silently
+// NPE'd before reaching the reprovision step — which is why every SaaS dead-
+// agent incident pre-this-fix required manual restart from canvas.
+func (h *WorkspaceHandler) stopForRestart(ctx context.Context, workspaceID string) {
+	if h.provisioner != nil {
+		h.provisioner.Stop(ctx, workspaceID)
+		return
+	}
+	if h.cpProv != nil {
+		if err := h.cpProv.Stop(ctx, workspaceID); err != nil {
+			log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err)
+		}
+	}
+}
+
 // runRestartCycle does the actual stop+provision work for one restart
 // iteration. Synchronous (waits for provisionWorkspace to complete) so the
 // outer pending-flag loop in RestartByID can correctly coalesce — if this
@@ -431,20 +450,7 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
 	log.Printf("Auto-restart: restarting %s (%s) runtime=%q (was: %s)", wsName, workspaceID, dbRuntime, status)
 
-	// Stop existing compute. Branch on which provisioner is wired (mutually
-	// exclusive in production): Docker provisioner.Stop kills the local
-	// container; CP provisioner.Stop calls DELETE /cp/workspaces/:id which
-	// terminates the EC2 instance. Pre-fix this only called the Docker path,
-	// so on SaaS (h.provisioner=nil) the auto-restart cycle silently NPE'd
-	// before reaching the reprovision step — which is why every SaaS dead-
-	// agent incident pre-this-fix required manual restart from canvas.
-	if h.provisioner != nil {
-		h.provisioner.Stop(ctx, workspaceID)
-	} else if h.cpProv != nil {
-		if err := h.cpProv.Stop(ctx, workspaceID); err != nil {
-			log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err)
-		}
-	}
+	h.stopForRestart(ctx, workspaceID)
 
 	db.DB.ExecContext(ctx,
 		`UPDATE workspaces SET status = 'provisioning', url = '', updated_at = now() WHERE id = $1`, workspaceID)