fix(restart): extract stopForRestart helper + add 524 to dead-agent list
Addresses code-review C1 (test goroutine race) and I2 (CF 524) on PR #2362.

C1: TestRunRestartCycle_SaaSPath_DispatchesViaCPProv invoked runRestartCycle
end-to-end, which spawns `go h.sendRestartContext(...)`. That goroutine
outlived the test, then read db.DB while the next test's setupTestDB wrote to
it — DATA RACE under -race, cascading into 30+ failures across the handlers
suite. Refactored: extracted `stopForRestart(ctx, id)` from runRestartCycle as
a pure dispatcher, and rewrote the SaaS-path test to call it directly (no
async goroutine spawned). Added a no-provisioner no-op guard test.

I2: Cloudflare 524 ("origin timed out") now triggers maybeMarkContainerDead
alongside 502/503/504. Same upstream signal — the origin agent is unresponsive.

Verified `go test -race -count=1 ./internal/handlers/...` green locally.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent 28b4e38002
commit a27cf8f39f
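To make C1 concrete, here is a minimal, self-contained sketch of the failure pattern. It is illustrative only: `sharedDB` stands in for the package-level db.DB, and `runCycleLike` for a runRestartCycle-style function that fires `go h.sendRestartContext(...)`.

// race_sketch_test.go: a minimal sketch of the C1 failure mode, not
// project code. TestA's goroutine outlives TestA; TestB's setup then
// writes the shared handle that goroutine reads. `go test -race`
// flags the unsynchronized read/write pair.
package handlers_test

import (
	"testing"
	"time"
)

var sharedDB *string // stands in for the package-level db.DB

func runCycleLike() {
	// like `go h.sendRestartContext(...)` inside runRestartCycle:
	// the spawning function returns immediately, the goroutine lives on
	go func() {
		time.Sleep(10 * time.Millisecond)
		_ = sharedDB // read happens after TestA has already returned
	}()
}

func TestA(t *testing.T) { runCycleLike() }

func TestB(t *testing.T) {
	fresh := "next test's mock"
	sharedDB = &fresh // unsynchronized write, races with TestA's leaked goroutine
}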
@@ -484,9 +484,15 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
 	// error body. We probe IsRunning regardless (it's the
 	// authoritative check) but the empty-body case is what makes
 	// this fix necessary.
+	// 524 = Cloudflare "origin timed out" — origin accepted the
+	// connection but didn't return headers within ~100s. Same
+	// signal as 504/502 for our purposes: the upstream agent is
+	// not responsive. We probe IsRunning to confirm before
+	// triggering a restart.
 	if resp.StatusCode == http.StatusBadGateway ||
 		resp.StatusCode == http.StatusServiceUnavailable ||
-		resp.StatusCode == http.StatusGatewayTimeout {
+		resp.StatusCode == http.StatusGatewayTimeout ||
+		resp.StatusCode == 524 {
 		if h.maybeMarkContainerDead(ctx, workspaceID) {
 			return 0, nil, &proxyA2AError{
 				Status: http.StatusServiceUnavailable,
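net/http defines no constant for 524 (it is Cloudflare-proprietary), hence the bare literal above. If the dead-agent list grows again, one option is to hoist the check into a named predicate. This is a sketch, not part of the commit:

package handlers

import "net/http"

// statusCloudflareOriginTimeout is Cloudflare's non-standard 524: the
// origin accepted the connection but sent no response headers in time.
// net/http has no named constant for it.
const statusCloudflareOriginTimeout = 524

// isDeadAgentStatus reports whether an upstream status code should
// trigger the dead-agent probe (maybeMarkContainerDead).
func isDeadAgentStatus(code int) bool {
	switch code {
	case http.StatusBadGateway, // 502
		http.StatusServiceUnavailable, // 503
		http.StatusGatewayTimeout, // 504
		statusCloudflareOriginTimeout: // 524
		return true
	}
	return false
}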
@@ -1819,39 +1819,35 @@ func TestMaybeMarkContainerDead_CPOnly_Running(t *testing.T) {
-// restart chain (provisionWorkspaceCP) needs its own mocked DB rows that
-// would explode the surface area of this test; what we care about here
-// is the dispatch decision, which is observable on cpProv.stopCalls.
-func TestRunRestartCycle_SaaSPath_DispatchesViaCPProv(t *testing.T) {
-	mock := setupTestDB(t)
+// stopForRestart is the dispatch helper extracted from runRestartCycle so the
+// branch logic can be tested without spawning the async sendRestartContext
+// goroutine that the full cycle fires. Pre-fix runRestartCycle's Stop dispatch
+// only called the Docker path, so on SaaS (h.provisioner=nil) the cycle NPE'd
+// silently and left the workspace stuck in status='provisioning'.
+func TestStopForRestart_SaaSPath_DispatchesViaCPProv(t *testing.T) {
 	setupTestRedis(t)
 	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
 	cp := &fakeCPProv{}
 	handler.SetCPProvisioner(cp)
 
-	// runRestartCycle's first query — workspace metadata.
-	mock.ExpectQuery(`SELECT name, status, tier, COALESCE\(runtime, 'langgraph'\) FROM workspaces`).
-		WithArgs("ws-saas-restart").
-		WillReturnRows(sqlmock.NewRows([]string{"name", "status", "tier", "runtime"}).
-			AddRow("Test Workspace", "online", 2, "hermes"))
-	// isParentPaused — return nil parent_id so the helper short-circuits.
-	mock.ExpectQuery(`SELECT parent_id FROM workspaces WHERE id =`).
-		WithArgs("ws-saas-restart").
-		WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(nil))
-	// After Stop, the cycle UPDATEs status to 'provisioning'. We allow
-	// that and don't mock everything provisionWorkspaceCP touches —
-	// provisionWorkspaceCP will fail at its first unmocked query, which
-	// is downstream of the dispatch decision under test.
-	mock.ExpectExec(`UPDATE workspaces SET status = 'provisioning'`).
-		WithArgs("ws-saas-restart").
-		WillReturnResult(sqlmock.NewResult(0, 1))
-	// loadRestartContextData query (best-effort lookups; let them not match
-	// and the function returns a zero-valued struct).
-	mock.MatchExpectationsInOrder(false)
-
-	handler.runRestartCycle("ws-saas-restart")
+	handler.stopForRestart(context.Background(), "ws-saas-restart")
 
 	// The dispatch we're testing: SaaS path → cpProv.Stop, never Docker.
 	if cp.stopCalls != 1 {
-		t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d calls — pre-fix the cycle NPE'd before reaching Stop on the (nil) Docker path", cp.stopCalls)
+		t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d", cp.stopCalls)
+	}
+	if cp.startCalls != 0 {
+		t.Fatalf("expected cpProv.Start NOT to be called by stopForRestart; got %d", cp.startCalls)
 	}
 }
 
+// Both nil → no-op, no panic. Defensive guard against the dispatcher being
+// invoked on a misconfigured handler.
+func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) {
+	setupTestRedis(t)
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+	// no provisioner, no cpProv
+	handler.stopForRestart(context.Background(), "ws-orphan")
+	// no panic, no calls — assertion is reaching this line
+}
+
 // fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise
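The diff cuts off at the fakeCPProv declaration, so its exact shape isn't shown here. A plausible minimal double, consistent with the stopCalls/startCalls assertions above (the method set is inferred, not copied from the repo; the real provisioner.CPProvisionerAPI may carry more methods):

package handlers

import "context"

// fakeCPProv is a counting test double: it records dispatches so tests
// can assert on routing (stopCalls/startCalls) rather than behavior.
// Shape assumed from the assertions in the tests above.
type fakeCPProv struct {
	stopCalls  int
	startCalls int
}

func (f *fakeCPProv) Stop(ctx context.Context, workspaceID string) error {
	f.stopCalls++
	return nil
}

func (f *fakeCPProv) Start(ctx context.Context, workspaceID string) error {
	f.startCalls++
	return nil
}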
@@ -396,6 +396,25 @@ func coalesceRestart(workspaceID string, cycle func()) {
 	}
 }
 
+// stopForRestart dispatches Stop to whichever provisioner is wired (Docker or
+// CP/EC2 — mutually exclusive in production). Docker provisioner.Stop kills
+// the local container; CP provisioner.Stop calls DELETE /cp/workspaces/:id
+// which terminates the EC2 instance. Pre-fix runRestartCycle only called the
+// Docker path, so on SaaS (h.provisioner=nil) the auto-restart cycle silently
+// NPE'd before reaching the reprovision step — which is why every SaaS dead-
+// agent incident pre-this-fix required manual restart from canvas.
+func (h *WorkspaceHandler) stopForRestart(ctx context.Context, workspaceID string) {
+	if h.provisioner != nil {
+		h.provisioner.Stop(ctx, workspaceID)
+		return
+	}
+	if h.cpProv != nil {
+		if err := h.cpProv.Stop(ctx, workspaceID); err != nil {
+			log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err)
+		}
+	}
+}
+
 // runRestartCycle does the actual stop+provision work for one restart
 // iteration. Synchronous (waits for provisionWorkspace to complete) so the
 // outer pending-flag loop in RestartByID can correctly coalesce — if this
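The "pending-flag loop" that the runRestartCycle comment refers to is the coalescing done by coalesceRestart (its signature is visible in the hunk header above). A sketch of that pattern, with implementation details assumed rather than taken from the repo:

package handlers

import "sync"

var (
	restartMu      sync.Mutex
	restartRunning = map[string]bool{} // workspaceID -> cycle in flight
	restartPending = map[string]bool{} // workspaceID -> request arrived mid-cycle
)

// coalesceRestartSketch merges overlapping restart requests: callers that
// arrive while a cycle is running set a pending flag and return; the
// in-flight loop runs one more cycle instead of stacking N of them.
// This only works because cycle() is synchronous, which is exactly the
// property the runRestartCycle comment calls out.
func coalesceRestartSketch(workspaceID string, cycle func()) {
	restartMu.Lock()
	if restartRunning[workspaceID] {
		restartPending[workspaceID] = true
		restartMu.Unlock()
		return
	}
	restartRunning[workspaceID] = true
	restartMu.Unlock()

	for {
		cycle() // blocks until stop+provision completes

		restartMu.Lock()
		if !restartPending[workspaceID] {
			delete(restartRunning, workspaceID)
			restartMu.Unlock()
			return
		}
		delete(restartPending, workspaceID)
		restartMu.Unlock()
	}
}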
@@ -431,20 +450,7 @@ func (h *WorkspaceHandler) runRestartCycle(workspaceID string) {
 
 	log.Printf("Auto-restart: restarting %s (%s) runtime=%q (was: %s)", wsName, workspaceID, dbRuntime, status)
 
-	// Stop existing compute. Branch on which provisioner is wired (mutually
-	// exclusive in production): Docker provisioner.Stop kills the local
-	// container; CP provisioner.Stop calls DELETE /cp/workspaces/:id which
-	// terminates the EC2 instance. Pre-fix this only called the Docker path,
-	// so on SaaS (h.provisioner=nil) the auto-restart cycle silently NPE'd
-	// before reaching the reprovision step — which is why every SaaS dead-
-	// agent incident pre-this-fix required manual restart from canvas.
-	if h.provisioner != nil {
-		h.provisioner.Stop(ctx, workspaceID)
-	} else if h.cpProv != nil {
-		if err := h.cpProv.Stop(ctx, workspaceID); err != nil {
-			log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err)
-		}
-	}
+	h.stopForRestart(ctx, workspaceID)
 
 	db.DB.ExecContext(ctx,
 		`UPDATE workspaces SET status = 'provisioning', url = '', updated_at = now() WHERE id = $1`, workspaceID)