diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go index bbcc4f4a..fb938ec4 100644 --- a/workspace-server/internal/handlers/a2a_proxy.go +++ b/workspace-server/internal/handlers/a2a_proxy.go @@ -808,7 +808,11 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri // authoritative check) but the empty-body case is what makes // this fix necessary. if isUpstreamDeadStatus(resp.StatusCode) { - if h.maybeMarkContainerDead(ctx, workspaceID) { + dead, queuedStatus, queuedBody := h.maybeMarkContainerDead(ctx, workspaceID, callerID, body, a2aMethod, durationMs, logActivity) + if queuedStatus != 0 { + return queuedStatus, queuedBody, nil + } + if dead { return 0, nil, &proxyA2AError{ Status: http.StatusServiceUnavailable, Headers: map[string]string{"Retry-After": "15"}, diff --git a/workspace-server/internal/handlers/a2a_proxy_helpers.go b/workspace-server/internal/handlers/a2a_proxy_helpers.go index 61a69f5e..03d05bf3 100644 --- a/workspace-server/internal/handlers/a2a_proxy_helpers.go +++ b/workspace-server/internal/handlers/a2a_proxy_helpers.go @@ -33,11 +33,18 @@ import ( // - the workspace DOES have a recent heartbeat but we see N consecutive // dead observations within the debounce window. const ( - recentHeartbeatWindow = 15 * time.Second - containerDeadDebounceThreshold = 2 - containerDeadDebounceWindow = 30 * time.Second + recentHeartbeatWindow = 15 * time.Second + containerDeadDebounceThreshold = 2 + containerDeadDebounceWindow = 30 * time.Second + containerDeadReprobeDelay = 500 * time.Millisecond + containerDeadHeartbeatWaitTimeout = 2 * time.Second ) +// containerDeadReprobeDelayV is a mutable copy so tests can shrink the +// settle-window re-probe to zero. Package-level to avoid plumbing through +// the handler struct. core#2929. +var containerDeadReprobeDelayV = containerDeadReprobeDelay + // proxyDispatchBuildError is a sentinel wrapper for failures inside // http.NewRequestWithContext. handleA2ADispatchError unwraps it to emit the // "failed to create proxy request" 500 instead of the standard 502/503 paths. @@ -61,9 +68,15 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace log.Printf("ProxyA2A forward error: %v", err) - containerDead := h.maybeMarkContainerDead(ctx, workspaceID) + dead, queuedStatus, queuedBody := h.maybeMarkContainerDead(ctx, workspaceID, callerID, body, a2aMethod, durationMs, logActivity) - if containerDead { + if queuedStatus != 0 { + // maybeMarkContainerDead decided the container is alive-but-busy/settling + // and enqueued the request. Return the 202 Accepted response directly. + return queuedStatus, queuedBody, nil + } + + if dead { if logActivity { h.logA2AFailure(ctx, workspaceID, callerID, body, a2aMethod, err, durationMs) } @@ -204,78 +217,153 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace // // #2929 hardening: do NOT recycle a recently-alive workspace on a single // transient A2A 503 / IsRunning flake. We guard the restart with: -// 1. A recent-heartbeat check (last_heartbeat_at within 15s). -// 2. When the heartbeat is recent, require 2 consecutive dead observations -// within 30s before declaring dead. -// 3. Treat IsRunning transport errors as "assume alive" instead of dead. -func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspaceID string) bool { +// 1. A widened self-fire guard that also covers the post-restart settle +// window (not just a restart currently in-flight). +// 2. A re-probe delay after a lone IsRunning=false before trusting it. +// 3. A recent-heartbeat / fresh-post-restart-heartbeat check. When +// transport-liveness is green we enqueue the request (202 queued) +// instead of clearing the workspace URL and restarting. +// 4. Treat IsRunning transport errors as "assume alive" instead of dead. +// +// Returns (dead, httpStatus, responseBody). When httpStatus is non-zero the +// caller must return that response directly (e.g., 202 Accepted queued). +func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string, durationMs int, logActivity bool) (bool, int, []byte) { var wsRuntime string db.DB.QueryRowContext(ctx, `SELECT COALESCE(runtime, 'claude-code') FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsRuntime) if isExternalLikeRuntime(wsRuntime) { - return false + return false, 0, nil } if !h.HasProvisioner() { - return false + return false, 0, nil } - // Restart-aware short-circuit: during the 20-30s EC2-pending window of - // an in-flight restart, the workspace's url='' and IsRunning() returns - // false → looks indistinguishable from a dead container. Pre-fix this - // fired a fresh RestartByID for the just-launched instance, which - // coalesceRestart's pending-flag drained by running ANOTHER full - // stop+provision cycle (= ec2_stopped of the still-pending instance - // → re-provision). That's the 4x reprov thrash class. Skip the - // container-dead path while a restart is in flight; the in-flight - // restart's own provisionWorkspaceAutoSync will surface a real failure - // (markProvisionFailed) if the new container never comes up. Issue - // internal#544. + + // Layer 1 self-fire guard: skip the container-dead path while a restart + // is in flight AND during the post-restart settle window. During the + // settle window a config-PUT-restarted container can report + // IsRunning=false before its first heartbeat arrives; a lone false must + // not trigger RestartByID and clear the URL. core#2929. if isRestarting(workspaceID) { log.Printf("ProxyA2A: maybeMarkContainerDead skipped for %s — restart already in flight (self-fire guard)", workspaceID) - return false + return false, 0, nil + } + settling := inRestartSettleWindow(workspaceID) + if settling { + log.Printf("ProxyA2A: maybeMarkContainerDead for %s is in post-restart settle window — using conservative path", workspaceID) } // #2929: recent-heartbeat guard. A workspace that heartbeated seconds // ago is almost certainly alive; a single proxy/transport flake should // not kill it. We still run IsRunning below, but a recent heartbeat - // puts us on the debounced path. + // puts us on the debounced/enqueued path. recentHeartbeat := h.hasRecentHeartbeat(ctx, workspaceID) - var running bool - var inspectErr error - if h.provisioner != nil { - running, inspectErr = h.provisioner.IsRunning(ctx, workspaceID) - } else { - // SaaS path: ask the CP about the EC2 state. Same (true, err) on - // transport errors contract — keeps the caller on the alive path - // instead of triggering a restart cascade on a flaky CP call. - running, inspectErr = h.cpProv.IsRunning(ctx, workspaceID) + // If we're in the settle window, also check whether a heartbeat arrived + // strictly AFTER the restart started (i.e. a genuine post-restart PONG). + freshHeartbeat := false + if settling { + if restartStart, ok := lastRestartStartedAt(workspaceID); ok { + // waitForFreshHeartbeat is the same correlated check used by the + // restart-context sender: url non-empty + heartbeat newer than the + // restart start. A short timeout is enough — the heartbeat we're + // looking for already happened or is imminent. + freshHeartbeat = waitForFreshHeartbeat(ctx, workspaceID, restartStart, containerDeadHeartbeatWaitTimeout) + } } + + probe := func() (bool, error) { + if h.provisioner != nil { + return h.provisioner.IsRunning(ctx, workspaceID) + } + return h.cpProv.IsRunning(ctx, workspaceID) + } + + running, inspectErr := probe() if inspectErr != nil { // Transient backend error (Docker daemon EOF, CP HTTP 5xx, etc.). // IsRunning's contract returns (true, err) in this case so we stay // on the alive path without triggering a restart cascade. log.Printf("ProxyA2A: IsRunning for %s returned transient error (assuming alive): %v", workspaceID, inspectErr) - return false + return false, 0, nil } if running { h.resetDeadProbe(workspaceID) - return false + return false, 0, nil } - // Container is not running. If we have no recent heartbeat, preserve the + // First probe says not running. In the recent-heartbeat or settle-window + // cases, do not trust a single false — re-probe after a short delay. A + // transient container-settle flap resolves itself on the second probe. + if recentHeartbeat || freshHeartbeat || settling { + time.Sleep(containerDeadReprobeDelayV) + running2, inspectErr2 := probe() + if inspectErr2 != nil { + log.Printf("ProxyA2A: IsRunning re-probe for %s returned transient error (assuming alive): %v", workspaceID, inspectErr2) + return false, 0, nil + } + if running2 { + log.Printf("ProxyA2A: IsRunning re-probe for %s now reports running — settling transient, not dead", workspaceID) + h.resetDeadProbe(workspaceID) + return false, 0, nil + } + } + + // Still not running after the re-probe. If transport-liveness is green, + // the agent is alive-but-busy/settling; queue the request instead of + // clearing the URL and restarting. core#2929. + if recentHeartbeat || freshHeartbeat { + log.Printf("ProxyA2A: container for %s not running but heartbeat is recent — enqueuing instead of restarting", workspaceID) + return h.enqueueBusyA2A(ctx, workspaceID, callerID, body, a2aMethod, durationMs, logActivity) + } + + // No recent/fresh heartbeat and not in the settle window: preserve the // pre-#2929 immediate-dead behavior so dead EC2 instances still recover // on the first failed request (hongmingwang incident recovery path). - if !recentHeartbeat { - return h.declareContainerDead(ctx, workspaceID) + if !settling { + return h.declareContainerDead(ctx, workspaceID), 0, nil } - // Recent heartbeat but IsRunning says not-running: debounce. Require N + // Settle window but no fresh heartbeat yet: debounce. Require N // consecutive observations within the window before we believe it. if !h.incrementDeadProbe(workspaceID) { - log.Printf("ProxyA2A: container for %s looks dead but has recent heartbeat — debouncing (%d/%d within %s)", + log.Printf("ProxyA2A: container for %s looks dead in settle window — debouncing (%d/%d within %s)", workspaceID, h.deadProbeCount(workspaceID), containerDeadDebounceThreshold, containerDeadDebounceWindow) - return false + return false, 0, nil } - return h.declareContainerDead(ctx, workspaceID) + return h.declareContainerDead(ctx, workspaceID), 0, nil +} + +// enqueueBusyA2A enqueues the current request for drain on the next idle +// heartbeat. Returns (dead=false, http.StatusAccepted, queuedBody) on success, +// or (false, 0, nil) if the queue insert failed so the caller can fall back to +// its normal error path. core#2929. +func (h *WorkspaceHandler) enqueueBusyA2A(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string, durationMs int, logActivity bool) (bool, int, []byte) { + idempotencyKey := extractIdempotencyKey(body) + var expiresAt *time.Time + if secs := extractExpiresInSeconds(body); secs > 0 { + t := time.Now().Add(time.Duration(secs) * time.Second) + expiresAt = &t + } + qid, depth, qerr := EnqueueA2A( + ctx, workspaceID, callerID, PriorityTask, body, a2aMethod, idempotencyKey, expiresAt, + ) + if qerr == nil { + log.Printf("ProxyA2A: target %s busy/settling — enqueued as %s (depth=%d)", workspaceID, qid, depth) + if logActivity { + h.logA2ABusyQueued(ctx, workspaceID, callerID, body, a2aMethod, durationMs) + } + respBody, marshalErr := json.Marshal(gin.H{ + "queued": true, + "queue_id": qid, + "queue_depth": depth, + "message": "workspace agent busy — request queued, will dispatch when capacity available", + }) + if marshalErr != nil { + log.Printf("ProxyA2A %s: json.Marshal respBody failed: %v", workspaceID, marshalErr) + } + return false, http.StatusAccepted, respBody + } + log.Printf("ProxyA2A: enqueue for %s failed (%v) — falling back to 503", workspaceID, qerr) + return false, 0, nil } // hasRecentHeartbeat returns true if the workspace has a last_heartbeat_at diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go index d5e31b7c..6199664c 100644 --- a/workspace-server/internal/handlers/a2a_proxy_test.go +++ b/workspace-server/internal/handlers/a2a_proxy_test.go @@ -2120,7 +2120,8 @@ func TestMaybeMarkContainerDead_NilProvisioner(t *testing.T) { WithArgs("ws-nilprov"). WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("claude-code")) - if got := handler.maybeMarkContainerDead(context.Background(), "ws-nilprov"); got { + dead, _, _ := handler.maybeMarkContainerDead(context.Background(), "ws-nilprov", "", []byte("{}"), "message/send", 0, false) + if dead { t.Error("expected false when provisioner is nil") } } @@ -2142,12 +2143,15 @@ func TestMaybeMarkContainerDead_CPOnly_NotRunning(t *testing.T) { mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`). WithArgs("ws-saas-dead"). WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes")) + mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`). + WithArgs("ws-saas-dead"). + WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(sql.NullTime{})) mock.ExpectExec(`UPDATE workspaces SET status =`). WithArgs(models.StatusOffline, "ws-saas-dead"). WillReturnResult(sqlmock.NewResult(0, 1)) - got := handler.maybeMarkContainerDead(context.Background(), "ws-saas-dead") - if !got { + dead, _, _ := handler.maybeMarkContainerDead(context.Background(), "ws-saas-dead", "", []byte("{}"), "message/send", 0, false) + if !dead { t.Fatal("expected true (cpProv reports not running) — without cpProv consultation, SaaS dead-agent recovery is impossible") } if cp.calls != 1 { @@ -2171,8 +2175,12 @@ func TestMaybeMarkContainerDead_CPOnly_Running(t *testing.T) { mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`). WithArgs("ws-saas-alive"). WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes")) + mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`). + WithArgs("ws-saas-alive"). + WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(sql.NullTime{})) - if got := handler.maybeMarkContainerDead(context.Background(), "ws-saas-alive"); got { + dead, _, _ := handler.maybeMarkContainerDead(context.Background(), "ws-saas-alive", "", []byte("{}"), "message/send", 0, false) + if dead { t.Error("expected false when cpProv reports running — must not recycle a healthy agent") } if cp.calls != 1 { @@ -2278,15 +2286,17 @@ func TestMaybeMarkContainerDead_ExternalRuntime(t *testing.T) { WithArgs("ws-ext"). WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("external")) - if got := handler.maybeMarkContainerDead(context.Background(), "ws-ext"); got { + dead, _, _ := handler.maybeMarkContainerDead(context.Background(), "ws-ext", "", []byte("{}"), "message/send", 0, false) + if dead { t.Error("expected false for external runtime") } } // #2929: a recent heartbeat + IsRunning=false should NOT declare the -// container dead on the first observation. The second observation within -// the debounce window should. -func TestMaybeMarkContainerDead_RecentHeartbeat_Debounces(t *testing.T) { +// container dead. The function re-probes after a short delay and, if still +// not running, enqueues the request rather than clearing the URL and +// restarting. +func TestMaybeMarkContainerDead_RecentHeartbeat_DoesNotRestart(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()) @@ -2294,10 +2304,14 @@ func TestMaybeMarkContainerDead_RecentHeartbeat_Debounces(t *testing.T) { cp := &fakeCPProv{running: false} handler.SetCPProvisioner(cp) + // Speed the test: zero the reprobe delay. The second probe still runs. + origDelay := containerDeadReprobeDelayV + containerDeadReprobeDelayV = 0 + defer func() { containerDeadReprobeDelayV = origDelay }() + recentHB := time.Now() wsid := "ws-debounce" - // First observation: recent heartbeat, not running → debounce, no UPDATE. mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`). WithArgs(wsid). WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes")) @@ -2305,29 +2319,97 @@ func TestMaybeMarkContainerDead_RecentHeartbeat_Debounces(t *testing.T) { WithArgs(wsid). WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB)) - if got := handler.maybeMarkContainerDead(context.Background(), wsid); got { - t.Fatal("first dead observation with recent heartbeat should be debounced, not restarted") + dead, status, _ := handler.maybeMarkContainerDead(context.Background(), wsid, "", []byte(`{"jsonrpc":"2.0","method":"message/send"}`), "message/send", 0, false) + if dead { + t.Fatal("dead observation with recent heartbeat should not declare container dead") } - if cp.calls != 1 { - t.Errorf("first observation: expected 1 IsRunning call, got %d", cp.calls) + // It re-probes, so two IsRunning calls even with delay=0. + if cp.calls != 2 { + t.Errorf("expected 2 IsRunning calls (probe + re-probe), got %d", cp.calls) } + // If EnqueueA2A fails (no DB expectations here) it returns status=0 and + // lets the caller fall back to its normal error path. The critical + // invariant is that the workspace was NOT marked offline/restarting. + _ = status +} - // Second observation within the window → threshold reached, restart. +// #2929: when there is no recent heartbeat and IsRunning=false, the +// container is declared dead immediately (preserves dead-EC2 recovery). +func TestMaybeMarkContainerDead_NoRecentHeartbeat_DeclaresDead(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()) + waitForHandlerAsyncBeforeDBCleanup(t, handler) + cp := &fakeCPProv{running: false} + handler.SetCPProvisioner(cp) + + wsid := "ws-no-hb-dead" mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`). WithArgs(wsid). WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes")) mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`). WithArgs(wsid). - WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB)) + WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(sql.NullTime{})) mock.ExpectExec(`UPDATE workspaces SET status =`). WithArgs(models.StatusOffline, wsid). WillReturnResult(sqlmock.NewResult(0, 1)) - if got := handler.maybeMarkContainerDead(context.Background(), wsid); !got { - t.Fatal("second dead observation with recent heartbeat should declare dead") + dead, _, _ := handler.maybeMarkContainerDead(context.Background(), wsid, "", []byte("{}"), "message/send", 0, false) + if !dead { + t.Fatal("expected dead when there is no recent heartbeat and IsRunning=false") + } + if cp.calls != 1 { + t.Errorf("expected 1 IsRunning call, got %d", cp.calls) + } +} + +// #2929 regression: in the post-restart settle window, a single IsRunning=false +// must NOT nuke a PONG-healthy container's URL. Evidence: job 506813 saw the +// agent PONG 0.7s before a lone false probe; pre-fix that cleared the URL, +// flipped status offline, and self-fired a restart. +func TestMaybeMarkContainerDead_SettleWindow_DoesNotClearURL(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()) + waitForHandlerAsyncBeforeDBCleanup(t, handler) + cp := &fakeCPProv{running: false} + handler.SetCPProvisioner(cp) + + origDelay := containerDeadReprobeDelayV + containerDeadReprobeDelayV = 0 + defer func() { containerDeadReprobeDelayV = origDelay }() + + wsid := "ws-settle-window" + recentHB := time.Now() + + // Stamp a restart that has *finished* (running=false) but is still inside + // the settle window. This is the exact state after a config-PUT restart + // where the container is alive-but-settling. + sv, _ := restartStates.LoadOrStore(wsid, &restartState{}) + state := sv.(*restartState) + state.mu.Lock() + state.running = false + state.restartStartedAt = time.Now() + state.mu.Unlock() + defer restartStates.Delete(wsid) + + if !inRestartSettleWindow(wsid) { + t.Fatal("test setup failed: workspace should be in restart settle window") + } + + mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`). + WithArgs(wsid). + WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes")) + mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`). + WithArgs(wsid). + WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB)) + + dead, _, _ := handler.maybeMarkContainerDead(context.Background(), wsid, "", []byte(`{"jsonrpc":"2.0","method":"message/send"}`), "message/send", 0, false) + if dead { + t.Fatal("single IsRunning=false in post-restart settle window must not declare container dead") } if cp.calls != 2 { - t.Errorf("second observation: expected 2 IsRunning calls total, got %d", cp.calls) + t.Errorf("expected 2 IsRunning calls (probe + re-probe), got %d", cp.calls) } } @@ -2348,14 +2430,15 @@ func TestMaybeMarkContainerDead_InspectErr_AssumesAlive(t *testing.T) { WithArgs(wsid). WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(sql.NullTime{})) - if got := handler.maybeMarkContainerDead(context.Background(), wsid); got { + dead, _, _ := handler.maybeMarkContainerDead(context.Background(), wsid, "", []byte("{}"), "message/send", 0, false) + if dead { t.Error("transient IsRunning error should be treated as alive, not dead") } } -// #2929: a successful IsRunning=true observation resets any accumulated -// dead-probe counter. -func TestMaybeMarkContainerDead_RunningTrue_ResetsDeadProbe(t *testing.T) { +// #2929: a successful IsRunning=true observation after a re-probe resets the +// dead-probe state and does not restart. +func TestMaybeMarkContainerDead_RunningTrueAfterReprobe_Resets(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()) @@ -2364,18 +2447,25 @@ func TestMaybeMarkContainerDead_RunningTrue_ResetsDeadProbe(t *testing.T) { wsid := "ws-alive-resets" recentHB := time.Now() - // One dead observation with recent heartbeat accumulates a probe. + origDelay := containerDeadReprobeDelayV + containerDeadReprobeDelayV = 0 + defer func() { containerDeadReprobeDelayV = origDelay }() + + // First observation: recent heartbeat but IsRunning=false. The re-probe + // also returns false, so the request is enqueued (or falls back) and the + // workspace is not marked dead. mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`). WithArgs(wsid). WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes")) mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`). WithArgs(wsid). WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB)) - if got := handler.maybeMarkContainerDead(context.Background(), wsid); got { - t.Fatal("expected first observation to be debounced") + dead, _, _ := handler.maybeMarkContainerDead(context.Background(), wsid, "", []byte("{}"), "message/send", 0, false) + if dead { + t.Fatal("expected first observation to be debounced/enqueued, not dead") } - // Next observation says running=true → resets counter and does NOT restart. + // Next observation says running=true → no restart. cp.running = true mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`). WithArgs(wsid). @@ -2383,20 +2473,9 @@ func TestMaybeMarkContainerDead_RunningTrue_ResetsDeadProbe(t *testing.T) { mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`). WithArgs(wsid). WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB)) - if got := handler.maybeMarkContainerDead(context.Background(), wsid); got { - t.Error("running=true should reset dead probe and not restart") - } - - // One more dead observation is now back to count=1, not threshold. - cp.running = false - mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`). - WithArgs(wsid). - WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes")) - mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`). - WithArgs(wsid). - WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB)) - if got := handler.maybeMarkContainerDead(context.Background(), wsid); got { - t.Fatal("expected dead probe to have been reset; first post-reset observation should be debounced") + dead, _, _ = handler.maybeMarkContainerDead(context.Background(), wsid, "", []byte("{}"), "message/send", 0, false) + if dead { + t.Error("running=true should not restart") } } diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go index 2968ea77..2b8b6916 100644 --- a/workspace-server/internal/handlers/workspace_restart.go +++ b/workspace-server/internal/handlers/workspace_restart.go @@ -232,6 +232,15 @@ func (h *WorkspaceHandler) maybeRestartAfterFileWrite(workspaceID string) { // pending=true and the outer coalesceRestart loop will drain by running // ANOTHER full cycle, ec2_stopped of the just-booted instance → // re-provision. That's the self-fire loop closed by this gate. +// restartSettleWindow is the post-restart window during which a single +// IsRunning=false probe is NOT trusted. A workspace that just had its +// config.yaml PUT and was restarted can report IsRunning=false while the +// agent is still registering its first heartbeat / settling the container. +// Widening the self-fire guard to cover this window prevents a lone flaky +// probe from clearing the workspace URL and re-triggering a destructive +// restart. core#2929. +const restartSettleWindow = 30 * time.Second + func isRestarting(workspaceID string) bool { sv, ok := restartStates.Load(workspaceID) if !ok { @@ -243,6 +252,41 @@ func isRestarting(workspaceID string) bool { return state.running } +// inRestartSettleWindow reports whether workspaceID is within +// restartSettleWindow of its most recent restart start. This widens the +// self-fire guard beyond the in-flight restart flag to cover the settle +// window right after a config-PUT restart. core#2929. +func inRestartSettleWindow(workspaceID string) bool { + sv, ok := restartStates.Load(workspaceID) + if !ok { + return false + } + state := sv.(*restartState) + state.mu.Lock() + defer state.mu.Unlock() + if state.restartStartedAt.IsZero() { + return false + } + return time.Since(state.restartStartedAt) < restartSettleWindow +} + +// lastRestartStartedAt returns the timestamp recorded when the most recent +// restart cycle started for workspaceID, if any. Used by the settle-window +// heartbeat freshness check in maybeMarkContainerDead. core#2929. +func lastRestartStartedAt(workspaceID string) (time.Time, bool) { + sv, ok := restartStates.Load(workspaceID) + if !ok { + return time.Time{}, false + } + state := sv.(*restartState) + state.mu.Lock() + defer state.mu.Unlock() + if state.restartStartedAt.IsZero() { + return time.Time{}, false + } + return state.restartStartedAt, true +} + // isParentPaused checks if any ancestor of the workspace is paused. func isParentPaused(ctx context.Context, workspaceID string) (bool, string) { var parentID *string diff --git a/workspace-server/internal/handlers/workspace_restart_self_fire_test.go b/workspace-server/internal/handlers/workspace_restart_self_fire_test.go index 3ec15302..7273f981 100644 --- a/workspace-server/internal/handlers/workspace_restart_self_fire_test.go +++ b/workspace-server/internal/handlers/workspace_restart_self_fire_test.go @@ -117,8 +117,9 @@ func TestMaybeMarkContainerDead_SkippedWhileRestarting(t *testing.T) { h := newSelfFireHandler(t) h.provisioner = stub - if got := h.maybeMarkContainerDead(context.Background(), wsID); got != false { - t.Errorf("maybeMarkContainerDead must return false while restarting, got %v", got) + dead, _, _ := h.maybeMarkContainerDead(context.Background(), wsID, "", []byte("{}"), "message/send", 0, false) + if dead != false { + t.Errorf("maybeMarkContainerDead must return false while restarting, got %v", dead) } if stub.calls != 0 { t.Errorf("IsRunning must not be called while restarting (Layer 1 gate broken); got %d calls", stub.calls)