fix(a2a_proxy): debounce IsRunning=false in post-restart settle window (#2929) #2950

Merged
devops-engineer merged 1 commits from fix/2929-a2a-proxy-debounce-settle into main 2026-06-15 16:19:50 +00:00
5 changed files with 302 additions and 86 deletions
@@ -808,7 +808,11 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
// authoritative check) but the empty-body case is what makes
// this fix necessary.
if isUpstreamDeadStatus(resp.StatusCode) {
if h.maybeMarkContainerDead(ctx, workspaceID) {
dead, queuedStatus, queuedBody := h.maybeMarkContainerDead(ctx, workspaceID, callerID, body, a2aMethod, durationMs, logActivity)
if queuedStatus != 0 {
return queuedStatus, queuedBody, nil
}
if dead {
return 0, nil, &proxyA2AError{
Status: http.StatusServiceUnavailable,
Headers: map[string]string{"Retry-After": "15"},
@@ -33,11 +33,18 @@ import (
// - the workspace DOES have a recent heartbeat but we see N consecutive
// dead observations within the debounce window.
const (
recentHeartbeatWindow = 15 * time.Second
containerDeadDebounceThreshold = 2
containerDeadDebounceWindow = 30 * time.Second
recentHeartbeatWindow = 15 * time.Second
containerDeadDebounceThreshold = 2
containerDeadDebounceWindow = 30 * time.Second
containerDeadReprobeDelay = 500 * time.Millisecond
containerDeadHeartbeatWaitTimeout = 2 * time.Second
)
// containerDeadReprobeDelayV is a mutable copy so tests can shrink the
// settle-window re-probe to zero. Package-level to avoid plumbing through
// the handler struct. core#2929.
var containerDeadReprobeDelayV = containerDeadReprobeDelay
// proxyDispatchBuildError is a sentinel wrapper for failures inside
// http.NewRequestWithContext. handleA2ADispatchError unwraps it to emit the
// "failed to create proxy request" 500 instead of the standard 502/503 paths.
@@ -61,9 +68,15 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
log.Printf("ProxyA2A forward error: %v", err)
containerDead := h.maybeMarkContainerDead(ctx, workspaceID)
dead, queuedStatus, queuedBody := h.maybeMarkContainerDead(ctx, workspaceID, callerID, body, a2aMethod, durationMs, logActivity)
if containerDead {
if queuedStatus != 0 {
// maybeMarkContainerDead decided the container is alive-but-busy/settling
// and enqueued the request. Return the 202 Accepted response directly.
return queuedStatus, queuedBody, nil
}
if dead {
if logActivity {
h.logA2AFailure(ctx, workspaceID, callerID, body, a2aMethod, err, durationMs)
}
@@ -204,78 +217,153 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
//
// #2929 hardening: do NOT recycle a recently-alive workspace on a single
// transient A2A 503 / IsRunning flake. We guard the restart with:
// 1. A recent-heartbeat check (last_heartbeat_at within 15s).
// 2. When the heartbeat is recent, require 2 consecutive dead observations
// within 30s before declaring dead.
// 3. Treat IsRunning transport errors as "assume alive" instead of dead.
func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspaceID string) bool {
// 1. A widened self-fire guard that also covers the post-restart settle
// window (not just a restart currently in-flight).
// 2. A re-probe delay after a lone IsRunning=false before trusting it.
// 3. A recent-heartbeat / fresh-post-restart-heartbeat check. When
// transport-liveness is green we enqueue the request (202 queued)
// instead of clearing the workspace URL and restarting.
// 4. Treat IsRunning transport errors as "assume alive" instead of dead.
//
// Returns (dead, httpStatus, responseBody). When httpStatus is non-zero the
// caller must return that response directly (e.g., 202 Accepted queued).
func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string, durationMs int, logActivity bool) (bool, int, []byte) {
var wsRuntime string
db.DB.QueryRowContext(ctx, `SELECT COALESCE(runtime, 'claude-code') FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsRuntime)
if isExternalLikeRuntime(wsRuntime) {
return false
return false, 0, nil
}
if !h.HasProvisioner() {
return false
return false, 0, nil
}
// Restart-aware short-circuit: during the 20-30s EC2-pending window of
// an in-flight restart, the workspace's url='' and IsRunning() returns
// false → looks indistinguishable from a dead container. Pre-fix this
// fired a fresh RestartByID for the just-launched instance, which
// coalesceRestart's pending-flag drained by running ANOTHER full
// stop+provision cycle (= ec2_stopped of the still-pending instance
// → re-provision). That's the 4x reprov thrash class. Skip the
// container-dead path while a restart is in flight; the in-flight
// restart's own provisionWorkspaceAutoSync will surface a real failure
// (markProvisionFailed) if the new container never comes up. Issue
// internal#544.
// Layer 1 self-fire guard: skip the container-dead path while a restart
// is in flight AND during the post-restart settle window. During the
// settle window a config-PUT-restarted container can report
// IsRunning=false before its first heartbeat arrives; a lone false must
// not trigger RestartByID and clear the URL. core#2929.
if isRestarting(workspaceID) {
log.Printf("ProxyA2A: maybeMarkContainerDead skipped for %s — restart already in flight (self-fire guard)", workspaceID)
return false
return false, 0, nil
}
settling := inRestartSettleWindow(workspaceID)
if settling {
log.Printf("ProxyA2A: maybeMarkContainerDead for %s is in post-restart settle window — using conservative path", workspaceID)
}
// #2929: recent-heartbeat guard. A workspace that heartbeated seconds
// ago is almost certainly alive; a single proxy/transport flake should
// not kill it. We still run IsRunning below, but a recent heartbeat
// puts us on the debounced path.
// puts us on the debounced/enqueued path.
recentHeartbeat := h.hasRecentHeartbeat(ctx, workspaceID)
var running bool
var inspectErr error
if h.provisioner != nil {
running, inspectErr = h.provisioner.IsRunning(ctx, workspaceID)
} else {
// SaaS path: ask the CP about the EC2 state. Same (true, err) on
// transport errors contract — keeps the caller on the alive path
// instead of triggering a restart cascade on a flaky CP call.
running, inspectErr = h.cpProv.IsRunning(ctx, workspaceID)
// If we're in the settle window, also check whether a heartbeat arrived
// strictly AFTER the restart started (i.e. a genuine post-restart PONG).
freshHeartbeat := false
if settling {
if restartStart, ok := lastRestartStartedAt(workspaceID); ok {
// waitForFreshHeartbeat is the same correlated check used by the
// restart-context sender: url non-empty + heartbeat newer than the
// restart start. A short timeout is enough — the heartbeat we're
// looking for already happened or is imminent.
freshHeartbeat = waitForFreshHeartbeat(ctx, workspaceID, restartStart, containerDeadHeartbeatWaitTimeout)
}
}
probe := func() (bool, error) {
if h.provisioner != nil {
return h.provisioner.IsRunning(ctx, workspaceID)
}
return h.cpProv.IsRunning(ctx, workspaceID)
}
running, inspectErr := probe()
if inspectErr != nil {
// Transient backend error (Docker daemon EOF, CP HTTP 5xx, etc.).
// IsRunning's contract returns (true, err) in this case so we stay
// on the alive path without triggering a restart cascade.
log.Printf("ProxyA2A: IsRunning for %s returned transient error (assuming alive): %v", workspaceID, inspectErr)
return false
return false, 0, nil
}
if running {
h.resetDeadProbe(workspaceID)
return false
return false, 0, nil
}
// Container is not running. If we have no recent heartbeat, preserve the
// First probe says not running. In the recent-heartbeat or settle-window
// cases, do not trust a single false — re-probe after a short delay. A
// transient container-settle flap resolves itself on the second probe.
if recentHeartbeat || freshHeartbeat || settling {
time.Sleep(containerDeadReprobeDelayV)
running2, inspectErr2 := probe()
if inspectErr2 != nil {
log.Printf("ProxyA2A: IsRunning re-probe for %s returned transient error (assuming alive): %v", workspaceID, inspectErr2)
return false, 0, nil
}
if running2 {
log.Printf("ProxyA2A: IsRunning re-probe for %s now reports running — settling transient, not dead", workspaceID)
h.resetDeadProbe(workspaceID)
return false, 0, nil
}
}
// Still not running after the re-probe. If transport-liveness is green,
// the agent is alive-but-busy/settling; queue the request instead of
// clearing the URL and restarting. core#2929.
if recentHeartbeat || freshHeartbeat {
log.Printf("ProxyA2A: container for %s not running but heartbeat is recent — enqueuing instead of restarting", workspaceID)
return h.enqueueBusyA2A(ctx, workspaceID, callerID, body, a2aMethod, durationMs, logActivity)
}
// No recent/fresh heartbeat and not in the settle window: preserve the
// pre-#2929 immediate-dead behavior so dead EC2 instances still recover
// on the first failed request (hongmingwang incident recovery path).
if !recentHeartbeat {
return h.declareContainerDead(ctx, workspaceID)
if !settling {
return h.declareContainerDead(ctx, workspaceID), 0, nil
}
// Recent heartbeat but IsRunning says not-running: debounce. Require N
// Settle window but no fresh heartbeat yet: debounce. Require N
// consecutive observations within the window before we believe it.
if !h.incrementDeadProbe(workspaceID) {
log.Printf("ProxyA2A: container for %s looks dead but has recent heartbeat — debouncing (%d/%d within %s)",
log.Printf("ProxyA2A: container for %s looks dead in settle window — debouncing (%d/%d within %s)",
workspaceID, h.deadProbeCount(workspaceID), containerDeadDebounceThreshold, containerDeadDebounceWindow)
return false
return false, 0, nil
}
return h.declareContainerDead(ctx, workspaceID)
return h.declareContainerDead(ctx, workspaceID), 0, nil
}
// enqueueBusyA2A enqueues the current request for drain on the next idle
// heartbeat. Returns (dead=false, http.StatusAccepted, queuedBody) on success,
// or (false, 0, nil) if the queue insert failed so the caller can fall back to
// its normal error path. core#2929.
func (h *WorkspaceHandler) enqueueBusyA2A(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string, durationMs int, logActivity bool) (bool, int, []byte) {
idempotencyKey := extractIdempotencyKey(body)
var expiresAt *time.Time
if secs := extractExpiresInSeconds(body); secs > 0 {
t := time.Now().Add(time.Duration(secs) * time.Second)
expiresAt = &t
}
qid, depth, qerr := EnqueueA2A(
ctx, workspaceID, callerID, PriorityTask, body, a2aMethod, idempotencyKey, expiresAt,
)
if qerr == nil {
log.Printf("ProxyA2A: target %s busy/settling — enqueued as %s (depth=%d)", workspaceID, qid, depth)
if logActivity {
h.logA2ABusyQueued(ctx, workspaceID, callerID, body, a2aMethod, durationMs)
}
respBody, marshalErr := json.Marshal(gin.H{
"queued": true,
"queue_id": qid,
"queue_depth": depth,
"message": "workspace agent busy — request queued, will dispatch when capacity available",
})
if marshalErr != nil {
log.Printf("ProxyA2A %s: json.Marshal respBody failed: %v", workspaceID, marshalErr)
}
return false, http.StatusAccepted, respBody
}
log.Printf("ProxyA2A: enqueue for %s failed (%v) — falling back to 503", workspaceID, qerr)
return false, 0, nil
}
// hasRecentHeartbeat returns true if the workspace has a last_heartbeat_at
@@ -2120,7 +2120,8 @@ func TestMaybeMarkContainerDead_NilProvisioner(t *testing.T) {
WithArgs("ws-nilprov").
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("claude-code"))
if got := handler.maybeMarkContainerDead(context.Background(), "ws-nilprov"); got {
dead, _, _ := handler.maybeMarkContainerDead(context.Background(), "ws-nilprov", "", []byte("{}"), "message/send", 0, false)
if dead {
t.Error("expected false when provisioner is nil")
}
}
@@ -2142,12 +2143,15 @@ func TestMaybeMarkContainerDead_CPOnly_NotRunning(t *testing.T) {
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs("ws-saas-dead").
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs("ws-saas-dead").
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(sql.NullTime{}))
mock.ExpectExec(`UPDATE workspaces SET status =`).
WithArgs(models.StatusOffline, "ws-saas-dead").
WillReturnResult(sqlmock.NewResult(0, 1))
got := handler.maybeMarkContainerDead(context.Background(), "ws-saas-dead")
if !got {
dead, _, _ := handler.maybeMarkContainerDead(context.Background(), "ws-saas-dead", "", []byte("{}"), "message/send", 0, false)
if !dead {
t.Fatal("expected true (cpProv reports not running) — without cpProv consultation, SaaS dead-agent recovery is impossible")
}
if cp.calls != 1 {
@@ -2171,8 +2175,12 @@ func TestMaybeMarkContainerDead_CPOnly_Running(t *testing.T) {
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs("ws-saas-alive").
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs("ws-saas-alive").
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(sql.NullTime{}))
if got := handler.maybeMarkContainerDead(context.Background(), "ws-saas-alive"); got {
dead, _, _ := handler.maybeMarkContainerDead(context.Background(), "ws-saas-alive", "", []byte("{}"), "message/send", 0, false)
if dead {
t.Error("expected false when cpProv reports running — must not recycle a healthy agent")
}
if cp.calls != 1 {
@@ -2278,15 +2286,17 @@ func TestMaybeMarkContainerDead_ExternalRuntime(t *testing.T) {
WithArgs("ws-ext").
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("external"))
if got := handler.maybeMarkContainerDead(context.Background(), "ws-ext"); got {
dead, _, _ := handler.maybeMarkContainerDead(context.Background(), "ws-ext", "", []byte("{}"), "message/send", 0, false)
if dead {
t.Error("expected false for external runtime")
}
}
// #2929: a recent heartbeat + IsRunning=false should NOT declare the
// container dead on the first observation. The second observation within
// the debounce window should.
func TestMaybeMarkContainerDead_RecentHeartbeat_Debounces(t *testing.T) {
// container dead. The function re-probes after a short delay and, if still
// not running, enqueues the request rather than clearing the URL and
// restarting.
func TestMaybeMarkContainerDead_RecentHeartbeat_DoesNotRestart(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
@@ -2294,10 +2304,14 @@ func TestMaybeMarkContainerDead_RecentHeartbeat_Debounces(t *testing.T) {
cp := &fakeCPProv{running: false}
handler.SetCPProvisioner(cp)
// Speed the test: zero the reprobe delay. The second probe still runs.
origDelay := containerDeadReprobeDelayV
containerDeadReprobeDelayV = 0
defer func() { containerDeadReprobeDelayV = origDelay }()
recentHB := time.Now()
wsid := "ws-debounce"
// First observation: recent heartbeat, not running → debounce, no UPDATE.
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
@@ -2305,29 +2319,97 @@ func TestMaybeMarkContainerDead_RecentHeartbeat_Debounces(t *testing.T) {
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
t.Fatal("first dead observation with recent heartbeat should be debounced, not restarted")
dead, status, _ := handler.maybeMarkContainerDead(context.Background(), wsid, "", []byte(`{"jsonrpc":"2.0","method":"message/send"}`), "message/send", 0, false)
if dead {
t.Fatal("dead observation with recent heartbeat should not declare container dead")
}
if cp.calls != 1 {
t.Errorf("first observation: expected 1 IsRunning call, got %d", cp.calls)
// It re-probes, so two IsRunning calls even with delay=0.
if cp.calls != 2 {
t.Errorf("expected 2 IsRunning calls (probe + re-probe), got %d", cp.calls)
}
// If EnqueueA2A fails (no DB expectations here) it returns status=0 and
// lets the caller fall back to its normal error path. The critical
// invariant is that the workspace was NOT marked offline/restarting.
_ = status
}
// Second observation within the window → threshold reached, restart.
// #2929: when there is no recent heartbeat and IsRunning=false, the
// container is declared dead immediately (preserves dead-EC2 recovery).
func TestMaybeMarkContainerDead_NoRecentHeartbeat_DeclaresDead(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
waitForHandlerAsyncBeforeDBCleanup(t, handler)
cp := &fakeCPProv{running: false}
handler.SetCPProvisioner(cp)
wsid := "ws-no-hb-dead"
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(sql.NullTime{}))
mock.ExpectExec(`UPDATE workspaces SET status =`).
WithArgs(models.StatusOffline, wsid).
WillReturnResult(sqlmock.NewResult(0, 1))
if got := handler.maybeMarkContainerDead(context.Background(), wsid); !got {
t.Fatal("second dead observation with recent heartbeat should declare dead")
dead, _, _ := handler.maybeMarkContainerDead(context.Background(), wsid, "", []byte("{}"), "message/send", 0, false)
if !dead {
t.Fatal("expected dead when there is no recent heartbeat and IsRunning=false")
}
if cp.calls != 1 {
t.Errorf("expected 1 IsRunning call, got %d", cp.calls)
}
}
// #2929 regression: in the post-restart settle window, a single IsRunning=false
// must NOT nuke a PONG-healthy container's URL. Evidence: job 506813 saw the
// agent PONG 0.7s before a lone false probe; pre-fix that cleared the URL,
// flipped status offline, and self-fired a restart.
func TestMaybeMarkContainerDead_SettleWindow_DoesNotClearURL(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
waitForHandlerAsyncBeforeDBCleanup(t, handler)
cp := &fakeCPProv{running: false}
handler.SetCPProvisioner(cp)
origDelay := containerDeadReprobeDelayV
containerDeadReprobeDelayV = 0
defer func() { containerDeadReprobeDelayV = origDelay }()
wsid := "ws-settle-window"
recentHB := time.Now()
// Stamp a restart that has *finished* (running=false) but is still inside
// the settle window. This is the exact state after a config-PUT restart
// where the container is alive-but-settling.
sv, _ := restartStates.LoadOrStore(wsid, &restartState{})
state := sv.(*restartState)
state.mu.Lock()
state.running = false
state.restartStartedAt = time.Now()
state.mu.Unlock()
defer restartStates.Delete(wsid)
if !inRestartSettleWindow(wsid) {
t.Fatal("test setup failed: workspace should be in restart settle window")
}
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
dead, _, _ := handler.maybeMarkContainerDead(context.Background(), wsid, "", []byte(`{"jsonrpc":"2.0","method":"message/send"}`), "message/send", 0, false)
if dead {
t.Fatal("single IsRunning=false in post-restart settle window must not declare container dead")
}
if cp.calls != 2 {
t.Errorf("second observation: expected 2 IsRunning calls total, got %d", cp.calls)
t.Errorf("expected 2 IsRunning calls (probe + re-probe), got %d", cp.calls)
}
}
@@ -2348,14 +2430,15 @@ func TestMaybeMarkContainerDead_InspectErr_AssumesAlive(t *testing.T) {
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(sql.NullTime{}))
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
dead, _, _ := handler.maybeMarkContainerDead(context.Background(), wsid, "", []byte("{}"), "message/send", 0, false)
if dead {
t.Error("transient IsRunning error should be treated as alive, not dead")
}
}
// #2929: a successful IsRunning=true observation resets any accumulated
// dead-probe counter.
func TestMaybeMarkContainerDead_RunningTrue_ResetsDeadProbe(t *testing.T) {
// #2929: a successful IsRunning=true observation after a re-probe resets the
// dead-probe state and does not restart.
func TestMaybeMarkContainerDead_RunningTrueAfterReprobe_Resets(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
@@ -2364,18 +2447,25 @@ func TestMaybeMarkContainerDead_RunningTrue_ResetsDeadProbe(t *testing.T) {
wsid := "ws-alive-resets"
recentHB := time.Now()
// One dead observation with recent heartbeat accumulates a probe.
origDelay := containerDeadReprobeDelayV
containerDeadReprobeDelayV = 0
defer func() { containerDeadReprobeDelayV = origDelay }()
// First observation: recent heartbeat but IsRunning=false. The re-probe
// also returns false, so the request is enqueued (or falls back) and the
// workspace is not marked dead.
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
t.Fatal("expected first observation to be debounced")
dead, _, _ := handler.maybeMarkContainerDead(context.Background(), wsid, "", []byte("{}"), "message/send", 0, false)
if dead {
t.Fatal("expected first observation to be debounced/enqueued, not dead")
}
// Next observation says running=true → resets counter and does NOT restart.
// Next observation says running=true → no restart.
cp.running = true
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs(wsid).
@@ -2383,20 +2473,9 @@ func TestMaybeMarkContainerDead_RunningTrue_ResetsDeadProbe(t *testing.T) {
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
t.Error("running=true should reset dead probe and not restart")
}
// One more dead observation is now back to count=1, not threshold.
cp.running = false
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
t.Fatal("expected dead probe to have been reset; first post-reset observation should be debounced")
dead, _, _ = handler.maybeMarkContainerDead(context.Background(), wsid, "", []byte("{}"), "message/send", 0, false)
if dead {
t.Error("running=true should not restart")
}
}
@@ -232,6 +232,15 @@ func (h *WorkspaceHandler) maybeRestartAfterFileWrite(workspaceID string) {
// pending=true and the outer coalesceRestart loop will drain by running
// ANOTHER full cycle, ec2_stopped of the just-booted instance →
// re-provision. That's the self-fire loop closed by this gate.
// restartSettleWindow is the post-restart window during which a single
// IsRunning=false probe is NOT trusted. A workspace that just had its
// config.yaml PUT and was restarted can report IsRunning=false while the
// agent is still registering its first heartbeat / settling the container.
// Widening the self-fire guard to cover this window prevents a lone flaky
// probe from clearing the workspace URL and re-triggering a destructive
// restart. core#2929.
const restartSettleWindow = 30 * time.Second
func isRestarting(workspaceID string) bool {
sv, ok := restartStates.Load(workspaceID)
if !ok {
@@ -243,6 +252,41 @@ func isRestarting(workspaceID string) bool {
return state.running
}
// inRestartSettleWindow reports whether workspaceID is within
// restartSettleWindow of its most recent restart start. This widens the
// self-fire guard beyond the in-flight restart flag to cover the settle
// window right after a config-PUT restart. core#2929.
func inRestartSettleWindow(workspaceID string) bool {
sv, ok := restartStates.Load(workspaceID)
if !ok {
return false
}
state := sv.(*restartState)
state.mu.Lock()
defer state.mu.Unlock()
if state.restartStartedAt.IsZero() {
return false
}
return time.Since(state.restartStartedAt) < restartSettleWindow
}
// lastRestartStartedAt returns the timestamp recorded when the most recent
// restart cycle started for workspaceID, if any. Used by the settle-window
// heartbeat freshness check in maybeMarkContainerDead. core#2929.
func lastRestartStartedAt(workspaceID string) (time.Time, bool) {
sv, ok := restartStates.Load(workspaceID)
if !ok {
return time.Time{}, false
}
state := sv.(*restartState)
state.mu.Lock()
defer state.mu.Unlock()
if state.restartStartedAt.IsZero() {
return time.Time{}, false
}
return state.restartStartedAt, true
}
// isParentPaused checks if any ancestor of the workspace is paused.
func isParentPaused(ctx context.Context, workspaceID string) (bool, string) {
var parentID *string
@@ -117,8 +117,9 @@ func TestMaybeMarkContainerDead_SkippedWhileRestarting(t *testing.T) {
h := newSelfFireHandler(t)
h.provisioner = stub
if got := h.maybeMarkContainerDead(context.Background(), wsID); got != false {
t.Errorf("maybeMarkContainerDead must return false while restarting, got %v", got)
dead, _, _ := h.maybeMarkContainerDead(context.Background(), wsID, "", []byte("{}"), "message/send", 0, false)
if dead != false {
t.Errorf("maybeMarkContainerDead must return false while restarting, got %v", dead)
}
if stub.calls != 0 {
t.Errorf("IsRunning must not be called while restarting (Layer 1 gate broken); got %d calls", stub.calls)