fix(handlers#2929): post-config-PUT settle window + DEBOUNCE re-probe + BUSY path #2954

Closed
agent-dev-b wants to merge 1 commits from fix/2929-post-restart-settle-window into main
4 changed files with 340 additions and 66 deletions
@@ -808,7 +808,7 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
// authoritative check) but the empty-body case is what makes
// this fix necessary.
if isUpstreamDeadStatus(resp.StatusCode) {
if h.maybeMarkContainerDead(ctx, workspaceID) {
if h.maybeMarkContainerDead(ctx, workspaceID) == deadActionDead {
return 0, nil, &proxyA2AError{
Status: http.StatusServiceUnavailable,
Headers: map[string]string{"Retry-After": "15"},
@@ -14,6 +14,8 @@ import (
"net/http"
"os"
"strconv"
"sync"
"sync/atomic"
"time"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
@@ -45,6 +47,39 @@ type proxyDispatchBuildError struct{ err error }
func (e *proxyDispatchBuildError) Error() string { return e.err.Error() }
// containerDeadActionT names the maybeMarkContainerDead outcome the
// caller uses to choose its 202/503/queued path. Replaces the
// historical bool (which conflated "definitively dead" and
// "alive-but-settling"). The 3-state shape is load-bearing for the
// post-config-PUT settle window (#2929): a single IsRunning=false in
// the ~60s window after a restart completes (state.running=false
// but the new container is still booting) used to fall through to
// the declareContainerDead path and trigger a self-fire restart —
// which ec2_stop'd the just-launched instance, exactly the
// 2026-06-15 staging-boot class.
type containerDeadActionT int
const (
deadActionAlive containerDeadActionT = iota // 200 — let the caller continue
deadActionBusy // 202 queued — caller should enqueue
deadActionDead // 503 + RestartByID — caller should recycle
)
// containerPostRestartSettleWindow is the duration AFTER a restart
// completes (state.running=false) during which maybeMarkContainerDead
// treats any IsRunning=false as a settle-window artifact (busy) rather
// than a definitive dead signal. 60s covers the worst-case docker
// pull + boot + first heartbeat window on a fresh EC2 instance. The
// restart-context probe (waitForFreshHeartbeat at restart_context.go:205)
// uses the same 60s ceiling for the same reason.
const containerPostRestartSettleWindow = 60 * time.Second
// containerReProbeDelay is the time maybeMarkContainerDead waits
// between the first IsRunning=false observation and the re-probe.
// Never act on a single IsRunning=false (E2E 7c→7d staging-run
// 506813 reproduced the false-positive within a 0.7s window).
const containerReProbeDelay = 2 * time.Second
// handleA2ADispatchError translates a forward-call failure into a proxyA2AError,
// runs the reactive container-health check, and records the outcome. Busy
// targets that are successfully queued are logged as queued, not failed.
@@ -61,9 +96,9 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
log.Printf("ProxyA2A forward error: %v", err)
containerDead := h.maybeMarkContainerDead(ctx, workspaceID)
deadAction := h.maybeMarkContainerDead(ctx, workspaceID)
if containerDead {
if deadAction == deadActionDead {
if logActivity {
h.logA2AFailure(ctx, workspaceID, callerID, body, a2aMethod, err, durationMs)
}
@@ -208,14 +243,23 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
// 2. When the heartbeat is recent, require 2 consecutive dead observations
// within 30s before declaring dead.
// 3. Treat IsRunning transport errors as "assume alive" instead of dead.
func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspaceID string) bool {
// 4. (#2929) Re-probe IsRunning after a 2s delay — never act on a single
// flake (E2E 7c→7d staging-run 506813: agent PONG'd 0.7s prior then a
// single IsRunning=false arrived in the container-settle window).
// 5. (#2929) Widen the self-fire guard to cover the post-restart settle
// window — state.running flips to false at the END of the cycle, but
// the new container is still booting for ~60s after. During that
// window IsRunning=false is the EXPECTED state (the new instance
// isn't registered yet). The guard returns BUSY, which the caller
// turns into 202-queued, not 503 + RestartByID.
func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspaceID string) containerDeadActionT {
var wsRuntime string
db.DB.QueryRowContext(ctx, `SELECT COALESCE(runtime, 'claude-code') FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsRuntime)
if isExternalLikeRuntime(wsRuntime) {
return false
return deadActionAlive
}
if !h.HasProvisioner() {
return false
return deadActionAlive
}
// Restart-aware short-circuit: during the 20-30s EC2-pending window of
// an in-flight restart, the workspace's url='' and IsRunning() returns
@@ -228,15 +272,28 @@ func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspace
// restart's own provisionWorkspaceAutoSync will surface a real failure
// (markProvisionFailed) if the new container never comes up. Issue
// internal#544.
//
// #2929: WIDEN the self-fire guard to cover the post-restart SETTLE
// window — state.running flips to false at the END of the cycle
// (workspace_restart.go:682-685), but the new container is still
// booting for ~60s after. During that window IsRunning=false is the
// EXPECTED state (the new instance isn't registered yet); the OLD
// heartbeat is still in the DB (the next heartbeat is 30-60s away).
// Returning BUSY here makes the caller 202-queue (not 503 +
// RestartByID), which is the post-config-PUT settle-window fix.
if isRestarting(workspaceID) {
log.Printf("ProxyA2A: maybeMarkContainerDead skipped for %s — restart already in flight (self-fire guard)", workspaceID)
return false
return deadActionBusy
}
if h.inPostRestartSettleWindow(workspaceID) {
log.Printf("ProxyA2A: maybeMarkContainerDead skipped for %s — post-restart settle window (new container still booting)", workspaceID)
return deadActionBusy
}
// #2929: recent-heartbeat guard. A workspace that heartbeated seconds
// ago is almost certainly alive; a single proxy/transport flake should
// not kill it. We still run IsRunning below, but a recent heartbeat
// puts us on the debounced path.
// puts us on the BUSY path (caller 202-queues, no restart).
recentHeartbeat := h.hasRecentHeartbeat(ctx, workspaceID)
var running bool
@@ -254,28 +311,61 @@ func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspace
// IsRunning's contract returns (true, err) in this case so we stay
// on the alive path without triggering a restart cascade.
log.Printf("ProxyA2A: IsRunning for %s returned transient error (assuming alive): %v", workspaceID, inspectErr)
return false
return deadActionAlive
}
if running {
h.resetDeadProbe(workspaceID)
return false
return deadActionAlive
}
// Container is not running. If we have no recent heartbeat, preserve the
// pre-#2929 immediate-dead behavior so dead EC2 instances still recover
// on the first failed request (hongmingwang incident recovery path).
// #2929: DEBOUNCE — never act on a single IsRunning=false. Wait
// containerReProbeDelay, re-probe, only act if the re-probe is also
// false. E2E 7c→7d staging-run 506813 reproduced a single
// IsRunning=false within 0.7s of a healthy PONG — the re-probe
// catches this and returns BUSY (no restart).
if h.shouldReProbe(workspaceID, containerReProbeDelay) {
time.Sleep(containerReProbeDelay)
log.Printf("ProxyA2A: IsRunning for %s returned false (1st probe) — re-probing after %s", workspaceID, containerReProbeDelay)
if h.provisioner != nil {
running, inspectErr = h.provisioner.IsRunning(ctx, workspaceID)
} else {
running, inspectErr = h.cpProv.IsRunning(ctx, workspaceID)
}
if inspectErr != nil {
log.Printf("ProxyA2A: re-probe for %s returned transient error (assuming alive): %v", workspaceID, inspectErr)
return deadActionAlive
}
if running {
h.resetDeadProbe(workspaceID)
log.Printf("ProxyA2A: re-probe for %s returned running=true — false positive on 1st probe was a transport flake; container is alive", workspaceID)
return deadActionAlive
}
}
// Container is not running on BOTH probes. If we have no recent
// heartbeat, preserve the pre-#2929 immediate-dead behavior so
// dead EC2 instances still recover on the first failed request
// (hongmingwang incident recovery path).
if !recentHeartbeat {
return h.declareContainerDead(ctx, workspaceID)
if h.declareContainerDead(ctx, workspaceID) {
return deadActionDead
}
return deadActionAlive
}
// Recent heartbeat but IsRunning says not-running: debounce. Require N
// consecutive observations within the window before we believe it.
// Recent heartbeat but IsRunning says not-running: BUSY (caller
// 202-queues, no restart). The pre-#2929 behavior was a debounce
// counter that eventually declared dead after N observations — but
// in the post-config-PUT settle window the BUSY path is the right
// answer (the container IS mid-boot, just not yet visible to the
// IsRunning probe). The debounce counter stays for the
// not-in-settle-window case below.
if !h.incrementDeadProbe(workspaceID) {
log.Printf("ProxyA2A: container for %s looks dead but has recent heartbeat — debouncing (%d/%d within %s)",
workspaceID, h.deadProbeCount(workspaceID), containerDeadDebounceThreshold, containerDeadDebounceWindow)
return false
return deadActionBusy
}
return h.declareContainerDead(ctx, workspaceID)
return deadActionBusy
}
// hasRecentHeartbeat returns true if the workspace has a last_heartbeat_at
@@ -295,6 +385,61 @@ func (h *WorkspaceHandler) hasRecentHeartbeat(ctx context.Context, workspaceID s
return time.Since(*lastHB) <= recentHeartbeatWindow
}
// inPostRestartSettleWindow reports whether the workspace is in the
// ~60s settle window after a restart cycle has just completed (state.
// running=false but the new container is still booting). During this
// window IsRunning=false is the EXPECTED state and any declare-
// container-dead path would ec2_stop a still-booting instance — the
// exact 2026-06-15 staging-boot class (#2929). Returns false if the
// workspace has never been restarted (no restartStartedAt stamp).
//
// Reads the same restartStates map that isRestarting() does; the
// distinction is timing (state.running vs now-since-restartStartedAt
// < containerPostRestartSettleWindow). Together they cover the full
// "in-flight or just-completed" window.
func (h *WorkspaceHandler) inPostRestartSettleWindow(workspaceID string) bool {
sv, ok := restartStates.Load(workspaceID)
if !ok {
return false
}
state := sv.(*restartState)
state.mu.Lock()
defer state.mu.Unlock()
if state.restartStartedAt.IsZero() {
return false
}
return time.Since(state.restartStartedAt) <= containerPostRestartSettleWindow
}
// shouldReProbe returns true if a re-probe delay has elapsed since
// the LAST re-probe record (or since the start of the process if
// none). This throttles re-probes so a flapping IsRunning probe doesn't
// trigger an unbounded re-probe storm. The re-probe is per-workspace.
// Not currently in a hot loop (called only from the rare dead-probe
// path), so a sync.Map is overkill — process-local atomic is enough.
var containerLastReProbeAt sync.Map // map[workspaceID]*atomic.Int64 (unix nanos)
func (h *WorkspaceHandler) shouldReProbe(workspaceID string, delay time.Duration) bool {
// Always re-probe on the first call (no prior record) so the
// 1st false-positive case is caught. The throttle applies to
// subsequent calls.
now := time.Now().UnixNano()
v, ok := containerLastReProbeAt.Load(workspaceID)
if !ok {
var stamp atomic.Int64
stamp.Store(now)
containerLastReProbeAt.Store(workspaceID, &stamp)
return true
}
stamp := v.(*atomic.Int64)
last := stamp.Load()
if now-last < int64(delay) {
return false
}
stamp.Store(now)
return true
}
// declareContainerDead is the single point that marks a workspace offline,
// clears keys, broadcasts OFFLINE, and triggers an async restart.
func (h *WorkspaceHandler) declareContainerDead(ctx context.Context, workspaceID string) bool {
@@ -280,10 +280,16 @@ func TestProxyA2A_Upstream502_TriggersContainerDeadCheck(t *testing.T) {
// and logs as success because the dispatch did get a response).
mock.ExpectExec("INSERT INTO activity_logs").
WillReturnResult(sqlmock.NewResult(0, 1))
// maybeMarkContainerDead's runtime lookup, then the offline-flip UPDATE.
// maybeMarkContainerDead's runtime + recentHB lookups, then the
// re-probe (post-#2929 DEBOUNCE — 1st probe + re-probe both run),
// then the offline-flip UPDATE. cpProv.running=false on both probes
// (no recentHB → declares dead after the re-probe).
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs("ws-tunnel-dead").
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs("ws-tunnel-dead").
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(sql.NullTime{}))
mock.ExpectExec(`UPDATE workspaces SET status =`).
WithArgs(models.StatusOffline, "ws-tunnel-dead").
WillReturnResult(sqlmock.NewResult(0, 1))
@@ -309,8 +315,8 @@ func TestProxyA2A_Upstream502_TriggersContainerDeadCheck(t *testing.T) {
if w.Header().Get("Retry-After") != "15" {
t.Errorf("Retry-After header should be 15 to throttle canvas-side retry loop; got %q", w.Header().Get("Retry-After"))
}
if cp.calls != 1 {
t.Errorf("cpProv.IsRunning must be consulted exactly once; got %d calls", cp.calls)
if cp.calls != 2 {
t.Errorf("cpProv.IsRunning must be consulted exactly twice (initial + re-probe); got %d calls", cp.calls)
}
}
@@ -2120,7 +2126,7 @@ func TestMaybeMarkContainerDead_NilProvisioner(t *testing.T) {
WithArgs("ws-nilprov").
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("claude-code"))
if got := handler.maybeMarkContainerDead(context.Background(), "ws-nilprov"); got {
if got := handler.maybeMarkContainerDead(context.Background(), "ws-nilprov"); got == deadActionDead {
t.Error("expected false when provisioner is nil")
}
}
@@ -2139,19 +2145,27 @@ func TestMaybeMarkContainerDead_CPOnly_NotRunning(t *testing.T) {
cp := &fakeCPProv{running: false}
handler.SetCPProvisioner(cp)
// Old heartbeat (2h ago) → no recentHB → post-re-probe dead path
// takes the declareContainerDead branch. The 1st probe + re-probe
// (post-#2929 DEBOUNCE) both run, both return false, then the UPDATE
// flips status='offline'.
oldHB := time.Now().Add(-2 * time.Hour)
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs("ws-saas-dead").
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs("ws-saas-dead").
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(oldHB))
mock.ExpectExec(`UPDATE workspaces SET status =`).
WithArgs(models.StatusOffline, "ws-saas-dead").
WillReturnResult(sqlmock.NewResult(0, 1))
got := handler.maybeMarkContainerDead(context.Background(), "ws-saas-dead")
if !got {
t.Fatal("expected true (cpProv reports not running) — without cpProv consultation, SaaS dead-agent recovery is impossible")
if got != deadActionDead {
t.Fatal("expected deadActionDead (cpProv reports not running) — without cpProv consultation, SaaS dead-agent recovery is impossible")
}
if cp.calls != 1 {
t.Errorf("expected exactly 1 IsRunning call on cpProv; got %d", cp.calls)
if cp.calls != 2 {
t.Errorf("expected 2 IsRunning calls (initial + re-probe), got %d", cp.calls)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
@@ -2172,7 +2186,7 @@ func TestMaybeMarkContainerDead_CPOnly_Running(t *testing.T) {
WithArgs("ws-saas-alive").
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
if got := handler.maybeMarkContainerDead(context.Background(), "ws-saas-alive"); got {
if got := handler.maybeMarkContainerDead(context.Background(), "ws-saas-alive"); got == deadActionDead {
t.Error("expected false when cpProv reports running — must not recycle a healthy agent")
}
if cp.calls != 1 {
@@ -2241,11 +2255,16 @@ func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) {
// after the assertions ran). Tests that want to ASSERT a method is unused
// can check `calls == 0` after a sync barrier.
type fakeCPProv struct {
running bool
err error
calls int
stopCalls int
startCalls int
running bool
err error
calls int
stopCalls int
startCalls int
// reProbeRunning overrides the 2nd IsRunning call (the re-probe)
// to return this value. Tests that need a different 1st/2nd-probe
// shape (re-probe returns true while 1st probe returns false, the
// transport-flake case) set running=false + reProbeRunning=true.
reProbeRunning bool
}
func (f *fakeCPProv) Start(_ context.Context, _ provisioner.WorkspaceConfig) (string, error) {
@@ -2265,6 +2284,12 @@ func (f *fakeCPProv) GetConsoleOutput(_ context.Context, _ string) (string, erro
}
func (f *fakeCPProv) IsRunning(_ context.Context, _ string) (bool, error) {
f.calls++
if f.calls == 2 && f.reProbeRunning {
// Re-probe path: the 2nd call (the re-probe triggered by
// the post-#2929 debounce) returns reProbeRunning. Lets tests
// pin the transport-flake case (1st=false, 2nd=true → alive).
return f.reProbeRunning, f.err
}
return f.running, f.err
}
@@ -2278,15 +2303,23 @@ func TestMaybeMarkContainerDead_ExternalRuntime(t *testing.T) {
WithArgs("ws-ext").
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("external"))
if got := handler.maybeMarkContainerDead(context.Background(), "ws-ext"); got {
if got := handler.maybeMarkContainerDead(context.Background(), "ws-ext"); got == deadActionDead {
t.Error("expected false for external runtime")
}
}
// #2929: a recent heartbeat + IsRunning=false should NOT declare the
// container dead on the first observation. The second observation within
// the debounce window should.
func TestMaybeMarkContainerDead_RecentHeartbeat_Debounces(t *testing.T) {
// #2929 (post-config-PUT settle window fix): a recent heartbeat +
// IsRunning=false should NEVER declare the container dead, regardless
// of how many observations accumulate. The BUSY path (caller 202-queues)
// is the right answer: the container is either mid-boot, mid-restart,
// or just had a transient IsRunning flake — none of these warrant a
// RestartByID. Pre-#2929 behavior: 2nd observation within the debounce
// window declared dead and fired a self-fire restart, which ec2_stop'd
// the just-launched instance (the 2026-06-15 staging-boot class).
// Post-#2929 behavior: BUSY on every observation (re-probe is a separate
// sub-second check, not a counter — see TestMaybeMarkContainerDead_ReProbe
// for the re-probe shape).
func TestMaybeMarkContainerDead_RecentHeartbeat_StaysBusy(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
@@ -2295,9 +2328,9 @@ func TestMaybeMarkContainerDead_RecentHeartbeat_Debounces(t *testing.T) {
handler.SetCPProvisioner(cp)
recentHB := time.Now()
wsid := "ws-debounce"
wsid := "ws-busy-stays-busy"
// First observation: recent heartbeat, not running → debounce, no UPDATE.
// First observation: recent heartbeat, not running → BUSY (no UPDATE).
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
@@ -2305,29 +2338,29 @@ func TestMaybeMarkContainerDead_RecentHeartbeat_Debounces(t *testing.T) {
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
t.Fatal("first dead observation with recent heartbeat should be debounced, not restarted")
}
if cp.calls != 1 {
t.Errorf("first observation: expected 1 IsRunning call, got %d", cp.calls)
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got != deadActionBusy {
t.Fatal("first dead observation with recent heartbeat should be BUSY, never dead (post-#2929 settle-window fix)")
}
// Second observation within the window → threshold reached, restart.
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
mock.ExpectExec(`UPDATE workspaces SET status =`).
WithArgs(models.StatusOffline, wsid).
WillReturnResult(sqlmock.NewResult(0, 1))
if got := handler.maybeMarkContainerDead(context.Background(), wsid); !got {
t.Fatal("second dead observation with recent heartbeat should declare dead")
}
if cp.calls != 2 {
t.Errorf("second observation: expected 2 IsRunning calls total, got %d", cp.calls)
// 2nd, 3rd, 4th observation — still BUSY. Recent heartbeat + IsRunning=false
// is a settle-window artifact or a transport flake, not a definitive
// dead signal. The caller 202-queues, the runtime settles, the next
// heartbeat arrives; the BUSY path drains itself. We use a fresh
// recentHB on every observation so the re-probe's 2s sleep doesn't
// push us outside recentHeartbeatWindow (15s) and accidentally
// degrade BUSY → DEAD in the test — that path is covered by a
// separate test (old heartbeat → DEAD after re-probe).
for i := 0; i < 3; i++ {
freshHB := time.Now() // refresh: simulate the container's next heartbeat
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(freshHB))
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got != deadActionBusy {
t.Fatalf("observation %d: still expect BUSY (not dead), got %v", i+2, got)
}
}
}
@@ -2348,11 +2381,107 @@ func TestMaybeMarkContainerDead_InspectErr_AssumesAlive(t *testing.T) {
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(sql.NullTime{}))
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got == deadActionDead {
t.Error("transient IsRunning error should be treated as alive, not dead")
}
}
// #2929: post-config-PUT settle window — a recent heartbeat AND
// a fresh restartStartedAt stamp (state.running=false but a restart
// just completed) → BUSY, NOT dead. This is the exact shape that
// triggered the 2026-06-15 staging-boot class (E2E 7c→7d, job 506813):
// the container was alive but mid-boot after a config.yaml-PUT restart,
// the post-restart settle window hadn't elapsed, and a single
// IsRunning=false arrived in that window. Pre-fix: ec2_stop'd the
// just-launched instance. Post-fix (#2929): BUSY, caller 202-queues.
func TestMaybeMarkContainerDead_PostRestartSettleWindow_StaysBusy(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
cp := &fakeCPProv{running: false}
handler.SetCPProvisioner(cp)
waitForHandlerAsyncBeforeDBCleanup(t, handler)
wsid := "ws-settle-busy"
// Stamp a fresh restartStartedAt (the runtime-stamp written by
// RestartByID at workspace_restart.go:668 — the in-flight cycle
// already cleared state.running at this point, so we're in the
// POST-restart settle window, not the IN-FLIGHT window).
sv, ok := restartStates.Load(wsid)
if !ok {
// Bootstrap the per-workspace restartState (Reset clears on
// identity mismatch, Load does not).
sv = &restartState{running: false}
restartStates.Store(wsid, sv)
}
state := sv.(*restartState)
state.mu.Lock()
state.restartStartedAt = time.Now() // FRESH — restart just completed
state.running = false // cycle is done, but container is mid-boot
state.mu.Unlock()
// Query path: ONLY the runtime lookup. The post-restart settle
// window check fires BEFORE the recent-heartbeat check (the
// wider of the two self-fire guards; recent-heartbeat is the
// narrower "old code" check, settle-window is the post-#2929
// wider check), so no recentHB query is issued when the settle
// window catches the call. The mock setup is intentionally
// MINIMAL to pin the early-return behavior.
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
// IsRunning is NOT consulted (we return BUSY at the settle-window
// guard before reaching the IsRunning branch). The caller 202-queues,
// the container settles, the next heartbeat arrives. No UPDATE, no
// RestartByID, no ec2_stop on a still-booting instance.
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got != deadActionBusy {
t.Fatalf("post-restart settle window: expected deadActionBusy (caller 202-queues), got %v", got)
}
// Critical assertion: the OFFLINE-flip UPDATE must NOT have fired.
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations — the recentHB query + OFFLINE-flip UPDATE must NOT have fired in the settle window: %v", err)
}
}
// #2929: RE-PROBE — a single IsRunning=false is treated as a transport
// flake. We re-probe after a 2s delay; if the re-probe returns true,
// the first false is forgiven. E2E 7c→7d staging-run 506813 reproduced
// the false-positive within a 0.7s window of a healthy PONG.
func TestMaybeMarkContainerDead_ReProbe_TransportFlake(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
cp := &fakeCPProv{running: false, reProbeRunning: true}
handler.SetCPProvisioner(cp)
waitForHandlerAsyncBeforeDBCleanup(t, handler)
wsid := "ws-reprobe-flake"
// Old heartbeat (NOT recent). The post-#2929 path is: 1st probe
// false → re-probe. If re-probe is true, alive. If re-probe is
// also false, declare dead.
oldHB := time.Now().Add(-2 * time.Hour)
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(oldHB))
// 1st probe: IsRunning returns running=false. The re-probe then
// runs (cp.reProbeRunning=true). Both IsRunning calls happen; the
// re-probe returns true → alive. No UPDATE fires.
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got != deadActionAlive {
t.Fatalf("re-probe returned true: expected deadActionAlive (transport flake was benign), got %v", got)
}
// Critical: the re-probe actually fired. cp.calls counts BOTH probes.
if cp.calls != 2 {
t.Errorf("expected 2 IsRunning calls (initial + re-probe), got %d", cp.calls)
}
}
// #2929: a successful IsRunning=true observation resets any accumulated
// dead-probe counter.
func TestMaybeMarkContainerDead_RunningTrue_ResetsDeadProbe(t *testing.T) {
@@ -2371,7 +2500,7 @@ func TestMaybeMarkContainerDead_RunningTrue_ResetsDeadProbe(t *testing.T) {
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got == deadActionDead {
t.Fatal("expected first observation to be debounced")
}
@@ -2383,7 +2512,7 @@ func TestMaybeMarkContainerDead_RunningTrue_ResetsDeadProbe(t *testing.T) {
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got == deadActionDead {
t.Error("running=true should reset dead probe and not restart")
}
@@ -2395,7 +2524,7 @@ func TestMaybeMarkContainerDead_RunningTrue_ResetsDeadProbe(t *testing.T) {
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
WithArgs(wsid).
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got == deadActionDead {
t.Fatal("expected dead probe to have been reset; first post-reset observation should be debounced")
}
}
@@ -117,8 +117,8 @@ func TestMaybeMarkContainerDead_SkippedWhileRestarting(t *testing.T) {
h := newSelfFireHandler(t)
h.provisioner = stub
if got := h.maybeMarkContainerDead(context.Background(), wsID); got != false {
t.Errorf("maybeMarkContainerDead must return false while restarting, got %v", got)
if got := h.maybeMarkContainerDead(context.Background(), wsID); got != deadActionBusy {
t.Errorf("maybeMarkContainerDead must return deadActionBusy while restarting, got %v", got)
}
if stub.calls != 0 {
t.Errorf("IsRunning must not be called while restarting (Layer 1 gate broken); got %d calls", stub.calls)