fix(handlers#2929): post-config-PUT settle window + DEBOUNCE re-probe + BUSY path #2954
@@ -808,7 +808,7 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
|
||||
// authoritative check) but the empty-body case is what makes
|
||||
// this fix necessary.
|
||||
if isUpstreamDeadStatus(resp.StatusCode) {
|
||||
if h.maybeMarkContainerDead(ctx, workspaceID) {
|
||||
if h.maybeMarkContainerDead(ctx, workspaceID) == deadActionDead {
|
||||
return 0, nil, &proxyA2AError{
|
||||
Status: http.StatusServiceUnavailable,
|
||||
Headers: map[string]string{"Retry-After": "15"},
|
||||
|
||||
@@ -14,6 +14,8 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"strconv"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
|
||||
@@ -45,6 +47,39 @@ type proxyDispatchBuildError struct{ err error }
|
||||
|
||||
func (e *proxyDispatchBuildError) Error() string { return e.err.Error() }
|
||||
|
||||
// containerDeadActionT names the maybeMarkContainerDead outcome the
|
||||
// caller uses to choose its 202/503/queued path. Replaces the
|
||||
// historical bool (which conflated "definitively dead" and
|
||||
// "alive-but-settling"). The 3-state shape is load-bearing for the
|
||||
// post-config-PUT settle window (#2929): a single IsRunning=false in
|
||||
// the ~60s window after a restart completes (state.running=false
|
||||
// but the new container is still booting) used to fall through to
|
||||
// the declareContainerDead path and trigger a self-fire restart —
|
||||
// which ec2_stop'd the just-launched instance, exactly the
|
||||
// 2026-06-15 staging-boot class.
|
||||
type containerDeadActionT int
|
||||
|
||||
const (
|
||||
deadActionAlive containerDeadActionT = iota // 200 — let the caller continue
|
||||
deadActionBusy // 202 queued — caller should enqueue
|
||||
deadActionDead // 503 + RestartByID — caller should recycle
|
||||
)
|
||||
|
||||
// containerPostRestartSettleWindow is the duration AFTER a restart
|
||||
// completes (state.running=false) during which maybeMarkContainerDead
|
||||
// treats any IsRunning=false as a settle-window artifact (busy) rather
|
||||
// than a definitive dead signal. 60s covers the worst-case docker
|
||||
// pull + boot + first heartbeat window on a fresh EC2 instance. The
|
||||
// restart-context probe (waitForFreshHeartbeat at restart_context.go:205)
|
||||
// uses the same 60s ceiling for the same reason.
|
||||
const containerPostRestartSettleWindow = 60 * time.Second
|
||||
|
||||
// containerReProbeDelay is the time maybeMarkContainerDead waits
|
||||
// between the first IsRunning=false observation and the re-probe.
|
||||
// Never act on a single IsRunning=false (E2E 7c→7d staging-run
|
||||
// 506813 reproduced the false-positive within a 0.7s window).
|
||||
const containerReProbeDelay = 2 * time.Second
|
||||
|
||||
// handleA2ADispatchError translates a forward-call failure into a proxyA2AError,
|
||||
// runs the reactive container-health check, and records the outcome. Busy
|
||||
// targets that are successfully queued are logged as queued, not failed.
|
||||
@@ -61,9 +96,9 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
|
||||
|
||||
log.Printf("ProxyA2A forward error: %v", err)
|
||||
|
||||
containerDead := h.maybeMarkContainerDead(ctx, workspaceID)
|
||||
deadAction := h.maybeMarkContainerDead(ctx, workspaceID)
|
||||
|
||||
if containerDead {
|
||||
if deadAction == deadActionDead {
|
||||
if logActivity {
|
||||
h.logA2AFailure(ctx, workspaceID, callerID, body, a2aMethod, err, durationMs)
|
||||
}
|
||||
@@ -208,14 +243,23 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
|
||||
// 2. When the heartbeat is recent, require 2 consecutive dead observations
|
||||
// within 30s before declaring dead.
|
||||
// 3. Treat IsRunning transport errors as "assume alive" instead of dead.
|
||||
func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspaceID string) bool {
|
||||
// 4. (#2929) Re-probe IsRunning after a 2s delay — never act on a single
|
||||
// flake (E2E 7c→7d staging-run 506813: agent PONG'd 0.7s prior then a
|
||||
// single IsRunning=false arrived in the container-settle window).
|
||||
// 5. (#2929) Widen the self-fire guard to cover the post-restart settle
|
||||
// window — state.running flips to false at the END of the cycle, but
|
||||
// the new container is still booting for ~60s after. During that
|
||||
// window IsRunning=false is the EXPECTED state (the new instance
|
||||
// isn't registered yet). The guard returns BUSY, which the caller
|
||||
// turns into 202-queued, not 503 + RestartByID.
|
||||
func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspaceID string) containerDeadActionT {
|
||||
var wsRuntime string
|
||||
db.DB.QueryRowContext(ctx, `SELECT COALESCE(runtime, 'claude-code') FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsRuntime)
|
||||
if isExternalLikeRuntime(wsRuntime) {
|
||||
return false
|
||||
return deadActionAlive
|
||||
}
|
||||
if !h.HasProvisioner() {
|
||||
return false
|
||||
return deadActionAlive
|
||||
}
|
||||
// Restart-aware short-circuit: during the 20-30s EC2-pending window of
|
||||
// an in-flight restart, the workspace's url='' and IsRunning() returns
|
||||
@@ -228,15 +272,28 @@ func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspace
|
||||
// restart's own provisionWorkspaceAutoSync will surface a real failure
|
||||
// (markProvisionFailed) if the new container never comes up. Issue
|
||||
// internal#544.
|
||||
//
|
||||
// #2929: WIDEN the self-fire guard to cover the post-restart SETTLE
|
||||
// window — state.running flips to false at the END of the cycle
|
||||
// (workspace_restart.go:682-685), but the new container is still
|
||||
// booting for ~60s after. During that window IsRunning=false is the
|
||||
// EXPECTED state (the new instance isn't registered yet); the OLD
|
||||
// heartbeat is still in the DB (the next heartbeat is 30-60s away).
|
||||
// Returning BUSY here makes the caller 202-queue (not 503 +
|
||||
// RestartByID), which is the post-config-PUT settle-window fix.
|
||||
if isRestarting(workspaceID) {
|
||||
log.Printf("ProxyA2A: maybeMarkContainerDead skipped for %s — restart already in flight (self-fire guard)", workspaceID)
|
||||
return false
|
||||
return deadActionBusy
|
||||
}
|
||||
if h.inPostRestartSettleWindow(workspaceID) {
|
||||
log.Printf("ProxyA2A: maybeMarkContainerDead skipped for %s — post-restart settle window (new container still booting)", workspaceID)
|
||||
return deadActionBusy
|
||||
}
|
||||
|
||||
// #2929: recent-heartbeat guard. A workspace that heartbeated seconds
|
||||
// ago is almost certainly alive; a single proxy/transport flake should
|
||||
// not kill it. We still run IsRunning below, but a recent heartbeat
|
||||
// puts us on the debounced path.
|
||||
// puts us on the BUSY path (caller 202-queues, no restart).
|
||||
recentHeartbeat := h.hasRecentHeartbeat(ctx, workspaceID)
|
||||
|
||||
var running bool
|
||||
@@ -254,28 +311,61 @@ func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspace
|
||||
// IsRunning's contract returns (true, err) in this case so we stay
|
||||
// on the alive path without triggering a restart cascade.
|
||||
log.Printf("ProxyA2A: IsRunning for %s returned transient error (assuming alive): %v", workspaceID, inspectErr)
|
||||
return false
|
||||
return deadActionAlive
|
||||
}
|
||||
if running {
|
||||
h.resetDeadProbe(workspaceID)
|
||||
return false
|
||||
return deadActionAlive
|
||||
}
|
||||
|
||||
// Container is not running. If we have no recent heartbeat, preserve the
|
||||
// pre-#2929 immediate-dead behavior so dead EC2 instances still recover
|
||||
// on the first failed request (hongmingwang incident recovery path).
|
||||
// #2929: DEBOUNCE — never act on a single IsRunning=false. Wait
|
||||
// containerReProbeDelay, re-probe, only act if the re-probe is also
|
||||
// false. E2E 7c→7d staging-run 506813 reproduced a single
|
||||
// IsRunning=false within 0.7s of a healthy PONG — the re-probe
|
||||
// catches this and returns BUSY (no restart).
|
||||
if h.shouldReProbe(workspaceID, containerReProbeDelay) {
|
||||
time.Sleep(containerReProbeDelay)
|
||||
log.Printf("ProxyA2A: IsRunning for %s returned false (1st probe) — re-probing after %s", workspaceID, containerReProbeDelay)
|
||||
if h.provisioner != nil {
|
||||
running, inspectErr = h.provisioner.IsRunning(ctx, workspaceID)
|
||||
} else {
|
||||
running, inspectErr = h.cpProv.IsRunning(ctx, workspaceID)
|
||||
}
|
||||
if inspectErr != nil {
|
||||
log.Printf("ProxyA2A: re-probe for %s returned transient error (assuming alive): %v", workspaceID, inspectErr)
|
||||
return deadActionAlive
|
||||
}
|
||||
if running {
|
||||
h.resetDeadProbe(workspaceID)
|
||||
log.Printf("ProxyA2A: re-probe for %s returned running=true — false positive on 1st probe was a transport flake; container is alive", workspaceID)
|
||||
return deadActionAlive
|
||||
}
|
||||
}
|
||||
|
||||
// Container is not running on BOTH probes. If we have no recent
|
||||
// heartbeat, preserve the pre-#2929 immediate-dead behavior so
|
||||
// dead EC2 instances still recover on the first failed request
|
||||
// (hongmingwang incident recovery path).
|
||||
if !recentHeartbeat {
|
||||
return h.declareContainerDead(ctx, workspaceID)
|
||||
if h.declareContainerDead(ctx, workspaceID) {
|
||||
return deadActionDead
|
||||
}
|
||||
return deadActionAlive
|
||||
}
|
||||
|
||||
// Recent heartbeat but IsRunning says not-running: debounce. Require N
|
||||
// consecutive observations within the window before we believe it.
|
||||
// Recent heartbeat but IsRunning says not-running: BUSY (caller
|
||||
// 202-queues, no restart). The pre-#2929 behavior was a debounce
|
||||
// counter that eventually declared dead after N observations — but
|
||||
// in the post-config-PUT settle window the BUSY path is the right
|
||||
// answer (the container IS mid-boot, just not yet visible to the
|
||||
// IsRunning probe). The debounce counter stays for the
|
||||
// not-in-settle-window case below.
|
||||
if !h.incrementDeadProbe(workspaceID) {
|
||||
log.Printf("ProxyA2A: container for %s looks dead but has recent heartbeat — debouncing (%d/%d within %s)",
|
||||
workspaceID, h.deadProbeCount(workspaceID), containerDeadDebounceThreshold, containerDeadDebounceWindow)
|
||||
return false
|
||||
return deadActionBusy
|
||||
}
|
||||
return h.declareContainerDead(ctx, workspaceID)
|
||||
return deadActionBusy
|
||||
}
|
||||
|
||||
// hasRecentHeartbeat returns true if the workspace has a last_heartbeat_at
|
||||
@@ -295,6 +385,61 @@ func (h *WorkspaceHandler) hasRecentHeartbeat(ctx context.Context, workspaceID s
|
||||
return time.Since(*lastHB) <= recentHeartbeatWindow
|
||||
}
|
||||
|
||||
// inPostRestartSettleWindow reports whether the workspace is in the
|
||||
// ~60s settle window after a restart cycle has just completed (state.
|
||||
// running=false but the new container is still booting). During this
|
||||
// window IsRunning=false is the EXPECTED state and any declare-
|
||||
// container-dead path would ec2_stop a still-booting instance — the
|
||||
// exact 2026-06-15 staging-boot class (#2929). Returns false if the
|
||||
// workspace has never been restarted (no restartStartedAt stamp).
|
||||
//
|
||||
// Reads the same restartStates map that isRestarting() does; the
|
||||
// distinction is timing (state.running vs now-since-restartStartedAt
|
||||
// < containerPostRestartSettleWindow). Together they cover the full
|
||||
// "in-flight or just-completed" window.
|
||||
func (h *WorkspaceHandler) inPostRestartSettleWindow(workspaceID string) bool {
|
||||
sv, ok := restartStates.Load(workspaceID)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
state := sv.(*restartState)
|
||||
state.mu.Lock()
|
||||
defer state.mu.Unlock()
|
||||
if state.restartStartedAt.IsZero() {
|
||||
return false
|
||||
}
|
||||
return time.Since(state.restartStartedAt) <= containerPostRestartSettleWindow
|
||||
}
|
||||
|
||||
// shouldReProbe returns true if a re-probe delay has elapsed since
|
||||
// the LAST re-probe record (or since the start of the process if
|
||||
// none). This throttles re-probes so a flapping IsRunning probe doesn't
|
||||
// trigger an unbounded re-probe storm. The re-probe is per-workspace.
|
||||
// Not currently in a hot loop (called only from the rare dead-probe
|
||||
// path), so a sync.Map is overkill — process-local atomic is enough.
|
||||
var containerLastReProbeAt sync.Map // map[workspaceID]*atomic.Int64 (unix nanos)
|
||||
|
||||
func (h *WorkspaceHandler) shouldReProbe(workspaceID string, delay time.Duration) bool {
|
||||
// Always re-probe on the first call (no prior record) so the
|
||||
// 1st false-positive case is caught. The throttle applies to
|
||||
// subsequent calls.
|
||||
now := time.Now().UnixNano()
|
||||
v, ok := containerLastReProbeAt.Load(workspaceID)
|
||||
if !ok {
|
||||
var stamp atomic.Int64
|
||||
stamp.Store(now)
|
||||
containerLastReProbeAt.Store(workspaceID, &stamp)
|
||||
return true
|
||||
}
|
||||
stamp := v.(*atomic.Int64)
|
||||
last := stamp.Load()
|
||||
if now-last < int64(delay) {
|
||||
return false
|
||||
}
|
||||
stamp.Store(now)
|
||||
return true
|
||||
}
|
||||
|
||||
// declareContainerDead is the single point that marks a workspace offline,
|
||||
// clears keys, broadcasts OFFLINE, and triggers an async restart.
|
||||
func (h *WorkspaceHandler) declareContainerDead(ctx context.Context, workspaceID string) bool {
|
||||
|
||||
@@ -280,10 +280,16 @@ func TestProxyA2A_Upstream502_TriggersContainerDeadCheck(t *testing.T) {
|
||||
// and logs as success because the dispatch did get a response).
|
||||
mock.ExpectExec("INSERT INTO activity_logs").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
// maybeMarkContainerDead's runtime lookup, then the offline-flip UPDATE.
|
||||
// maybeMarkContainerDead's runtime + recentHB lookups, then the
|
||||
// re-probe (post-#2929 DEBOUNCE — 1st probe + re-probe both run),
|
||||
// then the offline-flip UPDATE. cpProv.running=false on both probes
|
||||
// (no recentHB → declares dead after the re-probe).
|
||||
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
|
||||
WithArgs("ws-tunnel-dead").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
|
||||
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
|
||||
WithArgs("ws-tunnel-dead").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(sql.NullTime{}))
|
||||
mock.ExpectExec(`UPDATE workspaces SET status =`).
|
||||
WithArgs(models.StatusOffline, "ws-tunnel-dead").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
@@ -309,8 +315,8 @@ func TestProxyA2A_Upstream502_TriggersContainerDeadCheck(t *testing.T) {
|
||||
if w.Header().Get("Retry-After") != "15" {
|
||||
t.Errorf("Retry-After header should be 15 to throttle canvas-side retry loop; got %q", w.Header().Get("Retry-After"))
|
||||
}
|
||||
if cp.calls != 1 {
|
||||
t.Errorf("cpProv.IsRunning must be consulted exactly once; got %d calls", cp.calls)
|
||||
if cp.calls != 2 {
|
||||
t.Errorf("cpProv.IsRunning must be consulted exactly twice (initial + re-probe); got %d calls", cp.calls)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2120,7 +2126,7 @@ func TestMaybeMarkContainerDead_NilProvisioner(t *testing.T) {
|
||||
WithArgs("ws-nilprov").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("claude-code"))
|
||||
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), "ws-nilprov"); got {
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), "ws-nilprov"); got == deadActionDead {
|
||||
t.Error("expected false when provisioner is nil")
|
||||
}
|
||||
}
|
||||
@@ -2139,19 +2145,27 @@ func TestMaybeMarkContainerDead_CPOnly_NotRunning(t *testing.T) {
|
||||
cp := &fakeCPProv{running: false}
|
||||
handler.SetCPProvisioner(cp)
|
||||
|
||||
// Old heartbeat (2h ago) → no recentHB → post-re-probe dead path
|
||||
// takes the declareContainerDead branch. The 1st probe + re-probe
|
||||
// (post-#2929 DEBOUNCE) both run, both return false, then the UPDATE
|
||||
// flips status='offline'.
|
||||
oldHB := time.Now().Add(-2 * time.Hour)
|
||||
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
|
||||
WithArgs("ws-saas-dead").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
|
||||
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
|
||||
WithArgs("ws-saas-dead").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(oldHB))
|
||||
mock.ExpectExec(`UPDATE workspaces SET status =`).
|
||||
WithArgs(models.StatusOffline, "ws-saas-dead").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
got := handler.maybeMarkContainerDead(context.Background(), "ws-saas-dead")
|
||||
if !got {
|
||||
t.Fatal("expected true (cpProv reports not running) — without cpProv consultation, SaaS dead-agent recovery is impossible")
|
||||
if got != deadActionDead {
|
||||
t.Fatal("expected deadActionDead (cpProv reports not running) — without cpProv consultation, SaaS dead-agent recovery is impossible")
|
||||
}
|
||||
if cp.calls != 1 {
|
||||
t.Errorf("expected exactly 1 IsRunning call on cpProv; got %d", cp.calls)
|
||||
if cp.calls != 2 {
|
||||
t.Errorf("expected 2 IsRunning calls (initial + re-probe), got %d", cp.calls)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
@@ -2172,7 +2186,7 @@ func TestMaybeMarkContainerDead_CPOnly_Running(t *testing.T) {
|
||||
WithArgs("ws-saas-alive").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
|
||||
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), "ws-saas-alive"); got {
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), "ws-saas-alive"); got == deadActionDead {
|
||||
t.Error("expected false when cpProv reports running — must not recycle a healthy agent")
|
||||
}
|
||||
if cp.calls != 1 {
|
||||
@@ -2241,11 +2255,16 @@ func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) {
|
||||
// after the assertions ran). Tests that want to ASSERT a method is unused
|
||||
// can check `calls == 0` after a sync barrier.
|
||||
type fakeCPProv struct {
|
||||
running bool
|
||||
err error
|
||||
calls int
|
||||
stopCalls int
|
||||
startCalls int
|
||||
running bool
|
||||
err error
|
||||
calls int
|
||||
stopCalls int
|
||||
startCalls int
|
||||
// reProbeRunning overrides the 2nd IsRunning call (the re-probe)
|
||||
// to return this value. Tests that need a different 1st/2nd-probe
|
||||
// shape (re-probe returns true while 1st probe returns false, the
|
||||
// transport-flake case) set running=false + reProbeRunning=true.
|
||||
reProbeRunning bool
|
||||
}
|
||||
|
||||
func (f *fakeCPProv) Start(_ context.Context, _ provisioner.WorkspaceConfig) (string, error) {
|
||||
@@ -2265,6 +2284,12 @@ func (f *fakeCPProv) GetConsoleOutput(_ context.Context, _ string) (string, erro
|
||||
}
|
||||
func (f *fakeCPProv) IsRunning(_ context.Context, _ string) (bool, error) {
|
||||
f.calls++
|
||||
if f.calls == 2 && f.reProbeRunning {
|
||||
// Re-probe path: the 2nd call (the re-probe triggered by
|
||||
// the post-#2929 debounce) returns reProbeRunning. Lets tests
|
||||
// pin the transport-flake case (1st=false, 2nd=true → alive).
|
||||
return f.reProbeRunning, f.err
|
||||
}
|
||||
return f.running, f.err
|
||||
}
|
||||
|
||||
@@ -2278,15 +2303,23 @@ func TestMaybeMarkContainerDead_ExternalRuntime(t *testing.T) {
|
||||
WithArgs("ws-ext").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("external"))
|
||||
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), "ws-ext"); got {
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), "ws-ext"); got == deadActionDead {
|
||||
t.Error("expected false for external runtime")
|
||||
}
|
||||
}
|
||||
|
||||
// #2929: a recent heartbeat + IsRunning=false should NOT declare the
|
||||
// container dead on the first observation. The second observation within
|
||||
// the debounce window should.
|
||||
func TestMaybeMarkContainerDead_RecentHeartbeat_Debounces(t *testing.T) {
|
||||
// #2929 (post-config-PUT settle window fix): a recent heartbeat +
|
||||
// IsRunning=false should NEVER declare the container dead, regardless
|
||||
// of how many observations accumulate. The BUSY path (caller 202-queues)
|
||||
// is the right answer: the container is either mid-boot, mid-restart,
|
||||
// or just had a transient IsRunning flake — none of these warrant a
|
||||
// RestartByID. Pre-#2929 behavior: 2nd observation within the debounce
|
||||
// window declared dead and fired a self-fire restart, which ec2_stop'd
|
||||
// the just-launched instance (the 2026-06-15 staging-boot class).
|
||||
// Post-#2929 behavior: BUSY on every observation (re-probe is a separate
|
||||
// sub-second check, not a counter — see TestMaybeMarkContainerDead_ReProbe
|
||||
// for the re-probe shape).
|
||||
func TestMaybeMarkContainerDead_RecentHeartbeat_StaysBusy(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
|
||||
@@ -2295,9 +2328,9 @@ func TestMaybeMarkContainerDead_RecentHeartbeat_Debounces(t *testing.T) {
|
||||
handler.SetCPProvisioner(cp)
|
||||
|
||||
recentHB := time.Now()
|
||||
wsid := "ws-debounce"
|
||||
wsid := "ws-busy-stays-busy"
|
||||
|
||||
// First observation: recent heartbeat, not running → debounce, no UPDATE.
|
||||
// First observation: recent heartbeat, not running → BUSY (no UPDATE).
|
||||
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
|
||||
WithArgs(wsid).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
|
||||
@@ -2305,29 +2338,29 @@ func TestMaybeMarkContainerDead_RecentHeartbeat_Debounces(t *testing.T) {
|
||||
WithArgs(wsid).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
|
||||
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
|
||||
t.Fatal("first dead observation with recent heartbeat should be debounced, not restarted")
|
||||
}
|
||||
if cp.calls != 1 {
|
||||
t.Errorf("first observation: expected 1 IsRunning call, got %d", cp.calls)
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got != deadActionBusy {
|
||||
t.Fatal("first dead observation with recent heartbeat should be BUSY, never dead (post-#2929 settle-window fix)")
|
||||
}
|
||||
|
||||
// Second observation within the window → threshold reached, restart.
|
||||
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
|
||||
WithArgs(wsid).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
|
||||
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
|
||||
WithArgs(wsid).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
|
||||
mock.ExpectExec(`UPDATE workspaces SET status =`).
|
||||
WithArgs(models.StatusOffline, wsid).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), wsid); !got {
|
||||
t.Fatal("second dead observation with recent heartbeat should declare dead")
|
||||
}
|
||||
if cp.calls != 2 {
|
||||
t.Errorf("second observation: expected 2 IsRunning calls total, got %d", cp.calls)
|
||||
// 2nd, 3rd, 4th observation — still BUSY. Recent heartbeat + IsRunning=false
|
||||
// is a settle-window artifact or a transport flake, not a definitive
|
||||
// dead signal. The caller 202-queues, the runtime settles, the next
|
||||
// heartbeat arrives; the BUSY path drains itself. We use a fresh
|
||||
// recentHB on every observation so the re-probe's 2s sleep doesn't
|
||||
// push us outside recentHeartbeatWindow (15s) and accidentally
|
||||
// degrade BUSY → DEAD in the test — that path is covered by a
|
||||
// separate test (old heartbeat → DEAD after re-probe).
|
||||
for i := 0; i < 3; i++ {
|
||||
freshHB := time.Now() // refresh: simulate the container's next heartbeat
|
||||
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
|
||||
WithArgs(wsid).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
|
||||
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
|
||||
WithArgs(wsid).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(freshHB))
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got != deadActionBusy {
|
||||
t.Fatalf("observation %d: still expect BUSY (not dead), got %v", i+2, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2348,11 +2381,107 @@ func TestMaybeMarkContainerDead_InspectErr_AssumesAlive(t *testing.T) {
|
||||
WithArgs(wsid).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(sql.NullTime{}))
|
||||
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got == deadActionDead {
|
||||
t.Error("transient IsRunning error should be treated as alive, not dead")
|
||||
}
|
||||
}
|
||||
|
||||
// #2929: post-config-PUT settle window — a recent heartbeat AND
|
||||
// a fresh restartStartedAt stamp (state.running=false but a restart
|
||||
// just completed) → BUSY, NOT dead. This is the exact shape that
|
||||
// triggered the 2026-06-15 staging-boot class (E2E 7c→7d, job 506813):
|
||||
// the container was alive but mid-boot after a config.yaml-PUT restart,
|
||||
// the post-restart settle window hadn't elapsed, and a single
|
||||
// IsRunning=false arrived in that window. Pre-fix: ec2_stop'd the
|
||||
// just-launched instance. Post-fix (#2929): BUSY, caller 202-queues.
|
||||
func TestMaybeMarkContainerDead_PostRestartSettleWindow_StaysBusy(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
|
||||
cp := &fakeCPProv{running: false}
|
||||
handler.SetCPProvisioner(cp)
|
||||
waitForHandlerAsyncBeforeDBCleanup(t, handler)
|
||||
|
||||
wsid := "ws-settle-busy"
|
||||
|
||||
// Stamp a fresh restartStartedAt (the runtime-stamp written by
|
||||
// RestartByID at workspace_restart.go:668 — the in-flight cycle
|
||||
// already cleared state.running at this point, so we're in the
|
||||
// POST-restart settle window, not the IN-FLIGHT window).
|
||||
sv, ok := restartStates.Load(wsid)
|
||||
if !ok {
|
||||
// Bootstrap the per-workspace restartState (Reset clears on
|
||||
// identity mismatch, Load does not).
|
||||
sv = &restartState{running: false}
|
||||
restartStates.Store(wsid, sv)
|
||||
}
|
||||
state := sv.(*restartState)
|
||||
state.mu.Lock()
|
||||
state.restartStartedAt = time.Now() // FRESH — restart just completed
|
||||
state.running = false // cycle is done, but container is mid-boot
|
||||
state.mu.Unlock()
|
||||
|
||||
// Query path: ONLY the runtime lookup. The post-restart settle
|
||||
// window check fires BEFORE the recent-heartbeat check (the
|
||||
// wider of the two self-fire guards; recent-heartbeat is the
|
||||
// narrower "old code" check, settle-window is the post-#2929
|
||||
// wider check), so no recentHB query is issued when the settle
|
||||
// window catches the call. The mock setup is intentionally
|
||||
// MINIMAL to pin the early-return behavior.
|
||||
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
|
||||
WithArgs(wsid).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
|
||||
|
||||
// IsRunning is NOT consulted (we return BUSY at the settle-window
|
||||
// guard before reaching the IsRunning branch). The caller 202-queues,
|
||||
// the container settles, the next heartbeat arrives. No UPDATE, no
|
||||
// RestartByID, no ec2_stop on a still-booting instance.
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got != deadActionBusy {
|
||||
t.Fatalf("post-restart settle window: expected deadActionBusy (caller 202-queues), got %v", got)
|
||||
}
|
||||
// Critical assertion: the OFFLINE-flip UPDATE must NOT have fired.
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations — the recentHB query + OFFLINE-flip UPDATE must NOT have fired in the settle window: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// #2929: RE-PROBE — a single IsRunning=false is treated as a transport
|
||||
// flake. We re-probe after a 2s delay; if the re-probe returns true,
|
||||
// the first false is forgiven. E2E 7c→7d staging-run 506813 reproduced
|
||||
// the false-positive within a 0.7s window of a healthy PONG.
|
||||
func TestMaybeMarkContainerDead_ReProbe_TransportFlake(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
|
||||
cp := &fakeCPProv{running: false, reProbeRunning: true}
|
||||
handler.SetCPProvisioner(cp)
|
||||
waitForHandlerAsyncBeforeDBCleanup(t, handler)
|
||||
|
||||
wsid := "ws-reprobe-flake"
|
||||
|
||||
// Old heartbeat (NOT recent). The post-#2929 path is: 1st probe
|
||||
// false → re-probe. If re-probe is true, alive. If re-probe is
|
||||
// also false, declare dead.
|
||||
oldHB := time.Now().Add(-2 * time.Hour)
|
||||
mock.ExpectQuery(`SELECT COALESCE\(runtime, 'claude-code'\) FROM workspaces WHERE id =`).
|
||||
WithArgs(wsid).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
|
||||
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
|
||||
WithArgs(wsid).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(oldHB))
|
||||
|
||||
// 1st probe: IsRunning returns running=false. The re-probe then
|
||||
// runs (cp.reProbeRunning=true). Both IsRunning calls happen; the
|
||||
// re-probe returns true → alive. No UPDATE fires.
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got != deadActionAlive {
|
||||
t.Fatalf("re-probe returned true: expected deadActionAlive (transport flake was benign), got %v", got)
|
||||
}
|
||||
// Critical: the re-probe actually fired. cp.calls counts BOTH probes.
|
||||
if cp.calls != 2 {
|
||||
t.Errorf("expected 2 IsRunning calls (initial + re-probe), got %d", cp.calls)
|
||||
}
|
||||
}
|
||||
|
||||
// #2929: a successful IsRunning=true observation resets any accumulated
|
||||
// dead-probe counter.
|
||||
func TestMaybeMarkContainerDead_RunningTrue_ResetsDeadProbe(t *testing.T) {
|
||||
@@ -2371,7 +2500,7 @@ func TestMaybeMarkContainerDead_RunningTrue_ResetsDeadProbe(t *testing.T) {
|
||||
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
|
||||
WithArgs(wsid).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got == deadActionDead {
|
||||
t.Fatal("expected first observation to be debounced")
|
||||
}
|
||||
|
||||
@@ -2383,7 +2512,7 @@ func TestMaybeMarkContainerDead_RunningTrue_ResetsDeadProbe(t *testing.T) {
|
||||
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
|
||||
WithArgs(wsid).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got == deadActionDead {
|
||||
t.Error("running=true should reset dead probe and not restart")
|
||||
}
|
||||
|
||||
@@ -2395,7 +2524,7 @@ func TestMaybeMarkContainerDead_RunningTrue_ResetsDeadProbe(t *testing.T) {
|
||||
mock.ExpectQuery(`SELECT last_heartbeat_at FROM workspaces WHERE id =`).
|
||||
WithArgs(wsid).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"last_heartbeat_at"}).AddRow(recentHB))
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got {
|
||||
if got := handler.maybeMarkContainerDead(context.Background(), wsid); got == deadActionDead {
|
||||
t.Fatal("expected dead probe to have been reset; first post-reset observation should be debounced")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -117,8 +117,8 @@ func TestMaybeMarkContainerDead_SkippedWhileRestarting(t *testing.T) {
|
||||
h := newSelfFireHandler(t)
|
||||
h.provisioner = stub
|
||||
|
||||
if got := h.maybeMarkContainerDead(context.Background(), wsID); got != false {
|
||||
t.Errorf("maybeMarkContainerDead must return false while restarting, got %v", got)
|
||||
if got := h.maybeMarkContainerDead(context.Background(), wsID); got != deadActionBusy {
|
||||
t.Errorf("maybeMarkContainerDead must return deadActionBusy while restarting, got %v", got)
|
||||
}
|
||||
if stub.calls != 0 {
|
||||
t.Errorf("IsRunning must not be called while restarting (Layer 1 gate broken); got %d calls", stub.calls)
|
||||
|
||||
Reference in New Issue
Block a user