fix(workspace): recover status from "failed" on live heartbeat #2414

Merged
core-devops merged 1 commit from fix/recover-workspace-from-failed into main 2026-06-07 22:48:22 +00:00
2 changed files with 71 additions and 0 deletions
@@ -869,6 +869,29 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
})
}
// Auto-recovery from failed: the provision-timeout sweeper
// (registry/provisiontimeout.go) flips a workspace to 'failed' when it sits
// in 'provisioning' past DefaultProvisioningTimeout (10m for claude-code).
// But a slow cold-boot (EC2 image pull + LLM preflight) can still finish and
// start heartbeating AFTER the flip — agent_card is written unconditionally
// on register, so the box is genuinely serving while its status is stuck
// 'failed'. A live heartbeat is authoritative: recover to online. Without
// this, a healthy-but-slow workspace (e.g. a model that preflights slower
// than the 10m budget) shows 'failed' forever despite working — the
// mechanism behind the intermittent multi-provider e2e "boot failures".
//
// The `AND status = 'failed'` guard keeps the flip conditional (won't
// override 'removed'). currentStatus was read by an earlier SELECT, so the
// row may have changed by the time the UPDATE runs; the online event is
// therefore recorded and broadcast only when the UPDATE succeeded and
// actually matched a row. Announcing "online" after a failed or no-op UPDATE
// would leave subscribers disagreeing with the database.
if currentStatus == "failed" {
	res, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = $1, updated_at = now() WHERE id = $2 AND status = 'failed'`, models.StatusOnline, payload.WorkspaceID)
	if err != nil {
		log.Printf("Heartbeat: failed to recover %s from failed: %v", payload.WorkspaceID, err)
	} else if n, raErr := res.RowsAffected(); raErr == nil && n == 0 {
		// Lost the race: the status moved away from 'failed' between the
		// SELECT and this UPDATE, so nothing was recovered.
		log.Printf("Heartbeat: skipped recovery of %s: status is no longer failed", payload.WorkspaceID)
	} else {
		// Either a row was updated, or the driver could not report the
		// affected-row count; in the latter case trust the successful
		// UPDATE, matching the previous behavior.
		log.Printf("Heartbeat: transitioned %s from failed to online (late heartbeat after provision-timeout)", payload.WorkspaceID)
		h.broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceOnline), payload.WorkspaceID, map[string]interface{}{
			"recovered_from": currentStatus,
		})
	}
}
// #1870 Phase 1: drain one queued A2A request if the target reports
// spare capacity. The heartbeat's active_tasks field reflects what the
// workspace runtime is ACTUALLY running right now, independent of
@@ -193,6 +193,54 @@ func TestHeartbeatHandler_ProvisioningToOnline(t *testing.T) {
}
}
// ==================== Heartbeat — failed → online recovery (#616 follow-up) ====================

// TestHeartbeatHandler_FailedToOnline checks that a heartbeat arriving for a
// workspace whose stored status is 'failed' drives the failed → online
// UPDATE and the follow-up structure_events INSERT, and that the handler
// still answers 200. The scenario modelled is a workspace that the
// provision-timeout sweeper marked failed but that finished booting late:
// the live heartbeat shows the agent is up, so the flip was premature.
func TestHeartbeatHandler_FailedToOnline(t *testing.T) {
	const workspaceID = "ws-failed"

	sqlMock := setupTestDB(t)
	setupTestRedis(t)
	registry := NewRegistryHandler(newTestBroadcaster())

	// Expectations are registered in the order the handler issues them.

	// 1. Lookup of the previously stored current_task.
	taskRows := sqlmock.NewRows([]string{"current_task"}).AddRow("")
	sqlMock.ExpectQuery("SELECT COALESCE\\(current_task").
		WithArgs(workspaceID).
		WillReturnRows(taskRows)

	// 2. Heartbeat metrics write.
	sqlMock.ExpectExec("UPDATE workspaces SET").
		WithArgs(workspaceID, 0.0, "", 1, 3000, "").
		WillReturnResult(sqlmock.NewResult(0, 1))

	// 3. Status read: the row is currently 'failed' (sweeper flip).
	statusRows := sqlmock.NewRows([]string{"status"}).AddRow("failed")
	sqlMock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
		WithArgs(workspaceID).
		WillReturnRows(statusRows)

	// 4. The recovery transition under test: failed → online.
	sqlMock.ExpectExec("UPDATE workspaces SET status =").
		WithArgs(models.StatusOnline, workspaceID).
		WillReturnResult(sqlmock.NewResult(0, 1))

	// 5. Event row written for the transition.
	sqlMock.ExpectExec("INSERT INTO structure_events").
		WillReturnResult(sqlmock.NewResult(0, 1))

	// Build and dispatch the heartbeat request.
	heartbeatJSON := `{"workspace_id":"ws-failed","error_rate":0.0,"sample_error":"","active_tasks":1,"uptime_seconds":3000}`
	req := httptest.NewRequest(http.MethodPost, "/registry/heartbeat", bytes.NewBufferString(heartbeatJSON))
	req.Header.Set("Content-Type", "application/json")

	rec := httptest.NewRecorder()
	ginCtx, _ := gin.CreateTestContext(rec)
	ginCtx.Request = req

	registry.Heartbeat(ginCtx)

	if got := rec.Code; got != http.StatusOK {
		t.Errorf("expected status 200, got %d: %s", got, rec.Body.String())
	}

	if unmet := sqlMock.ExpectationsWereMet(); unmet != nil {
		t.Errorf("unmet sqlmock expectations: %v", unmet)
	}
}
// ==================== Heartbeat — awaiting_agent → online recovery ====================
// External workspaces flip to 'awaiting_agent' via healthsweep when their
// heartbeat goes stale. When the operator's poller comes back, heartbeat