fix(workspace): recover status from "failed" on live heartbeat #2414
@@ -869,6 +869,29 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
|
||||
})
|
||||
}
|
||||
|
||||
// Auto-recovery from failed: the provision-timeout sweeper
|
||||
// (registry/provisiontimeout.go) flips a workspace to 'failed' when it sits
|
||||
// in 'provisioning' past DefaultProvisioningTimeout (10m for claude-code).
|
||||
// But a slow cold-boot (EC2 image pull + LLM preflight) can still finish and
|
||||
// start heartbeating AFTER the flip — agent_card is written unconditionally
|
||||
// on register, so the box is genuinely serving while its status is stuck
|
||||
// 'failed'. A live heartbeat is authoritative: recover to online. Without
|
||||
// this, a healthy-but-slow workspace (e.g. a model that preflights slower
|
||||
// than the 10m budget) shows 'failed' forever despite working — the
|
||||
// mechanism behind the intermittent multi-provider e2e "boot failures". The
|
||||
// `AND status = 'failed'` guard keeps the flip conditional (won't override
|
||||
// 'removed').
|
||||
if currentStatus == "failed" {
|
||||
if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = $1, updated_at = now() WHERE id = $2 AND status = 'failed'`, models.StatusOnline, payload.WorkspaceID); err != nil {
|
||||
log.Printf("Heartbeat: failed to recover %s from failed: %v", payload.WorkspaceID, err)
|
||||
} else {
|
||||
log.Printf("Heartbeat: transitioned %s from failed to online (late heartbeat after provision-timeout)", payload.WorkspaceID)
|
||||
}
|
||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceOnline), payload.WorkspaceID, map[string]interface{}{
|
||||
"recovered_from": currentStatus,
|
||||
})
|
||||
}
|
||||
|
||||
// #1870 Phase 1: drain one queued A2A request if the target reports
|
||||
// spare capacity. The heartbeat's active_tasks field reflects what the
|
||||
// workspace runtime is ACTUALLY running right now, independent of
|
||||
|
||||
@@ -193,6 +193,54 @@ func TestHeartbeatHandler_ProvisioningToOnline(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== Heartbeat — failed → online recovery (#616 follow-up) ====================
|
||||
|
||||
// A workspace flipped to 'failed' by the provision-timeout sweeper must recover
|
||||
// to 'online' once it starts heartbeating: a live heartbeat proves the agent
|
||||
// booted (just slowly, past the 10m budget), so the timeout flip was premature.
|
||||
func TestHeartbeatHandler_FailedToOnline(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewRegistryHandler(broadcaster)
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-failed").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
WithArgs("ws-failed", 0.0, "", 1, 3000, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// evaluateStatus SELECT — currently failed (provision-timeout sweeper flip)
|
||||
mock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
|
||||
WithArgs("ws-failed").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("failed"))
|
||||
|
||||
// the new failed → online recovery transition
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
WithArgs(models.StatusOnline, "ws-failed").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectExec("INSERT INTO structure_events").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
body := `{"workspace_id":"ws-failed","error_rate":0.0,"sample_error":"","active_tasks":1,"uptime_seconds":3000}`
|
||||
c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.Heartbeat(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== Heartbeat — awaiting_agent → online recovery ====================
|
||||
// External workspaces flip to 'awaiting_agent' via healthsweep when their
|
||||
// heartbeat goes stale. When the operator's poller comes back, heartbeat
|
||||
|
||||
Reference in New Issue
Block a user