diff --git a/workspace-server/internal/handlers/registry.go b/workspace-server/internal/handlers/registry.go
index 047b8ea1d..2856e5ab0 100644
--- a/workspace-server/internal/handlers/registry.go
+++ b/workspace-server/internal/handlers/registry.go
@@ -869,6 +869,35 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
 		})
 	}
 
+	// Auto-recovery from failed: the provision-timeout sweeper
+	// (registry/provisiontimeout.go) flips a workspace to 'failed' when it sits
+	// in 'provisioning' past DefaultProvisioningTimeout (10m for claude-code).
+	// But a slow cold-boot (EC2 image pull + LLM preflight) can still finish and
+	// start heartbeating AFTER the flip — agent_card is written unconditionally
+	// on register, so the box is genuinely serving while its status is stuck
+	// 'failed'. A live heartbeat is authoritative: recover to online. Without
+	// this, a healthy-but-slow workspace (e.g. a model that preflights slower
+	// than the 10m budget) shows 'failed' forever despite working — the
+	// mechanism behind the intermittent multi-provider e2e "boot failures". The
+	// `AND status = 'failed'` guard keeps the flip conditional (won't override
+	// 'removed'). The online event is broadcast only when that UPDATE changed a
+	// row: after a DB error, or when the status moved off 'failed' between the
+	// SELECT and the UPDATE, the workspace was not recovered here and announcing
+	// it as online would contradict the stored status.
+	if currentStatus == "failed" {
+		res, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = $1, updated_at = now() WHERE id = $2 AND status = 'failed'`, models.StatusOnline, payload.WorkspaceID)
+		if err != nil {
+			log.Printf("Heartbeat: failed to recover %s from failed: %v", payload.WorkspaceID, err)
+		} else if n, raErr := res.RowsAffected(); raErr != nil {
+			log.Printf("Heartbeat: could not confirm recovery of %s from failed: %v", payload.WorkspaceID, raErr)
+		} else if n > 0 {
+			log.Printf("Heartbeat: transitioned %s from failed to online (late heartbeat after provision-timeout)", payload.WorkspaceID)
+			h.broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceOnline), payload.WorkspaceID, map[string]interface{}{
+				"recovered_from": currentStatus,
+			})
+		}
+	}
+
 	// #1870 Phase 1: drain one queued A2A request if the target reports
 	// spare capacity. 
The heartbeat's active_tasks field reflects what the
 	// workspace runtime is ACTUALLY running right now, independent of
diff --git a/workspace-server/internal/handlers/registry_test.go b/workspace-server/internal/handlers/registry_test.go
index a8407b956..51640051a 100644
--- a/workspace-server/internal/handlers/registry_test.go
+++ b/workspace-server/internal/handlers/registry_test.go
@@ -193,6 +193,62 @@ func TestHeartbeatHandler_ProvisioningToOnline(t *testing.T) {
 	}
 }
 
+// ==================== Heartbeat — failed → online recovery (#616 follow-up) ====================
+
+// TestHeartbeatHandler_FailedToOnline sends one heartbeat for a workspace whose
+// stored status is 'failed' (the provision-timeout sweeper's flip) and expects
+// the handler to run the failed → online UPDATE, insert a structure event, and
+// respond 200. A live heartbeat proves the agent booted, just more slowly than
+// the 10m budget, so the timeout flip was premature.
+func TestHeartbeatHandler_FailedToOnline(t *testing.T) {
+	const wsID = "ws-failed"
+
+	sqlMock := setupTestDB(t)
+	setupTestRedis(t)
+	registry := NewRegistryHandler(newTestBroadcaster())
+
+	// Expected first: the current_task lookup for this workspace.
+	sqlMock.ExpectQuery("SELECT COALESCE\\(current_task").
+		WithArgs(wsID).
+		WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
+
+	// Then the heartbeat UPDATE on the workspace row.
+	sqlMock.ExpectExec("UPDATE workspaces SET").
+		WithArgs(wsID, 0.0, "", 1, 3000, "").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// evaluateStatus SELECT: the stored status is 'failed'.
+	sqlMock.ExpectQuery("SELECT status FROM workspaces WHERE id =").
+		WithArgs(wsID).
+		WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("failed"))
+
+	// The transition under test: failed → online.
+	sqlMock.ExpectExec("UPDATE workspaces SET status =").
+		WithArgs(models.StatusOnline, wsID).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// Finally the structure_events insert.
+	sqlMock.ExpectExec("INSERT INTO structure_events").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	heartbeatJSON := `{"workspace_id":"ws-failed","error_rate":0.0,"sample_error":"","active_tasks":1,"uptime_seconds":3000}`
+	req := httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(heartbeatJSON))
+	req.Header.Set("Content-Type", "application/json")
+
+	rec := httptest.NewRecorder()
+	ginCtx, _ := gin.CreateTestContext(rec)
+	ginCtx.Request = req
+
+	registry.Heartbeat(ginCtx)
+
+	if got := rec.Code; got != http.StatusOK {
+		t.Errorf("expected status 200, got %d: %s", got, rec.Body.String())
+	}
+	if err := sqlMock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
 // ==================== Heartbeat — awaiting_agent → online recovery ====================
 // External workspaces flip to 'awaiting_agent' via healthsweep when their
 // heartbeat goes stale. When the operator's poller comes back, heartbeat