fix(registry): auto-recover failed/provisioning workspaces on successful heartbeat (extracted from #1664)

When a workspace is marked "failed" or "provisioning" but is actively sending heartbeats, transition it to "online". Otherwise, transient boot failures or mid-setup provisioner crashes leave workspaces stuck in a stale "failed" or "provisioning" status even after they become healthy. Preserves the existing online/degraded/offline transitions; only adds a new conditional branch for the failed/provisioning case, with a guarded WHERE clause so that a concurrent delete cannot flip 'removed' back to 'online'.
2026-04-22 20:00:26 -07:00 · 2026-04-22 20:00:26 -07:00 · 7c81b081d2
commit 7c81b081d2
parent 32555a884a
1 changed files with 16 additions and 0 deletions
--- a/workspace-server/internal/handlers/registry.go
+++ b/workspace-server/internal/handlers/registry.go
@@ -444,6 +444,22 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
 		}
 		h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_ONLINE", payload.WorkspaceID, map[string]interface{}{})
 	}
+
+	// Auto-recovery: if a workspace is marked "failed" or "provisioning" but is
+	// actively sending heartbeats, it has clearly booted successfully. Transition
+	// to "online" so the scheduler and dashboard reflect reality. This catches
+	// cases where the provisioner crashed mid-setup or an earlier error left the
+	// status stale.
+	if currentStatus == "failed" || currentStatus == "provisioning" {
+		if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'online', updated_at = now() WHERE id = $1 AND status IN ('failed', 'provisioning')`, payload.WorkspaceID); err != nil {
+			log.Printf("Heartbeat: failed to auto-recover %s from %s to online: %v", payload.WorkspaceID, currentStatus, err)
+		} else {
+			log.Printf("Heartbeat: auto-recovered %s from %s to online (heartbeat received)", payload.WorkspaceID, currentStatus)
+		}
+		h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_ONLINE", payload.WorkspaceID, map[string]interface{}{
+			"recovered_from": currentStatus,
+		})
+	}
 }

 // UpdateCard handles POST /registry/update-card