From f001a4cf5e62526af63e0ef95c08eb60d18a6fbf Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 23 Apr 2026 11:34:10 -0700 Subject: [PATCH] =?UTF-8?q?fix(registry):=20heartbeat=20transitions=20prov?= =?UTF-8?q?isioning=E2=86=92online=20on=20first=20heartbeat=20(#1784)=20(#?= =?UTF-8?q?1794)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workspaces restart with status='provisioning' and never transition to 'online' because the runtime never calls /registry/register after container start — only the heartbeat loop runs post-boot. The heartbeat handler had dedicated transitions for online→degraded, degraded→online, and offline→online, but handled provisioning only through a combined failed/provisioning auto-recovery branch, leaving newly-started workspaces in a phantom-idle state where the scheduler defers dispatch and the A2A proxy rejects them even though they're running fine. Fix: replace that combined branch with an explicit provisioning→online transition in evaluateStatus() (note: this removes the failed→online auto-recovery path), guarded by `AND status = 'provisioning'` in the UPDATE WHERE clause so a concurrent Delete cannot flip 'removed' back to 'online'. Broadcasts WORKSPACE_ONLINE with recovered_from='provisioning' so dashboard/scheduler reflect reality. Add TestHeartbeatHandler_ProvisioningToOnline to cover the new path. 
Issue: Molecule-AI/molecule-core#1784 Co-authored-by: Molecule AI Core-BE Co-authored-by: molecule-ai[bot] <276602405+molecule-ai[bot]@users.noreply.github.com> --- .../internal/handlers/registry.go | 19 +++---- .../internal/handlers/registry_test.go | 50 +++++++++++++++++++ 2 files changed, 60 insertions(+), 9 deletions(-) diff --git a/workspace-server/internal/handlers/registry.go b/workspace-server/internal/handlers/registry.go index 97ef8537..4e3d6675 100644 --- a/workspace-server/internal/handlers/registry.go +++ b/workspace-server/internal/handlers/registry.go @@ -451,16 +451,17 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_ONLINE", payload.WorkspaceID, map[string]interface{}{}) } - // Auto-recovery: if a workspace is marked "failed" or "provisioning" but is - // actively sending heartbeats, it has clearly booted successfully. Transition - // to "online" so the scheduler and dashboard reflect reality. This catches - // cases where the provisioner crashed mid-setup or an earlier error left the - // status stale. - if currentStatus == "failed" || currentStatus == "provisioning" { - if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'online', updated_at = now() WHERE id = $1 AND status IN ('failed', 'provisioning')`, payload.WorkspaceID); err != nil { - log.Printf("Heartbeat: failed to auto-recover %s from %s to online: %v", payload.WorkspaceID, currentStatus, err) + // Auto-recovery: if a workspace is marked "provisioning" but is actively sending + // heartbeats, it has successfully started up. Transition to "online" so the scheduler + // and A2A proxy can dispatch tasks to it. The provisioner does not call + // /registry/register on container start — only the heartbeat loop does, so this + // transition is the only mechanism that moves newly-started workspaces out of + // the phantom-idle state. 
(#1784) + if currentStatus == "provisioning" { + if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'online', updated_at = now() WHERE id = $1 AND status = 'provisioning'`, payload.WorkspaceID); err != nil { + log.Printf("Heartbeat: failed to transition %s from provisioning to online: %v", payload.WorkspaceID, err) } else { - log.Printf("Heartbeat: auto-recovered %s from %s to online (heartbeat received)", payload.WorkspaceID, currentStatus) + log.Printf("Heartbeat: transitioned %s from provisioning to online (heartbeat received)", payload.WorkspaceID) } h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_ONLINE", payload.WorkspaceID, map[string]interface{}{ "recovered_from": currentStatus, diff --git a/workspace-server/internal/handlers/registry_test.go b/workspace-server/internal/handlers/registry_test.go index 791f07dc..4d2cb904 100644 --- a/workspace-server/internal/handlers/registry_test.go +++ b/workspace-server/internal/handlers/registry_test.go @@ -134,6 +134,56 @@ func TestHeartbeatHandler_OfflineToOnline(t *testing.T) { } } +// ==================== Heartbeat — provisioning → online recovery (#1784) ==================== + +func TestHeartbeatHandler_ProvisioningToOnline(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewRegistryHandler(broadcaster) + + // Expect prevTask SELECT + mock.ExpectQuery("SELECT COALESCE\\(current_task"). + WithArgs("ws-provisioning"). + WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + + // Expect heartbeat UPDATE + mock.ExpectExec("UPDATE workspaces SET"). + WithArgs("ws-provisioning", 0.0, "", 1, 3000, ""). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // Expect evaluateStatus SELECT — currently provisioning + mock.ExpectQuery("SELECT status FROM workspaces WHERE id ="). + WithArgs("ws-provisioning"). 
+ WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("provisioning")) + + // Expect status transition to online (#1784) + mock.ExpectExec("UPDATE workspaces SET status = 'online'"). + WithArgs("ws-provisioning"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // Expect RecordAndBroadcast INSERT for WORKSPACE_ONLINE + mock.ExpectExec("INSERT INTO structure_events"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + + body := `{"workspace_id":"ws-provisioning","error_rate":0.0,"sample_error":"","active_tasks":1,"uptime_seconds":3000}` + c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Heartbeat(c) + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String()) + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + func TestHeartbeatHandler_BadJSON(t *testing.T) { setupTestDB(t) setupTestRedis(t)