From dd3dad7952bd90062f41f1eb4692cb655b1a5308 Mon Sep 17 00:00:00 2001
From: "Molecule AI Dev Engineer A (Kimi)"
Date: Fri, 12 Jun 2026 20:46:30 +0000
Subject: [PATCH] fix(registry): clear last_register_failure_at on healthy
 heartbeat agent_card backfill

A transient /registry/register failure stamps last_register_failure_at and
forces the workspace to degraded. When a subsequent heartbeat carries a valid
agent_card and backfills the missing card, it is now a healthy recovery path,
so clear last_register_failure_at in the same UPDATE. Without this,
evaluateStatus keeps the workspace stuck degraded forever.

Relates-to: #2659 #2665
---
 .../internal/handlers/registry.go             |  9 ++-
 .../internal/handlers/registry_test.go        | 56 +++++++++++++++++++
 2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/workspace-server/internal/handlers/registry.go b/workspace-server/internal/handlers/registry.go
index 6c286ecd..5940f7a4 100644
--- a/workspace-server/internal/handlers/registry.go
+++ b/workspace-server/internal/handlers/registry.go
@@ -784,17 +784,22 @@ func (h *RegistryHandler) Heartbeat(c *gin.Context) {
 	// heartbeat carries it. Only writes when NULL — never overwrites a
 	// reconciled or updated card. This is the recovery path for fast-cloud
 	// workspaces whose DNS wasn't ready at first register.
+	//
+	// #2659/#2665: also clear last_register_failure_at on this recovery,
+	// otherwise evaluateStatus keeps the workspace stuck in 'degraded'
+	// forever (degraded→online recovery requires no recent register failure).
 	if len(payload.AgentCard) > 0 {
 		res, err := db.DB.ExecContext(ctx, `
 			UPDATE workspaces
-			SET agent_card = $2
+			SET agent_card = $2,
+			    last_register_failure_at = NULL
 			WHERE id = $1 AND agent_card IS NULL
 		`, payload.WorkspaceID, payload.AgentCard)
 		if err != nil {
 			log.Printf("Registry heartbeat: agent_card backfill failed for %s: %v", payload.WorkspaceID, err)
 		} else {
 			if rows, _ := res.RowsAffected(); rows > 0 {
-				log.Printf("Registry heartbeat: backfilled agent_card for %s (initial register had failed)", payload.WorkspaceID)
+				log.Printf("Registry heartbeat: backfilled agent_card and cleared register-failure marker for %s (initial register had failed)", payload.WorkspaceID)
 			}
 		}
 	}
diff --git a/workspace-server/internal/handlers/registry_test.go b/workspace-server/internal/handlers/registry_test.go
index 1a309949..127cc3e4 100644
--- a/workspace-server/internal/handlers/registry_test.go
+++ b/workspace-server/internal/handlers/registry_test.go
@@ -942,6 +942,62 @@ func TestHeartbeatHandler_SkipsAgentCardBackfill_WhenAlreadySet(t *testing.T) {
 	}
 }
 
+// TestHeartbeatHandler_BackfillAgentCard_ClearsRegisterFailure verifies the
+// #2659/#2665 recovery path: a healthy heartbeat that backfills a missing
+// agent_card also clears last_register_failure_at, so a workspace that was
+// previously forced degraded by a transient register failure can recover to
+// online instead of staying stuck degraded forever.
+func TestHeartbeatHandler_BackfillAgentCard_ClearsRegisterFailure(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewRegistryHandler(broadcaster)
+
+	mock.ExpectQuery("SELECT COALESCE\\(current_task").
+		WithArgs("ws-degraded-register-fail").
+		WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
+
+	mock.ExpectExec("UPDATE workspaces SET").
+		WithArgs("ws-degraded-register-fail", 0.0, "", 0, 0, "").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// The backfill UPDATE must ALSO clear last_register_failure_at; match the
+	// full SET clause so this test fails if the clear is ever dropped.
+	mock.ExpectExec("UPDATE workspaces SET agent_card = \\$2, last_register_failure_at = NULL").
+		WithArgs("ws-degraded-register-fail", sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// Status check sees degraded, but last_register_failure_at is now NULL because
+	// the agent_card backfill UPDATE cleared it.
+	mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
+		WithArgs("ws-degraded-register-fail").
+		WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow(models.StatusDegraded, nil))
+
+	// Because the failure marker was cleared by the backfill, evaluateStatus
+	// should now recover the workspace to online.
+	mock.ExpectExec("UPDATE workspaces SET status =").
+		WithArgs(models.StatusOnline, "ws-degraded-register-fail").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+	mock.ExpectExec("INSERT INTO structure_events").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+
+	body := `{"workspace_id":"ws-degraded-register-fail","agent_card":{"name":"recovered"}}`
+	c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.Heartbeat(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
 // ------------------------------------------------------------
 // validateAgentURL (C6 SSRF fix)
 // ------------------------------------------------------------
-- 
2.52.0