diff --git a/workspace-server/internal/handlers/handlers_additional_test.go b/workspace-server/internal/handlers/handlers_additional_test.go index 16d76304..d34ee8a5 100644 --- a/workspace-server/internal/handlers/handlers_additional_test.go +++ b/workspace-server/internal/handlers/handlers_additional_test.go @@ -1456,3 +1456,74 @@ func TestSecretsSet_InvalidJSON(t *testing.T) { t.Errorf("expected 400, got %d: %s", w.Code, w.Body.String()) } } + +// TestHeartbeatHandler_RegisterFailureClearedOnCardBearingRestart is core#2739: +// a RESTARTED workspace already has a persisted agent_card, so the NULL-scoped +// agent_card backfill (and its bundled register-failure clear) never fires. +// The restarted container's authenticated /registry/register can 400 with +// url_validate_failed (Docker-internal hostname unresolvable from the platform), +// which stamps last_register_failure_at. Before the fix, that marker stuck and +// evaluateStatus held the workspace 'degraded' past the Local Provision +// Lifecycle restart-survival window (run 358593). The fix clears the marker on +// ANY card-bearing heartbeat (the live card proves the runtime is reachable), +// so a heartbeating restarted workspace recovers degraded->online promptly. +// +// This test asserts the decoupled clear runs even when agent_card already +// exists (backfill affects 0 rows) and that recovery then proceeds. +func TestHeartbeatHandler_RegisterFailureClearedOnCardBearingRestart(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewRegistryHandler(broadcaster) + + // prevTask SELECT + mock.ExpectQuery("SELECT COALESCE\\(current_task"). + WithArgs("ws-2739"). + WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + + // heartbeat UPDATE + mock.ExpectExec("UPDATE workspaces SET"). + WithArgs("ws-2739", 0.0, "", 0, 120, ""). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // agent_card backfill — restart case: card ALREADY exists, 0 rows affected. + mock.ExpectExec("UPDATE workspaces\\s+SET agent_card = \\$2"). + WithArgs("ws-2739", sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 0)) + + // core#2739 decoupled clear — fires on the card-bearing heartbeat even + // though the backfill above changed nothing. 1 row: the stale marker is wiped. + mock.ExpectExec("UPDATE workspaces\\s+SET last_register_failure_at = NULL\\s+WHERE id = \\$1 AND last_register_failure_at IS NOT NULL"). + WithArgs("ws-2739"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // evaluateStatus SELECT — workspace is degraded; failure marker is now NULL + // (the clear above wiped it in the real DB), so recovery is unblocked. + mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + WithArgs("ws-2739"). + WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil)) + + // degraded -> online recovery + mock.ExpectExec("UPDATE workspaces SET status ="). + WithArgs(models.StatusOnline, "ws-2739"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // WORKSPACE_ONLINE broadcast + mock.ExpectExec("INSERT INTO structure_events"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + body := `{"workspace_id":"ws-2739","error_rate":0.0,"sample_error":"","active_tasks":0,"uptime_seconds":120,"agent_card":{"name":"restarted-agent","url":"http://ws-2739:8000"}}` + c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Heartbeat(c) + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} diff --git a/workspace-server/internal/handlers/registry.go b/workspace-server/internal/handlers/registry.go index 62a804c5..5cc3590e 100644 --- a/workspace-server/internal/handlers/registry.go +++ b/workspace-server/internal/handlers/registry.go @@ -820,26 +820,52 @@ func (h *RegistryHandler) Heartbeat(c *gin.Context) { // heartbeat carries it. Only writes when NULL — never overwrites a // reconciled or updated card. This is the recovery path for fast-cloud // workspaces whose DNS wasn't ready at first register. - // - // #2659/#2665: also clear last_register_failure_at on this recovery, - // otherwise evaluateStatus keeps the workspace stuck in 'degraded' - // forever (degraded→online recovery requires no recent register failure). if len(payload.AgentCard) > 0 { res, err := db.DB.ExecContext(ctx, ` UPDATE workspaces - SET agent_card = $2, - last_register_failure_at = NULL + SET agent_card = $2 WHERE id = $1 AND agent_card IS NULL `, payload.WorkspaceID, payload.AgentCard) if err != nil { log.Printf("Registry heartbeat: agent_card backfill failed for %s: %v", payload.WorkspaceID, err) } else { if rows, _ := res.RowsAffected(); rows > 0 { - log.Printf("Registry heartbeat: backfilled agent_card and cleared register-failure marker for %s (initial register had failed)", payload.WorkspaceID) + log.Printf("Registry heartbeat: backfilled agent_card for %s (initial register had failed)", payload.WorkspaceID) } } } + // #2659/#2665/#2739: clear last_register_failure_at whenever a heartbeat + // carries a valid agent_card — NOT only on the agent_card-was-NULL backfill + // path above. A heartbeat with a card proves the runtime is alive and + // re-advertising the SAME reachable card the platform already trusts, so any + // recent register-400 was transient and must not keep the workspace pinned + // in 'degraded' until the 5-minute failure window ages out. + // + // #2739: on RESTART (vs first provision) the agent_card row is ALREADY + // populated, so the NULL-scoped backfill never fires and never cleared the + // marker. The restarted container's authenticated /registry/register can + // 400 with url_validate_failed (its Docker-internal hostname e.g. + // "212851b5693d" is not resolvable from the platform), stamping + // last_register_failure_at; with the card already present the marker stuck + // and evaluateStatus held the workspace degraded past the Local Provision + // Lifecycle restart-survival window (run 358593). Credentials are correctly + // projected post-restart (core#2709/#2712); this is purely the degraded->online + // recovery gap. Clearing on a card-bearing heartbeat is the same trust signal + // the success-on-register clear (register handler) relies on. + if len(payload.AgentCard) > 0 { + res, err := db.DB.ExecContext(ctx, ` + UPDATE workspaces + SET last_register_failure_at = NULL + WHERE id = $1 AND last_register_failure_at IS NOT NULL + `, payload.WorkspaceID) + if err != nil { + log.Printf("Registry heartbeat: clear register-failure marker failed for %s: %v", payload.WorkspaceID, err) + } else if rows, _ := res.RowsAffected(); rows > 0 { + log.Printf("Registry heartbeat: cleared register-failure marker for %s (live card-bearing heartbeat after a transient register-400)", payload.WorkspaceID) + } + } + // Refresh Redis TTL if err := db.RefreshTTL(ctx, payload.WorkspaceID); err != nil { log.Printf("Heartbeat redis error: %v", err)