fix(workspace): re-project BYOK Anthropic creds on restart/recreate (#2739/#2712) #2741
@@ -1456,3 +1456,74 @@ func TestSecretsSet_InvalidJSON(t *testing.T) {
|
||||
t.Errorf("expected 400, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// TestHeartbeatHandler_RegisterFailureClearedOnCardBearingRestart is core#2739:
|
||||
// a RESTARTED workspace already has a persisted agent_card, so the NULL-scoped
|
||||
// agent_card backfill (and its bundled register-failure clear) never fires.
|
||||
// The restarted container's authenticated /registry/register can 400 with
|
||||
// url_validate_failed (Docker-internal hostname unresolvable from the platform),
|
||||
// which stamps last_register_failure_at. Before the fix, that marker stuck and
|
||||
// evaluateStatus held the workspace 'degraded' past the Local Provision
|
||||
// Lifecycle restart-survival window (run 358593). The fix clears the marker on
|
||||
// ANY card-bearing heartbeat (the live card proves the runtime is reachable),
|
||||
// so a heartbeating restarted workspace recovers degraded->online promptly.
|
||||
//
|
||||
// This test asserts the decoupled clear runs even when agent_card already
|
||||
// exists (backfill affects 0 rows) and that recovery then proceeds.
|
||||
func TestHeartbeatHandler_RegisterFailureClearedOnCardBearingRestart(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewRegistryHandler(broadcaster)
|
||||
|
||||
// prevTask SELECT
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-2739").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
|
||||
// heartbeat UPDATE
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
WithArgs("ws-2739", 0.0, "", 0, 120, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// agent_card backfill — restart case: card ALREADY exists, 0 rows affected.
|
||||
mock.ExpectExec("UPDATE workspaces\\s+SET agent_card = \\$2").
|
||||
WithArgs("ws-2739", sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 0))
|
||||
|
||||
// core#2739 decoupled clear — fires on the card-bearing heartbeat even
|
||||
// though the backfill above changed nothing. 1 row: the stale marker is wiped.
|
||||
mock.ExpectExec("UPDATE workspaces\\s+SET last_register_failure_at = NULL\\s+WHERE id = \\$1 AND last_register_failure_at IS NOT NULL").
|
||||
WithArgs("ws-2739").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// evaluateStatus SELECT — workspace is degraded; failure marker is now NULL
|
||||
// (the clear above wiped it in the real DB), so recovery is unblocked.
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-2739").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
|
||||
|
||||
// degraded -> online recovery
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
WithArgs(models.StatusOnline, "ws-2739").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// WORKSPACE_ONLINE broadcast
|
||||
mock.ExpectExec("INSERT INTO structure_events").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
body := `{"workspace_id":"ws-2739","error_rate":0.0,"sample_error":"","active_tasks":0,"uptime_seconds":120,"agent_card":{"name":"restarted-agent","url":"http://ws-2739:8000"}}`
|
||||
c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.Heartbeat(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -820,26 +820,52 @@ func (h *RegistryHandler) Heartbeat(c *gin.Context) {
|
||||
// heartbeat carries it. Only writes when NULL — never overwrites a
|
||||
// reconciled or updated card. This is the recovery path for fast-cloud
|
||||
// workspaces whose DNS wasn't ready at first register.
|
||||
//
|
||||
// #2659/#2665: also clear last_register_failure_at on this recovery,
|
||||
// otherwise evaluateStatus keeps the workspace stuck in 'degraded'
|
||||
// forever (degraded→online recovery requires no recent register failure).
|
||||
if len(payload.AgentCard) > 0 {
|
||||
res, err := db.DB.ExecContext(ctx, `
|
||||
UPDATE workspaces
|
||||
SET agent_card = $2,
|
||||
last_register_failure_at = NULL
|
||||
SET agent_card = $2
|
||||
WHERE id = $1 AND agent_card IS NULL
|
||||
`, payload.WorkspaceID, payload.AgentCard)
|
||||
if err != nil {
|
||||
log.Printf("Registry heartbeat: agent_card backfill failed for %s: %v", payload.WorkspaceID, err)
|
||||
} else {
|
||||
if rows, _ := res.RowsAffected(); rows > 0 {
|
||||
log.Printf("Registry heartbeat: backfilled agent_card and cleared register-failure marker for %s (initial register had failed)", payload.WorkspaceID)
|
||||
log.Printf("Registry heartbeat: backfilled agent_card for %s (initial register had failed)", payload.WorkspaceID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// #2659/#2665/#2739: clear last_register_failure_at whenever a heartbeat
|
||||
// carries a valid agent_card — NOT only on the agent_card-was-NULL backfill
|
||||
// path above. A heartbeat with a card proves the runtime is alive and
|
||||
// re-advertising the SAME reachable card the platform already trusts, so any
|
||||
// recent register-400 was transient and must not keep the workspace pinned
|
||||
// in 'degraded' until the 5-minute failure window ages out.
|
||||
//
|
||||
// #2739: on RESTART (vs first provision) the agent_card row is ALREADY
|
||||
// populated, so the NULL-scoped backfill never fires and never cleared the
|
||||
// marker. The restarted container's authenticated /registry/register can
|
||||
// 400 with url_validate_failed (its Docker-internal hostname e.g.
|
||||
// "212851b5693d" is not resolvable from the platform), stamping
|
||||
// last_register_failure_at; with the card already present the marker stuck
|
||||
// and evaluateStatus held the workspace degraded past the Local Provision
|
||||
// Lifecycle restart-survival window (run 358593). Credentials are correctly
|
||||
// projected post-restart (core#2709/#2712); this is purely the degraded->online
|
||||
// recovery gap. Clearing on a card-bearing heartbeat is the same trust signal
|
||||
// the success-on-register clear (register handler) relies on.
|
||||
if len(payload.AgentCard) > 0 {
|
||||
res, err := db.DB.ExecContext(ctx, `
|
||||
UPDATE workspaces
|
||||
SET last_register_failure_at = NULL
|
||||
WHERE id = $1 AND last_register_failure_at IS NOT NULL
|
||||
`, payload.WorkspaceID)
|
||||
if err != nil {
|
||||
log.Printf("Registry heartbeat: clear register-failure marker failed for %s: %v", payload.WorkspaceID, err)
|
||||
} else if rows, _ := res.RowsAffected(); rows > 0 {
|
||||
log.Printf("Registry heartbeat: cleared register-failure marker for %s (live card-bearing heartbeat after a transient register-400)", payload.WorkspaceID)
|
||||
}
|
||||
}
|
||||
|
||||
// Refresh Redis TTL
|
||||
if err := db.RefreshTTL(ctx, payload.WorkspaceID); err != nil {
|
||||
log.Printf("Heartbeat redis error: %v", err)
|
||||
|
||||
Reference in New Issue
Block a user