fix(platform-agent): fail-closed MCP-server gate for concierge online-marking (RCA #2970) #2989
@@ -455,15 +455,15 @@ func TestHeartbeat_ExactThreshold_Degraded(t *testing.T) {
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-edge").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
WithArgs("ws-edge", 0.5, "edge case", 0, 500, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// error_rate == 0.5 should trigger degraded (>= 0.5)
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-edge").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
WithArgs(models.StatusDegraded, "ws-edge").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
@@ -496,15 +496,15 @@ func TestHeartbeat_DegradedRecovery(t *testing.T) {
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-rec").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
WithArgs("ws-rec", 0.05, "", 1, 2000, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// Currently degraded, error_rate < 0.1 → should recover to online
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-rec").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil))
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
WithArgs(models.StatusOnline, "ws-rec").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
@@ -538,15 +538,15 @@ func TestHeartbeat_ErrorRateDegrade_Guarded(t *testing.T) {
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-degrade-guard").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
WithArgs("ws-degrade-guard", 0.6, "", 1, 100, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// Stale read: heartbeat started before CascadeDelete set status='removed'
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-degrade-guard").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
// Guarded UPDATE returns 0 rows because row is actually 'removed'
|
||||
mock.ExpectExec("UPDATE workspaces SET status =.*AND status = 'online'").
|
||||
@@ -584,15 +584,15 @@ func TestHeartbeat_DegradedRecovery_Guarded(t *testing.T) {
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-recover-guard").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
WithArgs("ws-recover-guard", 0.05, "", 1, 100, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// Stale read: heartbeat started before CascadeDelete set status='removed'
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-recover-guard").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil))
|
||||
|
||||
// Guarded UPDATE returns 0 rows because row is actually 'removed'
|
||||
mock.ExpectExec("UPDATE workspaces SET status =.*AND status = 'degraded'").
|
||||
@@ -1479,7 +1479,7 @@ func TestHeartbeatHandler_RegisterFailureClearedOnCardBearingRestart(t *testing.
|
||||
// prevTask SELECT
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-2739").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
|
||||
|
||||
// heartbeat UPDATE
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
@@ -1499,9 +1499,9 @@ func TestHeartbeatHandler_RegisterFailureClearedOnCardBearingRestart(t *testing.
|
||||
|
||||
// evaluateStatus SELECT — workspace is degraded; failure marker is now NULL
|
||||
// (the clear above wiped it in the real DB), so recovery is unblocked.
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-2739").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil))
|
||||
|
||||
// degraded -> online recovery
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
|
||||
@@ -244,7 +244,7 @@ func TestHeartbeatHandler_Normal(t *testing.T) {
|
||||
// Expect prevTask SELECT (before UPDATE)
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-123").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
|
||||
|
||||
// Expect heartbeat UPDATE
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
@@ -252,9 +252,9 @@ func TestHeartbeatHandler_Normal(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// Expect evaluateStatus SELECT
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-123").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -283,7 +283,7 @@ func TestHeartbeatHandler_Degraded(t *testing.T) {
|
||||
// Expect prevTask SELECT (before UPDATE)
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-123").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
|
||||
|
||||
// Expect heartbeat UPDATE
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
@@ -291,9 +291,9 @@ func TestHeartbeatHandler_Degraded(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// Expect evaluateStatus SELECT — currently online
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-123").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
// Expect status transition to degraded
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
@@ -331,7 +331,7 @@ func TestHeartbeatHandler_Recovery(t *testing.T) {
|
||||
// Expect prevTask SELECT (before UPDATE)
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-123").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
|
||||
|
||||
// Expect heartbeat UPDATE
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
@@ -339,9 +339,9 @@ func TestHeartbeatHandler_Recovery(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// Expect evaluateStatus SELECT — currently degraded
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-123").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil))
|
||||
|
||||
// Expect status transition back to online
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
@@ -727,7 +727,7 @@ func TestHeartbeatHandler_TaskChanged(t *testing.T) {
|
||||
// Expect prevTask SELECT — currently "old task"
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-123").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("old task"))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("old task", 0))
|
||||
|
||||
// Expect heartbeat UPDATE with new task
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
@@ -735,9 +735,9 @@ func TestHeartbeatHandler_TaskChanged(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// Expect evaluateStatus SELECT
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-123").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -913,7 +913,7 @@ func TestHeartbeatHandler_TaskUnchanged(t *testing.T) {
|
||||
// Expect prevTask SELECT — task is already "doing work"
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-123").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("doing work"))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("doing work", 0))
|
||||
|
||||
// Expect heartbeat UPDATE with same task
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
@@ -921,9 +921,9 @@ func TestHeartbeatHandler_TaskUnchanged(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// Expect evaluateStatus SELECT
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-123").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
// NO TASK_UPDATED broadcast expected — task didn't change
|
||||
|
||||
@@ -956,7 +956,7 @@ func TestHeartbeatHandler_TaskCleared(t *testing.T) {
|
||||
// Expect prevTask SELECT — was doing something
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-123").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("old task"))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("old task", 0))
|
||||
|
||||
// Expect heartbeat UPDATE with empty task
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
@@ -964,9 +964,9 @@ func TestHeartbeatHandler_TaskCleared(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// Expect evaluateStatus SELECT
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-123").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
// TASK_UPDATED broadcast expected — changed from "old task" to ""
|
||||
// (BroadcastOnly doesn't hit sqlmock, so no expectation needed)
|
||||
@@ -1019,13 +1019,13 @@ func TestHeartbeatHandler_AlwaysBroadcastsHeartbeat(t *testing.T) {
|
||||
// Pre-fix this path emitted ZERO broadcasts.
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-123").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("doing work"))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("doing work", 0))
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
WithArgs("ws-123", 0.0, "", 1, 500, "doing work").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-123").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
@@ -36,7 +36,7 @@ func TestHeartbeat_NativeStatusMgmt_SkipsDegradeInference(t *testing.T) {
|
||||
// prevTask SELECT (before UPDATE)
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-native-status").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
|
||||
|
||||
// heartbeat UPDATE — same as the non-native path
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
@@ -48,9 +48,9 @@ func TestHeartbeat_NativeStatusMgmt_SkipsDegradeInference(t *testing.T) {
|
||||
// MUST NOT. We deliberately don't ExpectExec the degrade UPDATE
|
||||
// — sqlmock fails the test if any UPDATE happens that wasn't
|
||||
// expected, which is the regression cover.
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-native-status").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -89,7 +89,7 @@ func TestHeartbeat_NativeStatusMgmt_SkipsRecovery(t *testing.T) {
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-native-recovery").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
|
||||
|
||||
// heartbeat UPDATE — error_rate=0.05 would fire recovery
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
@@ -99,9 +99,9 @@ func TestHeartbeat_NativeStatusMgmt_SkipsRecovery(t *testing.T) {
|
||||
// evaluateStatus SELECT — currently degraded; recovery branch
|
||||
// would normally fire UPDATE → online + WORKSPACE_ONLINE broadcast.
|
||||
// Under native_status_mgmt, neither should run.
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-native-recovery").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -135,7 +135,7 @@ func TestHeartbeat_NativeStatusMgmt_WedgedStillRespected(t *testing.T) {
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs("ws-wedged").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
|
||||
|
||||
// heartbeat UPDATE — RuntimeState="wedged" means sample_error
|
||||
// reflects the wedge reason, error_rate stays 0
|
||||
@@ -144,9 +144,9 @@ func TestHeartbeat_NativeStatusMgmt_WedgedStillRespected(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// evaluateStatus SELECT — currently online, wedged branch SHOULD fire
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-wedged").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
// Wedged degrade UPDATE — must still happen even with native_status_mgmt
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
|
||||
@@ -393,12 +393,24 @@ func (h *RegistryHandler) platformAgentHasModelSecret(ctx context.Context, works
|
||||
return exists, err
|
||||
}
|
||||
|
||||
// platformAgentMCPServerPresent reports whether the runtime declared the
|
||||
// platform-agent image's /opt/molecule-mcp-server binary present. The payload
|
||||
// field is a pointer so an absent declaration (nil) is treated as false —
|
||||
// fail-closed: an old/generic runtime cannot prove it has the concierge MCP.
|
||||
func (h *RegistryHandler) platformAgentMCPServerPresent(present *bool) bool {
|
||||
return present != nil && *present
|
||||
}
|
||||
|
||||
// markWorkspaceFailed updates a workspace row to status='failed' and broadcasts
|
||||
// WORKSPACE_PROVISION_FAILED. It is a RegistryHandler-local fallback for the
|
||||
// fail-closed platform-agent identity gate; the WorkspaceHandler's
|
||||
// markProvisionFailed is the primary path during provisioning.
|
||||
func (h *RegistryHandler) markWorkspaceFailed(ctx context.Context, workspaceID, msg string) {
|
||||
extra := map[string]interface{}{"error": msg, "code": "PLATFORM_AGENT_IDENTITY_GATE"}
|
||||
func (h *RegistryHandler) markWorkspaceFailed(ctx context.Context, workspaceID, msg, reason string) {
|
||||
extra := map[string]interface{}{
|
||||
"error": msg,
|
||||
"code": "PLATFORM_AGENT_IDENTITY_GATE",
|
||||
"reason": reason,
|
||||
}
|
||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceProvisionFailed), workspaceID, extra)
|
||||
if _, dbErr := db.DB.ExecContext(ctx,
|
||||
`UPDATE workspaces SET status = $3, last_sample_error = $2, updated_at = now() WHERE id = $1`,
|
||||
@@ -539,27 +551,45 @@ func (h *RegistryHandler) Register(c *gin.Context) {
|
||||
}
|
||||
|
||||
// Issue #2970: fail CLOSED if a platform agent reaches registration without
|
||||
// the seeded MODEL workspace_secret. The MISSING_MODEL gate in
|
||||
// BOTH the seeded MODEL workspace_secret AND the platform-agent image's baked
|
||||
// /opt/molecule-mcp-server binary. The MISSING_MODEL gate in
|
||||
// prepareProvisionContext is the primary defense, but if a model-less/identity-
|
||||
// less concierge somehow boots on a path that bypasses that gate (e.g. an old
|
||||
// or generic image), this second-layer guard prevents it from ever marking
|
||||
// less/mcp-less concierge somehow boots on a path that bypasses that gate (e.g.
|
||||
// an old or generic image), this second-layer guard prevents it from ever marking
|
||||
// itself online-routable. Instead we mark the workspace failed so the canvas
|
||||
// surfaces a provision failure rather than serving users a generic Claude Code.
|
||||
//
|
||||
// The runtime declares mcp-server availability via payload.mcp_server_present.
|
||||
// A nil/false value is fail-closed: an undeclared or missing MCP server cannot be
|
||||
// trusted for a concierge.
|
||||
//
|
||||
// existingState.ExistingKind is populated by fetchExistingWorkspaceStateForDiagnostics
|
||||
// (best-effort). We treat "platform" literally; any other value (including "(new)"
|
||||
// or "(unavailable)") means the gate does not apply unless payload.Kind itself is
|
||||
// "platform" (covered by the privilege-escalation precheck above).
|
||||
if payload.Kind == models.KindPlatform || existingState.ExistingKind == models.KindPlatform {
|
||||
if hasModel, mErr := h.platformAgentHasModelSecret(ctx, payload.ID); mErr != nil {
|
||||
hasModel, mErr := h.platformAgentHasModelSecret(ctx, payload.ID)
|
||||
if mErr != nil {
|
||||
log.Printf("Registry register: model secret lookup failed for %s: %v", payload.ID, mErr)
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "registration failed"})
|
||||
return
|
||||
} else if !hasModel {
|
||||
msg := "platform agent registered without a seeded MODEL secret; refusing online"
|
||||
}
|
||||
hasMCP := h.platformAgentMCPServerPresent(payload.MCPServerPresent)
|
||||
if !hasModel || !hasMCP {
|
||||
var msg, reason, logCode string
|
||||
switch {
|
||||
case !hasModel:
|
||||
msg = "platform agent registered without a seeded MODEL secret; refusing online"
|
||||
reason = "model_missing"
|
||||
logCode = "platform_agent_model_missing"
|
||||
case !hasMCP:
|
||||
msg = "platform agent registered without /opt/molecule-mcp-server; refusing online"
|
||||
reason = "mcp_server_missing"
|
||||
logCode = "platform_agent_mcp_server_missing"
|
||||
}
|
||||
log.Printf("Registry register: %s (workspace=%s)", msg, payload.ID)
|
||||
h.markWorkspaceFailed(ctx, payload.ID, msg)
|
||||
logRegister400Reason("platform_agent_model_missing", payload.ID, payload, existingState, msg)
|
||||
h.markWorkspaceFailed(ctx, payload.ID, msg, reason)
|
||||
logRegister400Reason(logCode, payload.ID, payload, existingState, msg)
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "platform agent identity incomplete"})
|
||||
return
|
||||
}
|
||||
@@ -1193,14 +1223,50 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
|
||||
ctx := c.Request.Context()
|
||||
|
||||
var currentStatus string
|
||||
var currentKind string
|
||||
var lastRegisterFailure sql.NullTime
|
||||
err := db.DB.QueryRowContext(ctx, `SELECT status, last_register_failure_at FROM workspaces WHERE id = $1`, payload.WorkspaceID).
|
||||
Scan(¤tStatus, &lastRegisterFailure)
|
||||
err := db.DB.QueryRowContext(ctx, `SELECT status, kind, last_register_failure_at FROM workspaces WHERE id = $1`, payload.WorkspaceID).
|
||||
Scan(¤tStatus, ¤tKind, &lastRegisterFailure)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
hasRecentRegisterFailure := lastRegisterFailure.Valid && time.Since(lastRegisterFailure.Time) < 5*time.Minute
|
||||
|
||||
// FAIL-CLOSED concierge online-marking gate (RCA #2970).
|
||||
// A kind='platform' workspace that has lost either its seeded MODEL secret or
|
||||
// the image-baked /opt/molecule-mcp-server binary must never be allowed back
|
||||
// to status='online' via heartbeat recovery. The Register handler already gates
|
||||
// the initial online marking; this gate closes the heartbeat-driven recovery
|
||||
// paths (provisioning/failed/offline/awaiting_agent/degraded → online) that
|
||||
// would otherwise resurrect a model-less/mcp-less concierge and let it serve
|
||||
// users generic Claude Code.
|
||||
//
|
||||
// The runtime now declares mcp-server availability via
|
||||
// payload.mcp_server_present on every heartbeat/register call. nil/false is
|
||||
// fail-closed: an old/generic runtime cannot prove it is a real concierge.
|
||||
if currentKind == models.KindPlatform {
|
||||
hasModel, mErr := h.platformAgentHasModelSecret(ctx, payload.WorkspaceID)
|
||||
if mErr != nil {
|
||||
log.Printf("Heartbeat: model secret lookup failed for platform agent %s: %v", payload.WorkspaceID, mErr)
|
||||
return
|
||||
}
|
||||
hasMCP := h.platformAgentMCPServerPresent(payload.MCPServerPresent)
|
||||
if !hasModel || !hasMCP {
|
||||
var msg, reason string
|
||||
switch {
|
||||
case !hasModel:
|
||||
msg = "platform agent heartbeat denied: no seeded MODEL workspace_secret; refusing to mark online (RCA #2970 FAIL-CLOSED)"
|
||||
reason = "model_missing"
|
||||
case !hasMCP:
|
||||
msg = "platform agent heartbeat denied: /opt/molecule-mcp-server missing; refusing to mark online (RCA #2970 FAIL-CLOSED)"
|
||||
reason = "mcp_server_missing"
|
||||
}
|
||||
log.Printf("Heartbeat: %s (workspace=%s)", msg, payload.WorkspaceID)
|
||||
h.markWorkspaceFailed(ctx, payload.WorkspaceID, msg, reason)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Self-reported runtime wedge: takes precedence over the error_rate
|
||||
// path. The heartbeat task lives in its own asyncio task and keeps
|
||||
// firing 200s even after claude_agent_sdk locks up on
|
||||
|
||||
@@ -291,9 +291,9 @@ func TestHeartbeatHandler_OfflineToOnline(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// Expect evaluateStatus SELECT — currently offline
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-offline").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("offline", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("offline", "", nil))
|
||||
|
||||
// Expect status transition back to online
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
@@ -362,9 +362,9 @@ func TestHeartbeatHandler_ProvisioningToOnline(t *testing.T) {
|
||||
|
||||
// evaluateStatus SELECT — reads the post-CASE status ('online'), so its own
|
||||
// provisioning→online branch does NOT fire (no duplicate transition exec).
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-provisioning").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -414,9 +414,9 @@ func TestHeartbeatHandler_FailedToOnline(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// evaluateStatus SELECT — currently failed (provision-timeout sweeper flip)
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-failed").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("failed", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("failed", "", nil))
|
||||
|
||||
// the new failed → online recovery transition
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
@@ -463,9 +463,9 @@ func TestHeartbeatHandler_AwaitingAgentToOnline(t *testing.T) {
|
||||
WithArgs("ws-external", 0.0, "", 0, 60, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-external").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("awaiting_agent", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("awaiting_agent", "", nil))
|
||||
|
||||
// The new branch — UPDATE ... WHERE status = 'awaiting_agent'
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
@@ -586,9 +586,9 @@ func TestHeartbeatHandler_OnlineStaysOnline(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// evaluateStatus: online with error_rate 0.2 — below 0.5 threshold, stays online
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-stable").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -635,9 +635,9 @@ func TestHeartbeatHandler_RuntimeWedged_FlipsOnlineToDegraded(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// evaluateStatus: currentStatus = online
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-wedged").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
// The wedge-handling branch fires the degraded UPDATE with the
|
||||
// `AND status = 'online'` guard (race-safe against concurrent
|
||||
@@ -688,9 +688,9 @@ func TestHeartbeatHandler_DegradedRecoversOnlyAfterWedgeClears(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// currentStatus = degraded
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-still-wedged").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil))
|
||||
|
||||
// No additional UPDATE expected — the recovery branch's
|
||||
// `runtime_state == ""` guard blocks the flip back to online.
|
||||
@@ -733,9 +733,9 @@ func TestHeartbeatHandler_DegradedToOnline_AfterWedgeClears(t *testing.T) {
|
||||
WithArgs("ws-recovered", 0.0, "", 0, 30, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-recovered").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil))
|
||||
|
||||
// Recovery UPDATE fires (degraded → online).
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
@@ -978,7 +978,7 @@ func TestHeartbeat_SkipsRemovedRows(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 0))
|
||||
|
||||
// evaluateStatus SELECT
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id").
|
||||
WithArgs("ws-zombie").
|
||||
WillReturnError(sql.ErrNoRows) // row effectively removed from view
|
||||
|
||||
@@ -1019,9 +1019,9 @@ func TestHeartbeatHandler_BackfillsAgentCard_WhenNull(t *testing.T) {
|
||||
WithArgs("ws-nocard", sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-nocard").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow(models.StatusOnline, nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow(models.StatusOnline, "", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -1059,9 +1059,9 @@ func TestHeartbeatHandler_SkipsAgentCardBackfill_WhenAlreadySet(t *testing.T) {
|
||||
WithArgs("ws-hascard", sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 0))
|
||||
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-hascard").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow(models.StatusOnline, nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow(models.StatusOnline, "", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -1107,9 +1107,9 @@ func TestHeartbeatHandler_BackfillAgentCard_ClearsRegisterFailure(t *testing.T)
|
||||
|
||||
// Status check sees degraded, but last_register_failure_at is now NULL because
|
||||
// the agent_card backfill UPDATE cleared it.
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-degraded-register-fail").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow(models.StatusDegraded, nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow(models.StatusDegraded, "", nil))
|
||||
|
||||
// Because the failure marker was cleared by the backfill, evaluateStatus
|
||||
// should now recover the workspace to online.
|
||||
@@ -1754,9 +1754,9 @@ func TestHeartbeat_MonthlySpend_WithinBounds(t *testing.T) {
|
||||
WithArgs("ws-spend-ok", 0.0, "", 0, 0, "", int64(15000)). // $150.00
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id").
|
||||
WithArgs("ws-spend-ok").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -1790,9 +1790,9 @@ func TestHeartbeat_MonthlySpend_NegativeClamped(t *testing.T) {
|
||||
WithArgs("ws-spend-neg", 0.0, "", 0, 0, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id").
|
||||
WithArgs("ws-spend-neg").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -1826,9 +1826,9 @@ func TestHeartbeat_MonthlySpend_OverflowClamped(t *testing.T) {
|
||||
WithArgs("ws-spend-overflow", 0.0, "", 0, 0, "", int64(1_000_000_000_000)).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id").
|
||||
WithArgs("ws-spend-overflow").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -1861,9 +1861,9 @@ func TestHeartbeat_MonthlySpend_ExactCap(t *testing.T) {
|
||||
WithArgs("ws-spend-cap", 0.0, "", 0, 0, "", int64(1_000_000_000_000)).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id").
|
||||
WithArgs("ws-spend-cap").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -1897,9 +1897,9 @@ func TestHeartbeat_MonthlySpend_Zero_NoUpdate(t *testing.T) {
|
||||
WithArgs("ws-spend-zero", 0.0, "", 0, 0, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id").
|
||||
WithArgs("ws-spend-zero").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -2143,7 +2143,7 @@ func TestRegister_AllowsAlreadyPlatformReRegister(t *testing.T) {
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Request = httptest.NewRequest("POST", "/registry/register",
|
||||
bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","agent_card":{"name":"concierge"}}`))
|
||||
bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","mcp_server_present":true,"agent_card":{"name":"concierge"}}`))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.Register(c)
|
||||
@@ -2270,7 +2270,7 @@ func TestRegister_PlatformAgentMissingModelSecret_FailsClosed(t *testing.T) {
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Request = httptest.NewRequest("POST", "/registry/register",
|
||||
bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","agent_card":{"name":"concierge"}}`))
|
||||
bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","mcp_server_present":true,"agent_card":{"name":"concierge"}}`))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.Register(c)
|
||||
@@ -2559,9 +2559,9 @@ func TestHeartbeatHandler_DeliversPlatformInboundSecret(t *testing.T) {
|
||||
WithArgs("ws-with-secret", 0.0, "", 0, 100, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-with-secret").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
// readOrLazyHealInboundSecret — short-circuit: secret already on file.
|
||||
mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`).
|
||||
@@ -2614,9 +2614,9 @@ func TestHeartbeatHandler_LazyHealsPlatformInboundSecret(t *testing.T) {
|
||||
WithArgs("ws-needs-heal", 0.0, "", 0, 100, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-needs-heal").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
// readOrLazyHealInboundSecret — NULL column triggers mint.
|
||||
mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`).
|
||||
@@ -2670,9 +2670,9 @@ func TestHeartbeatHandler_OmitsSecretOnHealFailure(t *testing.T) {
|
||||
WithArgs("ws-heal-fails", 0.0, "", 0, 100, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-heal-fails").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
|
||||
|
||||
// Read returns NULL → mint is attempted...
|
||||
mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`).
|
||||
@@ -3037,10 +3037,10 @@ func TestHeartbeat_RecentRegisterFailure_DegradesWorkspace(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// evaluateStatus SELECT — online with recent register failure
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-degrade-reg").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).
|
||||
AddRow("online", time.Now().Add(-2*time.Minute)))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).
|
||||
AddRow("online", "", time.Now().Add(-2*time.Minute)))
|
||||
|
||||
// Degrade UPDATE
|
||||
mock.ExpectExec("UPDATE workspaces SET status =").
|
||||
@@ -3087,10 +3087,10 @@ func TestHeartbeat_RecentRegisterFailure_BlocksRecovery(t *testing.T) {
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// evaluateStatus SELECT — degraded with recent register failure
|
||||
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs("ws-no-recover").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).
|
||||
AddRow("degraded", time.Now().Add(-2*time.Minute)))
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).
|
||||
AddRow("degraded", "", time.Now().Add(-2*time.Minute)))
|
||||
|
||||
// NO recovery UPDATE expected — register failure blocks recovery.
|
||||
|
||||
@@ -3421,3 +3421,112 @@ func TestRegister_400_LogsExistingRowState(t *testing.T) {
|
||||
// + resolveDeliveryMode (all run BEFORE the isSafeURL check), which
|
||||
// is brittle. The test coverage gap is documented; the test
|
||||
// would be a redundant copy of the UpdateCard coverage.
|
||||
|
||||
// TestRegister_PlatformAgentMissingMCPServer_FailsClosed guards the second half
|
||||
// of issue #2970: a platform agent whose runtime reports mcp_server_present=false
|
||||
// (or omits the field) must NOT be marked online, even when the MODEL secret is
|
||||
// present. Fail-closed on the MCP server prevents a generic-image concierge from
|
||||
// booting as a routable platform agent.
|
||||
func TestRegister_PlatformAgentMissingMCPServer_FailsClosed(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewRegistryHandler(broadcaster)
|
||||
|
||||
const wsID = "ws-platform-no-mcp"
|
||||
|
||||
// Bootstrap path — no live tokens.
|
||||
mock.ExpectQuery("SELECT COUNT\\(\\*\\) FROM workspace_auth_tokens").
|
||||
WithArgs(wsID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
|
||||
|
||||
// kind precheck: existing row is kind="platform".
|
||||
mock.ExpectQuery("SELECT kind FROM workspaces WHERE id").
|
||||
WithArgs(wsID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
|
||||
|
||||
// MODEL secret exists, but the runtime declares the MCP server absent.
|
||||
mock.ExpectQuery("SELECT EXISTS\\(SELECT 1 FROM workspace_secrets WHERE workspace_id = \\$1 AND key = 'MODEL'\\)").
|
||||
WithArgs(wsID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
|
||||
|
||||
// Gate failure broadcasts WORKSPACE_PROVISION_FAILED and marks the row failed.
|
||||
mock.ExpectExec("INSERT INTO structure_events").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("UPDATE workspaces SET status = \\$3, last_sample_error = \\$2, updated_at = now\\(\\) WHERE id = \\$1").
|
||||
WithArgs(wsID, sqlmock.AnyArg(), models.StatusFailed).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Request = httptest.NewRequest("POST", "/registry/register",
|
||||
bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","mcp_server_present":false,"agent_card":{"name":"concierge"}}`))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.Register(c)
|
||||
|
||||
if w.Code != http.StatusBadRequest {
|
||||
t.Fatalf("platform agent missing MCP server: expected 400, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestHeartbeat_PlatformAgentMissingMCPServer_FailsClosed guards the heartbeat
|
||||
// recovery side of issue #2970: a kind='platform' workspace whose runtime reports
|
||||
// mcp_server_present=false must NOT recover to 'online' on heartbeat. Instead it
|
||||
// is marked 'failed' so the canvas surfaces a provision failure.
|
||||
func TestHeartbeat_PlatformAgentMissingMCPServer_FailsClosed(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewRegistryHandler(broadcaster)
|
||||
|
||||
const wsID = "ws-platform-heartbeat-no-mcp"
|
||||
|
||||
// prevTask SELECT — use loose regex to match 3-col query on main
|
||||
mock.ExpectQuery("SELECT COALESCE\\(current_task").
|
||||
WithArgs(wsID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
|
||||
|
||||
// Heartbeat UPDATE (MonthlySpend=0 branch)
|
||||
mock.ExpectExec("UPDATE workspaces SET").
|
||||
WithArgs(wsID, 0.0, "", 0, 60, "").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
// evaluateStatus SELECT — platform agent currently in provisioning
|
||||
// (the recovery path most likely to resurrect a broken concierge).
|
||||
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
|
||||
WithArgs(wsID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).
|
||||
AddRow("provisioning", "platform", nil))
|
||||
|
||||
// MODEL secret exists, but MCP server is absent.
|
||||
mock.ExpectQuery("SELECT EXISTS\\(SELECT 1 FROM workspace_secrets WHERE workspace_id = \\$1 AND key = 'MODEL'\\)").
|
||||
WithArgs(wsID).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true))
|
||||
|
||||
// Gate failure: broadcast WORKSPACE_PROVISION_FAILED + mark failed.
|
||||
mock.ExpectExec("INSERT INTO structure_events").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("UPDATE workspaces SET status = \\$3, last_sample_error = \\$2, updated_at = now\\(\\) WHERE id = \\$1").
|
||||
WithArgs(wsID, sqlmock.AnyArg(), models.StatusFailed).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
body := `{"workspace_id":"` + wsID + `","error_rate":0.0,"sample_error":"","active_tasks":0,"uptime_seconds":60,"mcp_server_present":false}`
|
||||
c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.Heartbeat(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("heartbeat mcp gate: expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -53,7 +53,8 @@ const runtimeRegisterBody = `{
|
||||
"skills": [{"id": "coding", "name": "coding", "description": "coding", "tags": []}],
|
||||
"capabilities": {"streaming": true, "pushNotifications": false},
|
||||
"configuration_status": "ready"
|
||||
}
|
||||
},
|
||||
"mcp_server_present": false
|
||||
}`
|
||||
|
||||
func TestRegisterPayload_RuntimeBodyBinds(t *testing.T) {
|
||||
@@ -70,6 +71,9 @@ func TestRegisterPayload_RuntimeBodyBinds(t *testing.T) {
|
||||
if p.URL == "" {
|
||||
t.Error("url should round-trip from the runtime body")
|
||||
}
|
||||
if p.MCPServerPresent == nil {
|
||||
t.Error("mcp_server_present must decode (nil would be fail-closed)")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRegisterPayload_MissingID_Rejected(t *testing.T) {
|
||||
@@ -98,7 +102,8 @@ const runtimeHeartbeatBody = `{
|
||||
"sample_error": "",
|
||||
"active_tasks": 0,
|
||||
"current_task": "",
|
||||
"uptime_seconds": 42
|
||||
"uptime_seconds": 42,
|
||||
"mcp_server_present": false
|
||||
}`
|
||||
|
||||
func TestHeartbeatPayload_RuntimeBodyBinds(t *testing.T) {
|
||||
@@ -112,6 +117,9 @@ func TestHeartbeatPayload_RuntimeBodyBinds(t *testing.T) {
|
||||
if p.UptimeSeconds != 42 {
|
||||
t.Errorf("uptime_seconds not decoded: %d", p.UptimeSeconds)
|
||||
}
|
||||
if p.MCPServerPresent == nil {
|
||||
t.Error("mcp_server_present must decode (nil would be fail-closed)")
|
||||
}
|
||||
}
|
||||
|
||||
// The wedged-runtime heartbeat (heartbeat.py _runtime_state_payload +
|
||||
|
||||
@@ -102,6 +102,11 @@ type RegisterPayload struct {
|
||||
// the row to be its own org root (parent_id IS NULL) and to be the only
|
||||
// platform agent in the org — enforced by the Register handler.
|
||||
Kind string `json:"kind,omitempty"`
|
||||
// MCPServerPresent is the runtime's declaration that the platform-agent
|
||||
// image's baked /opt/molecule-mcp-server binary is present. For platform
|
||||
// agents the controlplane treats nil/false as fail-closed (RCA #2970).
|
||||
// Non-platform workspaces may omit this field.
|
||||
MCPServerPresent *bool `json:"mcp_server_present,omitempty"`
|
||||
}
|
||||
|
||||
type HeartbeatPayload struct {
|
||||
@@ -149,6 +154,10 @@ type HeartbeatPayload struct {
|
||||
// The heartbeat handler backfills NULL agent_card rows so the workspace
|
||||
// can come online without requiring a full re-register. (#2421)
|
||||
AgentCard json.RawMessage `json:"agent_card,omitempty"`
|
||||
// MCPServerPresent mirrors the register payload field on every heartbeat
|
||||
// so the fail-closed platform-agent gate can block recovery paths that
|
||||
// would otherwise resurrect an mcp-less concierge (RCA #2970).
|
||||
MCPServerPresent *bool `json:"mcp_server_present,omitempty"`
|
||||
}
|
||||
|
||||
// RuntimeMetadata is the adapter-declared capability + override block
|
||||
|
||||
Reference in New Issue
Block a user