fix(platform-agent): fail-closed MCP-server gate for concierge online-marking (RCA #2970) #2989

Merged
core-devops merged 2 commits from fix/2970-concierge-online-marking-gate into main 2026-06-18 03:46:41 +00:00
7 changed files with 298 additions and 106 deletions
@@ -455,15 +455,15 @@ func TestHeartbeat_ExactThreshold_Degraded(t *testing.T) {
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-edge").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
mock.ExpectExec("UPDATE workspaces SET").
WithArgs("ws-edge", 0.5, "edge case", 0, 500, "").
WillReturnResult(sqlmock.NewResult(0, 1))
// error_rate == 0.5 should trigger degraded (>= 0.5)
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-edge").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
mock.ExpectExec("UPDATE workspaces SET status =").
WithArgs(models.StatusDegraded, "ws-edge").
WillReturnResult(sqlmock.NewResult(0, 1))
@@ -496,15 +496,15 @@ func TestHeartbeat_DegradedRecovery(t *testing.T) {
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-rec").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
mock.ExpectExec("UPDATE workspaces SET").
WithArgs("ws-rec", 0.05, "", 1, 2000, "").
WillReturnResult(sqlmock.NewResult(0, 1))
// Currently degraded, error_rate < 0.1 → should recover to online
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-rec").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil))
mock.ExpectExec("UPDATE workspaces SET status =").
WithArgs(models.StatusOnline, "ws-rec").
WillReturnResult(sqlmock.NewResult(0, 1))
@@ -538,15 +538,15 @@ func TestHeartbeat_ErrorRateDegrade_Guarded(t *testing.T) {
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-degrade-guard").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
mock.ExpectExec("UPDATE workspaces SET").
WithArgs("ws-degrade-guard", 0.6, "", 1, 100, "").
WillReturnResult(sqlmock.NewResult(0, 1))
// Stale read: heartbeat started before CascadeDelete set status='removed'
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-degrade-guard").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
// Guarded UPDATE returns 0 rows because row is actually 'removed'
mock.ExpectExec("UPDATE workspaces SET status =.*AND status = 'online'").
@@ -584,15 +584,15 @@ func TestHeartbeat_DegradedRecovery_Guarded(t *testing.T) {
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-recover-guard").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
mock.ExpectExec("UPDATE workspaces SET").
WithArgs("ws-recover-guard", 0.05, "", 1, 100, "").
WillReturnResult(sqlmock.NewResult(0, 1))
// Stale read: heartbeat started before CascadeDelete set status='removed'
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-recover-guard").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil))
// Guarded UPDATE returns 0 rows because row is actually 'removed'
mock.ExpectExec("UPDATE workspaces SET status =.*AND status = 'degraded'").
@@ -1479,7 +1479,7 @@ func TestHeartbeatHandler_RegisterFailureClearedOnCardBearingRestart(t *testing.
// prevTask SELECT
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-2739").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
// heartbeat UPDATE
mock.ExpectExec("UPDATE workspaces SET").
@@ -1499,9 +1499,9 @@ func TestHeartbeatHandler_RegisterFailureClearedOnCardBearingRestart(t *testing.
// evaluateStatus SELECT — workspace is degraded; failure marker is now NULL
// (the clear above wiped it in the real DB), so recovery is unblocked.
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-2739").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil))
// degraded -> online recovery
mock.ExpectExec("UPDATE workspaces SET status =").
@@ -244,7 +244,7 @@ func TestHeartbeatHandler_Normal(t *testing.T) {
// Expect prevTask SELECT (before UPDATE)
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
// Expect heartbeat UPDATE
mock.ExpectExec("UPDATE workspaces SET").
@@ -252,9 +252,9 @@ func TestHeartbeatHandler_Normal(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
// Expect evaluateStatus SELECT
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -283,7 +283,7 @@ func TestHeartbeatHandler_Degraded(t *testing.T) {
// Expect prevTask SELECT (before UPDATE)
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
// Expect heartbeat UPDATE
mock.ExpectExec("UPDATE workspaces SET").
@@ -291,9 +291,9 @@ func TestHeartbeatHandler_Degraded(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
// Expect evaluateStatus SELECT — currently online
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
// Expect status transition to degraded
mock.ExpectExec("UPDATE workspaces SET status =").
@@ -331,7 +331,7 @@ func TestHeartbeatHandler_Recovery(t *testing.T) {
// Expect prevTask SELECT (before UPDATE)
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
// Expect heartbeat UPDATE
mock.ExpectExec("UPDATE workspaces SET").
@@ -339,9 +339,9 @@ func TestHeartbeatHandler_Recovery(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
// Expect evaluateStatus SELECT — currently degraded
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil))
// Expect status transition back to online
mock.ExpectExec("UPDATE workspaces SET status =").
@@ -727,7 +727,7 @@ func TestHeartbeatHandler_TaskChanged(t *testing.T) {
// Expect prevTask SELECT — currently "old task"
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("old task"))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("old task", 0))
// Expect heartbeat UPDATE with new task
mock.ExpectExec("UPDATE workspaces SET").
@@ -735,9 +735,9 @@ func TestHeartbeatHandler_TaskChanged(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
// Expect evaluateStatus SELECT
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -913,7 +913,7 @@ func TestHeartbeatHandler_TaskUnchanged(t *testing.T) {
// Expect prevTask SELECT — task is already "doing work"
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("doing work"))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("doing work", 0))
// Expect heartbeat UPDATE with same task
mock.ExpectExec("UPDATE workspaces SET").
@@ -921,9 +921,9 @@ func TestHeartbeatHandler_TaskUnchanged(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
// Expect evaluateStatus SELECT
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
// NO TASK_UPDATED broadcast expected — task didn't change
@@ -956,7 +956,7 @@ func TestHeartbeatHandler_TaskCleared(t *testing.T) {
// Expect prevTask SELECT — was doing something
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("old task"))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("old task", 0))
// Expect heartbeat UPDATE with empty task
mock.ExpectExec("UPDATE workspaces SET").
@@ -964,9 +964,9 @@ func TestHeartbeatHandler_TaskCleared(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
// Expect evaluateStatus SELECT
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
// TASK_UPDATED broadcast expected — changed from "old task" to ""
// (BroadcastOnly doesn't hit sqlmock, so no expectation needed)
@@ -1019,13 +1019,13 @@ func TestHeartbeatHandler_AlwaysBroadcastsHeartbeat(t *testing.T) {
// Pre-fix this path emitted ZERO broadcasts.
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("doing work"))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("doing work", 0))
mock.ExpectExec("UPDATE workspaces SET").
WithArgs("ws-123", 0.0, "", 1, 500, "doing work").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-123").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -36,7 +36,7 @@ func TestHeartbeat_NativeStatusMgmt_SkipsDegradeInference(t *testing.T) {
// prevTask SELECT (before UPDATE)
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-native-status").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
// heartbeat UPDATE — same as the non-native path
mock.ExpectExec("UPDATE workspaces SET").
@@ -48,9 +48,9 @@ func TestHeartbeat_NativeStatusMgmt_SkipsDegradeInference(t *testing.T) {
// MUST NOT. We deliberately don't ExpectExec the degrade UPDATE
// — sqlmock fails the test if any UPDATE happens that wasn't
// expected, which is the regression cover.
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-native-status").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -89,7 +89,7 @@ func TestHeartbeat_NativeStatusMgmt_SkipsRecovery(t *testing.T) {
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-native-recovery").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
// heartbeat UPDATE — error_rate=0.05 would fire recovery
mock.ExpectExec("UPDATE workspaces SET").
@@ -99,9 +99,9 @@ func TestHeartbeat_NativeStatusMgmt_SkipsRecovery(t *testing.T) {
// evaluateStatus SELECT — currently degraded; recovery branch
// would normally fire UPDATE → online + WORKSPACE_ONLINE broadcast.
// Under native_status_mgmt, neither should run.
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-native-recovery").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -135,7 +135,7 @@ func TestHeartbeat_NativeStatusMgmt_WedgedStillRespected(t *testing.T) {
mock.ExpectQuery("SELECT COALESCE\\(current_task").
WithArgs("ws-wedged").
WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow(""))
WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0))
// heartbeat UPDATE — RuntimeState="wedged" means sample_error
// reflects the wedge reason, error_rate stays 0
@@ -144,9 +144,9 @@ func TestHeartbeat_NativeStatusMgmt_WedgedStillRespected(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
// evaluateStatus SELECT — currently online, wedged branch SHOULD fire
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-wedged").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
// Wedged degrade UPDATE — must still happen even with native_status_mgmt
mock.ExpectExec("UPDATE workspaces SET status =").
+78 -12
View File
@@ -393,12 +393,24 @@ func (h *RegistryHandler) platformAgentHasModelSecret(ctx context.Context, works
return exists, err
}
// platformAgentMCPServerPresent converts the runtime's optional
// "MCP server present" declaration (the platform-agent image's
// /opt/molecule-mcp-server binary) into a plain boolean.
//
// The declaration arrives as a *bool so that "not declared" (nil) can be
// told apart from an explicit false. Both are answered with false here —
// fail-closed: only an explicit true counts as proof that the concierge MCP
// server is available.
func (h *RegistryHandler) platformAgentMCPServerPresent(present *bool) bool {
	// No declaration at all: the runtime has proven nothing, so deny.
	if present == nil {
		return false
	}
	// Declared: pass the runtime's own answer through unchanged.
	return *present
}
// markWorkspaceFailed updates a workspace row to status='failed' and broadcasts
// WORKSPACE_PROVISION_FAILED. It is a RegistryHandler-local fallback for the
// fail-closed platform-agent identity gate; the WorkspaceHandler's
// markProvisionFailed is the primary path during provisioning.
func (h *RegistryHandler) markWorkspaceFailed(ctx context.Context, workspaceID, msg string) {
extra := map[string]interface{}{"error": msg, "code": "PLATFORM_AGENT_IDENTITY_GATE"}
func (h *RegistryHandler) markWorkspaceFailed(ctx context.Context, workspaceID, msg, reason string) {
extra := map[string]interface{}{
"error": msg,
"code": "PLATFORM_AGENT_IDENTITY_GATE",
"reason": reason,
}
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceProvisionFailed), workspaceID, extra)
if _, dbErr := db.DB.ExecContext(ctx,
`UPDATE workspaces SET status = $3, last_sample_error = $2, updated_at = now() WHERE id = $1`,
@@ -539,27 +551,45 @@ func (h *RegistryHandler) Register(c *gin.Context) {
}
// Issue #2970: fail CLOSED if a platform agent reaches registration without
// the seeded MODEL workspace_secret. The MISSING_MODEL gate in
// BOTH the seeded MODEL workspace_secret AND the platform-agent image's baked
// /opt/molecule-mcp-server binary. The MISSING_MODEL gate in
// prepareProvisionContext is the primary defense, but if a model-less/identity-
// less concierge somehow boots on a path that bypasses that gate (e.g. an old
// or generic image), this second-layer guard prevents it from ever marking
// less/mcp-less concierge somehow boots on a path that bypasses that gate (e.g.
// an old or generic image), this second-layer guard prevents it from ever marking
// itself online-routable. Instead we mark the workspace failed so the canvas
// surfaces a provision failure rather than serving users a generic Claude Code.
//
// The runtime declares mcp-server availability via payload.mcp_server_present.
// A nil/false value is fail-closed: an undeclared or missing MCP server cannot be
// trusted for a concierge.
//
// existingState.ExistingKind is populated by fetchExistingWorkspaceStateForDiagnostics
// (best-effort). We treat "platform" literally; any other value (including "(new)"
// or "(unavailable)") means the gate does not apply unless payload.Kind itself is
// "platform" (covered by the privilege-escalation precheck above).
if payload.Kind == models.KindPlatform || existingState.ExistingKind == models.KindPlatform {
if hasModel, mErr := h.platformAgentHasModelSecret(ctx, payload.ID); mErr != nil {
hasModel, mErr := h.platformAgentHasModelSecret(ctx, payload.ID)
if mErr != nil {
log.Printf("Registry register: model secret lookup failed for %s: %v", payload.ID, mErr)
c.JSON(http.StatusInternalServerError, gin.H{"error": "registration failed"})
return
} else if !hasModel {
msg := "platform agent registered without a seeded MODEL secret; refusing online"
}
hasMCP := h.platformAgentMCPServerPresent(payload.MCPServerPresent)
if !hasModel || !hasMCP {
var msg, reason, logCode string
switch {
case !hasModel:
msg = "platform agent registered without a seeded MODEL secret; refusing online"
reason = "model_missing"
logCode = "platform_agent_model_missing"
case !hasMCP:
msg = "platform agent registered without /opt/molecule-mcp-server; refusing online"
reason = "mcp_server_missing"
logCode = "platform_agent_mcp_server_missing"
}
log.Printf("Registry register: %s (workspace=%s)", msg, payload.ID)
h.markWorkspaceFailed(ctx, payload.ID, msg)
logRegister400Reason("platform_agent_model_missing", payload.ID, payload, existingState, msg)
h.markWorkspaceFailed(ctx, payload.ID, msg, reason)
logRegister400Reason(logCode, payload.ID, payload, existingState, msg)
c.JSON(http.StatusBadRequest, gin.H{"error": "platform agent identity incomplete"})
return
}
@@ -1193,14 +1223,50 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
ctx := c.Request.Context()
var currentStatus string
var currentKind string
var lastRegisterFailure sql.NullTime
err := db.DB.QueryRowContext(ctx, `SELECT status, last_register_failure_at FROM workspaces WHERE id = $1`, payload.WorkspaceID).
Scan(&currentStatus, &lastRegisterFailure)
err := db.DB.QueryRowContext(ctx, `SELECT status, kind, last_register_failure_at FROM workspaces WHERE id = $1`, payload.WorkspaceID).
Scan(&currentStatus, &currentKind, &lastRegisterFailure)
if err != nil {
return
}
hasRecentRegisterFailure := lastRegisterFailure.Valid && time.Since(lastRegisterFailure.Time) < 5*time.Minute
// FAIL-CLOSED concierge online-marking gate (RCA #2970).
// A kind='platform' workspace that has lost either its seeded MODEL secret or
// the image-baked /opt/molecule-mcp-server binary must never be allowed back
// to status='online' via heartbeat recovery. The Register handler already gates
// the initial online marking; this gate closes the heartbeat-driven recovery
// paths (provisioning/failed/offline/awaiting_agent/degraded → online) that
// would otherwise resurrect a model-less/mcp-less concierge and let it serve
// users generic Claude Code.
//
// The runtime now declares mcp-server availability via
// payload.mcp_server_present on every heartbeat/register call. nil/false is
// fail-closed: an old/generic runtime cannot prove it is a real concierge.
if currentKind == models.KindPlatform {
hasModel, mErr := h.platformAgentHasModelSecret(ctx, payload.WorkspaceID)
if mErr != nil {
log.Printf("Heartbeat: model secret lookup failed for platform agent %s: %v", payload.WorkspaceID, mErr)
return
}
hasMCP := h.platformAgentMCPServerPresent(payload.MCPServerPresent)
if !hasModel || !hasMCP {
var msg, reason string
switch {
case !hasModel:
msg = "platform agent heartbeat denied: no seeded MODEL workspace_secret; refusing to mark online (RCA #2970 FAIL-CLOSED)"
reason = "model_missing"
case !hasMCP:
msg = "platform agent heartbeat denied: /opt/molecule-mcp-server missing; refusing to mark online (RCA #2970 FAIL-CLOSED)"
reason = "mcp_server_missing"
}
log.Printf("Heartbeat: %s (workspace=%s)", msg, payload.WorkspaceID)
h.markWorkspaceFailed(ctx, payload.WorkspaceID, msg, reason)
return
}
}
// Self-reported runtime wedge: takes precedence over the error_rate
// path. The heartbeat task lives in its own asyncio task and keeps
// firing 200s even after claude_agent_sdk locks up on
@@ -291,9 +291,9 @@ func TestHeartbeatHandler_OfflineToOnline(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
// Expect evaluateStatus SELECT — currently offline
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-offline").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("offline", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("offline", "", nil))
// Expect status transition back to online
mock.ExpectExec("UPDATE workspaces SET status =").
@@ -362,9 +362,9 @@ func TestHeartbeatHandler_ProvisioningToOnline(t *testing.T) {
// evaluateStatus SELECT — reads the post-CASE status ('online'), so its own
// provisioning→online branch does NOT fire (no duplicate transition exec).
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-provisioning").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -414,9 +414,9 @@ func TestHeartbeatHandler_FailedToOnline(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
// evaluateStatus SELECT — currently failed (provision-timeout sweeper flip)
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-failed").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("failed", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("failed", "", nil))
// the new failed → online recovery transition
mock.ExpectExec("UPDATE workspaces SET status =").
@@ -463,9 +463,9 @@ func TestHeartbeatHandler_AwaitingAgentToOnline(t *testing.T) {
WithArgs("ws-external", 0.0, "", 0, 60, "").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-external").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("awaiting_agent", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("awaiting_agent", "", nil))
// The new branch — UPDATE ... WHERE status = 'awaiting_agent'
mock.ExpectExec("UPDATE workspaces SET status =").
@@ -586,9 +586,9 @@ func TestHeartbeatHandler_OnlineStaysOnline(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
// evaluateStatus: online with error_rate 0.2 — below 0.5 threshold, stays online
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-stable").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -635,9 +635,9 @@ func TestHeartbeatHandler_RuntimeWedged_FlipsOnlineToDegraded(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
// evaluateStatus: currentStatus = online
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-wedged").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
// The wedge-handling branch fires the degraded UPDATE with the
// `AND status = 'online'` guard (race-safe against concurrent
@@ -688,9 +688,9 @@ func TestHeartbeatHandler_DegradedRecoversOnlyAfterWedgeClears(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
// currentStatus = degraded
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-still-wedged").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil))
// No additional UPDATE expected — the recovery branch's
// `runtime_state == ""` guard blocks the flip back to online.
@@ -733,9 +733,9 @@ func TestHeartbeatHandler_DegradedToOnline_AfterWedgeClears(t *testing.T) {
WithArgs("ws-recovered", 0.0, "", 0, 30, "").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-recovered").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil))
// Recovery UPDATE fires (degraded → online).
mock.ExpectExec("UPDATE workspaces SET status =").
@@ -978,7 +978,7 @@ func TestHeartbeat_SkipsRemovedRows(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 0))
// evaluateStatus SELECT
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id").
WithArgs("ws-zombie").
WillReturnError(sql.ErrNoRows) // row effectively removed from view
@@ -1019,9 +1019,9 @@ func TestHeartbeatHandler_BackfillsAgentCard_WhenNull(t *testing.T) {
WithArgs("ws-nocard", sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-nocard").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow(models.StatusOnline, nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow(models.StatusOnline, "", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -1059,9 +1059,9 @@ func TestHeartbeatHandler_SkipsAgentCardBackfill_WhenAlreadySet(t *testing.T) {
WithArgs("ws-hascard", sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 0))
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-hascard").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow(models.StatusOnline, nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow(models.StatusOnline, "", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -1107,9 +1107,9 @@ func TestHeartbeatHandler_BackfillAgentCard_ClearsRegisterFailure(t *testing.T)
// Status check sees degraded, but last_register_failure_at is now NULL because
// the agent_card backfill UPDATE cleared it.
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-degraded-register-fail").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow(models.StatusDegraded, nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow(models.StatusDegraded, "", nil))
// Because the failure marker was cleared by the backfill, evaluateStatus
// should now recover the workspace to online.
@@ -1754,9 +1754,9 @@ func TestHeartbeat_MonthlySpend_WithinBounds(t *testing.T) {
WithArgs("ws-spend-ok", 0.0, "", 0, 0, "", int64(15000)). // $150.00
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id").
WithArgs("ws-spend-ok").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -1790,9 +1790,9 @@ func TestHeartbeat_MonthlySpend_NegativeClamped(t *testing.T) {
WithArgs("ws-spend-neg", 0.0, "", 0, 0, "").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id").
WithArgs("ws-spend-neg").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -1826,9 +1826,9 @@ func TestHeartbeat_MonthlySpend_OverflowClamped(t *testing.T) {
WithArgs("ws-spend-overflow", 0.0, "", 0, 0, "", int64(1_000_000_000_000)).
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id").
WithArgs("ws-spend-overflow").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -1861,9 +1861,9 @@ func TestHeartbeat_MonthlySpend_ExactCap(t *testing.T) {
WithArgs("ws-spend-cap", 0.0, "", 0, 0, "", int64(1_000_000_000_000)).
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id").
WithArgs("ws-spend-cap").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -1897,9 +1897,9 @@ func TestHeartbeat_MonthlySpend_Zero_NoUpdate(t *testing.T) {
WithArgs("ws-spend-zero", 0.0, "", 0, 0, "").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id").
WithArgs("ws-spend-zero").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -2143,7 +2143,7 @@ func TestRegister_AllowsAlreadyPlatformReRegister(t *testing.T) {
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Request = httptest.NewRequest("POST", "/registry/register",
bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","agent_card":{"name":"concierge"}}`))
bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","mcp_server_present":true,"agent_card":{"name":"concierge"}}`))
c.Request.Header.Set("Content-Type", "application/json")
handler.Register(c)
@@ -2270,7 +2270,7 @@ func TestRegister_PlatformAgentMissingModelSecret_FailsClosed(t *testing.T) {
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Request = httptest.NewRequest("POST", "/registry/register",
bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","agent_card":{"name":"concierge"}}`))
bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","mcp_server_present":true,"agent_card":{"name":"concierge"}}`))
c.Request.Header.Set("Content-Type", "application/json")
handler.Register(c)
@@ -2559,9 +2559,9 @@ func TestHeartbeatHandler_DeliversPlatformInboundSecret(t *testing.T) {
WithArgs("ws-with-secret", 0.0, "", 0, 100, "").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-with-secret").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
// readOrLazyHealInboundSecret — short-circuit: secret already on file.
mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`).
@@ -2614,9 +2614,9 @@ func TestHeartbeatHandler_LazyHealsPlatformInboundSecret(t *testing.T) {
WithArgs("ws-needs-heal", 0.0, "", 0, 100, "").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-needs-heal").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
// readOrLazyHealInboundSecret — NULL column triggers mint.
mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`).
@@ -2670,9 +2670,9 @@ func TestHeartbeatHandler_OmitsSecretOnHealFailure(t *testing.T) {
WithArgs("ws-heal-fails", 0.0, "", 0, 100, "").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-heal-fails").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil))
// Read returns NULL → mint is attempted...
mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`).
@@ -3037,10 +3037,10 @@ func TestHeartbeat_RecentRegisterFailure_DegradesWorkspace(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
// evaluateStatus SELECT — online with recent register failure
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-degrade-reg").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).
AddRow("online", time.Now().Add(-2*time.Minute)))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).
AddRow("online", "", time.Now().Add(-2*time.Minute)))
// Degrade UPDATE
mock.ExpectExec("UPDATE workspaces SET status =").
@@ -3087,10 +3087,10 @@ func TestHeartbeat_RecentRegisterFailure_BlocksRecovery(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
// evaluateStatus SELECT — degraded with recent register failure
mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id =").
mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
WithArgs("ws-no-recover").
WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).
AddRow("degraded", time.Now().Add(-2*time.Minute)))
WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).
AddRow("degraded", "", time.Now().Add(-2*time.Minute)))
// NO recovery UPDATE expected — register failure blocks recovery.
@@ -3421,3 +3421,112 @@ func TestRegister_400_LogsExistingRowState(t *testing.T) {
// + resolveDeliveryMode (all run BEFORE the isSafeURL check), which
// is brittle. The test coverage gap is documented; the test
// would be a redundant copy of the UpdateCard coverage.
// TestRegister_PlatformAgentMissingMCPServer_FailsClosed covers the register
// side of issue #2970. A kind="platform" workspace registers with a MODEL
// secret on file but with mcp_server_present=false in its payload. The test
// expects Register to answer 400, emit a structure event and move the row to
// models.StatusFailed rather than letting the concierge come online.
func TestRegister_PlatformAgentMissingMCPServer_FailsClosed(t *testing.T) {
	const wsID = "ws-platform-no-mcp"

	mock := setupTestDB(t)
	setupTestRedis(t)
	handler := NewRegistryHandler(newTestBroadcaster())

	// Zero live auth tokens: the request goes down the bootstrap path.
	noTokens := sqlmock.NewRows([]string{"count"}).AddRow(0)
	mock.ExpectQuery("SELECT COUNT\\(\\*\\) FROM workspace_auth_tokens").
		WithArgs(wsID).
		WillReturnRows(noTokens)

	// The kind precheck finds an existing row that is already kind="platform".
	existingKind := sqlmock.NewRows([]string{"kind"}).AddRow("platform")
	mock.ExpectQuery("SELECT kind FROM workspaces WHERE id").
		WithArgs(wsID).
		WillReturnRows(existingKind)

	// The MODEL secret is present, so any rejection must come from the
	// mcp_server_present=false declaration in the payload.
	modelSecret := sqlmock.NewRows([]string{"exists"}).AddRow(true)
	mock.ExpectQuery("SELECT EXISTS\\(SELECT 1 FROM workspace_secrets WHERE workspace_id = \\$1 AND key = 'MODEL'\\)").
		WithArgs(wsID).
		WillReturnRows(modelSecret)

	// Expected fallout of the gate tripping: a structure_events row (the
	// WORKSPACE_PROVISION_FAILED broadcast) followed by the status flip to
	// models.StatusFailed.
	mock.ExpectExec("INSERT INTO structure_events").
		WillReturnResult(sqlmock.NewResult(0, 1))
	mock.ExpectExec("UPDATE workspaces SET status = \\$3, last_sample_error = \\$2, updated_at = now\\(\\) WHERE id = \\$1").
		WithArgs(wsID, sqlmock.AnyArg(), models.StatusFailed).
		WillReturnResult(sqlmock.NewResult(0, 1))

	payload := `{"id":"` + wsID + `","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","mcp_server_present":false,"agent_card":{"name":"concierge"}}`
	req := httptest.NewRequest("POST", "/registry/register", bytes.NewBufferString(payload))
	req.Header.Set("Content-Type", "application/json")

	rec := httptest.NewRecorder()
	ginCtx, _ := gin.CreateTestContext(rec)
	ginCtx.Request = req

	handler.Register(ginCtx)

	if got := rec.Code; got != http.StatusBadRequest {
		t.Fatalf("platform agent missing MCP server: expected 400, got %d: %s", got, rec.Body.String())
	}
	// Every mocked statement above must have been consumed by the handler.
	if unmet := mock.ExpectationsWereMet(); unmet != nil {
		t.Errorf("unmet expectations: %v", unmet)
	}
}
// TestHeartbeat_PlatformAgentMissingMCPServer_FailsClosed covers the heartbeat
// side of issue #2970. A kind='platform' workspace that is still
// "provisioning" sends a heartbeat carrying mcp_server_present=false. The test
// expects the handler to answer 200 while emitting a structure event and
// moving the row to models.StatusFailed instead of recovering it to online.
func TestHeartbeat_PlatformAgentMissingMCPServer_FailsClosed(t *testing.T) {
	const wsID = "ws-platform-heartbeat-no-mcp"

	mock := setupTestDB(t)
	setupTestRedis(t)
	handler := NewRegistryHandler(newTestBroadcaster())

	// Previous-task lookup. The pattern only pins the query prefix, so it
	// matches regardless of how many columns the statement selects (the
	// original note refers to a 3-column query on main).
	// NOTE(review): this row carries only current_task, whereas sibling
	// heartbeat tests return current_task plus monthly_spend — confirm the
	// handler tolerates the narrower row.
	prevTask := sqlmock.NewRows([]string{"current_task"}).AddRow("")
	mock.ExpectQuery("SELECT COALESCE\\(current_task").
		WithArgs(wsID).
		WillReturnRows(prevTask)

	// Heartbeat bookkeeping UPDATE; six args, i.e. the variant without a
	// monthly-spend value.
	mock.ExpectExec("UPDATE workspaces SET").
		WithArgs(wsID, 0.0, "", 0, 60, "").
		WillReturnResult(sqlmock.NewResult(0, 1))

	// Status evaluation sees a platform workspace still in provisioning, the
	// state from which a broken concierge could otherwise be promoted.
	current := sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).
		AddRow("provisioning", "platform", nil)
	mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id =").
		WithArgs(wsID).
		WillReturnRows(current)

	// MODEL secret is on file; only the missing MCP server should trip the gate.
	modelSecret := sqlmock.NewRows([]string{"exists"}).AddRow(true)
	mock.ExpectQuery("SELECT EXISTS\\(SELECT 1 FROM workspace_secrets WHERE workspace_id = \\$1 AND key = 'MODEL'\\)").
		WithArgs(wsID).
		WillReturnRows(modelSecret)

	// Expected fallout of the gate tripping: a structure_events row (the
	// WORKSPACE_PROVISION_FAILED broadcast) followed by the status flip to
	// models.StatusFailed.
	mock.ExpectExec("INSERT INTO structure_events").
		WillReturnResult(sqlmock.NewResult(0, 1))
	mock.ExpectExec("UPDATE workspaces SET status = \\$3, last_sample_error = \\$2, updated_at = now\\(\\) WHERE id = \\$1").
		WithArgs(wsID, sqlmock.AnyArg(), models.StatusFailed).
		WillReturnResult(sqlmock.NewResult(0, 1))

	payload := `{"workspace_id":"` + wsID + `","error_rate":0.0,"sample_error":"","active_tasks":0,"uptime_seconds":60,"mcp_server_present":false}`
	req := httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(payload))
	req.Header.Set("Content-Type", "application/json")

	rec := httptest.NewRecorder()
	ginCtx, _ := gin.CreateTestContext(rec)
	ginCtx.Request = req

	handler.Heartbeat(ginCtx)

	// The heartbeat itself is still acknowledged; the failure is reflected in
	// the workspace row, not in the HTTP status.
	if got := rec.Code; got != http.StatusOK {
		t.Fatalf("heartbeat mcp gate: expected 200, got %d: %s", got, rec.Body.String())
	}
	if unmet := mock.ExpectationsWereMet(); unmet != nil {
		t.Errorf("unmet expectations: %v", unmet)
	}
}
@@ -53,7 +53,8 @@ const runtimeRegisterBody = `{
"skills": [{"id": "coding", "name": "coding", "description": "coding", "tags": []}],
"capabilities": {"streaming": true, "pushNotifications": false},
"configuration_status": "ready"
}
},
"mcp_server_present": false
}`
func TestRegisterPayload_RuntimeBodyBinds(t *testing.T) {
@@ -70,6 +71,9 @@ func TestRegisterPayload_RuntimeBodyBinds(t *testing.T) {
if p.URL == "" {
t.Error("url should round-trip from the runtime body")
}
if p.MCPServerPresent == nil {
t.Error("mcp_server_present must decode (nil would be fail-closed)")
}
}
func TestRegisterPayload_MissingID_Rejected(t *testing.T) {
@@ -98,7 +102,8 @@ const runtimeHeartbeatBody = `{
"sample_error": "",
"active_tasks": 0,
"current_task": "",
"uptime_seconds": 42
"uptime_seconds": 42,
"mcp_server_present": false
}`
func TestHeartbeatPayload_RuntimeBodyBinds(t *testing.T) {
@@ -112,6 +117,9 @@ func TestHeartbeatPayload_RuntimeBodyBinds(t *testing.T) {
if p.UptimeSeconds != 42 {
t.Errorf("uptime_seconds not decoded: %d", p.UptimeSeconds)
}
if p.MCPServerPresent == nil {
t.Error("mcp_server_present must decode (nil would be fail-closed)")
}
}
// The wedged-runtime heartbeat (heartbeat.py _runtime_state_payload +
@@ -102,6 +102,11 @@ type RegisterPayload struct {
// the row to be its own org root (parent_id IS NULL) and to be the only
// platform agent in the org — enforced by the Register handler.
Kind string `json:"kind,omitempty"`
// MCPServerPresent is the runtime's declaration that the platform-agent
// image's baked /opt/molecule-mcp-server binary is present. For platform
// agents the controlplane treats nil/false as fail-closed (RCA #2970).
// Non-platform workspaces may omit this field.
MCPServerPresent *bool `json:"mcp_server_present,omitempty"`
}
type HeartbeatPayload struct {
@@ -149,6 +154,10 @@ type HeartbeatPayload struct {
// The heartbeat handler backfills NULL agent_card rows so the workspace
// can come online without requiring a full re-register. (#2421)
AgentCard json.RawMessage `json:"agent_card,omitempty"`
// MCPServerPresent mirrors the register payload field on every heartbeat
// so the fail-closed platform-agent gate can block recovery paths that
// would otherwise resurrect an mcp-less concierge (RCA #2970).
MCPServerPresent *bool `json:"mcp_server_present,omitempty"`
}
// RuntimeMetadata is the adapter-declared capability + override block