diff --git a/workspace-server/internal/handlers/handlers_additional_test.go b/workspace-server/internal/handlers/handlers_additional_test.go index d34ee8a54..46ebe317f 100644 --- a/workspace-server/internal/handlers/handlers_additional_test.go +++ b/workspace-server/internal/handlers/handlers_additional_test.go @@ -455,15 +455,15 @@ func TestHeartbeat_ExactThreshold_Degraded(t *testing.T) { mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-edge"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0)) mock.ExpectExec("UPDATE workspaces SET"). WithArgs("ws-edge", 0.5, "edge case", 0, 500, ""). WillReturnResult(sqlmock.NewResult(0, 1)) // error_rate == 0.5 should trigger degraded (>= 0.5) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-edge"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) mock.ExpectExec("UPDATE workspaces SET status ="). WithArgs(models.StatusDegraded, "ws-edge"). WillReturnResult(sqlmock.NewResult(0, 1)) @@ -496,15 +496,15 @@ func TestHeartbeat_DegradedRecovery(t *testing.T) { mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-rec"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0)) mock.ExpectExec("UPDATE workspaces SET"). WithArgs("ws-rec", 0.05, "", 1, 2000, ""). WillReturnResult(sqlmock.NewResult(0, 1)) // Currently degraded, error_rate < 0.1 → should recover to online - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-rec"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil)) mock.ExpectExec("UPDATE workspaces SET status ="). WithArgs(models.StatusOnline, "ws-rec"). WillReturnResult(sqlmock.NewResult(0, 1)) @@ -538,15 +538,15 @@ func TestHeartbeat_ErrorRateDegrade_Guarded(t *testing.T) { mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-degrade-guard"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0)) mock.ExpectExec("UPDATE workspaces SET"). WithArgs("ws-degrade-guard", 0.6, "", 1, 100, ""). WillReturnResult(sqlmock.NewResult(0, 1)) // Stale read: heartbeat started before CascadeDelete set status='removed' - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-degrade-guard"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) // Guarded UPDATE returns 0 rows because row is actually 'removed' mock.ExpectExec("UPDATE workspaces SET status =.*AND status = 'online'"). @@ -584,15 +584,15 @@ func TestHeartbeat_DegradedRecovery_Guarded(t *testing.T) { mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-recover-guard"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0)) mock.ExpectExec("UPDATE workspaces SET"). WithArgs("ws-recover-guard", 0.05, "", 1, 100, ""). WillReturnResult(sqlmock.NewResult(0, 1)) // Stale read: heartbeat started before CascadeDelete set status='removed' - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-recover-guard"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil)) // Guarded UPDATE returns 0 rows because row is actually 'removed' mock.ExpectExec("UPDATE workspaces SET status =.*AND status = 'degraded'"). @@ -1479,7 +1479,7 @@ func TestHeartbeatHandler_RegisterFailureClearedOnCardBearingRestart(t *testing. // prevTask SELECT mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-2739"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0)) // heartbeat UPDATE mock.ExpectExec("UPDATE workspaces SET"). @@ -1499,9 +1499,9 @@ func TestHeartbeatHandler_RegisterFailureClearedOnCardBearingRestart(t *testing. // evaluateStatus SELECT — workspace is degraded; failure marker is now NULL // (the clear above wiped it in the real DB), so recovery is unblocked. - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-2739"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil)) // degraded -> online recovery mock.ExpectExec("UPDATE workspaces SET status ="). diff --git a/workspace-server/internal/handlers/handlers_test.go b/workspace-server/internal/handlers/handlers_test.go index 50778fea7..3d0fa850f 100644 --- a/workspace-server/internal/handlers/handlers_test.go +++ b/workspace-server/internal/handlers/handlers_test.go @@ -244,7 +244,7 @@ func TestHeartbeatHandler_Normal(t *testing.T) { // Expect prevTask SELECT (before UPDATE) mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-123"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0)) // Expect heartbeat UPDATE mock.ExpectExec("UPDATE workspaces SET"). @@ -252,9 +252,9 @@ func TestHeartbeatHandler_Normal(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) // Expect evaluateStatus SELECT - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-123"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -283,7 +283,7 @@ func TestHeartbeatHandler_Degraded(t *testing.T) { // Expect prevTask SELECT (before UPDATE) mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-123"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0)) // Expect heartbeat UPDATE mock.ExpectExec("UPDATE workspaces SET"). @@ -291,9 +291,9 @@ func TestHeartbeatHandler_Degraded(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) // Expect evaluateStatus SELECT — currently online - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-123"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) // Expect status transition to degraded mock.ExpectExec("UPDATE workspaces SET status ="). @@ -331,7 +331,7 @@ func TestHeartbeatHandler_Recovery(t *testing.T) { // Expect prevTask SELECT (before UPDATE) mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-123"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0)) // Expect heartbeat UPDATE mock.ExpectExec("UPDATE workspaces SET"). @@ -339,9 +339,9 @@ func TestHeartbeatHandler_Recovery(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) // Expect evaluateStatus SELECT — currently degraded - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-123"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil)) // Expect status transition back to online mock.ExpectExec("UPDATE workspaces SET status ="). @@ -727,7 +727,7 @@ func TestHeartbeatHandler_TaskChanged(t *testing.T) { // Expect prevTask SELECT — currently "old task" mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-123"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("old task")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("old task", 0)) // Expect heartbeat UPDATE with new task mock.ExpectExec("UPDATE workspaces SET"). @@ -735,9 +735,9 @@ func TestHeartbeatHandler_TaskChanged(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) // Expect evaluateStatus SELECT - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-123"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -913,7 +913,7 @@ func TestHeartbeatHandler_TaskUnchanged(t *testing.T) { // Expect prevTask SELECT — task is already "doing work" mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-123"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("doing work")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("doing work", 0)) // Expect heartbeat UPDATE with same task mock.ExpectExec("UPDATE workspaces SET"). @@ -921,9 +921,9 @@ func TestHeartbeatHandler_TaskUnchanged(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) // Expect evaluateStatus SELECT - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-123"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) // NO TASK_UPDATED broadcast expected — task didn't change @@ -956,7 +956,7 @@ func TestHeartbeatHandler_TaskCleared(t *testing.T) { // Expect prevTask SELECT — was doing something mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-123"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("old task")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("old task", 0)) // Expect heartbeat UPDATE with empty task mock.ExpectExec("UPDATE workspaces SET"). @@ -964,9 +964,9 @@ func TestHeartbeatHandler_TaskCleared(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) // Expect evaluateStatus SELECT - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-123"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) // TASK_UPDATED broadcast expected — changed from "old task" to "" // (BroadcastOnly doesn't hit sqlmock, so no expectation needed) @@ -1019,13 +1019,13 @@ func TestHeartbeatHandler_AlwaysBroadcastsHeartbeat(t *testing.T) { // Pre-fix this path emitted ZERO broadcasts. mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-123"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("doing work")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("doing work", 0)) mock.ExpectExec("UPDATE workspaces SET"). WithArgs("ws-123", 0.0, "", 1, 500, "doing work"). WillReturnResult(sqlmock.NewResult(0, 1)) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-123"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) diff --git a/workspace-server/internal/handlers/native_status_mgmt_test.go b/workspace-server/internal/handlers/native_status_mgmt_test.go index fd8dc11df..7269051d0 100644 --- a/workspace-server/internal/handlers/native_status_mgmt_test.go +++ b/workspace-server/internal/handlers/native_status_mgmt_test.go @@ -36,7 +36,7 @@ func TestHeartbeat_NativeStatusMgmt_SkipsDegradeInference(t *testing.T) { // prevTask SELECT (before UPDATE) mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-native-status"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0)) // heartbeat UPDATE — same as the non-native path mock.ExpectExec("UPDATE workspaces SET"). @@ -48,9 +48,9 @@ func TestHeartbeat_NativeStatusMgmt_SkipsDegradeInference(t *testing.T) { // MUST NOT. We deliberately don't ExpectExec the degrade UPDATE // — sqlmock fails the test if any UPDATE happens that wasn't // expected, which is the regression cover. - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-native-status"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -89,7 +89,7 @@ func TestHeartbeat_NativeStatusMgmt_SkipsRecovery(t *testing.T) { mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-native-recovery"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0)) // heartbeat UPDATE — error_rate=0.05 would fire recovery mock.ExpectExec("UPDATE workspaces SET"). @@ -99,9 +99,9 @@ func TestHeartbeat_NativeStatusMgmt_SkipsRecovery(t *testing.T) { // evaluateStatus SELECT — currently degraded; recovery branch // would normally fire UPDATE → online + WORKSPACE_ONLINE broadcast. // Under native_status_mgmt, neither should run. - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-native-recovery"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil)) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -135,7 +135,7 @@ func TestHeartbeat_NativeStatusMgmt_WedgedStillRespected(t *testing.T) { mock.ExpectQuery("SELECT COALESCE\\(current_task"). WithArgs("ws-wedged"). - WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + WillReturnRows(sqlmock.NewRows([]string{"current_task", "monthly_spend"}).AddRow("", 0)) // heartbeat UPDATE — RuntimeState="wedged" means sample_error // reflects the wedge reason, error_rate stays 0 @@ -144,9 +144,9 @@ func TestHeartbeat_NativeStatusMgmt_WedgedStillRespected(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) // evaluateStatus SELECT — currently online, wedged branch SHOULD fire - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-wedged"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) // Wedged degrade UPDATE — must still happen even with native_status_mgmt mock.ExpectExec("UPDATE workspaces SET status ="). diff --git a/workspace-server/internal/handlers/registry.go b/workspace-server/internal/handlers/registry.go index e6eeeada9..583896c61 100644 --- a/workspace-server/internal/handlers/registry.go +++ b/workspace-server/internal/handlers/registry.go @@ -393,12 +393,24 @@ func (h *RegistryHandler) platformAgentHasModelSecret(ctx context.Context, works return exists, err } +// platformAgentMCPServerPresent reports whether the runtime declared the +// platform-agent image's /opt/molecule-mcp-server binary present. The payload +// field is a pointer so an absent declaration (nil) is treated as false — +// fail-closed: an old/generic runtime cannot prove it has the concierge MCP. +func (h *RegistryHandler) platformAgentMCPServerPresent(present *bool) bool { + return present != nil && *present +} + // markWorkspaceFailed updates a workspace row to status='failed' and broadcasts // WORKSPACE_PROVISION_FAILED. It is a RegistryHandler-local fallback for the // fail-closed platform-agent identity gate; the WorkspaceHandler's // markProvisionFailed is the primary path during provisioning. -func (h *RegistryHandler) markWorkspaceFailed(ctx context.Context, workspaceID, msg string) { - extra := map[string]interface{}{"error": msg, "code": "PLATFORM_AGENT_IDENTITY_GATE"} +func (h *RegistryHandler) markWorkspaceFailed(ctx context.Context, workspaceID, msg, reason string) { + extra := map[string]interface{}{ + "error": msg, + "code": "PLATFORM_AGENT_IDENTITY_GATE", + "reason": reason, + } h.broadcaster.RecordAndBroadcast(ctx, string(events.EventWorkspaceProvisionFailed), workspaceID, extra) if _, dbErr := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = $3, last_sample_error = $2, updated_at = now() WHERE id = $1`, @@ -539,27 +551,45 @@ func (h *RegistryHandler) Register(c *gin.Context) { } // Issue #2970: fail CLOSED if a platform agent reaches registration without - // the seeded MODEL workspace_secret. The MISSING_MODEL gate in + // BOTH the seeded MODEL workspace_secret AND the platform-agent image's baked + // /opt/molecule-mcp-server binary. The MISSING_MODEL gate in // prepareProvisionContext is the primary defense, but if a model-less/identity- - // less concierge somehow boots on a path that bypasses that gate (e.g. an old - // or generic image), this second-layer guard prevents it from ever marking + // less/mcp-less concierge somehow boots on a path that bypasses that gate (e.g. + // an old or generic image), this second-layer guard prevents it from ever marking // itself online-routable. Instead we mark the workspace failed so the canvas // surfaces a provision failure rather than serving users a generic Claude Code. // + // The runtime declares mcp-server availability via payload.mcp_server_present. + // A nil/false value is fail-closed: an undeclared or missing MCP server cannot be + // trusted for a concierge. + // // existingState.ExistingKind is populated by fetchExistingWorkspaceStateForDiagnostics // (best-effort). We treat "platform" literally; any other value (including "(new)" // or "(unavailable)") means the gate does not apply unless payload.Kind itself is // "platform" (covered by the privilege-escalation precheck above). if payload.Kind == models.KindPlatform || existingState.ExistingKind == models.KindPlatform { - if hasModel, mErr := h.platformAgentHasModelSecret(ctx, payload.ID); mErr != nil { + hasModel, mErr := h.platformAgentHasModelSecret(ctx, payload.ID) + if mErr != nil { log.Printf("Registry register: model secret lookup failed for %s: %v", payload.ID, mErr) c.JSON(http.StatusInternalServerError, gin.H{"error": "registration failed"}) return - } else if !hasModel { - msg := "platform agent registered without a seeded MODEL secret; refusing online" + } + hasMCP := h.platformAgentMCPServerPresent(payload.MCPServerPresent) + if !hasModel || !hasMCP { + var msg, reason, logCode string + switch { + case !hasModel: + msg = "platform agent registered without a seeded MODEL secret; refusing online" + reason = "model_missing" + logCode = "platform_agent_model_missing" + case !hasMCP: + msg = "platform agent registered without /opt/molecule-mcp-server; refusing online" + reason = "mcp_server_missing" + logCode = "platform_agent_mcp_server_missing" + } log.Printf("Registry register: %s (workspace=%s)", msg, payload.ID) - h.markWorkspaceFailed(ctx, payload.ID, msg) - logRegister400Reason("platform_agent_model_missing", payload.ID, payload, existingState, msg) + h.markWorkspaceFailed(ctx, payload.ID, msg, reason) + logRegister400Reason(logCode, payload.ID, payload, existingState, msg) c.JSON(http.StatusBadRequest, gin.H{"error": "platform agent identity incomplete"}) return } @@ -1193,14 +1223,50 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea ctx := c.Request.Context() var currentStatus string + var currentKind string var lastRegisterFailure sql.NullTime - err := db.DB.QueryRowContext(ctx, `SELECT status, last_register_failure_at FROM workspaces WHERE id = $1`, payload.WorkspaceID). - Scan(¤tStatus, &lastRegisterFailure) + err := db.DB.QueryRowContext(ctx, `SELECT status, kind, last_register_failure_at FROM workspaces WHERE id = $1`, payload.WorkspaceID). + Scan(¤tStatus, ¤tKind, &lastRegisterFailure) if err != nil { return } hasRecentRegisterFailure := lastRegisterFailure.Valid && time.Since(lastRegisterFailure.Time) < 5*time.Minute + // FAIL-CLOSED concierge online-marking gate (RCA #2970). + // A kind='platform' workspace that has lost either its seeded MODEL secret or + // the image-baked /opt/molecule-mcp-server binary must never be allowed back + // to status='online' via heartbeat recovery. The Register handler already gates + // the initial online marking; this gate closes the heartbeat-driven recovery + // paths (provisioning/failed/offline/awaiting_agent/degraded → online) that + // would otherwise resurrect a model-less/mcp-less concierge and let it serve + // users generic Claude Code. + // + // The runtime now declares mcp-server availability via + // payload.mcp_server_present on every heartbeat/register call. nil/false is + // fail-closed: an old/generic runtime cannot prove it is a real concierge. + if currentKind == models.KindPlatform { + hasModel, mErr := h.platformAgentHasModelSecret(ctx, payload.WorkspaceID) + if mErr != nil { + log.Printf("Heartbeat: model secret lookup failed for platform agent %s: %v", payload.WorkspaceID, mErr) + return + } + hasMCP := h.platformAgentMCPServerPresent(payload.MCPServerPresent) + if !hasModel || !hasMCP { + var msg, reason string + switch { + case !hasModel: + msg = "platform agent heartbeat denied: no seeded MODEL workspace_secret; refusing to mark online (RCA #2970 FAIL-CLOSED)" + reason = "model_missing" + case !hasMCP: + msg = "platform agent heartbeat denied: /opt/molecule-mcp-server missing; refusing to mark online (RCA #2970 FAIL-CLOSED)" + reason = "mcp_server_missing" + } + log.Printf("Heartbeat: %s (workspace=%s)", msg, payload.WorkspaceID) + h.markWorkspaceFailed(ctx, payload.WorkspaceID, msg, reason) + return + } + } + // Self-reported runtime wedge: takes precedence over the error_rate // path. The heartbeat task lives in its own asyncio task and keeps // firing 200s even after claude_agent_sdk locks up on diff --git a/workspace-server/internal/handlers/registry_test.go b/workspace-server/internal/handlers/registry_test.go index 6bbb82c44..a6145edbc 100644 --- a/workspace-server/internal/handlers/registry_test.go +++ b/workspace-server/internal/handlers/registry_test.go @@ -291,9 +291,9 @@ func TestHeartbeatHandler_OfflineToOnline(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) // Expect evaluateStatus SELECT — currently offline - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-offline"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("offline", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("offline", "", nil)) // Expect status transition back to online mock.ExpectExec("UPDATE workspaces SET status ="). @@ -362,9 +362,9 @@ func TestHeartbeatHandler_ProvisioningToOnline(t *testing.T) { // evaluateStatus SELECT — reads the post-CASE status ('online'), so its own // provisioning→online branch does NOT fire (no duplicate transition exec). - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-provisioning"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -414,9 +414,9 @@ func TestHeartbeatHandler_FailedToOnline(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) // evaluateStatus SELECT — currently failed (provision-timeout sweeper flip) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-failed"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("failed", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("failed", "", nil)) // the new failed → online recovery transition mock.ExpectExec("UPDATE workspaces SET status ="). @@ -463,9 +463,9 @@ func TestHeartbeatHandler_AwaitingAgentToOnline(t *testing.T) { WithArgs("ws-external", 0.0, "", 0, 60, ""). WillReturnResult(sqlmock.NewResult(0, 1)) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-external"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("awaiting_agent", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("awaiting_agent", "", nil)) // The new branch — UPDATE ... WHERE status = 'awaiting_agent' mock.ExpectExec("UPDATE workspaces SET status ="). @@ -586,9 +586,9 @@ func TestHeartbeatHandler_OnlineStaysOnline(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) // evaluateStatus: online with error_rate 0.2 — below 0.5 threshold, stays online - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-stable"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -635,9 +635,9 @@ func TestHeartbeatHandler_RuntimeWedged_FlipsOnlineToDegraded(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) // evaluateStatus: currentStatus = online - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-wedged"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) // The wedge-handling branch fires the degraded UPDATE with the // `AND status = 'online'` guard (race-safe against concurrent @@ -688,9 +688,9 @@ func TestHeartbeatHandler_DegradedRecoversOnlyAfterWedgeClears(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) // currentStatus = degraded - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-still-wedged"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil)) // No additional UPDATE expected — the recovery branch's // `runtime_state == ""` guard blocks the flip back to online. @@ -733,9 +733,9 @@ func TestHeartbeatHandler_DegradedToOnline_AfterWedgeClears(t *testing.T) { WithArgs("ws-recovered", 0.0, "", 0, 30, ""). WillReturnResult(sqlmock.NewResult(0, 1)) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-recovered"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("degraded", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("degraded", "", nil)) // Recovery UPDATE fires (degraded → online). mock.ExpectExec("UPDATE workspaces SET status ="). @@ -978,7 +978,7 @@ func TestHeartbeat_SkipsRemovedRows(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 0)) // evaluateStatus SELECT - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id"). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id"). WithArgs("ws-zombie"). WillReturnError(sql.ErrNoRows) // row effectively removed from view @@ -1019,9 +1019,9 @@ func TestHeartbeatHandler_BackfillsAgentCard_WhenNull(t *testing.T) { WithArgs("ws-nocard", sqlmock.AnyArg()). WillReturnResult(sqlmock.NewResult(0, 1)) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-nocard"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow(models.StatusOnline, nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow(models.StatusOnline, "", nil)) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -1059,9 +1059,9 @@ func TestHeartbeatHandler_SkipsAgentCardBackfill_WhenAlreadySet(t *testing.T) { WithArgs("ws-hascard", sqlmock.AnyArg()). WillReturnResult(sqlmock.NewResult(0, 0)) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-hascard"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow(models.StatusOnline, nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow(models.StatusOnline, "", nil)) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -1107,9 +1107,9 @@ func TestHeartbeatHandler_BackfillAgentCard_ClearsRegisterFailure(t *testing.T) // Status check sees degraded, but last_register_failure_at is now NULL because // the agent_card backfill UPDATE cleared it. - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-degraded-register-fail"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow(models.StatusDegraded, nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow(models.StatusDegraded, "", nil)) // Because the failure marker was cleared by the backfill, evaluateStatus // should now recover the workspace to online. @@ -1754,9 +1754,9 @@ func TestHeartbeat_MonthlySpend_WithinBounds(t *testing.T) { WithArgs("ws-spend-ok", 0.0, "", 0, 0, "", int64(15000)). // $150.00 WillReturnResult(sqlmock.NewResult(0, 1)) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id"). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id"). WithArgs("ws-spend-ok"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -1790,9 +1790,9 @@ func TestHeartbeat_MonthlySpend_NegativeClamped(t *testing.T) { WithArgs("ws-spend-neg", 0.0, "", 0, 0, ""). WillReturnResult(sqlmock.NewResult(0, 1)) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id"). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id"). WithArgs("ws-spend-neg"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -1826,9 +1826,9 @@ func TestHeartbeat_MonthlySpend_OverflowClamped(t *testing.T) { WithArgs("ws-spend-overflow", 0.0, "", 0, 0, "", int64(1_000_000_000_000)). WillReturnResult(sqlmock.NewResult(0, 1)) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id"). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id"). WithArgs("ws-spend-overflow"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -1861,9 +1861,9 @@ func TestHeartbeat_MonthlySpend_ExactCap(t *testing.T) { WithArgs("ws-spend-cap", 0.0, "", 0, 0, "", int64(1_000_000_000_000)). WillReturnResult(sqlmock.NewResult(0, 1)) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id"). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id"). WithArgs("ws-spend-cap"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -1897,9 +1897,9 @@ func TestHeartbeat_MonthlySpend_Zero_NoUpdate(t *testing.T) { WithArgs("ws-spend-zero", 0.0, "", 0, 0, ""). WillReturnResult(sqlmock.NewResult(0, 1)) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id"). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id"). WithArgs("ws-spend-zero"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -2143,7 +2143,7 @@ func TestRegister_AllowsAlreadyPlatformReRegister(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) c.Request = httptest.NewRequest("POST", "/registry/register", - bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","agent_card":{"name":"concierge"}}`)) + bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","mcp_server_present":true,"agent_card":{"name":"concierge"}}`)) c.Request.Header.Set("Content-Type", "application/json") handler.Register(c) @@ -2270,7 +2270,7 @@ func TestRegister_PlatformAgentMissingModelSecret_FailsClosed(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) c.Request = httptest.NewRequest("POST", "/registry/register", - bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","agent_card":{"name":"concierge"}}`)) + bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","mcp_server_present":true,"agent_card":{"name":"concierge"}}`)) c.Request.Header.Set("Content-Type", "application/json") handler.Register(c) @@ -2559,9 +2559,9 @@ func TestHeartbeatHandler_DeliversPlatformInboundSecret(t *testing.T) { WithArgs("ws-with-secret", 0.0, "", 0, 100, ""). WillReturnResult(sqlmock.NewResult(0, 1)) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-with-secret"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) // readOrLazyHealInboundSecret — short-circuit: secret already on file. mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`). @@ -2614,9 +2614,9 @@ func TestHeartbeatHandler_LazyHealsPlatformInboundSecret(t *testing.T) { WithArgs("ws-needs-heal", 0.0, "", 0, 100, ""). WillReturnResult(sqlmock.NewResult(0, 1)) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-needs-heal"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) // readOrLazyHealInboundSecret — NULL column triggers mint. mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`). @@ -2670,9 +2670,9 @@ func TestHeartbeatHandler_OmitsSecretOnHealFailure(t *testing.T) { WithArgs("ws-heal-fails", 0.0, "", 0, 100, ""). WillReturnResult(sqlmock.NewResult(0, 1)) - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-heal-fails"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}).AddRow("online", nil)) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}).AddRow("online", "", nil)) // Read returns NULL → mint is attempted... mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`). @@ -3037,10 +3037,10 @@ func TestHeartbeat_RecentRegisterFailure_DegradesWorkspace(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) // evaluateStatus SELECT — online with recent register failure - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-degrade-reg"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}). - AddRow("online", time.Now().Add(-2*time.Minute))) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}). + AddRow("online", "", time.Now().Add(-2*time.Minute))) // Degrade UPDATE mock.ExpectExec("UPDATE workspaces SET status ="). @@ -3087,10 +3087,10 @@ func TestHeartbeat_RecentRegisterFailure_BlocksRecovery(t *testing.T) { WillReturnResult(sqlmock.NewResult(0, 1)) // evaluateStatus SELECT — degraded with recent register failure - mock.ExpectQuery("SELECT status, last_register_failure_at FROM workspaces WHERE id ="). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). WithArgs("ws-no-recover"). - WillReturnRows(sqlmock.NewRows([]string{"status", "last_register_failure_at"}). - AddRow("degraded", time.Now().Add(-2*time.Minute))) + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}). + AddRow("degraded", "", time.Now().Add(-2*time.Minute))) // NO recovery UPDATE expected — register failure blocks recovery. @@ -3421,3 +3421,112 @@ func TestRegister_400_LogsExistingRowState(t *testing.T) { // + resolveDeliveryMode (all run BEFORE the isSafeURL check), which // is brittle. The test coverage gap is documented; the test // would be a redundant copy of the UpdateCard coverage. + +// TestRegister_PlatformAgentMissingMCPServer_FailsClosed guards the second half +// of issue #2970: a platform agent whose runtime reports mcp_server_present=false +// (or omits the field) must NOT be marked online, even when the MODEL secret is +// present. Fail-closed on the MCP server prevents a generic-image concierge from +// booting as a routable platform agent. +func TestRegister_PlatformAgentMissingMCPServer_FailsClosed(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewRegistryHandler(broadcaster) + + const wsID = "ws-platform-no-mcp" + + // Bootstrap path — no live tokens. + mock.ExpectQuery("SELECT COUNT\\(\\*\\) FROM workspace_auth_tokens"). + WithArgs(wsID). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + + // kind precheck: existing row is kind="platform". + mock.ExpectQuery("SELECT kind FROM workspaces WHERE id"). + WithArgs(wsID). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + + // MODEL secret exists, but the runtime declares the MCP server absent. + mock.ExpectQuery("SELECT EXISTS\\(SELECT 1 FROM workspace_secrets WHERE workspace_id = \\$1 AND key = 'MODEL'\\)"). + WithArgs(wsID). + WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true)) + + // Gate failure broadcasts WORKSPACE_PROVISION_FAILED and marks the row failed. + mock.ExpectExec("INSERT INTO structure_events"). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("UPDATE workspaces SET status = \\$3, last_sample_error = \\$2, updated_at = now\\(\\) WHERE id = \\$1"). + WithArgs(wsID, sqlmock.AnyArg(), models.StatusFailed). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("POST", "/registry/register", + bytes.NewBufferString(`{"id":"`+wsID+`","url":"http://localhost:9100","delivery_mode":"push","kind":"platform","mcp_server_present":false,"agent_card":{"name":"concierge"}}`)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Register(c) + + if w.Code != http.StatusBadRequest { + t.Fatalf("platform agent missing MCP server: expected 400, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet expectations: %v", err) + } +} + +// TestHeartbeat_PlatformAgentMissingMCPServer_FailsClosed guards the heartbeat +// recovery side of issue #2970: a kind='platform' workspace whose runtime reports +// mcp_server_present=false must NOT recover to 'online' on heartbeat. Instead it +// is marked 'failed' so the canvas surfaces a provision failure. +func TestHeartbeat_PlatformAgentMissingMCPServer_FailsClosed(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewRegistryHandler(broadcaster) + + const wsID = "ws-platform-heartbeat-no-mcp" + + // prevTask SELECT — use loose regex to match 3-col query on main + mock.ExpectQuery("SELECT COALESCE\\(current_task"). + WithArgs(wsID). + WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + + // Heartbeat UPDATE (MonthlySpend=0 branch) + mock.ExpectExec("UPDATE workspaces SET"). + WithArgs(wsID, 0.0, "", 0, 60, ""). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // evaluateStatus SELECT — platform agent currently in provisioning + // (the recovery path most likely to resurrect a broken concierge). + mock.ExpectQuery("SELECT status, kind, last_register_failure_at FROM workspaces WHERE id ="). + WithArgs(wsID). + WillReturnRows(sqlmock.NewRows([]string{"status", "kind", "last_register_failure_at"}). + AddRow("provisioning", "platform", nil)) + + // MODEL secret exists, but MCP server is absent. + mock.ExpectQuery("SELECT EXISTS\\(SELECT 1 FROM workspace_secrets WHERE workspace_id = \\$1 AND key = 'MODEL'\\)"). + WithArgs(wsID). + WillReturnRows(sqlmock.NewRows([]string{"exists"}).AddRow(true)) + + // Gate failure: broadcast WORKSPACE_PROVISION_FAILED + mark failed. + mock.ExpectExec("INSERT INTO structure_events"). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("UPDATE workspaces SET status = \\$3, last_sample_error = \\$2, updated_at = now\\(\\) WHERE id = \\$1"). + WithArgs(wsID, sqlmock.AnyArg(), models.StatusFailed). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + + body := `{"workspace_id":"` + wsID + `","error_rate":0.0,"sample_error":"","active_tasks":0,"uptime_seconds":60,"mcp_server_present":false}` + c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Heartbeat(c) + + if w.Code != http.StatusOK { + t.Fatalf("heartbeat mcp gate: expected 200, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet expectations: %v", err) + } +} diff --git a/workspace-server/internal/models/registry_payload_contract_test.go b/workspace-server/internal/models/registry_payload_contract_test.go index 143ad5c42..bf3f83190 100644 --- a/workspace-server/internal/models/registry_payload_contract_test.go +++ b/workspace-server/internal/models/registry_payload_contract_test.go @@ -53,7 +53,8 @@ const runtimeRegisterBody = `{ "skills": [{"id": "coding", "name": "coding", "description": "coding", "tags": []}], "capabilities": {"streaming": true, "pushNotifications": false}, "configuration_status": "ready" - } + }, + "mcp_server_present": false }` func TestRegisterPayload_RuntimeBodyBinds(t *testing.T) { @@ -70,6 +71,9 @@ func TestRegisterPayload_RuntimeBodyBinds(t *testing.T) { if p.URL == "" { t.Error("url should round-trip from the runtime body") } + if p.MCPServerPresent == nil { + t.Error("mcp_server_present must decode (nil would be fail-closed)") + } } func TestRegisterPayload_MissingID_Rejected(t *testing.T) { @@ -98,7 +102,8 @@ const runtimeHeartbeatBody = `{ "sample_error": "", "active_tasks": 0, "current_task": "", - "uptime_seconds": 42 + "uptime_seconds": 42, + "mcp_server_present": false }` func TestHeartbeatPayload_RuntimeBodyBinds(t *testing.T) { @@ -112,6 +117,9 @@ func TestHeartbeatPayload_RuntimeBodyBinds(t *testing.T) { if p.UptimeSeconds != 42 { t.Errorf("uptime_seconds not decoded: %d", p.UptimeSeconds) } + if p.MCPServerPresent == nil { + t.Error("mcp_server_present must decode (nil would be fail-closed)") + } } // The wedged-runtime heartbeat (heartbeat.py _runtime_state_payload + diff --git a/workspace-server/internal/models/workspace.go b/workspace-server/internal/models/workspace.go index 43c079fc9..a47c2b27c 100644 --- a/workspace-server/internal/models/workspace.go +++ b/workspace-server/internal/models/workspace.go @@ -102,6 +102,11 @@ type RegisterPayload struct { // the row to be its own org root (parent_id IS NULL) and to be the only // platform agent in the org — enforced by the Register handler. Kind string `json:"kind,omitempty"` + // MCPServerPresent is the runtime's declaration that the platform-agent + // image's baked /opt/molecule-mcp-server binary is present. For platform + // agents the controlplane treats nil/false as fail-closed (RCA #2970). + // Non-platform workspaces may omit this field. + MCPServerPresent *bool `json:"mcp_server_present,omitempty"` } type HeartbeatPayload struct { @@ -149,6 +154,10 @@ type HeartbeatPayload struct { // The heartbeat handler backfills NULL agent_card rows so the workspace // can come online without requiring a full re-register. (#2421) AgentCard json.RawMessage `json:"agent_card,omitempty"` + // MCPServerPresent mirrors the register payload field on every heartbeat + // so the fail-closed platform-agent gate can block recovery paths that + // would otherwise resurrect an mcp-less concierge (RCA #2970). + MCPServerPresent *bool `json:"mcp_server_present,omitempty"` } // RuntimeMetadata is the adapter-declared capability + override block