From 763791752e1274577a8c10a8e008132466a813ca Mon Sep 17 00:00:00 2001 From: devops-engineer Date: Thu, 18 Jun 2026 02:37:02 -0700 Subject: [PATCH] fix(concierge): #2989 gate must treat nil mcp_server_present as ALLOW (rollout-safety) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live RCA 2026-06-18: #2989's fail-closed gate (`present != nil && *present`) treats a runtime that doesn't report mcp_server_present (nil) as FALSE → fail-closed. But the runtime field is added by #147, which merged at 01:52 TODAY — AFTER the pinned concierge image's runtime (0.3.32, cut 19:53 the prior day; platform_agent_identity.py is 404 at that tag). So the current concierge image bakes /opt/molecule-mcp-server but its runtime can't SPEAK the contract. Result: the moment #2989 deploys ahead of a #147-bearing concierge image, EVERY concierge is marked failed despite a present MCP binary — observed on test3 (it was online, then failed the instant its tenant box rolled to the #2989 build), and it would take the whole fleet offline on a full roll. Fix: nil (field absent ⇒ old runtime ⇒ unknown) → ALLOW, not block. Only an explicit &false (a #147-aware runtime affirmatively reporting the MCP absent) fail-closes. &true → allow. This makes the contract-pair (#2989 gate + #147 runtime) deploy-order-safe; gate ENFORCEMENT activates naturally once the concierge image ships a #147 runtime. Test: TestPlatformAgentMCPServerPresent_NilTolerance; existing true→online / false→failed tests unchanged. build+vet green. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../handlers/concierge_mcp_present_test.go | 21 ++++++++++++++++++ .../internal/handlers/registry.go | 22 ++++++++++++++----- 2 files changed, 38 insertions(+), 5 deletions(-) create mode 100644 workspace-server/internal/handlers/concierge_mcp_present_test.go diff --git a/workspace-server/internal/handlers/concierge_mcp_present_test.go b/workspace-server/internal/handlers/concierge_mcp_present_test.go new file mode 100644 index 00000000..9ccdbb5f --- /dev/null +++ b/workspace-server/internal/handlers/concierge_mcp_present_test.go @@ -0,0 +1,21 @@ +package handlers + +import "testing" + +// #2989 rollout-safety (2026-06-18): nil (runtime predating #147, field absent) +// must be treated as ALLOW, not fail-closed — else the gate takes every +// pre-#147 concierge offline. Only an explicit false fail-closes. +func TestPlatformAgentMCPServerPresent_NilTolerance(t *testing.T) { + h := &RegistryHandler{} + tt := true + ff := false + if !h.platformAgentMCPServerPresent(nil) { + t.Error("nil (old runtime, field absent) must be ALLOW, got block") + } + if !h.platformAgentMCPServerPresent(&tt) { + t.Error("&true must be allow") + } + if h.platformAgentMCPServerPresent(&ff) { + t.Error("&false (runtime reports MCP absent) must fail-closed") + } +} diff --git a/workspace-server/internal/handlers/registry.go b/workspace-server/internal/handlers/registry.go index 583896c6..b6cce98a 100644 --- a/workspace-server/internal/handlers/registry.go +++ b/workspace-server/internal/handlers/registry.go @@ -393,12 +393,24 @@ func (h *RegistryHandler) platformAgentHasModelSecret(ctx context.Context, works return exists, err } -// platformAgentMCPServerPresent reports whether the runtime declared the -// platform-agent image's /opt/molecule-mcp-server binary present. The payload -// field is a pointer so an absent declaration (nil) is treated as false — -// fail-closed: an old/generic runtime cannot prove it has the concierge MCP. +// platformAgentMCPServerPresent reports whether the concierge's MCP server +// should be considered present for the fail-closed online gate (#2970). +// +// The payload field is a pointer to distinguish three states: +// - nil → the runtime did NOT report the field at all. This is a runtime +// PREDATING the #147 contract (mcp_server_present on register/heartbeat). +// The concierge image bakes /opt/molecule-mcp-server unconditionally, so an +// old runtime that simply can't SPEAK the contract must NOT be fail-closed — +// that would take every concierge offline the moment this gate deploys ahead +// of the runtime release that adds the field (the exact rollout-order hazard +// that fail-closed #2989 + a pre-#147 concierge image hit on 2026-06-18: +// test3 + the fleet would be marked failed despite a present MCP binary). +// Treat nil as ALLOW (unknown ⇒ don't block) for backward-compat. +// - &true → runtime affirmatively reports the binary present → allow. +// - &false → a #147-aware runtime affirmatively reports it ABSENT → fail-closed +// (the real signal this gate exists to catch). func (h *RegistryHandler) platformAgentMCPServerPresent(present *bool) bool { - return present != nil && *present + return present == nil || *present } // markWorkspaceFailed updates a workspace row to status='failed' and broadcasts -- 2.52.0