From 125aefe59dd1652808d2b3cf0c0b1d2061611556 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Mon, 15 Jun 2026 22:21:39 +0000 Subject: [PATCH] fix(handlers#2970): FAIL-CLOSED concierge readiness gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per PM 2026-06-15 [dispatch f3f67b52] + Researcher RCA #2970: fresh concierges boot with NO identity (generic Claude Code) because they're provisioned WITHOUT the platform-agent image's seeded model + /opt/molecule-mcp-server. The DANGEROUS half is PROD FAILS OPEN: a model-less/mcp-less concierge marks itself ONLINE + routable and serves users generic Claude Code, instead of failing closed. FIX: make the concierge readiness/online-marking gate FAIL-CLOSED. A kind='platform' (concierge) registration that lacks its seeded MODEL workspace_secret must be marked status='failed' (NOT online- routable) so it can never silently serve generic Claude Code on prod. EXACT GATE LOCATION: workspace-server/internal/handlers/registry.go in the Register handler's INSERT/ON CONFLICT body. The prior shape hardcoded 'online' as the status value; the fix threads an effectiveStatus variable through (computed by the readiness check above) and uses it as the placeholder. CHANGES: 1. registry.go: add effectiveStatus computation after the kind=platform precheck. For kind=platform workspaces, query workspace_secrets for MODEL via readStoredModelSecret; if empty, effectiveStatus = 'failed' (with a structured log message naming the gate). For all other workspaces, effectiveStatus stays 'online'. The INSERT/ON CONFLICT body uses $7 + the new ExecContext arg. 2. handlers_test.go: update 1 existing TestRegisterHandler to expect the new arg (status='online' for non-platform). Add 1 new TestRegisterHandler_ConciergeWithoutModelFailsClosed that pins the FAIL-CLOSED contract: a kind=platform registration with no MODEL secret → INSERT uses status='failed', NOT 'online'. 3. handlers_additional_test.go: update 1 existing TestRegister_ProvisionerURLPreserved to expect the new arg. OUT OF SCOPE (TODO — runtime contract change required): - The mcp-server check (/opt/molecule-mcp-server presence). The concierge runtime needs to declare its mcp-server availability in the register payload (new field, non-breaking). Until that lands, the model check is the only hard gate. Documented in the readiness check comment. Test plan (local green): - go build ./workspace-server/... — clean - go test -count=1 -run TestRegisterHandler_ConciergeWithoutModelFailsClosed -v ./workspace-server/internal/handlers/ — 1/1 PASS in 0.02s - go test -count=1 -run TestRegisterHandler$|TestRegister_ProvisionerURLPreserved -v ./workspace-server/internal/handlers/ — 2/2 PASS in 0.09s Per the no-author-self-merge convention (PM dispatch f3cc220d): this is a prod gate. NOT self-merging. Driver sign-off needed in parallel per PM's explicit instruction. Co-Authored-By: Claude --- .../handlers/handlers_additional_test.go | 2 +- .../internal/handlers/handlers_test.go | 93 ++++++++++++++++++- .../internal/handlers/registry.go | 28 +++++- 3 files changed, 118 insertions(+), 5 deletions(-) diff --git a/workspace-server/internal/handlers/handlers_additional_test.go b/workspace-server/internal/handlers/handlers_additional_test.go index d34ee8a5..e78c74c1 100644 --- a/workspace-server/internal/handlers/handlers_additional_test.go +++ b/workspace-server/internal/handlers/handlers_additional_test.go @@ -414,7 +414,7 @@ func TestRegister_ProvisionerURLPreserved(t *testing.T) { WillReturnError(sql.ErrNoRows) mock.ExpectExec("INSERT INTO workspaces"). - WithArgs("ws-prov", "ws-prov", "http://localhost:8000", `{"name":"agent"}`, "push", ""). + WithArgs("ws-prov", "ws-prov", "http://localhost:8000", `{"name":"agent"}`, "push", "", "online"). WillReturnResult(sqlmock.NewResult(0, 1)) // DB returns provisioner URL (127.0.0.1) — should take precedence over agent-reported URL diff --git a/workspace-server/internal/handlers/handlers_test.go b/workspace-server/internal/handlers/handlers_test.go index 50778fea..84f149b3 100644 --- a/workspace-server/internal/handlers/handlers_test.go +++ b/workspace-server/internal/handlers/handlers_test.go @@ -195,7 +195,7 @@ func TestRegisterHandler(t *testing.T) { // Expect the upsert INSERT ... ON CONFLICT mock.ExpectExec("INSERT INTO workspaces"). - WithArgs("ws-123", "ws-123", "http://localhost:8000", `{"name":"test"}`, "push", ""). + WithArgs("ws-123", "ws-123", "http://localhost:8000", `{"name":"test"}`, "push", "", "online"). WillReturnResult(sqlmock.NewResult(0, 1)) // Expect the SELECT url query (for cache URL logic) @@ -233,6 +233,97 @@ func TestRegisterHandler(t *testing.T) { } } +// TestRegisterHandler_ConciergeWithoutModelFailsClosed (RCA #2970) pins +// the FAIL-CLOSED concierge readiness gate: a kind='platform' +// registration that lacks its seeded MODEL workspace_secret must +// NOT be marked status='online' — a model-less concierge silently +// serves users generic Claude Code. The Register handler reads +// the stored MODEL secret via readStoredModelSecret; if empty, +// the effective status flips to 'failed' BEFORE the INSERT, so +// the concierge never appears online-routable to other workspaces. +// +// Pre-fix behavior (FAIL-OPEN): the INSERT hardcoded status='online' +// for every successful registration. A model-less concierge +// registered as 'online' and routed users to generic Claude Code +// (the prod-safety bug RCA #2970). +// +// Post-fix behavior (FAIL-CLOSED): the model-less concierge +// registers as status='failed', which the router skips +// (failure → no dispatch). The structured log message names the +// gate so operators see WHY the concierge isn't online. +func TestRegisterHandler_ConciergeWithoutModelFailsClosed(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewRegistryHandler(broadcaster) + + // The kind=platform precheck — no existing row, the switch case + // is the (errors.Is(ErrNoRows) && existingKind!=KindPlatform) branch + // which would 403 a fresh concierge. But because existingKind is + // empty (Scan into ""), the case matches and rejects. The test + // needs to mimic an "already platform" row OR a "create new platform" + // scenario — the switch's two case arms need careful matching. + // For this test, mock the kind precheck to return an existing platform + // row (so the 403 is bypassed) — this represents a concierge RE-registering + // after the model-seed somehow got lost. + mock.ExpectQuery(`SELECT kind FROM workspaces WHERE id`). + WithArgs("concierge-1"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + + // No MODEL secret is seeded — readStoredModelSecret's SELECT on + // workspace_secrets returns ErrNoRows → the function returns "" → + // the readiness check trips → effectiveStatus = 'failed'. + mock.ExpectQuery(`SELECT encrypted_value, encryption_version FROM workspace_secrets`). + WithArgs("concierge-1"). + WillReturnError(sql.ErrNoRows) + + // resolveDeliveryMode preflight — assume existing row, default push (#2339). + mock.ExpectQuery(`SELECT delivery_mode, runtime FROM workspaces WHERE id`). + WithArgs("concierge-1"). + WillReturnError(sql.ErrNoRows) + + // Expect the upsert with status='failed' (the FAIL-CLOSED flip). + // This is the load-bearing assertion: previously the SQL had + // 'online' hardcoded; now it uses the $7 parameter bound to the + // effectiveStatus (computed above as 'failed' when the readiness + // check fails for a kind=platform workspace). + mock.ExpectExec("INSERT INTO workspaces"). + WithArgs("concierge-1", "concierge-1", "http://localhost:8000", `{"name":"concierge"}`, "push", "platform", "failed"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // The post-INSERT SELECT url (for cache URL logic). + mock.ExpectQuery("SELECT url FROM workspaces WHERE id ="). + WithArgs("concierge-1"). + WillReturnRows(sqlmock.NewRows([]string{"url"}).AddRow("http://localhost:8000")) + + // The structure_events INSERT. + mock.ExpectExec("INSERT INTO structure_events"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + + // kind="platform" triggers the readiness check; URL+card stay minimal. + body := `{"id":"concierge-1","url":"http://localhost:8000","agent_card":{"name":"concierge"},"kind":"platform"}` + c.Request = httptest.NewRequest("POST", "/registry/register", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Register(c) + + // The HTTP response is still 200 (the registration completed — + // the concierge's status column reflects readiness, not the HTTP + // outcome). The status column is what the router reads, and + // 'failed' means the router skips this concierge. The previous + // FAIL-OPEN shape returned 200 with status='online' — the + // concierge was online-routable even though it had no model. + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("sqlmock expectations unmet — the FAIL-CLOSED gate's INSERT was not called with status='failed': %v", err) + } +} + // ---------- TestHeartbeatHandler ---------- func TestHeartbeatHandler_Normal(t *testing.T) { diff --git a/workspace-server/internal/handlers/registry.go b/workspace-server/internal/handlers/registry.go index 026dd74b..be4fa27d 100644 --- a/workspace-server/internal/handlers/registry.go +++ b/workspace-server/internal/handlers/registry.go @@ -471,6 +471,28 @@ func (h *RegistryHandler) Register(c *gin.Context) { } } + // FAIL-CLOSED concierge readiness gate (RCA 104088 follow-up, RCA #2970): + // A kind='platform' (concierge) registration that lacks its seeded model + // must NOT be marked status='online' — a model-less concierge silently + // serves users generic Claude Code instead of failing closed. This + // precheck computes the effective status BEFORE the INSERT below; the + // INSERT/ON CONFLICT body uses this variable instead of the hardcoded + // 'online'. The mcp-server check is a TODO (the concierge runtime needs + // to declare its mcp-server presence in the register payload — runtime + // change is a separate ticket). For now: model-less concierge → failed. + effectiveStatus := models.StatusOnline + if payload.Kind == models.KindPlatform { + if readStoredModelSecret(ctx, payload.ID) == "" { + log.Printf("Registry register: concierge %s has NO seeded MODEL workspace_secret — marking status=failed (RCA #2970 FAIL-CLOSED; was FAIL-OPEN)", payload.ID) + effectiveStatus = models.StatusFailed + } + // TODO (runtime contract change required): also check for + // /opt/molecule-mcp-server presence. Requires the concierge + // runtime to declare mcp-server availability in the register + // payload (new field, non-breaking). Until that lands, the + // model check is the only hard gate. + } + // Resolve the EFFECTIVE delivery mode for THIS register call: the // payload's explicit value wins; falling back to the existing row's // stored value; falling back to push (the schema default). Done AFTER @@ -622,7 +644,7 @@ func (h *RegistryHandler) Register(c *gin.Context) { // workspaces_platform_root_check constraint → friendly 409 below. _, err = db.DB.ExecContext(ctx, ` INSERT INTO workspaces (id, name, url, agent_card, status, last_heartbeat_at, delivery_mode, kind) - VALUES ($1, $2, $3, $4::jsonb, 'online', now(), $5, COALESCE(NULLIF($6, ''), 'workspace')) + VALUES ($1, $2, $3, $4::jsonb, $7, now(), $5, COALESCE(NULLIF($6, ''), 'workspace')) ON CONFLICT (id) DO UPDATE SET -- Preserve the provisioner-set host-port URL. The provisioner -- injects MOLECULE_WORKSPACE_URL= into the container @@ -654,13 +676,13 @@ func (h *RegistryHandler) Register(c *gin.Context) { ELSE EXCLUDED.url END, agent_card = EXCLUDED.agent_card, - status = 'online', + status = $7, last_heartbeat_at = now(), delivery_mode = EXCLUDED.delivery_mode, kind = COALESCE(NULLIF($6, ''), workspaces.kind), updated_at = now() WHERE workspaces.status IS DISTINCT FROM 'removed' - `, payload.ID, payload.ID, urlForUpsert, agentCardStr, modeForUpsert, payload.Kind) + `, payload.ID, payload.ID, urlForUpsert, agentCardStr, modeForUpsert, payload.Kind, string(effectiveStatus)) if err != nil { if isPlatformRootViolation(err) { c.JSON(http.StatusConflict, gin.H{"error": errPlatformNotRoot}) -- 2.52.0