diff --git a/workspace-server/internal/handlers/handlers_additional_test.go b/workspace-server/internal/handlers/handlers_additional_test.go index d34ee8a5..e78c74c1 100644 --- a/workspace-server/internal/handlers/handlers_additional_test.go +++ b/workspace-server/internal/handlers/handlers_additional_test.go @@ -414,7 +414,7 @@ func TestRegister_ProvisionerURLPreserved(t *testing.T) { WillReturnError(sql.ErrNoRows) mock.ExpectExec("INSERT INTO workspaces"). - WithArgs("ws-prov", "ws-prov", "http://localhost:8000", `{"name":"agent"}`, "push", ""). + WithArgs("ws-prov", "ws-prov", "http://localhost:8000", `{"name":"agent"}`, "push", "", "online"). WillReturnResult(sqlmock.NewResult(0, 1)) // DB returns provisioner URL (127.0.0.1) — should take precedence over agent-reported URL diff --git a/workspace-server/internal/handlers/handlers_test.go b/workspace-server/internal/handlers/handlers_test.go index 50778fea..84f149b3 100644 --- a/workspace-server/internal/handlers/handlers_test.go +++ b/workspace-server/internal/handlers/handlers_test.go @@ -195,7 +195,7 @@ func TestRegisterHandler(t *testing.T) { // Expect the upsert INSERT ... ON CONFLICT mock.ExpectExec("INSERT INTO workspaces"). - WithArgs("ws-123", "ws-123", "http://localhost:8000", `{"name":"test"}`, "push", ""). + WithArgs("ws-123", "ws-123", "http://localhost:8000", `{"name":"test"}`, "push", "", "online"). WillReturnResult(sqlmock.NewResult(0, 1)) // Expect the SELECT url query (for cache URL logic) @@ -233,6 +233,97 @@ func TestRegisterHandler(t *testing.T) { } } +// TestRegisterHandler_ConciergeWithoutModelFailsClosed (RCA #2970) pins +// the FAIL-CLOSED concierge readiness gate: a kind='platform' +// registration that lacks its seeded MODEL workspace_secret must +// NOT be marked status='online' — a model-less concierge silently +// serves users generic Claude Code. The Register handler reads +// the stored MODEL secret via readStoredModelSecret; if empty, +// the effective status flips to 'failed' BEFORE the INSERT, so +// the concierge never appears online-routable to other workspaces. +// +// Pre-fix behavior (FAIL-OPEN): the INSERT hardcoded status='online' +// for every successful registration. A model-less concierge +// registered as 'online' and routed users to generic Claude Code +// (the prod-safety bug RCA #2970). +// +// Post-fix behavior (FAIL-CLOSED): the model-less concierge +// registers as status='failed', which the router skips +// (failure → no dispatch). The structured log message names the +// gate so operators see WHY the concierge isn't online. +func TestRegisterHandler_ConciergeWithoutModelFailsClosed(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewRegistryHandler(broadcaster) + + // The kind=platform precheck — no existing row, the switch case + // is the (errors.Is(ErrNoRows) && existingKind!=KindPlatform) branch + // which would 403 a fresh concierge. But because existingKind is + // empty (Scan into ""), the case matches and rejects. The test + // needs to mimic an "already platform" row OR a "create new platform" + // scenario — the switch's two case arms need careful matching. + // For this test, mock the kind precheck to return an existing platform + // row (so the 403 is bypassed) — this represents a concierge RE-registering + // after the model-seed somehow got lost. + mock.ExpectQuery(`SELECT kind FROM workspaces WHERE id`). + WithArgs("concierge-1"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + + // No MODEL secret is seeded — readStoredModelSecret's SELECT on + // workspace_secrets returns ErrNoRows → the function returns "" → + // the readiness check trips → effectiveStatus = 'failed'. + mock.ExpectQuery(`SELECT encrypted_value, encryption_version FROM workspace_secrets`). + WithArgs("concierge-1"). + WillReturnError(sql.ErrNoRows) + + // resolveDeliveryMode preflight — assume existing row, default push (#2339). + mock.ExpectQuery(`SELECT delivery_mode, runtime FROM workspaces WHERE id`). + WithArgs("concierge-1"). + WillReturnError(sql.ErrNoRows) + + // Expect the upsert with status='failed' (the FAIL-CLOSED flip). + // This is the load-bearing assertion: previously the SQL had + // 'online' hardcoded; now it uses the $7 parameter bound to the + // effectiveStatus (computed above as 'failed' when the readiness + // check fails for a kind=platform workspace). + mock.ExpectExec("INSERT INTO workspaces"). + WithArgs("concierge-1", "concierge-1", "http://localhost:8000", `{"name":"concierge"}`, "push", "platform", "failed"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // The post-INSERT SELECT url (for cache URL logic). + mock.ExpectQuery("SELECT url FROM workspaces WHERE id ="). + WithArgs("concierge-1"). + WillReturnRows(sqlmock.NewRows([]string{"url"}).AddRow("http://localhost:8000")) + + // The structure_events INSERT. + mock.ExpectExec("INSERT INTO structure_events"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + + // kind="platform" triggers the readiness check; URL+card stay minimal. + body := `{"id":"concierge-1","url":"http://localhost:8000","agent_card":{"name":"concierge"},"kind":"platform"}` + c.Request = httptest.NewRequest("POST", "/registry/register", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Register(c) + + // The HTTP response is still 200 (the registration completed — + // the concierge's status column reflects readiness, not the HTTP + // outcome). The status column is what the router reads, and + // 'failed' means the router skips this concierge. The previous + // FAIL-OPEN shape returned 200 with status='online' — the + // concierge was online-routable even though it had no model. + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("sqlmock expectations unmet — the FAIL-CLOSED gate's INSERT was not called with status='failed': %v", err) + } +} + // ---------- TestHeartbeatHandler ---------- func TestHeartbeatHandler_Normal(t *testing.T) { diff --git a/workspace-server/internal/handlers/registry.go b/workspace-server/internal/handlers/registry.go index 026dd74b..be4fa27d 100644 --- a/workspace-server/internal/handlers/registry.go +++ b/workspace-server/internal/handlers/registry.go @@ -471,6 +471,28 @@ func (h *RegistryHandler) Register(c *gin.Context) { } } + // FAIL-CLOSED concierge readiness gate (RCA 104088 follow-up, RCA #2970): + // A kind='platform' (concierge) registration that lacks its seeded model + // must NOT be marked status='online' — a model-less concierge silently + // serves users generic Claude Code instead of failing closed. This + // precheck computes the effective status BEFORE the INSERT below; the + // INSERT/ON CONFLICT body uses this variable instead of the hardcoded + // 'online'. The mcp-server check is a TODO (the concierge runtime needs + // to declare its mcp-server presence in the register payload — runtime + // change is a separate ticket). For now: model-less concierge → failed. + effectiveStatus := models.StatusOnline + if payload.Kind == models.KindPlatform { + if readStoredModelSecret(ctx, payload.ID) == "" { + log.Printf("Registry register: concierge %s has NO seeded MODEL workspace_secret — marking status=failed (RCA #2970 FAIL-CLOSED; was FAIL-OPEN)", payload.ID) + effectiveStatus = models.StatusFailed + } + // TODO (runtime contract change required): also check for + // /opt/molecule-mcp-server presence. Requires the concierge + // runtime to declare mcp-server availability in the register + // payload (new field, non-breaking). Until that lands, the + // model check is the only hard gate. + } + // Resolve the EFFECTIVE delivery mode for THIS register call: the // payload's explicit value wins; falling back to the existing row's // stored value; falling back to push (the schema default). Done AFTER @@ -622,7 +644,7 @@ func (h *RegistryHandler) Register(c *gin.Context) { // workspaces_platform_root_check constraint → friendly 409 below. _, err = db.DB.ExecContext(ctx, ` INSERT INTO workspaces (id, name, url, agent_card, status, last_heartbeat_at, delivery_mode, kind) - VALUES ($1, $2, $3, $4::jsonb, 'online', now(), $5, COALESCE(NULLIF($6, ''), 'workspace')) + VALUES ($1, $2, $3, $4::jsonb, $7, now(), $5, COALESCE(NULLIF($6, ''), 'workspace')) ON CONFLICT (id) DO UPDATE SET -- Preserve the provisioner-set host-port URL. The provisioner -- injects MOLECULE_WORKSPACE_URL= into the container @@ -654,13 +676,13 @@ func (h *RegistryHandler) Register(c *gin.Context) { ELSE EXCLUDED.url END, agent_card = EXCLUDED.agent_card, - status = 'online', + status = $7, last_heartbeat_at = now(), delivery_mode = EXCLUDED.delivery_mode, kind = COALESCE(NULLIF($6, ''), workspaces.kind), updated_at = now() WHERE workspaces.status IS DISTINCT FROM 'removed' - `, payload.ID, payload.ID, urlForUpsert, agentCardStr, modeForUpsert, payload.Kind) + `, payload.ID, payload.ID, urlForUpsert, agentCardStr, modeForUpsert, payload.Kind, string(effectiveStatus)) if err != nil { if isPlatformRootViolation(err) { c.JSON(http.StatusConflict, gin.H{"error": errPlatformNotRoot})