fix(handlers#2970): FAIL-CLOSED concierge readiness gate (live prod-safety — model-less concierge no longer marks itself online) #2972

Closed
agent-dev-b wants to merge 1 commits from fix/2970-concierge-readiness-fail-closed into main
3 changed files with 118 additions and 5 deletions
@@ -414,7 +414,7 @@ func TestRegister_ProvisionerURLPreserved(t *testing.T) {
WillReturnError(sql.ErrNoRows)
mock.ExpectExec("INSERT INTO workspaces").
WithArgs("ws-prov", "ws-prov", "http://localhost:8000", `{"name":"agent"}`, "push", "").
WithArgs("ws-prov", "ws-prov", "http://localhost:8000", `{"name":"agent"}`, "push", "", "online").
WillReturnResult(sqlmock.NewResult(0, 1))
// DB returns provisioner URL (127.0.0.1) — should take precedence over agent-reported URL
@@ -195,7 +195,7 @@ func TestRegisterHandler(t *testing.T) {
// Expect the upsert INSERT ... ON CONFLICT
mock.ExpectExec("INSERT INTO workspaces").
WithArgs("ws-123", "ws-123", "http://localhost:8000", `{"name":"test"}`, "push", "").
WithArgs("ws-123", "ws-123", "http://localhost:8000", `{"name":"test"}`, "push", "", "online").
WillReturnResult(sqlmock.NewResult(0, 1))
// Expect the SELECT url query (for cache URL logic)
@@ -233,6 +233,97 @@ func TestRegisterHandler(t *testing.T) {
}
}
// TestRegisterHandler_ConciergeWithoutModelFailsClosed (RCA #2970) pins
// the FAIL-CLOSED concierge readiness gate: a kind='platform'
// registration that lacks its seeded MODEL workspace_secret must
// NOT be marked status='online' — a model-less concierge silently
// serves users generic Claude Code. The Register handler reads
// the stored MODEL secret via readStoredModelSecret; if empty,
// the effective status flips to 'failed' BEFORE the INSERT, so
// the concierge never appears online-routable to other workspaces.
//
// Pre-fix behavior (FAIL-OPEN): the INSERT hardcoded status='online'
// for every successful registration. A model-less concierge
// registered as 'online' and routed users to generic Claude Code
// (the prod-safety bug RCA #2970).
//
// Post-fix behavior (FAIL-CLOSED): the model-less concierge
// registers as status='failed', which the router skips
// (failure → no dispatch). The structured log message names the
// gate so operators see WHY the concierge isn't online.
func TestRegisterHandler_ConciergeWithoutModelFailsClosed(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewRegistryHandler(broadcaster)
// The kind=platform precheck — no existing row, the switch case
// is the (errors.Is(ErrNoRows) && existingKind!=KindPlatform) branch
// which would 403 a fresh concierge. But because existingKind is
// empty (Scan into ""), the case matches and rejects. The test
// needs to mimic an "already platform" row OR a "create new platform"
// scenario — the switch's two case arms need careful matching.
// For this test, mock the kind precheck to return an existing platform
// row (so the 403 is bypassed) — this represents a concierge RE-registering
// after the model-seed somehow got lost.
mock.ExpectQuery(`SELECT kind FROM workspaces WHERE id`).
WithArgs("concierge-1").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
// No MODEL secret is seeded — readStoredModelSecret's SELECT on
// workspace_secrets returns ErrNoRows → the function returns "" →
// the readiness check trips → effectiveStatus = 'failed'.
mock.ExpectQuery(`SELECT encrypted_value, encryption_version FROM workspace_secrets`).
WithArgs("concierge-1").
WillReturnError(sql.ErrNoRows)
// resolveDeliveryMode preflight — assume existing row, default push (#2339).
mock.ExpectQuery(`SELECT delivery_mode, runtime FROM workspaces WHERE id`).
WithArgs("concierge-1").
WillReturnError(sql.ErrNoRows)
// Expect the upsert with status='failed' (the FAIL-CLOSED flip).
// This is the load-bearing assertion: previously the SQL had
// 'online' hardcoded; now it uses the $7 parameter bound to the
// effectiveStatus (computed above as 'failed' when the readiness
// check fails for a kind=platform workspace).
mock.ExpectExec("INSERT INTO workspaces").
WithArgs("concierge-1", "concierge-1", "http://localhost:8000", `{"name":"concierge"}`, "push", "platform", "failed").
WillReturnResult(sqlmock.NewResult(0, 1))
// The post-INSERT SELECT url (for cache URL logic).
mock.ExpectQuery("SELECT url FROM workspaces WHERE id =").
WithArgs("concierge-1").
WillReturnRows(sqlmock.NewRows([]string{"url"}).AddRow("http://localhost:8000"))
// The structure_events INSERT.
mock.ExpectExec("INSERT INTO structure_events").
WillReturnResult(sqlmock.NewResult(0, 1))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
// kind="platform" triggers the readiness check; URL+card stay minimal.
body := `{"id":"concierge-1","url":"http://localhost:8000","agent_card":{"name":"concierge"},"kind":"platform"}`
c.Request = httptest.NewRequest("POST", "/registry/register", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.Register(c)
// The HTTP response is still 200 (the registration completed —
// the concierge's status column reflects readiness, not the HTTP
// outcome). The status column is what the router reads, and
// 'failed' means the router skips this concierge. The previous
// FAIL-OPEN shape returned 200 with status='online' — the
// concierge was online-routable even though it had no model.
if w.Code != http.StatusOK {
t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("sqlmock expectations unmet — the FAIL-CLOSED gate's INSERT was not called with status='failed': %v", err)
}
}
// ---------- TestHeartbeatHandler ----------
func TestHeartbeatHandler_Normal(t *testing.T) {
+25 -3
View File
@@ -471,6 +471,28 @@ func (h *RegistryHandler) Register(c *gin.Context) {
}
}
// FAIL-CLOSED concierge readiness gate (RCA 104088 follow-up, RCA #2970):
// A kind='platform' (concierge) registration that lacks its seeded model
// must NOT be marked status='online' — a model-less concierge silently
// serves users generic Claude Code instead of failing closed. This
// precheck computes the effective status BEFORE the INSERT below; the
// INSERT/ON CONFLICT body uses this variable instead of the hardcoded
// 'online'. The mcp-server check is a TODO (the concierge runtime needs
// to declare its mcp-server presence in the register payload — runtime
// change is a separate ticket). For now: model-less concierge → failed.
effectiveStatus := models.StatusOnline
if payload.Kind == models.KindPlatform {
if readStoredModelSecret(ctx, payload.ID) == "" {
log.Printf("Registry register: concierge %s has NO seeded MODEL workspace_secret — marking status=failed (RCA #2970 FAIL-CLOSED; was FAIL-OPEN)", payload.ID)
effectiveStatus = models.StatusFailed
}
// TODO (runtime contract change required): also check for
// /opt/molecule-mcp-server presence. Requires the concierge
// runtime to declare mcp-server availability in the register
// payload (new field, non-breaking). Until that lands, the
// model check is the only hard gate.
}
// Resolve the EFFECTIVE delivery mode for THIS register call: the
// payload's explicit value wins; falling back to the existing row's
// stored value; falling back to push (the schema default). Done AFTER
@@ -622,7 +644,7 @@ func (h *RegistryHandler) Register(c *gin.Context) {
// workspaces_platform_root_check constraint → friendly 409 below.
_, err = db.DB.ExecContext(ctx, `
INSERT INTO workspaces (id, name, url, agent_card, status, last_heartbeat_at, delivery_mode, kind)
VALUES ($1, $2, $3, $4::jsonb, 'online', now(), $5, COALESCE(NULLIF($6, ''), 'workspace'))
VALUES ($1, $2, $3, $4::jsonb, $7, now(), $5, COALESCE(NULLIF($6, ''), 'workspace'))
ON CONFLICT (id) DO UPDATE SET
-- Preserve the provisioner-set host-port URL. The provisioner
-- injects MOLECULE_WORKSPACE_URL=<host-port> into the container
@@ -654,13 +676,13 @@ func (h *RegistryHandler) Register(c *gin.Context) {
ELSE EXCLUDED.url
END,
agent_card = EXCLUDED.agent_card,
status = 'online',
status = $7,
last_heartbeat_at = now(),
delivery_mode = EXCLUDED.delivery_mode,
kind = COALESCE(NULLIF($6, ''), workspaces.kind),
updated_at = now()
WHERE workspaces.status IS DISTINCT FROM 'removed'
`, payload.ID, payload.ID, urlForUpsert, agentCardStr, modeForUpsert, payload.Kind)
`, payload.ID, payload.ID, urlForUpsert, agentCardStr, modeForUpsert, payload.Kind, string(effectiveStatus))
if err != nil {
if isPlatformRootViolation(err) {
c.JSON(http.StatusConflict, gin.H{"error": errPlatformNotRoot})