From 1533f90b43089478e4e0c70f6bc6817ff892c896 Mon Sep 17 00:00:00 2001 From: devops-engineer Date: Mon, 15 Jun 2026 13:21:09 -0700 Subject: [PATCH] =?UTF-8?q?fix(concierge):=20restore=20model-seed=20at=20p?= =?UTF-8?q?rovision=20=E2=80=94=20#2919=20regressed=20fresh=20platform-age?= =?UTF-8?q?nt=20provisions=20to=20MISSING=5FMODEL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #2919 (RFC #2843 §10a de-hardcode) removed ensureConciergeModel + the conciergeDeclaredModel const on the theory the platform-agent TEMPLATE would deliver the concierge's model. But that template entry was never added to manifest.json (the file's own comment defers it to "a follow-up PR once template config.yaml exists"), AND — more fundamentally — the universal MISSING_MODEL gate (core#2594) reads the stored MODEL secret at provision time, BEFORE any template config.yaml is fetched. So the model cannot come from the template: a model-less platform agent fails closed ("reached provisioning with no model set"). Every fresh org's concierge provision has been failing since #2919 deployed (incident: test1, 2026-06-15). The model is the ONE concierge identity element that legitimately lives in core — the concierge IS the platform-agent product, so it declares its own model exactly as a template declares one (core#2594), and the gate ordering requires it pre-fetch. The PROMPT/MCP de-hardcode from #2919 stays intact. Changes: - Restore conciergeRuntime + conciergeDeclaredModel consts and ensureConciergeModel/readStoredModelSecret; re-add the seed call as step 0 of applyConciergeProvisionConfig (platform-kind only). SEED-ONLY: respects a customer's later model pick; fail-closed (seeds nothing) on registry drift. - Remove conciergeDeclaredModel from TestNoConciergeLiteralsInCore's banned list — that guard wrongly lumped the model with the prompt/MCP literals, which is the exact design error that caused this regression. Documented why. - Add TestApplyConciergeProvisionConfig_SeedsModel: gates the seed (fresh → declared model in env + persisted), seed-only (existing pick respected), and platform-kind-only. Verified to FAIL if the seed call is removed. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../internal/handlers/platform_agent.go | 116 +++++++++++++++-- .../internal/handlers/platform_agent_test.go | 119 +++++++++++++++++- 2 files changed, 222 insertions(+), 13 deletions(-) diff --git a/workspace-server/internal/handlers/platform_agent.go b/workspace-server/internal/handlers/platform_agent.go index 807616720..9cbf868f9 100644 --- a/workspace-server/internal/handlers/platform_agent.go +++ b/workspace-server/internal/handlers/platform_agent.go @@ -30,6 +30,7 @@ import ( "os" "strings" + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto" "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db" "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models" "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provisioner" @@ -174,14 +175,24 @@ func conciergePlatformMCPEnv(env map[string]string) { // kind='platform' workspace (create, restart, auto-recover). It is a no-op // for ordinary workspaces. // -// Post RFC #2843 §10a: the concierge's identity (system prompt, model, -// runtime, MCP wiring) is delivered via the molecule-ai-workspace-template- +// Post RFC #2843 §10a: the concierge's PROMPT + MCP-wiring identity (system +// prompt, agent-skills) is delivered via the molecule-ai-workspace-template- // platform-agent template and applied like any other runtime template. This -// hook's only remaining responsibilities are (1) inject the platform-MCP env -// (org-admin token + platform URL + org id) and (2) the per-instance -// {{CONCIERGE_NAME}} substitution in the delivered system-prompt.md -// (recommended in the PR — see the PR description for the rationale vs -// the alternative of dropping the dynamic name). +// hook's responsibilities are (0) SEED the concierge's declared model so the +// provision passes the universal MISSING_MODEL gate (core#2594), (1) inject the +// platform-MCP env (org-admin token + platform URL + org id) and (2) the +// per-instance {{CONCIERGE_NAME}} substitution in the system-prompt.md. +// +// IMPORTANT (incident 2026-06-15, regression from #2919): the model is NOT +// delivered via the template. The MISSING_MODEL gate reads the stored MODEL +// workspace_secret at provision time — BEFORE any template config.yaml is +// fetched — so a model-less platform agent fails closed ("reached provisioning +// with no model set"). #2919 removed the model-seed on the theory the +// platform-agent template would supply it, but that template entry was never +// added to the manifest. The concierge is the platform-agent PRODUCT itself, so +// it carries an explicit declared model in code (core#2594) exactly as a +// template declares one — this is the SSOT-correct source, not a stopgap. The +// CI test TestApplyConciergeProvisionConfig_SeedsModel gates against re-removal. // // Returns the (possibly newly-allocated) configFiles map so the caller can // rebind it — configFiles is nil on the auto-restart path, where this is the @@ -203,6 +214,14 @@ func (h *WorkspaceHandler) applyConciergeProvisionConfig( return configFiles } + // 0. Concierge model (core#2594). The platform agent carries an explicit, + // SSOT-declared model so it never relies on a silent default. The + // universal MISSING_MODEL gate (prepareProvisionContext) reads the stored + // MODEL secret at provision time — before any template config.yaml is + // fetched — so this MUST run here, not be deferred to the template. + // Seed-only: it respects a model the customer later picked. + h.ensureConciergeModel(ctx, workspaceID, envVars) + // 1. Platform-MCP env (org-admin token + platform URL + org id). conciergePlatformMCPEnv(envVars) @@ -252,6 +271,89 @@ func substituteConciergeName(prompt []byte, name string) []byte { return []byte(strings.ReplaceAll(string(prompt), conciergeNamePlaceholder, name)) } +// conciergeRuntime is the runtime the platform agent (concierge) always runs as +// (the platform-agent image variant of 'claude-code'). conciergeDeclaredModel is +// validated against the registry for this runtime before being seeded. +const conciergeRuntime = "claude-code" + +// conciergeDeclaredModel is the platform agent's OWN declared model — a +// first-class product decision, NOT a generic platform default. It mirrors a +// template's runtime_config.model SSOT. The SSOT directive +// (feedback_workspace_model_required_no_platform_default...) forbids the platform +// from defaulting a USER workspace's model, but the concierge IS the +// platform-agent product, so it declares its own model exactly as a template +// declares one (core#2594). +const conciergeDeclaredModel = "moonshot/kimi-k2.6" + +// ensureConciergeModel makes the platform agent's model explicit (core#2594). +// It (1) seeds the container model env for the current provision and (2) +// persists the MODEL workspace_secret so the read endpoint / canvas Config tab +// surface the resolved model. The model is the concierge's declared SSOT model, +// validated against the registry for its runtime. If validation fails (registry +// drift — a build bug caught by the model-registry CI test), it sets NOTHING: +// the downstream universal MISSING_MODEL gate then fails the provision CLOSED +// rather than letting the runtime pick an opaque default. +func (h *WorkspaceHandler) ensureConciergeModel(ctx context.Context, workspaceID string, envVars map[string]string) { + // SEED-ONLY (CTO 2026-06-12: customer setting > platform default; the + // concierge's model is changeable like any workspace, "anytime"). If a MODEL + // secret already exists — whether the original seed or a model the customer + // later picked in the canvas — RESPECT it: loadWorkspaceSecrets + + // applyRuntimeModelEnv have already put it in envVars, so do nothing. Only + // SEED the declared default when the concierge has no model at all (first + // boot). Re-asserting on EVERY provision would silently revert the customer's + // pick — exactly the platform-overriding-customer violation the SSOT + // directive forbids. + if existing := readStoredModelSecret(ctx, workspaceID); existing != "" { + return // explicit model already set; never overwrite the customer's choice + } + + // First boot — no model yet. Seed the concierge's declared default, but only + // if it is genuinely routable for the runtime; otherwise leave it unset and + // let the MISSING_MODEL gate fail closed (loud) rather than seed a model the + // platform cannot actually route to (silent late failure). + model := conciergeDeclaredModel + if ok, why := validateRegisteredModelForRuntime(conciergeRuntime, model); !ok { + log.Printf("Provisioner: concierge %s declared model %q is NOT registered for runtime %q (%s) — leaving model unset; provision will fail closed", workspaceID, model, conciergeRuntime, why) + return + } + if ok, why := validateDerivedProviderInRegistry(conciergeRuntime, model); !ok { + log.Printf("Provisioner: concierge %s declared model %q has no derivable registry provider for runtime %q (%s) — leaving model unset; provision will fail closed", workspaceID, model, conciergeRuntime, why) + return + } + + // Seed the container env (precedence MOLECULE_MODEL > MODEL in the runtime). + // applyRuntimeModelEnv already ran with an empty payload model for the + // concierge (no stored MODEL on first boot), so set both canonical names + // here so this provision actually runs the seeded default. + envVars["MOLECULE_MODEL"] = model + envVars["MODEL"] = model + + // Persist so GET /workspaces/:id/model returns it (Config tab visibility) and + // the next provision's readStoredModelSecret takes the respect-existing path. + if setErr := setModelSecret(ctx, workspaceID, model); setErr != nil { + log.Printf("Provisioner: concierge %s persist MODEL secret failed: %v (env still seeded for this provision)", workspaceID, setErr) + } +} + +// readStoredModelSecret returns the decrypted MODEL workspace_secret, or "" when +// none is stored (or on any read/decrypt error — treated as "unset" so a +// transient miss re-seeds rather than wedges). Used by ensureConciergeModel to +// decide seed-vs-respect. +func readStoredModelSecret(ctx context.Context, workspaceID string) string { + var stored []byte + var version int + if err := db.DB.QueryRowContext(ctx, + `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = $1 AND key = 'MODEL'`, + workspaceID).Scan(&stored, &version); err != nil { + return "" + } + dec, err := crypto.DecryptVersioned(stored, version) + if err != nil { + return "" + } + return string(dec) +} + // EnsureSelfHostedPlatformAgent installs the org's platform agent (the concierge, // the org root) on a tenant that has no control plane to do it — i.e. self-hosted // or local. In SaaS the CP calls InstallPlatformAgent at org-provision time; this diff --git a/workspace-server/internal/handlers/platform_agent_test.go b/workspace-server/internal/handlers/platform_agent_test.go index 7d0edbb47..7553cd422 100644 --- a/workspace-server/internal/handlers/platform_agent_test.go +++ b/workspace-server/internal/handlers/platform_agent_test.go @@ -462,6 +462,12 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) { t.Setenv("PLATFORM_URL", "http://platform:8080") h := &WorkspaceHandler{} const kindQuery = `SELECT COALESCE\(kind, 'workspace'\) FROM workspaces WHERE id =` + // ensureConciergeModel (step 0, platform kind only) reads the stored MODEL + // secret to decide seed-vs-respect. These subtests are about MCP/name, so + // they stub an EXISTING model → ensureConciergeModel returns early (no + // INSERT). The seed path itself is covered by + // TestApplyConciergeProvisionConfig_SeedsModel. + const modelSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'MODEL'` t.Run("ordinary workspace gets NO org MCP, NO admin token, NO substitution", func(t *testing.T) { mock := setupTestDB(t) @@ -500,6 +506,9 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) { mock := setupTestDB(t) mock.ExpectQuery(kindQuery).WithArgs("ws-concierge"). WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + mock.ExpectQuery(modelSelQuery).WithArgs("ws-concierge"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}). + AddRow([]byte("moonshot/kimi-k2.6"), 0)) env := map[string]string{} cf := map[string][]byte{ "config.yaml": []byte("runtime: claude-code\nmodel: moonshot/kimi-k2.6\n"), @@ -536,6 +545,9 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) { mock := setupTestDB(t) mock.ExpectQuery(kindQuery).WithArgs("ws-concierge"). WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + mock.ExpectQuery(modelSelQuery).WithArgs("ws-concierge"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}). + AddRow([]byte("moonshot/kimi-k2.6"), 0)) env := map[string]string{} // Already-substituted prompt (a re-provision of a running concierge). cf := map[string][]byte{ @@ -556,12 +568,108 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) { }) } +// TestApplyConciergeProvisionConfig_SeedsModel is the CI regression gate for +// the 2026-06-15 incident: #2919 removed the concierge model-seed, so every +// fresh platform-agent provision reached the universal MISSING_MODEL gate +// (core#2594) with no stored model and failed closed ("reached provisioning +// with no model set"). It passed CI because no test (and no e2e) provisions a +// fresh platform agent through the model path. This test asserts the seed +// fires for a model-less platform agent, is SEED-ONLY (respects a customer's +// later pick), and never touches ordinary workspaces. +func TestApplyConciergeProvisionConfig_SeedsModel(t *testing.T) { + h := &WorkspaceHandler{} + const kindQuery = `SELECT COALESCE\(kind, 'workspace'\) FROM workspaces WHERE id =` + const modelSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'MODEL'` + const modelInsert = `INSERT INTO workspace_secrets` + + t.Run("fresh platform agent with NO stored model gets the declared model seeded + persisted", func(t *testing.T) { + mock := setupTestDB(t) + mock.ExpectQuery(kindQuery).WithArgs("ws-fresh"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + // No MODEL row yet (first boot) → readStoredModelSecret returns "". + mock.ExpectQuery(modelSelQuery).WithArgs("ws-fresh"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"})) + // Seed path must PERSIST the declared model. + mock.ExpectExec(modelInsert). + WithArgs("ws-fresh", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + + env := map[string]string{} + h.applyConciergeProvisionConfig(context.Background(), "ws-fresh", "", nil, env, "Org Concierge") + + // THE regression assertion: without this seed the provision hits + // MISSING_MODEL and fails closed. Both canonical env names must carry + // the declared model so the runtime actually boots on it this provision. + if env["MODEL"] != conciergeDeclaredModel { + t.Errorf("fresh concierge did not seed MODEL=%q; got %q (env=%v) — MISSING_MODEL would fail this provision closed", conciergeDeclaredModel, env["MODEL"], env) + } + if env["MOLECULE_MODEL"] != conciergeDeclaredModel { + t.Errorf("fresh concierge did not seed MOLECULE_MODEL=%q; got %q", conciergeDeclaredModel, env["MOLECULE_MODEL"]) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations (the MODEL secret was not persisted): %v", err) + } + }) + + t.Run("SEED-ONLY: an existing customer model is respected, never overwritten", func(t *testing.T) { + mock := setupTestDB(t) + mock.ExpectQuery(kindQuery).WithArgs("ws-picked"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + // Customer already picked a model — stored MODEL secret present. + mock.ExpectQuery(modelSelQuery).WithArgs("ws-picked"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}). + AddRow([]byte("anthropic:claude-opus-4-8"), 0)) + // NO ExpectExec: ensureConciergeModel must return early (no re-seed, + // no INSERT) — re-asserting the default would silently revert the pick. + + env := map[string]string{} + h.applyConciergeProvisionConfig(context.Background(), "ws-picked", "", nil, env, "Org Concierge") + + if env["MODEL"] == conciergeDeclaredModel { + t.Errorf("seed-only violated: ensureConciergeModel overwrote the customer's model with the declared default") + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations (an unexpected INSERT means it re-seeded over the customer's pick): %v", err) + } + }) + + t.Run("ordinary workspace never seeds a model (no model queries at all)", func(t *testing.T) { + mock := setupTestDB(t) + mock.ExpectQuery(kindQuery).WithArgs("ws-ordinary"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace")) + // No model SELECT/INSERT expected — the hook returns before step 0. + + env := map[string]string{} + h.applyConciergeProvisionConfig(context.Background(), "ws-ordinary", "", nil, env, "Worker") + + if _, ok := env["MODEL"]; ok { + t.Errorf("ordinary workspace had a model seeded — the concierge model-seed must be platform-kind only; env=%v", env) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations (ordinary workspace ran a model query): %v", err) + } + }) +} + // TestNoConciergeLiteralsInCore is the regression guard for the RFC #2843 -// §10a de-hardcode: the concierge's identity (system prompt, model, runtime, -// MCP wiring) MUST live in the platform-agent template, not as Go string -// literals in core. A future re-introduction of the consts would silently -// regress the SSOT; this test fails on a fresh build if any of the deleted -// identifiers reappear as bare Go references in the package source. +// §10a de-hardcode: the concierge's PROMPT + MCP-wiring identity (system +// prompt template, MCP-servers block, identity files) MUST live in the +// platform-agent template, not as Go string literals in core. A future +// re-introduction of those consts would silently regress the SSOT; this test +// fails on a fresh build if any of the banned identifiers reappear as bare Go +// references in the package source. +// +// NOTE (incident 2026-06-15): conciergeDeclaredModel is DELIBERATELY NOT +// banned. The model is the ONE concierge identity element that legitimately +// lives in core: the universal MISSING_MODEL gate (core#2594) reads the stored +// MODEL secret at provision time — BEFORE any template config.yaml is fetched — +// so the model MUST be seeded from a core-resident declared value, not deferred +// to template delivery. #2919 wrongly lumped the model in with the prompt/MCP +// literals and banned it here; removing the model-seed regressed every fresh +// platform-agent provision to MISSING_MODEL fail-closed. The concierge IS the +// platform-agent product, so it declares its own model exactly as a template +// does (this is SSOT-correct, not a hardcoded platform default). The seeding is +// gated by TestApplyConciergeProvisionConfig_SeedsModel. // // This is a grep over the package — brittle by design (intentionally so: // the concierge-literal pattern was the exact failure mode of the @@ -572,7 +680,6 @@ func TestNoConciergeLiteralsInCore(t *testing.T) { "conciergeSystemPromptTmpl", "conciergeMCPServersBlock", "conciergeMCPFragmentFile", - "conciergeDeclaredModel", "conciergeIdentityFiles", } for _, id := range banned { -- 2.52.0