fix(concierge): restore model-seed at provision — #2919 regressed fresh platform-agent provisions to MISSING_MODEL #2966

Merged
devops-engineer merged 1 commits from fix/core-2594-concierge-model-seed-regression into main 2026-06-15 20:30:11 +00:00
2 changed files with 222 additions and 13 deletions
@@ -30,6 +30,7 @@ import (
"os"
"strings"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provisioner"
@@ -174,14 +175,24 @@ func conciergePlatformMCPEnv(env map[string]string) {
// kind='platform' workspace (create, restart, auto-recover). It is a no-op
// for ordinary workspaces.
//
// Post RFC #2843 §10a: the concierge's identity (system prompt, model,
// runtime, MCP wiring) is delivered via the molecule-ai-workspace-template-
// Post RFC #2843 §10a: the concierge's PROMPT + MCP-wiring identity (system
// prompt, agent-skills) is delivered via the molecule-ai-workspace-template-
// platform-agent template and applied like any other runtime template. This
// hook's only remaining responsibilities are (1) inject the platform-MCP env
// (org-admin token + platform URL + org id) and (2) the per-instance
// {{CONCIERGE_NAME}} substitution in the delivered system-prompt.md
// (recommended in the PR — see the PR description for the rationale vs
// the alternative of dropping the dynamic name).
// hook's responsibilities are (0) SEED the concierge's declared model so the
// provision passes the universal MISSING_MODEL gate (core#2594), (1) inject the
// platform-MCP env (org-admin token + platform URL + org id) and (2) the
// per-instance {{CONCIERGE_NAME}} substitution in the system-prompt.md.
//
// IMPORTANT (incident 2026-06-15, regression from #2919): the model is NOT
// delivered via the template. The MISSING_MODEL gate reads the stored MODEL
// workspace_secret at provision time — BEFORE any template config.yaml is
// fetched — so a model-less platform agent fails closed ("reached provisioning
// with no model set"). #2919 removed the model-seed on the theory the
// platform-agent template would supply it, but that template entry was never
// added to the manifest. The concierge is the platform-agent PRODUCT itself, so
// it carries an explicit declared model in code (core#2594) exactly as a
// template declares one — this is the SSOT-correct source, not a stopgap. The
// CI test TestApplyConciergeProvisionConfig_SeedsModel gates against re-removal.
//
// Returns the (possibly newly-allocated) configFiles map so the caller can
// rebind it — configFiles is nil on the auto-restart path, where this is the
@@ -203,6 +214,14 @@ func (h *WorkspaceHandler) applyConciergeProvisionConfig(
return configFiles
}
// 0. Concierge model (core#2594). The platform agent carries an explicit,
// SSOT-declared model so it never relies on a silent default. The
// universal MISSING_MODEL gate (prepareProvisionContext) reads the stored
// MODEL secret at provision time — before any template config.yaml is
// fetched — so this MUST run here, not be deferred to the template.
// Seed-only: it respects a model the customer later picked.
h.ensureConciergeModel(ctx, workspaceID, envVars)
// 1. Platform-MCP env (org-admin token + platform URL + org id).
conciergePlatformMCPEnv(envVars)
@@ -252,6 +271,89 @@ func substituteConciergeName(prompt []byte, name string) []byte {
return []byte(strings.ReplaceAll(string(prompt), conciergeNamePlaceholder, name))
}
// conciergeRuntime is the runtime the platform agent (concierge) always runs as
// (the platform-agent image variant of 'claude-code'). conciergeDeclaredModel is
// validated against the registry for this runtime before being seeded.
const conciergeRuntime = "claude-code"
// conciergeDeclaredModel is the platform agent's OWN declared model — a
// first-class product decision, NOT a generic platform default. It mirrors a
// template's runtime_config.model SSOT. The SSOT directive
// (feedback_workspace_model_required_no_platform_default...) forbids the platform
// from defaulting a USER workspace's model, but the concierge IS the
// platform-agent product, so it declares its own model exactly as a template
// declares one (core#2594).
const conciergeDeclaredModel = "moonshot/kimi-k2.6"
// ensureConciergeModel makes the platform agent's model explicit (core#2594).
// It (1) seeds the container model env for the current provision and (2)
// persists the MODEL workspace_secret so the read endpoint / canvas Config tab
// surface the resolved model. The model is the concierge's declared SSOT model,
// validated against the registry for its runtime. If validation fails (registry
// drift — a build bug caught by the model-registry CI test), it sets NOTHING:
// the downstream universal MISSING_MODEL gate then fails the provision CLOSED
// rather than letting the runtime pick an opaque default.
func (h *WorkspaceHandler) ensureConciergeModel(ctx context.Context, workspaceID string, envVars map[string]string) {
// SEED-ONLY (CTO 2026-06-12: customer setting > platform default; the
// concierge's model is changeable like any workspace, "anytime"). If a MODEL
// secret already exists — whether the original seed or a model the customer
// later picked in the canvas — RESPECT it: loadWorkspaceSecrets +
// applyRuntimeModelEnv have already put it in envVars, so do nothing. Only
// SEED the declared default when the concierge has no model at all (first
// boot). Re-asserting on EVERY provision would silently revert the customer's
// pick — exactly the platform-overriding-customer violation the SSOT
// directive forbids.
if existing := readStoredModelSecret(ctx, workspaceID); existing != "" {
return // explicit model already set; never overwrite the customer's choice
}
// First boot — no model yet. Seed the concierge's declared default, but only
// if it is genuinely routable for the runtime; otherwise leave it unset and
// let the MISSING_MODEL gate fail closed (loud) rather than seed a model the
// platform cannot actually route to (silent late failure).
model := conciergeDeclaredModel
if ok, why := validateRegisteredModelForRuntime(conciergeRuntime, model); !ok {
log.Printf("Provisioner: concierge %s declared model %q is NOT registered for runtime %q (%s) — leaving model unset; provision will fail closed", workspaceID, model, conciergeRuntime, why)
return
}
if ok, why := validateDerivedProviderInRegistry(conciergeRuntime, model); !ok {
log.Printf("Provisioner: concierge %s declared model %q has no derivable registry provider for runtime %q (%s) — leaving model unset; provision will fail closed", workspaceID, model, conciergeRuntime, why)
return
}
// Seed the container env (precedence MOLECULE_MODEL > MODEL in the runtime).
// applyRuntimeModelEnv already ran with an empty payload model for the
// concierge (no stored MODEL on first boot), so set both canonical names
// here so this provision actually runs the seeded default.
envVars["MOLECULE_MODEL"] = model
envVars["MODEL"] = model
// Persist so GET /workspaces/:id/model returns it (Config tab visibility) and
// the next provision's readStoredModelSecret takes the respect-existing path.
if setErr := setModelSecret(ctx, workspaceID, model); setErr != nil {
log.Printf("Provisioner: concierge %s persist MODEL secret failed: %v (env still seeded for this provision)", workspaceID, setErr)
}
}
// readStoredModelSecret returns the decrypted MODEL workspace_secret, or "" when
// none is stored (or on any read/decrypt error — treated as "unset" so a
// transient miss re-seeds rather than wedges). Used by ensureConciergeModel to
// decide seed-vs-respect.
func readStoredModelSecret(ctx context.Context, workspaceID string) string {
var stored []byte
var version int
if err := db.DB.QueryRowContext(ctx,
`SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = $1 AND key = 'MODEL'`,
workspaceID).Scan(&stored, &version); err != nil {
return ""
}
dec, err := crypto.DecryptVersioned(stored, version)
if err != nil {
return ""
}
return string(dec)
}
// EnsureSelfHostedPlatformAgent installs the org's platform agent (the concierge,
// the org root) on a tenant that has no control plane to do it — i.e. self-hosted
// or local. In SaaS the CP calls InstallPlatformAgent at org-provision time; this
@@ -462,6 +462,12 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) {
t.Setenv("PLATFORM_URL", "http://platform:8080")
h := &WorkspaceHandler{}
const kindQuery = `SELECT COALESCE\(kind, 'workspace'\) FROM workspaces WHERE id =`
// ensureConciergeModel (step 0, platform kind only) reads the stored MODEL
// secret to decide seed-vs-respect. These subtests are about MCP/name, so
// they stub an EXISTING model → ensureConciergeModel returns early (no
// INSERT). The seed path itself is covered by
// TestApplyConciergeProvisionConfig_SeedsModel.
const modelSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'MODEL'`
t.Run("ordinary workspace gets NO org MCP, NO admin token, NO substitution", func(t *testing.T) {
mock := setupTestDB(t)
@@ -500,6 +506,9 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(kindQuery).WithArgs("ws-concierge").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
mock.ExpectQuery(modelSelQuery).WithArgs("ws-concierge").
WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}).
AddRow([]byte("moonshot/kimi-k2.6"), 0))
env := map[string]string{}
cf := map[string][]byte{
"config.yaml": []byte("runtime: claude-code\nmodel: moonshot/kimi-k2.6\n"),
@@ -536,6 +545,9 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(kindQuery).WithArgs("ws-concierge").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
mock.ExpectQuery(modelSelQuery).WithArgs("ws-concierge").
WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}).
AddRow([]byte("moonshot/kimi-k2.6"), 0))
env := map[string]string{}
// Already-substituted prompt (a re-provision of a running concierge).
cf := map[string][]byte{
@@ -556,12 +568,108 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) {
})
}
// TestApplyConciergeProvisionConfig_SeedsModel is the CI regression gate for
// the 2026-06-15 incident: #2919 removed the concierge model-seed, so every
// fresh platform-agent provision reached the universal MISSING_MODEL gate
// (core#2594) with no stored model and failed closed ("reached provisioning
// with no model set"). It passed CI because no test (and no e2e) provisions a
// fresh platform agent through the model path. This test asserts the seed
// fires for a model-less platform agent, is SEED-ONLY (respects a customer's
// later pick), and never touches ordinary workspaces.
func TestApplyConciergeProvisionConfig_SeedsModel(t *testing.T) {
h := &WorkspaceHandler{}
const kindQuery = `SELECT COALESCE\(kind, 'workspace'\) FROM workspaces WHERE id =`
const modelSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'MODEL'`
const modelInsert = `INSERT INTO workspace_secrets`
t.Run("fresh platform agent with NO stored model gets the declared model seeded + persisted", func(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(kindQuery).WithArgs("ws-fresh").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
// No MODEL row yet (first boot) → readStoredModelSecret returns "".
mock.ExpectQuery(modelSelQuery).WithArgs("ws-fresh").
WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}))
// Seed path must PERSIST the declared model.
mock.ExpectExec(modelInsert).
WithArgs("ws-fresh", sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
env := map[string]string{}
h.applyConciergeProvisionConfig(context.Background(), "ws-fresh", "", nil, env, "Org Concierge")
// THE regression assertion: without this seed the provision hits
// MISSING_MODEL and fails closed. Both canonical env names must carry
// the declared model so the runtime actually boots on it this provision.
if env["MODEL"] != conciergeDeclaredModel {
t.Errorf("fresh concierge did not seed MODEL=%q; got %q (env=%v) — MISSING_MODEL would fail this provision closed", conciergeDeclaredModel, env["MODEL"], env)
}
if env["MOLECULE_MODEL"] != conciergeDeclaredModel {
t.Errorf("fresh concierge did not seed MOLECULE_MODEL=%q; got %q", conciergeDeclaredModel, env["MOLECULE_MODEL"])
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations (the MODEL secret was not persisted): %v", err)
}
})
t.Run("SEED-ONLY: an existing customer model is respected, never overwritten", func(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(kindQuery).WithArgs("ws-picked").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
// Customer already picked a model — stored MODEL secret present.
mock.ExpectQuery(modelSelQuery).WithArgs("ws-picked").
WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}).
AddRow([]byte("anthropic:claude-opus-4-8"), 0))
// NO ExpectExec: ensureConciergeModel must return early (no re-seed,
// no INSERT) — re-asserting the default would silently revert the pick.
env := map[string]string{}
h.applyConciergeProvisionConfig(context.Background(), "ws-picked", "", nil, env, "Org Concierge")
if env["MODEL"] == conciergeDeclaredModel {
t.Errorf("seed-only violated: ensureConciergeModel overwrote the customer's model with the declared default")
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations (an unexpected INSERT means it re-seeded over the customer's pick): %v", err)
}
})
t.Run("ordinary workspace never seeds a model (no model queries at all)", func(t *testing.T) {
mock := setupTestDB(t)
mock.ExpectQuery(kindQuery).WithArgs("ws-ordinary").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
// No model SELECT/INSERT expected — the hook returns before step 0.
env := map[string]string{}
h.applyConciergeProvisionConfig(context.Background(), "ws-ordinary", "", nil, env, "Worker")
if _, ok := env["MODEL"]; ok {
t.Errorf("ordinary workspace had a model seeded — the concierge model-seed must be platform-kind only; env=%v", env)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations (ordinary workspace ran a model query): %v", err)
}
})
}
// TestNoConciergeLiteralsInCore is the regression guard for the RFC #2843
// §10a de-hardcode: the concierge's identity (system prompt, model, runtime,
// MCP wiring) MUST live in the platform-agent template, not as Go string
// literals in core. A future re-introduction of the consts would silently
// regress the SSOT; this test fails on a fresh build if any of the deleted
// identifiers reappear as bare Go references in the package source.
// §10a de-hardcode: the concierge's PROMPT + MCP-wiring identity (system
// prompt template, MCP-servers block, identity files) MUST live in the
// platform-agent template, not as Go string literals in core. A future
// re-introduction of those consts would silently regress the SSOT; this test
// fails on a fresh build if any of the banned identifiers reappear as bare Go
// references in the package source.
//
// NOTE (incident 2026-06-15): conciergeDeclaredModel is DELIBERATELY NOT
// banned. The model is the ONE concierge identity element that legitimately
// lives in core: the universal MISSING_MODEL gate (core#2594) reads the stored
// MODEL secret at provision time — BEFORE any template config.yaml is fetched —
// so the model MUST be seeded from a core-resident declared value, not deferred
// to template delivery. #2919 wrongly lumped the model in with the prompt/MCP
// literals and banned it here; removing the model-seed regressed every fresh
// platform-agent provision to MISSING_MODEL fail-closed. The concierge IS the
// platform-agent product, so it declares its own model exactly as a template
// does (this is SSOT-correct, not a hardcoded platform default). The seeding is
// gated by TestApplyConciergeProvisionConfig_SeedsModel.
//
// This is a grep over the package — brittle by design (intentionally so:
// the concierge-literal pattern was the exact failure mode of the
@@ -572,7 +680,6 @@ func TestNoConciergeLiteralsInCore(t *testing.T) {
"conciergeSystemPromptTmpl",
"conciergeMCPServersBlock",
"conciergeMCPFragmentFile",
"conciergeDeclaredModel",
"conciergeIdentityFiles",
}
for _, id := range banned {