fix(concierge): restore model-seed at provision — #2919 regressed fresh platform-agent provisions to MISSING_MODEL #2966
@@ -30,6 +30,7 @@ import (
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provisioner"
|
||||
@@ -174,14 +175,24 @@ func conciergePlatformMCPEnv(env map[string]string) {
|
||||
// kind='platform' workspace (create, restart, auto-recover). It is a no-op
|
||||
// for ordinary workspaces.
|
||||
//
|
||||
// Post RFC #2843 §10a: the concierge's identity (system prompt, model,
|
||||
// runtime, MCP wiring) is delivered via the molecule-ai-workspace-template-
|
||||
// Post RFC #2843 §10a: the concierge's PROMPT + MCP-wiring identity (system
|
||||
// prompt, agent-skills) is delivered via the molecule-ai-workspace-template-
|
||||
// platform-agent template and applied like any other runtime template. This
|
||||
// hook's only remaining responsibilities are (1) inject the platform-MCP env
|
||||
// (org-admin token + platform URL + org id) and (2) the per-instance
|
||||
// {{CONCIERGE_NAME}} substitution in the delivered system-prompt.md
|
||||
// (recommended in the PR — see the PR description for the rationale vs
|
||||
// the alternative of dropping the dynamic name).
|
||||
// hook's responsibilities are (0) SEED the concierge's declared model so the
|
||||
// provision passes the universal MISSING_MODEL gate (core#2594), (1) inject the
|
||||
// platform-MCP env (org-admin token + platform URL + org id) and (2) the
|
||||
// per-instance {{CONCIERGE_NAME}} substitution in the system-prompt.md.
|
||||
//
|
||||
// IMPORTANT (incident 2026-06-15, regression from #2919): the model is NOT
|
||||
// delivered via the template. The MISSING_MODEL gate reads the stored MODEL
|
||||
// workspace_secret at provision time — BEFORE any template config.yaml is
|
||||
// fetched — so a model-less platform agent fails closed ("reached provisioning
|
||||
// with no model set"). #2919 removed the model-seed on the theory the
|
||||
// platform-agent template would supply it, but that template entry was never
|
||||
// added to the manifest. The concierge is the platform-agent PRODUCT itself, so
|
||||
// it carries an explicit declared model in code (core#2594) exactly as a
|
||||
// template declares one — this is the SSOT-correct source, not a stopgap. The
|
||||
// CI test TestApplyConciergeProvisionConfig_SeedsModel gates against re-removal.
|
||||
//
|
||||
// Returns the (possibly newly-allocated) configFiles map so the caller can
|
||||
// rebind it — configFiles is nil on the auto-restart path, where this is the
|
||||
@@ -203,6 +214,14 @@ func (h *WorkspaceHandler) applyConciergeProvisionConfig(
|
||||
return configFiles
|
||||
}
|
||||
|
||||
// 0. Concierge model (core#2594). The platform agent carries an explicit,
|
||||
// SSOT-declared model so it never relies on a silent default. The
|
||||
// universal MISSING_MODEL gate (prepareProvisionContext) reads the stored
|
||||
// MODEL secret at provision time — before any template config.yaml is
|
||||
// fetched — so this MUST run here, not be deferred to the template.
|
||||
// Seed-only: it respects a model the customer later picked.
|
||||
h.ensureConciergeModel(ctx, workspaceID, envVars)
|
||||
|
||||
// 1. Platform-MCP env (org-admin token + platform URL + org id).
|
||||
conciergePlatformMCPEnv(envVars)
|
||||
|
||||
@@ -252,6 +271,89 @@ func substituteConciergeName(prompt []byte, name string) []byte {
|
||||
return []byte(strings.ReplaceAll(string(prompt), conciergeNamePlaceholder, name))
|
||||
}
|
||||
|
||||
// conciergeRuntime is the runtime the platform agent (concierge) always runs as
|
||||
// (the platform-agent image variant of 'claude-code'). conciergeDeclaredModel is
|
||||
// validated against the registry for this runtime before being seeded.
|
||||
const conciergeRuntime = "claude-code"
|
||||
|
||||
// conciergeDeclaredModel is the platform agent's OWN declared model — a
|
||||
// first-class product decision, NOT a generic platform default. It mirrors a
|
||||
// template's runtime_config.model SSOT. The SSOT directive
|
||||
// (feedback_workspace_model_required_no_platform_default...) forbids the platform
|
||||
// from defaulting a USER workspace's model, but the concierge IS the
|
||||
// platform-agent product, so it declares its own model exactly as a template
|
||||
// declares one (core#2594).
|
||||
const conciergeDeclaredModel = "moonshot/kimi-k2.6"
|
||||
|
||||
// ensureConciergeModel makes the platform agent's model explicit (core#2594).
|
||||
// It (1) seeds the container model env for the current provision and (2)
|
||||
// persists the MODEL workspace_secret so the read endpoint / canvas Config tab
|
||||
// surface the resolved model. The model is the concierge's declared SSOT model,
|
||||
// validated against the registry for its runtime. If validation fails (registry
|
||||
// drift — a build bug caught by the model-registry CI test), it sets NOTHING:
|
||||
// the downstream universal MISSING_MODEL gate then fails the provision CLOSED
|
||||
// rather than letting the runtime pick an opaque default.
|
||||
func (h *WorkspaceHandler) ensureConciergeModel(ctx context.Context, workspaceID string, envVars map[string]string) {
|
||||
// SEED-ONLY (CTO 2026-06-12: customer setting > platform default; the
|
||||
// concierge's model is changeable like any workspace, "anytime"). If a MODEL
|
||||
// secret already exists — whether the original seed or a model the customer
|
||||
// later picked in the canvas — RESPECT it: loadWorkspaceSecrets +
|
||||
// applyRuntimeModelEnv have already put it in envVars, so do nothing. Only
|
||||
// SEED the declared default when the concierge has no model at all (first
|
||||
// boot). Re-asserting on EVERY provision would silently revert the customer's
|
||||
// pick — exactly the platform-overriding-customer violation the SSOT
|
||||
// directive forbids.
|
||||
if existing := readStoredModelSecret(ctx, workspaceID); existing != "" {
|
||||
return // explicit model already set; never overwrite the customer's choice
|
||||
}
|
||||
|
||||
// First boot — no model yet. Seed the concierge's declared default, but only
|
||||
// if it is genuinely routable for the runtime; otherwise leave it unset and
|
||||
// let the MISSING_MODEL gate fail closed (loud) rather than seed a model the
|
||||
// platform cannot actually route to (silent late failure).
|
||||
model := conciergeDeclaredModel
|
||||
if ok, why := validateRegisteredModelForRuntime(conciergeRuntime, model); !ok {
|
||||
log.Printf("Provisioner: concierge %s declared model %q is NOT registered for runtime %q (%s) — leaving model unset; provision will fail closed", workspaceID, model, conciergeRuntime, why)
|
||||
return
|
||||
}
|
||||
if ok, why := validateDerivedProviderInRegistry(conciergeRuntime, model); !ok {
|
||||
log.Printf("Provisioner: concierge %s declared model %q has no derivable registry provider for runtime %q (%s) — leaving model unset; provision will fail closed", workspaceID, model, conciergeRuntime, why)
|
||||
return
|
||||
}
|
||||
|
||||
// Seed the container env (precedence MOLECULE_MODEL > MODEL in the runtime).
|
||||
// applyRuntimeModelEnv already ran with an empty payload model for the
|
||||
// concierge (no stored MODEL on first boot), so set both canonical names
|
||||
// here so this provision actually runs the seeded default.
|
||||
envVars["MOLECULE_MODEL"] = model
|
||||
envVars["MODEL"] = model
|
||||
|
||||
// Persist so GET /workspaces/:id/model returns it (Config tab visibility) and
|
||||
// the next provision's readStoredModelSecret takes the respect-existing path.
|
||||
if setErr := setModelSecret(ctx, workspaceID, model); setErr != nil {
|
||||
log.Printf("Provisioner: concierge %s persist MODEL secret failed: %v (env still seeded for this provision)", workspaceID, setErr)
|
||||
}
|
||||
}
|
||||
|
||||
// readStoredModelSecret returns the decrypted MODEL workspace_secret, or "" when
|
||||
// none is stored (or on any read/decrypt error — treated as "unset" so a
|
||||
// transient miss re-seeds rather than wedges). Used by ensureConciergeModel to
|
||||
// decide seed-vs-respect.
|
||||
func readStoredModelSecret(ctx context.Context, workspaceID string) string {
|
||||
var stored []byte
|
||||
var version int
|
||||
if err := db.DB.QueryRowContext(ctx,
|
||||
`SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = $1 AND key = 'MODEL'`,
|
||||
workspaceID).Scan(&stored, &version); err != nil {
|
||||
return ""
|
||||
}
|
||||
dec, err := crypto.DecryptVersioned(stored, version)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return string(dec)
|
||||
}
|
||||
|
||||
// EnsureSelfHostedPlatformAgent installs the org's platform agent (the concierge,
|
||||
// the org root) on a tenant that has no control plane to do it — i.e. self-hosted
|
||||
// or local. In SaaS the CP calls InstallPlatformAgent at org-provision time; this
|
||||
|
||||
@@ -462,6 +462,12 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) {
|
||||
t.Setenv("PLATFORM_URL", "http://platform:8080")
|
||||
h := &WorkspaceHandler{}
|
||||
const kindQuery = `SELECT COALESCE\(kind, 'workspace'\) FROM workspaces WHERE id =`
|
||||
// ensureConciergeModel (step 0, platform kind only) reads the stored MODEL
|
||||
// secret to decide seed-vs-respect. These subtests are about MCP/name, so
|
||||
// they stub an EXISTING model → ensureConciergeModel returns early (no
|
||||
// INSERT). The seed path itself is covered by
|
||||
// TestApplyConciergeProvisionConfig_SeedsModel.
|
||||
const modelSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'MODEL'`
|
||||
|
||||
t.Run("ordinary workspace gets NO org MCP, NO admin token, NO substitution", func(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
@@ -500,6 +506,9 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
mock.ExpectQuery(kindQuery).WithArgs("ws-concierge").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
|
||||
mock.ExpectQuery(modelSelQuery).WithArgs("ws-concierge").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}).
|
||||
AddRow([]byte("moonshot/kimi-k2.6"), 0))
|
||||
env := map[string]string{}
|
||||
cf := map[string][]byte{
|
||||
"config.yaml": []byte("runtime: claude-code\nmodel: moonshot/kimi-k2.6\n"),
|
||||
@@ -536,6 +545,9 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
mock.ExpectQuery(kindQuery).WithArgs("ws-concierge").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
|
||||
mock.ExpectQuery(modelSelQuery).WithArgs("ws-concierge").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}).
|
||||
AddRow([]byte("moonshot/kimi-k2.6"), 0))
|
||||
env := map[string]string{}
|
||||
// Already-substituted prompt (a re-provision of a running concierge).
|
||||
cf := map[string][]byte{
|
||||
@@ -556,12 +568,108 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// TestApplyConciergeProvisionConfig_SeedsModel is the CI regression gate for
|
||||
// the 2026-06-15 incident: #2919 removed the concierge model-seed, so every
|
||||
// fresh platform-agent provision reached the universal MISSING_MODEL gate
|
||||
// (core#2594) with no stored model and failed closed ("reached provisioning
|
||||
// with no model set"). It passed CI because no test (and no e2e) provisions a
|
||||
// fresh platform agent through the model path. This test asserts the seed
|
||||
// fires for a model-less platform agent, is SEED-ONLY (respects a customer's
|
||||
// later pick), and never touches ordinary workspaces.
|
||||
func TestApplyConciergeProvisionConfig_SeedsModel(t *testing.T) {
|
||||
h := &WorkspaceHandler{}
|
||||
const kindQuery = `SELECT COALESCE\(kind, 'workspace'\) FROM workspaces WHERE id =`
|
||||
const modelSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'MODEL'`
|
||||
const modelInsert = `INSERT INTO workspace_secrets`
|
||||
|
||||
t.Run("fresh platform agent with NO stored model gets the declared model seeded + persisted", func(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
mock.ExpectQuery(kindQuery).WithArgs("ws-fresh").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
|
||||
// No MODEL row yet (first boot) → readStoredModelSecret returns "".
|
||||
mock.ExpectQuery(modelSelQuery).WithArgs("ws-fresh").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}))
|
||||
// Seed path must PERSIST the declared model.
|
||||
mock.ExpectExec(modelInsert).
|
||||
WithArgs("ws-fresh", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
env := map[string]string{}
|
||||
h.applyConciergeProvisionConfig(context.Background(), "ws-fresh", "", nil, env, "Org Concierge")
|
||||
|
||||
// THE regression assertion: without this seed the provision hits
|
||||
// MISSING_MODEL and fails closed. Both canonical env names must carry
|
||||
// the declared model so the runtime actually boots on it this provision.
|
||||
if env["MODEL"] != conciergeDeclaredModel {
|
||||
t.Errorf("fresh concierge did not seed MODEL=%q; got %q (env=%v) — MISSING_MODEL would fail this provision closed", conciergeDeclaredModel, env["MODEL"], env)
|
||||
}
|
||||
if env["MOLECULE_MODEL"] != conciergeDeclaredModel {
|
||||
t.Errorf("fresh concierge did not seed MOLECULE_MODEL=%q; got %q", conciergeDeclaredModel, env["MOLECULE_MODEL"])
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations (the MODEL secret was not persisted): %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("SEED-ONLY: an existing customer model is respected, never overwritten", func(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
mock.ExpectQuery(kindQuery).WithArgs("ws-picked").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
|
||||
// Customer already picked a model — stored MODEL secret present.
|
||||
mock.ExpectQuery(modelSelQuery).WithArgs("ws-picked").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}).
|
||||
AddRow([]byte("anthropic:claude-opus-4-8"), 0))
|
||||
// NO ExpectExec: ensureConciergeModel must return early (no re-seed,
|
||||
// no INSERT) — re-asserting the default would silently revert the pick.
|
||||
|
||||
env := map[string]string{}
|
||||
h.applyConciergeProvisionConfig(context.Background(), "ws-picked", "", nil, env, "Org Concierge")
|
||||
|
||||
if env["MODEL"] == conciergeDeclaredModel {
|
||||
t.Errorf("seed-only violated: ensureConciergeModel overwrote the customer's model with the declared default")
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations (an unexpected INSERT means it re-seeded over the customer's pick): %v", err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("ordinary workspace never seeds a model (no model queries at all)", func(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
mock.ExpectQuery(kindQuery).WithArgs("ws-ordinary").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
|
||||
// No model SELECT/INSERT expected — the hook returns before step 0.
|
||||
|
||||
env := map[string]string{}
|
||||
h.applyConciergeProvisionConfig(context.Background(), "ws-ordinary", "", nil, env, "Worker")
|
||||
|
||||
if _, ok := env["MODEL"]; ok {
|
||||
t.Errorf("ordinary workspace had a model seeded — the concierge model-seed must be platform-kind only; env=%v", env)
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations (ordinary workspace ran a model query): %v", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// TestNoConciergeLiteralsInCore is the regression guard for the RFC #2843
|
||||
// §10a de-hardcode: the concierge's identity (system prompt, model, runtime,
|
||||
// MCP wiring) MUST live in the platform-agent template, not as Go string
|
||||
// literals in core. A future re-introduction of the consts would silently
|
||||
// regress the SSOT; this test fails on a fresh build if any of the deleted
|
||||
// identifiers reappear as bare Go references in the package source.
|
||||
// §10a de-hardcode: the concierge's PROMPT + MCP-wiring identity (system
|
||||
// prompt template, MCP-servers block, identity files) MUST live in the
|
||||
// platform-agent template, not as Go string literals in core. A future
|
||||
// re-introduction of those consts would silently regress the SSOT; this test
|
||||
// fails on a fresh build if any of the banned identifiers reappear as bare Go
|
||||
// references in the package source.
|
||||
//
|
||||
// NOTE (incident 2026-06-15): conciergeDeclaredModel is DELIBERATELY NOT
|
||||
// banned. The model is the ONE concierge identity element that legitimately
|
||||
// lives in core: the universal MISSING_MODEL gate (core#2594) reads the stored
|
||||
// MODEL secret at provision time — BEFORE any template config.yaml is fetched —
|
||||
// so the model MUST be seeded from a core-resident declared value, not deferred
|
||||
// to template delivery. #2919 wrongly lumped the model in with the prompt/MCP
|
||||
// literals and banned it here; removing the model-seed regressed every fresh
|
||||
// platform-agent provision to MISSING_MODEL fail-closed. The concierge IS the
|
||||
// platform-agent product, so it declares its own model exactly as a template
|
||||
// does (this is SSOT-correct, not a hardcoded platform default). The seeding is
|
||||
// gated by TestApplyConciergeProvisionConfig_SeedsModel.
|
||||
//
|
||||
// This is a grep over the package — brittle by design (intentionally so:
|
||||
// the concierge-literal pattern was the exact failure mode of the
|
||||
@@ -572,7 +680,6 @@ func TestNoConciergeLiteralsInCore(t *testing.T) {
|
||||
"conciergeSystemPromptTmpl",
|
||||
"conciergeMCPServersBlock",
|
||||
"conciergeMCPFragmentFile",
|
||||
"conciergeDeclaredModel",
|
||||
"conciergeIdentityFiles",
|
||||
}
|
||||
for _, id := range banned {
|
||||
|
||||
Reference in New Issue
Block a user