From 1533f90b43089478e4e0c70f6bc6817ff892c896 Mon Sep 17 00:00:00 2001
From: devops-engineer <devops-engineer@agents.moleculesai.app>
Date: Mon, 15 Jun 2026 13:21:09 -0700
Subject: [PATCH] =?UTF-8?q?fix(concierge):=20restore=20model-seed=20at=20p?=
 =?UTF-8?q?rovision=20=E2=80=94=20#2919=20regressed=20fresh=20platform-age?=
 =?UTF-8?q?nt=20provisions=20to=20MISSING=5FMODEL?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#2919 (RFC #2843 §10a de-hardcode) removed ensureConciergeModel + the
conciergeDeclaredModel const on the theory the platform-agent TEMPLATE
would deliver the concierge's model. But that template entry was never
added to manifest.json (the file's own comment defers it to "a follow-up
PR once template config.yaml exists"), AND — more fundamentally — the
universal MISSING_MODEL gate (core#2594) reads the stored MODEL secret at
provision time, BEFORE any template config.yaml is fetched. So the model
cannot come from the template: a model-less platform agent fails closed
("reached provisioning with no model set"). Every fresh org's concierge
provision has been failing since #2919 deployed (incident: test1, 2026-06-15).

The model is the ONE concierge identity element that legitimately lives in
core — the concierge IS the platform-agent product, so it declares its own
model exactly as a template declares one (core#2594), and the gate ordering
requires it pre-fetch. The PROMPT/MCP de-hardcode from #2919 stays intact.

Changes:
- Restore conciergeRuntime + conciergeDeclaredModel consts and
  ensureConciergeModel/readStoredModelSecret; re-add the seed call as step 0
  of applyConciergeProvisionConfig (platform-kind only). SEED-ONLY: respects
  a customer's later model pick; fail-closed (seeds nothing) on registry drift.
- Remove conciergeDeclaredModel from TestNoConciergeLiteralsInCore's banned
  list — that guard wrongly lumped the model with the prompt/MCP literals,
  which is the exact design error that caused this regression. Documented why.
- Add TestApplyConciergeProvisionConfig_SeedsModel: gates the seed (fresh →
  declared model in env + persisted), seed-only (existing pick respected),
  and platform-kind-only. Verified to FAIL if the seed call is removed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../internal/handlers/platform_agent.go       | 116 +++++++++++++++--
 .../internal/handlers/platform_agent_test.go  | 119 +++++++++++++++++-
 2 files changed, 222 insertions(+), 13 deletions(-)

diff --git a/workspace-server/internal/handlers/platform_agent.go b/workspace-server/internal/handlers/platform_agent.go
index 807616720..9cbf868f9 100644
--- a/workspace-server/internal/handlers/platform_agent.go
+++ b/workspace-server/internal/handlers/platform_agent.go
@@ -30,6 +30,7 @@ import (
 	"os"
 	"strings"
 
+	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto"
 	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
 	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models"
 	"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provisioner"
@@ -174,14 +175,24 @@ func conciergePlatformMCPEnv(env map[string]string) {
 // kind='platform' workspace (create, restart, auto-recover). It is a no-op
 // for ordinary workspaces.
 //
-// Post RFC #2843 §10a: the concierge's identity (system prompt, model,
-// runtime, MCP wiring) is delivered via the molecule-ai-workspace-template-
+// Post RFC #2843 §10a: the concierge's PROMPT + MCP-wiring identity (system
+// prompt, agent-skills) is delivered via the molecule-ai-workspace-template-
 // platform-agent template and applied like any other runtime template. This
-// hook's only remaining responsibilities are (1) inject the platform-MCP env
-// (org-admin token + platform URL + org id) and (2) the per-instance
-// {{CONCIERGE_NAME}} substitution in the delivered system-prompt.md
-// (recommended in the PR — see the PR description for the rationale vs
-// the alternative of dropping the dynamic name).
+// hook's responsibilities are (0) SEED the concierge's declared model so the
+// provision passes the universal MISSING_MODEL gate (core#2594), (1) inject the
+// platform-MCP env (org-admin token + platform URL + org id) and (2) the
+// per-instance {{CONCIERGE_NAME}} substitution in the system-prompt.md.
+//
+// IMPORTANT (incident 2026-06-15, regression from #2919): the model is NOT
+// delivered via the template. The MISSING_MODEL gate reads the stored MODEL
+// workspace_secret at provision time — BEFORE any template config.yaml is
+// fetched — so a model-less platform agent fails closed ("reached provisioning
+// with no model set"). #2919 removed the model-seed on the theory the
+// platform-agent template would supply it, but that template entry was never
+// added to the manifest. The concierge is the platform-agent PRODUCT itself, so
+// it carries an explicit declared model in code (core#2594) exactly as a
+// template declares one — this is the SSOT-correct source, not a stopgap. The
+// CI test TestApplyConciergeProvisionConfig_SeedsModel gates against re-removal.
 //
 // Returns the (possibly newly-allocated) configFiles map so the caller can
 // rebind it — configFiles is nil on the auto-restart path, where this is the
@@ -203,6 +214,14 @@ func (h *WorkspaceHandler) applyConciergeProvisionConfig(
 		return configFiles
 	}
 
+	// 0. Concierge model (core#2594). The platform agent carries an explicit,
+	//    SSOT-declared model so it never relies on a silent default. The
+	//    universal MISSING_MODEL gate (prepareProvisionContext) reads the stored
+	//    MODEL secret at provision time — before any template config.yaml is
+	//    fetched — so this MUST run here, not be deferred to the template.
+	//    Seed-only: it respects a model the customer later picked.
+	h.ensureConciergeModel(ctx, workspaceID, envVars)
+
 	// 1. Platform-MCP env (org-admin token + platform URL + org id).
 	conciergePlatformMCPEnv(envVars)
 
@@ -252,6 +271,89 @@ func substituteConciergeName(prompt []byte, name string) []byte {
 	return []byte(strings.ReplaceAll(string(prompt), conciergeNamePlaceholder, name))
 }
 
+// conciergeRuntime is the runtime the platform agent (concierge) always runs as
+// (the platform-agent image variant of 'claude-code'). conciergeDeclaredModel is
+// validated against the registry for this runtime before being seeded.
+const conciergeRuntime = "claude-code"
+
+// conciergeDeclaredModel is the platform agent's OWN declared model — a
+// first-class product decision, NOT a generic platform default. It mirrors a
+// template's runtime_config.model SSOT. The SSOT directive
+// (feedback_workspace_model_required_no_platform_default...) forbids the platform
+// from defaulting a USER workspace's model, but the concierge IS the
+// platform-agent product, so it declares its own model exactly as a template
+// declares one (core#2594).
+const conciergeDeclaredModel = "moonshot/kimi-k2.6"
+
+// ensureConciergeModel makes the platform agent's model explicit (core#2594).
+// It (1) seeds the container model env for the current provision and (2)
+// persists the MODEL workspace_secret so the read endpoint / canvas Config tab
+// surface the resolved model. The model is the concierge's declared SSOT model,
+// validated against the registry for its runtime. If validation fails (registry
+// drift — a build bug caught by the model-registry CI test), it sets NOTHING:
+// the downstream universal MISSING_MODEL gate then fails the provision CLOSED
+// rather than letting the runtime pick an opaque default.
+func (h *WorkspaceHandler) ensureConciergeModel(ctx context.Context, workspaceID string, envVars map[string]string) {
+	// SEED-ONLY (CTO 2026-06-12: customer setting > platform default; the
+	// concierge's model is changeable like any workspace, "anytime"). If a MODEL
+	// secret already exists — whether the original seed or a model the customer
+	// later picked in the canvas — RESPECT it: loadWorkspaceSecrets +
+	// applyRuntimeModelEnv have already put it in envVars, so do nothing. Only
+	// SEED the declared default when the concierge has no model at all (first
+	// boot). Re-asserting on EVERY provision would silently revert the customer's
+	// pick — exactly the platform-overriding-customer violation the SSOT
+	// directive forbids.
+	if existing := readStoredModelSecret(ctx, workspaceID); existing != "" {
+		return // explicit model already set; never overwrite the customer's choice
+	}
+
+	// First boot — no model yet. Seed the concierge's declared default, but only
+	// if it is genuinely routable for the runtime; otherwise leave it unset and
+	// let the MISSING_MODEL gate fail closed (loud) rather than seed a model the
+	// platform cannot actually route to (silent late failure).
+	model := conciergeDeclaredModel
+	if ok, why := validateRegisteredModelForRuntime(conciergeRuntime, model); !ok {
+		log.Printf("Provisioner: concierge %s declared model %q is NOT registered for runtime %q (%s) — leaving model unset; provision will fail closed", workspaceID, model, conciergeRuntime, why)
+		return
+	}
+	if ok, why := validateDerivedProviderInRegistry(conciergeRuntime, model); !ok {
+		log.Printf("Provisioner: concierge %s declared model %q has no derivable registry provider for runtime %q (%s) — leaving model unset; provision will fail closed", workspaceID, model, conciergeRuntime, why)
+		return
+	}
+
+	// Seed the container env (precedence MOLECULE_MODEL > MODEL in the runtime).
+	// applyRuntimeModelEnv already ran with an empty payload model for the
+	// concierge (no stored MODEL on first boot), so set both canonical names
+	// here so this provision actually runs the seeded default.
+	envVars["MOLECULE_MODEL"] = model
+	envVars["MODEL"] = model
+
+	// Persist so GET /workspaces/:id/model returns it (Config tab visibility) and
+	// the next provision's readStoredModelSecret takes the respect-existing path.
+	if setErr := setModelSecret(ctx, workspaceID, model); setErr != nil {
+		log.Printf("Provisioner: concierge %s persist MODEL secret failed: %v (env still seeded for this provision)", workspaceID, setErr)
+	}
+}
+
+// readStoredModelSecret returns the decrypted MODEL workspace_secret, or "" when
+// none is stored (or on any read/decrypt error — treated as "unset" so a
+// transient miss re-seeds rather than wedges). Used by ensureConciergeModel to
+// decide seed-vs-respect.
+func readStoredModelSecret(ctx context.Context, workspaceID string) string {
+	var stored []byte
+	var version int
+	if err := db.DB.QueryRowContext(ctx,
+		`SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = $1 AND key = 'MODEL'`,
+		workspaceID).Scan(&stored, &version); err != nil {
+		return ""
+	}
+	dec, err := crypto.DecryptVersioned(stored, version)
+	if err != nil {
+		return ""
+	}
+	return string(dec)
+}
+
 // EnsureSelfHostedPlatformAgent installs the org's platform agent (the concierge,
 // the org root) on a tenant that has no control plane to do it — i.e. self-hosted
 // or local. In SaaS the CP calls InstallPlatformAgent at org-provision time; this
diff --git a/workspace-server/internal/handlers/platform_agent_test.go b/workspace-server/internal/handlers/platform_agent_test.go
index 7d0edbb47..7553cd422 100644
--- a/workspace-server/internal/handlers/platform_agent_test.go
+++ b/workspace-server/internal/handlers/platform_agent_test.go
@@ -462,6 +462,12 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) {
 	t.Setenv("PLATFORM_URL", "http://platform:8080")
 	h := &WorkspaceHandler{}
 	const kindQuery = `SELECT COALESCE\(kind, 'workspace'\) FROM workspaces WHERE id =`
+	// ensureConciergeModel (step 0, platform kind only) reads the stored MODEL
+	// secret to decide seed-vs-respect. These subtests are about MCP/name, so
+	// they stub an EXISTING model → ensureConciergeModel returns early (no
+	// INSERT). The seed path itself is covered by
+	// TestApplyConciergeProvisionConfig_SeedsModel.
+	const modelSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'MODEL'`
 
 	t.Run("ordinary workspace gets NO org MCP, NO admin token, NO substitution", func(t *testing.T) {
 		mock := setupTestDB(t)
@@ -500,6 +506,9 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) {
 		mock := setupTestDB(t)
 		mock.ExpectQuery(kindQuery).WithArgs("ws-concierge").
 			WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
+		mock.ExpectQuery(modelSelQuery).WithArgs("ws-concierge").
+			WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}).
+				AddRow([]byte("moonshot/kimi-k2.6"), 0))
 		env := map[string]string{}
 		cf := map[string][]byte{
 			"config.yaml":      []byte("runtime: claude-code\nmodel: moonshot/kimi-k2.6\n"),
@@ -536,6 +545,9 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) {
 		mock := setupTestDB(t)
 		mock.ExpectQuery(kindQuery).WithArgs("ws-concierge").
 			WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
+		mock.ExpectQuery(modelSelQuery).WithArgs("ws-concierge").
+			WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}).
+				AddRow([]byte("moonshot/kimi-k2.6"), 0))
 		env := map[string]string{}
 		// Already-substituted prompt (a re-provision of a running concierge).
 		cf := map[string][]byte{
@@ -556,12 +568,108 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) {
 	})
 }
 
+// TestApplyConciergeProvisionConfig_SeedsModel is the CI regression gate for
+// the 2026-06-15 incident: #2919 removed the concierge model-seed, so every
+// fresh platform-agent provision reached the universal MISSING_MODEL gate
+// (core#2594) with no stored model and failed closed ("reached provisioning
+// with no model set"). It passed CI because no test (and no e2e) provisions a
+// fresh platform agent through the model path. This test asserts the seed
+// fires for a model-less platform agent, is SEED-ONLY (respects a customer's
+// later pick), and never touches ordinary workspaces.
+func TestApplyConciergeProvisionConfig_SeedsModel(t *testing.T) {
+	h := &WorkspaceHandler{}
+	const kindQuery = `SELECT COALESCE\(kind, 'workspace'\) FROM workspaces WHERE id =`
+	const modelSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'MODEL'`
+	const modelInsert = `INSERT INTO workspace_secrets`
+
+	t.Run("fresh platform agent with NO stored model gets the declared model seeded + persisted", func(t *testing.T) {
+		mock := setupTestDB(t)
+		mock.ExpectQuery(kindQuery).WithArgs("ws-fresh").
+			WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
+		// No MODEL row yet (first boot) → readStoredModelSecret returns "".
+		mock.ExpectQuery(modelSelQuery).WithArgs("ws-fresh").
+			WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}))
+		// Seed path must PERSIST the declared model.
+		mock.ExpectExec(modelInsert).
+			WithArgs("ws-fresh", sqlmock.AnyArg(), sqlmock.AnyArg()).
+			WillReturnResult(sqlmock.NewResult(0, 1))
+
+		env := map[string]string{}
+		h.applyConciergeProvisionConfig(context.Background(), "ws-fresh", "", nil, env, "Org Concierge")
+
+		// THE regression assertion: without this seed the provision hits
+		// MISSING_MODEL and fails closed. Both canonical env names must carry
+		// the declared model so the runtime actually boots on it this provision.
+		if env["MODEL"] != conciergeDeclaredModel {
+			t.Errorf("fresh concierge did not seed MODEL=%q; got %q (env=%v) — MISSING_MODEL would fail this provision closed", conciergeDeclaredModel, env["MODEL"], env)
+		}
+		if env["MOLECULE_MODEL"] != conciergeDeclaredModel {
+			t.Errorf("fresh concierge did not seed MOLECULE_MODEL=%q; got %q", conciergeDeclaredModel, env["MOLECULE_MODEL"])
+		}
+		if err := mock.ExpectationsWereMet(); err != nil {
+			t.Errorf("unmet sqlmock expectations (the MODEL secret was not persisted): %v", err)
+		}
+	})
+
+	t.Run("SEED-ONLY: an existing customer model is respected, never overwritten", func(t *testing.T) {
+		mock := setupTestDB(t)
+		mock.ExpectQuery(kindQuery).WithArgs("ws-picked").
+			WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
+		// Customer already picked a model — stored MODEL secret present.
+		mock.ExpectQuery(modelSelQuery).WithArgs("ws-picked").
+			WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}).
+				AddRow([]byte("anthropic:claude-opus-4-8"), 0))
+		// NO ExpectExec: ensureConciergeModel must return early (no re-seed,
+		// no INSERT) — re-asserting the default would silently revert the pick.
+
+		env := map[string]string{}
+		h.applyConciergeProvisionConfig(context.Background(), "ws-picked", "", nil, env, "Org Concierge")
+
+		if env["MODEL"] == conciergeDeclaredModel {
+			t.Errorf("seed-only violated: ensureConciergeModel overwrote the customer's model with the declared default")
+		}
+		if err := mock.ExpectationsWereMet(); err != nil {
+			t.Errorf("unmet sqlmock expectations (an unexpected INSERT means it re-seeded over the customer's pick): %v", err)
+		}
+	})
+
+	t.Run("ordinary workspace never seeds a model (no model queries at all)", func(t *testing.T) {
+		mock := setupTestDB(t)
+		mock.ExpectQuery(kindQuery).WithArgs("ws-ordinary").
+			WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
+		// No model SELECT/INSERT expected — the hook returns before step 0.
+
+		env := map[string]string{}
+		h.applyConciergeProvisionConfig(context.Background(), "ws-ordinary", "", nil, env, "Worker")
+
+		if _, ok := env["MODEL"]; ok {
+			t.Errorf("ordinary workspace had a model seeded — the concierge model-seed must be platform-kind only; env=%v", env)
+		}
+		if err := mock.ExpectationsWereMet(); err != nil {
+			t.Errorf("unmet sqlmock expectations (ordinary workspace ran a model query): %v", err)
+		}
+	})
+}
+
 // TestNoConciergeLiteralsInCore is the regression guard for the RFC #2843
-// §10a de-hardcode: the concierge's identity (system prompt, model, runtime,
-// MCP wiring) MUST live in the platform-agent template, not as Go string
-// literals in core. A future re-introduction of the consts would silently
-// regress the SSOT; this test fails on a fresh build if any of the deleted
-// identifiers reappear as bare Go references in the package source.
+// §10a de-hardcode: the concierge's PROMPT + MCP-wiring identity (system
+// prompt template, MCP-servers block, identity files) MUST live in the
+// platform-agent template, not as Go string literals in core. A future
+// re-introduction of those consts would silently regress the SSOT; this test
+// fails on a fresh build if any of the banned identifiers reappear as bare Go
+// references in the package source.
+//
+// NOTE (incident 2026-06-15): conciergeDeclaredModel is DELIBERATELY NOT
+// banned. The model is the ONE concierge identity element that legitimately
+// lives in core: the universal MISSING_MODEL gate (core#2594) reads the stored
+// MODEL secret at provision time — BEFORE any template config.yaml is fetched —
+// so the model MUST be seeded from a core-resident declared value, not deferred
+// to template delivery. #2919 wrongly lumped the model in with the prompt/MCP
+// literals and banned it here; removing the model-seed regressed every fresh
+// platform-agent provision to MISSING_MODEL fail-closed. The concierge IS the
+// platform-agent product, so it declares its own model exactly as a template
+// does (this is SSOT-correct, not a hardcoded platform default). The seeding is
+// gated by TestApplyConciergeProvisionConfig_SeedsModel.
 //
 // This is a grep over the package — brittle by design (intentionally so:
 // the concierge-literal pattern was the exact failure mode of the
@@ -572,7 +680,6 @@ func TestNoConciergeLiteralsInCore(t *testing.T) {
 		"conciergeSystemPromptTmpl",
 		"conciergeMCPServersBlock",
 		"conciergeMCPFragmentFile",
-		"conciergeDeclaredModel",
 		"conciergeIdentityFiles",
 	}
 	for _, id := range banned {
-- 
2.52.0