From de4a9f6059278aaa6d75b63c885a524afafbdf1a Mon Sep 17 00:00:00 2001 From: core-devops Date: Thu, 18 Jun 2026 18:20:15 +0000 Subject: [PATCH 1/7] fix(concierge): seed LLM_PROVIDER=platform env pin (platform_agent.go) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../internal/handlers/platform_agent.go | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/workspace-server/internal/handlers/platform_agent.go b/workspace-server/internal/handlers/platform_agent.go index 84836b32..328f1838 100644 --- a/workspace-server/internal/handlers/platform_agent.go +++ b/workspace-server/internal/handlers/platform_agent.go @@ -222,6 +222,29 @@ func (h *WorkspaceHandler) applyConciergeProvisionConfig( // Seed-only: it respects a model the customer later picked. h.ensureConciergeModel(ctx, workspaceID, envVars) + // 0b. Concierge LLM provider pin (companion to the model seed). The + // molecule-runtime wheel DERIVES a provider slug from the model id + // ("moonshot/kimi-k2.6" -> "moonshot" via _derive_provider_from_model), + // which is a model-PREFIX on the `platform` provider, NOT a provider + // NAME — so the claude-code adapter's _resolve_provider fail-closes + // ("provider='moonshot' but it is not in the providers registry") and + // the concierge boots configuration_status=not_configured: online but + // unable to run a single turn (last_outbound_at stays null). + // + // The platform-agent template config.yaml's `provider: platform` field + // does NOT fix this on SaaS: the on-box /configs/config.yaml is the + // BAKED base-image config (the box reports the base claude-code image's + // 8-entry provider registry, not the platform-agent template's 3), so + // the template `provider:` scalar never reaches the adapter. Seeding + // LLM_PROVIDER (env, highest precedence in the wheel's + // LLM_PROVIDER > YAML provider: > derive chain; injected via the + // workspace secret) is the robust pin — it survives restart and the + // regenerated config.yaml. Verified on prod: setting LLM_PROVIDER=platform + // flipped a stuck concierge from not_configured to ready + responding. + // Seed-only + gated to the platform-managed model namespace so it never + // overrides a BYOK/self-host concierge (see ensureConciergeProvider). + h.ensureConciergeProvider(ctx, workspaceID, envVars) + // 1. Platform-MCP env (org-admin token + platform URL + org id). conciergePlatformMCPEnv(envVars) @@ -354,6 +377,104 @@ func readStoredModelSecret(ctx context.Context, workspaceID string) string { return string(dec) } +// conciergeProvider is the provider-registry NAME the concierge's declared +// platform-managed model resolves to. The platform agent is always +// platform-managed (billing/audit flow through the platform LLM proxy), so the +// provider is unconditionally "platform" for the platform-managed model family. +const conciergeProvider = "platform" + +// platformManagedModelPrefix is the model-id namespace served by the platform +// LLM proxy that ALSO collides with the wheel's provider derivation (the slug +// before '/' is "moonshot", not a registry name). A concierge whose effective +// model carries this prefix MUST have its provider pinned to `platform` +// explicitly; without the pin the claude-code adapter fail-closes. Gating on +// this prefix keeps the seed from touching a BYOK/self-host concierge whose +// model resolves cleanly on its own (e.g. `sonnet` -> anthropic-oauth). +const platformManagedModelPrefix = "moonshot/" + +// ensureConciergeProvider pins the concierge's LLM provider to `platform` (core +// companion to ensureConciergeModel). It guarantees the env-level provider pin +// that the runtime needs, independent of the template config.yaml (which is NOT +// delivered to the on-box /configs — the box uses the baked base-image config). +// +// SEED-ONLY, keyed on the LLM_PROVIDER secret (NOT MODEL) so an EXISTING +// concierge that already has a MODEL secret still receives the provider pin on +// its next provision, while a provider the customer later pinned in the canvas +// (which writes LLM_PROVIDER) is respected. GATED on the effective model's +// platform-managed namespace so it never forces `platform` onto a BYOK or +// self-hosted concierge running a non-proxy model. +func (h *WorkspaceHandler) ensureConciergeProvider(ctx context.Context, workspaceID string, envVars map[string]string) { + // Respect an explicit provider already set (customer canvas pick or a prior + // seed): loadWorkspaceSecrets already injected it into envVars. Do nothing. + if existing := readStoredProviderSecret(ctx, workspaceID); existing != "" { + return + } + + // Effective model for this provision. In production envVars["MODEL"] is + // ALWAYS populated before this runs — either by loadWorkspaceSecrets + + // applyRuntimeModelEnv (an existing/customer model) or by ensureConciergeModel + // just above (the fresh-boot seed) — so reading it here is sufficient and + // avoids a redundant secret decrypt. + model := strings.TrimSpace(envVars["MODEL"]) + // Only pin when the model is in the platform-managed namespace that needs it. + // A non-platform model (e.g. `sonnet`, a BYOK `claude-…`) resolves on its + // own; forcing `platform` there would mis-route auth and break the agent. + if !strings.HasPrefix(strings.ToLower(model), platformManagedModelPrefix) { + return + } + + envVars["LLM_PROVIDER"] = conciergeProvider + if setErr := setProviderSecret(ctx, workspaceID, conciergeProvider); setErr != nil { + log.Printf("Provisioner: concierge %s persist LLM_PROVIDER secret failed: %v (env still seeded for this provision)", workspaceID, setErr) + } else { + log.Printf("Provisioner: concierge %s pinned LLM_PROVIDER=%s for platform-managed model %q", workspaceID, conciergeProvider, model) + } +} + +// readStoredProviderSecret returns the decrypted LLM_PROVIDER workspace_secret, +// or "" when none is stored (or on any read/decrypt error — treated as "unset" +// so a transient miss re-seeds rather than wedges). Mirrors readStoredModelSecret. +func readStoredProviderSecret(ctx context.Context, workspaceID string) string { + var stored []byte + var version int + if err := db.DB.QueryRowContext(ctx, + `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = $1 AND key = 'LLM_PROVIDER'`, + workspaceID).Scan(&stored, &version); err != nil { + return "" + } + dec, err := crypto.DecryptVersioned(stored, version) + if err != nil { + return "" + } + return string(dec) +} + +// setProviderSecret persists (or clears, when provider == "") the LLM_PROVIDER +// workspace_secret. Mirrors setModelSecret (secrets.go). LLM_PROVIDER is the +// provider-slug pin the molecule-runtime wheel reads at highest precedence; it +// is injected into the container as an env var by loadWorkspaceSecrets, so it +// survives restarts and the regenerated on-box config.yaml. +func setProviderSecret(ctx context.Context, workspaceID, provider string) error { + if provider == "" { + _, err := db.DB.ExecContext(ctx, + `DELETE FROM workspace_secrets WHERE workspace_id = $1 AND key = 'LLM_PROVIDER'`, + workspaceID) + return err + } + encrypted, err := crypto.Encrypt([]byte(provider)) + if err != nil { + return err + } + version := crypto.CurrentEncryptionVersion() + _, err = db.DB.ExecContext(ctx, ` + INSERT INTO workspace_secrets (workspace_id, key, encrypted_value, encryption_version) + VALUES ($1, 'LLM_PROVIDER', $2, $3) + ON CONFLICT (workspace_id, key) DO UPDATE + SET encrypted_value = $2, encryption_version = $3, updated_at = now() + `, workspaceID, encrypted, version) + return err +} + // EnsureSelfHostedPlatformAgent installs the org's platform agent (the concierge, // the org root) on a tenant that has no control plane to do it — i.e. self-hosted // or local. In SaaS the CP calls InstallPlatformAgent at org-provision time; this -- 2.52.0 From a396220096175284bebfc3f15d270a08cd7a2161 Mon Sep 17 00:00:00 2001 From: core-devops Date: Thu, 18 Jun 2026 18:20:16 +0000 Subject: [PATCH 2/7] fix(concierge): seed LLM_PROVIDER=platform env pin (platform_agent_test.go) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../internal/handlers/platform_agent_test.go | 131 +++++++++++++++++- 1 file changed, 127 insertions(+), 4 deletions(-) diff --git a/workspace-server/internal/handlers/platform_agent_test.go b/workspace-server/internal/handlers/platform_agent_test.go index 7553cd42..92ab1170 100644 --- a/workspace-server/internal/handlers/platform_agent_test.go +++ b/workspace-server/internal/handlers/platform_agent_test.go @@ -468,6 +468,13 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) { // INSERT). The seed path itself is covered by // TestApplyConciergeProvisionConfig_SeedsModel. const modelSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'MODEL'` + // ensureConciergeProvider (step 0b, platform kind only) reads the stored + // LLM_PROVIDER secret to decide seed-vs-respect. In these MCP/name subtests + // the test env carries no MODEL (loadWorkspaceSecrets is not run), so the + // provider gate (platform-managed model namespace) is not met and NO + // LLM_PROVIDER INSERT fires — only the existence SELECT. The seed itself is + // covered by TestApplyConciergeProvisionConfig_SeedsProvider. + const providerSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'LLM_PROVIDER'` t.Run("ordinary workspace gets NO org MCP, NO admin token, NO substitution", func(t *testing.T) { mock := setupTestDB(t) @@ -509,6 +516,9 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) { mock.ExpectQuery(modelSelQuery).WithArgs("ws-concierge"). WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}). AddRow([]byte("moonshot/kimi-k2.6"), 0)) + // ensureConciergeProvider existence check (env has no MODEL here → no pin). + mock.ExpectQuery(providerSelQuery).WithArgs("ws-concierge"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"})) env := map[string]string{} cf := map[string][]byte{ "config.yaml": []byte("runtime: claude-code\nmodel: moonshot/kimi-k2.6\n"), @@ -548,6 +558,8 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) { mock.ExpectQuery(modelSelQuery).WithArgs("ws-concierge"). WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}). AddRow([]byte("moonshot/kimi-k2.6"), 0)) + mock.ExpectQuery(providerSelQuery).WithArgs("ws-concierge"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"})) env := map[string]string{} // Already-substituted prompt (a re-provision of a running concierge). cf := map[string][]byte{ @@ -580,7 +592,8 @@ func TestApplyConciergeProvisionConfig_SeedsModel(t *testing.T) { h := &WorkspaceHandler{} const kindQuery = `SELECT COALESCE\(kind, 'workspace'\) FROM workspaces WHERE id =` const modelSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'MODEL'` - const modelInsert = `INSERT INTO workspace_secrets` + const providerSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'LLM_PROVIDER'` + const secretInsert = `INSERT INTO workspace_secrets` t.Run("fresh platform agent with NO stored model gets the declared model seeded + persisted", func(t *testing.T) { mock := setupTestDB(t) @@ -590,7 +603,15 @@ func TestApplyConciergeProvisionConfig_SeedsModel(t *testing.T) { mock.ExpectQuery(modelSelQuery).WithArgs("ws-fresh"). WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"})) // Seed path must PERSIST the declared model. - mock.ExpectExec(modelInsert). + mock.ExpectExec(secretInsert). + WithArgs("ws-fresh", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + // ensureConciergeProvider: no LLM_PROVIDER yet → existence SELECT empty; + // the just-seeded MODEL (moonshot/…) meets the platform namespace gate, + // so the provider pin is PERSISTED too. + mock.ExpectQuery(providerSelQuery).WithArgs("ws-fresh"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"})) + mock.ExpectExec(secretInsert). WithArgs("ws-fresh", sqlmock.AnyArg(), sqlmock.AnyArg()). WillReturnResult(sqlmock.NewResult(0, 1)) @@ -606,8 +627,13 @@ func TestApplyConciergeProvisionConfig_SeedsModel(t *testing.T) { if env["MOLECULE_MODEL"] != conciergeDeclaredModel { t.Errorf("fresh concierge did not seed MOLECULE_MODEL=%q; got %q", conciergeDeclaredModel, env["MOLECULE_MODEL"]) } + // Companion provider pin: the concierge can't run a turn without it + // (moonshot/… derives a non-registry provider name → adapter fail-closes). + if env["LLM_PROVIDER"] != conciergeProvider { + t.Errorf("fresh concierge did not seed LLM_PROVIDER=%q; got %q (env=%v) — concierge would boot not_configured", conciergeProvider, env["LLM_PROVIDER"], env) + } if err := mock.ExpectationsWereMet(); err != nil { - t.Errorf("unmet sqlmock expectations (the MODEL secret was not persisted): %v", err) + t.Errorf("unmet sqlmock expectations (MODEL or LLM_PROVIDER secret not persisted): %v", err) } }) @@ -619,8 +645,13 @@ func TestApplyConciergeProvisionConfig_SeedsModel(t *testing.T) { mock.ExpectQuery(modelSelQuery).WithArgs("ws-picked"). WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}). AddRow([]byte("anthropic:claude-opus-4-8"), 0)) - // NO ExpectExec: ensureConciergeModel must return early (no re-seed, + // NO model ExpectExec: ensureConciergeModel must return early (no re-seed, // no INSERT) — re-asserting the default would silently revert the pick. + // ensureConciergeProvider runs its existence SELECT, but the test env + // carries no MODEL and the customer's model is non-platform-namespace, so + // NO provider pin fires either. + mock.ExpectQuery(providerSelQuery).WithArgs("ws-picked"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"})) env := map[string]string{} h.applyConciergeProvisionConfig(context.Background(), "ws-picked", "", nil, env, "Org Concierge") @@ -651,6 +682,98 @@ func TestApplyConciergeProvisionConfig_SeedsModel(t *testing.T) { }) } +// TestApplyConciergeProvisionConfig_SeedsProvider is the CI regression gate for +// the concierge non-response incident (prod 2026-06-18): the concierge booted +// online but configuration_status=not_configured because the runtime wheel +// derives provider="moonshot" from the model id "moonshot/kimi-k2.6" (a +// model-PREFIX on the `platform` provider, NOT a provider NAME), and the +// claude-code adapter fail-closes. The template config.yaml `provider:` field +// does not reach the on-box config, so core MUST seed the LLM_PROVIDER env pin +// (the highest-precedence, restart-surviving signal). Verified on prod test3: +// setting LLM_PROVIDER=platform flipped not_configured → ready + responding. +func TestApplyConciergeProvisionConfig_SeedsProvider(t *testing.T) { + h := &WorkspaceHandler{} + const kindQuery = `SELECT COALESCE\(kind, 'workspace'\) FROM workspaces WHERE id =` + const modelSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'MODEL'` + const providerSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'LLM_PROVIDER'` + const secretInsert = `INSERT INTO workspace_secrets` + + t.Run("existing platform-managed concierge with NO provider gets LLM_PROVIDER=platform pinned", func(t *testing.T) { + mock := setupTestDB(t) + mock.ExpectQuery(kindQuery).WithArgs("ws-heal"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + // Existing platform model → ensureConciergeModel respects it (no INSERT). + mock.ExpectQuery(modelSelQuery).WithArgs("ws-heal"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}). + AddRow([]byte(conciergeDeclaredModel), 0)) + // No LLM_PROVIDER yet → existence SELECT empty, then PERSIST the pin. + mock.ExpectQuery(providerSelQuery).WithArgs("ws-heal"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"})) + mock.ExpectExec(secretInsert). + WithArgs("ws-heal", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // Simulate loadWorkspaceSecrets having populated MODEL into the env + // (the production precondition for an existing-model concierge). + env := map[string]string{"MODEL": conciergeDeclaredModel} + h.applyConciergeProvisionConfig(context.Background(), "ws-heal", "", nil, env, "Org Concierge") + + if env["LLM_PROVIDER"] != conciergeProvider { + t.Errorf("existing platform-managed concierge did not get LLM_PROVIDER=%q pinned; got %q (env=%v)", conciergeProvider, env["LLM_PROVIDER"], env) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations (LLM_PROVIDER pin not persisted): %v", err) + } + }) + + t.Run("SEED-ONLY: a customer-picked provider is respected, never overwritten", func(t *testing.T) { + mock := setupTestDB(t) + mock.ExpectQuery(kindQuery).WithArgs("ws-prov-picked"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + mock.ExpectQuery(modelSelQuery).WithArgs("ws-prov-picked"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}). + AddRow([]byte(conciergeDeclaredModel), 0)) + // Customer already pinned a provider in the canvas → existence SELECT + // returns it → NO INSERT (respecting the pick). + mock.ExpectQuery(providerSelQuery).WithArgs("ws-prov-picked"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}). + AddRow([]byte("anthropic-api"), 0)) + + env := map[string]string{"MODEL": conciergeDeclaredModel, "LLM_PROVIDER": "anthropic-api"} + h.applyConciergeProvisionConfig(context.Background(), "ws-prov-picked", "", nil, env, "Org Concierge") + + if env["LLM_PROVIDER"] != "anthropic-api" { + t.Errorf("seed-only violated: overwrote the customer's provider pick (got %q)", env["LLM_PROVIDER"]) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations (an unexpected INSERT means it re-pinned over the customer's pick): %v", err) + } + }) + + t.Run("non-platform model namespace does NOT get a platform provider pin", func(t *testing.T) { + mock := setupTestDB(t) + mock.ExpectQuery(kindQuery).WithArgs("ws-byok"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + mock.ExpectQuery(modelSelQuery).WithArgs("ws-byok"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}). + AddRow([]byte("sonnet"), 0)) + // Existence SELECT runs; model "sonnet" resolves on its own (anthropic- + // oauth alias), so the gate is NOT met → NO provider INSERT. + mock.ExpectQuery(providerSelQuery).WithArgs("ws-byok"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"})) + + env := map[string]string{"MODEL": "sonnet"} + h.applyConciergeProvisionConfig(context.Background(), "ws-byok", "", nil, env, "Org Concierge") + + if _, ok := env["LLM_PROVIDER"]; ok { + t.Errorf("non-platform model wrongly got LLM_PROVIDER pinned (%q) — would mis-route a BYOK/self-host concierge", env["LLM_PROVIDER"]) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } + }) +} + // TestNoConciergeLiteralsInCore is the regression guard for the RFC #2843 // §10a de-hardcode: the concierge's PROMPT + MCP-wiring identity (system // prompt template, MCP-servers block, identity files) MUST live in the -- 2.52.0 From 7c0807c7a21ca05a646125e27cd36e172b5f5d35 Mon Sep 17 00:00:00 2001 From: core-devops Date: Thu, 18 Jun 2026 19:42:27 +0000 Subject: [PATCH 3/7] test: remove stale platform-agent image drift-gate (Dockerfile moved to template repo in #3027; baked-image approach retired per rfc-platform-mcp-as-plugin) The gate read workspace-server/Dockerfile.platform-agent, deleted in #3027 when the platform-agent image build moved to molecule-ai-workspace-template-claude-code. The stale read fails Platform (Go) for ANY workspace-server PR. The SSOT-integrity concern it guarded now belongs in the template repo's CI (follow-up). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../platform_agent_image_drift_test.go | 462 ------------------ 1 file changed, 462 deletions(-) delete mode 100644 workspace-server/internal/provisioner/platform_agent_image_drift_test.go diff --git a/workspace-server/internal/provisioner/platform_agent_image_drift_test.go b/workspace-server/internal/provisioner/platform_agent_image_drift_test.go deleted file mode 100644 index 24f7caef..00000000 --- a/workspace-server/internal/provisioner/platform_agent_image_drift_test.go +++ /dev/null @@ -1,462 +0,0 @@ -package provisioner - -// platform_agent_image_drift_test.go — CI DRIFT-GATE for the -// IMAGE-BAKED platform-agent identity (RFC #2843 §10a). -// -// The IMAGE-BAKED impl (workspace-server/Dockerfile.platform-agent) -// bakes the concierge's identity (config.yaml + -// prompts/concierge.md + mcp_servers.yaml + identity-fallback.sh) -// from the platform-agent TEMPLATE REPO into the platform-agent -// image at /opt/molecule-platform-agent-template/. The driver -// hard-requirement: -// "The image-baked config.yaml + prompts/concierge.md + -// mcp_servers.yaml MUST be SOURCED FROM the platform-agent TEMPLATE -// REPO (single SSOT = PR #1's content) — NOT vendored/duplicated in -// core." -// -// A future drift — e.g., someone edits config.yaml in core, or the -// pre-clone step points at the wrong dir, or a build-arg change -// reroutes the source — would silently create a 2-SSOT situation -// (image snapshot diverges from template repo). The driver-rejected -// option (b) MINIMAL IN-CORE FALLBACK was rejected EXPLICITLY -// because of this 2-SSOT drift risk; the IMAGE-BAKED impl survives -// only because the drift-gate closes that risk. -// -// The drift-gate (this test) has TWO halves: -// -// 1. Dockerfile-side checks (ALWAYS RUN, no SSOT needed): pin the -// Dockerfile's COPY instructions, build-arg declaration, and -// destination path. Catches a regression in the Dockerfile that -// re-introduces vendored/duplicated content or breaks the build- -// arg contract. These are cheap (file-read only) and run on -// every CI lane, including pull_request where the SSOT may not -// be pre-cloned. -// -// 2. SSOT-side checks (RUN WHEN SSOT AVAILABLE): byte-equal content -// between the pre-cloned template repo and the would-be image- -// baked paths that Dockerfile COPYs. Requires the platform-agent -// template to be pre-cloned (via scripts/clone-manifest.sh from -// manifest.json's workspace_templates entry, OR the operator- -// override env var). Skipped with a t.Logf note when the SSOT -// is not available — pull_request CI doesn't pre-clone (that's -// the publish-workspace-server-image.yml workflow's job), and -// we don't want a missing pre-clone to fail this lane. -// -// How to run: `go test -run TestPlatformAgentImageDriftGate -// ./internal/provisioner/`. Set PLATFORM_AGENT_TEMPLATE_REPO_PATH -// to the pre-cloned template dir to enable the SSOT-side checks -// (the publish-workspace-server-image.yml workflow does this via -// the post-pre-clone test step). -// -// Test scope: the 4 files the Dockerfile COPYs (config.yaml, -// mcp_servers.yaml, prompts/concierge.md, identity-fallback.sh). -// A future concierge-identity change that adds a new file MUST also -// extend the expectedImageBakedFiles list here; the Dockerfile-side -// check catches the missing COPY, and the SSOT-side check (when -// run) catches the missing identity file in the template repo. - -import ( - "os" - "path/filepath" - "regexp" - "strings" - "testing" -) - -// expectedImageBakedFiles is the canonical list of files the -// IMAGE-BAKED impl bakes into the platform-agent image. The list -// MUST match Dockerfile.platform-agent's COPY instructions exactly. -// Adding a new concierge-identity file = adding it here AND in the -// Dockerfile; the test fails if either side drifts. -// -// Paths are RELATIVE to the SSOT root (the platform-agent template -// repo). The Dockerfile's PLATFORM_AGENT_TEMPLATE_DIR build-arg -// points at this same root. -// -// The "identity-fallback.sh" entry is the boot-time per-file copy -// script (template-platform-agent #2, copied into the image and -// invoked from the platform-agent entrypoint). It's a 1st-class -// IMAGE-BAKED asset (NOT metadata / not a future change) — the -// runtime /opt→/configs fallback (workspace-runtime PR #141 -// load_config) and the boot-time /opt→/configs fallback (this -// Dockerfile's entrypoint) are complementary, and BOTH need the -// image-baked copy at /opt/.../identity-fallback.sh in the build -// to close the self-host + pre-#29-bootstrap window. Listed here -// so the SSOT-side check rejects a template-repo that ships the -// script (correctly, in the platform-agent template) without the -// matching Dockerfile COPY (regression). -var expectedImageBakedFiles = []string{ - "config.yaml", - "mcp_servers.yaml", - "prompts/concierge.md", - "identity-fallback.sh", -} - -// isConciergeIdentityPath reports whether a path in the platform-agent -// template repo is part of the concierge's IDENTITY (the set of -// files the IMAGE-BAKED impl should COPY into the image). A file -// outside this namespace (e.g. README.md, .gitignore) is -// documentation / metadata and is correctly EXCLUDED from the -// image-baked content. -// -// Namespace mirrors the template-asset allowlist in -// internal/provisioner/template_assets.go (IsCPTemplateAssetPath): -// - "config.yaml" — runtime entrypoint config -// - "mcp_servers.yaml" — MCP wiring (overlay) -// - "prompts/*" — system prompts -// - "identity-fallback.sh" — boot-time /opt→/configs copy script -// (template-platform-agent #2, invoked -// from the platform-agent entrypoint) -// -// A future RFC that adds a new namespace (e.g. "hooks/*") MUST -// extend this function AND the Dockerfile AND expectedImageBakedFiles -// in lockstep. The drift-gate's value is in the lockstep invariant. -func isConciergeIdentityPath(rel string) bool { - rel = filepath.ToSlash(filepath.Clean(rel)) - return rel == "config.yaml" || - rel == "mcp_servers.yaml" || - rel == "identity-fallback.sh" || - strings.HasPrefix(rel, "prompts/") -} - -// canonicalPlatformAgentSSOTRelPath is the default SSOT path the -// drift-gate reads from when PLATFORM_AGENT_TEMPLATE_REPO_PATH is -// unset, RELATIVE TO THE REPO ROOT. It mirrors Dockerfile.platform- -// agent's default PLATFORM_AGENT_TEMPLATE_DIR build-arg (i.e. where -// scripts/clone-manifest.sh places the platform-agent template repo -// after the pre-clone step in publish-workspace-server-image.yml). -// -// The env-var override exists for operators running the test -// outside the canonical CI context (e.g. an ad-hoc build verifying -// the drift-gate against a custom template mirror). When the env -// var is set, the test uses that path verbatim; otherwise it walks -// up from the test's CWD to find the repo root and resolves the -// canonical path from there. -// -// The drift-gate is CWD-AGNOSTIC: the test runs from the package -// dir (workspace-server/internal/provisioner/) which is two levels -// below the repo root, so the walk-up is necessary. This is the -// standard pattern for Go tests that need a repo-rooted fixture. -const canonicalPlatformAgentSSOTRelPath = ".tenant-bundle-deps/workspace-configs-templates/platform-agent" - -// repoRoot walks up from the test's CWD until it finds the -// molecule-core repo root (identified by go.mod at workspace-server/ -// go.mod or by the presence of manifest.json — the molecule-core -// root marker). Returns the absolute path to the repo root. -// -// Used by the drift-gate to resolve canonicalPlatformAgentSSOTRelPath -// to an absolute path regardless of where the test was invoked -// from. Bounded walk-up (max 10 levels) prevents an infinite loop -// if the test somehow runs from a path that doesn't contain a -// molecule-core repo above it. -func repoRoot(t *testing.T) string { - t.Helper() - wd, err := os.Getwd() - if err != nil { - t.Fatalf("getwd: %v", err) - } - dir := wd - for i := 0; i < 10; i++ { - // The canonical repo-root marker: manifest.json (present - // only at the molecule-core repo root, not in any submodule - // or vendored copy). workspace-server/go.mod is a weaker - // signal — it's also present in nested test fixtures. - if _, err := os.Stat(filepath.Join(dir, "manifest.json")); err == nil { - return dir - } - parent := filepath.Dir(dir) - if parent == dir { - break - } - dir = parent - } - t.Fatalf("could not locate molecule-core repo root from CWD %q (walked up 10 levels; expected manifest.json in some ancestor)", wd) - return "" -} - -// resolveSSOTRoot returns the absolute path to the platform-agent -// template SSOT. The order is: (1) $PLATFORM_AGENT_TEMPLATE_REPO_PATH -// (operator override), (2) canonical CI path (canonicalPlatformAgentSSOTRelPath -// resolved against repoRoot). Returns "" if neither resolves; the -// caller treats that as "SSOT not available, skip SSOT-side checks". -// -// A nil error with a non-empty path means the path EXISTS and is -// readable. A non-nil error means the path doesn't exist (caller -// may choose to skip or fail depending on lane). We deliberately do -// NOT fatal here — the split-half design lets the test run Dockerfile- -// only checks when the SSOT is unavailable. -func resolveSSOTRoot(t *testing.T) (path string, available bool) { - t.Helper() - ssotRoot := os.Getenv("PLATFORM_AGENT_TEMPLATE_REPO_PATH") - if ssotRoot == "" { - ssotRoot = filepath.Join(repoRoot(t), canonicalPlatformAgentSSOTRelPath) - } - if _, err := os.Stat(ssotRoot); err != nil { - return "", false - } - return ssotRoot, true -} - -// TestPlatformAgentImageDriftGate pins the IMAGE-BAKED ↔ template -// SSOT invariant. The test has TWO halves: -// -// 1. Dockerfile-side checks (ALWAYS RUN, even without SSOT): -// pins Dockerfile COPY instructions + build-arg + destination -// path. Catches any regression in the Dockerfile that -// re-introduces vendored/duplicated content or breaks the -// build-arg contract. These run on every CI lane, including -// pull_request. -// -// 2. SSOT-side checks (RUN WHEN SSOT AVAILABLE): byte-equal -// content between the pre-cloned template repo and the -// would-be image-baked paths. Requires the platform-agent -// template to be pre-cloned (via scripts/clone-manifest.sh -// from manifest.json's workspace_templates entry, OR the -// operator-override env var). Skipped with a t.Logf note -// when the SSOT is not available — pull_request CI doesn't -// pre-clone (that's the publish-workspace-server-image.yml -// workflow's job), and we don't want a missing pre-clone -// to fail this lane. -// -// This split-half design lets the test serve as BOTH: -// - a CHEAP Dockerfile-shape gate that runs on every PR (catches -// "someone vendored the config into core"); AND -// - a FULL SSOT-content gate that runs on the publish workflow -// (catches "image-baked content drifted from template repo"). -func TestPlatformAgentImageDriftGate(t *testing.T) { - // === Half 1: Dockerfile-side checks (always run) === - - dockerfilePath := filepath.Join("..", "..", "Dockerfile.platform-agent") - dockerfile, err := os.ReadFile(dockerfilePath) - if err != nil { - t.Fatalf("read %s: %v — the drift-gate requires Dockerfile.platform-agent to live next to the other Dockerfiles; verify the path", dockerfilePath, err) - } - dockerfileStr := string(dockerfile) - - for _, rel := range expectedImageBakedFiles { - // The Dockerfile uses two patterns: COPY /opt/... - // for the top-level files (config.yaml, mcp_servers.yaml, - // identity-fallback.sh) and COPY / /opt/.../ for the - // prompts/ directory. We check that EITHER pattern appears - // for the expected file. - // - // COPY may carry build-flags between the verb and the source - // arg — e.g. `COPY --chmod=0755 ${PLATFORM_AGENT_TEMPLATE_DIR}/ - // identity-fallback.sh ...` (e4efc35d switched identity- - // fallback.sh from `RUN chmod` to `COPY --chmod` because the - // non-root tenant base can't `RUN chmod`). The matcher must - // tolerate any such `--flag[=value]` tokens; a literal-substring - // match on `COPY ${...}/` would false-fail the drift-gate the - // moment a COPY grows a flag. Match `COPY` + optional flags + - // the source path via regex (whitespace-flexible). - quotedDir := regexp.QuoteMeta(`${PLATFORM_AGENT_TEMPLATE_DIR}/`) - copyFlags := `(?:\s+--\S+)*` // zero or more `--flag[=val]` tokens - topLevel := regexp.MustCompile(`COPY` + copyFlags + `\s+` + quotedDir + regexp.QuoteMeta(rel) + `\b`) - dirPattern := regexp.MustCompile(`COPY` + copyFlags + `\s+` + quotedDir + regexp.QuoteMeta(filepath.Dir(rel)) + `/`) - if !topLevel.MatchString(dockerfileStr) && !dirPattern.MatchString(dockerfileStr) { - t.Errorf("Dockerfile COPY missing: %s — the IMAGE-BAKED impl must COPY %s from the platform-agent template SSOT; if a new identity file is added, update Dockerfile.platform-agent AND expectedImageBakedFiles", rel, rel) - } - } - - // ALSO verify the Dockerfile references the build-arg + the - // destination path. A future refactor that changes either of - // these would silently break the SSOT contract; the test pins - // the names that the workspace-server's runtime fallback (and - // any operator inspecting the image) relies on. - if !strings.Contains(dockerfileStr, "ARG PLATFORM_AGENT_TEMPLATE_DIR=") { - t.Error("Dockerfile.platform-agent is missing the PLATFORM_AGENT_TEMPLATE_DIR build-arg declaration — the IMAGE-BAKED impl requires this arg to source from the pre-cloned template repo") - } - if !strings.Contains(dockerfileStr, "/opt/molecule-platform-agent-template/") { - t.Error("Dockerfile.platform-agent is missing the /opt/molecule-platform-agent-template/ destination path — the workspace-server runtime fallback (and the drift-gate convention) pins this path; a change requires a coordinated update in both places") - } - - // === Half 2: SSOT-side checks (conditional on SSOT availability) === - - ssotRoot, available := resolveSSOTRoot(t) - if !available { - // SSOT not pre-cloned (typical for pull_request CI). Run - // the Dockerfile-side checks only; the SSOT-side checks - // will run on the publish-workspace-server-image.yml - // workflow which pre-clones via scripts/clone-manifest.sh. - t.Logf("platform-agent template SSOT not available at canonical CI path (PLATFORM_AGENT_TEMPLATE_REPO_PATH unset, .tenant-bundle-deps/workspace-configs-templates/platform-agent missing). Dockerfile-side checks ran; SSOT-side checks SKIPPED. Set PLATFORM_AGENT_TEMPLATE_REPO_PATH to the pre-cloned template dir to enable the full gate (the publish-workspace-server-image.yml workflow does this via the post-pre-clone test step).") - return - } - - // SSOT-side: each expected file MUST exist at ssotRoot/ - // and have non-zero content (zero-byte file = silent miss). - for _, rel := range expectedImageBakedFiles { - ssotPath := filepath.Join(ssotRoot, rel) - data, err := os.ReadFile(ssotPath) - if err != nil { - t.Errorf("SSOT missing: %s (read: %v) — the platform-agent template repo is the load-bearing identity SSOT; a missing file is a regression", ssotPath, err) - continue - } - if len(data) == 0 { - t.Errorf("SSOT empty: %s — zero-byte identity file would silently bake a broken concierge into the image", ssotPath) - } - } - - // SSOT-side: scan the platform-agent template repo for any - // additional files in the concierge-identity namespace (e.g. - // prompts/foo.md) that the Dockerfile might be missing. The - // forward-direction check (above) catches a missing expected - // file; this REVERSE check catches an un-expected new identity - // file the Dockerfile doesn't COPY. Both must hold for the - // image-baked content to remain SSOT-equal. - extraIdentityFiles, err := scanConciergeIdentityFiles(ssotRoot) - if err != nil { - t.Errorf("scan SSOT identity files: %v", err) - } else { - for _, rel := range extraIdentityFiles { - found := false - for _, expected := range expectedImageBakedFiles { - if rel == expected { - found = true - break - } - } - if !found { - t.Errorf("SSOT has an un-baked concierge-identity file: %s — the IMAGE-BAKED impl is now SILENTLY DRIFTING from the SSOT (a new file was added to the platform-agent template repo without a matching COPY in Dockerfile.platform-agent + entry in expectedImageBakedFiles). Either bake it (update Dockerfile + expected list) or mark it non-identity.", rel) - } - } - } -} - -// TestPlatformAgentEntrypointWiring pins the boot-time identity- -// fallback wiring. The IMAGE_BAKED_IDENTITY_PRESENT echo-marker -// that the #2919 PR shipped was a log line that did nothing — a -// partial-template / no-fetch self-host concierge would still -// MISSING_MODEL fail at runtime because /configs would be empty -// even though /opt/molecule-platform-agent-template/ had the -// content. This test pins the WIRE-UP shape that closes the gap: -// -// 1. Dockerfile.platform-agent defines a /entrypoint-platform-agent.sh -// heredoc that invokes identity-fallback.sh BEFORE handing off -// to /entrypoint.sh (the base image's entrypoint). The -// identity-fallback.sh script is the WORKING /opt→/configs -// fill-absent-only copy from template-platform-agent #2. -// 2. The Dockerfile's ENTRYPOINT directive points at the new -// /entrypoint-platform-agent.sh (NOT the base image's -// /entrypoint.sh). Otherwise the wiring is dormant — the -// fallback would never fire. -// 3. The IMAGE_BAKED_IDENTITY_PRESENT echo-only marker is GONE. -// A regression that re-adds the echo marker would re-introduce -// the dormant-fallback bug (script exists but never runs). -// -// Why pin the wiring here (not in a shell-script test): the -// Dockerfile is the source-of-truth for the IMAGE-BAKED impl, and -// the drift-gate already pins the Dockerfile's other shape -// invariants (COPY lines, build-arg, destination path). Adding -// entrypoint-wiring pins to the same file keeps the IMAGE-BAKED -// image contract in a single test surface — operators / reviewers -// reading TestPlatformAgentImageDriftGate see the full contract -// (data + activation), not just the COPY instructions. -// -// A future change that moves the entrypoint to a different -// filename / different invocation order must update this test -// in lockstep. The shape (identity-fallback.sh + /entrypoint.sh -// handoff) is the load-bearing part; the names are conventions. -func TestPlatformAgentEntrypointWiring(t *testing.T) { - dockerfilePath := filepath.Join("..", "..", "Dockerfile.platform-agent") - dockerfile, err := os.ReadFile(dockerfilePath) - if err != nil { - t.Fatalf("read %s: %v", dockerfilePath, err) - } - dockerfileStr := string(dockerfile) - - // 1. Heredoc-defined entrypoint-platform-agent.sh: must exist, - // must invoke identity-fallback.sh, must hand off to - // /entrypoint.sh (the base image's entrypoint). - if !strings.Contains(dockerfileStr, "/entrypoint-platform-agent.sh") { - t.Errorf("Dockerfile.platform-agent is missing /entrypoint-platform-agent.sh — the platform-agent entrypoint is the load-bearing wire-up that activates the /opt→/configs fallback at boot") - } - if !strings.Contains(dockerfileStr, "identity-fallback.sh") { - t.Errorf("Dockerfile.platform-agent does not reference identity-fallback.sh — the boot-time /opt→/configs fill-absent-only copy script (template-platform-agent #2) is the WORKING fallback that replaces the IMAGE_BAKED_IDENTITY_PRESENT echo-only marker") - } - // The hand-off: the new entrypoint must exec /entrypoint.sh - // (the base image's entrypoint) with the CMD args. A regression - // that omits the hand-off would skip the docker-socket group - // setup + memory-plugin sidecar + su-exec /platform boot. - if !strings.Contains(dockerfileStr, "exec /entrypoint.sh \"$@\"") { - t.Errorf("Dockerfile.platform-agent entrypoint does not exec /entrypoint.sh \"$@\" — the platform-agent entrypoint must hand off to the base image's entrypoint (docker-socket group setup, memory-plugin sidecar, su-exec /platform); a regression here would skip the base-image boot") - } - - // 2. ENTRYPOINT directive: must point at the new entrypoint - // (NOT the base /entrypoint.sh). The default ENTRYPOINT - // (inherited from the base image) is /entrypoint.sh; a - // regression that omits the override would activate the - // identity-fallback.sh script via COPY but never invoke - // it at boot — the dormant-fallback bug. - if !strings.Contains(dockerfileStr, `ENTRYPOINT ["/entrypoint-platform-agent.sh"]`) { - t.Errorf(`Dockerfile.platform-agent is missing ENTRYPOINT ["/entrypoint-platform-agent.sh"] — the platform-agent entrypoint override is what activates the identity-fallback at boot; without it the script is COPY'd into the image but never runs`) - } - - // 3. The IMAGE_BAKED_IDENTITY_PRESENT echo-only marker MUST - // be GONE. The marker was a no-op log line that did nothing; - // re-introducing it would either (a) replace the - // identity-fallback.sh COPY (regression — fallback never - // fires) or (b) coexist with the script (which is fine but - // leaves a confusing dead file at /opt/.../IMAGE_BAKED_ - // IDENTITY_PRESENT). Either way it's a regression marker. - // - // Pin pattern: a non-comment line that creates the marker - // file (the original #2919 PR's `RUN echo ... > ...IMAGE_BAKED - // _IDENTITY_PRESENT` heredoc). A comment that mentions the - // marker name is fine (documentation); a creation line is a - // regression. The check requires the marker name to be on a - // line that ALSO contains a shell-creating token (`>`, `tee`, - // `cp`, or the start of a `RUN` directive with a heredoc) — - // this is intentionally a coarse heuristic, not a full - // Dockerfile parser, but it's tight enough to catch the - // regression while not flagging the explanatory comment. - markerCreationRegex := regexp.MustCompile(`(?m)^[^#]*IMAGE_BAKED_IDENTITY_PRESENT[^#]*(>|tee |cp |<<)`) - if markerCreationRegex.MatchString(dockerfileStr) { - t.Errorf("Dockerfile.platform-agent still creates the IMAGE_BAKED_IDENTITY_PRESENT echo-only marker — the marker was a no-op log line that did nothing; the identity-fallback.sh script (template-platform-agent #2) is the real working fallback. The marker creation line must be removed when the script is wired in.") - } -} - -// scanConciergeIdentityFiles walks the platform-agent template repo -// and returns the RELATIVE paths of every file in the concierge- -// identity namespace (config.yaml + mcp_servers.yaml + -// identity-fallback.sh + prompts/). Non-identity files (README, -// .gitignore, etc.) are filtered out. -// -// Errors are returned for filesystem-walk failures; the caller turns -// them into a t.Errorf (so other checks still run). The walk is -// deliberately non-recursive beyond the namespace prefix — the -// concierge's identity is config + mcp + fallback-script + prompts, -// nothing nested. -func scanConciergeIdentityFiles(ssotRoot string) ([]string, error) { - var identity []string - entries, err := os.ReadDir(ssotRoot) - if err != nil { - return nil, err - } - for _, e := range entries { - // Top-level files: config.yaml, mcp_servers.yaml, - // identity-fallback.sh - if !e.IsDir() { - if isConciergeIdentityPath(e.Name()) { - identity = append(identity, e.Name()) - } - continue - } - // Directories: scan prompts/ - if e.Name() == "prompts" { - promptEntries, err := os.ReadDir(filepath.Join(ssotRoot, e.Name())) - if err != nil { - return nil, err - } - for _, pe := range promptEntries { - if pe.IsDir() { - continue - } - rel := filepath.ToSlash(filepath.Join(e.Name(), pe.Name())) - if isConciergeIdentityPath(rel) { - identity = append(identity, rel) - } - } - } - } - return identity, nil -} -- 2.52.0 From 67ed46e72773d0a2ef294a2f4941fbbac8ea36a5 Mon Sep 17 00:00:00 2001 From: core-devops Date: Thu, 18 Jun 2026 19:53:52 +0000 Subject: [PATCH 4/7] feat(concierge): declare molecule-platform-mcp plugin + entitlement gate (platform_agent.go) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../internal/handlers/platform_agent.go | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/workspace-server/internal/handlers/platform_agent.go b/workspace-server/internal/handlers/platform_agent.go index 328f1838..74126655 100644 --- a/workspace-server/internal/handlers/platform_agent.go +++ b/workspace-server/internal/handlers/platform_agent.go @@ -245,6 +245,21 @@ func (h *WorkspaceHandler) applyConciergeProvisionConfig( // overrides a BYOK/self-host concierge (see ensureConciergeProvider). h.ensureConciergeProvider(ctx, workspaceID, envVars) + // 0c. Declare the concierge's management MCP as a PLUGIN (RFC: + // rfc-platform-mcp-as-plugin). The asset-channel mcp_servers.yaml does NOT + // reach the on-box /configs (the box runs the baked base-image config), so + // the concierge boots with no management MCP — generic Claude Code, no + // create_workspace. Routing it through the plugin channel (the path that + // reliably delivers skills) fixes that: declare it here so the post-online + // reconcile + boot-install wire molecule-platform-mcp via MCPServerAdaptor. + // This declaration runs ONLY on the kind=platform concierge (this function + // is kind-gated) → it is the primary entitlement gate for the privileged + // org-admin MCP; recordDeclaredPlugin fail-closes the same name for any + // non-platform workspace as defense-in-depth. Idempotent (upsert). + if rec, skip := seedTemplatePlugins(ctx, workspaceID, []string{conciergePlatformMCPPlugin}); skip > 0 { + log.Printf("Provisioner: concierge %s could not declare %q plugin (recorded=%d skipped=%d) — management MCP may be absent until next provision", workspaceID, conciergePlatformMCPPlugin, rec, skip) + } + // 1. Platform-MCP env (org-admin token + platform URL + org id). conciergePlatformMCPEnv(envVars) @@ -392,6 +407,18 @@ const conciergeProvider = "platform" // model resolves cleanly on its own (e.g. `sonnet` -> anthropic-oauth). const platformManagedModelPrefix = "moonshot/" +// conciergePlatformMCPPlugin is the management-MCP plugin the concierge declares +// (repo molecule-ai-plugin-molecule-platform-mcp). It wires the `molecule-mcp` +// server (MOLECULE_MCP_MODE=management — create_workspace, list_workspaces, …) +// into the Claude Code runtime via the plugin channel's MCPServerAdaptor, +// replacing the baked-image + asset-channel mcp_servers.yaml path that does NOT +// reach the on-box config (RFC: rfc-platform-mcp-as-plugin). Declaring it here — +// from the kind=platform-only applyConciergeProvisionConfig — IS the primary +// entitlement gate (no user workspace runs this path); recordDeclaredPlugin adds +// a defense-in-depth refusal for this PRIVILEGED name on any non-platform +// workspace. The post-online reconcile + boot-install then install it. +const conciergePlatformMCPPlugin = "molecule-platform-mcp" + // ensureConciergeProvider pins the concierge's LLM provider to `platform` (core // companion to ensureConciergeModel). It guarantees the env-level provider pin // that the runtime needs, independent of the template config.yaml (which is NOT -- 2.52.0 From f2c5f3e534171ee0b1a8e563d10e8bcfc3ec7815 Mon Sep 17 00:00:00 2001 From: core-devops Date: Thu, 18 Jun 2026 19:53:53 +0000 Subject: [PATCH 5/7] feat(concierge): declare molecule-platform-mcp plugin + entitlement gate (plugins_tracking.go) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../internal/handlers/plugins_tracking.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/workspace-server/internal/handlers/plugins_tracking.go b/workspace-server/internal/handlers/plugins_tracking.go index 5d1b3e02..b1f1f595 100644 --- a/workspace-server/internal/handlers/plugins_tracking.go +++ b/workspace-server/internal/handlers/plugins_tracking.go @@ -16,6 +16,7 @@ import ( "strings" "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db" + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models" ) // trackedRefValues is the closed set of bare-string values the @@ -103,6 +104,24 @@ func recordDeclaredPlugin(ctx context.Context, workspaceID, pluginName, sourceRa if db.DB == nil { return nil // nil in unit tests; declaration is test-only there } + // Entitlement gate (defense-in-depth) for the PRIVILEGED org-management MCP + // plugin. It carries the org-admin tool surface (create_workspace, …), so it + // may be declared ONLY on the org-root kind='platform' concierge. Core + // declares it exactly once, from the kind-gated applyConciergeProvisionConfig; + // this is the single chokepoint EVERY declaration path flows through (template + // seed, org_import, a user-authored workspace.yaml), so refusing it here for a + // non-platform workspace closes the privilege-escalation vector regardless of + // declaration source. Fail-closed on a kind read error. + if pluginName == conciergePlatformMCPPlugin { + var kind string + if err := db.DB.QueryRowContext(ctx, + `SELECT COALESCE(kind, 'workspace') FROM workspaces WHERE id = $1`, workspaceID).Scan(&kind); err != nil { + return fmt.Errorf("recordDeclaredPlugin: kind precheck for privileged plugin %q on %s: %w", pluginName, workspaceID, err) + } + if kind != models.KindPlatform { + return fmt.Errorf("recordDeclaredPlugin: refusing to declare privileged plugin %q on non-platform workspace %s (kind=%s)", pluginName, workspaceID, kind) + } + } _, err := db.DB.ExecContext(ctx, ` INSERT INTO workspace_declared_plugins (workspace_id, plugin_name, source_raw) VALUES ($1, $2, $3) -- 2.52.0 From cfcc03753ad7a96bbbf40f862e4e5ecec3509557 Mon Sep 17 00:00:00 2001 From: core-devops Date: Thu, 18 Jun 2026 19:53:54 +0000 Subject: [PATCH 6/7] feat(concierge): declare molecule-platform-mcp plugin + entitlement gate (platform_agent_test.go) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../internal/handlers/platform_agent_test.go | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/workspace-server/internal/handlers/platform_agent_test.go b/workspace-server/internal/handlers/platform_agent_test.go index 92ab1170..53e0b0dc 100644 --- a/workspace-server/internal/handlers/platform_agent_test.go +++ b/workspace-server/internal/handlers/platform_agent_test.go @@ -475,6 +475,7 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) { // LLM_PROVIDER INSERT fires — only the existence SELECT. The seed itself is // covered by TestApplyConciergeProvisionConfig_SeedsProvider. const providerSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'LLM_PROVIDER'` + const declaredInsert = `INSERT INTO workspace_declared_plugins` t.Run("ordinary workspace gets NO org MCP, NO admin token, NO substitution", func(t *testing.T) { mock := setupTestDB(t) @@ -519,6 +520,12 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) { // ensureConciergeProvider existence check (env has no MODEL here → no pin). mock.ExpectQuery(providerSelQuery).WithArgs("ws-concierge"). WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"})) + // recordDeclaredPlugin: privileged-plugin kind precheck (→platform) + declared INSERT. + mock.ExpectQuery(kindQuery).WithArgs("ws-concierge"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + mock.ExpectExec(declaredInsert). + WithArgs("ws-concierge", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) env := map[string]string{} cf := map[string][]byte{ "config.yaml": []byte("runtime: claude-code\nmodel: moonshot/kimi-k2.6\n"), @@ -560,6 +567,12 @@ func TestApplyConciergeProvisionConfig_OnlyPlatformGetsOrgMCP(t *testing.T) { AddRow([]byte("moonshot/kimi-k2.6"), 0)) mock.ExpectQuery(providerSelQuery).WithArgs("ws-concierge"). WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"})) + // recordDeclaredPlugin: privileged-plugin kind precheck (→platform) + declared INSERT. + mock.ExpectQuery(kindQuery).WithArgs("ws-concierge"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + mock.ExpectExec(declaredInsert). + WithArgs("ws-concierge", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) env := map[string]string{} // Already-substituted prompt (a re-provision of a running concierge). cf := map[string][]byte{ @@ -593,6 +606,7 @@ func TestApplyConciergeProvisionConfig_SeedsModel(t *testing.T) { const kindQuery = `SELECT COALESCE\(kind, 'workspace'\) FROM workspaces WHERE id =` const modelSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'MODEL'` const providerSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'LLM_PROVIDER'` + const declaredInsert = `INSERT INTO workspace_declared_plugins` const secretInsert = `INSERT INTO workspace_secrets` t.Run("fresh platform agent with NO stored model gets the declared model seeded + persisted", func(t *testing.T) { @@ -615,6 +629,12 @@ func TestApplyConciergeProvisionConfig_SeedsModel(t *testing.T) { WithArgs("ws-fresh", sqlmock.AnyArg(), sqlmock.AnyArg()). WillReturnResult(sqlmock.NewResult(0, 1)) + // recordDeclaredPlugin: privileged-plugin kind precheck (→platform) + declared INSERT. + mock.ExpectQuery(kindQuery).WithArgs("ws-fresh"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + mock.ExpectExec(declaredInsert). + WithArgs("ws-fresh", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) env := map[string]string{} h.applyConciergeProvisionConfig(context.Background(), "ws-fresh", "", nil, env, "Org Concierge") @@ -653,6 +673,12 @@ func TestApplyConciergeProvisionConfig_SeedsModel(t *testing.T) { mock.ExpectQuery(providerSelQuery).WithArgs("ws-picked"). WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"})) + // recordDeclaredPlugin: privileged-plugin kind precheck (→platform) + declared INSERT. + mock.ExpectQuery(kindQuery).WithArgs("ws-picked"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + mock.ExpectExec(declaredInsert). + WithArgs("ws-picked", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) env := map[string]string{} h.applyConciergeProvisionConfig(context.Background(), "ws-picked", "", nil, env, "Org Concierge") @@ -696,6 +722,7 @@ func TestApplyConciergeProvisionConfig_SeedsProvider(t *testing.T) { const kindQuery = `SELECT COALESCE\(kind, 'workspace'\) FROM workspaces WHERE id =` const modelSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'MODEL'` const providerSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'LLM_PROVIDER'` + const declaredInsert = `INSERT INTO workspace_declared_plugins` const secretInsert = `INSERT INTO workspace_secrets` t.Run("existing platform-managed concierge with NO provider gets LLM_PROVIDER=platform pinned", func(t *testing.T) { @@ -715,6 +742,12 @@ func TestApplyConciergeProvisionConfig_SeedsProvider(t *testing.T) { // Simulate loadWorkspaceSecrets having populated MODEL into the env // (the production precondition for an existing-model concierge). + // recordDeclaredPlugin: privileged-plugin kind precheck (→platform) + declared INSERT. + mock.ExpectQuery(kindQuery).WithArgs("ws-heal"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + mock.ExpectExec(declaredInsert). + WithArgs("ws-heal", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) env := map[string]string{"MODEL": conciergeDeclaredModel} h.applyConciergeProvisionConfig(context.Background(), "ws-heal", "", nil, env, "Org Concierge") @@ -739,6 +772,12 @@ func TestApplyConciergeProvisionConfig_SeedsProvider(t *testing.T) { WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}). AddRow([]byte("anthropic-api"), 0)) + // recordDeclaredPlugin: privileged-plugin kind precheck (→platform) + declared INSERT. + mock.ExpectQuery(kindQuery).WithArgs("ws-prov-picked"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + mock.ExpectExec(declaredInsert). + WithArgs("ws-prov-picked", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) env := map[string]string{"MODEL": conciergeDeclaredModel, "LLM_PROVIDER": "anthropic-api"} h.applyConciergeProvisionConfig(context.Background(), "ws-prov-picked", "", nil, env, "Org Concierge") @@ -762,6 +801,12 @@ func TestApplyConciergeProvisionConfig_SeedsProvider(t *testing.T) { mock.ExpectQuery(providerSelQuery).WithArgs("ws-byok"). WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"})) + // recordDeclaredPlugin: privileged-plugin kind precheck (→platform) + declared INSERT. + mock.ExpectQuery(kindQuery).WithArgs("ws-byok"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + mock.ExpectExec(declaredInsert). + WithArgs("ws-byok", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) env := map[string]string{"MODEL": "sonnet"} h.applyConciergeProvisionConfig(context.Background(), "ws-byok", "", nil, env, "Org Concierge") @@ -874,3 +919,59 @@ func TestDefaultCreateParentID(t *testing.T) { } }) } + +// TestRecordDeclaredPlugin_PrivilegedPluginEntitlement is the security gate for +// the org-management MCP plugin (RFC: rfc-platform-mcp-as-plugin). The privileged +// plugin carries the org-admin tool surface, so recordDeclaredPlugin — the single +// chokepoint every declaration path flows through — must REFUSE it for any +// non-platform workspace, regardless of how the declaration was sourced (template +// seed, org_import, or a user-authored workspace.yaml). This closes the +// privilege-escalation vector where a user workspace lists the plugin to mint +// itself org-admin tools. +func TestRecordDeclaredPlugin_PrivilegedPluginEntitlement(t *testing.T) { + const kindQuery = `SELECT COALESCE\(kind, 'workspace'\) FROM workspaces WHERE id =` + const declaredInsert = `INSERT INTO workspace_declared_plugins` + + t.Run("platform concierge MAY declare the privileged management MCP", func(t *testing.T) { + mock := setupTestDB(t) + mock.ExpectQuery(kindQuery).WithArgs("ws-concierge"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + mock.ExpectExec(declaredInsert). + WithArgs("ws-concierge", conciergePlatformMCPPlugin, sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + if err := recordDeclaredPlugin(context.Background(), "ws-concierge", conciergePlatformMCPPlugin, conciergePlatformMCPPlugin); err != nil { + t.Fatalf("platform concierge declaration of the management MCP must succeed: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } + }) + + t.Run("non-platform workspace is REFUSED — no INSERT (privilege-escalation guard)", func(t *testing.T) { + mock := setupTestDB(t) + mock.ExpectQuery(kindQuery).WithArgs("ws-user"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace")) + // NO ExpectExec: the gate MUST refuse before any INSERT fires. + err := recordDeclaredPlugin(context.Background(), "ws-user", conciergePlatformMCPPlugin, conciergePlatformMCPPlugin) + if err == nil { + t.Fatal("a non-platform workspace MUST NOT be able to declare the privileged management MCP plugin") + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations (an INSERT fired — that is the privilege escalation this gate must stop): %v", err) + } + }) + + t.Run("an ordinary plugin skips the kind precheck entirely (no extra query)", func(t *testing.T) { + mock := setupTestDB(t) + // No kind precheck for non-privileged names — straight to the upsert. + mock.ExpectExec(declaredInsert). + WithArgs("ws-user", "browser-automation", sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + if err := recordDeclaredPlugin(context.Background(), "ws-user", "browser-automation", "browser-automation"); err != nil { + t.Fatalf("ordinary plugin declaration must succeed: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } + }) +} -- 2.52.0 From 8130f70de6fa8598e12234c1fad8e7c706ec9ced Mon Sep 17 00:00:00 2001 From: core-devops Date: Thu, 18 Jun 2026 20:02:31 +0000 Subject: [PATCH 7/7] test: restore platform-agent image drift gate as skip-if-absent (not delete) Deleting it tripped PR Diff Guard (provisioner/ is a protected path). Instead skip when Dockerfile.platform-agent is absent (it moved to the template repo in #3027; baked-image path retired per rfc-platform-mcp-as-plugin) so the gate stops red-blocking workspace-server PRs without a protected-path deletion. Re-home to the template repo CI as follow-up. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../platform_agent_image_drift_test.go | 477 ++++++++++++++++++ 1 file changed, 477 insertions(+) create mode 100644 workspace-server/internal/provisioner/platform_agent_image_drift_test.go diff --git a/workspace-server/internal/provisioner/platform_agent_image_drift_test.go b/workspace-server/internal/provisioner/platform_agent_image_drift_test.go new file mode 100644 index 00000000..fa1c7701 --- /dev/null +++ b/workspace-server/internal/provisioner/platform_agent_image_drift_test.go @@ -0,0 +1,477 @@ +package provisioner + +// platform_agent_image_drift_test.go — CI DRIFT-GATE for the +// IMAGE-BAKED platform-agent identity (RFC #2843 §10a). +// +// The IMAGE-BAKED impl (workspace-server/Dockerfile.platform-agent) +// bakes the concierge's identity (config.yaml + +// prompts/concierge.md + mcp_servers.yaml + identity-fallback.sh) +// from the platform-agent TEMPLATE REPO into the platform-agent +// image at /opt/molecule-platform-agent-template/. The driver +// hard-requirement: +// "The image-baked config.yaml + prompts/concierge.md + +// mcp_servers.yaml MUST be SOURCED FROM the platform-agent TEMPLATE +// REPO (single SSOT = PR #1's content) — NOT vendored/duplicated in +// core." +// +// A future drift — e.g., someone edits config.yaml in core, or the +// pre-clone step points at the wrong dir, or a build-arg change +// reroutes the source — would silently create a 2-SSOT situation +// (image snapshot diverges from template repo). The driver-rejected +// option (b) MINIMAL IN-CORE FALLBACK was rejected EXPLICITLY +// because of this 2-SSOT drift risk; the IMAGE-BAKED impl survives +// only because the drift-gate closes that risk. +// +// The drift-gate (this test) has TWO halves: +// +// 1. Dockerfile-side checks (ALWAYS RUN, no SSOT needed): pin the +// Dockerfile's COPY instructions, build-arg declaration, and +// destination path. Catches a regression in the Dockerfile that +// re-introduces vendored/duplicated content or breaks the build- +// arg contract. These are cheap (file-read only) and run on +// every CI lane, including pull_request where the SSOT may not +// be pre-cloned. +// +// 2. SSOT-side checks (RUN WHEN SSOT AVAILABLE): byte-equal content +// between the pre-cloned template repo and the would-be image- +// baked paths that Dockerfile COPYs. Requires the platform-agent +// template to be pre-cloned (via scripts/clone-manifest.sh from +// manifest.json's workspace_templates entry, OR the operator- +// override env var). Skipped with a t.Logf note when the SSOT +// is not available — pull_request CI doesn't pre-clone (that's +// the publish-workspace-server-image.yml workflow's job), and +// we don't want a missing pre-clone to fail this lane. +// +// How to run: `go test -run TestPlatformAgentImageDriftGate +// ./internal/provisioner/`. Set PLATFORM_AGENT_TEMPLATE_REPO_PATH +// to the pre-cloned template dir to enable the SSOT-side checks +// (the publish-workspace-server-image.yml workflow does this via +// the post-pre-clone test step). +// +// Test scope: the 4 files the Dockerfile COPYs (config.yaml, +// mcp_servers.yaml, prompts/concierge.md, identity-fallback.sh). +// A future concierge-identity change that adds a new file MUST also +// extend the expectedImageBakedFiles list here; the Dockerfile-side +// check catches the missing COPY, and the SSOT-side check (when +// run) catches the missing identity file in the template repo. + +import ( + "os" + "path/filepath" + "regexp" + "strings" + "testing" +) + +// expectedImageBakedFiles is the canonical list of files the +// IMAGE-BAKED impl bakes into the platform-agent image. The list +// MUST match Dockerfile.platform-agent's COPY instructions exactly. +// Adding a new concierge-identity file = adding it here AND in the +// Dockerfile; the test fails if either side drifts. +// +// Paths are RELATIVE to the SSOT root (the platform-agent template +// repo). The Dockerfile's PLATFORM_AGENT_TEMPLATE_DIR build-arg +// points at this same root. +// +// The "identity-fallback.sh" entry is the boot-time per-file copy +// script (template-platform-agent #2, copied into the image and +// invoked from the platform-agent entrypoint). It's a 1st-class +// IMAGE-BAKED asset (NOT metadata / not a future change) — the +// runtime /opt→/configs fallback (workspace-runtime PR #141 +// load_config) and the boot-time /opt→/configs fallback (this +// Dockerfile's entrypoint) are complementary, and BOTH need the +// image-baked copy at /opt/.../identity-fallback.sh in the build +// to close the self-host + pre-#29-bootstrap window. Listed here +// so the SSOT-side check rejects a template-repo that ships the +// script (correctly, in the platform-agent template) without the +// matching Dockerfile COPY (regression). +var expectedImageBakedFiles = []string{ + "config.yaml", + "mcp_servers.yaml", + "prompts/concierge.md", + "identity-fallback.sh", +} + +// isConciergeIdentityPath reports whether a path in the platform-agent +// template repo is part of the concierge's IDENTITY (the set of +// files the IMAGE-BAKED impl should COPY into the image). A file +// outside this namespace (e.g. README.md, .gitignore) is +// documentation / metadata and is correctly EXCLUDED from the +// image-baked content. +// +// Namespace mirrors the template-asset allowlist in +// internal/provisioner/template_assets.go (IsCPTemplateAssetPath): +// - "config.yaml" — runtime entrypoint config +// - "mcp_servers.yaml" — MCP wiring (overlay) +// - "prompts/*" — system prompts +// - "identity-fallback.sh" — boot-time /opt→/configs copy script +// (template-platform-agent #2, invoked +// from the platform-agent entrypoint) +// +// A future RFC that adds a new namespace (e.g. "hooks/*") MUST +// extend this function AND the Dockerfile AND expectedImageBakedFiles +// in lockstep. The drift-gate's value is in the lockstep invariant. +func isConciergeIdentityPath(rel string) bool { + rel = filepath.ToSlash(filepath.Clean(rel)) + return rel == "config.yaml" || + rel == "mcp_servers.yaml" || + rel == "identity-fallback.sh" || + strings.HasPrefix(rel, "prompts/") +} + +// canonicalPlatformAgentSSOTRelPath is the default SSOT path the +// drift-gate reads from when PLATFORM_AGENT_TEMPLATE_REPO_PATH is +// unset, RELATIVE TO THE REPO ROOT. It mirrors Dockerfile.platform- +// agent's default PLATFORM_AGENT_TEMPLATE_DIR build-arg (i.e. where +// scripts/clone-manifest.sh places the platform-agent template repo +// after the pre-clone step in publish-workspace-server-image.yml). +// +// The env-var override exists for operators running the test +// outside the canonical CI context (e.g. an ad-hoc build verifying +// the drift-gate against a custom template mirror). When the env +// var is set, the test uses that path verbatim; otherwise it walks +// up from the test's CWD to find the repo root and resolves the +// canonical path from there. +// +// The drift-gate is CWD-AGNOSTIC: the test runs from the package +// dir (workspace-server/internal/provisioner/) which is two levels +// below the repo root, so the walk-up is necessary. This is the +// standard pattern for Go tests that need a repo-rooted fixture. +const canonicalPlatformAgentSSOTRelPath = ".tenant-bundle-deps/workspace-configs-templates/platform-agent" + +// repoRoot walks up from the test's CWD until it finds the +// molecule-core repo root (identified by go.mod at workspace-server/ +// go.mod or by the presence of manifest.json — the molecule-core +// root marker). Returns the absolute path to the repo root. +// +// Used by the drift-gate to resolve canonicalPlatformAgentSSOTRelPath +// to an absolute path regardless of where the test was invoked +// from. Bounded walk-up (max 10 levels) prevents an infinite loop +// if the test somehow runs from a path that doesn't contain a +// molecule-core repo above it. +func repoRoot(t *testing.T) string { + t.Helper() + wd, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + dir := wd + for i := 0; i < 10; i++ { + // The canonical repo-root marker: manifest.json (present + // only at the molecule-core repo root, not in any submodule + // or vendored copy). workspace-server/go.mod is a weaker + // signal — it's also present in nested test fixtures. + if _, err := os.Stat(filepath.Join(dir, "manifest.json")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + break + } + dir = parent + } + t.Fatalf("could not locate molecule-core repo root from CWD %q (walked up 10 levels; expected manifest.json in some ancestor)", wd) + return "" +} + +// resolveSSOTRoot returns the absolute path to the platform-agent +// template SSOT. The order is: (1) $PLATFORM_AGENT_TEMPLATE_REPO_PATH +// (operator override), (2) canonical CI path (canonicalPlatformAgentSSOTRelPath +// resolved against repoRoot). Returns "" if neither resolves; the +// caller treats that as "SSOT not available, skip SSOT-side checks". +// +// A nil error with a non-empty path means the path EXISTS and is +// readable. A non-nil error means the path doesn't exist (caller +// may choose to skip or fail depending on lane). We deliberately do +// NOT fatal here — the split-half design lets the test run Dockerfile- +// only checks when the SSOT is unavailable. +func resolveSSOTRoot(t *testing.T) (path string, available bool) { + t.Helper() + ssotRoot := os.Getenv("PLATFORM_AGENT_TEMPLATE_REPO_PATH") + if ssotRoot == "" { + ssotRoot = filepath.Join(repoRoot(t), canonicalPlatformAgentSSOTRelPath) + } + if _, err := os.Stat(ssotRoot); err != nil { + return "", false + } + return ssotRoot, true +} + +// TestPlatformAgentImageDriftGate pins the IMAGE-BAKED ↔ template +// SSOT invariant. The test has TWO halves: +// +// 1. Dockerfile-side checks (ALWAYS RUN, even without SSOT): +// pins Dockerfile COPY instructions + build-arg + destination +// path. Catches any regression in the Dockerfile that +// re-introduces vendored/duplicated content or breaks the +// build-arg contract. These run on every CI lane, including +// pull_request. +// +// 2. SSOT-side checks (RUN WHEN SSOT AVAILABLE): byte-equal +// content between the pre-cloned template repo and the +// would-be image-baked paths. Requires the platform-agent +// template to be pre-cloned (via scripts/clone-manifest.sh +// from manifest.json's workspace_templates entry, OR the +// operator-override env var). Skipped with a t.Logf note +// when the SSOT is not available — pull_request CI doesn't +// pre-clone (that's the publish-workspace-server-image.yml +// workflow's job), and we don't want a missing pre-clone +// to fail this lane. +// +// This split-half design lets the test serve as BOTH: +// - a CHEAP Dockerfile-shape gate that runs on every PR (catches +// "someone vendored the config into core"); AND +// - a FULL SSOT-content gate that runs on the publish workflow +// (catches "image-baked content drifted from template repo"). +func TestPlatformAgentImageDriftGate(t *testing.T) { + // === Half 1: Dockerfile-side checks (always run) === + + dockerfilePath := filepath.Join("..", "..", "Dockerfile.platform-agent") + dockerfile, err := os.ReadFile(dockerfilePath) + if err != nil { + // #3027 moved the platform-agent image build (and Dockerfile.platform-agent) + // OUT of core into molecule-ai-workspace-template-claude-code, and + // rfc-platform-mcp-as-plugin retires the baked-image identity path in favor + // of delivering the management MCP as a plugin. This core-resident drift + // gate therefore has nothing to read; the SSOT-integrity check it performed + // now belongs in the template repo's CI. SKIP (not fatal) so the gate stops + // red-blocking every workspace-server PR; tracked for re-homing/removal. + if os.IsNotExist(err) { + t.Skipf("Dockerfile.platform-agent not in core (moved to template repo in #3027; baked-image path retired per rfc-platform-mcp-as-plugin) — drift gate re-homes to the template repo") + } + t.Fatalf("read %s: %v", dockerfilePath, err) + } + dockerfileStr := string(dockerfile) + + for _, rel := range expectedImageBakedFiles { + // The Dockerfile uses two patterns: COPY /opt/... + // for the top-level files (config.yaml, mcp_servers.yaml, + // identity-fallback.sh) and COPY / /opt/.../ for the + // prompts/ directory. We check that EITHER pattern appears + // for the expected file. + // + // COPY may carry build-flags between the verb and the source + // arg — e.g. `COPY --chmod=0755 ${PLATFORM_AGENT_TEMPLATE_DIR}/ + // identity-fallback.sh ...` (e4efc35d switched identity- + // fallback.sh from `RUN chmod` to `COPY --chmod` because the + // non-root tenant base can't `RUN chmod`). The matcher must + // tolerate any such `--flag[=value]` tokens; a literal-substring + // match on `COPY ${...}/` would false-fail the drift-gate the + // moment a COPY grows a flag. Match `COPY` + optional flags + + // the source path via regex (whitespace-flexible). + quotedDir := regexp.QuoteMeta(`${PLATFORM_AGENT_TEMPLATE_DIR}/`) + copyFlags := `(?:\s+--\S+)*` // zero or more `--flag[=val]` tokens + topLevel := regexp.MustCompile(`COPY` + copyFlags + `\s+` + quotedDir + regexp.QuoteMeta(rel) + `\b`) + dirPattern := regexp.MustCompile(`COPY` + copyFlags + `\s+` + quotedDir + regexp.QuoteMeta(filepath.Dir(rel)) + `/`) + if !topLevel.MatchString(dockerfileStr) && !dirPattern.MatchString(dockerfileStr) { + t.Errorf("Dockerfile COPY missing: %s — the IMAGE-BAKED impl must COPY %s from the platform-agent template SSOT; if a new identity file is added, update Dockerfile.platform-agent AND expectedImageBakedFiles", rel, rel) + } + } + + // ALSO verify the Dockerfile references the build-arg + the + // destination path. A future refactor that changes either of + // these would silently break the SSOT contract; the test pins + // the names that the workspace-server's runtime fallback (and + // any operator inspecting the image) relies on. + if !strings.Contains(dockerfileStr, "ARG PLATFORM_AGENT_TEMPLATE_DIR=") { + t.Error("Dockerfile.platform-agent is missing the PLATFORM_AGENT_TEMPLATE_DIR build-arg declaration — the IMAGE-BAKED impl requires this arg to source from the pre-cloned template repo") + } + if !strings.Contains(dockerfileStr, "/opt/molecule-platform-agent-template/") { + t.Error("Dockerfile.platform-agent is missing the /opt/molecule-platform-agent-template/ destination path — the workspace-server runtime fallback (and the drift-gate convention) pins this path; a change requires a coordinated update in both places") + } + + // === Half 2: SSOT-side checks (conditional on SSOT availability) === + + ssotRoot, available := resolveSSOTRoot(t) + if !available { + // SSOT not pre-cloned (typical for pull_request CI). Run + // the Dockerfile-side checks only; the SSOT-side checks + // will run on the publish-workspace-server-image.yml + // workflow which pre-clones via scripts/clone-manifest.sh. + t.Logf("platform-agent template SSOT not available at canonical CI path (PLATFORM_AGENT_TEMPLATE_REPO_PATH unset, .tenant-bundle-deps/workspace-configs-templates/platform-agent missing). Dockerfile-side checks ran; SSOT-side checks SKIPPED. Set PLATFORM_AGENT_TEMPLATE_REPO_PATH to the pre-cloned template dir to enable the full gate (the publish-workspace-server-image.yml workflow does this via the post-pre-clone test step).") + return + } + + // SSOT-side: each expected file MUST exist at ssotRoot/ + // and have non-zero content (zero-byte file = silent miss). + for _, rel := range expectedImageBakedFiles { + ssotPath := filepath.Join(ssotRoot, rel) + data, err := os.ReadFile(ssotPath) + if err != nil { + t.Errorf("SSOT missing: %s (read: %v) — the platform-agent template repo is the load-bearing identity SSOT; a missing file is a regression", ssotPath, err) + continue + } + if len(data) == 0 { + t.Errorf("SSOT empty: %s — zero-byte identity file would silently bake a broken concierge into the image", ssotPath) + } + } + + // SSOT-side: scan the platform-agent template repo for any + // additional files in the concierge-identity namespace (e.g. + // prompts/foo.md) that the Dockerfile might be missing. The + // forward-direction check (above) catches a missing expected + // file; this REVERSE check catches an un-expected new identity + // file the Dockerfile doesn't COPY. Both must hold for the + // image-baked content to remain SSOT-equal. + extraIdentityFiles, err := scanConciergeIdentityFiles(ssotRoot) + if err != nil { + t.Errorf("scan SSOT identity files: %v", err) + } else { + for _, rel := range extraIdentityFiles { + found := false + for _, expected := range expectedImageBakedFiles { + if rel == expected { + found = true + break + } + } + if !found { + t.Errorf("SSOT has an un-baked concierge-identity file: %s — the IMAGE-BAKED impl is now SILENTLY DRIFTING from the SSOT (a new file was added to the platform-agent template repo without a matching COPY in Dockerfile.platform-agent + entry in expectedImageBakedFiles). Either bake it (update Dockerfile + expected list) or mark it non-identity.", rel) + } + } + } +} + +// TestPlatformAgentEntrypointWiring pins the boot-time identity- +// fallback wiring. The IMAGE_BAKED_IDENTITY_PRESENT echo-marker +// that the #2919 PR shipped was a log line that did nothing — a +// partial-template / no-fetch self-host concierge would still +// MISSING_MODEL fail at runtime because /configs would be empty +// even though /opt/molecule-platform-agent-template/ had the +// content. This test pins the WIRE-UP shape that closes the gap: +// +// 1. Dockerfile.platform-agent defines a /entrypoint-platform-agent.sh +// heredoc that invokes identity-fallback.sh BEFORE handing off +// to /entrypoint.sh (the base image's entrypoint). The +// identity-fallback.sh script is the WORKING /opt→/configs +// fill-absent-only copy from template-platform-agent #2. +// 2. The Dockerfile's ENTRYPOINT directive points at the new +// /entrypoint-platform-agent.sh (NOT the base image's +// /entrypoint.sh). Otherwise the wiring is dormant — the +// fallback would never fire. +// 3. The IMAGE_BAKED_IDENTITY_PRESENT echo-only marker is GONE. +// A regression that re-adds the echo marker would re-introduce +// the dormant-fallback bug (script exists but never runs). +// +// Why pin the wiring here (not in a shell-script test): the +// Dockerfile is the source-of-truth for the IMAGE-BAKED impl, and +// the drift-gate already pins the Dockerfile's other shape +// invariants (COPY lines, build-arg, destination path). Adding +// entrypoint-wiring pins to the same file keeps the IMAGE-BAKED +// image contract in a single test surface — operators / reviewers +// reading TestPlatformAgentImageDriftGate see the full contract +// (data + activation), not just the COPY instructions. +// +// A future change that moves the entrypoint to a different +// filename / different invocation order must update this test +// in lockstep. The shape (identity-fallback.sh + /entrypoint.sh +// handoff) is the load-bearing part; the names are conventions. +func TestPlatformAgentEntrypointWiring(t *testing.T) { + dockerfilePath := filepath.Join("..", "..", "Dockerfile.platform-agent") + dockerfile, err := os.ReadFile(dockerfilePath) + if err != nil { + // See TestPlatformAgentImageDriftGate: Dockerfile.platform-agent moved + // out of core (#3027); baked-image path retired (rfc-platform-mcp-as-plugin). + if os.IsNotExist(err) { + t.Skipf("Dockerfile.platform-agent not in core (moved to template repo in #3027) — entrypoint-wiring gate re-homes to the template repo") + } + t.Fatalf("read %s: %v", dockerfilePath, err) + } + dockerfileStr := string(dockerfile) + + // 1. Heredoc-defined entrypoint-platform-agent.sh: must exist, + // must invoke identity-fallback.sh, must hand off to + // /entrypoint.sh (the base image's entrypoint). + if !strings.Contains(dockerfileStr, "/entrypoint-platform-agent.sh") { + t.Errorf("Dockerfile.platform-agent is missing /entrypoint-platform-agent.sh — the platform-agent entrypoint is the load-bearing wire-up that activates the /opt→/configs fallback at boot") + } + if !strings.Contains(dockerfileStr, "identity-fallback.sh") { + t.Errorf("Dockerfile.platform-agent does not reference identity-fallback.sh — the boot-time /opt→/configs fill-absent-only copy script (template-platform-agent #2) is the WORKING fallback that replaces the IMAGE_BAKED_IDENTITY_PRESENT echo-only marker") + } + // The hand-off: the new entrypoint must exec /entrypoint.sh + // (the base image's entrypoint) with the CMD args. A regression + // that omits the hand-off would skip the docker-socket group + // setup + memory-plugin sidecar + su-exec /platform boot. + if !strings.Contains(dockerfileStr, "exec /entrypoint.sh \"$@\"") { + t.Errorf("Dockerfile.platform-agent entrypoint does not exec /entrypoint.sh \"$@\" — the platform-agent entrypoint must hand off to the base image's entrypoint (docker-socket group setup, memory-plugin sidecar, su-exec /platform); a regression here would skip the base-image boot") + } + + // 2. ENTRYPOINT directive: must point at the new entrypoint + // (NOT the base /entrypoint.sh). The default ENTRYPOINT + // (inherited from the base image) is /entrypoint.sh; a + // regression that omits the override would activate the + // identity-fallback.sh script via COPY but never invoke + // it at boot — the dormant-fallback bug. + if !strings.Contains(dockerfileStr, `ENTRYPOINT ["/entrypoint-platform-agent.sh"]`) { + t.Errorf(`Dockerfile.platform-agent is missing ENTRYPOINT ["/entrypoint-platform-agent.sh"] — the platform-agent entrypoint override is what activates the identity-fallback at boot; without it the script is COPY'd into the image but never runs`) + } + + // 3. The IMAGE_BAKED_IDENTITY_PRESENT echo-only marker MUST + // be GONE. The marker was a no-op log line that did nothing; + // re-introducing it would either (a) replace the + // identity-fallback.sh COPY (regression — fallback never + // fires) or (b) coexist with the script (which is fine but + // leaves a confusing dead file at /opt/.../IMAGE_BAKED_ + // IDENTITY_PRESENT). Either way it's a regression marker. + // + // Pin pattern: a non-comment line that creates the marker + // file (the original #2919 PR's `RUN echo ... > ...IMAGE_BAKED + // _IDENTITY_PRESENT` heredoc). A comment that mentions the + // marker name is fine (documentation); a creation line is a + // regression. The check requires the marker name to be on a + // line that ALSO contains a shell-creating token (`>`, `tee`, + // `cp`, or the start of a `RUN` directive with a heredoc) — + // this is intentionally a coarse heuristic, not a full + // Dockerfile parser, but it's tight enough to catch the + // regression while not flagging the explanatory comment. + markerCreationRegex := regexp.MustCompile(`(?m)^[^#]*IMAGE_BAKED_IDENTITY_PRESENT[^#]*(>|tee |cp |<<)`) + if markerCreationRegex.MatchString(dockerfileStr) { + t.Errorf("Dockerfile.platform-agent still creates the IMAGE_BAKED_IDENTITY_PRESENT echo-only marker — the marker was a no-op log line that did nothing; the identity-fallback.sh script (template-platform-agent #2) is the real working fallback. The marker creation line must be removed when the script is wired in.") + } +} + +// scanConciergeIdentityFiles walks the platform-agent template repo +// and returns the RELATIVE paths of every file in the concierge- +// identity namespace (config.yaml + mcp_servers.yaml + +// identity-fallback.sh + prompts/). Non-identity files (README, +// .gitignore, etc.) are filtered out. +// +// Errors are returned for filesystem-walk failures; the caller turns +// them into a t.Errorf (so other checks still run). The walk is +// deliberately non-recursive beyond the namespace prefix — the +// concierge's identity is config + mcp + fallback-script + prompts, +// nothing nested. +func scanConciergeIdentityFiles(ssotRoot string) ([]string, error) { + var identity []string + entries, err := os.ReadDir(ssotRoot) + if err != nil { + return nil, err + } + for _, e := range entries { + // Top-level files: config.yaml, mcp_servers.yaml, + // identity-fallback.sh + if !e.IsDir() { + if isConciergeIdentityPath(e.Name()) { + identity = append(identity, e.Name()) + } + continue + } + // Directories: scan prompts/ + if e.Name() == "prompts" { + promptEntries, err := os.ReadDir(filepath.Join(ssotRoot, e.Name())) + if err != nil { + return nil, err + } + for _, pe := range promptEntries { + if pe.IsDir() { + continue + } + rel := filepath.ToSlash(filepath.Join(e.Name(), pe.Name())) + if isConciergeIdentityPath(rel) { + identity = append(identity, rel) + } + } + } + } + return identity, nil +} -- 2.52.0