fix(workspace-server): create_workspace children born NOT_CONFIGURED — pin LLM_PROVIDER=platform for platform-managed models #3200

Merged
core-devops merged 1 commits from fix/create-workspace-complete-llm-config into main 2026-06-24 05:43:43 +00:00
3 changed files with 196 additions and 0 deletions
@@ -0,0 +1,111 @@
package handlers
// create_workspace NOT_CONFIGURED regression gate.
//
// A child workspace the concierge spawns via the `create_workspace`
// management-MCP tool flows through WorkspaceHandler.Create, which persists
// MODEL (setModelSecret) but — since the internal#718 P4 closure removed the
// unconditional setProviderSecret write — persisted NO LLM_PROVIDER. For a
// platform-managed model id like "moonshot/kimi-k2.6" the on-box runtime
// re-derives provider="moonshot" (a model PREFIX, not a registry NAME) and the
// claude-code adapter fail-closes ("workspace config picks provider='moonshot'
// but it is not in the providers registry") → the child boots online but
// NOT_CONFIGURED.
//
// ensureCreatedWorkspaceProviderPin (wired into Create after setModelSecret)
// closes that gap: it pins LLM_PROVIDER=platform iff the registry derivation of
// (runtime, model) is the closed `platform` provider, mirroring the concierge's
// ensureConciergeProvider IsPlatform gate. These tests prove a child created via
// create_workspace is born with a COMPLETE, self-consistent (runtime, model,
// provider) config that the registry validates — not a MODEL-without-provider
// the adapter would reject — while BYOK/OAuth children are left untouched.
import (
"context"
"testing"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/providers"
"github.com/DATA-DOG/go-sqlmock"
)
func TestEnsureCreatedWorkspaceProviderPin(t *testing.T) {
const secretInsert = `INSERT INTO workspace_secrets`
t.Run("platform-managed child (moonshot/kimi-k2.6) gets LLM_PROVIDER=platform pinned", func(t *testing.T) {
mock := setupTestDB(t)
// setProviderSecret writes the LLM_PROVIDER row directly (no preceding
// existence SELECT — Create owns the fresh row, there is nothing to respect).
mock.ExpectExec(secretInsert).
WithArgs("ws-child", sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
ensureCreatedWorkspaceProviderPin(context.Background(), "ws-child", "claude-code", "moonshot/kimi-k2.6", nil)
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("platform-managed child was NOT pinned LLM_PROVIDER=platform (the create_workspace NOT_CONFIGURED bug): %v", err)
}
})
t.Run("pinned provider matches the registry-derived provider (config is self-consistent)", func(t *testing.T) {
// The value the helper persists MUST equal what DeriveProvider resolves for
// the same (runtime, model) — otherwise the config the adapter reads is
// internally inconsistent. Assert the pin equals the derived provider name
// AND that it IS the closed platform provider.
m, err := providerRegistry()
if err != nil || m == nil {
t.Fatalf("provider registry unavailable: %v", err)
}
prov, derr := m.DeriveProvider("claude-code", "moonshot/kimi-k2.6", nil)
if derr != nil {
t.Fatalf("DeriveProvider(claude-code, moonshot/kimi-k2.6) failed: %v", derr)
}
if !prov.IsPlatform() {
t.Fatalf("registry SSOT changed: moonshot/kimi-k2.6 no longer derives to the platform provider (got %q) — the pin logic is keyed off this", prov.Name)
}
if prov.Name != providers.PlatformProviderName {
t.Errorf("derived provider name %q != %q (the value the helper pins)", prov.Name, providers.PlatformProviderName)
}
})
t.Run("BYOK child carrying a vendor key derives to its real provider — NOT pinned platform", func(t *testing.T) {
// A create that supplies ANTHROPIC_API_KEY for an anthropic-api model
// derives to anthropic-api (a real provider entry), not platform. The
// helper must NOT write any LLM_PROVIDER row (no INSERT expected): pinning
// would mis-route the runtime's own correct derivation.
mock := setupTestDB(t)
// No ExpectExec — any INSERT is an unmet-expectation failure below.
// Model whose registry derivation is a real (non-platform) provider when
// the anthropic-api auth env is present.
ensureCreatedWorkspaceProviderPin(
context.Background(),
"ws-byok-child",
"claude-code",
"anthropic:claude-opus-4-7",
[]string{"ANTHROPIC_API_KEY"},
)
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("BYOK child wrongly got an LLM_PROVIDER write (would mis-route the runtime's own derivation): %v", err)
}
})
t.Run("empty model is a no-op (external/registerless create)", func(t *testing.T) {
mock := setupTestDB(t)
// No INSERT expected.
ensureCreatedWorkspaceProviderPin(context.Background(), "ws-empty", "claude-code", "", nil)
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("empty model should be a no-op (no provider pin): %v", err)
}
})
t.Run("unknown/federated runtime derive-miss is a no-op (fails open, no pin)", func(t *testing.T) {
mock := setupTestDB(t)
// DeriveProvider errors for an unknown runtime → helper returns without a
// write (matches the create-boundary registry gates that fail open here).
ensureCreatedWorkspaceProviderPin(context.Background(), "ws-fed", "some-federated-runtime", "vendor/model", nil)
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unknown-runtime derive-miss should be a no-op: %v", err)
}
})
}
@@ -34,6 +34,7 @@ import (
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/providers"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/provisioner"
"github.com/gin-gonic/gin"
"github.com/google/uuid"
@@ -624,6 +625,70 @@ func setProviderSecret(ctx context.Context, workspaceID, provider string) error
return err
}
// ensureCreatedWorkspaceProviderPin pins LLM_PROVIDER=platform on a freshly
// CREATED workspace (the Create handler / `create_workspace` management-MCP
// path) when, and ONLY when, its (runtime, model) derives to the closed
// `platform` provider via the registry.
//
// Why this exists (the create_workspace NOT_CONFIGURED bug): the platform/root
// concierge gets LLM_PROVIDER=platform from ensureConciergeProvider, but that
// helper runs ONLY on the kind=platform provision path. A CHILD workspace the
// concierge spawns via `create_workspace` goes through WorkspaceHandler.Create,
// which persists MODEL (setModelSecret) but — since the internal#718 P4 closure
// removed the unconditional setProviderSecret write — persists NO LLM_PROVIDER.
// For a platform-managed model id like "moonshot/kimi-k2.6" the on-box runtime
// re-derives the provider with its own slug-split (_derive_provider_from_model →
// "moonshot", a model-PREFIX, NOT a registry provider NAME), so the claude-code
// adapter fail-closes: "workspace config picks provider='moonshot' but it is not
// in the providers registry" → the child boots online but NOT_CONFIGURED. This
// is the exact symptom ensureConciergeProvider was added to cure for the root;
// children created via create_workspace need the same env-level pin.
//
// The pin is gated on the registry-DERIVED provider being `platform`
// (providers.Manifest.DeriveProvider → IsPlatform), NOT on a model-prefix string
// or on the parent being platform-managed. This:
// - is parent-independent and not moonshot-specific (any platform-managed
// model id whose registry derivation is `platform` gets the pin);
// - leaves BYOK / OAuth / self-host children UNTOUCHED — their model derives to
// a real provider entry (anthropic-oauth, minimax, kimi-coding, …) the
// runtime resolves correctly on its own, so pinning would be wrong. Mirrors
// ensureConciergeProvider's `IsPlatform` gate (it only ever pins `platform`).
//
// availableAuthEnv is the auth-env-var NAMES present in the create payload's
// secrets — the same disambiguation input DeriveProvider uses elsewhere — so a
// BYOK create that carries vendor keys derives to its real provider (not
// platform) and is correctly skipped. Best-effort + non-fatal: a derive miss or
// a persist error logs and returns; the workspace row stays consistent and the
// (unchanged) downstream provision validation still applies.
func ensureCreatedWorkspaceProviderPin(ctx context.Context, workspaceID, runtime, model string, secretKeys []string) {
model = strings.TrimSpace(model)
if model == "" {
return
}
m, err := providerRegistry()
if err != nil || m == nil {
log.Printf("Create workspace %s: provider registry unavailable; cannot pin LLM_PROVIDER (non-fatal): %v", workspaceID, err)
return
}
prov, derr := m.DeriveProvider(runtime, model, secretKeys)
if derr != nil {
// Unknown/ambiguous (runtime, model) — the create-boundary registry
// gates already let this through (e.g. federated/unknown runtimes), and
// a non-derivable model is not platform-managed by construction. No pin.
return
}
if !prov.IsPlatform() {
// BYOK / OAuth / self-host: the model derives to a real provider entry
// the runtime resolves on its own. Pinning here would mis-route.
return
}
if setErr := setProviderSecret(ctx, workspaceID, providers.PlatformProviderName); setErr != nil {
log.Printf("Create workspace %s: failed to pin LLM_PROVIDER=%s for platform-managed model %q: %v (non-fatal)", workspaceID, providers.PlatformProviderName, model, setErr)
return
}
log.Printf("Create workspace %s: pinned LLM_PROVIDER=%s for platform-managed model %q (create_workspace child config completeness)", workspaceID, providers.PlatformProviderName, model)
}
// EnsureSelfHostedPlatformAgent installs the org's platform agent (the concierge,
// the org root) on a tenant that has no control plane to do it — i.e. self-hosted
// or local. In SaaS the CP calls InstallPlatformAgent at org-provision time; this
@@ -837,6 +837,26 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
if err := setModelSecret(ctx, id, payload.Model); err != nil {
log.Printf("Create workspace %s: failed to persist MODEL_PROVIDER %q: %v (non-fatal)", id, payload.Model, err)
}
// core (create_workspace NOT_CONFIGURED): a child workspace the concierge
// spawns via `create_workspace` lands here with MODEL but — since the
// internal#718 P4 closure removed the unconditional setProviderSecret write
// — NO LLM_PROVIDER. For a platform-managed model id (e.g.
// "moonshot/kimi-k2.6") the on-box runtime re-derives provider="moonshot"
// (a model PREFIX, not a registry NAME) and the adapter fail-closes ("picks
// provider='moonshot' but it is not in the providers registry") → the child
// boots online but NOT_CONFIGURED. Mirror the concierge's ensureConcierge-
// Provider pin here so the child is born with a COMPLETE config: pin
// LLM_PROVIDER=platform iff the registry derivation of (runtime, model) is
// the closed `platform` provider. BYOK/OAuth/self-host children (whose model
// derives to a real provider entry) are left untouched. availableAuthEnv =
// the create payload's secret KEYS so a BYOK create derives to its real
// provider and is correctly skipped. Non-fatal.
createSecretKeys := make([]string, 0, len(payload.Secrets))
for k := range payload.Secrets {
createSecretKeys = append(createSecretKeys, k)
}
ensureCreatedWorkspaceProviderPin(ctx, id, payload.Runtime, payload.Model, createSecretKeys)
}
// Insert canvas layout — non-fatal: workspace can be dragged into position later