fix(provision): platform-managed workspace must fail-closed when CP proxy env absent (#2162) #2164

Merged
core-be merged 3 commits from fix/2162-platform-managed-fail-closed-missing-proxy into main 2026-06-03 06:21:10 +00:00
8 changed files with 145 additions and 6 deletions
@@ -372,3 +372,78 @@ func TestApplyPlatformManagedLLMEnv_WorkspaceOriginCredExemptFromStrip(t *testin
t.Errorf("sqlmock expectations: %v", err)
}
}
// TestApplyPlatformManagedLLMEnv_MissingProxyEnvFailClosed is the #2162
// regression guard. A platform-managed workspace whose CP proxy env is absent
// must NOT start credential-less. The empty-proxy path must return
// HasUsableLLMCred=false so the caller aborts with MISSING_PLATFORM_PROXY.
//
// Mutation: revert the early-return from HasUsableLLMCred=false to true
// → workspace starts with zero credential → "container started but never
// called /registry/register" (600s provision-timeout sweep) → this test RED.
func TestApplyPlatformManagedLLMEnv_MissingProxyEnvFailClosed(t *testing.T) {
	// Pin the proxy env to empty for the duration of the test. The function
	// under test resolves the proxy token from the process environment
	// (MOLECULE_LLM_USAGE_TOKEN, falling back to OPENAI_API_KEY), so an
	// ambient value on a developer machine or CI runner would otherwise
	// steer this test away from the empty-proxy branch it exists to guard.
	// An empty token alone is enough to trigger the early return; the base
	// URL is blanked as well so the intent is explicit.
	for _, key := range []string{
		"MOLECULE_LLM_BASE_URL",
		"MOLECULE_LLM_USAGE_TOKEN",
		"OPENAI_API_KEY",
	} {
		t.Setenv(key, "")
	}

	ctx := context.Background()
	const wsID = "29b95be9-811e-4857-be36-1dafdbf4f697" // adk-demo failure workspace
	mock := setupTestDB(t)
	expectOverrideQuery(mock, wsID, "")

	// No proxy env present — simulates the boot-race / misconfig path.
	envVars := map[string]string{}
	res := applyPlatformManagedLLMEnv(ctx, envVars, wsID, "claude-code", "moonshot/kimi-k2.6", nil)
	if res.ResolvedMode != LLMBillingModePlatformManaged {
		t.Fatalf("platform-managed model must stay platform_managed, got %q (source=%s)", res.ResolvedMode, res.Source)
	}

	// THE FIX: must NOT report usable credential when none was injected.
	if res.HasUsableLLMCred {
		t.Fatalf("empty proxy env → HasUsableLLMCred must be false (fail-closed), got true — the #2162 dark-wedge class")
	}

	// No credential env must be present in the map handed to the workspace.
	if _, present := envVars["ANTHROPIC_API_KEY"]; present {
		t.Errorf("empty proxy env must NOT inject ANTHROPIC_API_KEY")
	}
	if _, present := envVars["MOLECULE_LLM_USAGE_TOKEN"]; present {
		t.Errorf("empty proxy env must NOT inject MOLECULE_LLM_USAGE_TOKEN")
	}

	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("sqlmock expectations: %v", err)
	}
}
// TestApplyPlatformManagedLLMEnv_ProxyEnvPresentInjectsCredential covers the
// happy path that mirrors the #2162 fail-closed guard: with the CP proxy env
// set, a platform-managed workspace on an Anthropic-native runtime must come
// back with HasUsableLLMCred=true and with ANTHROPIC_API_KEY and
// ANTHROPIC_BASE_URL populated from the proxy token and the Anthropic proxy
// URL respectively.
func TestApplyPlatformManagedLLMEnv_ProxyEnvPresentInjectsCredential(t *testing.T) {
	const (
		wsID              = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
		openAIProxyURL    = "https://api.moleculesai.app/api/v1/internal/llm/openai/v1"
		anthropicProxyURL = "https://api.moleculesai.app/api/v1/internal/llm/anthropic/v1"
		proxyToken        = "PLATFORM-PROXY-TOKEN"
	)

	ctx := context.Background()
	mock := setupTestDB(t)
	expectOverrideQuery(mock, wsID, "")

	// Simulate the CP proxy env being present (as it is in production).
	t.Setenv("MOLECULE_LLM_BASE_URL", openAIProxyURL)
	t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", anthropicProxyURL)
	t.Setenv("MOLECULE_LLM_USAGE_TOKEN", proxyToken)

	injected := map[string]string{}
	got := applyPlatformManagedLLMEnv(ctx, injected, wsID, "claude-code", "moonshot/kimi-k2.6", nil)

	if got.ResolvedMode != LLMBillingModePlatformManaged {
		t.Fatalf("expected platform_managed, got %q", got.ResolvedMode)
	}
	if !got.HasUsableLLMCred {
		t.Fatalf("proxy env present → HasUsableLLMCred must be true, got false")
	}

	// The workspace env must carry the proxy credential and endpoint.
	if apiKey := injected["ANTHROPIC_API_KEY"]; apiKey != proxyToken {
		t.Errorf("ANTHROPIC_API_KEY must be injected with the platform proxy token; got %q", apiKey)
	}
	if endpoint := injected["ANTHROPIC_BASE_URL"]; endpoint != anthropicProxyURL {
		t.Errorf("ANTHROPIC_BASE_URL must be injected with the platform anthropic proxy; got %q", endpoint)
	}

	if err := mock.ExpectationsWereMet(); err != nil {
		t.Errorf("sqlmock expectations: %v", err)
	}
}
@@ -93,3 +93,16 @@ func formatMissingBYOKCredentialError(mode string) string {
mode,
)
}
// formatMissingPlatformProxyError returns the user-facing text shown when
// provisioning is aborted because a platform-managed workspace has no
// control-plane proxy environment (#2162). That path depends on
// MOLECULE_LLM_BASE_URL + MOLECULE_LLM_USAGE_TOKEN (or their OPENAI_*
// fallbacks) to inject a usable credential, so the workspace is refused
// rather than started credential-less.
//
// The message takes no arguments and is always the same string: what is
// wrong, which env is missing, and what the user should do next.
func formatMissingPlatformProxyError() string {
	const (
		// What went wrong.
		problem = "this workspace is configured for platform-managed LLM billing but the control-plane proxy is not ready. "
		// Which configuration is missing.
		missing = "The required platform proxy env (MOLECULE_LLM_BASE_URL + MOLECULE_LLM_USAGE_TOKEN) is absent. "
		// What to do about it.
		remedy = "This is usually a transient boot-race; retry in 30 seconds. If it persists, verify the platform proxy " +
			"is configured for this tenant/runtime and contact the platform team."
	)
	return problem + missing + remedy
}
@@ -1003,12 +1003,13 @@ func applyPlatformManagedLLMEnv(ctx context.Context, envVars map[string]string,
anthropicBaseURL := firstNonEmptyEnv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "ANTHROPIC_BASE_URL")
token := firstNonEmptyEnv("MOLECULE_LLM_USAGE_TOKEN", "OPENAI_API_KEY")
if baseURL == "" || token == "" {
// Proxy not configured (boot race / misconfig). On the platform_managed
// path the workspace IS entitled to platform creds, so we do NOT strip
// here — but we report HasUsableLLMCred from whatever survived so the
// caller's fail-closed branch (non-platform only) is never reached on
// this path.
return platformLLMEnvResult{ResolvedMode: res.ResolvedMode, HasUsableLLMCred: true, Source: res.Source}
// Proxy not configured (boot race / misconfig). The platform_managed
// path REQUIRES the CP proxy env to inject a usable credential.
// Reporting HasUsableLLMCred=true here would start the workspace
// credential-less — the adk-demo dark-wedge class (#2162).
// Return false so the caller's fail-closed branch aborts with
// MISSING_PLATFORM_PROXY.
return platformLLMEnvResult{ResolvedMode: res.ResolvedMode, HasUsableLLMCred: false, Source: res.Source}
}
stripPlatformManagedLLMBypassEnv(envVars)
@@ -134,6 +134,11 @@ func TestProvisionWorkspaceAuto_NoBackendMarksFailed(t *testing.T) {
// This is the regression-prevention test for the Design Director bug
// where 7-of-7 sub-agents went down the Docker path on SaaS.
func TestProvisionWorkspaceAuto_RoutesToCPWhenSet(t *testing.T) {
// Supply the CP proxy env so the platform-managed default does not abort
// with MISSING_PLATFORM_PROXY (molecule-core#2162).
t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1")
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token")
mock := setupTestDB(t)
mock.MatchExpectationsInOrder(false)
@@ -597,6 +602,11 @@ func TestNoCallSiteCallsBareStop(t *testing.T) {
// count without mocking out the retry helper itself, which would
// invert the test contract — the retry IS the dispatcher's job here).
func TestRestartWorkspaceAuto_RoutesToCPWhenSet(t *testing.T) {
// Supply the CP proxy env so the platform-managed default does not abort
// with MISSING_PLATFORM_PROXY (molecule-core#2162).
t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1")
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token")
rec := &trackingCPProv{}
bcast := &concurrentSafeBroadcaster{}
h := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir())
@@ -795,6 +805,11 @@ func TestResumeHandler_UsesProvisionWorkspaceAuto(t *testing.T) {
// the async tests; the absence of `go` semantics is the load-bearing
// distinction we're pinning.
func TestProvisionWorkspaceAutoSync_RoutesToCPWhenSet(t *testing.T) {
// Supply the CP proxy env so the platform-managed default does not abort
// with MISSING_PLATFORM_PROXY (molecule-core#2162).
t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1")
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token")
mock := setupTestDB(t)
mock.MatchExpectationsInOrder(false)
// provisionWorkspaceCP runs prepareProvisionContext synchronously, which
@@ -98,6 +98,11 @@ func (r *recordingCPProv) startedSet() map[string]struct{} {
func TestProvisionWorkspaceCP_ConcurrentBurst_NoSilentDrop(t *testing.T) {
const numWorkspaces = 7
// Supply the CP proxy env so the platform-managed default does not abort
// with MISSING_PLATFORM_PROXY (molecule-core#2162).
t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1")
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token")
mock := setupTestDB(t)
// Every goroutine runs prepareProvisionContext → mintWorkspaceSecrets
@@ -230,6 +230,18 @@ func (h *WorkspaceHandler) prepareProvisionContext(
Extra: map[string]interface{}{"error": msg, "code": "MISSING_BYOK_CREDENTIAL", "billing_mode": llmRes.ResolvedMode, "issue": "1994"},
}
}
// Fail closed for a platform-managed workspace whose CP proxy env is
// absent: do NOT start it credential-less (adk-demo dark-wedge class,
// #2162). The platform_managed path requires the proxy injection to
// produce a usable credential.
if llmRes.ResolvedMode == LLMBillingModePlatformManaged && !llmRes.HasUsableLLMCred {
msg := formatMissingPlatformProxyError()
log.Printf("Provisioner: ABORT workspace=%s — platform-managed billing mode but CP proxy env absent (MISSING_PLATFORM_PROXY, molecule-core#2162)", workspaceID)
return nil, &provisionAbort{
Msg: msg,
Extra: map[string]interface{}{"error": msg, "code": "MISSING_PLATFORM_PROXY", "billing_mode": llmRes.ResolvedMode, "issue": "2162"},
}
}
applyRuntimeModelEnv(envVars, payload.Runtime, payload.Model)
if payload.Role != "" {
envVars["MOLECULE_AGENT_ROLE"] = payload.Role
@@ -264,6 +264,11 @@ func TestPrepareProvisionContext_ParentIDInjection(t *testing.T) {
},
}
// Supply the CP proxy env so the platform-managed default does not abort
// with MISSING_PLATFORM_PROXY (molecule-core#2162).
t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1")
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token")
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
mock := setupTestDB(t)
@@ -331,6 +336,10 @@ func TestPrepareProvisionContext_InjectsGitHTTPCredsFromPersonaToken(t *testing.
}
}
t.Setenv("MOLECULE_PERSONA_ROOT", root)
// Supply the CP proxy env so the platform-managed default does not abort
// with MISSING_PLATFORM_PROXY (molecule-core#2162).
t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1")
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token")
cases := []struct {
name string
@@ -459,6 +468,10 @@ func TestPrepareProvisionContext_WorkspaceSecretWinsOverPersonaToken(t *testing.
t.Fatal(err)
}
t.Setenv("MOLECULE_PERSONA_ROOT", root)
// Supply the CP proxy env so the platform-managed default does not abort
// with MISSING_PLATFORM_PROXY (molecule-core#2162).
t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1")
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token")
mock := setupTestDB(t)
mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM global_secrets`).
@@ -1424,6 +1424,11 @@ func (s *stubFailingCPProv) IsRunning(_ context.Context, _ string) (bool, error)
// the broadcast payload would surface every marker; the canned
// "provisioning failed" message must surface none of them.
func TestProvisionWorkspaceCP_NoInternalErrorsInBroadcast(t *testing.T) {
// Supply the CP proxy env so the platform-managed default does not abort
// with MISSING_PLATFORM_PROXY (molecule-core#2162).
t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1")
t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token")
mock := setupTestDB(t)
// loadWorkspaceSecrets queries global_secrets and workspace_secrets