diff --git a/workspace-server/cmd/server/cp_config.go b/workspace-server/cmd/server/cp_config.go index d1021c22f..acbac0ac8 100644 --- a/workspace-server/cmd/server/cp_config.go +++ b/workspace-server/cmd/server/cp_config.go @@ -105,3 +105,53 @@ func refreshEnvFromCP() error { log.Printf("CP env refresh: applied %d values from %s/cp/tenants/config", applied, base) return nil } + +// requiredLLMEnvVars is the set of LLM proxy env vars a managed SaaS +// tenant must have populated after refreshEnvFromCP. cp#469 (tenant +// proxy-env delivery) — guarantees that CP-delivered creds reach the +// tenant process env on boot. Per Researcher Task #37 / Spec 2 and +// Task #46 (watch-fail-first test). +// +// Key set byte-matched against Researcher's verified emission in +// controlplane tenant_config.go:140-144 (Researcher REQUEST_CHANGES +// iterate body, 3987f59c). The four keys below ARE the LLM-proxy +// subset of the 8 CP-emitted keys; OPENAI_BASE_URL / OPENAI_API_KEY / +// ANTHROPIC_BASE_URL / ANTHROPIC_API_KEY are out of scope for cp#469 +// (different feature surfaces — direct-to-provider fallbacks, not +// the proxy). Set as of fix v3: MOLECULE_LLM_USAGE_TOKEN, +// MOLECULE_LLM_USAGE_URL, MOLECULE_LLM_BASE_URL and +// MOLECULE_LLM_ANTHROPIC_BASE_URL — note the 4th key is namespaced, NOT +// bare ANTHROPIC_BASE_URL. Bare ANTHROPIC_BASE_URL is a separate +// CP-emitted key for direct-provider use, not the LLM proxy. +var requiredLLMEnvVars = []string{ + "MOLECULE_LLM_USAGE_TOKEN", + "MOLECULE_LLM_USAGE_URL", // CRITICAL fix v2: was MOLECULE_LLM_URL in v1 + "MOLECULE_LLM_BASE_URL", + "MOLECULE_LLM_ANTHROPIC_BASE_URL", // CRITICAL fix v3: was ANTHROPIC_BASE_URL in v2 (different key!) +} + +// assertManagedTenantHasLLMEnv verifies that, when running as a +// managed SaaS tenant (MOLECULE_ORG_ID + ADMIN_TOKEN both non-empty), +// all required LLM proxy env vars are non-empty after refreshEnvFromCP. 
+// +// Self-hosted (either var empty) is exempt — dev must not be +// blocked here. Managed tenants with unset or empty LLM keys get a +// MISSING_CP_LLM_ENV error listing them, so they do not silently boot +// with broken proxy creds. This function only reports; main.go treats +// a non-nil error as fatal (log.Fatalf). +func assertManagedTenantHasLLMEnv() error { + if os.Getenv("MOLECULE_ORG_ID") == "" || os.Getenv("ADMIN_TOKEN") == "" { + // Either var empty: self-hosted dev / not yet provisioned — not a managed tenant. + return nil + } + var missing []string + for _, k := range requiredLLMEnvVars { + if os.Getenv(k) == "" { + missing = append(missing, k) + } + } + if len(missing) > 0 { + return fmt.Errorf("MISSING_CP_LLM_ENV: required LLM proxy keys not set after refreshEnvFromCP: %v", missing) + } + return nil +} diff --git a/workspace-server/cmd/server/cp_config_test.go b/workspace-server/cmd/server/cp_config_test.go index fddceddea..9a2ec4607 100644 --- a/workspace-server/cmd/server/cp_config_test.go +++ b/workspace-server/cmd/server/cp_config_test.go @@ -5,6 +5,7 @@ import ( "net/http" "net/http/httptest" "os" + "strings" "testing" ) @@ -47,6 +48,138 @@ func TestRefreshEnvFromCP_AppliesCPResponse(t *testing.T) { } } +// TestRefreshEnvFromCP_ManagedTenantRequiresLLMKeys: watch-fail-first +// per Researcher Task #46. When running as a managed tenant +// (MOLECULE_ORG_ID + ADMIN_TOKEN set), missing LLM proxy env vars +// after refreshEnvFromCP MUST surface as MISSING_CP_LLM_ENV, not be +// silently accepted. Without this guard, a CP that loses its LLM +// creds (e.g. during an incident) would let a tenant boot and then +// fail later at first LLM call — worse than a loud refusal here. 
+func TestRefreshEnvFromCP_ManagedTenantRequiresLLMKeys(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Stub CP returns a valid response WITHOUT any of the required + // LLM keys — simulates the failure mode where the CP side + // dropped or never had the LLM creds for this org. + w.Header().Set("Content-Type", "application/json") + fmt.Fprint(w, `{"MOLECULE_CP_SHARED_SECRET":"x","MOLECULE_CP_URL":"https://api.moleculesai.app"}`) + })) + defer srv.Close() + + t.Setenv("MOLECULE_ORG_ID", "org-managed-1") + t.Setenv("ADMIN_TOKEN", "admin-tok") + t.Setenv("MOLECULE_CP_URL", srv.URL) + // Clear all LLM keys to simulate the boot-without-LLM-env failure mode. + t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "") + t.Setenv("MOLECULE_LLM_USAGE_URL", "") + t.Setenv("MOLECULE_LLM_BASE_URL", "") + t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "") + + // refreshEnvFromCP itself should succeed — CP is reachable, returned 200. + if err := refreshEnvFromCP(); err != nil { + t.Fatalf("refreshEnvFromCP: %v", err) + } + // The boot assertion must catch the missing LLM keys. + err := assertManagedTenantHasLLMEnv() + if err == nil { + t.Fatal("expected MISSING_CP_LLM_ENV error for managed tenant without LLM keys, got nil") + } + if !strings.Contains(err.Error(), "MISSING_CP_LLM_ENV") { + t.Errorf("expected error to contain MISSING_CP_LLM_ENV, got: %v", err) + } +} + +// TestRefreshEnvFromCP_ManagedTenantHappyPath: when the CP returns +// all 4 LLM-proxy keys, the gate must PASS — no MISSING_CP_LLM_ENV +// for a properly-configured managed tenant. Watch-fail counterpart +// to TestRefreshEnvFromCP_ManagedTenantRequiresLLMKeys: if THIS test +// ever fires MISSING_CP_LLM_ENV on the byte-correct key set, the +// requiredLLMEnvVars list has drifted from the CP emission again. +// Per Researcher REQUEST_CHANGES TEST ADEQUACY note. 
+func TestRefreshEnvFromCP_ManagedTenantHappyPath(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + // Return ALL 4 LLM-proxy keys — names byte-matched to + // tenant_config.go:140-144 CP emission. + fmt.Fprint(w, `{"MOLECULE_LLM_USAGE_TOKEN":"tok-1","MOLECULE_LLM_USAGE_URL":"https://llm.example.com/usage","MOLECULE_LLM_BASE_URL":"https://llm.example.com","MOLECULE_LLM_ANTHROPIC_BASE_URL":"https://llm.example.com/anthropic"}`) + })) + defer srv.Close() + + t.Setenv("MOLECULE_ORG_ID", "org-managed-happy") + t.Setenv("ADMIN_TOKEN", "admin-tok") + t.Setenv("MOLECULE_CP_URL", srv.URL) + // Pre-clear so we can verify the refresh actually populated them. + t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "") + t.Setenv("MOLECULE_LLM_USAGE_URL", "") + t.Setenv("MOLECULE_LLM_BASE_URL", "") + t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "") + + if err := refreshEnvFromCP(); err != nil { + t.Fatalf("refreshEnvFromCP: %v", err) + } + // Sanity: refresh actually applied the keys (spot-checks USAGE_TOKEN only). + if got := os.Getenv("MOLECULE_LLM_USAGE_TOKEN"); got != "tok-1" { + t.Errorf("refresh did not apply USAGE_TOKEN: got %q", got) + } + // The boot assertion must pass — no MISSING_CP_LLM_ENV. + if err := assertManagedTenantHasLLMEnv(); err != nil { + t.Errorf("managed happy path must not MISSING_CP_LLM_ENV, got: %v", err) + } +} + +// TestRefreshEnvFromCP_ManagedTenantPartialEnv: when the CP returns +// 3 of 4 LLM-proxy keys (one missing), the gate must STILL catch it +// and the error must name the missing key. Per Researcher +// REQUEST_CHANGES TEST ADEQUACY note — partial-env coverage is +// critical because the production failure mode is usually "one +// key dropped" not "all keys dropped". 
+func TestRefreshEnvFromCP_ManagedTenantPartialEnv(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + // 3 of 4 — MOLECULE_LLM_ANTHROPIC_BASE_URL is deliberately omitted. + fmt.Fprint(w, `{"MOLECULE_LLM_USAGE_TOKEN":"tok-1","MOLECULE_LLM_USAGE_URL":"https://llm.example.com/usage","MOLECULE_LLM_BASE_URL":"https://llm.example.com"}`) + })) + defer srv.Close() + + t.Setenv("MOLECULE_ORG_ID", "org-managed-partial") + t.Setenv("ADMIN_TOKEN", "admin-tok") + t.Setenv("MOLECULE_CP_URL", srv.URL) + // Pre-clear all 4 so the 3 that come back from CP are the only + // ones set; the 4th (MOLECULE_LLM_ANTHROPIC_BASE_URL) stays empty. + t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "") + t.Setenv("MOLECULE_LLM_USAGE_URL", "") + t.Setenv("MOLECULE_LLM_BASE_URL", "") + t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "") + + if err := refreshEnvFromCP(); err != nil { + t.Fatalf("refreshEnvFromCP: %v", err) + } + err := assertManagedTenantHasLLMEnv() + if err == nil { + t.Fatal("expected MISSING_CP_LLM_ENV for partial env (3 of 4 keys), got nil") + } + if !strings.Contains(err.Error(), "MISSING_CP_LLM_ENV") { + t.Errorf("expected error to contain MISSING_CP_LLM_ENV, got: %v", err) + } + if !strings.Contains(err.Error(), "MOLECULE_LLM_ANTHROPIC_BASE_URL") { + t.Errorf("expected error to name the missing key MOLECULE_LLM_ANTHROPIC_BASE_URL, got: %v", err) + } +} + +// TestAssertManagedTenantHasLLMEnv_NotManagedIsNoop: self-hosted +// (no orgID/adminToken) must NOT block on missing LLM keys — dev +// ergonomics matter and the assertion's contract is "managed only". 
+func TestAssertManagedTenantHasLLMEnv_NotManagedIsNoop(t *testing.T) { + t.Setenv("MOLECULE_ORG_ID", "") + t.Setenv("ADMIN_TOKEN", "") + t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "") + t.Setenv("MOLECULE_LLM_USAGE_URL", "") + t.Setenv("MOLECULE_LLM_BASE_URL", "") + t.Setenv("MOLECULE_LLM_ANTHROPIC_BASE_URL", "") + if err := assertManagedTenantHasLLMEnv(); err != nil { + t.Errorf("self-hosted (not managed) must not block, got: %v", err) + } +} + // TestRefreshEnvFromCP_CPUnreachableDoesNotFailBoot: network errors must // return non-nil BUT main.go treats that as warn-and-continue. We assert // the function returns an error (not a panic) so the caller can log. diff --git a/workspace-server/cmd/server/main.go b/workspace-server/cmd/server/main.go index d93f13255..bb3d3b7ef 100644 --- a/workspace-server/cmd/server/main.go +++ b/workspace-server/cmd/server/main.go @@ -56,6 +56,16 @@ func main() { log.Printf("CP env refresh: %v (continuing with baked-in env)", err) } + // Managed-tenant boot assertion (cp#469 — tenant proxy-env delivery). + // A managed SaaS tenant (orgID + adminToken set) must have every required + // LLM proxy env var after the refresh above; otherwise refuse to boot — + // a loud refusal beats silently running with broken LLM creds. Self-hosted + // short-circuits inside the assertion. NOTE(review): a failed refresh only + // warns, so a CP outage plus missing baked-in keys is fatal here — confirm. + if err := assertManagedTenantHasLLMEnv(); err != nil { + log.Fatalf("Managed tenant boot assertion: %v", err) + } + // Secrets encryption. In MOLECULE_ENV=prod, boot refuses to start // without a valid SECRETS_ENCRYPTION_KEY (fail-secure — Top-5 #5). 
// In any other environment, missing keys just log a warning and diff --git a/workspace-server/internal/handlers/a2a_proxy_helpers.go b/workspace-server/internal/handlers/a2a_proxy_helpers.go index 98c51bb7d..11916e6b1 100644 --- a/workspace-server/internal/handlers/a2a_proxy_helpers.go +++ b/workspace-server/internal/handlers/a2a_proxy_helpers.go @@ -407,15 +407,6 @@ func validateCallerToken(ctx context.Context, c *gin.Context, callerID string) e // matching (the wsauth errors are typed for the invalid case). var errInvalidCallerToken = errors.New("missing caller auth token") -// canvasUserMessage holds the extracted user message extracted from an -// A2A canvas request body for broadcasting to other sessions. -type canvasUserMessage struct { - Message string `json:"message,omitempty"` - Parts []map[string]interface{} `json:"parts,omitempty"` - MessageID string `json:"messageId,omitempty"` - Attachments []map[string]interface{} `json:"attachments,omitempty"` -} - // extractCanvasUserMessage parses an A2A JSON-RPC request body and extracts // the user-authored text and attachments from a canvas-initiated message/send. // Returns nil when the body is not a canvas user message (empty, malformed,