From 4b7fb6ecfcab99d870ae9ce99922a4465f482d39 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 22 Jun 2026 09:46:30 -0700 Subject: [PATCH] fix(platform-agent): pin LLM_PROVIDER=platform when concierge MODEL is empty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ensureConciergeProvider gated the LLM_PROVIDER=platform pin on the effective MODEL having the platform-managed prefix. On a rebuilt-from-DB provision payload the MODEL is empty, so HasPrefix("", ...) was false and the pin was skipped — the concierge booted without LLM_PROVIDER. The runtime then could not drop the inherited tenant CLAUDE_CODE_OAUTH_TOKEN, so claude-code sent the OAuth bearer to the CP LLM proxy (which authenticates via the per-workspace admin_token) and 401'd every call. Root cause of the test9 / platform-agent "Incorrect API key" outage. Treat an empty MODEL as platform-managed (pin platform): empty is never a BYOK signal — BYOK concierges carry a stored LLM_PROVIDER (early-return) or an explicit non-platform MODEL (still skipped). Only the unresolved/empty case now pins. Regression test: empty MODEL pins platform; explicit BYOK model still skips. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../internal/handlers/platform_agent.go | 15 +++-- .../internal/handlers/platform_agent_test.go | 55 +++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/workspace-server/internal/handlers/platform_agent.go b/workspace-server/internal/handlers/platform_agent.go index d17f6e42e..02df8a0e8 100644 --- a/workspace-server/internal/handlers/platform_agent.go +++ b/workspace-server/internal/handlers/platform_agent.go @@ -502,10 +502,17 @@ func (h *WorkspaceHandler) ensureConciergeProvider(ctx context.Context, workspac // just above (the fresh-boot seed) — so reading it here is sufficient and // avoids a redundant secret decrypt. model := strings.TrimSpace(envVars["MODEL"]) - // Only pin when the model is in the platform-managed namespace that needs it. - // A non-platform model (e.g. `sonnet`, a BYOK `claude-…`) resolves on its - // own; forcing `platform` there would mis-route auth and break the agent. - if !strings.HasPrefix(strings.ToLower(model), platformManagedModelPrefix) { + // Pin platform when the model is platform-managed OR unresolved (empty). An + // empty MODEL here is NOT a BYOK/self-host signal — those carry a stored + // LLM_PROVIDER (handled by the early-return above) or an explicit non-platform + // MODEL (skipped just below). Empty means an unresolved fresh/rebuilt-from-DB + // payload, which defaults to the platform-managed family; skipping the pin + // there (the old `HasPrefix("", …)`==false path) left the concierge without + // LLM_PROVIDER, so the runtime could not drop the inherited tenant + // CLAUDE_CODE_OAUTH_TOKEN and the agent 401'd against the CP LLM proxy. Only a + // NON-empty non-platform model (an explicit BYOK pick) resolves on its own; + // forcing `platform` there would mis-route auth and break the agent. + if model != "" && !strings.HasPrefix(strings.ToLower(model), platformManagedModelPrefix) { return } diff --git a/workspace-server/internal/handlers/platform_agent_test.go b/workspace-server/internal/handlers/platform_agent_test.go index ed4e2e576..539f9b03f 100644 --- a/workspace-server/internal/handlers/platform_agent_test.go +++ b/workspace-server/internal/handlers/platform_agent_test.go @@ -717,6 +717,61 @@ func TestApplyConciergeProvisionConfig_SeedsModel(t *testing.T) { }) } +// TestEnsureConciergeProvider_EmptyModelPins is the direct-unit regression gate +// for the fix/concierge-provider-empty-model fix: the pin gate changed from +// `if !strings.HasPrefix(strings.ToLower(model), platformManagedModelPrefix)` +// to `if model != "" && !strings.HasPrefix(...)`. The old form computed +// HasPrefix("", "moonshot/") == false for an EMPTY model and so returned early +// WITHOUT pinning — leaving a fresh/rebuilt-from-DB concierge payload (whose +// MODEL env was not yet populated) with no LLM_PROVIDER, which 401'd against the +// CP LLM proxy. The fix treats an unresolved (empty) model as the platform- +// managed default and pins. This calls ensureConciergeProvider DIRECTLY (not via +// applyConciergeProvisionConfig) to isolate the gate. +func TestEnsureConciergeProvider_EmptyModelPins(t *testing.T) { + h := &WorkspaceHandler{} + const providerSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'LLM_PROVIDER'` + const secretInsert = `INSERT INTO workspace_secrets` + + t.Run("empty MODEL (rebuilt-from-DB payload) still pins platform", func(t *testing.T) { + mock := setupTestDB(t) + // No LLM_PROVIDER stored yet → existence SELECT empty → proceed to the gate. + mock.ExpectQuery(providerSelQuery).WithArgs("ws-empty-model"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"})) + // Empty model is the platform-managed default → the pin MUST persist. + mock.ExpectExec(secretInsert). + WithArgs("ws-empty-model", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + + env := map[string]string{} // no MODEL key — unresolved fresh/rebuilt payload + h.ensureConciergeProvider(context.Background(), "ws-empty-model", env) + + if env["LLM_PROVIDER"] != conciergeProvider { + t.Errorf("empty MODEL did not pin LLM_PROVIDER=%q; got %q (env=%v) — concierge would 401 against the CP LLM proxy", conciergeProvider, env["LLM_PROVIDER"], env) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations (LLM_PROVIDER pin not persisted): %v", err) + } + }) + + t.Run("explicit BYOK non-platform model still skips the pin", func(t *testing.T) { + mock := setupTestDB(t) + // No LLM_PROVIDER stored yet → existence SELECT empty → proceed to the gate. + mock.ExpectQuery(providerSelQuery).WithArgs("ws-byok-model"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"})) + // NO ExpectExec: a non-empty BYOK model resolves on its own → no pin. + + env := map[string]string{"MODEL": "anthropic:claude-opus-4-8"} + h.ensureConciergeProvider(context.Background(), "ws-byok-model", env) + + if _, ok := env["LLM_PROVIDER"]; ok { + t.Errorf("explicit BYOK model wrongly pinned LLM_PROVIDER=%q — would mis-route a BYOK/self-host concierge", env["LLM_PROVIDER"]) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations (an unexpected INSERT means it pinned a BYOK model): %v", err) + } + }) +} + // TestApplyConciergeProvisionConfig_SeedsProvider is the CI regression gate for // the concierge non-response incident (prod 2026-06-18): the concierge booted // online but configuration_status=not_configured because the runtime wheel -- 2.52.0