From 7c3840fee90bcb340ac4637f8caef67a131055ef Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Tue, 23 Jun 2026 08:40:41 +0000 Subject: [PATCH] fix(core#3162): fail-CLOSED on readStoredProviderSecret decrypt/read error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes core#3162: a transient decrypt/read error on an existing LLM_PROVIDER workspace_secret used to be collapsed into an empty string and treated as "unset" by readStoredProviderSecret. Combined with the empty-MODEL platform pin (core#3160), a BYOK/self-host concierge that hit a transient decrypt failure while its MODEL was momentarily empty (rebuilt-from-DB payload) could be silently mis-pinned to LLM_PROVIDER=platform and mis-routed through the platform LLM proxy. Fix --- * readStoredProviderSecret now returns (string, error) and distinguishes: - (value, nil) → secret is stored and decrypted; caller respects it - ("", nil) → sql.ErrNoRows (genuine unset; re-seed is safe) - ("", err) → any other Scan error or DecryptVersioned error; caller MUST fail closed (not seed platform) * ensureConciergeProvider fails closed on the ('', err) case: logs the error and returns without seeding LLM_PROVIDER, so the next provision re-tries rather than silently mis-routing. Scope discipline (per PM's "one item, don't batch more core") --- * readStoredModelSecret has the same fail-open shape (its docstring even says "Mirrors readStoredModelSecret") but is intentionally OUT OF SCOPE for this PR. The MODEL read is not on the BYOK-leak path the issue body describes (MODEL is the customer's model pick, not a provider pin), and per PM's standing rule, a follow-up issue will track it separately. Tests (4 new + 2 existing) --- * TestEnsureConciergeProvider_FailsClosedOnReadError/ decrypt_error_on_existing_row_fails_closed_(does_NOT_seed_platform) — the primary regression: the fail-OPEN case the issue describes is now closed. * …/db_scan_error_(non-ErrNoRows)_fails_closed — connection-level errors also fail closed. * …/sql.ErrNoRows_(genuine_unset)_proceeds_to_seed_(no_regression) — the fresh-boot path still re-seeds. * …/existing_stored_provider_is_respected_on_successful_read_(no_regression) — the customer-pick path still wins. * The pre-existing TestEnsureConciergeProvider_EmptyModelPins tests (empty-MODEL pins, BYOK non-platform model skips) continue to pass. Independence from the red #3164 deployment surface --- * Pure server-side fix to the secret-read path in workspace-server. No concierge / MCP / heartbeat / identity-gate / platform-side surface touched. Safe to merge on the core-lane. Closes: core#3162 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../internal/handlers/platform_agent.go | 61 ++++++++--- .../internal/handlers/platform_agent_test.go | 101 ++++++++++++++++++ 2 files changed, 150 insertions(+), 12 deletions(-) diff --git a/workspace-server/internal/handlers/platform_agent.go b/workspace-server/internal/handlers/platform_agent.go index 02df8a0e..0cb6d249 100644 --- a/workspace-server/internal/handlers/platform_agent.go +++ b/workspace-server/internal/handlers/platform_agent.go @@ -24,6 +24,7 @@ package handlers import ( "context" "database/sql" + "errors" "fmt" "log" "net/http" @@ -492,7 +493,18 @@ const conciergePlatformMCPCreateWorkspaceTool = "mcp__molecule-platform__create_ func (h *WorkspaceHandler) ensureConciergeProvider(ctx context.Context, workspaceID string, envVars map[string]string) { // Respect an explicit provider already set (customer canvas pick or a prior // seed): loadWorkspaceSecrets already injected it into envVars. Do nothing. - if existing := readStoredProviderSecret(ctx, workspaceID); existing != "" { + // + // Fail-CLOSED on decrypt/read error (core#3162): if the secret store returned + // an error rather than a clean "" (the row exists but the value is unreadable + // — transient DB blip, key-rotation drift, ciphertext corruption), we MUST NOT + // fall through to the platform-provider pin below. Doing so would wedge a + // transient decrypt failure into a fail-OPEN platform mis-pin on the next + // provision — combined with a momentarily-empty MODEL, that could silently + // route a BYOK/self-host concierge through the platform LLM proxy. + if existing, readErr := readStoredProviderSecret(ctx, workspaceID); readErr != nil { + log.Printf("Provisioner: concierge %s LLM_PROVIDER read failed (failing closed, NOT seeding): %v", workspaceID, readErr) + return + } else if existing != "" { return } @@ -524,22 +536,47 @@ func (h *WorkspaceHandler) ensureConciergeProvider(ctx context.Context, workspac } } -// readStoredProviderSecret returns the decrypted LLM_PROVIDER workspace_secret, -// or "" when none is stored (or on any read/decrypt error — treated as "unset" -// so a transient miss re-seeds rather than wedges). Mirrors readStoredModelSecret. -func readStoredProviderSecret(ctx context.Context, workspaceID string) string { +// readStoredProviderSecret returns the decrypted LLM_PROVIDER workspace_secret. +// The second return value distinguishes the three observed states so the caller +// can fail closed rather than fail open on a transient error: +// +// - (value, nil): a secret is stored and decrypted successfully → caller +// respects the existing provider pin and skips the seed. +// - ("", nil): no row exists for this workspace/key → caller may re-seed +// safely (this is the fresh-boot / cleared-secret case). +// - ("", error): the row exists (or the read otherwise succeeded) but the +// decryption failed → caller MUST NOT treat this as "unset" and MUST NOT +// fall back to seeding the platform provider. Returning "" without the +// error used to wedge a transient decrypt failure into a fail-OPEN +// platform-pin (see core#3162): combined with a momentarily-empty MODEL, +// a BYOK/self-host concierge could be silently mis-routed onto the +// platform LLM proxy on the next provision. +// +// `sql.ErrNoRows` is the canonical "no row" case and is mapped to ("", nil). +// Any other Scan error or a DecryptVersioned error is treated as the +// error-case and returned as ("", err). +// +// NOTE: this fix is scoped to the BYOK fail-open path (core#3162). +// `readStoredModelSecret` has the same shape but is intentionally out of scope +// per the issue body and PM's scope discipline (one item, don't bundle). +// Tracking separately. +func readStoredProviderSecret(ctx context.Context, workspaceID string) (string, error) { var stored []byte var version int - if err := db.DB.QueryRowContext(ctx, + err := db.DB.QueryRowContext(ctx, `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = $1 AND key = 'LLM_PROVIDER'`, - workspaceID).Scan(&stored, &version); err != nil { - return "" - } - dec, err := crypto.DecryptVersioned(stored, version) + workspaceID).Scan(&stored, &version) if err != nil { - return "" + if errors.Is(err, sql.ErrNoRows) { + return "", nil + } + return "", fmt.Errorf("readStoredProviderSecret scan: %w", err) } - return string(dec) + dec, derr := crypto.DecryptVersioned(stored, version) + if derr != nil { + return "", fmt.Errorf("readStoredProviderSecret decrypt: %w", derr) + } + return string(dec), nil } // setProviderSecret persists (or clears, when provider == "") the LLM_PROVIDER diff --git a/workspace-server/internal/handlers/platform_agent_test.go b/workspace-server/internal/handlers/platform_agent_test.go index 539f9b03..f0379bde 100644 --- a/workspace-server/internal/handlers/platform_agent_test.go +++ b/workspace-server/internal/handlers/platform_agent_test.go @@ -5,6 +5,7 @@ import ( "context" "database/sql" "encoding/json" + "fmt" "net/http" "net/http/httptest" "os/exec" @@ -1039,3 +1040,103 @@ func TestRecordDeclaredPlugin_PrivilegedPluginEntitlement(t *testing.T) { } }) } + +// TestEnsureConciergeProvider_FailsClosedOnReadError is the CI regression gate +// for core#3162 (BYOK fail-open): a transient decrypt/read error on an EXISTING +// LLM_PROVIDER row used to be collapsed into "" and treated as "unset", which +// combined with a momentarily-empty MODEL could silently mis-pin a BYOK/self-host +// concierge onto the platform LLM proxy. The fix returns the error to the +// caller; ensureConciergeProvider MUST fail closed (return without seeding) so +// the next provision re-tries rather than silently mis-routing. +func TestEnsureConciergeProvider_FailsClosedOnReadError(t *testing.T) { + h := &WorkspaceHandler{} + const providerSelQuery = `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = \$1 AND key = 'LLM_PROVIDER'` + const secretInsert = `INSERT INTO workspace_secrets` + + t.Run("decrypt error on existing row fails closed (does NOT seed platform)", func(t *testing.T) { + // Real encrypted_value bytes that cannot be decrypted by the current + // key/algorithm: forces crypto.DecryptVersioned to return an error. + // This is the realistic "the row exists but the ciphertext is unreadable" + // case — exactly the failure mode that previously fell through to a + // fail-OPEN platform pin. + mock := setupTestDB(t) + mock.ExpectQuery(providerSelQuery).WithArgs("ws-decrypt-fail"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}). + AddRow([]byte("corrupt-ciphertext-that-cannot-be-decrypted"), 0)) + // NO ExpectExec: the fail-closed path MUST NOT persist LLM_PROVIDER. + + env := map[string]string{} // empty MODEL — the mis-pin window + h.ensureConciergeProvider(context.Background(), "ws-decrypt-fail", env) + + if _, pinned := env["LLM_PROVIDER"]; pinned { + t.Errorf("transient decrypt error caused a platform provider pin (env=%v) — would mis-route a BYOK/self-host concierge", env) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations (a pin INSERT means the fail-closed path leaked): %v", err) + } + }) + + t.Run("db scan error (non-ErrNoRows) fails closed", func(t *testing.T) { + // A connection error / context-cancellation on the secret lookup. Not + // the clean "row doesn't exist" case (sql.ErrNoRows) — this is a real + // failure that must also fail closed. + mock := setupTestDB(t) + mock.ExpectQuery(providerSelQuery).WithArgs("ws-scan-fail"). + WillReturnError(fmt.Errorf("connection refused")) + // NO ExpectExec: fail closed. + + env := map[string]string{} + h.ensureConciergeProvider(context.Background(), "ws-scan-fail", env) + + if _, pinned := env["LLM_PROVIDER"]; pinned { + t.Errorf("DB scan error caused a platform provider pin (env=%v)", env) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } + }) + + t.Run("sql.ErrNoRows (genuine unset) proceeds to seed (no regression)", func(t *testing.T) { + // The clean "no row exists" case MUST still let the caller proceed to + // the platform-provider seed. This is the fresh-boot / cleared-secret + // case the existing happy-path tests cover; we re-pin it here so a + // future refactor of the error-path can break it loudly. + mock := setupTestDB(t) + mock.ExpectQuery(providerSelQuery).WithArgs("ws-fresh-unset"). + WillReturnError(sql.ErrNoRows) + mock.ExpectExec(secretInsert). + WithArgs("ws-fresh-unset", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + + env := map[string]string{} // empty MODEL → seed path + h.ensureConciergeProvider(context.Background(), "ws-fresh-unset", env) + + if env["LLM_PROVIDER"] != conciergeProvider { + t.Errorf("genuine unset did not pin LLM_PROVIDER=%q; got %q (env=%v) — regression on the seed path", conciergeProvider, env["LLM_PROVIDER"], env) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations (the seed path did not persist): %v", err) + } + }) + + t.Run("existing stored provider is respected on successful read (no regression)", func(t *testing.T) { + // Existing happy path: a stored provider row reads cleanly → caller + // returns early without pinning. Pinned here as the regression + // sentinel for the successful-read branch. + mock := setupTestDB(t) + mock.ExpectQuery(providerSelQuery).WithArgs("ws-existing-prov"). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}). + AddRow([]byte("customer-picked-byok-provider"), 0)) + // NO ExpectExec: existing provider wins → no pin. + + env := map[string]string{} + h.ensureConciergeProvider(context.Background(), "ws-existing-prov", env) + + if _, pinned := env["LLM_PROVIDER"]; pinned { + t.Errorf("existing stored provider wrongly pinned platform (env=%v) — would override the customer pick", env) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } + }) +} -- 2.52.0