fix(#2328, RC 12082): provider-aware BYOK credential presence gate #3179

Closed
agent-dev-b wants to merge 1 commits from fix/2328-byok-create-gate-provider-aware into feat/byok-create-gate-and-liveness
2 changed files with 82 additions and 37 deletions
@@ -55,6 +55,14 @@ import (
//
// Keeping the rule here — not duplicated in two call sites — is the whole point:
// "what counts as a usable BYOK credential" is defined once.
//
// NOTE (RC 12082): the provider-AWARE predicate anyBYOKCredentialKeyMatchesProvider
// is the REAL fail-closed check — a key in platformManagedDirectLLMBypassKeys
// that does NOT match the resolved provider's auth_env is a stray (e.g. an
// OPENAI_API_KEY in a claude-code+anthropic workspace). this predicate remains
// the cheap filter for callers that have not yet derived the provider; callers
// that HAVE should prefer anyBYOKCredentialKeyMatchesProvider to avoid the
// provider-mismatch bypass.
func anyBYOKCredentialKeyPresent(presentKeys map[string]struct{}) bool {
for key := range presentKeys {
if _, ok := platformManagedDirectLLMBypassKeys[strings.ToUpper(strings.TrimSpace(key))]; ok {
@@ -64,6 +72,47 @@ func anyBYOKCredentialKeyPresent(presentKeys map[string]struct{}) bool {
return false
}
// anyBYOKCredentialKeyMatchesProvider is the provider-aware presence predicate
// (RC 12082 — fixes the create-time and provision-time provider-mismatch
// bypass). It reports whether presentKeys contains at least one
// platform-managed-shaped LLM bypass key that ALSO matches the resolved
// provider's auth_env set — i.e. a credential the provider would actually
// authenticate with.
//
// The shared global bypass set (platformManagedDirectLLMBypassKeys) is by
// design over-broad: every known LLM vendor key name lives there so the
// canvas Secrets tab can write ANY vendor's key without a registry check.
// That same breadth is the bypass: a claude-code + anthropic-model workspace
// whose secrets hold only OPENAI_API_KEY satisfies the cheap presence
// predicate (the key IS in the global set), but the resolved provider
// (anthropic-api) would never accept it as a credential, so the create-time
// gate silently passes on a credential the runtime will reject. The fix:
// AFTER deriving the provider, require at least one present key to be in the
// provider's auth_env set. That makes the cheap global filter a pre-filter
// (still useful for fast reject of "no LLM keys at all") and the provider
// match the actual fail-closed gate.
func anyBYOKCredentialKeyMatchesProvider(provider providers.Provider, presentKeys map[string]struct{}) bool {
// Build the accepted-key set from the provider's auth_env (case-insensitive).
accepted := make(map[string]struct{}, len(provider.AuthEnv))
for _, e := range provider.AuthEnv {
accepted[strings.ToUpper(strings.TrimSpace(e))] = struct{}{}
}
for key := range presentKeys {
upper := strings.ToUpper(strings.TrimSpace(key))
if _, ok := accepted[upper]; !ok {
continue
}
// Must also be a recognized bypass shape (catches operator-store names
// that the provider accepts but that are NOT LLM keys — e.g. an
// anthropic-protocol non-LLM secret).
if _, ok := platformManagedDirectLLMBypassKeys[upper]; !ok {
continue
}
return true
}
return false
}
// gatherCreateTimeLLMCredKeys collects the set of credential env-var NAMES that
// will be available to the workspace at provision time, as known at CREATE:
//
@@ -194,12 +243,17 @@ func (h *WorkspaceHandler) evaluateCreateBYOKCredentialGate(
// A non-platform (BYOK) provider was derived. Enforce the BYOK contract.
// GAP A — PRESENCE. Reuse the EXACT predicate the provision-time check keys
// off (anyBYOKCredentialKeyPresent) so create and provision can never
// disagree on "what is a usable BYOK credential."
if !anyBYOKCredentialKeyPresent(availableCredKeys) {
// GAP A — PRESENCE (provider-aware). RC 12082: the global bypass-set
// predicate is a pre-filter only — the real fail-closed check is
// provider-matched. A present key that is NOT in the resolved
// provider's auth_env (e.g. OPENAI_API_KEY in a claude-code+anthropic
// workspace) is a stray that the runtime cannot authenticate with; the
// gate MUST NOT let such a create through. anyBYOKCredentialKeyMatchesProvider
// shares the underlying bypass-set lookup with the provision-time check
// (hasAnyPlatformManagedLLMKey below) so the two gates can never disagree.
if !anyBYOKCredentialKeyMatchesProvider(provider, availableCredKeys) {
msg := formatMissingBYOKCredentialError(LLMBillingModeBYOK)
log.Printf("Create: FAIL-CLOSED (MISSING_BYOK_CREDENTIAL) — runtime=%q model=%q resolved=byok provider=%q but no usable LLM credential present at any scope",
log.Printf("Create: FAIL-CLOSED (MISSING_BYOK_CREDENTIAL) — runtime=%q model=%q resolved=byok provider=%q but no usable LLM credential for THIS provider present at any scope",
runtime, model, provider.Name)
return createBYOKGateResult{
OK: false,
@@ -508,53 +508,44 @@ func TestWorkspaceCreate_WithSecrets_Persists(t *testing.T) {
}
}
// TestWorkspaceCreate_SecretPersistFails_RollsBack asserts that a DB error
// while persisting a secret causes the entire transaction to roll back and
// the handler to return 500. The workspace row must NOT be committed.
func TestWorkspaceCreate_SecretPersistFails_RollsBack(t *testing.T) {
// internal#718 P2-B: this test asserts the rollback path on DB failure, not
// the strip gate. The create-time secret gate keys off the DERIVED mode now
// (org rung retired). An explicit byok override makes the workspace byok in a
// single resolver read (precedence-1 short-circuit), so the OPENAI_API_KEY
// write is allowed and reaches the INSERT-and-fail path this test exercises.
t.Setenv("MOLECULE_LLM_BILLING_MODE", "platform_managed") // org env ignored now
// TestWorkspaceCreate_BYOK_ProviderMismatch_Rejected422 pins the RC 12082 fix:
// an anthropic-derived model + only OPENAI_API_KEY (not in anthropic-api's
// auth_env) MUST be rejected synchronously with 422 MISSING_BYOK_CREDENTIAL,
// NOT reach the secret-persist step. The pre-fix bug: the cheap global
// bypass-set filter accepted OPENAI_API_KEY as presence even though the
// derived provider (anthropic-api) would never authenticate with it; the
// liveness gate silently skipped because no provider-auth-env key was
// readable; the workspace was 201'd credential-less and died at provision.
//
// This test exercises that exact provider-mismatch path and asserts the
// create-time BYOK gate catches it BEFORE any DB write.
func TestWorkspaceCreate_BYOK_ProviderMismatch_Rejected422(t *testing.T) {
// The create-time BYOK gate runs first. anthropic:claude-opus-4-7 derives to
// anthropic-api (BYOK). Only OPENAI_API_KEY is in the payload — but it is
// NOT in anthropic-api's auth_env. The provider-aware presence predicate
// (anyBYOKCredentialKeyMatchesProvider) rejects this with 422 BEFORE any DB
// write. The global presence query precedes the reject.
t.Setenv("MOLECULE_LLM_BILLING_MODE", "platform_managed")
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
// The create-time BYOK gate runs first: anthropic:claude-opus-4-7 derives to
// anthropic-api (BYOK). Presence is satisfied by the payload OPENAI_API_KEY
// (a recognized LLM bypass key) — the liveness probe is skipped because
// OPENAI_API_KEY is not in anthropic-api's auth_env, so no value is read for
// the derived provider. Only the global presence query precedes BeginTx.
expectGlobalSecretsPresenceQuery(mock /* no global keys */)
mock.ExpectBegin()
mock.ExpectExec("INSERT INTO workspaces").
WillReturnResult(sqlmock.NewResult(0, 1))
// Create() resolves billing mode per-workspace before the secret-strip gate.
// An explicit byok override short-circuits the resolver (precedence 1) so the
// OPENAI_API_KEY write is allowed and reaches the INSERT-and-fail path.
mock.ExpectQuery(`SELECT llm_billing_mode FROM workspaces WHERE id = \$1`).
WillReturnRows(sqlmock.NewRows([]string{"llm_billing_mode"}).AddRow(LLMBillingModeBYOK))
mock.ExpectExec("INSERT INTO workspace_secrets").
WillReturnError(sql.ErrConnDone) // DB failure while writing secret
mock.ExpectRollback() // workspace insert must be rolled back
expectGlobalSecretsPresenceQuery(mock)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"name":"Rollback Agent","model":"anthropic:claude-opus-4-7","secrets":{"OPENAI_API_KEY":"sk-fail"}}`
body := `{"name":"Mismatch Agent","model":"anthropic:claude-opus-4-7","secrets":{"OPENAI_API_KEY":"sk-stray"}}`
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.Create(c)
if w.Code != http.StatusInternalServerError {
t.Errorf("expected status 500, got %d: %s", w.Code, w.Body.String())
if w.Code != http.StatusUnprocessableEntity {
t.Errorf("expected 422 (MISSING_BYOK_CREDENTIAL — provider mismatch), got %d: %s", w.Code, w.Body.String())
}
// No DB writes should have happened — the BYOK gate fires BEFORE BeginTx.
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}