feat(byok): create-time credential gate + liveness preflight (fail-closed, no silent skip) #2328
@@ -0,0 +1,514 @@
|
||||
package handlers
|
||||
|
||||
// byok_credential_gate.go — fail-closed BYOK credential checks at the CREATE
|
||||
// boundary (molecule-core, BYOK billing-model gap-closure).
|
||||
//
|
||||
// Two related gaps in the fail-closed BYOK billing model are closed here, both
|
||||
// in the same credential-validation area:
|
||||
//
|
||||
// GAP A — PRESENCE gate at CREATE. The provision-time fail-closed check
|
||||
// (workspace_provision_shared.go) already aborts a BYOK provision with no
|
||||
// usable credential, but that fires AFTER a 201 and a goroutine launch — a
|
||||
// credential-less BYOK create looks successful, then dies late in provision.
|
||||
// This gate moves the SAME presence rule to the synchronous create path so a
|
||||
// credential-less BYOK create is a 4xx, not a 201-then-late-failure. It
|
||||
// reuses anyBYOKCredentialKeyPresent — the exact predicate the provision
|
||||
// check keys off (via hasAnyPlatformManagedLLMKey) — so the two checks can
|
||||
// never diverge.
|
||||
//
|
||||
// GAP B — LIVENESS preflight. Presence is not validity: a BYOK workspace can
|
||||
// carry a PRESENT-but-DEAD token (revoked/expired) and still 201 + provision,
|
||||
// only to wedge at first LLM call. This gate makes the cheapest authenticated
|
||||
// probe (a models-list GET, else a 1-token no-op completion) against the
|
||||
// provider's base_url (derived from providers.yaml — the SSOT) and FAILS
|
||||
// CLOSED on a hard auth failure (401/403). The crux distinction, documented
|
||||
// on probeBYOKCredentialLiveness: a DEAD token (401/403) MUST block; a
|
||||
// TRANSIENT failure (network/timeout/5xx) MUST NOT block (a flaky upstream
|
||||
// cannot be allowed to wedge every create) — it logs loudly and allows.
|
||||
//
|
||||
// Both gates are scoped to the resolved billing mode == byok. platform_managed
|
||||
// (the CP proxy owns the credential) and disabled (no platform-billed LLM) are
|
||||
// untouched.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/providers"
|
||||
)
|
||||
|
||||
// anyBYOKCredentialKeyPresent is the SINGLE presence predicate shared by the
|
||||
// create-time gate (this file) and the provision-time check
|
||||
// (hasAnyPlatformManagedLLMKey). It reports whether the supplied set of present
|
||||
// credential env-var NAMES contains at least one platform-managed-shaped LLM
|
||||
// bypass key — the tenant's own usable BYOK credential. The caller is
|
||||
// responsible for only including keys whose VALUE is non-empty (the provision
|
||||
// path filters on TrimSpace != "", the create path includes only non-empty
|
||||
// payload secrets + configured global_secrets keys).
|
||||
//
|
||||
// Keeping the rule here — not duplicated in two call sites — is the whole point:
|
||||
// "what counts as a usable BYOK credential" is defined once.
|
||||
func anyBYOKCredentialKeyPresent(presentKeys map[string]struct{}) bool {
|
||||
for key := range presentKeys {
|
||||
if _, ok := platformManagedDirectLLMBypassKeys[strings.ToUpper(strings.TrimSpace(key))]; ok {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// gatherCreateTimeLLMCredKeys collects the set of credential env-var NAMES that
|
||||
// will be available to the workspace at provision time, as known at CREATE:
|
||||
//
|
||||
// - the create payload's own secrets (workspace scope), non-empty values only;
|
||||
// - the tenant's configured global_secrets keys (operator/global scope).
|
||||
//
|
||||
// It returns ONLY non-empty keys, so the set can be passed straight to
|
||||
// anyBYOKCredentialKeyPresent (presence) and is the right input for the
|
||||
// auth-env disambiguation DeriveProvider uses. Returns the full key set (not
|
||||
// just bypass keys) because DeriveProvider's tie-break wants every recognized
|
||||
// auth-env name; the bypass-key filter is applied by anyBYOKCredentialKeyPresent.
|
||||
//
|
||||
// The global_secrets read is presence-only (key names, never values) — values
|
||||
// are decrypted only by the liveness probe, and only for the resolved provider's
|
||||
// own auth_env keys.
|
||||
func gatherCreateTimeLLMCredKeys(ctx context.Context, payloadSecrets map[string]string) map[string]struct{} {
|
||||
keys := map[string]struct{}{}
|
||||
for k, v := range payloadSecrets {
|
||||
if strings.TrimSpace(v) != "" {
|
||||
keys[k] = struct{}{}
|
||||
}
|
||||
}
|
||||
// Global/operator-scope keys (presence only). Best-effort: a read error
|
||||
// degrades to "only payload secrets" — the gate then fails CLOSED if the
|
||||
// payload alone lacks a credential, which is the safe direction (a byok
|
||||
// create that depends on a global cred we could not enumerate is rejected
|
||||
// with an actionable error rather than silently provisioned).
|
||||
globalKeys, err := loadConfiguredGlobalSecretKeys(ctx)
|
||||
if err != nil {
|
||||
log.Printf("byok_credential_gate: WARN — could not load global_secrets keys for create-time BYOK check: %v (proceeding with payload secrets only)", err)
|
||||
return keys
|
||||
}
|
||||
for k := range globalKeys {
|
||||
keys[k] = struct{}{}
|
||||
}
|
||||
return keys
|
||||
}
|
||||
|
||||
// availableAuthEnvNamesFromKeySet returns the subset of presentKeys that are
|
||||
// recognized provider auth-env names — the DeriveProvider tie-break input — so
|
||||
// the create-time derive splits oauth-vs-api the same way the provision path
|
||||
// does. Mirror of availableAuthEnvNames but over a key SET (the create path has
|
||||
// no env-value map, only presence).
|
||||
func availableAuthEnvNamesFromKeySet(presentKeys map[string]struct{}) []string {
|
||||
authSet := authEnvNameSet()
|
||||
var out []string
|
||||
for k := range presentKeys {
|
||||
if _, ok := authSet[k]; ok {
|
||||
out = append(out, k)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// createBYOKGateResult is the outcome of the create-time BYOK gate. Code is the
|
||||
// structured error code ("" on pass); Message is the user-visible reason; HTTP
|
||||
// is the status to return; Provider names the derived provider when known
|
||||
// (surfaced in logs + the structured error). When OK is true the create
|
||||
// proceeds.
|
||||
type createBYOKGateResult struct {
|
||||
OK bool
|
||||
Code string
|
||||
Message string
|
||||
HTTP int
|
||||
Provider string
|
||||
}
|
||||
|
||||
// passCreateBYOKGate is the proceed sentinel.
|
||||
func passCreateBYOKGate() createBYOKGateResult { return createBYOKGateResult{OK: true} }
|
||||
|
||||
// evaluateCreateBYOKCredentialGate runs GAP A (presence) and GAP B (liveness)
|
||||
// at the create boundary for a (runtime, model) pair, given the credential keys
|
||||
// that will be available at provision time and a probe function (injected so
|
||||
// tests mock the network — no real outbound in unit tests).
|
||||
//
|
||||
// Flow:
|
||||
// 1. DERIVE the provider from (runtime, model) via the registry SSOT and key
|
||||
// the byok-vs-platform decision off IsPlatform(derived) — the SAME rule
|
||||
// ResolveLLMBillingModeDerived's precedence-2 uses. We derive DIRECTLY here
|
||||
// (not via ResolveLLMBillingModeDerived) because at create there is NO
|
||||
// workspace row yet: that resolver short-circuits a "" workspaceID to the
|
||||
// pure platform-default (its pre-provision contract) and would never see
|
||||
// byok. There is also no operator-override column to honor pre-row, so a
|
||||
// direct derive is both correct and sufficient. A derive failure
|
||||
// (unregistered/ambiguous) → default-closed to platform (NOT byok), exactly
|
||||
// as the resolver does — never fail a credential-less create as byok on an
|
||||
// underivable model. (The model-registry gate upstream already rejects
|
||||
// unregistered models, so this is belt-and-suspenders.)
|
||||
// 2. GAP A: if byok and NO usable credential key is present → reject
|
||||
// MISSING_BYOK_CREDENTIAL (4xx, synchronous).
|
||||
// 3. GAP B: a credential IS present — probe its LIVENESS. Fail CLOSED only on a
|
||||
// hard auth failure (401/403); a transient/network/5xx/timeout logs loudly
|
||||
// and ALLOWS (see probeBYOKCredentialLiveness for the crux rationale).
|
||||
//
|
||||
// External workspaces are handled by the caller (they have no model/provider
|
||||
// contract) — this function assumes a spawned-runtime create.
|
||||
func (h *WorkspaceHandler) evaluateCreateBYOKCredentialGate(
|
||||
ctx context.Context,
|
||||
runtime, model string,
|
||||
availableCredKeys map[string]struct{},
|
||||
credValueForKey func(ctx context.Context, key string) (string, bool),
|
||||
probe byokLivenessProbe,
|
||||
) createBYOKGateResult {
|
||||
authEnv := availableAuthEnvNamesFromKeySet(availableCredKeys)
|
||||
|
||||
manifest, mErr := providerRegistry()
|
||||
if mErr != nil || manifest == nil {
|
||||
// Registry unavailable (a build-time defect the verify-providers-gen /
|
||||
// sync gates catch). We cannot derive the mode — default-closed to
|
||||
// platform (not byok), so the gate enforces nothing. Mirrors the
|
||||
// resolver's own registry-load fail-direction.
|
||||
log.Printf("byok_credential_gate: WARN — provider registry unavailable, skipping BYOK create gate (runtime=%q model=%q): %v", runtime, model, mErr)
|
||||
return passCreateBYOKGate()
|
||||
}
|
||||
provider, dErr := manifest.DeriveProvider(runtime, model, authEnv)
|
||||
if dErr != nil {
|
||||
// No model / unknown runtime / unregistered / ambiguous → default-closed
|
||||
// to platform (NOT byok). A derive miss never fails a create as byok.
|
||||
log.Printf("byok_credential_gate: provider underivable for create (runtime=%q model=%q): %v — treating as non-byok (default-closed to platform)", runtime, model, dErr)
|
||||
return passCreateBYOKGate()
|
||||
}
|
||||
if provider.IsPlatform() {
|
||||
// platform_managed — the CP proxy owns the credential; nothing for this
|
||||
// gate to enforce. (disabled is never DERIVED — it is only an explicit
|
||||
// operator override, which does not exist pre-row.)
|
||||
return passCreateBYOKGate()
|
||||
}
|
||||
|
||||
// A non-platform (BYOK) provider was derived. Enforce the BYOK contract.
|
||||
|
||||
// GAP A — PRESENCE. Reuse the EXACT predicate the provision-time check keys
|
||||
// off (anyBYOKCredentialKeyPresent) so create and provision can never
|
||||
// disagree on "what is a usable BYOK credential."
|
||||
if !anyBYOKCredentialKeyPresent(availableCredKeys) {
|
||||
msg := formatMissingBYOKCredentialError(LLMBillingModeBYOK)
|
||||
log.Printf("Create: FAIL-CLOSED (MISSING_BYOK_CREDENTIAL) — runtime=%q model=%q resolved=byok provider=%q but no usable LLM credential present at any scope",
|
||||
runtime, model, provider.Name)
|
||||
return createBYOKGateResult{
|
||||
OK: false,
|
||||
Code: "MISSING_BYOK_CREDENTIAL",
|
||||
Message: msg,
|
||||
HTTP: http.StatusUnprocessableEntity,
|
||||
Provider: provider.Name,
|
||||
}
|
||||
}
|
||||
|
||||
// GAP B — LIVENESS. A credential IS present; probe that it actually works.
|
||||
// Disabled when no probe is wired (provision-only deployments / probe off).
|
||||
if probe == nil {
|
||||
return passCreateBYOKGate()
|
||||
}
|
||||
return runBYOKLivenessGate(ctx, provider, availableCredKeys, credValueForKey, probe)
|
||||
}
|
||||
|
||||
// runBYOKLivenessGate selects the credential VALUE for the derived provider's
|
||||
// auth_env and runs the liveness probe. Fails CLOSED only on a hard auth
|
||||
// failure; transient failures (and an unreadable credential value) pass.
|
||||
func runBYOKLivenessGate(
|
||||
ctx context.Context,
|
||||
provider providers.Provider,
|
||||
availableCredKeys map[string]struct{},
|
||||
credValueForKey func(ctx context.Context, key string) (string, bool),
|
||||
probe byokLivenessProbe,
|
||||
) createBYOKGateResult {
|
||||
if credValueForKey == nil {
|
||||
// No value resolver wired — presence already passed; nothing to probe.
|
||||
return passCreateBYOKGate()
|
||||
}
|
||||
// Pick the FIRST of the provider's auth_env keys that is both present and
|
||||
// readable. We probe a single credential — the one the provider would
|
||||
// actually authenticate with — not every key.
|
||||
credKey, credVal := "", ""
|
||||
for _, want := range provider.AuthEnv {
|
||||
if _, present := availableCredKeys[want]; !present {
|
||||
continue
|
||||
}
|
||||
if v, ok := credValueForKey(ctx, want); ok && strings.TrimSpace(v) != "" {
|
||||
credKey, credVal = want, v
|
||||
break
|
||||
}
|
||||
}
|
||||
if credKey == "" {
|
||||
// The present key(s) for this provider could not be read back (decrypt
|
||||
// miss / value lives only in operator store we don't decrypt here).
|
||||
// Presence already passed; we cannot probe a value we don't have, so
|
||||
// allow — the provision-time path will still re-check presence, and a
|
||||
// truly dead token surfaces there. Do NOT fail closed on an unreadable
|
||||
// value (that would block legitimate global-scope creds).
|
||||
log.Printf("byok_credential_gate: liveness — provider=%q has a present auth key but no readable value at create; skipping probe (presence already satisfied)", provider.Name)
|
||||
return passCreateBYOKGate()
|
||||
}
|
||||
|
||||
outcome := probeBYOKCredentialLiveness(ctx, provider, credKey, credVal, probe)
|
||||
switch outcome.kind {
|
||||
case livenessDead:
|
||||
msg := formatInvalidBYOKCredentialError(provider.Name, credKey, outcome.status)
|
||||
log.Printf("Create: FAIL-CLOSED (BYOK_CREDENTIAL_INVALID) — provider=%q auth_env=%q probe returned HTTP %d (dead/revoked credential)",
|
||||
provider.Name, credKey, outcome.status)
|
||||
return createBYOKGateResult{
|
||||
OK: false,
|
||||
Code: "BYOK_CREDENTIAL_INVALID",
|
||||
Message: msg,
|
||||
HTTP: http.StatusUnprocessableEntity,
|
||||
Provider: provider.Name,
|
||||
}
|
||||
case livenessTransient:
|
||||
// LOUD log, but ALLOW — a flaky upstream / network blip / 5xx must not
|
||||
// block every create. This is the documented transient-vs-dead split.
|
||||
log.Printf("Create: BYOK liveness probe TRANSIENT for provider=%q auth_env=%q (status=%d err=%v) — NOT blocking (transient != dead); provision-time re-check remains",
|
||||
provider.Name, credKey, outcome.status, outcome.err)
|
||||
return passCreateBYOKGate()
|
||||
default: // livenessOK
|
||||
log.Printf("Create: BYOK liveness probe OK for provider=%q auth_env=%q (status=%d)", provider.Name, credKey, outcome.status)
|
||||
return passCreateBYOKGate()
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// GAP B — liveness probe
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
// byokLivenessProbe is the injected outbound-HTTP seam: given a prepared
|
||||
// request it returns the response (or a transport error). Production wires the
|
||||
// real http.Client (see defaultBYOKLivenessProbe); tests inject an httptest /
|
||||
// stub so NO real network is touched. The probe MUST honor the request context
|
||||
// deadline.
|
||||
type byokLivenessProbe func(req *http.Request) (*http.Response, error)
|
||||
|
||||
// byokProbeTimeout is the hard cap on a single liveness probe. Kept tight (the
|
||||
// probe is on the synchronous create path) — long enough for a healthy
|
||||
// models-list GET, short enough that a hung upstream degrades to a TRANSIENT
|
||||
// (allow) rather than wedging the create.
|
||||
const byokProbeTimeout = 6 * time.Second
|
||||
|
||||
type livenessKind int
|
||||
|
||||
const (
|
||||
livenessOK livenessKind = iota
|
||||
// livenessDead — a HARD auth failure (401/403). The credential is present
|
||||
// but rejected: revoked, expired, or wrong scope. MUST fail closed.
|
||||
livenessDead
|
||||
// livenessTransient — network error, timeout, 5xx, or any non-auth status.
|
||||
// The credential MIGHT be fine; the upstream is unreachable/unhappy right
|
||||
// now. MUST NOT fail closed (a flaky network cannot block all creates).
|
||||
livenessTransient
|
||||
)
|
||||
|
||||
type livenessOutcome struct {
|
||||
kind livenessKind
|
||||
status int // HTTP status observed (0 when no response — transport error/timeout)
|
||||
err error // transport/timeout error when kind==livenessTransient with no response
|
||||
}
|
||||
|
||||
// probeBYOKCredentialLiveness makes ONE cheapest-possible authenticated call to
|
||||
// the provider's upstream and classifies the result.
|
||||
//
|
||||
// ============================ THE CRUX ============================
|
||||
// The auth-failure vs transient-failure distinction is the entire point of this
|
||||
// probe, and getting the FAIL DIRECTION right is security-vs-availability
|
||||
// critical:
|
||||
//
|
||||
// - 401 / 403 → livenessDead → FAIL CLOSED. The upstream positively rejected
|
||||
// the credential. A present-but-dead token is exactly the gap GAP B closes
|
||||
// (a revoked key that would 201 + provision + wedge at first call). Blocking
|
||||
// here gives the user synchronous, actionable feedback.
|
||||
//
|
||||
// - network error / DNS / connection refused / TLS / TIMEOUT → livenessTransient
|
||||
// → ALLOW (loud log). The credential's validity is UNKNOWN — the upstream is
|
||||
// unreachable. Failing closed here would let a single flaky network minute,
|
||||
// or an upstream we cannot reach from this VPC, block EVERY byok create. The
|
||||
// credential may be perfectly valid; we must not punish it for the network.
|
||||
//
|
||||
// - 5xx (500/502/503/504) → livenessTransient → ALLOW. The upstream is having
|
||||
// a bad time; this says NOTHING about the credential. Same reasoning as a
|
||||
// network error.
|
||||
//
|
||||
// - 429 (rate limited) → livenessTransient → ALLOW. A rate-limit means the key
|
||||
// is being ACCEPTED (you cannot be rate-limited without authenticating) —
|
||||
// the opposite of dead. Never block on it.
|
||||
//
|
||||
// - 2xx (and any other non-auth, non-5xx 4xx like 400/404) → livenessOK. A 2xx
|
||||
// is an unambiguous pass. A 400/404 means the endpoint authenticated us and
|
||||
// then disliked the request shape (e.g. a provider without a models-list
|
||||
// route) — auth succeeded, so it is NOT a dead credential. Treat as OK
|
||||
// rather than transient so a benign endpoint-shape mismatch never blocks.
|
||||
//
|
||||
// A DEAD token (401/403) MUST block; a flaky network MUST NOT. That asymmetry is
|
||||
// the contract.
|
||||
// =================================================================
|
||||
func probeBYOKCredentialLiveness(parent context.Context, provider providers.Provider, credKey, credVal string, probe byokLivenessProbe) livenessOutcome {
|
||||
ctx, cancel := context.WithTimeout(parent, byokProbeTimeout)
|
||||
defer cancel()
|
||||
|
||||
req, err := buildBYOKLivenessRequest(ctx, provider, credKey, credVal)
|
||||
if err != nil {
|
||||
// We could not even construct a probe target (no base_url for this
|
||||
// provider — e.g. an OAuth arm the CLI dials directly, no HTTP surface we
|
||||
// can reach). Treat as transient/allow: we have NO evidence the
|
||||
// credential is dead, and presence already passed. Never fail closed on
|
||||
// "we don't know how to probe this provider."
|
||||
return livenessOutcome{kind: livenessTransient, err: err}
|
||||
}
|
||||
|
||||
resp, err := probe(req)
|
||||
if err != nil {
|
||||
// Transport error or context deadline (timeout). Validity unknown → allow.
|
||||
return livenessOutcome{kind: livenessTransient, err: err}
|
||||
}
|
||||
defer func() {
|
||||
// Drain + close so the connection can be reused and we don't leak.
|
||||
_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 4<<10))
|
||||
_ = resp.Body.Close()
|
||||
}()
|
||||
|
||||
switch {
|
||||
case resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden:
|
||||
// 401 / 403 — the upstream positively rejected the credential. DEAD.
|
||||
return livenessOutcome{kind: livenessDead, status: resp.StatusCode}
|
||||
case resp.StatusCode == http.StatusTooManyRequests:
|
||||
// 429 — rate limited means the key authenticated. NOT dead.
|
||||
return livenessOutcome{kind: livenessTransient, status: resp.StatusCode}
|
||||
case resp.StatusCode >= 500:
|
||||
// 5xx — upstream trouble, says nothing about the credential.
|
||||
return livenessOutcome{kind: livenessTransient, status: resp.StatusCode}
|
||||
default:
|
||||
// 2xx, or a non-auth 4xx (400/404 — auth succeeded, request shape
|
||||
// disliked). Either way the credential was NOT rejected. OK.
|
||||
return livenessOutcome{kind: livenessOK, status: resp.StatusCode}
|
||||
}
|
||||
}
|
||||
|
||||
// buildBYOKLivenessRequest constructs the cheapest authenticated probe request
|
||||
// from the provider's providers.yaml row — provider-generic, no per-provider
|
||||
// URL hardcoding beyond what the SSOT supplies.
|
||||
//
|
||||
// Endpoint: prefer a models-list GET on the OpenAI-protocol base
|
||||
// (base_url_template + "/models"); for anthropic-protocol providers without an
|
||||
// openai base, fall back to a minimal 1-token messages POST on
|
||||
// base_url_anthropic. Auth header is chosen from the protocol + the auth-env
|
||||
// key the probe is exercising (Authorization: Bearer for openai/bearer keys,
|
||||
// x-api-key for anthropic API keys).
|
||||
//
|
||||
// Returns an error when the provider exposes NO probeable HTTP base (an OAuth
|
||||
// arm whose base_url_* are null — the CLI dials the vendor directly). The caller
|
||||
// treats that as transient/allow (we cannot probe what has no endpoint).
|
||||
func buildBYOKLivenessRequest(ctx context.Context, provider providers.Provider, credKey, credVal string) (*http.Request, error) {
|
||||
openaiBase := strings.TrimRight(strings.TrimSpace(provider.BaseURLTemplate), "/")
|
||||
anthropicBase := strings.TrimRight(strings.TrimSpace(provider.BaseURLAnthropic), "/")
|
||||
|
||||
// Prefer the OpenAI-protocol models-list GET (cheapest, no body, no tokens).
|
||||
if openaiBase != "" {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, openaiBase+"/models", nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
applyBYOKProbeAuth(req, provider, credKey, credVal)
|
||||
return req, nil
|
||||
}
|
||||
|
||||
// Anthropic-protocol providers expose a models-list GET too (/v1/models);
|
||||
// the base already includes the version segment for the registry rows
|
||||
// (e.g. https://api.anthropic.com/v1). A GET /models is the cheapest probe.
|
||||
if anthropicBase != "" {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, anthropicBase+"/models", nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
applyBYOKProbeAuth(req, provider, credKey, credVal)
|
||||
// Anthropic requires the version header even on GET /models.
|
||||
req.Header.Set("anthropic-version", "2023-06-01")
|
||||
return req, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("provider %q exposes no probeable HTTP base (oauth/CLI-direct arm)", provider.Name)
|
||||
}
|
||||
|
||||
// applyBYOKProbeAuth sets the auth header the upstream expects for this
|
||||
// credential. Anthropic-protocol API keys use the x-api-key header; everything
|
||||
// else (openai-protocol bearer keys, anthropic bearer auth tokens) uses
|
||||
// Authorization: Bearer. The choice is keyed off the provider protocol + the
|
||||
// auth-env name being exercised, NOT a per-provider hardcode.
|
||||
func applyBYOKProbeAuth(req *http.Request, provider providers.Provider, credKey, credVal string) {
|
||||
upperKey := strings.ToUpper(strings.TrimSpace(credKey))
|
||||
// Anthropic protocol + an x-api-key-style key → x-api-key header. ANTHROPIC_API_KEY
|
||||
// and KIMI_API_KEY (kimi's claude-code integration uses x-api-key) go here.
|
||||
// ANTHROPIC_AUTH_TOKEN is a bearer token even on anthropic-protocol providers.
|
||||
if provider.Protocol == providers.ProtocolAnthropic && upperKey != "ANTHROPIC_AUTH_TOKEN" {
|
||||
req.Header.Set("x-api-key", credVal)
|
||||
return
|
||||
}
|
||||
req.Header.Set("Authorization", "Bearer "+credVal)
|
||||
}
|
||||
|
||||
// defaultBYOKLivenessProbe is the production probe: a real http.Client whose
|
||||
// per-call timeout is governed by the request context (set by
|
||||
// probeBYOKCredentialLiveness). A modest client timeout is a second belt against
|
||||
// a hung dial.
|
||||
func defaultBYOKLivenessProbe(req *http.Request) (*http.Response, error) {
|
||||
client := &http.Client{Timeout: byokProbeTimeout}
|
||||
return client.Do(req)
|
||||
}
|
||||
|
||||
// formatInvalidBYOKCredentialError builds the user-facing message for a
|
||||
// create rejected because a present BYOK credential was positively rejected by
|
||||
// the provider (401/403). Names the provider, the credential env, and the
|
||||
// observed status so the user knows exactly which secret to rotate.
|
||||
func formatInvalidBYOKCredentialError(providerName, credKey string, status int) string {
|
||||
return fmt.Sprintf(
|
||||
"the BYOK credential %q for provider %q was rejected by the provider (HTTP %d). "+
|
||||
"The credential is present but not valid (revoked, expired, or wrong scope). "+
|
||||
"Update %q under Config → Secrets with a working key for %q and retry.",
|
||||
credKey, providerName, status, credKey, providerName)
|
||||
}
|
||||
|
||||
// makeCreateTimeCredValueResolver returns a credValueForKey function for the
|
||||
// create path: it resolves a credential's VALUE by name, preferring the
|
||||
// plaintext value in the create payload's own secrets (workspace scope — no
|
||||
// decrypt needed) and falling back to decrypting the matching global_secrets
|
||||
// row. Returns (value, true) only when a non-empty value is found; (_, false)
|
||||
// otherwise (the liveness gate then skips probing that key — presence already
|
||||
// passed, and we never fail closed on an unreadable value).
|
||||
//
|
||||
// The closure decrypts ONLY the single key being probed (the derived provider's
|
||||
// auth_env), never the whole store.
|
||||
func makeCreateTimeCredValueResolver(payloadSecrets map[string]string) func(ctx context.Context, key string) (string, bool) {
|
||||
return func(ctx context.Context, key string) (string, bool) {
|
||||
if v, ok := payloadSecrets[key]; ok && strings.TrimSpace(v) != "" {
|
||||
return v, true
|
||||
}
|
||||
var enc []byte
|
||||
var ver int
|
||||
err := db.DB.QueryRowContext(ctx,
|
||||
`SELECT encrypted_value, encryption_version FROM global_secrets WHERE key = $1`,
|
||||
key).Scan(&enc, &ver)
|
||||
if err != nil {
|
||||
return "", false
|
||||
}
|
||||
dec, decErr := crypto.DecryptVersioned(enc, ver)
|
||||
if decErr != nil {
|
||||
log.Printf("byok_credential_gate: WARN — could not decrypt global secret %q for liveness probe: %v", key, decErr)
|
||||
return "", false
|
||||
}
|
||||
if strings.TrimSpace(string(dec)) == "" {
|
||||
return "", false
|
||||
}
|
||||
return string(dec), true
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,450 @@
|
||||
package handlers
|
||||
|
||||
// byok_credential_gate_test.go — tests for the fail-closed BYOK credential gate
|
||||
// at CREATE (GAP A presence + GAP B liveness). All outbound HTTP is mocked via
|
||||
// an injected byokLivenessProbe — NO real network is touched.
|
||||
//
|
||||
// Coverage (the five contract cases from the gap-closure spec):
|
||||
// 1. BYOK create, missing credential → 422 MISSING_BYOK_CREDENTIAL
|
||||
// 2. BYOK create, present-but-INVALID (401) → 422 BYOK_CREDENTIAL_INVALID
|
||||
// 3. BYOK create, transient probe failure → NOT blocked (loud log, allowed)
|
||||
// 4. BYOK create, valid credential (200) → proceeds (201)
|
||||
// 5. platform-managed create → gate is a no-op (unaffected)
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto"
|
||||
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/providers"
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// stubProbe returns a byokLivenessProbe that always responds with the given
|
||||
// status (and no error). The response body is empty; the gate only inspects the
|
||||
// status code.
|
||||
func stubProbe(status int) byokLivenessProbe {
|
||||
return func(req *http.Request) (*http.Response, error) {
|
||||
return &http.Response{
|
||||
StatusCode: status,
|
||||
Body: http.NoBody,
|
||||
Header: make(http.Header),
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
// transientProbe returns a probe that fails with a transport error (the
|
||||
// network-blip / timeout class).
|
||||
func transientProbe(err error) byokLivenessProbe {
|
||||
return func(req *http.Request) (*http.Response, error) {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// expectGlobalSecretsPresenceQuery mocks the create-time global_secrets
|
||||
// presence read (key names only). Pass the configured global key names.
|
||||
func expectGlobalSecretsPresenceQuery(mock sqlmock.Sqlmock, keys ...string) {
|
||||
rows := sqlmock.NewRows([]string{"key"})
|
||||
for _, k := range keys {
|
||||
rows.AddRow(k)
|
||||
}
|
||||
mock.ExpectQuery(`SELECT key FROM global_secrets WHERE octet_length\(encrypted_value\) > 0 LIMIT \$1`).
|
||||
WillReturnRows(rows)
|
||||
}
|
||||
|
||||
// expectGlobalSecretValueQuery mocks the liveness value-resolver read of a
|
||||
// single global secret's encrypted value (used when the probed credential lives
|
||||
// in global_secrets rather than the create payload).
|
||||
func expectGlobalSecretValueQuery(mock sqlmock.Sqlmock, key, value string) {
|
||||
enc, _ := crypto.Encrypt([]byte(value))
|
||||
ver := crypto.CurrentEncryptionVersion()
|
||||
mock.ExpectQuery(`SELECT encrypted_value, encryption_version FROM global_secrets WHERE key = \$1`).
|
||||
WithArgs(key).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}).AddRow(enc, ver))
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Unit tests for the gate logic (no HTTP handler) — fast, deterministic.
|
||||
// ============================================================================
|
||||
|
||||
// TestCreateBYOKGate_MissingCredential_RejectsPresence — GAP A: a byok-derived
|
||||
// (runtime, model) with NO available credential key is rejected with
|
||||
// MISSING_BYOK_CREDENTIAL before any liveness probe runs.
|
||||
func TestCreateBYOKGate_MissingCredential_RejectsPresence(t *testing.T) {
|
||||
h := &WorkspaceHandler{}
|
||||
probeCalled := false
|
||||
probe := func(req *http.Request) (*http.Response, error) {
|
||||
probeCalled = true
|
||||
return stubProbe(200)(req)
|
||||
}
|
||||
// claude-code + kimi-for-coding derives to kimi-coding (BYOK). No cred keys.
|
||||
res := h.evaluateCreateBYOKCredentialGate(
|
||||
context.Background(),
|
||||
"claude-code", "kimi-for-coding",
|
||||
map[string]struct{}{}, // no available credentials
|
||||
func(ctx context.Context, key string) (string, bool) { return "", false },
|
||||
probe,
|
||||
)
|
||||
if res.OK {
|
||||
t.Fatalf("expected gate to REJECT a credential-less byok create, got OK")
|
||||
}
|
||||
if res.Code != "MISSING_BYOK_CREDENTIAL" {
|
||||
t.Errorf("expected code MISSING_BYOK_CREDENTIAL, got %q", res.Code)
|
||||
}
|
||||
if res.HTTP != http.StatusUnprocessableEntity {
|
||||
t.Errorf("expected 422, got %d", res.HTTP)
|
||||
}
|
||||
if probeCalled {
|
||||
t.Errorf("liveness probe must NOT run when presence already fails")
|
||||
}
|
||||
}
|
||||
|
||||
// TestCreateBYOKGate_InvalidCredential_FailsClosed — GAP B: a PRESENT credential
|
||||
// that the provider rejects (401) → BYOK_CREDENTIAL_INVALID, fail-closed.
|
||||
func TestCreateBYOKGate_InvalidCredential_FailsClosed(t *testing.T) {
|
||||
h := &WorkspaceHandler{}
|
||||
keys := map[string]struct{}{"KIMI_API_KEY": {}}
|
||||
res := h.evaluateCreateBYOKCredentialGate(
|
||||
context.Background(),
|
||||
"claude-code", "kimi-for-coding",
|
||||
keys,
|
||||
func(ctx context.Context, key string) (string, bool) { return "sk-dead-token", true },
|
||||
stubProbe(http.StatusUnauthorized),
|
||||
)
|
||||
if res.OK {
|
||||
t.Fatalf("expected gate to REJECT a present-but-dead (401) credential, got OK")
|
||||
}
|
||||
if res.Code != "BYOK_CREDENTIAL_INVALID" {
|
||||
t.Errorf("expected code BYOK_CREDENTIAL_INVALID, got %q (msg=%s)", res.Code, res.Message)
|
||||
}
|
||||
if res.HTTP != http.StatusUnprocessableEntity {
|
||||
t.Errorf("expected 422, got %d", res.HTTP)
|
||||
}
|
||||
}
|
||||
|
||||
// TestCreateBYOKGate_Forbidden_FailsClosed — 403 is also a hard auth failure.
|
||||
func TestCreateBYOKGate_Forbidden_FailsClosed(t *testing.T) {
|
||||
h := &WorkspaceHandler{}
|
||||
keys := map[string]struct{}{"KIMI_API_KEY": {}}
|
||||
res := h.evaluateCreateBYOKCredentialGate(
|
||||
context.Background(),
|
||||
"claude-code", "kimi-for-coding", keys,
|
||||
func(ctx context.Context, key string) (string, bool) { return "sk-forbidden", true },
|
||||
stubProbe(http.StatusForbidden),
|
||||
)
|
||||
if res.OK || res.Code != "BYOK_CREDENTIAL_INVALID" {
|
||||
t.Fatalf("expected 403 to fail closed as BYOK_CREDENTIAL_INVALID, got OK=%v code=%q", res.OK, res.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// TestCreateBYOKGate_Transient_NotBlocked — GAP B crux: a transient failure
|
||||
// (network error, 5xx, 429, timeout) must NOT block the create. The credential
|
||||
// validity is UNKNOWN; a flaky upstream cannot wedge every create.
|
||||
func TestCreateBYOKGate_Transient_NotBlocked(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
probe byokLivenessProbe
|
||||
}{
|
||||
{"network_error", transientProbe(errors.New("dial tcp: connection refused"))},
|
||||
{"timeout", transientProbe(context.DeadlineExceeded)},
|
||||
{"500", stubProbe(http.StatusInternalServerError)},
|
||||
{"502", stubProbe(http.StatusBadGateway)},
|
||||
{"503", stubProbe(http.StatusServiceUnavailable)},
|
||||
{"429_rate_limited", stubProbe(http.StatusTooManyRequests)},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
h := &WorkspaceHandler{}
|
||||
keys := map[string]struct{}{"KIMI_API_KEY": {}}
|
||||
res := h.evaluateCreateBYOKCredentialGate(
|
||||
context.Background(),
|
||||
"claude-code", "kimi-for-coding", keys,
|
||||
func(ctx context.Context, key string) (string, bool) { return "sk-maybe-fine", true },
|
||||
tc.probe,
|
||||
)
|
||||
if !res.OK {
|
||||
t.Fatalf("transient probe (%s) must NOT block create, got rejected code=%q", tc.name, res.Code)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestCreateBYOKGate_ValidCredential_Proceeds — GAP B happy path: a present,
|
||||
// valid (200) credential proceeds.
|
||||
func TestCreateBYOKGate_ValidCredential_Proceeds(t *testing.T) {
|
||||
h := &WorkspaceHandler{}
|
||||
keys := map[string]struct{}{"KIMI_API_KEY": {}}
|
||||
res := h.evaluateCreateBYOKCredentialGate(
|
||||
context.Background(),
|
||||
"claude-code", "kimi-for-coding", keys,
|
||||
func(ctx context.Context, key string) (string, bool) { return "sk-live", true },
|
||||
stubProbe(http.StatusOK),
|
||||
)
|
||||
if !res.OK {
|
||||
t.Fatalf("valid (200) credential must proceed, got rejected code=%q msg=%s", res.Code, res.Message)
|
||||
}
|
||||
}
|
||||
|
||||
// TestCreateBYOKGate_PlatformManaged_Unaffected — GAP scope: a platform-derived
|
||||
// model is NOT byok, so the gate is a no-op even with no credential and even
|
||||
// with a probe that would 401. The probe must never run.
|
||||
func TestCreateBYOKGate_PlatformManaged_Unaffected(t *testing.T) {
|
||||
h := &WorkspaceHandler{}
|
||||
probeCalled := false
|
||||
probe := func(req *http.Request) (*http.Response, error) {
|
||||
probeCalled = true
|
||||
return stubProbe(http.StatusUnauthorized)(req)
|
||||
}
|
||||
// claude-code + a platform-namespaced model derives to the closed `platform`
|
||||
// provider → platform_managed → gate no-op.
|
||||
res := h.evaluateCreateBYOKCredentialGate(
|
||||
context.Background(),
|
||||
"claude-code", "anthropic/claude-opus-4-7",
|
||||
map[string]struct{}{}, // no creds — irrelevant for platform
|
||||
func(ctx context.Context, key string) (string, bool) { return "", false },
|
||||
probe,
|
||||
)
|
||||
if !res.OK {
|
||||
t.Fatalf("platform-managed create must be unaffected by the BYOK gate, got code=%q", res.Code)
|
||||
}
|
||||
if probeCalled {
|
||||
t.Errorf("liveness probe must NOT run for a platform-managed model")
|
||||
}
|
||||
}
|
||||
|
||||
// TestCreateBYOKGate_UnderivableModel_DefaultsClosedToPlatform — a model that
|
||||
// does not derive (the model-registry gate owns rejecting it) is treated as
|
||||
// non-byok (default-closed to platform), NOT failed as a credential-less byok.
|
||||
func TestCreateBYOKGate_UnderivableModel_DefaultsClosedToPlatform(t *testing.T) {
|
||||
h := &WorkspaceHandler{}
|
||||
res := h.evaluateCreateBYOKCredentialGate(
|
||||
context.Background(),
|
||||
"claude-code", "totally-made-up-xyz",
|
||||
map[string]struct{}{},
|
||||
func(ctx context.Context, key string) (string, bool) { return "", false },
|
||||
stubProbe(http.StatusOK),
|
||||
)
|
||||
if !res.OK {
|
||||
t.Fatalf("an underivable model must default-closed to platform (non-byok), not fail as MISSING_BYOK_CREDENTIAL; got code=%q", res.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// TestCreateBYOKGate_NonAuth4xx_TreatedOK — a 400/404 means auth succeeded but
|
||||
// the endpoint disliked the request shape (e.g. a provider with no models-list
|
||||
// route). That is NOT a dead credential → proceed.
|
||||
func TestCreateBYOKGate_NonAuth4xx_TreatedOK(t *testing.T) {
|
||||
h := &WorkspaceHandler{}
|
||||
keys := map[string]struct{}{"KIMI_API_KEY": {}}
|
||||
for _, status := range []int{http.StatusBadRequest, http.StatusNotFound} {
|
||||
res := h.evaluateCreateBYOKCredentialGate(
|
||||
context.Background(),
|
||||
"claude-code", "kimi-for-coding", keys,
|
||||
func(ctx context.Context, key string) (string, bool) { return "sk-live", true },
|
||||
stubProbe(status),
|
||||
)
|
||||
if !res.OK {
|
||||
t.Errorf("non-auth 4xx (%d) means auth succeeded; must proceed, got code=%q", status, res.Code)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestProbeAuthHeader_AnthropicVsBearer pins the provider-generic auth-header
|
||||
// selection: anthropic-protocol x-api-key keys use x-api-key; bearer keys use
|
||||
// Authorization: Bearer. No per-provider URL hardcoding.
|
||||
func TestProbeAuthHeader_AnthropicVsBearer(t *testing.T) {
|
||||
anthropicProvider := providers.Provider{
|
||||
Name: "kimi-coding",
|
||||
Protocol: providers.ProtocolAnthropic,
|
||||
BaseURLTemplate: "https://api.kimi.com/coding/v1",
|
||||
AuthEnv: []string{"KIMI_API_KEY"},
|
||||
}
|
||||
req, err := buildBYOKLivenessRequest(context.Background(), anthropicProvider, "KIMI_API_KEY", "sk-kimi")
|
||||
if err != nil {
|
||||
t.Fatalf("buildBYOKLivenessRequest: %v", err)
|
||||
}
|
||||
if got := req.Header.Get("x-api-key"); got != "sk-kimi" {
|
||||
t.Errorf("anthropic-protocol KIMI_API_KEY should set x-api-key, got %q", got)
|
||||
}
|
||||
if req.Header.Get("Authorization") != "" {
|
||||
t.Errorf("anthropic x-api-key key must not also set Authorization, got %q", req.Header.Get("Authorization"))
|
||||
}
|
||||
if req.Method != http.MethodGet || req.URL.String() != "https://api.kimi.com/coding/v1/models" {
|
||||
t.Errorf("expected GET .../models, got %s %s", req.Method, req.URL.String())
|
||||
}
|
||||
|
||||
openaiProvider := providers.Provider{
|
||||
Name: "openai-api",
|
||||
Protocol: providers.ProtocolOpenAI,
|
||||
BaseURLTemplate: "https://api.openai.com/v1",
|
||||
AuthEnv: []string{"OPENAI_API_KEY"},
|
||||
}
|
||||
req2, err := buildBYOKLivenessRequest(context.Background(), openaiProvider, "OPENAI_API_KEY", "sk-openai")
|
||||
if err != nil {
|
||||
t.Fatalf("buildBYOKLivenessRequest openai: %v", err)
|
||||
}
|
||||
if got := req2.Header.Get("Authorization"); got != "Bearer sk-openai" {
|
||||
t.Errorf("openai-protocol OPENAI_API_KEY should set Authorization: Bearer, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestProbeRequest_NoBaseURL_Errors — an OAuth arm with null base URLs has no
|
||||
// probeable HTTP surface; buildBYOKLivenessRequest errors and the gate treats it
|
||||
// as transient/allow.
|
||||
func TestProbeRequest_NoBaseURL_Errors(t *testing.T) {
|
||||
oauthArm := providers.Provider{
|
||||
Name: "anthropic-oauth",
|
||||
Protocol: providers.ProtocolAnthropic,
|
||||
AuthEnv: []string{"CLAUDE_CODE_OAUTH_TOKEN"},
|
||||
// base_url_template + base_url_anthropic both empty (null in yaml).
|
||||
}
|
||||
_, err := buildBYOKLivenessRequest(context.Background(), oauthArm, "CLAUDE_CODE_OAUTH_TOKEN", "tok")
|
||||
if err == nil {
|
||||
t.Fatalf("expected an error for a provider with no probeable HTTP base")
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// End-to-end through the Create HTTP handler — proves the wiring + 4xx body.
|
||||
// ============================================================================
|
||||
|
||||
// TestWorkspaceCreate_BYOK_MissingCredential_Rejected422 — GAP A end-to-end:
|
||||
// a credential-less byok create is rejected synchronously (no DB insert).
|
||||
func TestWorkspaceCreate_BYOK_MissingCredential_Rejected422(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
|
||||
// Probe should never run (presence fails first); stub it to 200 anyway.
|
||||
handler.byokLivenessProbeOverride = stubProbe(http.StatusOK)
|
||||
|
||||
// Only the global_secrets presence query is expected (returns NO keys) —
|
||||
// the 422 fires before BeginTx, so any INSERT would fail ExpectationsWereMet.
|
||||
expectGlobalSecretsPresenceQuery(mock /* no keys */)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
body := `{"name":"Kimi BYOK","runtime":"claude-code","model":"kimi-for-coding"}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
handler.Create(c)
|
||||
|
||||
if w.Code != http.StatusUnprocessableEntity {
|
||||
t.Fatalf("expected 422 MISSING_BYOK_CREDENTIAL, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if !bytes.Contains(w.Body.Bytes(), []byte(`"code":"MISSING_BYOK_CREDENTIAL"`)) {
|
||||
t.Errorf("expected code MISSING_BYOK_CREDENTIAL in body, got %s", w.Body.String())
|
||||
}
|
||||
if !bytes.Contains(w.Body.Bytes(), []byte(`"provider":"kimi-coding"`)) {
|
||||
t.Errorf("expected derived provider kimi-coding echoed in body, got %s", w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unexpected DB activity on synchronous reject: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestWorkspaceCreate_BYOK_InvalidCredential_Rejected422 — GAP B end-to-end:
|
||||
// a present-but-dead (401) credential supplied in the payload is rejected
|
||||
// synchronously with BYOK_CREDENTIAL_INVALID (no DB insert).
|
||||
func TestWorkspaceCreate_BYOK_InvalidCredential_Rejected422(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
|
||||
handler.byokLivenessProbeOverride = stubProbe(http.StatusUnauthorized)
|
||||
|
||||
// Presence is satisfied by the PAYLOAD secret (no global keys needed); the
|
||||
// value resolver reads the payload directly (no global_secrets value query).
|
||||
expectGlobalSecretsPresenceQuery(mock /* no global keys */)
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
body := `{"name":"Kimi Dead","runtime":"claude-code","model":"kimi-for-coding","secrets":{"KIMI_API_KEY":"sk-revoked"}}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
handler.Create(c)
|
||||
|
||||
if w.Code != http.StatusUnprocessableEntity {
|
||||
t.Fatalf("expected 422 BYOK_CREDENTIAL_INVALID, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
if !bytes.Contains(w.Body.Bytes(), []byte(`"code":"BYOK_CREDENTIAL_INVALID"`)) {
|
||||
t.Errorf("expected code BYOK_CREDENTIAL_INVALID in body, got %s", w.Body.String())
|
||||
}
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unexpected DB activity on synchronous reject: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestWorkspaceCreate_BYOK_TransientProbe_Proceeds — GAP B end-to-end: a
|
||||
// transient probe failure does NOT block; the create proceeds to 201. The
|
||||
// credential is supplied via a GLOBAL secret (so there is no in-tx
|
||||
// workspace_secrets write and the create flow matches the registry-proceed
|
||||
// tests). Presence passes via the global key; the value resolver decrypts the
|
||||
// global secret value to feed the probe, which then errors transiently.
|
||||
func TestWorkspaceCreate_BYOK_TransientProbe_Proceeds(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
|
||||
waitForHandlerAsyncBeforeDBCleanup(t, handler)
|
||||
handler.byokLivenessProbeOverride = transientProbe(errors.New("dial tcp: i/o timeout"))
|
||||
|
||||
// Presence: global_secrets carries KIMI_API_KEY.
|
||||
expectGlobalSecretsPresenceQuery(mock, "KIMI_API_KEY")
|
||||
// Liveness value resolver: decrypt the global KIMI_API_KEY value.
|
||||
expectGlobalSecretValueQuery(mock, "KIMI_API_KEY", "sk-maybe-fine")
|
||||
// Create proceeds (transient = allow). No payload secrets → no in-tx
|
||||
// workspace_secrets write; the create flow matches the registry-proceed path.
|
||||
mock.ExpectBegin()
|
||||
mock.ExpectExec("INSERT INTO workspaces").WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectCommit()
|
||||
mock.ExpectExec("INSERT INTO canvas_layouts").WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO structure_events").WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO workspace_auth_tokens").WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
body := `{"name":"Kimi Transient","runtime":"claude-code","model":"kimi-for-coding"}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
handler.Create(c)
|
||||
|
||||
if w.Code != http.StatusCreated {
|
||||
t.Fatalf("transient probe must NOT block; expected 201, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
// TestWorkspaceCreate_BYOK_ValidGlobalCredential_Proceeds — GAP B happy path
|
||||
// e2e: a valid (200) global credential proceeds to 201.
|
||||
func TestWorkspaceCreate_BYOK_ValidGlobalCredential_Proceeds(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
|
||||
waitForHandlerAsyncBeforeDBCleanup(t, handler)
|
||||
handler.byokLivenessProbeOverride = stubProbe(http.StatusOK)
|
||||
|
||||
expectGlobalSecretsPresenceQuery(mock, "KIMI_API_KEY")
|
||||
expectGlobalSecretValueQuery(mock, "KIMI_API_KEY", "sk-live")
|
||||
mock.ExpectBegin()
|
||||
mock.ExpectExec("INSERT INTO workspaces").WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectCommit()
|
||||
mock.ExpectExec("INSERT INTO canvas_layouts").WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO structure_events").WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectExec("INSERT INTO workspace_auth_tokens").WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
body := `{"name":"Kimi Live","runtime":"claude-code","model":"kimi-for-coding"}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
handler.Create(c)
|
||||
|
||||
if w.Code != http.StatusCreated {
|
||||
t.Fatalf("valid credential must proceed; expected 201, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
}
|
||||
@@ -48,7 +48,8 @@ func TestWorkspaceCreate_WithParentID(t *testing.T) {
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
body := `{"name":"Child Agent","model":"anthropic:claude-opus-4-7","parent_id":"parent-ws-123"}`
|
||||
// Platform-managed model id → BYOK gate no-op (this test asserts parent_id).
|
||||
body := `{"name":"Child Agent","model":"anthropic/claude-opus-4-7","parent_id":"parent-ws-123"}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
@@ -86,7 +87,9 @@ func TestWorkspaceCreate_ExplicitClaudeCodeRuntime(t *testing.T) {
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
body := `{"name":"CC Agent","tier":2,"runtime":"claude-code","model":"sonnet","canvas":{"x":10,"y":20}}`
|
||||
// Platform-managed model id → BYOK gate no-op (this test asserts the
|
||||
// claude-code runtime label, not billing).
|
||||
body := `{"name":"CC Agent","tier":2,"runtime":"claude-code","model":"anthropic/claude-sonnet-4-6","canvas":{"x":10,"y":20}}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
@@ -309,7 +312,9 @@ func TestWorkspaceCreate_MaxConcurrentTasksOverride(t *testing.T) {
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
body := `{"name":"Leader Agent","runtime":"claude-code","model":"sonnet","max_concurrent_tasks":3}`
|
||||
// Platform-managed model id → BYOK gate no-op (this test asserts
|
||||
// max_concurrent_tasks).
|
||||
body := `{"name":"Leader Agent","runtime":"claude-code","model":"anthropic/claude-sonnet-4-6","max_concurrent_tasks":3}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
|
||||
@@ -398,7 +398,10 @@ func TestWorkspaceCreate(t *testing.T) {
|
||||
// the bare-defaults path (no template, no runtime → claude-code), so
|
||||
// the body must declare an explicit model. Using a claude-code-compatible
|
||||
// id; the test doesn't exercise model semantics beyond presence.
|
||||
body := `{"name":"Test Agent","model":"anthropic:claude-opus-4-7","canvas":{"x":100,"y":200}}`
|
||||
// Platform-managed model id (derives to the closed `platform` provider) so
|
||||
// the fail-closed BYOK create gate is a no-op — this test exercises the
|
||||
// generic create flow, not billing/credential semantics.
|
||||
body := `{"name":"Test Agent","model":"anthropic/claude-opus-4-7","canvas":{"x":100,"y":200}}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
@@ -466,7 +469,9 @@ func TestWorkspaceCreate_ReturnsAuthToken_201(t *testing.T) {
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
body := `{"name":"Token Holder","model":"anthropic:claude-opus-4-7"}`
|
||||
// Platform-managed model id → BYOK gate no-op (this test asserts the auth
|
||||
// token, not billing).
|
||||
body := `{"name":"Token Holder","model":"anthropic/claude-opus-4-7"}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
|
||||
@@ -89,6 +89,21 @@ type WorkspaceHandler struct {
|
||||
// for async DB users (restart, provision) before asserting results.
|
||||
// Matches the pattern from main commit 1c3b4ff3.
|
||||
asyncWG sync.WaitGroup
|
||||
// byokLivenessProbeOverride lets tests inject a mock outbound-HTTP probe
|
||||
// for the create-time BYOK credential LIVENESS gate (GAP B) so no real
|
||||
// network is touched in tests. nil in production → effectiveBYOKLivenessProbe
|
||||
// returns defaultBYOKLivenessProbe (a real http.Client). Set it to a stub in
|
||||
// tests, or to a sentinel that disables probing.
|
||||
byokLivenessProbeOverride byokLivenessProbe
|
||||
}
|
||||
|
||||
// effectiveBYOKLivenessProbe returns the probe the create-time liveness gate
|
||||
// should use: the test override when set, else the production http.Client probe.
|
||||
func (h *WorkspaceHandler) effectiveBYOKLivenessProbe() byokLivenessProbe {
|
||||
if h.byokLivenessProbeOverride != nil {
|
||||
return h.byokLivenessProbeOverride
|
||||
}
|
||||
return defaultBYOKLivenessProbe
|
||||
}
|
||||
|
||||
// seedMemoryPluginAPI is the slice of the v2 memory plugin client that
|
||||
@@ -550,6 +565,46 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// Fail-closed BYOK credential gate (byok_credential_gate.go) — synchronous,
|
||||
// BEFORE any DB write so a credential-less or dead-credential BYOK create is
|
||||
// a 4xx, not a 201-then-late-provision-failure. Placed AFTER the cheap field
|
||||
// validations (compute / access / dir / URL) so a malformed request is
|
||||
// rejected on its own merits before the heavier credential check (one
|
||||
// global_secrets read + a network probe), and immediately BEFORE BeginTx so
|
||||
// the rejection path never touches the DB transaction:
|
||||
//
|
||||
// GAP A — PRESENCE: resolved mode == byok but no usable LLM credential at
|
||||
// any scope (payload secrets + configured global_secrets) → 422
|
||||
// MISSING_BYOK_CREDENTIAL. Same presence rule the provision-time check
|
||||
// keys off (anyBYOKCredentialKeyPresent), so create and provision agree.
|
||||
// GAP B — LIVENESS: a credential IS present — probe it actually works. A
|
||||
// positively-rejected credential (401/403) → 422 BYOK_CREDENTIAL_INVALID;
|
||||
// a transient/network/5xx/timeout never blocks (see the probe's doc).
|
||||
//
|
||||
// Skipped for external workspaces (no model/provider contract — the URL is
|
||||
// the contract, mirroring the MODEL_REQUIRED / registry gates above).
|
||||
if !isExternal {
|
||||
availableCredKeys := gatherCreateTimeLLMCredKeys(ctx, payload.Secrets)
|
||||
gate := h.evaluateCreateBYOKCredentialGate(
|
||||
ctx,
|
||||
payload.Runtime,
|
||||
payload.Model,
|
||||
availableCredKeys,
|
||||
makeCreateTimeCredValueResolver(payload.Secrets),
|
||||
h.effectiveBYOKLivenessProbe(),
|
||||
)
|
||||
if !gate.OK {
|
||||
c.JSON(gate.HTTP, gin.H{
|
||||
"error": gate.Message,
|
||||
"code": gate.Code,
|
||||
"runtime": payload.Runtime,
|
||||
"model": payload.Model,
|
||||
"provider": gate.Provider,
|
||||
})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
tx, txErr := db.DB.BeginTx(ctx, nil)
|
||||
if txErr != nil {
|
||||
log.Printf("Create workspace: begin tx error: %v", txErr)
|
||||
|
||||
@@ -173,7 +173,8 @@ func TestWorkspaceBudget_Create_WithLimit(t *testing.T) {
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
body := `{"name":"Budgeted Agent","model":"anthropic:claude-opus-4-7","budget_limit":1000}`
|
||||
// Platform-managed model id → BYOK gate no-op (this test asserts budget_limit).
|
||||
body := `{"name":"Budgeted Agent","model":"anthropic/claude-opus-4-7","budget_limit":1000}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
handler.Create(c)
|
||||
|
||||
@@ -1142,13 +1142,21 @@ func stripPlatformManagedLLMBypassEnv(envVars map[string]string) {
|
||||
// global or workspace scope). Used by the byok fail-closed branch: a byok
|
||||
// workspace with no LLM credential at ANY scope must be aborted with
|
||||
// MISSING_BYOK_CREDENTIAL rather than started credential-less.
|
||||
//
|
||||
// This is the PROVISION-time presence rule. The CREATE-time presence gate
|
||||
// (byok_credential_gate.go) reuses the SAME underlying predicate
|
||||
// (anyBYOKCredentialKeyPresent) over a presence-only key set so the two
|
||||
// fail-closed checks can never diverge: the rule "any non-empty
|
||||
// platform-managed LLM bypass key is a usable BYOK credential" lives in exactly
|
||||
// one place.
|
||||
func hasAnyPlatformManagedLLMKey(envVars map[string]string) bool {
|
||||
for key := range platformManagedDirectLLMBypassKeys {
|
||||
if strings.TrimSpace(envVars[key]) != "" {
|
||||
return true
|
||||
present := make(map[string]struct{}, len(envVars))
|
||||
for key, val := range envVars {
|
||||
if strings.TrimSpace(val) != "" {
|
||||
present[key] = struct{}{}
|
||||
}
|
||||
}
|
||||
return false
|
||||
return anyBYOKCredentialKeyPresent(present)
|
||||
}
|
||||
|
||||
// stripNonMatchingGlobalOriginLLMCreds is the byok-branch provider-matched
|
||||
|
||||
@@ -350,7 +350,9 @@ func TestWorkspaceCreate_DBInsertError(t *testing.T) {
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
body := `{"name":"Failing Agent","model":"anthropic:claude-opus-4-7"}`
|
||||
// Platform-managed model id → BYOK gate no-op (this test asserts the DB
|
||||
// insert-error rollback, not billing).
|
||||
body := `{"name":"Failing Agent","model":"anthropic/claude-opus-4-7"}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
@@ -396,7 +398,9 @@ func TestWorkspaceCreate_DefaultsApplied(t *testing.T) {
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
body := `{"name":"Default Agent","model":"anthropic:claude-opus-4-7"}`
|
||||
// Platform-managed model id → BYOK gate no-op (this test asserts the
|
||||
// status/defaults, not billing).
|
||||
body := `{"name":"Default Agent","model":"anthropic/claude-opus-4-7"}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
@@ -519,6 +523,13 @@ func TestWorkspaceCreate_SecretPersistFails_RollsBack(t *testing.T) {
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
|
||||
|
||||
// The create-time BYOK gate runs first: anthropic:claude-opus-4-7 derives to
|
||||
// anthropic-api (BYOK). Presence is satisfied by the payload OPENAI_API_KEY
|
||||
// (a recognized LLM bypass key) — the liveness probe is skipped because
|
||||
// OPENAI_API_KEY is not in anthropic-api's auth_env, so no value is read for
|
||||
// the derived provider. Only the global presence query precedes BeginTx.
|
||||
expectGlobalSecretsPresenceQuery(mock /* no global keys */)
|
||||
|
||||
mock.ExpectBegin()
|
||||
mock.ExpectExec("INSERT INTO workspaces").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
@@ -2118,6 +2129,12 @@ func TestWorkspaceCreate_718_P4_RegisteredModelProceeds(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
|
||||
// claude-opus-4-7 derives to anthropic-api (BYOK). The fail-closed BYOK
|
||||
// create gate now requires a present + live credential — supply one via a
|
||||
// global secret and stub the liveness probe to 200 (no real network).
|
||||
handler.byokLivenessProbeOverride = stubProbe(http.StatusOK)
|
||||
expectGlobalSecretsPresenceQuery(mock, "ANTHROPIC_API_KEY")
|
||||
expectGlobalSecretValueQuery(mock, "ANTHROPIC_API_KEY", "sk-ant-live")
|
||||
|
||||
mock.ExpectBegin()
|
||||
mock.ExpectExec("INSERT INTO workspaces").
|
||||
@@ -2156,6 +2173,11 @@ func TestWorkspaceCreate_718_P4_LegacyColonVocabAccepted(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
|
||||
// anthropic:claude-opus-4-7 derives to anthropic-api (BYOK) — satisfy the
|
||||
// fail-closed BYOK create gate with a present + live global credential.
|
||||
handler.byokLivenessProbeOverride = stubProbe(http.StatusOK)
|
||||
expectGlobalSecretsPresenceQuery(mock, "ANTHROPIC_API_KEY")
|
||||
expectGlobalSecretValueQuery(mock, "ANTHROPIC_API_KEY", "sk-ant-live")
|
||||
|
||||
mock.ExpectBegin()
|
||||
mock.ExpectExec("INSERT INTO workspaces").
|
||||
@@ -2220,6 +2242,13 @@ func TestWorkspaceCreate_188_ExplicitRuntimeNoTemplate_OK(t *testing.T) {
|
||||
setupTestRedis(t)
|
||||
broadcaster := newTestBroadcaster()
|
||||
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
|
||||
// codex gpt-5.5 derives to openai-subscription (BYOK, OAuth arm). The OAuth
|
||||
// arm has null base URLs → the liveness probe is unprobeable → transient/
|
||||
// allow; presence still requires a credential. Supply CODEX_AUTH_JSON via a
|
||||
// global secret (presence passes; the probe is skipped as unprobeable).
|
||||
handler.byokLivenessProbeOverride = stubProbe(http.StatusOK)
|
||||
expectGlobalSecretsPresenceQuery(mock, "CODEX_AUTH_JSON")
|
||||
expectGlobalSecretValueQuery(mock, "CODEX_AUTH_JSON", `{"tokens":"x"}`)
|
||||
|
||||
mock.ExpectBegin()
|
||||
mock.ExpectExec("INSERT INTO workspaces").
|
||||
|
||||
Reference in New Issue
Block a user