From 1a78459e39952e08a3580806aba2d25b33bf4e32 Mon Sep 17 00:00:00 2001 From: core-devops Date: Fri, 5 Jun 2026 16:47:59 -0700 Subject: [PATCH] =?UTF-8?q?feat(billing):=20fail-closed=20BYOK=20credentia?= =?UTF-8?q?l=20gate=20at=20CREATE=20=E2=80=94=20presence=20+=20liveness?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Close two gaps in the fail-closed BYOK billing model, both in the credential-validation area of the workspace CREATE path. One coherent change. GAP A — PRESENCE gate at CREATE (synchronous, fail-closed) The provision-time MISSING_BYOK_CREDENTIAL check (workspace_provision_shared.go) already aborts a credential-less BYOK provision, but only AFTER a 201 + a provision goroutine — a credential-less BYOK create looks successful, then dies late. Move the SAME presence rule to the synchronous create path: if the DERIVED provider is non-platform (byok) and no usable LLM credential is present at any scope (payload secrets + configured global_secrets), reject 422 MISSING_BYOK_CREDENTIAL naming the provider. The provision-time check stays as defense-in-depth. The presence predicate is factored into ONE shared helper (anyBYOKCredentialKeyPresent); hasAnyPlatformManagedLLMKey now delegates to it so create and provision can never diverge on "what is a usable BYOK credential". The create path derives the provider DIRECTLY from (runtime, model) via the registry SSOT (not via ResolveLLMBillingModeDerived, which short-circuits a "" workspaceID to the platform default pre-row) and keys byok-vs-platform off IsPlatform(derived). A derive miss defaults closed to platform (never fails a credential-less create as byok). GAP B — credential LIVENESS preflight (validity, not just presence) Presence != validity: a present-but-revoked token would 201 + provision and wedge at first call. Add a time-bounded (6s), authenticated probe that derives base_url + auth header from the provider's providers.yaml row (cheapest models-list GET) and classifies the result. THE CRUX (documented in code): - 401/403 -> DEAD credential -> FAIL CLOSED (422 BYOK_CREDENTIAL_INVALID) - network/timeout/5xx/429 -> TRANSIENT -> ALLOW (loud log; a flaky upstream must never block every create; 429 means the key authenticated) - 2xx / non-auth 4xx -> OK (auth succeeded) The probe is provider-generic (no per-provider URL hardcoding beyond the SSOT) and injected (byokLivenessProbe) so tests mock all outbound HTTP — no real network in tests. The probe URL is registry-derived only; the credential value goes solely into the auth header (no SSRF surface, no secret value logged). Both gates run immediately before BeginTx (after the cheap field validations) and only for BYOK; platform_managed and disabled are untouched. External workspaces are exempt (URL is the contract, not the model). Tests (byok_credential_gate_test.go): missing-cred -> 422; present-but-invalid (mock 401/403) -> BYOK_CREDENTIAL_INVALID; transient (network/timeout/5xx/429) -> not blocked; valid (200) -> proceeds; platform-managed -> gate no-op; underivable -> default-closed; auth-header selection; no-base-URL -> allow; plus e2e through the Create handler. Existing generic create tests that incidentally used BYOK model ids switched to platform model ids (billing not their concern); the registry/colon-vocab/codex tests gained a credential + probe stub. go build/vet/test ./internal/handlers/... green (integration-tagged tests require INTEGRATION_DB_URL, pre-existing). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../internal/handlers/byok_credential_gate.go | 514 ++++++++++++++++++ .../handlers/byok_credential_gate_test.go | 450 +++++++++++++++ .../handlers/handlers_additional_test.go | 11 +- .../internal/handlers/handlers_test.go | 9 +- .../internal/handlers/workspace.go | 55 ++ .../handlers/workspace_budget_test.go | 3 +- .../internal/handlers/workspace_provision.go | 16 +- .../internal/handlers/workspace_test.go | 33 +- 8 files changed, 1079 insertions(+), 12 deletions(-) create mode 100644 workspace-server/internal/handlers/byok_credential_gate.go create mode 100644 workspace-server/internal/handlers/byok_credential_gate_test.go diff --git a/workspace-server/internal/handlers/byok_credential_gate.go b/workspace-server/internal/handlers/byok_credential_gate.go new file mode 100644 index 00000000..70b04828 --- /dev/null +++ b/workspace-server/internal/handlers/byok_credential_gate.go @@ -0,0 +1,514 @@ +package handlers + +// byok_credential_gate.go — fail-closed BYOK credential checks at the CREATE +// boundary (molecule-core, BYOK billing-model gap-closure). +// +// Two related gaps in the fail-closed BYOK billing model are closed here, both +// in the same credential-validation area: +// +// GAP A — PRESENCE gate at CREATE. The provision-time fail-closed check +// (workspace_provision_shared.go) already aborts a BYOK provision with no +// usable credential, but that fires AFTER a 201 and a goroutine launch — a +// credential-less BYOK create looks successful, then dies late in provision. +// This gate moves the SAME presence rule to the synchronous create path so a +// credential-less BYOK create is a 4xx, not a 201-then-late-failure. It +// reuses anyBYOKCredentialKeyPresent — the exact predicate the provision +// check keys off (via hasAnyPlatformManagedLLMKey) — so the two checks can +// never diverge. +// +// GAP B — LIVENESS preflight. Presence is not validity: a BYOK workspace can +// carry a PRESENT-but-DEAD token (revoked/expired) and still 201 + provision, +// only to wedge at first LLM call. This gate makes the cheapest authenticated +// probe (a models-list GET, else a 1-token no-op completion) against the +// provider's base_url (derived from providers.yaml — the SSOT) and FAILS +// CLOSED on a hard auth failure (401/403). The crux distinction, documented +// on probeBYOKCredentialLiveness: a DEAD token (401/403) MUST block; a +// TRANSIENT failure (network/timeout/5xx) MUST NOT block (a flaky upstream +// cannot be allowed to wedge every create) — it logs loudly and allows. +// +// Both gates are scoped to the resolved billing mode == byok. platform_managed +// (the CP proxy owns the credential) and disabled (no platform-billed LLM) are +// untouched. + +import ( + "context" + "fmt" + "io" + "log" + "net/http" + "strings" + "time" + + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto" + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db" + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/providers" +) + +// anyBYOKCredentialKeyPresent is the SINGLE presence predicate shared by the +// create-time gate (this file) and the provision-time check +// (hasAnyPlatformManagedLLMKey). It reports whether the supplied set of present +// credential env-var NAMES contains at least one platform-managed-shaped LLM +// bypass key — the tenant's own usable BYOK credential. The caller is +// responsible for only including keys whose VALUE is non-empty (the provision +// path filters on TrimSpace != "", the create path includes only non-empty +// payload secrets + configured global_secrets keys). +// +// Keeping the rule here — not duplicated in two call sites — is the whole point: +// "what counts as a usable BYOK credential" is defined once. +func anyBYOKCredentialKeyPresent(presentKeys map[string]struct{}) bool { + for key := range presentKeys { + if _, ok := platformManagedDirectLLMBypassKeys[strings.ToUpper(strings.TrimSpace(key))]; ok { + return true + } + } + return false +} + +// gatherCreateTimeLLMCredKeys collects the set of credential env-var NAMES that +// will be available to the workspace at provision time, as known at CREATE: +// +// - the create payload's own secrets (workspace scope), non-empty values only; +// - the tenant's configured global_secrets keys (operator/global scope). +// +// It returns ONLY non-empty keys, so the set can be passed straight to +// anyBYOKCredentialKeyPresent (presence) and is the right input for the +// auth-env disambiguation DeriveProvider uses. Returns the full key set (not +// just bypass keys) because DeriveProvider's tie-break wants every recognized +// auth-env name; the bypass-key filter is applied by anyBYOKCredentialKeyPresent. +// +// The global_secrets read is presence-only (key names, never values) — values +// are decrypted only by the liveness probe, and only for the resolved provider's +// own auth_env keys. +func gatherCreateTimeLLMCredKeys(ctx context.Context, payloadSecrets map[string]string) map[string]struct{} { + keys := map[string]struct{}{} + for k, v := range payloadSecrets { + if strings.TrimSpace(v) != "" { + keys[k] = struct{}{} + } + } + // Global/operator-scope keys (presence only). Best-effort: a read error + // degrades to "only payload secrets" — the gate then fails CLOSED if the + // payload alone lacks a credential, which is the safe direction (a byok + // create that depends on a global cred we could not enumerate is rejected + // with an actionable error rather than silently provisioned). + globalKeys, err := loadConfiguredGlobalSecretKeys(ctx) + if err != nil { + log.Printf("byok_credential_gate: WARN — could not load global_secrets keys for create-time BYOK check: %v (proceeding with payload secrets only)", err) + return keys + } + for k := range globalKeys { + keys[k] = struct{}{} + } + return keys +} + +// availableAuthEnvNamesFromKeySet returns the subset of presentKeys that are +// recognized provider auth-env names — the DeriveProvider tie-break input — so +// the create-time derive splits oauth-vs-api the same way the provision path +// does. Mirror of availableAuthEnvNames but over a key SET (the create path has +// no env-value map, only presence). +func availableAuthEnvNamesFromKeySet(presentKeys map[string]struct{}) []string { + authSet := authEnvNameSet() + var out []string + for k := range presentKeys { + if _, ok := authSet[k]; ok { + out = append(out, k) + } + } + return out +} + +// createBYOKGateResult is the outcome of the create-time BYOK gate. Code is the +// structured error code ("" on pass); Message is the user-visible reason; HTTP +// is the status to return; Provider names the derived provider when known +// (surfaced in logs + the structured error). When OK is true the create +// proceeds. +type createBYOKGateResult struct { + OK bool + Code string + Message string + HTTP int + Provider string +} + +// passCreateBYOKGate is the proceed sentinel. +func passCreateBYOKGate() createBYOKGateResult { return createBYOKGateResult{OK: true} } + +// evaluateCreateBYOKCredentialGate runs GAP A (presence) and GAP B (liveness) +// at the create boundary for a (runtime, model) pair, given the credential keys +// that will be available at provision time and a probe function (injected so +// tests mock the network — no real outbound in unit tests). +// +// Flow: +// 1. DERIVE the provider from (runtime, model) via the registry SSOT and key +// the byok-vs-platform decision off IsPlatform(derived) — the SAME rule +// ResolveLLMBillingModeDerived's precedence-2 uses. We derive DIRECTLY here +// (not via ResolveLLMBillingModeDerived) because at create there is NO +// workspace row yet: that resolver short-circuits a "" workspaceID to the +// pure platform-default (its pre-provision contract) and would never see +// byok. There is also no operator-override column to honor pre-row, so a +// direct derive is both correct and sufficient. A derive failure +// (unregistered/ambiguous) → default-closed to platform (NOT byok), exactly +// as the resolver does — never fail a credential-less create as byok on an +// underivable model. (The model-registry gate upstream already rejects +// unregistered models, so this is belt-and-suspenders.) +// 2. GAP A: if byok and NO usable credential key is present → reject +// MISSING_BYOK_CREDENTIAL (4xx, synchronous). +// 3. GAP B: a credential IS present — probe its LIVENESS. Fail CLOSED only on a +// hard auth failure (401/403); a transient/network/5xx/timeout logs loudly +// and ALLOWS (see probeBYOKCredentialLiveness for the crux rationale). +// +// External workspaces are handled by the caller (they have no model/provider +// contract) — this function assumes a spawned-runtime create. +func (h *WorkspaceHandler) evaluateCreateBYOKCredentialGate( + ctx context.Context, + runtime, model string, + availableCredKeys map[string]struct{}, + credValueForKey func(ctx context.Context, key string) (string, bool), + probe byokLivenessProbe, +) createBYOKGateResult { + authEnv := availableAuthEnvNamesFromKeySet(availableCredKeys) + + manifest, mErr := providerRegistry() + if mErr != nil || manifest == nil { + // Registry unavailable (a build-time defect the verify-providers-gen / + // sync gates catch). We cannot derive the mode — default-closed to + // platform (not byok), so the gate enforces nothing. Mirrors the + // resolver's own registry-load fail-direction. + log.Printf("byok_credential_gate: WARN — provider registry unavailable, skipping BYOK create gate (runtime=%q model=%q): %v", runtime, model, mErr) + return passCreateBYOKGate() + } + provider, dErr := manifest.DeriveProvider(runtime, model, authEnv) + if dErr != nil { + // No model / unknown runtime / unregistered / ambiguous → default-closed + // to platform (NOT byok). A derive miss never fails a create as byok. + log.Printf("byok_credential_gate: provider underivable for create (runtime=%q model=%q): %v — treating as non-byok (default-closed to platform)", runtime, model, dErr) + return passCreateBYOKGate() + } + if provider.IsPlatform() { + // platform_managed — the CP proxy owns the credential; nothing for this + // gate to enforce. (disabled is never DERIVED — it is only an explicit + // operator override, which does not exist pre-row.) + return passCreateBYOKGate() + } + + // A non-platform (BYOK) provider was derived. Enforce the BYOK contract. + + // GAP A — PRESENCE. Reuse the EXACT predicate the provision-time check keys + // off (anyBYOKCredentialKeyPresent) so create and provision can never + // disagree on "what is a usable BYOK credential." + if !anyBYOKCredentialKeyPresent(availableCredKeys) { + msg := formatMissingBYOKCredentialError(LLMBillingModeBYOK) + log.Printf("Create: FAIL-CLOSED (MISSING_BYOK_CREDENTIAL) — runtime=%q model=%q resolved=byok provider=%q but no usable LLM credential present at any scope", + runtime, model, provider.Name) + return createBYOKGateResult{ + OK: false, + Code: "MISSING_BYOK_CREDENTIAL", + Message: msg, + HTTP: http.StatusUnprocessableEntity, + Provider: provider.Name, + } + } + + // GAP B — LIVENESS. A credential IS present; probe that it actually works. + // Disabled when no probe is wired (provision-only deployments / probe off). + if probe == nil { + return passCreateBYOKGate() + } + return runBYOKLivenessGate(ctx, provider, availableCredKeys, credValueForKey, probe) +} + +// runBYOKLivenessGate selects the credential VALUE for the derived provider's +// auth_env and runs the liveness probe. Fails CLOSED only on a hard auth +// failure; transient failures (and an unreadable credential value) pass. +func runBYOKLivenessGate( + ctx context.Context, + provider providers.Provider, + availableCredKeys map[string]struct{}, + credValueForKey func(ctx context.Context, key string) (string, bool), + probe byokLivenessProbe, +) createBYOKGateResult { + if credValueForKey == nil { + // No value resolver wired — presence already passed; nothing to probe. + return passCreateBYOKGate() + } + // Pick the FIRST of the provider's auth_env keys that is both present and + // readable. We probe a single credential — the one the provider would + // actually authenticate with — not every key. + credKey, credVal := "", "" + for _, want := range provider.AuthEnv { + if _, present := availableCredKeys[want]; !present { + continue + } + if v, ok := credValueForKey(ctx, want); ok && strings.TrimSpace(v) != "" { + credKey, credVal = want, v + break + } + } + if credKey == "" { + // The present key(s) for this provider could not be read back (decrypt + // miss / value lives only in operator store we don't decrypt here). + // Presence already passed; we cannot probe a value we don't have, so + // allow — the provision-time path will still re-check presence, and a + // truly dead token surfaces there. Do NOT fail closed on an unreadable + // value (that would block legitimate global-scope creds). + log.Printf("byok_credential_gate: liveness — provider=%q has a present auth key but no readable value at create; skipping probe (presence already satisfied)", provider.Name) + return passCreateBYOKGate() + } + + outcome := probeBYOKCredentialLiveness(ctx, provider, credKey, credVal, probe) + switch outcome.kind { + case livenessDead: + msg := formatInvalidBYOKCredentialError(provider.Name, credKey, outcome.status) + log.Printf("Create: FAIL-CLOSED (BYOK_CREDENTIAL_INVALID) — provider=%q auth_env=%q probe returned HTTP %d (dead/revoked credential)", + provider.Name, credKey, outcome.status) + return createBYOKGateResult{ + OK: false, + Code: "BYOK_CREDENTIAL_INVALID", + Message: msg, + HTTP: http.StatusUnprocessableEntity, + Provider: provider.Name, + } + case livenessTransient: + // LOUD log, but ALLOW — a flaky upstream / network blip / 5xx must not + // block every create. This is the documented transient-vs-dead split. + log.Printf("Create: BYOK liveness probe TRANSIENT for provider=%q auth_env=%q (status=%d err=%v) — NOT blocking (transient != dead); provision-time re-check remains", + provider.Name, credKey, outcome.status, outcome.err) + return passCreateBYOKGate() + default: // livenessOK + log.Printf("Create: BYOK liveness probe OK for provider=%q auth_env=%q (status=%d)", provider.Name, credKey, outcome.status) + return passCreateBYOKGate() + } +} + +// ---------------------------------------------------------------------------- +// GAP B — liveness probe +// ---------------------------------------------------------------------------- + +// byokLivenessProbe is the injected outbound-HTTP seam: given a prepared +// request it returns the response (or a transport error). Production wires the +// real http.Client (see defaultBYOKLivenessProbe); tests inject an httptest / +// stub so NO real network is touched. The probe MUST honor the request context +// deadline. +type byokLivenessProbe func(req *http.Request) (*http.Response, error) + +// byokProbeTimeout is the hard cap on a single liveness probe. Kept tight (the +// probe is on the synchronous create path) — long enough for a healthy +// models-list GET, short enough that a hung upstream degrades to a TRANSIENT +// (allow) rather than wedging the create. +const byokProbeTimeout = 6 * time.Second + +type livenessKind int + +const ( + livenessOK livenessKind = iota + // livenessDead — a HARD auth failure (401/403). The credential is present + // but rejected: revoked, expired, or wrong scope. MUST fail closed. + livenessDead + // livenessTransient — network error, timeout, 5xx, or any non-auth status. + // The credential MIGHT be fine; the upstream is unreachable/unhappy right + // now. MUST NOT fail closed (a flaky network cannot block all creates). + livenessTransient +) + +type livenessOutcome struct { + kind livenessKind + status int // HTTP status observed (0 when no response — transport error/timeout) + err error // transport/timeout error when kind==livenessTransient with no response +} + +// probeBYOKCredentialLiveness makes ONE cheapest-possible authenticated call to +// the provider's upstream and classifies the result. +// +// ============================ THE CRUX ============================ +// The auth-failure vs transient-failure distinction is the entire point of this +// probe, and getting the FAIL DIRECTION right is security-vs-availability +// critical: +// +// - 401 / 403 → livenessDead → FAIL CLOSED. The upstream positively rejected +// the credential. A present-but-dead token is exactly the gap GAP B closes +// (a revoked key that would 201 + provision + wedge at first call). Blocking +// here gives the user synchronous, actionable feedback. +// +// - network error / DNS / connection refused / TLS / TIMEOUT → livenessTransient +// → ALLOW (loud log). The credential's validity is UNKNOWN — the upstream is +// unreachable. Failing closed here would let a single flaky network minute, +// or an upstream we cannot reach from this VPC, block EVERY byok create. The +// credential may be perfectly valid; we must not punish it for the network. +// +// - 5xx (500/502/503/504) → livenessTransient → ALLOW. The upstream is having +// a bad time; this says NOTHING about the credential. Same reasoning as a +// network error. +// +// - 429 (rate limited) → livenessTransient → ALLOW. A rate-limit means the key +// is being ACCEPTED (you cannot be rate-limited without authenticating) — +// the opposite of dead. Never block on it. +// +// - 2xx (and any other non-auth, non-5xx 4xx like 400/404) → livenessOK. A 2xx +// is an unambiguous pass. A 400/404 means the endpoint authenticated us and +// then disliked the request shape (e.g. a provider without a models-list +// route) — auth succeeded, so it is NOT a dead credential. Treat as OK +// rather than transient so a benign endpoint-shape mismatch never blocks. +// +// A DEAD token (401/403) MUST block; a flaky network MUST NOT. That asymmetry is +// the contract. +// ================================================================= +func probeBYOKCredentialLiveness(parent context.Context, provider providers.Provider, credKey, credVal string, probe byokLivenessProbe) livenessOutcome { + ctx, cancel := context.WithTimeout(parent, byokProbeTimeout) + defer cancel() + + req, err := buildBYOKLivenessRequest(ctx, provider, credKey, credVal) + if err != nil { + // We could not even construct a probe target (no base_url for this + // provider — e.g. an OAuth arm the CLI dials directly, no HTTP surface we + // can reach). Treat as transient/allow: we have NO evidence the + // credential is dead, and presence already passed. Never fail closed on + // "we don't know how to probe this provider." + return livenessOutcome{kind: livenessTransient, err: err} + } + + resp, err := probe(req) + if err != nil { + // Transport error or context deadline (timeout). Validity unknown → allow. + return livenessOutcome{kind: livenessTransient, err: err} + } + defer func() { + // Drain + close so the connection can be reused and we don't leak. + _, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 4<<10)) + _ = resp.Body.Close() + }() + + switch { + case resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden: + // 401 / 403 — the upstream positively rejected the credential. DEAD. + return livenessOutcome{kind: livenessDead, status: resp.StatusCode} + case resp.StatusCode == http.StatusTooManyRequests: + // 429 — rate limited means the key authenticated. NOT dead. + return livenessOutcome{kind: livenessTransient, status: resp.StatusCode} + case resp.StatusCode >= 500: + // 5xx — upstream trouble, says nothing about the credential. + return livenessOutcome{kind: livenessTransient, status: resp.StatusCode} + default: + // 2xx, or a non-auth 4xx (400/404 — auth succeeded, request shape + // disliked). Either way the credential was NOT rejected. OK. + return livenessOutcome{kind: livenessOK, status: resp.StatusCode} + } +} + +// buildBYOKLivenessRequest constructs the cheapest authenticated probe request +// from the provider's providers.yaml row — provider-generic, no per-provider +// URL hardcoding beyond what the SSOT supplies. +// +// Endpoint: prefer a models-list GET on the OpenAI-protocol base +// (base_url_template + "/models"); for anthropic-protocol providers without an +// openai base, fall back to a minimal 1-token messages POST on +// base_url_anthropic. Auth header is chosen from the protocol + the auth-env +// key the probe is exercising (Authorization: Bearer for openai/bearer keys, +// x-api-key for anthropic API keys). +// +// Returns an error when the provider exposes NO probeable HTTP base (an OAuth +// arm whose base_url_* are null — the CLI dials the vendor directly). The caller +// treats that as transient/allow (we cannot probe what has no endpoint). +func buildBYOKLivenessRequest(ctx context.Context, provider providers.Provider, credKey, credVal string) (*http.Request, error) { + openaiBase := strings.TrimRight(strings.TrimSpace(provider.BaseURLTemplate), "/") + anthropicBase := strings.TrimRight(strings.TrimSpace(provider.BaseURLAnthropic), "/") + + // Prefer the OpenAI-protocol models-list GET (cheapest, no body, no tokens). + if openaiBase != "" { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, openaiBase+"/models", nil) + if err != nil { + return nil, err + } + applyBYOKProbeAuth(req, provider, credKey, credVal) + return req, nil + } + + // Anthropic-protocol providers expose a models-list GET too (/v1/models); + // the base already includes the version segment for the registry rows + // (e.g. https://api.anthropic.com/v1). A GET /models is the cheapest probe. + if anthropicBase != "" { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, anthropicBase+"/models", nil) + if err != nil { + return nil, err + } + applyBYOKProbeAuth(req, provider, credKey, credVal) + // Anthropic requires the version header even on GET /models. + req.Header.Set("anthropic-version", "2023-06-01") + return req, nil + } + + return nil, fmt.Errorf("provider %q exposes no probeable HTTP base (oauth/CLI-direct arm)", provider.Name) +} + +// applyBYOKProbeAuth sets the auth header the upstream expects for this +// credential. Anthropic-protocol API keys use the x-api-key header; everything +// else (openai-protocol bearer keys, anthropic bearer auth tokens) uses +// Authorization: Bearer. The choice is keyed off the provider protocol + the +// auth-env name being exercised, NOT a per-provider hardcode. +func applyBYOKProbeAuth(req *http.Request, provider providers.Provider, credKey, credVal string) { + upperKey := strings.ToUpper(strings.TrimSpace(credKey)) + // Anthropic protocol + an x-api-key-style key → x-api-key header. ANTHROPIC_API_KEY + // and KIMI_API_KEY (kimi's claude-code integration uses x-api-key) go here. + // ANTHROPIC_AUTH_TOKEN is a bearer token even on anthropic-protocol providers. + if provider.Protocol == providers.ProtocolAnthropic && upperKey != "ANTHROPIC_AUTH_TOKEN" { + req.Header.Set("x-api-key", credVal) + return + } + req.Header.Set("Authorization", "Bearer "+credVal) +} + +// defaultBYOKLivenessProbe is the production probe: a real http.Client whose +// per-call timeout is governed by the request context (set by +// probeBYOKCredentialLiveness). A modest client timeout is a second belt against +// a hung dial. +func defaultBYOKLivenessProbe(req *http.Request) (*http.Response, error) { + client := &http.Client{Timeout: byokProbeTimeout} + return client.Do(req) +} + +// formatInvalidBYOKCredentialError builds the user-facing message for a +// create rejected because a present BYOK credential was positively rejected by +// the provider (401/403). Names the provider, the credential env, and the +// observed status so the user knows exactly which secret to rotate. +func formatInvalidBYOKCredentialError(providerName, credKey string, status int) string { + return fmt.Sprintf( + "the BYOK credential %q for provider %q was rejected by the provider (HTTP %d). "+ + "The credential is present but not valid (revoked, expired, or wrong scope). "+ + "Update %q under Config → Secrets with a working key for %q and retry.", + credKey, providerName, status, credKey, providerName) +} + +// makeCreateTimeCredValueResolver returns a credValueForKey function for the +// create path: it resolves a credential's VALUE by name, preferring the +// plaintext value in the create payload's own secrets (workspace scope — no +// decrypt needed) and falling back to decrypting the matching global_secrets +// row. Returns (value, true) only when a non-empty value is found; (_, false) +// otherwise (the liveness gate then skips probing that key — presence already +// passed, and we never fail closed on an unreadable value). +// +// The closure decrypts ONLY the single key being probed (the derived provider's +// auth_env), never the whole store. +func makeCreateTimeCredValueResolver(payloadSecrets map[string]string) func(ctx context.Context, key string) (string, bool) { + return func(ctx context.Context, key string) (string, bool) { + if v, ok := payloadSecrets[key]; ok && strings.TrimSpace(v) != "" { + return v, true + } + var enc []byte + var ver int + err := db.DB.QueryRowContext(ctx, + `SELECT encrypted_value, encryption_version FROM global_secrets WHERE key = $1`, + key).Scan(&enc, &ver) + if err != nil { + return "", false + } + dec, decErr := crypto.DecryptVersioned(enc, ver) + if decErr != nil { + log.Printf("byok_credential_gate: WARN — could not decrypt global secret %q for liveness probe: %v", key, decErr) + return "", false + } + if strings.TrimSpace(string(dec)) == "" { + return "", false + } + return string(dec), true + } +} diff --git a/workspace-server/internal/handlers/byok_credential_gate_test.go b/workspace-server/internal/handlers/byok_credential_gate_test.go new file mode 100644 index 00000000..c34d6c65 --- /dev/null +++ b/workspace-server/internal/handlers/byok_credential_gate_test.go @@ -0,0 +1,450 @@ +package handlers + +// byok_credential_gate_test.go — tests for the fail-closed BYOK credential gate +// at CREATE (GAP A presence + GAP B liveness). All outbound HTTP is mocked via +// an injected byokLivenessProbe — NO real network is touched. +// +// Coverage (the five contract cases from the gap-closure spec): +// 1. BYOK create, missing credential → 422 MISSING_BYOK_CREDENTIAL +// 2. BYOK create, present-but-INVALID (401) → 422 BYOK_CREDENTIAL_INVALID +// 3. BYOK create, transient probe failure → NOT blocked (loud log, allowed) +// 4. BYOK create, valid credential (200) → proceeds (201) +// 5. platform-managed create → gate is a no-op (unaffected) + +import ( + "bytes" + "context" + "errors" + "net/http" + "net/http/httptest" + "testing" + + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto" + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/providers" + "github.com/DATA-DOG/go-sqlmock" + "github.com/gin-gonic/gin" +) + +// stubProbe returns a byokLivenessProbe that always responds with the given +// status (and no error). The response body is empty; the gate only inspects the +// status code. +func stubProbe(status int) byokLivenessProbe { + return func(req *http.Request) (*http.Response, error) { + return &http.Response{ + StatusCode: status, + Body: http.NoBody, + Header: make(http.Header), + }, nil + } +} + +// transientProbe returns a probe that fails with a transport error (the +// network-blip / timeout class). +func transientProbe(err error) byokLivenessProbe { + return func(req *http.Request) (*http.Response, error) { + return nil, err + } +} + +// expectGlobalSecretsPresenceQuery mocks the create-time global_secrets +// presence read (key names only). Pass the configured global key names. +func expectGlobalSecretsPresenceQuery(mock sqlmock.Sqlmock, keys ...string) { + rows := sqlmock.NewRows([]string{"key"}) + for _, k := range keys { + rows.AddRow(k) + } + mock.ExpectQuery(`SELECT key FROM global_secrets WHERE octet_length\(encrypted_value\) > 0 LIMIT \$1`). + WillReturnRows(rows) +} + +// expectGlobalSecretValueQuery mocks the liveness value-resolver read of a +// single global secret's encrypted value (used when the probed credential lives +// in global_secrets rather than the create payload). +func expectGlobalSecretValueQuery(mock sqlmock.Sqlmock, key, value string) { + enc, _ := crypto.Encrypt([]byte(value)) + ver := crypto.CurrentEncryptionVersion() + mock.ExpectQuery(`SELECT encrypted_value, encryption_version FROM global_secrets WHERE key = \$1`). + WithArgs(key). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}).AddRow(enc, ver)) +} + +// ============================================================================ +// Unit tests for the gate logic (no HTTP handler) — fast, deterministic. +// ============================================================================ + +// TestCreateBYOKGate_MissingCredential_RejectsPresence — GAP A: a byok-derived +// (runtime, model) with NO available credential key is rejected with +// MISSING_BYOK_CREDENTIAL before any liveness probe runs. +func TestCreateBYOKGate_MissingCredential_RejectsPresence(t *testing.T) { + h := &WorkspaceHandler{} + probeCalled := false + probe := func(req *http.Request) (*http.Response, error) { + probeCalled = true + return stubProbe(200)(req) + } + // claude-code + kimi-for-coding derives to kimi-coding (BYOK). No cred keys. + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "kimi-for-coding", + map[string]struct{}{}, // no available credentials + func(ctx context.Context, key string) (string, bool) { return "", false }, + probe, + ) + if res.OK { + t.Fatalf("expected gate to REJECT a credential-less byok create, got OK") + } + if res.Code != "MISSING_BYOK_CREDENTIAL" { + t.Errorf("expected code MISSING_BYOK_CREDENTIAL, got %q", res.Code) + } + if res.HTTP != http.StatusUnprocessableEntity { + t.Errorf("expected 422, got %d", res.HTTP) + } + if probeCalled { + t.Errorf("liveness probe must NOT run when presence already fails") + } +} + +// TestCreateBYOKGate_InvalidCredential_FailsClosed — GAP B: a PRESENT credential +// that the provider rejects (401) → BYOK_CREDENTIAL_INVALID, fail-closed. +func TestCreateBYOKGate_InvalidCredential_FailsClosed(t *testing.T) { + h := &WorkspaceHandler{} + keys := map[string]struct{}{"KIMI_API_KEY": {}} + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "kimi-for-coding", + keys, + func(ctx context.Context, key string) (string, bool) { return "sk-dead-token", true }, + stubProbe(http.StatusUnauthorized), + ) + if res.OK { + t.Fatalf("expected gate to REJECT a present-but-dead (401) credential, got OK") + } + if res.Code != "BYOK_CREDENTIAL_INVALID" { + t.Errorf("expected code BYOK_CREDENTIAL_INVALID, got %q (msg=%s)", res.Code, res.Message) + } + if res.HTTP != http.StatusUnprocessableEntity { + t.Errorf("expected 422, got %d", res.HTTP) + } +} + +// TestCreateBYOKGate_Forbidden_FailsClosed — 403 is also a hard auth failure. +func TestCreateBYOKGate_Forbidden_FailsClosed(t *testing.T) { + h := &WorkspaceHandler{} + keys := map[string]struct{}{"KIMI_API_KEY": {}} + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "kimi-for-coding", keys, + func(ctx context.Context, key string) (string, bool) { return "sk-forbidden", true }, + stubProbe(http.StatusForbidden), + ) + if res.OK || res.Code != "BYOK_CREDENTIAL_INVALID" { + t.Fatalf("expected 403 to fail closed as BYOK_CREDENTIAL_INVALID, got OK=%v code=%q", res.OK, res.Code) + } +} + +// TestCreateBYOKGate_Transient_NotBlocked — GAP B crux: a transient failure +// (network error, 5xx, 429, timeout) must NOT block the create. The credential +// validity is UNKNOWN; a flaky upstream cannot wedge every create. +func TestCreateBYOKGate_Transient_NotBlocked(t *testing.T) { + cases := []struct { + name string + probe byokLivenessProbe + }{ + {"network_error", transientProbe(errors.New("dial tcp: connection refused"))}, + {"timeout", transientProbe(context.DeadlineExceeded)}, + {"500", stubProbe(http.StatusInternalServerError)}, + {"502", stubProbe(http.StatusBadGateway)}, + {"503", stubProbe(http.StatusServiceUnavailable)}, + {"429_rate_limited", stubProbe(http.StatusTooManyRequests)}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + h := &WorkspaceHandler{} + keys := map[string]struct{}{"KIMI_API_KEY": {}} + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "kimi-for-coding", keys, + func(ctx context.Context, key string) (string, bool) { return "sk-maybe-fine", true }, + tc.probe, + ) + if !res.OK { + t.Fatalf("transient probe (%s) must NOT block create, got rejected code=%q", tc.name, res.Code) + } + }) + } +} + +// TestCreateBYOKGate_ValidCredential_Proceeds — GAP B happy path: a present, +// valid (200) credential proceeds. +func TestCreateBYOKGate_ValidCredential_Proceeds(t *testing.T) { + h := &WorkspaceHandler{} + keys := map[string]struct{}{"KIMI_API_KEY": {}} + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "kimi-for-coding", keys, + func(ctx context.Context, key string) (string, bool) { return "sk-live", true }, + stubProbe(http.StatusOK), + ) + if !res.OK { + t.Fatalf("valid (200) credential must proceed, got rejected code=%q msg=%s", res.Code, res.Message) + } +} + +// TestCreateBYOKGate_PlatformManaged_Unaffected — GAP scope: a platform-derived +// model is NOT byok, so the gate is a no-op even with no credential and even +// with a probe that would 401. The probe must never run. +func TestCreateBYOKGate_PlatformManaged_Unaffected(t *testing.T) { + h := &WorkspaceHandler{} + probeCalled := false + probe := func(req *http.Request) (*http.Response, error) { + probeCalled = true + return stubProbe(http.StatusUnauthorized)(req) + } + // claude-code + a platform-namespaced model derives to the closed `platform` + // provider → platform_managed → gate no-op. + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "anthropic/claude-opus-4-7", + map[string]struct{}{}, // no creds — irrelevant for platform + func(ctx context.Context, key string) (string, bool) { return "", false }, + probe, + ) + if !res.OK { + t.Fatalf("platform-managed create must be unaffected by the BYOK gate, got code=%q", res.Code) + } + if probeCalled { + t.Errorf("liveness probe must NOT run for a platform-managed model") + } +} + +// TestCreateBYOKGate_UnderivableModel_DefaultsClosedToPlatform — a model that +// does not derive (the model-registry gate owns rejecting it) is treated as +// non-byok (default-closed to platform), NOT failed as a credential-less byok. +func TestCreateBYOKGate_UnderivableModel_DefaultsClosedToPlatform(t *testing.T) { + h := &WorkspaceHandler{} + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "totally-made-up-xyz", + map[string]struct{}{}, + func(ctx context.Context, key string) (string, bool) { return "", false }, + stubProbe(http.StatusOK), + ) + if !res.OK { + t.Fatalf("an underivable model must default-closed to platform (non-byok), not fail as MISSING_BYOK_CREDENTIAL; got code=%q", res.Code) + } +} + +// TestCreateBYOKGate_NonAuth4xx_TreatedOK — a 400/404 means auth succeeded but +// the endpoint disliked the request shape (e.g. a provider with no models-list +// route). That is NOT a dead credential → proceed. +func TestCreateBYOKGate_NonAuth4xx_TreatedOK(t *testing.T) { + h := &WorkspaceHandler{} + keys := map[string]struct{}{"KIMI_API_KEY": {}} + for _, status := range []int{http.StatusBadRequest, http.StatusNotFound} { + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "kimi-for-coding", keys, + func(ctx context.Context, key string) (string, bool) { return "sk-live", true }, + stubProbe(status), + ) + if !res.OK { + t.Errorf("non-auth 4xx (%d) means auth succeeded; must proceed, got code=%q", status, res.Code) + } + } +} + +// TestProbeAuthHeader_AnthropicVsBearer pins the provider-generic auth-header +// selection: anthropic-protocol x-api-key keys use x-api-key; bearer keys use +// Authorization: Bearer. No per-provider URL hardcoding. +func TestProbeAuthHeader_AnthropicVsBearer(t *testing.T) { + anthropicProvider := providers.Provider{ + Name: "kimi-coding", + Protocol: providers.ProtocolAnthropic, + BaseURLTemplate: "https://api.kimi.com/coding/v1", + AuthEnv: []string{"KIMI_API_KEY"}, + } + req, err := buildBYOKLivenessRequest(context.Background(), anthropicProvider, "KIMI_API_KEY", "sk-kimi") + if err != nil { + t.Fatalf("buildBYOKLivenessRequest: %v", err) + } + if got := req.Header.Get("x-api-key"); got != "sk-kimi" { + t.Errorf("anthropic-protocol KIMI_API_KEY should set x-api-key, got %q", got) + } + if req.Header.Get("Authorization") != "" { + t.Errorf("anthropic x-api-key key must not also set Authorization, got %q", req.Header.Get("Authorization")) + } + if req.Method != http.MethodGet || req.URL.String() != "https://api.kimi.com/coding/v1/models" { + t.Errorf("expected GET .../models, got %s %s", req.Method, req.URL.String()) + } + + openaiProvider := providers.Provider{ + Name: "openai-api", + Protocol: providers.ProtocolOpenAI, + BaseURLTemplate: "https://api.openai.com/v1", + AuthEnv: []string{"OPENAI_API_KEY"}, + } + req2, err := buildBYOKLivenessRequest(context.Background(), openaiProvider, "OPENAI_API_KEY", "sk-openai") + if err != nil { + t.Fatalf("buildBYOKLivenessRequest openai: %v", err) + } + if got := req2.Header.Get("Authorization"); got != "Bearer sk-openai" { + t.Errorf("openai-protocol OPENAI_API_KEY should set Authorization: Bearer, got %q", got) + } +} + +// TestProbeRequest_NoBaseURL_Errors — an OAuth arm with null base URLs has no +// probeable HTTP surface; buildBYOKLivenessRequest errors and the gate treats it +// as transient/allow. +func TestProbeRequest_NoBaseURL_Errors(t *testing.T) { + oauthArm := providers.Provider{ + Name: "anthropic-oauth", + Protocol: providers.ProtocolAnthropic, + AuthEnv: []string{"CLAUDE_CODE_OAUTH_TOKEN"}, + // base_url_template + base_url_anthropic both empty (null in yaml). + } + _, err := buildBYOKLivenessRequest(context.Background(), oauthArm, "CLAUDE_CODE_OAUTH_TOKEN", "tok") + if err == nil { + t.Fatalf("expected an error for a provider with no probeable HTTP base") + } +} + +// ============================================================================ +// End-to-end through the Create HTTP handler — proves the wiring + 4xx body. +// ============================================================================ + +// TestWorkspaceCreate_BYOK_MissingCredential_Rejected422 — GAP A end-to-end: +// a credential-less byok create is rejected synchronously (no DB insert). +func TestWorkspaceCreate_BYOK_MissingCredential_Rejected422(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + // Probe should never run (presence fails first); stub it to 200 anyway. + handler.byokLivenessProbeOverride = stubProbe(http.StatusOK) + + // Only the global_secrets presence query is expected (returns NO keys) — + // the 422 fires before BeginTx, so any INSERT would fail ExpectationsWereMet. + expectGlobalSecretsPresenceQuery(mock /* no keys */) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + body := `{"name":"Kimi BYOK","runtime":"claude-code","model":"kimi-for-coding"}` + c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + handler.Create(c) + + if w.Code != http.StatusUnprocessableEntity { + t.Fatalf("expected 422 MISSING_BYOK_CREDENTIAL, got %d: %s", w.Code, w.Body.String()) + } + if !bytes.Contains(w.Body.Bytes(), []byte(`"code":"MISSING_BYOK_CREDENTIAL"`)) { + t.Errorf("expected code MISSING_BYOK_CREDENTIAL in body, got %s", w.Body.String()) + } + if !bytes.Contains(w.Body.Bytes(), []byte(`"provider":"kimi-coding"`)) { + t.Errorf("expected derived provider kimi-coding echoed in body, got %s", w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unexpected DB activity on synchronous reject: %v", err) + } +} + +// TestWorkspaceCreate_BYOK_InvalidCredential_Rejected422 — GAP B end-to-end: +// a present-but-dead (401) credential supplied in the payload is rejected +// synchronously with BYOK_CREDENTIAL_INVALID (no DB insert). +func TestWorkspaceCreate_BYOK_InvalidCredential_Rejected422(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + handler.byokLivenessProbeOverride = stubProbe(http.StatusUnauthorized) + + // Presence is satisfied by the PAYLOAD secret (no global keys needed); the + // value resolver reads the payload directly (no global_secrets value query). + expectGlobalSecretsPresenceQuery(mock /* no global keys */) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + body := `{"name":"Kimi Dead","runtime":"claude-code","model":"kimi-for-coding","secrets":{"KIMI_API_KEY":"sk-revoked"}}` + c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + handler.Create(c) + + if w.Code != http.StatusUnprocessableEntity { + t.Fatalf("expected 422 BYOK_CREDENTIAL_INVALID, got %d: %s", w.Code, w.Body.String()) + } + if !bytes.Contains(w.Body.Bytes(), []byte(`"code":"BYOK_CREDENTIAL_INVALID"`)) { + t.Errorf("expected code BYOK_CREDENTIAL_INVALID in body, got %s", w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unexpected DB activity on synchronous reject: %v", err) + } +} + +// TestWorkspaceCreate_BYOK_TransientProbe_Proceeds — GAP B end-to-end: a +// transient probe failure does NOT block; the create proceeds to 201. The +// credential is supplied via a GLOBAL secret (so there is no in-tx +// workspace_secrets write and the create flow matches the registry-proceed +// tests). Presence passes via the global key; the value resolver decrypts the +// global secret value to feed the probe, which then errors transiently. +func TestWorkspaceCreate_BYOK_TransientProbe_Proceeds(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + waitForHandlerAsyncBeforeDBCleanup(t, handler) + handler.byokLivenessProbeOverride = transientProbe(errors.New("dial tcp: i/o timeout")) + + // Presence: global_secrets carries KIMI_API_KEY. + expectGlobalSecretsPresenceQuery(mock, "KIMI_API_KEY") + // Liveness value resolver: decrypt the global KIMI_API_KEY value. + expectGlobalSecretValueQuery(mock, "KIMI_API_KEY", "sk-maybe-fine") + // Create proceeds (transient = allow). No payload secrets → no in-tx + // workspace_secrets write; the create flow matches the registry-proceed path. + mock.ExpectBegin() + mock.ExpectExec("INSERT INTO workspaces").WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectCommit() + mock.ExpectExec("INSERT INTO canvas_layouts").WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO structure_events").WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO workspace_auth_tokens").WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + body := `{"name":"Kimi Transient","runtime":"claude-code","model":"kimi-for-coding"}` + c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + handler.Create(c) + + if w.Code != http.StatusCreated { + t.Fatalf("transient probe must NOT block; expected 201, got %d: %s", w.Code, w.Body.String()) + } +} + +// TestWorkspaceCreate_BYOK_ValidGlobalCredential_Proceeds — GAP B happy path +// e2e: a valid (200) global credential proceeds to 201. +func TestWorkspaceCreate_BYOK_ValidGlobalCredential_Proceeds(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + waitForHandlerAsyncBeforeDBCleanup(t, handler) + handler.byokLivenessProbeOverride = stubProbe(http.StatusOK) + + expectGlobalSecretsPresenceQuery(mock, "KIMI_API_KEY") + expectGlobalSecretValueQuery(mock, "KIMI_API_KEY", "sk-live") + mock.ExpectBegin() + mock.ExpectExec("INSERT INTO workspaces").WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectCommit() + mock.ExpectExec("INSERT INTO canvas_layouts").WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO structure_events").WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO workspace_auth_tokens").WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + body := `{"name":"Kimi Live","runtime":"claude-code","model":"kimi-for-coding"}` + c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + handler.Create(c) + + if w.Code != http.StatusCreated { + t.Fatalf("valid credential must proceed; expected 201, got %d: %s", w.Code, w.Body.String()) + } +} diff --git a/workspace-server/internal/handlers/handlers_additional_test.go b/workspace-server/internal/handlers/handlers_additional_test.go index 82af0bd6..b66523ca 100644 --- a/workspace-server/internal/handlers/handlers_additional_test.go +++ b/workspace-server/internal/handlers/handlers_additional_test.go @@ -48,7 +48,8 @@ func TestWorkspaceCreate_WithParentID(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) - body := `{"name":"Child Agent","model":"anthropic:claude-opus-4-7","parent_id":"parent-ws-123"}` + // Platform-managed model id → BYOK gate no-op (this test asserts parent_id). + body := `{"name":"Child Agent","model":"anthropic/claude-opus-4-7","parent_id":"parent-ws-123"}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") @@ -86,7 +87,9 @@ func TestWorkspaceCreate_ExplicitClaudeCodeRuntime(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) - body := `{"name":"CC Agent","tier":2,"runtime":"claude-code","model":"sonnet","canvas":{"x":10,"y":20}}` + // Platform-managed model id → BYOK gate no-op (this test asserts the + // claude-code runtime label, not billing). + body := `{"name":"CC Agent","tier":2,"runtime":"claude-code","model":"anthropic/claude-sonnet-4-6","canvas":{"x":10,"y":20}}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") @@ -309,7 +312,9 @@ func TestWorkspaceCreate_MaxConcurrentTasksOverride(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) - body := `{"name":"Leader Agent","runtime":"claude-code","model":"sonnet","max_concurrent_tasks":3}` + // Platform-managed model id → BYOK gate no-op (this test asserts + // max_concurrent_tasks). + body := `{"name":"Leader Agent","runtime":"claude-code","model":"anthropic/claude-sonnet-4-6","max_concurrent_tasks":3}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") diff --git a/workspace-server/internal/handlers/handlers_test.go b/workspace-server/internal/handlers/handlers_test.go index 16351f05..76242d08 100644 --- a/workspace-server/internal/handlers/handlers_test.go +++ b/workspace-server/internal/handlers/handlers_test.go @@ -398,7 +398,10 @@ func TestWorkspaceCreate(t *testing.T) { // the bare-defaults path (no template, no runtime → claude-code), so // the body must declare an explicit model. Using a claude-code-compatible // id; the test doesn't exercise model semantics beyond presence. - body := `{"name":"Test Agent","model":"anthropic:claude-opus-4-7","canvas":{"x":100,"y":200}}` + // Platform-managed model id (derives to the closed `platform` provider) so + // the fail-closed BYOK create gate is a no-op — this test exercises the + // generic create flow, not billing/credential semantics. + body := `{"name":"Test Agent","model":"anthropic/claude-opus-4-7","canvas":{"x":100,"y":200}}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") @@ -466,7 +469,9 @@ func TestWorkspaceCreate_ReturnsAuthToken_201(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) - body := `{"name":"Token Holder","model":"anthropic:claude-opus-4-7"}` + // Platform-managed model id → BYOK gate no-op (this test asserts the auth + // token, not billing). + body := `{"name":"Token Holder","model":"anthropic/claude-opus-4-7"}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go index 19b3b16b..26d441ca 100644 --- a/workspace-server/internal/handlers/workspace.go +++ b/workspace-server/internal/handlers/workspace.go @@ -89,6 +89,21 @@ type WorkspaceHandler struct { // for async DB users (restart, provision) before asserting results. // Matches the pattern from main commit 1c3b4ff3. asyncWG sync.WaitGroup + // byokLivenessProbeOverride lets tests inject a mock outbound-HTTP probe + // for the create-time BYOK credential LIVENESS gate (GAP B) so no real + // network is touched in tests. nil in production → effectiveBYOKLivenessProbe + // returns defaultBYOKLivenessProbe (a real http.Client). Set it to a stub in + // tests, or to a sentinel that disables probing. + byokLivenessProbeOverride byokLivenessProbe +} + +// effectiveBYOKLivenessProbe returns the probe the create-time liveness gate +// should use: the test override when set, else the production http.Client probe. +func (h *WorkspaceHandler) effectiveBYOKLivenessProbe() byokLivenessProbe { + if h.byokLivenessProbeOverride != nil { + return h.byokLivenessProbeOverride + } + return defaultBYOKLivenessProbe } // seedMemoryPluginAPI is the slice of the v2 memory plugin client that @@ -550,6 +565,46 @@ func (h *WorkspaceHandler) Create(c *gin.Context) { } } + // Fail-closed BYOK credential gate (byok_credential_gate.go) — synchronous, + // BEFORE any DB write so a credential-less or dead-credential BYOK create is + // a 4xx, not a 201-then-late-provision-failure. Placed AFTER the cheap field + // validations (compute / access / dir / URL) so a malformed request is + // rejected on its own merits before the heavier credential check (one + // global_secrets read + a network probe), and immediately BEFORE BeginTx so + // the rejection path never touches the DB transaction: + // + // GAP A — PRESENCE: resolved mode == byok but no usable LLM credential at + // any scope (payload secrets + configured global_secrets) → 422 + // MISSING_BYOK_CREDENTIAL. Same presence rule the provision-time check + // keys off (anyBYOKCredentialKeyPresent), so create and provision agree. + // GAP B — LIVENESS: a credential IS present — probe it actually works. A + // positively-rejected credential (401/403) → 422 BYOK_CREDENTIAL_INVALID; + // a transient/network/5xx/timeout never blocks (see the probe's doc). + // + // Skipped for external workspaces (no model/provider contract — the URL is + // the contract, mirroring the MODEL_REQUIRED / registry gates above). + if !isExternal { + availableCredKeys := gatherCreateTimeLLMCredKeys(ctx, payload.Secrets) + gate := h.evaluateCreateBYOKCredentialGate( + ctx, + payload.Runtime, + payload.Model, + availableCredKeys, + makeCreateTimeCredValueResolver(payload.Secrets), + h.effectiveBYOKLivenessProbe(), + ) + if !gate.OK { + c.JSON(gate.HTTP, gin.H{ + "error": gate.Message, + "code": gate.Code, + "runtime": payload.Runtime, + "model": payload.Model, + "provider": gate.Provider, + }) + return + } + } + tx, txErr := db.DB.BeginTx(ctx, nil) if txErr != nil { log.Printf("Create workspace: begin tx error: %v", txErr) diff --git a/workspace-server/internal/handlers/workspace_budget_test.go b/workspace-server/internal/handlers/workspace_budget_test.go index f0b7cf64..7b2a3724 100644 --- a/workspace-server/internal/handlers/workspace_budget_test.go +++ b/workspace-server/internal/handlers/workspace_budget_test.go @@ -173,7 +173,8 @@ func TestWorkspaceBudget_Create_WithLimit(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) - body := `{"name":"Budgeted Agent","model":"anthropic:claude-opus-4-7","budget_limit":1000}` + // Platform-managed model id → BYOK gate no-op (this test asserts budget_limit). + body := `{"name":"Budgeted Agent","model":"anthropic/claude-opus-4-7","budget_limit":1000}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") handler.Create(c) diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index faaa800d..aee38645 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -1141,13 +1141,21 @@ func stripPlatformManagedLLMBypassEnv(envVars map[string]string) { // global or workspace scope). Used by the byok fail-closed branch: a byok // workspace with no LLM credential at ANY scope must be aborted with // MISSING_BYOK_CREDENTIAL rather than started credential-less. +// +// This is the PROVISION-time presence rule. The CREATE-time presence gate +// (byok_credential_gate.go) reuses the SAME underlying predicate +// (anyBYOKCredentialKeyPresent) over a presence-only key set so the two +// fail-closed checks can never diverge: the rule "any non-empty +// platform-managed LLM bypass key is a usable BYOK credential" lives in exactly +// one place. func hasAnyPlatformManagedLLMKey(envVars map[string]string) bool { - for key := range platformManagedDirectLLMBypassKeys { - if strings.TrimSpace(envVars[key]) != "" { - return true + present := make(map[string]struct{}, len(envVars)) + for key, val := range envVars { + if strings.TrimSpace(val) != "" { + present[key] = struct{}{} } } - return false + return anyBYOKCredentialKeyPresent(present) } // stripNonMatchingGlobalOriginLLMCreds is the byok-branch provider-matched diff --git a/workspace-server/internal/handlers/workspace_test.go b/workspace-server/internal/handlers/workspace_test.go index 23747585..32526be2 100644 --- a/workspace-server/internal/handlers/workspace_test.go +++ b/workspace-server/internal/handlers/workspace_test.go @@ -350,7 +350,9 @@ func TestWorkspaceCreate_DBInsertError(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) - body := `{"name":"Failing Agent","model":"anthropic:claude-opus-4-7"}` + // Platform-managed model id → BYOK gate no-op (this test asserts the DB + // insert-error rollback, not billing). + body := `{"name":"Failing Agent","model":"anthropic/claude-opus-4-7"}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") @@ -396,7 +398,9 @@ func TestWorkspaceCreate_DefaultsApplied(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) - body := `{"name":"Default Agent","model":"anthropic:claude-opus-4-7"}` + // Platform-managed model id → BYOK gate no-op (this test asserts the + // status/defaults, not billing). + body := `{"name":"Default Agent","model":"anthropic/claude-opus-4-7"}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") @@ -519,6 +523,13 @@ func TestWorkspaceCreate_SecretPersistFails_RollsBack(t *testing.T) { broadcaster := newTestBroadcaster() handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + // The create-time BYOK gate runs first: anthropic:claude-opus-4-7 derives to + // anthropic-api (BYOK). Presence is satisfied by the payload OPENAI_API_KEY + // (a recognized LLM bypass key) — the liveness probe is skipped because + // OPENAI_API_KEY is not in anthropic-api's auth_env, so no value is read for + // the derived provider. Only the global presence query precedes BeginTx. + expectGlobalSecretsPresenceQuery(mock /* no global keys */) + mock.ExpectBegin() mock.ExpectExec("INSERT INTO workspaces"). WillReturnResult(sqlmock.NewResult(0, 1)) @@ -2118,6 +2129,12 @@ func TestWorkspaceCreate_718_P4_RegisteredModelProceeds(t *testing.T) { setupTestRedis(t) broadcaster := newTestBroadcaster() handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + // claude-opus-4-7 derives to anthropic-api (BYOK). The fail-closed BYOK + // create gate now requires a present + live credential — supply one via a + // global secret and stub the liveness probe to 200 (no real network). + handler.byokLivenessProbeOverride = stubProbe(http.StatusOK) + expectGlobalSecretsPresenceQuery(mock, "ANTHROPIC_API_KEY") + expectGlobalSecretValueQuery(mock, "ANTHROPIC_API_KEY", "sk-ant-live") mock.ExpectBegin() mock.ExpectExec("INSERT INTO workspaces"). @@ -2156,6 +2173,11 @@ func TestWorkspaceCreate_718_P4_LegacyColonVocabAccepted(t *testing.T) { setupTestRedis(t) broadcaster := newTestBroadcaster() handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + // anthropic:claude-opus-4-7 derives to anthropic-api (BYOK) — satisfy the + // fail-closed BYOK create gate with a present + live global credential. + handler.byokLivenessProbeOverride = stubProbe(http.StatusOK) + expectGlobalSecretsPresenceQuery(mock, "ANTHROPIC_API_KEY") + expectGlobalSecretValueQuery(mock, "ANTHROPIC_API_KEY", "sk-ant-live") mock.ExpectBegin() mock.ExpectExec("INSERT INTO workspaces"). @@ -2220,6 +2242,13 @@ func TestWorkspaceCreate_188_ExplicitRuntimeNoTemplate_OK(t *testing.T) { setupTestRedis(t) broadcaster := newTestBroadcaster() handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + // codex gpt-5.5 derives to openai-subscription (BYOK, OAuth arm). The OAuth + // arm has null base URLs → the liveness probe is unprobeable → transient/ + // allow; presence still requires a credential. Supply CODEX_AUTH_JSON via a + // global secret (presence passes; the probe is skipped as unprobeable). + handler.byokLivenessProbeOverride = stubProbe(http.StatusOK) + expectGlobalSecretsPresenceQuery(mock, "CODEX_AUTH_JSON") + expectGlobalSecretValueQuery(mock, "CODEX_AUTH_JSON", `{"tokens":"x"}`) mock.ExpectBegin() mock.ExpectExec("INSERT INTO workspaces"). -- 2.52.0