feat(byok): create-time credential gate + liveness preflight (fail-closed, no silent skip) #2328

Closed
claude-ceo-assistant wants to merge 4 commits from feat/byok-create-gate-and-liveness into main
8 changed files with 1079 additions and 12 deletions
@@ -0,0 +1,514 @@
package handlers
// byok_credential_gate.go — fail-closed BYOK credential checks at the CREATE
// boundary (molecule-core, BYOK billing-model gap-closure).
//
// Two related gaps in the fail-closed BYOK billing model are closed here, both
// in the same credential-validation area:
//
// GAP A — PRESENCE gate at CREATE. The provision-time fail-closed check
// (workspace_provision_shared.go) already aborts a BYOK provision with no
// usable credential, but that fires AFTER a 201 and a goroutine launch — a
// credential-less BYOK create looks successful, then dies late in provision.
// This gate moves the SAME presence rule to the synchronous create path so a
// credential-less BYOK create is a 4xx, not a 201-then-late-failure. It
// reuses anyBYOKCredentialKeyPresent — the exact predicate the provision
// check keys off (via hasAnyPlatformManagedLLMKey) — so the two checks can
// never diverge.
//
// GAP B — LIVENESS preflight. Presence is not validity: a BYOK workspace can
// carry a PRESENT-but-DEAD token (revoked/expired) and still 201 + provision,
// only to wedge at first LLM call. This gate makes the cheapest authenticated
// probe (a models-list GET, else a 1-token no-op completion) against the
// provider's base_url (derived from providers.yaml — the SSOT) and FAILS
// CLOSED on a hard auth failure (401/403). The crux distinction, documented
// on probeBYOKCredentialLiveness: a DEAD token (401/403) MUST block; a
// TRANSIENT failure (network/timeout/5xx) MUST NOT block (a flaky upstream
// cannot be allowed to wedge every create) — it logs loudly and allows.
//
// Both gates are scoped to the resolved billing mode == byok. platform_managed
// (the CP proxy owns the credential) and disabled (no platform-billed LLM) are
// untouched.
import (
"context"
"fmt"
"io"
"log"
"net/http"
"strings"
"time"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/providers"
)
// anyBYOKCredentialKeyPresent is the SINGLE presence predicate shared by the
// create-time gate (this file) and the provision-time check
// (hasAnyPlatformManagedLLMKey). It reports whether the supplied set of present
// credential env-var NAMES contains at least one platform-managed-shaped LLM
// bypass key — the tenant's own usable BYOK credential. The caller is
// responsible for only including keys whose VALUE is non-empty (the provision
// path filters on TrimSpace != "", the create path includes only non-empty
// payload secrets + configured global_secrets keys).
//
// Keeping the rule here — not duplicated in two call sites — is the whole point:
// "what counts as a usable BYOK credential" is defined once.
func anyBYOKCredentialKeyPresent(presentKeys map[string]struct{}) bool {
for key := range presentKeys {
if _, ok := platformManagedDirectLLMBypassKeys[strings.ToUpper(strings.TrimSpace(key))]; ok {
return true
}
}
return false
}
// gatherCreateTimeLLMCredKeys collects the set of credential env-var NAMES that
// will be available to the workspace at provision time, as known at CREATE:
//
// - the create payload's own secrets (workspace scope), non-empty values only;
// - the tenant's configured global_secrets keys (operator/global scope).
//
// It returns ONLY non-empty keys, so the set can be passed straight to
// anyBYOKCredentialKeyPresent (presence) and is the right input for the
// auth-env disambiguation DeriveProvider uses. Returns the full key set (not
// just bypass keys) because DeriveProvider's tie-break wants every recognized
// auth-env name; the bypass-key filter is applied by anyBYOKCredentialKeyPresent.
//
// The global_secrets read is presence-only (key names, never values) — values
// are decrypted only by the liveness probe, and only for the resolved provider's
// own auth_env keys.
func gatherCreateTimeLLMCredKeys(ctx context.Context, payloadSecrets map[string]string) map[string]struct{} {
keys := map[string]struct{}{}
for k, v := range payloadSecrets {
if strings.TrimSpace(v) != "" {
keys[k] = struct{}{}
}
}
// Global/operator-scope keys (presence only). Best-effort: a read error
// degrades to "only payload secrets" — the gate then fails CLOSED if the
// payload alone lacks a credential, which is the safe direction (a byok
// create that depends on a global cred we could not enumerate is rejected
// with an actionable error rather than silently provisioned).
globalKeys, err := loadConfiguredGlobalSecretKeys(ctx)
if err != nil {
log.Printf("byok_credential_gate: WARN — could not load global_secrets keys for create-time BYOK check: %v (proceeding with payload secrets only)", err)
return keys
}
for k := range globalKeys {
keys[k] = struct{}{}
}
return keys
}
// availableAuthEnvNamesFromKeySet returns the subset of presentKeys that are
// recognized provider auth-env names — the DeriveProvider tie-break input — so
// the create-time derive splits oauth-vs-api the same way the provision path
// does. Mirror of availableAuthEnvNames but over a key SET (the create path has
// no env-value map, only presence).
func availableAuthEnvNamesFromKeySet(presentKeys map[string]struct{}) []string {
authSet := authEnvNameSet()
var out []string
for k := range presentKeys {
if _, ok := authSet[k]; ok {
out = append(out, k)
}
}
return out
}
// createBYOKGateResult is the outcome of the create-time BYOK gate. Code is the
// structured error code ("" on pass); Message is the user-visible reason; HTTP
// is the status to return; Provider names the derived provider when known
// (surfaced in logs + the structured error). When OK is true the create
// proceeds.
type createBYOKGateResult struct {
OK bool
Code string
Message string
HTTP int
Provider string
}
// passCreateBYOKGate is the proceed sentinel.
func passCreateBYOKGate() createBYOKGateResult { return createBYOKGateResult{OK: true} }
// evaluateCreateBYOKCredentialGate runs GAP A (presence) and GAP B (liveness)
// at the create boundary for a (runtime, model) pair, given the credential keys
// that will be available at provision time and a probe function (injected so
// tests mock the network — no real outbound in unit tests).
//
// Flow:
// 1. DERIVE the provider from (runtime, model) via the registry SSOT and key
// the byok-vs-platform decision off IsPlatform(derived) — the SAME rule
// ResolveLLMBillingModeDerived's precedence-2 uses. We derive DIRECTLY here
// (not via ResolveLLMBillingModeDerived) because at create there is NO
// workspace row yet: that resolver short-circuits a "" workspaceID to the
// pure platform-default (its pre-provision contract) and would never see
// byok. There is also no operator-override column to honor pre-row, so a
// direct derive is both correct and sufficient. A derive failure
// (unregistered/ambiguous) → default-closed to platform (NOT byok), exactly
// as the resolver does — never fail a credential-less create as byok on an
// underivable model. (The model-registry gate upstream already rejects
// unregistered models, so this is belt-and-suspenders.)
// 2. GAP A: if byok and NO usable credential key is present → reject
// MISSING_BYOK_CREDENTIAL (4xx, synchronous).
// 3. GAP B: a credential IS present — probe its LIVENESS. Fail CLOSED only on a
// hard auth failure (401/403); a transient/network/5xx/timeout logs loudly
// and ALLOWS (see probeBYOKCredentialLiveness for the crux rationale).
//
// External workspaces are handled by the caller (they have no model/provider
// contract) — this function assumes a spawned-runtime create.
func (h *WorkspaceHandler) evaluateCreateBYOKCredentialGate(
ctx context.Context,
runtime, model string,
availableCredKeys map[string]struct{},
credValueForKey func(ctx context.Context, key string) (string, bool),
probe byokLivenessProbe,
) createBYOKGateResult {
authEnv := availableAuthEnvNamesFromKeySet(availableCredKeys)
manifest, mErr := providerRegistry()
if mErr != nil || manifest == nil {
// Registry unavailable (a build-time defect the verify-providers-gen /
// sync gates catch). We cannot derive the mode — default-closed to
// platform (not byok), so the gate enforces nothing. Mirrors the
// resolver's own registry-load fail-direction.
log.Printf("byok_credential_gate: WARN — provider registry unavailable, skipping BYOK create gate (runtime=%q model=%q): %v", runtime, model, mErr)
return passCreateBYOKGate()
}
provider, dErr := manifest.DeriveProvider(runtime, model, authEnv)
if dErr != nil {
// No model / unknown runtime / unregistered / ambiguous → default-closed
// to platform (NOT byok). A derive miss never fails a create as byok.
log.Printf("byok_credential_gate: provider underivable for create (runtime=%q model=%q): %v — treating as non-byok (default-closed to platform)", runtime, model, dErr)
return passCreateBYOKGate()
}
if provider.IsPlatform() {
// platform_managed — the CP proxy owns the credential; nothing for this
// gate to enforce. (disabled is never DERIVED — it is only an explicit
// operator override, which does not exist pre-row.)
return passCreateBYOKGate()
}
// A non-platform (BYOK) provider was derived. Enforce the BYOK contract.
// GAP A — PRESENCE. Reuse the EXACT predicate the provision-time check keys
// off (anyBYOKCredentialKeyPresent) so create and provision can never
// disagree on "what is a usable BYOK credential."
if !anyBYOKCredentialKeyPresent(availableCredKeys) {
msg := formatMissingBYOKCredentialError(LLMBillingModeBYOK)
log.Printf("Create: FAIL-CLOSED (MISSING_BYOK_CREDENTIAL) — runtime=%q model=%q resolved=byok provider=%q but no usable LLM credential present at any scope",
runtime, model, provider.Name)
return createBYOKGateResult{
OK: false,
Code: "MISSING_BYOK_CREDENTIAL",
Message: msg,
HTTP: http.StatusUnprocessableEntity,
Provider: provider.Name,
}
}
// GAP B — LIVENESS. A credential IS present; probe that it actually works.
// Disabled when no probe is wired (provision-only deployments / probe off).
if probe == nil {
return passCreateBYOKGate()
}
return runBYOKLivenessGate(ctx, provider, availableCredKeys, credValueForKey, probe)
}
// runBYOKLivenessGate selects the credential VALUE for the derived provider's
// auth_env and runs the liveness probe. Fails CLOSED only on a hard auth
// failure; transient failures (and an unreadable credential value) pass.
func runBYOKLivenessGate(
ctx context.Context,
provider providers.Provider,
availableCredKeys map[string]struct{},
credValueForKey func(ctx context.Context, key string) (string, bool),
probe byokLivenessProbe,
) createBYOKGateResult {
if credValueForKey == nil {
// No value resolver wired — presence already passed; nothing to probe.
return passCreateBYOKGate()
}
// Pick the FIRST of the provider's auth_env keys that is both present and
// readable. We probe a single credential — the one the provider would
// actually authenticate with — not every key.
credKey, credVal := "", ""
for _, want := range provider.AuthEnv {
if _, present := availableCredKeys[want]; !present {
continue
}
if v, ok := credValueForKey(ctx, want); ok && strings.TrimSpace(v) != "" {
credKey, credVal = want, v
break
}
}
if credKey == "" {
// The present key(s) for this provider could not be read back (decrypt
// miss / value lives only in operator store we don't decrypt here).
// Presence already passed; we cannot probe a value we don't have, so
// allow — the provision-time path will still re-check presence, and a
// truly dead token surfaces there. Do NOT fail closed on an unreadable
// value (that would block legitimate global-scope creds).
log.Printf("byok_credential_gate: liveness — provider=%q has a present auth key but no readable value at create; skipping probe (presence already satisfied)", provider.Name)
return passCreateBYOKGate()
}
outcome := probeBYOKCredentialLiveness(ctx, provider, credKey, credVal, probe)
switch outcome.kind {
case livenessDead:
msg := formatInvalidBYOKCredentialError(provider.Name, credKey, outcome.status)
log.Printf("Create: FAIL-CLOSED (BYOK_CREDENTIAL_INVALID) — provider=%q auth_env=%q probe returned HTTP %d (dead/revoked credential)",
provider.Name, credKey, outcome.status)
return createBYOKGateResult{
OK: false,
Code: "BYOK_CREDENTIAL_INVALID",
Message: msg,
HTTP: http.StatusUnprocessableEntity,
Provider: provider.Name,
}
case livenessTransient:
// LOUD log, but ALLOW — a flaky upstream / network blip / 5xx must not
// block every create. This is the documented transient-vs-dead split.
log.Printf("Create: BYOK liveness probe TRANSIENT for provider=%q auth_env=%q (status=%d err=%v) — NOT blocking (transient != dead); provision-time re-check remains",
provider.Name, credKey, outcome.status, outcome.err)
return passCreateBYOKGate()
default: // livenessOK
log.Printf("Create: BYOK liveness probe OK for provider=%q auth_env=%q (status=%d)", provider.Name, credKey, outcome.status)
return passCreateBYOKGate()
}
}
// ----------------------------------------------------------------------------
// GAP B — liveness probe
// ----------------------------------------------------------------------------
// byokLivenessProbe is the injected outbound-HTTP seam: given a prepared
// request it returns the response (or a transport error). Production wires the
// real http.Client (see defaultBYOKLivenessProbe); tests inject an httptest /
// stub so NO real network is touched. The probe MUST honor the request context
// deadline.
type byokLivenessProbe func(req *http.Request) (*http.Response, error)
// byokProbeTimeout is the hard cap on a single liveness probe. Kept tight (the
// probe is on the synchronous create path) — long enough for a healthy
// models-list GET, short enough that a hung upstream degrades to a TRANSIENT
// (allow) rather than wedging the create.
const byokProbeTimeout = 6 * time.Second
type livenessKind int
const (
livenessOK livenessKind = iota
// livenessDead — a HARD auth failure (401/403). The credential is present
// but rejected: revoked, expired, or wrong scope. MUST fail closed.
livenessDead
// livenessTransient — network error, timeout, 5xx, or any non-auth status.
// The credential MIGHT be fine; the upstream is unreachable/unhappy right
// now. MUST NOT fail closed (a flaky network cannot block all creates).
livenessTransient
)
type livenessOutcome struct {
kind livenessKind
status int // HTTP status observed (0 when no response — transport error/timeout)
err error // transport/timeout error when kind==livenessTransient with no response
}
// probeBYOKCredentialLiveness makes ONE cheapest-possible authenticated call to
// the provider's upstream and classifies the result.
//
// ============================ THE CRUX ============================
// The auth-failure vs transient-failure distinction is the entire point of this
// probe, and getting the FAIL DIRECTION right is security-vs-availability
// critical:
//
// - 401 / 403 → livenessDead → FAIL CLOSED. The upstream positively rejected
// the credential. A present-but-dead token is exactly the gap GAP B closes
// (a revoked key that would 201 + provision + wedge at first call). Blocking
// here gives the user synchronous, actionable feedback.
//
// - network error / DNS / connection refused / TLS / TIMEOUT → livenessTransient
// → ALLOW (loud log). The credential's validity is UNKNOWN — the upstream is
// unreachable. Failing closed here would let a single flaky network minute,
// or an upstream we cannot reach from this VPC, block EVERY byok create. The
// credential may be perfectly valid; we must not punish it for the network.
//
// - 5xx (500/502/503/504) → livenessTransient → ALLOW. The upstream is having
// a bad time; this says NOTHING about the credential. Same reasoning as a
// network error.
//
// - 429 (rate limited) → livenessTransient → ALLOW. A rate-limit means the key
// is being ACCEPTED (you cannot be rate-limited without authenticating) —
// the opposite of dead. Never block on it.
//
// - 2xx (and any other non-auth, non-5xx 4xx like 400/404) → livenessOK. A 2xx
// is an unambiguous pass. A 400/404 means the endpoint authenticated us and
// then disliked the request shape (e.g. a provider without a models-list
// route) — auth succeeded, so it is NOT a dead credential. Treat as OK
// rather than transient so a benign endpoint-shape mismatch never blocks.
//
// A DEAD token (401/403) MUST block; a flaky network MUST NOT. That asymmetry is
// the contract.
// =================================================================
func probeBYOKCredentialLiveness(parent context.Context, provider providers.Provider, credKey, credVal string, probe byokLivenessProbe) livenessOutcome {
ctx, cancel := context.WithTimeout(parent, byokProbeTimeout)
defer cancel()
req, err := buildBYOKLivenessRequest(ctx, provider, credKey, credVal)
if err != nil {
// We could not even construct a probe target (no base_url for this
// provider — e.g. an OAuth arm the CLI dials directly, no HTTP surface we
// can reach). Treat as transient/allow: we have NO evidence the
// credential is dead, and presence already passed. Never fail closed on
// "we don't know how to probe this provider."
return livenessOutcome{kind: livenessTransient, err: err}
}
resp, err := probe(req)
if err != nil {
// Transport error or context deadline (timeout). Validity unknown → allow.
return livenessOutcome{kind: livenessTransient, err: err}
}
defer func() {
// Drain + close so the connection can be reused and we don't leak.
_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 4<<10))
_ = resp.Body.Close()
}()
switch {
case resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden:
// 401 / 403 — the upstream positively rejected the credential. DEAD.
return livenessOutcome{kind: livenessDead, status: resp.StatusCode}
case resp.StatusCode == http.StatusTooManyRequests:
// 429 — rate limited means the key authenticated. NOT dead.
return livenessOutcome{kind: livenessTransient, status: resp.StatusCode}
case resp.StatusCode >= 500:
// 5xx — upstream trouble, says nothing about the credential.
return livenessOutcome{kind: livenessTransient, status: resp.StatusCode}
default:
// 2xx, or a non-auth 4xx (400/404 — auth succeeded, request shape
// disliked). Either way the credential was NOT rejected. OK.
return livenessOutcome{kind: livenessOK, status: resp.StatusCode}
}
}
// buildBYOKLivenessRequest constructs the cheapest authenticated probe request
// from the provider's providers.yaml row — provider-generic, no per-provider
// URL hardcoding beyond what the SSOT supplies.
//
// Endpoint: prefer a models-list GET on the OpenAI-protocol base
// (base_url_template + "/models"); for anthropic-protocol providers without an
// openai base, fall back to a minimal 1-token messages POST on
// base_url_anthropic. Auth header is chosen from the protocol + the auth-env
// key the probe is exercising (Authorization: Bearer for openai/bearer keys,
// x-api-key for anthropic API keys).
//
// Returns an error when the provider exposes NO probeable HTTP base (an OAuth
// arm whose base_url_* are null — the CLI dials the vendor directly). The caller
// treats that as transient/allow (we cannot probe what has no endpoint).
func buildBYOKLivenessRequest(ctx context.Context, provider providers.Provider, credKey, credVal string) (*http.Request, error) {
openaiBase := strings.TrimRight(strings.TrimSpace(provider.BaseURLTemplate), "/")
anthropicBase := strings.TrimRight(strings.TrimSpace(provider.BaseURLAnthropic), "/")
// Prefer the OpenAI-protocol models-list GET (cheapest, no body, no tokens).
if openaiBase != "" {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, openaiBase+"/models", nil)
if err != nil {
return nil, err
}
applyBYOKProbeAuth(req, provider, credKey, credVal)
return req, nil
}
// Anthropic-protocol providers expose a models-list GET too (/v1/models);
// the base already includes the version segment for the registry rows
// (e.g. https://api.anthropic.com/v1). A GET /models is the cheapest probe.
if anthropicBase != "" {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, anthropicBase+"/models", nil)
if err != nil {
return nil, err
}
applyBYOKProbeAuth(req, provider, credKey, credVal)
// Anthropic requires the version header even on GET /models.
req.Header.Set("anthropic-version", "2023-06-01")
return req, nil
}
return nil, fmt.Errorf("provider %q exposes no probeable HTTP base (oauth/CLI-direct arm)", provider.Name)
}
// applyBYOKProbeAuth sets the auth header the upstream expects for this
// credential. Anthropic-protocol API keys use the x-api-key header; everything
// else (openai-protocol bearer keys, anthropic bearer auth tokens) uses
// Authorization: Bearer. The choice is keyed off the provider protocol + the
// auth-env name being exercised, NOT a per-provider hardcode.
func applyBYOKProbeAuth(req *http.Request, provider providers.Provider, credKey, credVal string) {
upperKey := strings.ToUpper(strings.TrimSpace(credKey))
// Anthropic protocol + an x-api-key-style key → x-api-key header. ANTHROPIC_API_KEY
// and KIMI_API_KEY (kimi's claude-code integration uses x-api-key) go here.
// ANTHROPIC_AUTH_TOKEN is a bearer token even on anthropic-protocol providers.
if provider.Protocol == providers.ProtocolAnthropic && upperKey != "ANTHROPIC_AUTH_TOKEN" {
req.Header.Set("x-api-key", credVal)
return
}
req.Header.Set("Authorization", "Bearer "+credVal)
}
// defaultBYOKLivenessProbe is the production probe: a real http.Client whose
// per-call timeout is governed by the request context (set by
// probeBYOKCredentialLiveness). A modest client timeout is a second belt against
// a hung dial.
func defaultBYOKLivenessProbe(req *http.Request) (*http.Response, error) {
client := &http.Client{Timeout: byokProbeTimeout}
return client.Do(req)
}
// formatInvalidBYOKCredentialError builds the user-facing message for a
// create rejected because a present BYOK credential was positively rejected by
// the provider (401/403). Names the provider, the credential env, and the
// observed status so the user knows exactly which secret to rotate.
func formatInvalidBYOKCredentialError(providerName, credKey string, status int) string {
return fmt.Sprintf(
"the BYOK credential %q for provider %q was rejected by the provider (HTTP %d). "+
"The credential is present but not valid (revoked, expired, or wrong scope). "+
"Update %q under Config → Secrets with a working key for %q and retry.",
credKey, providerName, status, credKey, providerName)
}
// makeCreateTimeCredValueResolver returns a credValueForKey function for the
// create path: it resolves a credential's VALUE by name, preferring the
// plaintext value in the create payload's own secrets (workspace scope — no
// decrypt needed) and falling back to decrypting the matching global_secrets
// row. Returns (value, true) only when a non-empty value is found; (_, false)
// otherwise (the liveness gate then skips probing that key — presence already
// passed, and we never fail closed on an unreadable value).
//
// The closure decrypts ONLY the single key being probed (the derived provider's
// auth_env), never the whole store.
func makeCreateTimeCredValueResolver(payloadSecrets map[string]string) func(ctx context.Context, key string) (string, bool) {
return func(ctx context.Context, key string) (string, bool) {
if v, ok := payloadSecrets[key]; ok && strings.TrimSpace(v) != "" {
return v, true
}
var enc []byte
var ver int
err := db.DB.QueryRowContext(ctx,
`SELECT encrypted_value, encryption_version FROM global_secrets WHERE key = $1`,
key).Scan(&enc, &ver)
if err != nil {
return "", false
}
dec, decErr := crypto.DecryptVersioned(enc, ver)
if decErr != nil {
log.Printf("byok_credential_gate: WARN — could not decrypt global secret %q for liveness probe: %v", key, decErr)
return "", false
}
if strings.TrimSpace(string(dec)) == "" {
return "", false
}
return string(dec), true
}
}
@@ -0,0 +1,450 @@
package handlers
// byok_credential_gate_test.go — tests for the fail-closed BYOK credential gate
// at CREATE (GAP A presence + GAP B liveness). All outbound HTTP is mocked via
// an injected byokLivenessProbe — NO real network is touched.
//
// Coverage (the five contract cases from the gap-closure spec):
// 1. BYOK create, missing credential → 422 MISSING_BYOK_CREDENTIAL
// 2. BYOK create, present-but-INVALID (401) → 422 BYOK_CREDENTIAL_INVALID
// 3. BYOK create, transient probe failure → NOT blocked (loud log, allowed)
// 4. BYOK create, valid credential (200) → proceeds (201)
// 5. platform-managed create → gate is a no-op (unaffected)
import (
"bytes"
"context"
"errors"
"net/http"
"net/http/httptest"
"testing"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/providers"
"github.com/DATA-DOG/go-sqlmock"
"github.com/gin-gonic/gin"
)
// stubProbe returns a byokLivenessProbe that always responds with the given
// status (and no error). The response body is empty; the gate only inspects the
// status code.
func stubProbe(status int) byokLivenessProbe {
return func(req *http.Request) (*http.Response, error) {
return &http.Response{
StatusCode: status,
Body: http.NoBody,
Header: make(http.Header),
}, nil
}
}
// transientProbe returns a probe that fails with a transport error (the
// network-blip / timeout class).
func transientProbe(err error) byokLivenessProbe {
return func(req *http.Request) (*http.Response, error) {
return nil, err
}
}
// expectGlobalSecretsPresenceQuery mocks the create-time global_secrets
// presence read (key names only). Pass the configured global key names.
func expectGlobalSecretsPresenceQuery(mock sqlmock.Sqlmock, keys ...string) {
rows := sqlmock.NewRows([]string{"key"})
for _, k := range keys {
rows.AddRow(k)
}
mock.ExpectQuery(`SELECT key FROM global_secrets WHERE octet_length\(encrypted_value\) > 0 LIMIT \$1`).
WillReturnRows(rows)
}
// expectGlobalSecretValueQuery mocks the liveness value-resolver read of a
// single global secret's encrypted value (used when the probed credential lives
// in global_secrets rather than the create payload).
func expectGlobalSecretValueQuery(mock sqlmock.Sqlmock, key, value string) {
enc, _ := crypto.Encrypt([]byte(value))
ver := crypto.CurrentEncryptionVersion()
mock.ExpectQuery(`SELECT encrypted_value, encryption_version FROM global_secrets WHERE key = \$1`).
WithArgs(key).
WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}).AddRow(enc, ver))
}
// ============================================================================
// Unit tests for the gate logic (no HTTP handler) — fast, deterministic.
// ============================================================================
// TestCreateBYOKGate_MissingCredential_RejectsPresence — GAP A: a byok-derived
// (runtime, model) with NO available credential key is rejected with
// MISSING_BYOK_CREDENTIAL before any liveness probe runs.
func TestCreateBYOKGate_MissingCredential_RejectsPresence(t *testing.T) {
h := &WorkspaceHandler{}
probeCalled := false
probe := func(req *http.Request) (*http.Response, error) {
probeCalled = true
return stubProbe(200)(req)
}
// claude-code + kimi-for-coding derives to kimi-coding (BYOK). No cred keys.
res := h.evaluateCreateBYOKCredentialGate(
context.Background(),
"claude-code", "kimi-for-coding",
map[string]struct{}{}, // no available credentials
func(ctx context.Context, key string) (string, bool) { return "", false },
probe,
)
if res.OK {
t.Fatalf("expected gate to REJECT a credential-less byok create, got OK")
}
if res.Code != "MISSING_BYOK_CREDENTIAL" {
t.Errorf("expected code MISSING_BYOK_CREDENTIAL, got %q", res.Code)
}
if res.HTTP != http.StatusUnprocessableEntity {
t.Errorf("expected 422, got %d", res.HTTP)
}
if probeCalled {
t.Errorf("liveness probe must NOT run when presence already fails")
}
}
// TestCreateBYOKGate_InvalidCredential_FailsClosed — GAP B: a PRESENT credential
// that the provider rejects (401) → BYOK_CREDENTIAL_INVALID, fail-closed.
func TestCreateBYOKGate_InvalidCredential_FailsClosed(t *testing.T) {
h := &WorkspaceHandler{}
keys := map[string]struct{}{"KIMI_API_KEY": {}}
res := h.evaluateCreateBYOKCredentialGate(
context.Background(),
"claude-code", "kimi-for-coding",
keys,
func(ctx context.Context, key string) (string, bool) { return "sk-dead-token", true },
stubProbe(http.StatusUnauthorized),
)
if res.OK {
t.Fatalf("expected gate to REJECT a present-but-dead (401) credential, got OK")
}
if res.Code != "BYOK_CREDENTIAL_INVALID" {
t.Errorf("expected code BYOK_CREDENTIAL_INVALID, got %q (msg=%s)", res.Code, res.Message)
}
if res.HTTP != http.StatusUnprocessableEntity {
t.Errorf("expected 422, got %d", res.HTTP)
}
}
// TestCreateBYOKGate_Forbidden_FailsClosed — 403 is also a hard auth failure.
func TestCreateBYOKGate_Forbidden_FailsClosed(t *testing.T) {
h := &WorkspaceHandler{}
keys := map[string]struct{}{"KIMI_API_KEY": {}}
res := h.evaluateCreateBYOKCredentialGate(
context.Background(),
"claude-code", "kimi-for-coding", keys,
func(ctx context.Context, key string) (string, bool) { return "sk-forbidden", true },
stubProbe(http.StatusForbidden),
)
if res.OK || res.Code != "BYOK_CREDENTIAL_INVALID" {
t.Fatalf("expected 403 to fail closed as BYOK_CREDENTIAL_INVALID, got OK=%v code=%q", res.OK, res.Code)
}
}
// TestCreateBYOKGate_Transient_NotBlocked — GAP B crux: a transient failure
// (network error, 5xx, 429, timeout) must NOT block the create. The credential
// validity is UNKNOWN; a flaky upstream cannot wedge every create.
func TestCreateBYOKGate_Transient_NotBlocked(t *testing.T) {
cases := []struct {
name string
probe byokLivenessProbe
}{
{"network_error", transientProbe(errors.New("dial tcp: connection refused"))},
{"timeout", transientProbe(context.DeadlineExceeded)},
{"500", stubProbe(http.StatusInternalServerError)},
{"502", stubProbe(http.StatusBadGateway)},
{"503", stubProbe(http.StatusServiceUnavailable)},
{"429_rate_limited", stubProbe(http.StatusTooManyRequests)},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
h := &WorkspaceHandler{}
keys := map[string]struct{}{"KIMI_API_KEY": {}}
res := h.evaluateCreateBYOKCredentialGate(
context.Background(),
"claude-code", "kimi-for-coding", keys,
func(ctx context.Context, key string) (string, bool) { return "sk-maybe-fine", true },
tc.probe,
)
if !res.OK {
t.Fatalf("transient probe (%s) must NOT block create, got rejected code=%q", tc.name, res.Code)
}
})
}
}
// TestCreateBYOKGate_ValidCredential_Proceeds — GAP B happy path: a present,
// valid (200) credential proceeds.
func TestCreateBYOKGate_ValidCredential_Proceeds(t *testing.T) {
h := &WorkspaceHandler{}
keys := map[string]struct{}{"KIMI_API_KEY": {}}
res := h.evaluateCreateBYOKCredentialGate(
context.Background(),
"claude-code", "kimi-for-coding", keys,
func(ctx context.Context, key string) (string, bool) { return "sk-live", true },
stubProbe(http.StatusOK),
)
if !res.OK {
t.Fatalf("valid (200) credential must proceed, got rejected code=%q msg=%s", res.Code, res.Message)
}
}
// TestCreateBYOKGate_PlatformManaged_Unaffected — GAP scope: a platform-derived
// model is NOT byok, so the gate is a no-op even with no credential and even
// with a probe that would 401. The probe must never run.
func TestCreateBYOKGate_PlatformManaged_Unaffected(t *testing.T) {
h := &WorkspaceHandler{}
probeCalled := false
probe := func(req *http.Request) (*http.Response, error) {
probeCalled = true
return stubProbe(http.StatusUnauthorized)(req)
}
// claude-code + a platform-namespaced model derives to the closed `platform`
// provider → platform_managed → gate no-op.
res := h.evaluateCreateBYOKCredentialGate(
context.Background(),
"claude-code", "anthropic/claude-opus-4-7",
map[string]struct{}{}, // no creds — irrelevant for platform
func(ctx context.Context, key string) (string, bool) { return "", false },
probe,
)
if !res.OK {
t.Fatalf("platform-managed create must be unaffected by the BYOK gate, got code=%q", res.Code)
}
if probeCalled {
t.Errorf("liveness probe must NOT run for a platform-managed model")
}
}
// TestCreateBYOKGate_UnderivableModel_DefaultsClosedToPlatform — a model that
// does not derive (the model-registry gate owns rejecting it) is treated as
// non-byok (default-closed to platform), NOT failed as a credential-less byok.
func TestCreateBYOKGate_UnderivableModel_DefaultsClosedToPlatform(t *testing.T) {
h := &WorkspaceHandler{}
res := h.evaluateCreateBYOKCredentialGate(
context.Background(),
"claude-code", "totally-made-up-xyz",
map[string]struct{}{},
func(ctx context.Context, key string) (string, bool) { return "", false },
stubProbe(http.StatusOK),
)
if !res.OK {
t.Fatalf("an underivable model must default-closed to platform (non-byok), not fail as MISSING_BYOK_CREDENTIAL; got code=%q", res.Code)
}
}
// TestCreateBYOKGate_NonAuth4xx_TreatedOK — a 400/404 means auth succeeded but
// the endpoint disliked the request shape (e.g. a provider with no models-list
// route). That is NOT a dead credential → proceed.
func TestCreateBYOKGate_NonAuth4xx_TreatedOK(t *testing.T) {
h := &WorkspaceHandler{}
keys := map[string]struct{}{"KIMI_API_KEY": {}}
for _, status := range []int{http.StatusBadRequest, http.StatusNotFound} {
res := h.evaluateCreateBYOKCredentialGate(
context.Background(),
"claude-code", "kimi-for-coding", keys,
func(ctx context.Context, key string) (string, bool) { return "sk-live", true },
stubProbe(status),
)
if !res.OK {
t.Errorf("non-auth 4xx (%d) means auth succeeded; must proceed, got code=%q", status, res.Code)
}
}
}
// TestProbeAuthHeader_AnthropicVsBearer pins the provider-generic auth-header
// selection: anthropic-protocol x-api-key keys use x-api-key; bearer keys use
// Authorization: Bearer. No per-provider URL hardcoding.
func TestProbeAuthHeader_AnthropicVsBearer(t *testing.T) {
anthropicProvider := providers.Provider{
Name: "kimi-coding",
Protocol: providers.ProtocolAnthropic,
BaseURLTemplate: "https://api.kimi.com/coding/v1",
AuthEnv: []string{"KIMI_API_KEY"},
}
req, err := buildBYOKLivenessRequest(context.Background(), anthropicProvider, "KIMI_API_KEY", "sk-kimi")
if err != nil {
t.Fatalf("buildBYOKLivenessRequest: %v", err)
}
if got := req.Header.Get("x-api-key"); got != "sk-kimi" {
t.Errorf("anthropic-protocol KIMI_API_KEY should set x-api-key, got %q", got)
}
if req.Header.Get("Authorization") != "" {
t.Errorf("anthropic x-api-key key must not also set Authorization, got %q", req.Header.Get("Authorization"))
}
if req.Method != http.MethodGet || req.URL.String() != "https://api.kimi.com/coding/v1/models" {
t.Errorf("expected GET .../models, got %s %s", req.Method, req.URL.String())
}
openaiProvider := providers.Provider{
Name: "openai-api",
Protocol: providers.ProtocolOpenAI,
BaseURLTemplate: "https://api.openai.com/v1",
AuthEnv: []string{"OPENAI_API_KEY"},
}
req2, err := buildBYOKLivenessRequest(context.Background(), openaiProvider, "OPENAI_API_KEY", "sk-openai")
if err != nil {
t.Fatalf("buildBYOKLivenessRequest openai: %v", err)
}
if got := req2.Header.Get("Authorization"); got != "Bearer sk-openai" {
t.Errorf("openai-protocol OPENAI_API_KEY should set Authorization: Bearer, got %q", got)
}
}
// TestProbeRequest_NoBaseURL_Errors — an OAuth arm with null base URLs has no
// probeable HTTP surface; buildBYOKLivenessRequest errors and the gate treats it
// as transient/allow.
func TestProbeRequest_NoBaseURL_Errors(t *testing.T) {
oauthArm := providers.Provider{
Name: "anthropic-oauth",
Protocol: providers.ProtocolAnthropic,
AuthEnv: []string{"CLAUDE_CODE_OAUTH_TOKEN"},
// base_url_template + base_url_anthropic both empty (null in yaml).
}
_, err := buildBYOKLivenessRequest(context.Background(), oauthArm, "CLAUDE_CODE_OAUTH_TOKEN", "tok")
if err == nil {
t.Fatalf("expected an error for a provider with no probeable HTTP base")
}
}
// ============================================================================
// End-to-end through the Create HTTP handler — proves the wiring + 4xx body.
// ============================================================================
// TestWorkspaceCreate_BYOK_MissingCredential_Rejected422 — GAP A end-to-end:
// a credential-less byok create is rejected synchronously (no DB insert).
func TestWorkspaceCreate_BYOK_MissingCredential_Rejected422(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
// Probe should never run (presence fails first); stub it to 200 anyway.
handler.byokLivenessProbeOverride = stubProbe(http.StatusOK)
// Only the global_secrets presence query is expected (returns NO keys) —
// the 422 fires before BeginTx, so any INSERT would fail ExpectationsWereMet.
expectGlobalSecretsPresenceQuery(mock /* no keys */)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"name":"Kimi BYOK","runtime":"claude-code","model":"kimi-for-coding"}`
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.Create(c)
if w.Code != http.StatusUnprocessableEntity {
t.Fatalf("expected 422 MISSING_BYOK_CREDENTIAL, got %d: %s", w.Code, w.Body.String())
}
if !bytes.Contains(w.Body.Bytes(), []byte(`"code":"MISSING_BYOK_CREDENTIAL"`)) {
t.Errorf("expected code MISSING_BYOK_CREDENTIAL in body, got %s", w.Body.String())
}
if !bytes.Contains(w.Body.Bytes(), []byte(`"provider":"kimi-coding"`)) {
t.Errorf("expected derived provider kimi-coding echoed in body, got %s", w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unexpected DB activity on synchronous reject: %v", err)
}
}
// TestWorkspaceCreate_BYOK_InvalidCredential_Rejected422 — GAP B end-to-end:
// a present-but-dead (401) credential supplied in the payload is rejected
// synchronously with BYOK_CREDENTIAL_INVALID (no DB insert).
func TestWorkspaceCreate_BYOK_InvalidCredential_Rejected422(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
handler.byokLivenessProbeOverride = stubProbe(http.StatusUnauthorized)
// Presence is satisfied by the PAYLOAD secret (no global keys needed); the
// value resolver reads the payload directly (no global_secrets value query).
expectGlobalSecretsPresenceQuery(mock /* no global keys */)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"name":"Kimi Dead","runtime":"claude-code","model":"kimi-for-coding","secrets":{"KIMI_API_KEY":"sk-revoked"}}`
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.Create(c)
if w.Code != http.StatusUnprocessableEntity {
t.Fatalf("expected 422 BYOK_CREDENTIAL_INVALID, got %d: %s", w.Code, w.Body.String())
}
if !bytes.Contains(w.Body.Bytes(), []byte(`"code":"BYOK_CREDENTIAL_INVALID"`)) {
t.Errorf("expected code BYOK_CREDENTIAL_INVALID in body, got %s", w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unexpected DB activity on synchronous reject: %v", err)
}
}
// TestWorkspaceCreate_BYOK_TransientProbe_Proceeds — GAP B end-to-end: a
// transient probe failure does NOT block; the create proceeds to 201. The
// credential is supplied via a GLOBAL secret (so there is no in-tx
// workspace_secrets write and the create flow matches the registry-proceed
// tests). Presence passes via the global key; the value resolver decrypts the
// global secret value to feed the probe, which then errors transiently.
func TestWorkspaceCreate_BYOK_TransientProbe_Proceeds(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
waitForHandlerAsyncBeforeDBCleanup(t, handler)
handler.byokLivenessProbeOverride = transientProbe(errors.New("dial tcp: i/o timeout"))
// Presence: global_secrets carries KIMI_API_KEY.
expectGlobalSecretsPresenceQuery(mock, "KIMI_API_KEY")
// Liveness value resolver: decrypt the global KIMI_API_KEY value.
expectGlobalSecretValueQuery(mock, "KIMI_API_KEY", "sk-maybe-fine")
// Create proceeds (transient = allow). No payload secrets → no in-tx
// workspace_secrets write; the create flow matches the registry-proceed path.
mock.ExpectBegin()
mock.ExpectExec("INSERT INTO workspaces").WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectCommit()
mock.ExpectExec("INSERT INTO canvas_layouts").WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectExec("INSERT INTO structure_events").WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectExec("INSERT INTO workspace_auth_tokens").WillReturnResult(sqlmock.NewResult(0, 1))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"name":"Kimi Transient","runtime":"claude-code","model":"kimi-for-coding"}`
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.Create(c)
if w.Code != http.StatusCreated {
t.Fatalf("transient probe must NOT block; expected 201, got %d: %s", w.Code, w.Body.String())
}
}
// TestWorkspaceCreate_BYOK_ValidGlobalCredential_Proceeds — GAP B happy path
// e2e: a valid (200) global credential proceeds to 201.
func TestWorkspaceCreate_BYOK_ValidGlobalCredential_Proceeds(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
waitForHandlerAsyncBeforeDBCleanup(t, handler)
handler.byokLivenessProbeOverride = stubProbe(http.StatusOK)
expectGlobalSecretsPresenceQuery(mock, "KIMI_API_KEY")
expectGlobalSecretValueQuery(mock, "KIMI_API_KEY", "sk-live")
mock.ExpectBegin()
mock.ExpectExec("INSERT INTO workspaces").WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectCommit()
mock.ExpectExec("INSERT INTO canvas_layouts").WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectExec("INSERT INTO structure_events").WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectExec("INSERT INTO workspace_auth_tokens").WillReturnResult(sqlmock.NewResult(0, 1))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"name":"Kimi Live","runtime":"claude-code","model":"kimi-for-coding"}`
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.Create(c)
if w.Code != http.StatusCreated {
t.Fatalf("valid credential must proceed; expected 201, got %d: %s", w.Code, w.Body.String())
}
}
@@ -48,7 +48,8 @@ func TestWorkspaceCreate_WithParentID(t *testing.T) {
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"name":"Child Agent","model":"anthropic:claude-opus-4-7","parent_id":"parent-ws-123"}`
// Platform-managed model id → BYOK gate no-op (this test asserts parent_id).
body := `{"name":"Child Agent","model":"anthropic/claude-opus-4-7","parent_id":"parent-ws-123"}`
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
@@ -86,7 +87,9 @@ func TestWorkspaceCreate_ExplicitClaudeCodeRuntime(t *testing.T) {
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"name":"CC Agent","tier":2,"runtime":"claude-code","model":"sonnet","canvas":{"x":10,"y":20}}`
// Platform-managed model id → BYOK gate no-op (this test asserts the
// claude-code runtime label, not billing).
body := `{"name":"CC Agent","tier":2,"runtime":"claude-code","model":"anthropic/claude-sonnet-4-6","canvas":{"x":10,"y":20}}`
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
@@ -309,7 +312,9 @@ func TestWorkspaceCreate_MaxConcurrentTasksOverride(t *testing.T) {
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"name":"Leader Agent","runtime":"claude-code","model":"sonnet","max_concurrent_tasks":3}`
// Platform-managed model id → BYOK gate no-op (this test asserts
// max_concurrent_tasks).
body := `{"name":"Leader Agent","runtime":"claude-code","model":"anthropic/claude-sonnet-4-6","max_concurrent_tasks":3}`
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
@@ -398,7 +398,10 @@ func TestWorkspaceCreate(t *testing.T) {
// the bare-defaults path (no template, no runtime → claude-code), so
// the body must declare an explicit model. Using a claude-code-compatible
// id; the test doesn't exercise model semantics beyond presence.
body := `{"name":"Test Agent","model":"anthropic:claude-opus-4-7","canvas":{"x":100,"y":200}}`
// Platform-managed model id (derives to the closed `platform` provider) so
// the fail-closed BYOK create gate is a no-op — this test exercises the
// generic create flow, not billing/credential semantics.
body := `{"name":"Test Agent","model":"anthropic/claude-opus-4-7","canvas":{"x":100,"y":200}}`
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
@@ -466,7 +469,9 @@ func TestWorkspaceCreate_ReturnsAuthToken_201(t *testing.T) {
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"name":"Token Holder","model":"anthropic:claude-opus-4-7"}`
// Platform-managed model id → BYOK gate no-op (this test asserts the auth
// token, not billing).
body := `{"name":"Token Holder","model":"anthropic/claude-opus-4-7"}`
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
@@ -89,6 +89,21 @@ type WorkspaceHandler struct {
// for async DB users (restart, provision) before asserting results.
// Matches the pattern from main commit 1c3b4ff3.
asyncWG sync.WaitGroup
// byokLivenessProbeOverride lets tests inject a mock outbound-HTTP probe
// for the create-time BYOK credential LIVENESS gate (GAP B) so no real
// network is touched in tests. nil in production → effectiveBYOKLivenessProbe
// returns defaultBYOKLivenessProbe (a real http.Client). Set it to a stub in
// tests, or to a sentinel that disables probing.
byokLivenessProbeOverride byokLivenessProbe
}
// effectiveBYOKLivenessProbe returns the probe the create-time liveness gate
// should use: the test override when set, else the production http.Client probe.
func (h *WorkspaceHandler) effectiveBYOKLivenessProbe() byokLivenessProbe {
if h.byokLivenessProbeOverride != nil {
return h.byokLivenessProbeOverride
}
return defaultBYOKLivenessProbe
}
// seedMemoryPluginAPI is the slice of the v2 memory plugin client that
@@ -550,6 +565,46 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
}
}
// Fail-closed BYOK credential gate (byok_credential_gate.go) — synchronous,
// BEFORE any DB write so a credential-less or dead-credential BYOK create is
// a 4xx, not a 201-then-late-provision-failure. Placed AFTER the cheap field
// validations (compute / access / dir / URL) so a malformed request is
// rejected on its own merits before the heavier credential check (one
// global_secrets read + a network probe), and immediately BEFORE BeginTx so
// the rejection path never touches the DB transaction:
//
// GAP A — PRESENCE: resolved mode == byok but no usable LLM credential at
// any scope (payload secrets + configured global_secrets) → 422
// MISSING_BYOK_CREDENTIAL. Same presence rule the provision-time check
// keys off (anyBYOKCredentialKeyPresent), so create and provision agree.
// GAP B — LIVENESS: a credential IS present — probe it actually works. A
// positively-rejected credential (401/403) → 422 BYOK_CREDENTIAL_INVALID;
// a transient/network/5xx/timeout never blocks (see the probe's doc).
//
// Skipped for external workspaces (no model/provider contract — the URL is
// the contract, mirroring the MODEL_REQUIRED / registry gates above).
if !isExternal {
availableCredKeys := gatherCreateTimeLLMCredKeys(ctx, payload.Secrets)
gate := h.evaluateCreateBYOKCredentialGate(
ctx,
payload.Runtime,
payload.Model,
availableCredKeys,
makeCreateTimeCredValueResolver(payload.Secrets),
h.effectiveBYOKLivenessProbe(),
)
if !gate.OK {
c.JSON(gate.HTTP, gin.H{
"error": gate.Message,
"code": gate.Code,
"runtime": payload.Runtime,
"model": payload.Model,
"provider": gate.Provider,
})
return
}
}
tx, txErr := db.DB.BeginTx(ctx, nil)
if txErr != nil {
log.Printf("Create workspace: begin tx error: %v", txErr)
@@ -173,7 +173,8 @@ func TestWorkspaceBudget_Create_WithLimit(t *testing.T) {
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"name":"Budgeted Agent","model":"anthropic:claude-opus-4-7","budget_limit":1000}`
// Platform-managed model id → BYOK gate no-op (this test asserts budget_limit).
body := `{"name":"Budgeted Agent","model":"anthropic/claude-opus-4-7","budget_limit":1000}`
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.Create(c)
@@ -1142,13 +1142,21 @@ func stripPlatformManagedLLMBypassEnv(envVars map[string]string) {
// global or workspace scope). Used by the byok fail-closed branch: a byok
// workspace with no LLM credential at ANY scope must be aborted with
// MISSING_BYOK_CREDENTIAL rather than started credential-less.
//
// This is the PROVISION-time presence rule. The CREATE-time presence gate
// (byok_credential_gate.go) reuses the SAME underlying predicate
// (anyBYOKCredentialKeyPresent) over a presence-only key set so the two
// fail-closed checks can never diverge: the rule "any non-empty
// platform-managed LLM bypass key is a usable BYOK credential" lives in exactly
// one place.
func hasAnyPlatformManagedLLMKey(envVars map[string]string) bool {
for key := range platformManagedDirectLLMBypassKeys {
if strings.TrimSpace(envVars[key]) != "" {
return true
present := make(map[string]struct{}, len(envVars))
for key, val := range envVars {
if strings.TrimSpace(val) != "" {
present[key] = struct{}{}
}
}
return false
return anyBYOKCredentialKeyPresent(present)
}
// stripNonMatchingGlobalOriginLLMCreds is the byok-branch provider-matched
@@ -350,7 +350,9 @@ func TestWorkspaceCreate_DBInsertError(t *testing.T) {
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"name":"Failing Agent","model":"anthropic:claude-opus-4-7"}`
// Platform-managed model id → BYOK gate no-op (this test asserts the DB
// insert-error rollback, not billing).
body := `{"name":"Failing Agent","model":"anthropic/claude-opus-4-7"}`
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
@@ -396,7 +398,9 @@ func TestWorkspaceCreate_DefaultsApplied(t *testing.T) {
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"name":"Default Agent","model":"anthropic:claude-opus-4-7"}`
// Platform-managed model id → BYOK gate no-op (this test asserts the
// status/defaults, not billing).
body := `{"name":"Default Agent","model":"anthropic/claude-opus-4-7"}`
c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
@@ -519,6 +523,13 @@ func TestWorkspaceCreate_SecretPersistFails_RollsBack(t *testing.T) {
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
// The create-time BYOK gate runs first: anthropic:claude-opus-4-7 derives to
// anthropic-api (BYOK). Presence is satisfied by the payload OPENAI_API_KEY
// (a recognized LLM bypass key) — the liveness probe is skipped because
// OPENAI_API_KEY is not in anthropic-api's auth_env, so no value is read for
// the derived provider. Only the global presence query precedes BeginTx.
expectGlobalSecretsPresenceQuery(mock /* no global keys */)
mock.ExpectBegin()
mock.ExpectExec("INSERT INTO workspaces").
WillReturnResult(sqlmock.NewResult(0, 1))
@@ -2118,6 +2129,12 @@ func TestWorkspaceCreate_718_P4_RegisteredModelProceeds(t *testing.T) {
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
// claude-opus-4-7 derives to anthropic-api (BYOK). The fail-closed BYOK
// create gate now requires a present + live credential — supply one via a
// global secret and stub the liveness probe to 200 (no real network).
handler.byokLivenessProbeOverride = stubProbe(http.StatusOK)
expectGlobalSecretsPresenceQuery(mock, "ANTHROPIC_API_KEY")
expectGlobalSecretValueQuery(mock, "ANTHROPIC_API_KEY", "sk-ant-live")
mock.ExpectBegin()
mock.ExpectExec("INSERT INTO workspaces").
@@ -2156,6 +2173,11 @@ func TestWorkspaceCreate_718_P4_LegacyColonVocabAccepted(t *testing.T) {
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
// anthropic:claude-opus-4-7 derives to anthropic-api (BYOK) — satisfy the
// fail-closed BYOK create gate with a present + live global credential.
handler.byokLivenessProbeOverride = stubProbe(http.StatusOK)
expectGlobalSecretsPresenceQuery(mock, "ANTHROPIC_API_KEY")
expectGlobalSecretValueQuery(mock, "ANTHROPIC_API_KEY", "sk-ant-live")
mock.ExpectBegin()
mock.ExpectExec("INSERT INTO workspaces").
@@ -2220,6 +2242,13 @@ func TestWorkspaceCreate_188_ExplicitRuntimeNoTemplate_OK(t *testing.T) {
setupTestRedis(t)
broadcaster := newTestBroadcaster()
handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
// codex gpt-5.5 derives to openai-subscription (BYOK, OAuth arm). The OAuth
// arm has null base URLs → the liveness probe is unprobeable → transient/
// allow; presence still requires a credential. Supply CODEX_AUTH_JSON via a
// global secret (presence passes; the probe is skipped as unprobeable).
handler.byokLivenessProbeOverride = stubProbe(http.StatusOK)
expectGlobalSecretsPresenceQuery(mock, "CODEX_AUTH_JSON")
expectGlobalSecretValueQuery(mock, "CODEX_AUTH_JSON", `{"tokens":"x"}`)
mock.ExpectBegin()
mock.ExpectExec("INSERT INTO workspaces").