diff --git a/workspace-server/internal/handlers/byok_credential_gate.go b/workspace-server/internal/handlers/byok_credential_gate.go new file mode 100644 index 00000000..70b04828 --- /dev/null +++ b/workspace-server/internal/handlers/byok_credential_gate.go @@ -0,0 +1,514 @@ +package handlers + +// byok_credential_gate.go — fail-closed BYOK credential checks at the CREATE +// boundary (molecule-core, BYOK billing-model gap-closure). +// +// Two related gaps in the fail-closed BYOK billing model are closed here, both +// in the same credential-validation area: +// +// GAP A — PRESENCE gate at CREATE. The provision-time fail-closed check +// (workspace_provision_shared.go) already aborts a BYOK provision with no +// usable credential, but that fires AFTER a 201 and a goroutine launch — a +// credential-less BYOK create looks successful, then dies late in provision. +// This gate moves the SAME presence rule to the synchronous create path so a +// credential-less BYOK create is a 4xx, not a 201-then-late-failure. It +// reuses anyBYOKCredentialKeyPresent — the exact predicate the provision +// check keys off (via hasAnyPlatformManagedLLMKey) — so the two checks can +// never diverge. +// +// GAP B — LIVENESS preflight. Presence is not validity: a BYOK workspace can +// carry a PRESENT-but-DEAD token (revoked/expired) and still 201 + provision, +// only to wedge at first LLM call. This gate makes the cheapest authenticated +// probe (a models-list GET, else a 1-token no-op completion) against the +// provider's base_url (derived from providers.yaml — the SSOT) and FAILS +// CLOSED on a hard auth failure (401/403). The crux distinction, documented +// on probeBYOKCredentialLiveness: a DEAD token (401/403) MUST block; a +// TRANSIENT failure (network/timeout/5xx) MUST NOT block (a flaky upstream +// cannot be allowed to wedge every create) — it logs loudly and allows. +// +// Both gates are scoped to the resolved billing mode == byok. platform_managed +// (the CP proxy owns the credential) and disabled (no platform-billed LLM) are +// untouched. + +import ( + "context" + "fmt" + "io" + "log" + "net/http" + "strings" + "time" + + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto" + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db" + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/providers" +) + +// anyBYOKCredentialKeyPresent is the SINGLE presence predicate shared by the +// create-time gate (this file) and the provision-time check +// (hasAnyPlatformManagedLLMKey). It reports whether the supplied set of present +// credential env-var NAMES contains at least one platform-managed-shaped LLM +// bypass key — the tenant's own usable BYOK credential. The caller is +// responsible for only including keys whose VALUE is non-empty (the provision +// path filters on TrimSpace != "", the create path includes only non-empty +// payload secrets + configured global_secrets keys). +// +// Keeping the rule here — not duplicated in two call sites — is the whole point: +// "what counts as a usable BYOK credential" is defined once. +func anyBYOKCredentialKeyPresent(presentKeys map[string]struct{}) bool { + for key := range presentKeys { + if _, ok := platformManagedDirectLLMBypassKeys[strings.ToUpper(strings.TrimSpace(key))]; ok { + return true + } + } + return false +} + +// gatherCreateTimeLLMCredKeys collects the set of credential env-var NAMES that +// will be available to the workspace at provision time, as known at CREATE: +// +// - the create payload's own secrets (workspace scope), non-empty values only; +// - the tenant's configured global_secrets keys (operator/global scope). +// +// It returns ONLY non-empty keys, so the set can be passed straight to +// anyBYOKCredentialKeyPresent (presence) and is the right input for the +// auth-env disambiguation DeriveProvider uses. Returns the full key set (not +// just bypass keys) because DeriveProvider's tie-break wants every recognized +// auth-env name; the bypass-key filter is applied by anyBYOKCredentialKeyPresent. +// +// The global_secrets read is presence-only (key names, never values) — values +// are decrypted only by the liveness probe, and only for the resolved provider's +// own auth_env keys. +func gatherCreateTimeLLMCredKeys(ctx context.Context, payloadSecrets map[string]string) map[string]struct{} { + keys := map[string]struct{}{} + for k, v := range payloadSecrets { + if strings.TrimSpace(v) != "" { + keys[k] = struct{}{} + } + } + // Global/operator-scope keys (presence only). Best-effort: a read error + // degrades to "only payload secrets" — the gate then fails CLOSED if the + // payload alone lacks a credential, which is the safe direction (a byok + // create that depends on a global cred we could not enumerate is rejected + // with an actionable error rather than silently provisioned). + globalKeys, err := loadConfiguredGlobalSecretKeys(ctx) + if err != nil { + log.Printf("byok_credential_gate: WARN — could not load global_secrets keys for create-time BYOK check: %v (proceeding with payload secrets only)", err) + return keys + } + for k := range globalKeys { + keys[k] = struct{}{} + } + return keys +} + +// availableAuthEnvNamesFromKeySet returns the subset of presentKeys that are +// recognized provider auth-env names — the DeriveProvider tie-break input — so +// the create-time derive splits oauth-vs-api the same way the provision path +// does. Mirror of availableAuthEnvNames but over a key SET (the create path has +// no env-value map, only presence). +func availableAuthEnvNamesFromKeySet(presentKeys map[string]struct{}) []string { + authSet := authEnvNameSet() + var out []string + for k := range presentKeys { + if _, ok := authSet[k]; ok { + out = append(out, k) + } + } + return out +} + +// createBYOKGateResult is the outcome of the create-time BYOK gate. Code is the +// structured error code ("" on pass); Message is the user-visible reason; HTTP +// is the status to return; Provider names the derived provider when known +// (surfaced in logs + the structured error). When OK is true the create +// proceeds. +type createBYOKGateResult struct { + OK bool + Code string + Message string + HTTP int + Provider string +} + +// passCreateBYOKGate is the proceed sentinel. +func passCreateBYOKGate() createBYOKGateResult { return createBYOKGateResult{OK: true} } + +// evaluateCreateBYOKCredentialGate runs GAP A (presence) and GAP B (liveness) +// at the create boundary for a (runtime, model) pair, given the credential keys +// that will be available at provision time and a probe function (injected so +// tests mock the network — no real outbound in unit tests). +// +// Flow: +// 1. DERIVE the provider from (runtime, model) via the registry SSOT and key +// the byok-vs-platform decision off IsPlatform(derived) — the SAME rule +// ResolveLLMBillingModeDerived's precedence-2 uses. We derive DIRECTLY here +// (not via ResolveLLMBillingModeDerived) because at create there is NO +// workspace row yet: that resolver short-circuits a "" workspaceID to the +// pure platform-default (its pre-provision contract) and would never see +// byok. There is also no operator-override column to honor pre-row, so a +// direct derive is both correct and sufficient. A derive failure +// (unregistered/ambiguous) → default-closed to platform (NOT byok), exactly +// as the resolver does — never fail a credential-less create as byok on an +// underivable model. (The model-registry gate upstream already rejects +// unregistered models, so this is belt-and-suspenders.) +// 2. GAP A: if byok and NO usable credential key is present → reject +// MISSING_BYOK_CREDENTIAL (4xx, synchronous). +// 3. GAP B: a credential IS present — probe its LIVENESS. Fail CLOSED only on a +// hard auth failure (401/403); a transient/network/5xx/timeout logs loudly +// and ALLOWS (see probeBYOKCredentialLiveness for the crux rationale). +// +// External workspaces are handled by the caller (they have no model/provider +// contract) — this function assumes a spawned-runtime create. +func (h *WorkspaceHandler) evaluateCreateBYOKCredentialGate( + ctx context.Context, + runtime, model string, + availableCredKeys map[string]struct{}, + credValueForKey func(ctx context.Context, key string) (string, bool), + probe byokLivenessProbe, +) createBYOKGateResult { + authEnv := availableAuthEnvNamesFromKeySet(availableCredKeys) + + manifest, mErr := providerRegistry() + if mErr != nil || manifest == nil { + // Registry unavailable (a build-time defect the verify-providers-gen / + // sync gates catch). We cannot derive the mode — default-closed to + // platform (not byok), so the gate enforces nothing. Mirrors the + // resolver's own registry-load fail-direction. + log.Printf("byok_credential_gate: WARN — provider registry unavailable, skipping BYOK create gate (runtime=%q model=%q): %v", runtime, model, mErr) + return passCreateBYOKGate() + } + provider, dErr := manifest.DeriveProvider(runtime, model, authEnv) + if dErr != nil { + // No model / unknown runtime / unregistered / ambiguous → default-closed + // to platform (NOT byok). A derive miss never fails a create as byok. + log.Printf("byok_credential_gate: provider underivable for create (runtime=%q model=%q): %v — treating as non-byok (default-closed to platform)", runtime, model, dErr) + return passCreateBYOKGate() + } + if provider.IsPlatform() { + // platform_managed — the CP proxy owns the credential; nothing for this + // gate to enforce. (disabled is never DERIVED — it is only an explicit + // operator override, which does not exist pre-row.) + return passCreateBYOKGate() + } + + // A non-platform (BYOK) provider was derived. Enforce the BYOK contract. + + // GAP A — PRESENCE. Reuse the EXACT predicate the provision-time check keys + // off (anyBYOKCredentialKeyPresent) so create and provision can never + // disagree on "what is a usable BYOK credential." + if !anyBYOKCredentialKeyPresent(availableCredKeys) { + msg := formatMissingBYOKCredentialError(LLMBillingModeBYOK) + log.Printf("Create: FAIL-CLOSED (MISSING_BYOK_CREDENTIAL) — runtime=%q model=%q resolved=byok provider=%q but no usable LLM credential present at any scope", + runtime, model, provider.Name) + return createBYOKGateResult{ + OK: false, + Code: "MISSING_BYOK_CREDENTIAL", + Message: msg, + HTTP: http.StatusUnprocessableEntity, + Provider: provider.Name, + } + } + + // GAP B — LIVENESS. A credential IS present; probe that it actually works. + // Disabled when no probe is wired (provision-only deployments / probe off). + if probe == nil { + return passCreateBYOKGate() + } + return runBYOKLivenessGate(ctx, provider, availableCredKeys, credValueForKey, probe) +} + +// runBYOKLivenessGate selects the credential VALUE for the derived provider's +// auth_env and runs the liveness probe. Fails CLOSED only on a hard auth +// failure; transient failures (and an unreadable credential value) pass. +func runBYOKLivenessGate( + ctx context.Context, + provider providers.Provider, + availableCredKeys map[string]struct{}, + credValueForKey func(ctx context.Context, key string) (string, bool), + probe byokLivenessProbe, +) createBYOKGateResult { + if credValueForKey == nil { + // No value resolver wired — presence already passed; nothing to probe. + return passCreateBYOKGate() + } + // Pick the FIRST of the provider's auth_env keys that is both present and + // readable. We probe a single credential — the one the provider would + // actually authenticate with — not every key. + credKey, credVal := "", "" + for _, want := range provider.AuthEnv { + if _, present := availableCredKeys[want]; !present { + continue + } + if v, ok := credValueForKey(ctx, want); ok && strings.TrimSpace(v) != "" { + credKey, credVal = want, v + break + } + } + if credKey == "" { + // The present key(s) for this provider could not be read back (decrypt + // miss / value lives only in operator store we don't decrypt here). + // Presence already passed; we cannot probe a value we don't have, so + // allow — the provision-time path will still re-check presence, and a + // truly dead token surfaces there. Do NOT fail closed on an unreadable + // value (that would block legitimate global-scope creds). + log.Printf("byok_credential_gate: liveness — provider=%q has a present auth key but no readable value at create; skipping probe (presence already satisfied)", provider.Name) + return passCreateBYOKGate() + } + + outcome := probeBYOKCredentialLiveness(ctx, provider, credKey, credVal, probe) + switch outcome.kind { + case livenessDead: + msg := formatInvalidBYOKCredentialError(provider.Name, credKey, outcome.status) + log.Printf("Create: FAIL-CLOSED (BYOK_CREDENTIAL_INVALID) — provider=%q auth_env=%q probe returned HTTP %d (dead/revoked credential)", + provider.Name, credKey, outcome.status) + return createBYOKGateResult{ + OK: false, + Code: "BYOK_CREDENTIAL_INVALID", + Message: msg, + HTTP: http.StatusUnprocessableEntity, + Provider: provider.Name, + } + case livenessTransient: + // LOUD log, but ALLOW — a flaky upstream / network blip / 5xx must not + // block every create. This is the documented transient-vs-dead split. + log.Printf("Create: BYOK liveness probe TRANSIENT for provider=%q auth_env=%q (status=%d err=%v) — NOT blocking (transient != dead); provision-time re-check remains", + provider.Name, credKey, outcome.status, outcome.err) + return passCreateBYOKGate() + default: // livenessOK + log.Printf("Create: BYOK liveness probe OK for provider=%q auth_env=%q (status=%d)", provider.Name, credKey, outcome.status) + return passCreateBYOKGate() + } +} + +// ---------------------------------------------------------------------------- +// GAP B — liveness probe +// ---------------------------------------------------------------------------- + +// byokLivenessProbe is the injected outbound-HTTP seam: given a prepared +// request it returns the response (or a transport error). Production wires the +// real http.Client (see defaultBYOKLivenessProbe); tests inject an httptest / +// stub so NO real network is touched. The probe MUST honor the request context +// deadline. +type byokLivenessProbe func(req *http.Request) (*http.Response, error) + +// byokProbeTimeout is the hard cap on a single liveness probe. Kept tight (the +// probe is on the synchronous create path) — long enough for a healthy +// models-list GET, short enough that a hung upstream degrades to a TRANSIENT +// (allow) rather than wedging the create. +const byokProbeTimeout = 6 * time.Second + +type livenessKind int + +const ( + livenessOK livenessKind = iota + // livenessDead — a HARD auth failure (401/403). The credential is present + // but rejected: revoked, expired, or wrong scope. MUST fail closed. + livenessDead + // livenessTransient — network error, timeout, 5xx, or any non-auth status. + // The credential MIGHT be fine; the upstream is unreachable/unhappy right + // now. MUST NOT fail closed (a flaky network cannot block all creates). + livenessTransient +) + +type livenessOutcome struct { + kind livenessKind + status int // HTTP status observed (0 when no response — transport error/timeout) + err error // transport/timeout error when kind==livenessTransient with no response +} + +// probeBYOKCredentialLiveness makes ONE cheapest-possible authenticated call to +// the provider's upstream and classifies the result. +// +// ============================ THE CRUX ============================ +// The auth-failure vs transient-failure distinction is the entire point of this +// probe, and getting the FAIL DIRECTION right is security-vs-availability +// critical: +// +// - 401 / 403 → livenessDead → FAIL CLOSED. The upstream positively rejected +// the credential. A present-but-dead token is exactly the gap GAP B closes +// (a revoked key that would 201 + provision + wedge at first call). Blocking +// here gives the user synchronous, actionable feedback. +// +// - network error / DNS / connection refused / TLS / TIMEOUT → livenessTransient +// → ALLOW (loud log). The credential's validity is UNKNOWN — the upstream is +// unreachable. Failing closed here would let a single flaky network minute, +// or an upstream we cannot reach from this VPC, block EVERY byok create. The +// credential may be perfectly valid; we must not punish it for the network. +// +// - 5xx (500/502/503/504) → livenessTransient → ALLOW. The upstream is having +// a bad time; this says NOTHING about the credential. Same reasoning as a +// network error. +// +// - 429 (rate limited) → livenessTransient → ALLOW. A rate-limit means the key +// is being ACCEPTED (you cannot be rate-limited without authenticating) — +// the opposite of dead. Never block on it. +// +// - 2xx (and any other non-auth, non-5xx 4xx like 400/404) → livenessOK. A 2xx +// is an unambiguous pass. A 400/404 means the endpoint authenticated us and +// then disliked the request shape (e.g. a provider without a models-list +// route) — auth succeeded, so it is NOT a dead credential. Treat as OK +// rather than transient so a benign endpoint-shape mismatch never blocks. +// +// A DEAD token (401/403) MUST block; a flaky network MUST NOT. That asymmetry is +// the contract. +// ================================================================= +func probeBYOKCredentialLiveness(parent context.Context, provider providers.Provider, credKey, credVal string, probe byokLivenessProbe) livenessOutcome { + ctx, cancel := context.WithTimeout(parent, byokProbeTimeout) + defer cancel() + + req, err := buildBYOKLivenessRequest(ctx, provider, credKey, credVal) + if err != nil { + // We could not even construct a probe target (no base_url for this + // provider — e.g. an OAuth arm the CLI dials directly, no HTTP surface we + // can reach). Treat as transient/allow: we have NO evidence the + // credential is dead, and presence already passed. Never fail closed on + // "we don't know how to probe this provider." + return livenessOutcome{kind: livenessTransient, err: err} + } + + resp, err := probe(req) + if err != nil { + // Transport error or context deadline (timeout). Validity unknown → allow. + return livenessOutcome{kind: livenessTransient, err: err} + } + defer func() { + // Drain + close so the connection can be reused and we don't leak. + _, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 4<<10)) + _ = resp.Body.Close() + }() + + switch { + case resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden: + // 401 / 403 — the upstream positively rejected the credential. DEAD. + return livenessOutcome{kind: livenessDead, status: resp.StatusCode} + case resp.StatusCode == http.StatusTooManyRequests: + // 429 — rate limited means the key authenticated. NOT dead. + return livenessOutcome{kind: livenessTransient, status: resp.StatusCode} + case resp.StatusCode >= 500: + // 5xx — upstream trouble, says nothing about the credential. + return livenessOutcome{kind: livenessTransient, status: resp.StatusCode} + default: + // 2xx, or a non-auth 4xx (400/404 — auth succeeded, request shape + // disliked). Either way the credential was NOT rejected. OK. + return livenessOutcome{kind: livenessOK, status: resp.StatusCode} + } +} + +// buildBYOKLivenessRequest constructs the cheapest authenticated probe request +// from the provider's providers.yaml row — provider-generic, no per-provider +// URL hardcoding beyond what the SSOT supplies. +// +// Endpoint: prefer a models-list GET on the OpenAI-protocol base +// (base_url_template + "/models"); for anthropic-protocol providers without an +// openai base, fall back to a minimal 1-token messages POST on +// base_url_anthropic. Auth header is chosen from the protocol + the auth-env +// key the probe is exercising (Authorization: Bearer for openai/bearer keys, +// x-api-key for anthropic API keys). +// +// Returns an error when the provider exposes NO probeable HTTP base (an OAuth +// arm whose base_url_* are null — the CLI dials the vendor directly). The caller +// treats that as transient/allow (we cannot probe what has no endpoint). +func buildBYOKLivenessRequest(ctx context.Context, provider providers.Provider, credKey, credVal string) (*http.Request, error) { + openaiBase := strings.TrimRight(strings.TrimSpace(provider.BaseURLTemplate), "/") + anthropicBase := strings.TrimRight(strings.TrimSpace(provider.BaseURLAnthropic), "/") + + // Prefer the OpenAI-protocol models-list GET (cheapest, no body, no tokens). + if openaiBase != "" { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, openaiBase+"/models", nil) + if err != nil { + return nil, err + } + applyBYOKProbeAuth(req, provider, credKey, credVal) + return req, nil + } + + // Anthropic-protocol providers expose a models-list GET too (/v1/models); + // the base already includes the version segment for the registry rows + // (e.g. https://api.anthropic.com/v1). A GET /models is the cheapest probe. + if anthropicBase != "" { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, anthropicBase+"/models", nil) + if err != nil { + return nil, err + } + applyBYOKProbeAuth(req, provider, credKey, credVal) + // Anthropic requires the version header even on GET /models. + req.Header.Set("anthropic-version", "2023-06-01") + return req, nil + } + + return nil, fmt.Errorf("provider %q exposes no probeable HTTP base (oauth/CLI-direct arm)", provider.Name) +} + +// applyBYOKProbeAuth sets the auth header the upstream expects for this +// credential. Anthropic-protocol API keys use the x-api-key header; everything +// else (openai-protocol bearer keys, anthropic bearer auth tokens) uses +// Authorization: Bearer. The choice is keyed off the provider protocol + the +// auth-env name being exercised, NOT a per-provider hardcode. +func applyBYOKProbeAuth(req *http.Request, provider providers.Provider, credKey, credVal string) { + upperKey := strings.ToUpper(strings.TrimSpace(credKey)) + // Anthropic protocol + an x-api-key-style key → x-api-key header. ANTHROPIC_API_KEY + // and KIMI_API_KEY (kimi's claude-code integration uses x-api-key) go here. + // ANTHROPIC_AUTH_TOKEN is a bearer token even on anthropic-protocol providers. + if provider.Protocol == providers.ProtocolAnthropic && upperKey != "ANTHROPIC_AUTH_TOKEN" { + req.Header.Set("x-api-key", credVal) + return + } + req.Header.Set("Authorization", "Bearer "+credVal) +} + +// defaultBYOKLivenessProbe is the production probe: a real http.Client whose +// per-call timeout is governed by the request context (set by +// probeBYOKCredentialLiveness). A modest client timeout is a second belt against +// a hung dial. +func defaultBYOKLivenessProbe(req *http.Request) (*http.Response, error) { + client := &http.Client{Timeout: byokProbeTimeout} + return client.Do(req) +} + +// formatInvalidBYOKCredentialError builds the user-facing message for a +// create rejected because a present BYOK credential was positively rejected by +// the provider (401/403). Names the provider, the credential env, and the +// observed status so the user knows exactly which secret to rotate. +func formatInvalidBYOKCredentialError(providerName, credKey string, status int) string { + return fmt.Sprintf( + "the BYOK credential %q for provider %q was rejected by the provider (HTTP %d). "+ + "The credential is present but not valid (revoked, expired, or wrong scope). "+ + "Update %q under Config → Secrets with a working key for %q and retry.", + credKey, providerName, status, credKey, providerName) +} + +// makeCreateTimeCredValueResolver returns a credValueForKey function for the +// create path: it resolves a credential's VALUE by name, preferring the +// plaintext value in the create payload's own secrets (workspace scope — no +// decrypt needed) and falling back to decrypting the matching global_secrets +// row. Returns (value, true) only when a non-empty value is found; (_, false) +// otherwise (the liveness gate then skips probing that key — presence already +// passed, and we never fail closed on an unreadable value). +// +// The closure decrypts ONLY the single key being probed (the derived provider's +// auth_env), never the whole store. +func makeCreateTimeCredValueResolver(payloadSecrets map[string]string) func(ctx context.Context, key string) (string, bool) { + return func(ctx context.Context, key string) (string, bool) { + if v, ok := payloadSecrets[key]; ok && strings.TrimSpace(v) != "" { + return v, true + } + var enc []byte + var ver int + err := db.DB.QueryRowContext(ctx, + `SELECT encrypted_value, encryption_version FROM global_secrets WHERE key = $1`, + key).Scan(&enc, &ver) + if err != nil { + return "", false + } + dec, decErr := crypto.DecryptVersioned(enc, ver) + if decErr != nil { + log.Printf("byok_credential_gate: WARN — could not decrypt global secret %q for liveness probe: %v", key, decErr) + return "", false + } + if strings.TrimSpace(string(dec)) == "" { + return "", false + } + return string(dec), true + } +} diff --git a/workspace-server/internal/handlers/byok_credential_gate_test.go b/workspace-server/internal/handlers/byok_credential_gate_test.go new file mode 100644 index 00000000..c34d6c65 --- /dev/null +++ b/workspace-server/internal/handlers/byok_credential_gate_test.go @@ -0,0 +1,450 @@ +package handlers + +// byok_credential_gate_test.go — tests for the fail-closed BYOK credential gate +// at CREATE (GAP A presence + GAP B liveness). All outbound HTTP is mocked via +// an injected byokLivenessProbe — NO real network is touched. +// +// Coverage (the five contract cases from the gap-closure spec): +// 1. BYOK create, missing credential → 422 MISSING_BYOK_CREDENTIAL +// 2. BYOK create, present-but-INVALID (401) → 422 BYOK_CREDENTIAL_INVALID +// 3. BYOK create, transient probe failure → NOT blocked (loud log, allowed) +// 4. BYOK create, valid credential (200) → proceeds (201) +// 5. platform-managed create → gate is a no-op (unaffected) + +import ( + "bytes" + "context" + "errors" + "net/http" + "net/http/httptest" + "testing" + + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/crypto" + "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/providers" + "github.com/DATA-DOG/go-sqlmock" + "github.com/gin-gonic/gin" +) + +// stubProbe returns a byokLivenessProbe that always responds with the given +// status (and no error). The response body is empty; the gate only inspects the +// status code. +func stubProbe(status int) byokLivenessProbe { + return func(req *http.Request) (*http.Response, error) { + return &http.Response{ + StatusCode: status, + Body: http.NoBody, + Header: make(http.Header), + }, nil + } +} + +// transientProbe returns a probe that fails with a transport error (the +// network-blip / timeout class). +func transientProbe(err error) byokLivenessProbe { + return func(req *http.Request) (*http.Response, error) { + return nil, err + } +} + +// expectGlobalSecretsPresenceQuery mocks the create-time global_secrets +// presence read (key names only). Pass the configured global key names. +func expectGlobalSecretsPresenceQuery(mock sqlmock.Sqlmock, keys ...string) { + rows := sqlmock.NewRows([]string{"key"}) + for _, k := range keys { + rows.AddRow(k) + } + mock.ExpectQuery(`SELECT key FROM global_secrets WHERE octet_length\(encrypted_value\) > 0 LIMIT \$1`). + WillReturnRows(rows) +} + +// expectGlobalSecretValueQuery mocks the liveness value-resolver read of a +// single global secret's encrypted value (used when the probed credential lives +// in global_secrets rather than the create payload). +func expectGlobalSecretValueQuery(mock sqlmock.Sqlmock, key, value string) { + enc, _ := crypto.Encrypt([]byte(value)) + ver := crypto.CurrentEncryptionVersion() + mock.ExpectQuery(`SELECT encrypted_value, encryption_version FROM global_secrets WHERE key = \$1`). + WithArgs(key). + WillReturnRows(sqlmock.NewRows([]string{"encrypted_value", "encryption_version"}).AddRow(enc, ver)) +} + +// ============================================================================ +// Unit tests for the gate logic (no HTTP handler) — fast, deterministic. +// ============================================================================ + +// TestCreateBYOKGate_MissingCredential_RejectsPresence — GAP A: a byok-derived +// (runtime, model) with NO available credential key is rejected with +// MISSING_BYOK_CREDENTIAL before any liveness probe runs. +func TestCreateBYOKGate_MissingCredential_RejectsPresence(t *testing.T) { + h := &WorkspaceHandler{} + probeCalled := false + probe := func(req *http.Request) (*http.Response, error) { + probeCalled = true + return stubProbe(200)(req) + } + // claude-code + kimi-for-coding derives to kimi-coding (BYOK). No cred keys. + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "kimi-for-coding", + map[string]struct{}{}, // no available credentials + func(ctx context.Context, key string) (string, bool) { return "", false }, + probe, + ) + if res.OK { + t.Fatalf("expected gate to REJECT a credential-less byok create, got OK") + } + if res.Code != "MISSING_BYOK_CREDENTIAL" { + t.Errorf("expected code MISSING_BYOK_CREDENTIAL, got %q", res.Code) + } + if res.HTTP != http.StatusUnprocessableEntity { + t.Errorf("expected 422, got %d", res.HTTP) + } + if probeCalled { + t.Errorf("liveness probe must NOT run when presence already fails") + } +} + +// TestCreateBYOKGate_InvalidCredential_FailsClosed — GAP B: a PRESENT credential +// that the provider rejects (401) → BYOK_CREDENTIAL_INVALID, fail-closed. +func TestCreateBYOKGate_InvalidCredential_FailsClosed(t *testing.T) { + h := &WorkspaceHandler{} + keys := map[string]struct{}{"KIMI_API_KEY": {}} + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "kimi-for-coding", + keys, + func(ctx context.Context, key string) (string, bool) { return "sk-dead-token", true }, + stubProbe(http.StatusUnauthorized), + ) + if res.OK { + t.Fatalf("expected gate to REJECT a present-but-dead (401) credential, got OK") + } + if res.Code != "BYOK_CREDENTIAL_INVALID" { + t.Errorf("expected code BYOK_CREDENTIAL_INVALID, got %q (msg=%s)", res.Code, res.Message) + } + if res.HTTP != http.StatusUnprocessableEntity { + t.Errorf("expected 422, got %d", res.HTTP) + } +} + +// TestCreateBYOKGate_Forbidden_FailsClosed — 403 is also a hard auth failure. +func TestCreateBYOKGate_Forbidden_FailsClosed(t *testing.T) { + h := &WorkspaceHandler{} + keys := map[string]struct{}{"KIMI_API_KEY": {}} + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "kimi-for-coding", keys, + func(ctx context.Context, key string) (string, bool) { return "sk-forbidden", true }, + stubProbe(http.StatusForbidden), + ) + if res.OK || res.Code != "BYOK_CREDENTIAL_INVALID" { + t.Fatalf("expected 403 to fail closed as BYOK_CREDENTIAL_INVALID, got OK=%v code=%q", res.OK, res.Code) + } +} + +// TestCreateBYOKGate_Transient_NotBlocked — GAP B crux: a transient failure +// (network error, 5xx, 429, timeout) must NOT block the create. The credential +// validity is UNKNOWN; a flaky upstream cannot wedge every create. +func TestCreateBYOKGate_Transient_NotBlocked(t *testing.T) { + cases := []struct { + name string + probe byokLivenessProbe + }{ + {"network_error", transientProbe(errors.New("dial tcp: connection refused"))}, + {"timeout", transientProbe(context.DeadlineExceeded)}, + {"500", stubProbe(http.StatusInternalServerError)}, + {"502", stubProbe(http.StatusBadGateway)}, + {"503", stubProbe(http.StatusServiceUnavailable)}, + {"429_rate_limited", stubProbe(http.StatusTooManyRequests)}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + h := &WorkspaceHandler{} + keys := map[string]struct{}{"KIMI_API_KEY": {}} + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "kimi-for-coding", keys, + func(ctx context.Context, key string) (string, bool) { return "sk-maybe-fine", true }, + tc.probe, + ) + if !res.OK { + t.Fatalf("transient probe (%s) must NOT block create, got rejected code=%q", tc.name, res.Code) + } + }) + } +} + +// TestCreateBYOKGate_ValidCredential_Proceeds — GAP B happy path: a present, +// valid (200) credential proceeds. +func TestCreateBYOKGate_ValidCredential_Proceeds(t *testing.T) { + h := &WorkspaceHandler{} + keys := map[string]struct{}{"KIMI_API_KEY": {}} + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "kimi-for-coding", keys, + func(ctx context.Context, key string) (string, bool) { return "sk-live", true }, + stubProbe(http.StatusOK), + ) + if !res.OK { + t.Fatalf("valid (200) credential must proceed, got rejected code=%q msg=%s", res.Code, res.Message) + } +} + +// TestCreateBYOKGate_PlatformManaged_Unaffected — GAP scope: a platform-derived +// model is NOT byok, so the gate is a no-op even with no credential and even +// with a probe that would 401. The probe must never run. +func TestCreateBYOKGate_PlatformManaged_Unaffected(t *testing.T) { + h := &WorkspaceHandler{} + probeCalled := false + probe := func(req *http.Request) (*http.Response, error) { + probeCalled = true + return stubProbe(http.StatusUnauthorized)(req) + } + // claude-code + a platform-namespaced model derives to the closed `platform` + // provider → platform_managed → gate no-op. + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "anthropic/claude-opus-4-7", + map[string]struct{}{}, // no creds — irrelevant for platform + func(ctx context.Context, key string) (string, bool) { return "", false }, + probe, + ) + if !res.OK { + t.Fatalf("platform-managed create must be unaffected by the BYOK gate, got code=%q", res.Code) + } + if probeCalled { + t.Errorf("liveness probe must NOT run for a platform-managed model") + } +} + +// TestCreateBYOKGate_UnderivableModel_DefaultsClosedToPlatform — a model that +// does not derive (the model-registry gate owns rejecting it) is treated as +// non-byok (default-closed to platform), NOT failed as a credential-less byok. +func TestCreateBYOKGate_UnderivableModel_DefaultsClosedToPlatform(t *testing.T) { + h := &WorkspaceHandler{} + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "totally-made-up-xyz", + map[string]struct{}{}, + func(ctx context.Context, key string) (string, bool) { return "", false }, + stubProbe(http.StatusOK), + ) + if !res.OK { + t.Fatalf("an underivable model must default-closed to platform (non-byok), not fail as MISSING_BYOK_CREDENTIAL; got code=%q", res.Code) + } +} + +// TestCreateBYOKGate_NonAuth4xx_TreatedOK — a 400/404 means auth succeeded but +// the endpoint disliked the request shape (e.g. a provider with no models-list +// route). That is NOT a dead credential → proceed. +func TestCreateBYOKGate_NonAuth4xx_TreatedOK(t *testing.T) { + h := &WorkspaceHandler{} + keys := map[string]struct{}{"KIMI_API_KEY": {}} + for _, status := range []int{http.StatusBadRequest, http.StatusNotFound} { + res := h.evaluateCreateBYOKCredentialGate( + context.Background(), + "claude-code", "kimi-for-coding", keys, + func(ctx context.Context, key string) (string, bool) { return "sk-live", true }, + stubProbe(status), + ) + if !res.OK { + t.Errorf("non-auth 4xx (%d) means auth succeeded; must proceed, got code=%q", status, res.Code) + } + } +} + +// TestProbeAuthHeader_AnthropicVsBearer pins the provider-generic auth-header +// selection: anthropic-protocol x-api-key keys use x-api-key; bearer keys use +// Authorization: Bearer. No per-provider URL hardcoding. +func TestProbeAuthHeader_AnthropicVsBearer(t *testing.T) { + anthropicProvider := providers.Provider{ + Name: "kimi-coding", + Protocol: providers.ProtocolAnthropic, + BaseURLTemplate: "https://api.kimi.com/coding/v1", + AuthEnv: []string{"KIMI_API_KEY"}, + } + req, err := buildBYOKLivenessRequest(context.Background(), anthropicProvider, "KIMI_API_KEY", "sk-kimi") + if err != nil { + t.Fatalf("buildBYOKLivenessRequest: %v", err) + } + if got := req.Header.Get("x-api-key"); got != "sk-kimi" { + t.Errorf("anthropic-protocol KIMI_API_KEY should set x-api-key, got %q", got) + } + if req.Header.Get("Authorization") != "" { + t.Errorf("anthropic x-api-key key must not also set Authorization, got %q", req.Header.Get("Authorization")) + } + if req.Method != http.MethodGet || req.URL.String() != "https://api.kimi.com/coding/v1/models" { + t.Errorf("expected GET .../models, got %s %s", req.Method, req.URL.String()) + } + + openaiProvider := providers.Provider{ + Name: "openai-api", + Protocol: providers.ProtocolOpenAI, + BaseURLTemplate: "https://api.openai.com/v1", + AuthEnv: []string{"OPENAI_API_KEY"}, + } + req2, err := buildBYOKLivenessRequest(context.Background(), openaiProvider, "OPENAI_API_KEY", "sk-openai") + if err != nil { + t.Fatalf("buildBYOKLivenessRequest openai: %v", err) + } + if got := req2.Header.Get("Authorization"); got != "Bearer sk-openai" { + t.Errorf("openai-protocol OPENAI_API_KEY should set Authorization: Bearer, got %q", got) + } +} + +// TestProbeRequest_NoBaseURL_Errors — an OAuth arm with null base URLs has no +// probeable HTTP surface; buildBYOKLivenessRequest errors and the gate treats it +// as transient/allow. +func TestProbeRequest_NoBaseURL_Errors(t *testing.T) { + oauthArm := providers.Provider{ + Name: "anthropic-oauth", + Protocol: providers.ProtocolAnthropic, + AuthEnv: []string{"CLAUDE_CODE_OAUTH_TOKEN"}, + // base_url_template + base_url_anthropic both empty (null in yaml). + } + _, err := buildBYOKLivenessRequest(context.Background(), oauthArm, "CLAUDE_CODE_OAUTH_TOKEN", "tok") + if err == nil { + t.Fatalf("expected an error for a provider with no probeable HTTP base") + } +} + +// ============================================================================ +// End-to-end through the Create HTTP handler — proves the wiring + 4xx body. +// ============================================================================ + +// TestWorkspaceCreate_BYOK_MissingCredential_Rejected422 — GAP A end-to-end: +// a credential-less byok create is rejected synchronously (no DB insert). +func TestWorkspaceCreate_BYOK_MissingCredential_Rejected422(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + // Probe should never run (presence fails first); stub it to 200 anyway. + handler.byokLivenessProbeOverride = stubProbe(http.StatusOK) + + // Only the global_secrets presence query is expected (returns NO keys) — + // the 422 fires before BeginTx, so any INSERT would fail ExpectationsWereMet. + expectGlobalSecretsPresenceQuery(mock /* no keys */) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + body := `{"name":"Kimi BYOK","runtime":"claude-code","model":"kimi-for-coding"}` + c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + handler.Create(c) + + if w.Code != http.StatusUnprocessableEntity { + t.Fatalf("expected 422 MISSING_BYOK_CREDENTIAL, got %d: %s", w.Code, w.Body.String()) + } + if !bytes.Contains(w.Body.Bytes(), []byte(`"code":"MISSING_BYOK_CREDENTIAL"`)) { + t.Errorf("expected code MISSING_BYOK_CREDENTIAL in body, got %s", w.Body.String()) + } + if !bytes.Contains(w.Body.Bytes(), []byte(`"provider":"kimi-coding"`)) { + t.Errorf("expected derived provider kimi-coding echoed in body, got %s", w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unexpected DB activity on synchronous reject: %v", err) + } +} + +// TestWorkspaceCreate_BYOK_InvalidCredential_Rejected422 — GAP B end-to-end: +// a present-but-dead (401) credential supplied in the payload is rejected +// synchronously with BYOK_CREDENTIAL_INVALID (no DB insert). +func TestWorkspaceCreate_BYOK_InvalidCredential_Rejected422(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + handler.byokLivenessProbeOverride = stubProbe(http.StatusUnauthorized) + + // Presence is satisfied by the PAYLOAD secret (no global keys needed); the + // value resolver reads the payload directly (no global_secrets value query). + expectGlobalSecretsPresenceQuery(mock /* no global keys */) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + body := `{"name":"Kimi Dead","runtime":"claude-code","model":"kimi-for-coding","secrets":{"KIMI_API_KEY":"sk-revoked"}}` + c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + handler.Create(c) + + if w.Code != http.StatusUnprocessableEntity { + t.Fatalf("expected 422 BYOK_CREDENTIAL_INVALID, got %d: %s", w.Code, w.Body.String()) + } + if !bytes.Contains(w.Body.Bytes(), []byte(`"code":"BYOK_CREDENTIAL_INVALID"`)) { + t.Errorf("expected code BYOK_CREDENTIAL_INVALID in body, got %s", w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unexpected DB activity on synchronous reject: %v", err) + } +} + +// TestWorkspaceCreate_BYOK_TransientProbe_Proceeds — GAP B end-to-end: a +// transient probe failure does NOT block; the create proceeds to 201. The +// credential is supplied via a GLOBAL secret (so there is no in-tx +// workspace_secrets write and the create flow matches the registry-proceed +// tests). Presence passes via the global key; the value resolver decrypts the +// global secret value to feed the probe, which then errors transiently. +func TestWorkspaceCreate_BYOK_TransientProbe_Proceeds(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + waitForHandlerAsyncBeforeDBCleanup(t, handler) + handler.byokLivenessProbeOverride = transientProbe(errors.New("dial tcp: i/o timeout")) + + // Presence: global_secrets carries KIMI_API_KEY. + expectGlobalSecretsPresenceQuery(mock, "KIMI_API_KEY") + // Liveness value resolver: decrypt the global KIMI_API_KEY value. + expectGlobalSecretValueQuery(mock, "KIMI_API_KEY", "sk-maybe-fine") + // Create proceeds (transient = allow). No payload secrets → no in-tx + // workspace_secrets write; the create flow matches the registry-proceed path. + mock.ExpectBegin() + mock.ExpectExec("INSERT INTO workspaces").WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectCommit() + mock.ExpectExec("INSERT INTO canvas_layouts").WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO structure_events").WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO workspace_auth_tokens").WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + body := `{"name":"Kimi Transient","runtime":"claude-code","model":"kimi-for-coding"}` + c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + handler.Create(c) + + if w.Code != http.StatusCreated { + t.Fatalf("transient probe must NOT block; expected 201, got %d: %s", w.Code, w.Body.String()) + } +} + +// TestWorkspaceCreate_BYOK_ValidGlobalCredential_Proceeds — GAP B happy path +// e2e: a valid (200) global credential proceeds to 201. +func TestWorkspaceCreate_BYOK_ValidGlobalCredential_Proceeds(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + waitForHandlerAsyncBeforeDBCleanup(t, handler) + handler.byokLivenessProbeOverride = stubProbe(http.StatusOK) + + expectGlobalSecretsPresenceQuery(mock, "KIMI_API_KEY") + expectGlobalSecretValueQuery(mock, "KIMI_API_KEY", "sk-live") + mock.ExpectBegin() + mock.ExpectExec("INSERT INTO workspaces").WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectCommit() + mock.ExpectExec("INSERT INTO canvas_layouts").WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO structure_events").WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO workspace_auth_tokens").WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + body := `{"name":"Kimi Live","runtime":"claude-code","model":"kimi-for-coding"}` + c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + handler.Create(c) + + if w.Code != http.StatusCreated { + t.Fatalf("valid credential must proceed; expected 201, got %d: %s", w.Code, w.Body.String()) + } +} diff --git a/workspace-server/internal/handlers/handlers_additional_test.go b/workspace-server/internal/handlers/handlers_additional_test.go index 82af0bd6..b66523ca 100644 --- a/workspace-server/internal/handlers/handlers_additional_test.go +++ b/workspace-server/internal/handlers/handlers_additional_test.go @@ -48,7 +48,8 @@ func TestWorkspaceCreate_WithParentID(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) - body := `{"name":"Child Agent","model":"anthropic:claude-opus-4-7","parent_id":"parent-ws-123"}` + // Platform-managed model id → BYOK gate no-op (this test asserts parent_id). + body := `{"name":"Child Agent","model":"anthropic/claude-opus-4-7","parent_id":"parent-ws-123"}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") @@ -86,7 +87,9 @@ func TestWorkspaceCreate_ExplicitClaudeCodeRuntime(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) - body := `{"name":"CC Agent","tier":2,"runtime":"claude-code","model":"sonnet","canvas":{"x":10,"y":20}}` + // Platform-managed model id → BYOK gate no-op (this test asserts the + // claude-code runtime label, not billing). + body := `{"name":"CC Agent","tier":2,"runtime":"claude-code","model":"anthropic/claude-sonnet-4-6","canvas":{"x":10,"y":20}}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") @@ -309,7 +312,9 @@ func TestWorkspaceCreate_MaxConcurrentTasksOverride(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) - body := `{"name":"Leader Agent","runtime":"claude-code","model":"sonnet","max_concurrent_tasks":3}` + // Platform-managed model id → BYOK gate no-op (this test asserts + // max_concurrent_tasks). + body := `{"name":"Leader Agent","runtime":"claude-code","model":"anthropic/claude-sonnet-4-6","max_concurrent_tasks":3}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") diff --git a/workspace-server/internal/handlers/handlers_test.go b/workspace-server/internal/handlers/handlers_test.go index 16351f05..76242d08 100644 --- a/workspace-server/internal/handlers/handlers_test.go +++ b/workspace-server/internal/handlers/handlers_test.go @@ -398,7 +398,10 @@ func TestWorkspaceCreate(t *testing.T) { // the bare-defaults path (no template, no runtime → claude-code), so // the body must declare an explicit model. Using a claude-code-compatible // id; the test doesn't exercise model semantics beyond presence. - body := `{"name":"Test Agent","model":"anthropic:claude-opus-4-7","canvas":{"x":100,"y":200}}` + // Platform-managed model id (derives to the closed `platform` provider) so + // the fail-closed BYOK create gate is a no-op — this test exercises the + // generic create flow, not billing/credential semantics. + body := `{"name":"Test Agent","model":"anthropic/claude-opus-4-7","canvas":{"x":100,"y":200}}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") @@ -466,7 +469,9 @@ func TestWorkspaceCreate_ReturnsAuthToken_201(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) - body := `{"name":"Token Holder","model":"anthropic:claude-opus-4-7"}` + // Platform-managed model id → BYOK gate no-op (this test asserts the auth + // token, not billing). + body := `{"name":"Token Holder","model":"anthropic/claude-opus-4-7"}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go index 19b3b16b..26d441ca 100644 --- a/workspace-server/internal/handlers/workspace.go +++ b/workspace-server/internal/handlers/workspace.go @@ -89,6 +89,21 @@ type WorkspaceHandler struct { // for async DB users (restart, provision) before asserting results. // Matches the pattern from main commit 1c3b4ff3. asyncWG sync.WaitGroup + // byokLivenessProbeOverride lets tests inject a mock outbound-HTTP probe + // for the create-time BYOK credential LIVENESS gate (GAP B) so no real + // network is touched in tests. nil in production → effectiveBYOKLivenessProbe + // returns defaultBYOKLivenessProbe (a real http.Client). Set it to a stub in + // tests, or to a sentinel that disables probing. + byokLivenessProbeOverride byokLivenessProbe +} + +// effectiveBYOKLivenessProbe returns the probe the create-time liveness gate +// should use: the test override when set, else the production http.Client probe. +func (h *WorkspaceHandler) effectiveBYOKLivenessProbe() byokLivenessProbe { + if h.byokLivenessProbeOverride != nil { + return h.byokLivenessProbeOverride + } + return defaultBYOKLivenessProbe } // seedMemoryPluginAPI is the slice of the v2 memory plugin client that @@ -550,6 +565,46 @@ func (h *WorkspaceHandler) Create(c *gin.Context) { } } + // Fail-closed BYOK credential gate (byok_credential_gate.go) — synchronous, + // BEFORE any DB write so a credential-less or dead-credential BYOK create is + // a 4xx, not a 201-then-late-provision-failure. Placed AFTER the cheap field + // validations (compute / access / dir / URL) so a malformed request is + // rejected on its own merits before the heavier credential check (one + // global_secrets read + a network probe), and immediately BEFORE BeginTx so + // the rejection path never touches the DB transaction: + // + // GAP A — PRESENCE: resolved mode == byok but no usable LLM credential at + // any scope (payload secrets + configured global_secrets) → 422 + // MISSING_BYOK_CREDENTIAL. Same presence rule the provision-time check + // keys off (anyBYOKCredentialKeyPresent), so create and provision agree. + // GAP B — LIVENESS: a credential IS present — probe it actually works. A + // positively-rejected credential (401/403) → 422 BYOK_CREDENTIAL_INVALID; + // a transient/network/5xx/timeout never blocks (see the probe's doc). + // + // Skipped for external workspaces (no model/provider contract — the URL is + // the contract, mirroring the MODEL_REQUIRED / registry gates above). + if !isExternal { + availableCredKeys := gatherCreateTimeLLMCredKeys(ctx, payload.Secrets) + gate := h.evaluateCreateBYOKCredentialGate( + ctx, + payload.Runtime, + payload.Model, + availableCredKeys, + makeCreateTimeCredValueResolver(payload.Secrets), + h.effectiveBYOKLivenessProbe(), + ) + if !gate.OK { + c.JSON(gate.HTTP, gin.H{ + "error": gate.Message, + "code": gate.Code, + "runtime": payload.Runtime, + "model": payload.Model, + "provider": gate.Provider, + }) + return + } + } + tx, txErr := db.DB.BeginTx(ctx, nil) if txErr != nil { log.Printf("Create workspace: begin tx error: %v", txErr) diff --git a/workspace-server/internal/handlers/workspace_budget_test.go b/workspace-server/internal/handlers/workspace_budget_test.go index f0b7cf64..7b2a3724 100644 --- a/workspace-server/internal/handlers/workspace_budget_test.go +++ b/workspace-server/internal/handlers/workspace_budget_test.go @@ -173,7 +173,8 @@ func TestWorkspaceBudget_Create_WithLimit(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) - body := `{"name":"Budgeted Agent","model":"anthropic:claude-opus-4-7","budget_limit":1000}` + // Platform-managed model id → BYOK gate no-op (this test asserts budget_limit). + body := `{"name":"Budgeted Agent","model":"anthropic/claude-opus-4-7","budget_limit":1000}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") handler.Create(c) diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index 8dc4f921..460a1e99 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -1142,13 +1142,21 @@ func stripPlatformManagedLLMBypassEnv(envVars map[string]string) { // global or workspace scope). Used by the byok fail-closed branch: a byok // workspace with no LLM credential at ANY scope must be aborted with // MISSING_BYOK_CREDENTIAL rather than started credential-less. +// +// This is the PROVISION-time presence rule. The CREATE-time presence gate +// (byok_credential_gate.go) reuses the SAME underlying predicate +// (anyBYOKCredentialKeyPresent) over a presence-only key set so the two +// fail-closed checks can never diverge: the rule "any non-empty +// platform-managed LLM bypass key is a usable BYOK credential" lives in exactly +// one place. func hasAnyPlatformManagedLLMKey(envVars map[string]string) bool { - for key := range platformManagedDirectLLMBypassKeys { - if strings.TrimSpace(envVars[key]) != "" { - return true + present := make(map[string]struct{}, len(envVars)) + for key, val := range envVars { + if strings.TrimSpace(val) != "" { + present[key] = struct{}{} } } - return false + return anyBYOKCredentialKeyPresent(present) } // stripNonMatchingGlobalOriginLLMCreds is the byok-branch provider-matched diff --git a/workspace-server/internal/handlers/workspace_test.go b/workspace-server/internal/handlers/workspace_test.go index 23747585..32526be2 100644 --- a/workspace-server/internal/handlers/workspace_test.go +++ b/workspace-server/internal/handlers/workspace_test.go @@ -350,7 +350,9 @@ func TestWorkspaceCreate_DBInsertError(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) - body := `{"name":"Failing Agent","model":"anthropic:claude-opus-4-7"}` + // Platform-managed model id → BYOK gate no-op (this test asserts the DB + // insert-error rollback, not billing). + body := `{"name":"Failing Agent","model":"anthropic/claude-opus-4-7"}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") @@ -396,7 +398,9 @@ func TestWorkspaceCreate_DefaultsApplied(t *testing.T) { w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) - body := `{"name":"Default Agent","model":"anthropic:claude-opus-4-7"}` + // Platform-managed model id → BYOK gate no-op (this test asserts the + // status/defaults, not billing). + body := `{"name":"Default Agent","model":"anthropic/claude-opus-4-7"}` c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) c.Request.Header.Set("Content-Type", "application/json") @@ -519,6 +523,13 @@ func TestWorkspaceCreate_SecretPersistFails_RollsBack(t *testing.T) { broadcaster := newTestBroadcaster() handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + // The create-time BYOK gate runs first: anthropic:claude-opus-4-7 derives to + // anthropic-api (BYOK). Presence is satisfied by the payload OPENAI_API_KEY + // (a recognized LLM bypass key) — the liveness probe is skipped because + // OPENAI_API_KEY is not in anthropic-api's auth_env, so no value is read for + // the derived provider. Only the global presence query precedes BeginTx. + expectGlobalSecretsPresenceQuery(mock /* no global keys */) + mock.ExpectBegin() mock.ExpectExec("INSERT INTO workspaces"). WillReturnResult(sqlmock.NewResult(0, 1)) @@ -2118,6 +2129,12 @@ func TestWorkspaceCreate_718_P4_RegisteredModelProceeds(t *testing.T) { setupTestRedis(t) broadcaster := newTestBroadcaster() handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + // claude-opus-4-7 derives to anthropic-api (BYOK). The fail-closed BYOK + // create gate now requires a present + live credential — supply one via a + // global secret and stub the liveness probe to 200 (no real network). + handler.byokLivenessProbeOverride = stubProbe(http.StatusOK) + expectGlobalSecretsPresenceQuery(mock, "ANTHROPIC_API_KEY") + expectGlobalSecretValueQuery(mock, "ANTHROPIC_API_KEY", "sk-ant-live") mock.ExpectBegin() mock.ExpectExec("INSERT INTO workspaces"). @@ -2156,6 +2173,11 @@ func TestWorkspaceCreate_718_P4_LegacyColonVocabAccepted(t *testing.T) { setupTestRedis(t) broadcaster := newTestBroadcaster() handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + // anthropic:claude-opus-4-7 derives to anthropic-api (BYOK) — satisfy the + // fail-closed BYOK create gate with a present + live global credential. + handler.byokLivenessProbeOverride = stubProbe(http.StatusOK) + expectGlobalSecretsPresenceQuery(mock, "ANTHROPIC_API_KEY") + expectGlobalSecretValueQuery(mock, "ANTHROPIC_API_KEY", "sk-ant-live") mock.ExpectBegin() mock.ExpectExec("INSERT INTO workspaces"). @@ -2220,6 +2242,13 @@ func TestWorkspaceCreate_188_ExplicitRuntimeNoTemplate_OK(t *testing.T) { setupTestRedis(t) broadcaster := newTestBroadcaster() handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + // codex gpt-5.5 derives to openai-subscription (BYOK, OAuth arm). The OAuth + // arm has null base URLs → the liveness probe is unprobeable → transient/ + // allow; presence still requires a credential. Supply CODEX_AUTH_JSON via a + // global secret (presence passes; the probe is skipped as unprobeable). + handler.byokLivenessProbeOverride = stubProbe(http.StatusOK) + expectGlobalSecretsPresenceQuery(mock, "CODEX_AUTH_JSON") + expectGlobalSecretValueQuery(mock, "CODEX_AUTH_JSON", `{"tokens":"x"}`) mock.ExpectBegin() mock.ExpectExec("INSERT INTO workspaces").