Pre-existing silent-failure path: IsRunning decoded CP responses
regardless of HTTP status, so a CP 500 → empty body → State="" →
returned (false, nil). The sweeper couldn't distinguish "workspace
stopped" from "CP broken" and would leave a dead row in place.
## Fix
- Non-2xx → wrapped error that does NOT echo the body (CP 5xx bodies may
contain echoed headers; leaking them into logs would expose the bearer token)
- JSON decode error → wrapped error
- Transport error → now wrapped with "cp provisioner: status:"
prefix for easier log grepping
## Tests
+7 cases (5-status table + malformed JSON + existing transport).
IsRunning coverage 100%; overall cp_provisioner at 98%.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
201 lines
7.0 KiB
Go
201 lines
7.0 KiB
Go
package provisioner
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"time"
|
|
)
|
|
|
|
// CPProvisioner provisions workspace agents through the control plane's
// workspace provision API. The control plane creates EC2 instances with
// Docker and the workspace runtime installed at boot from PyPI.
//
// Auto-activated when MOLECULE_ORG_ID is set (SaaS tenant).
type CPProvisioner struct {
	// baseURL is the control plane root; request paths are appended to it.
	baseURL string
	// orgID is the tenant organisation ID sent on provision requests.
	orgID string
	// sharedSecret is sent as "Authorization: Bearer" — the platform-wide gate.
	sharedSecret string
	// adminToken is sent as "X-Molecule-Admin-Token" — the per-tenant
	// identity (controlplane #118/#130).
	adminToken string
	// httpClient performs every outbound control plane call.
	httpClient *http.Client
}
|
|
|
|
// NewCPProvisioner creates a provisioner that delegates to the control plane.
|
|
func NewCPProvisioner() (*CPProvisioner, error) {
|
|
orgID := os.Getenv("MOLECULE_ORG_ID")
|
|
if orgID == "" {
|
|
return nil, fmt.Errorf("MOLECULE_ORG_ID required for control plane provisioner")
|
|
}
|
|
|
|
// Auto-derive control plane URL.
|
|
baseURL := os.Getenv("CP_PROVISION_URL")
|
|
if baseURL == "" {
|
|
baseURL = os.Getenv("MOLECULE_CP_URL")
|
|
}
|
|
if baseURL == "" {
|
|
baseURL = "https://api.moleculesai.app"
|
|
}
|
|
|
|
// CP gates /cp/workspaces/* behind two credentials now:
|
|
// 1. Shared secret (Authorization: Bearer) — gates the route at
|
|
// the router level, proves the caller is a tenant platform.
|
|
// 2. Admin token (X-Molecule-Admin-Token) — proves WHICH tenant.
|
|
// Introduced in controlplane #118/#130 to prevent cross-tenant
|
|
// provisioning when the shared secret leaks from one tenant.
|
|
sharedSecret := os.Getenv("MOLECULE_CP_SHARED_SECRET")
|
|
if sharedSecret == "" {
|
|
// Fall back to PROVISION_SHARED_SECRET so a single env-var name
|
|
// works on both sides of the wire.
|
|
sharedSecret = os.Getenv("PROVISION_SHARED_SECRET")
|
|
}
|
|
// ADMIN_TOKEN is injected into the tenant container at provision
|
|
// time by the control plane (see provisioner/ec2.go Secrets Manager
|
|
// bootstrap path). Without it, post-#118 CP rejects every
|
|
// /cp/workspaces/* call with 401.
|
|
adminToken := os.Getenv("ADMIN_TOKEN")
|
|
|
|
return &CPProvisioner{
|
|
baseURL: baseURL,
|
|
orgID: orgID,
|
|
sharedSecret: sharedSecret,
|
|
adminToken: adminToken,
|
|
httpClient: &http.Client{Timeout: 120 * time.Second},
|
|
}, nil
|
|
}
|
|
|
|
// authHeaders sets both auth headers on the outbound request:
|
|
// - Authorization: Bearer <shared secret> — platform gate
|
|
// - X-Molecule-Admin-Token: <per-tenant token> — identity gate
|
|
//
|
|
// Either is a no-op when its value is empty so self-hosted / dev
|
|
// deployments without a real CP still work (those don't hit a CP that
|
|
// enforces either gate). In prod both are set by the controlplane
|
|
// bootstrap, so both headers land on every outbound call.
|
|
func (p *CPProvisioner) authHeaders(req *http.Request) {
|
|
if p.sharedSecret != "" {
|
|
req.Header.Set("Authorization", "Bearer "+p.sharedSecret)
|
|
}
|
|
if p.adminToken != "" {
|
|
req.Header.Set("X-Molecule-Admin-Token", p.adminToken)
|
|
}
|
|
}
|
|
|
|
// cpProvisionRequest is the JSON body sent to the control plane's
// provision endpoint.
type cpProvisionRequest struct {
	// OrgID is the tenant organisation the workspace belongs to.
	OrgID string `json:"org_id"`
	// WorkspaceID identifies the workspace being provisioned.
	WorkspaceID string `json:"workspace_id"`
	// Runtime is the workspace runtime name.
	Runtime string `json:"runtime"`
	// Tier is the numeric tier of the workspace.
	Tier int `json:"tier"`
	// PlatformURL is the platform URL handed to the control plane.
	PlatformURL string `json:"platform_url"`
	// Env holds environment variables for the workspace.
	Env map[string]string `json:"env"`
}
|
|
|
|
// cpProvisionResponse is the JSON body decoded from the control plane's
// reply to a provision request.
type cpProvisionResponse struct {
	// InstanceID is the ID of the EC2 instance created for the workspace.
	InstanceID string `json:"instance_id"`
	// PrivateIP is the "private_ip" value from the response.
	PrivateIP string `json:"private_ip"`
	// State is the instance state reported by the control plane.
	State string `json:"state"`
	// Error is the structured error message, if the control plane sent one.
	Error string `json:"error"`
}
|
|
|
|
// Start provisions a workspace by calling the control plane → EC2.
|
|
func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, error) {
|
|
req := cpProvisionRequest{
|
|
OrgID: p.orgID,
|
|
WorkspaceID: cfg.WorkspaceID,
|
|
Runtime: cfg.Runtime,
|
|
Tier: cfg.Tier,
|
|
PlatformURL: cfg.PlatformURL,
|
|
Env: cfg.EnvVars,
|
|
}
|
|
|
|
body, err := json.Marshal(req)
|
|
if err != nil {
|
|
return "", fmt.Errorf("cp provisioner: marshal: %w", err)
|
|
}
|
|
|
|
url := p.baseURL + "/cp/workspaces/provision"
|
|
httpReq, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(body))
|
|
if err != nil {
|
|
return "", fmt.Errorf("cp provisioner: create request: %w", err)
|
|
}
|
|
httpReq.Header.Set("Content-Type", "application/json")
|
|
p.authHeaders(httpReq)
|
|
|
|
resp, err := p.httpClient.Do(httpReq)
|
|
if err != nil {
|
|
return "", fmt.Errorf("cp provisioner: send: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Cap body read at 64 KiB — the CP only ever returns small JSON
|
|
// responses; an unbounded read could be weaponized into log-flood
|
|
// DoS by a compromised upstream.
|
|
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 64<<10))
|
|
var result cpProvisionResponse
|
|
json.Unmarshal(respBody, &result)
|
|
|
|
if resp.StatusCode != http.StatusCreated {
|
|
// Prefer the structured {"error":"..."} field. Do NOT fall back
|
|
// to string(respBody) — our logs ingest errors, and an upstream
|
|
// misconfiguration that echoed the Authorization header or
|
|
// request body into the response would leak bearer tokens.
|
|
errMsg := result.Error
|
|
if errMsg == "" {
|
|
errMsg = fmt.Sprintf("<unstructured body, %d bytes>", len(respBody))
|
|
}
|
|
return "", fmt.Errorf("cp provisioner: provision failed (%d): %s", resp.StatusCode, errMsg)
|
|
}
|
|
|
|
log.Printf("CP provisioner: workspace %s → EC2 instance %s (%s)", cfg.WorkspaceID, result.InstanceID, result.State)
|
|
return result.InstanceID, nil
|
|
}
|
|
|
|
// Stop terminates the workspace's EC2 instance via the control plane.
|
|
func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error {
|
|
url := fmt.Sprintf("%s/cp/workspaces/%s?instance_id=%s", p.baseURL, workspaceID, workspaceID)
|
|
req, _ := http.NewRequestWithContext(ctx, "DELETE", url, nil)
|
|
p.authHeaders(req)
|
|
resp, err := p.httpClient.Do(req)
|
|
if err != nil {
|
|
return fmt.Errorf("cp provisioner: stop: %w", err)
|
|
}
|
|
resp.Body.Close()
|
|
return nil
|
|
}
|
|
|
|
// IsRunning checks workspace EC2 instance state via the control plane.
|
|
//
|
|
// Contract:
|
|
// - transport error → (false, error)
|
|
// - non-2xx HTTP response → (false, error). Previously swallowed;
|
|
// a CP 500 would return (false, nil) and the sweeper couldn't
|
|
// distinguish "workspace stopped" from "CP broken".
|
|
// - 2xx with state!="running" → (false, nil)
|
|
// - 2xx with state=="running" → (true, nil)
|
|
func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
|
|
url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, workspaceID)
|
|
req, _ := http.NewRequestWithContext(ctx, "GET", url, nil)
|
|
p.authHeaders(req)
|
|
resp, err := p.httpClient.Do(req)
|
|
if err != nil {
|
|
return false, fmt.Errorf("cp provisioner: status: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
|
// Don't leak the body — upstream errors may echo headers.
|
|
return false, fmt.Errorf("cp provisioner: status: unexpected %d", resp.StatusCode)
|
|
}
|
|
var result struct{ State string `json:"state"` }
|
|
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
|
return false, fmt.Errorf("cp provisioner: status decode: %w", err)
|
|
}
|
|
return result.State == "running", nil
|
|
}
|
|
|
|
// Close is a no-op.
|
|
func (p *CPProvisioner) Close() error { return nil }
|