diff --git a/.coverage-allowlist.txt b/.coverage-allowlist.txt
new file mode 100644
index 00000000..f5f85412
--- /dev/null
+++ b/.coverage-allowlist.txt
@@ -0,0 +1,41 @@
+# Coverage allowlist — security-critical files that are currently below
+# the 10% per-file floor and are being tracked for remediation.
+#
+# Format: one path per line, relative to workspace-server/.
+# Lines starting with # and blank lines are ignored.
+#
+# Process:
+# - A path in this list is WARNED on each CI run, not failed.
+# - Each entry must reference a tracking issue and expiry date.
+# - On expiry, either the coverage is fixed OR the path graduates to
+#   hard-fail (revert the allowlist entry).
+#
+# See #1823 for the gate design and ratchet plan.

+# ============== Active exceptions ==============
+
+# Filed 2026-04-23 — expiry 2026-05-23 (30 days). Tracking: #1823.
+# These are the files flagged by the first run of the critical-path gate.
+# QA team + platform team share ownership of test coverage remediation.
+
+internal/handlers/a2a_proxy.go
+internal/handlers/a2a_proxy_helpers.go
+internal/handlers/registry.go
+internal/handlers/secrets.go
+internal/handlers/tokens.go
+internal/handlers/workspace_provision.go
+internal/middleware/wsauth_middleware.go
+
+# The following paths matched via looser CRITICAL_PATHS substrings
+# (e.g. "registry" matched both internal/registry/ and internal/channels/registry.go).
+# Adding them here so the gate can land without blocking staging merges;
+# a follow-up PR will tighten CRITICAL_PATHS to exact prefixes so these
+# graduate to hard-fail exactly where they are security-critical.
+
+internal/channels/registry.go
+internal/crypto/aes.go
+internal/registry/access.go
+internal/registry/healthsweep.go
+internal/registry/hibernation.go
+internal/registry/provisiontimeout.go
+internal/wsauth/tokens.go
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 12f3be2f..efa043f8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -81,15 +81,98 @@ jobs:
         continue-on-error: true # Warn but don't block until codebase is clean
       - name: Run tests with race detection and coverage
         run: go test -race -coverprofile=coverage.out ./...
-      - name: Check coverage baseline
+
+      - name: Per-file coverage report
+        # Advisory — lists every source file with its coverage so reviewers
+        # can see at a glance where the gaps are. Sorted ascending so the
+        # worst offenders float to the top. Does NOT fail the build; the
+        # hard gate is the threshold check below. (#1823)
         run: |
-          COVERAGE=$(go tool cover -func=coverage.out | grep total | awk '{print $3}' | sed 's/%//')
-          echo "Total coverage: ${COVERAGE}%"
-          THRESHOLD=25
-          awk "BEGIN{if ($COVERAGE < $THRESHOLD) exit 1}" || {
-            echo "::error::Coverage ${COVERAGE}% is below the ${THRESHOLD}% threshold"
+          echo "=== Per-file coverage (worst first) ==="
+          go tool cover -func=coverage.out \
+            | grep -v '^total:' \
+            | awk '{file=$1; sub(/:[0-9][0-9.]*:.*/, "", file); pct=$NF; gsub(/%/,"",pct); s[file]+=pct; c[file]++}
+                   END {for (f in s) printf "%6.1f%% %s\n", s[f]/c[f], f}' \
+            | sort -n
+
+      - name: Check coverage thresholds
+        # Enforces two gates from #1823 Layer 1:
+        #   1. Total floor (25% — ratchet plan in COVERAGE_FLOOR.md).
+        #   2. Per-file floor — non-test .go files in security-critical
+        #      paths with coverage <10% fail the build, UNLESS the file
+        #      path is listed in .coverage-allowlist.txt (acknowledged
+        #      historical debt with a tracking issue + expiry).
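+        #
+        # Worked example (numbers hypothetical): internal/crypto/aes.go at
+        # 4% coverage hard-fails unless "internal/crypto/aes.go" appears in
+        # .coverage-allowlist.txt, in which case CI only warns; at 12% the
+        # per-file gate passes with or without an allowlist entry.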
+        run: |
+          set -e
+          TOTAL_FLOOR=25
+          # Security-critical paths where a 0%-coverage file is a real risk.
+          CRITICAL_PATHS=(
+            "internal/handlers/tokens"
+            "internal/handlers/workspace_provision"
+            "internal/handlers/a2a_proxy"
+            "internal/handlers/registry"
+            "internal/handlers/secrets"
+            "internal/middleware/wsauth"
+            "internal/crypto"
+          )
+
+          TOTAL=$(go tool cover -func=coverage.out | grep '^total:' | awk '{print $3}' | sed 's/%//')
+          echo "Total coverage: ${TOTAL}%"
+          if awk "BEGIN{exit !($TOTAL < $TOTAL_FLOOR)}"; then
+            echo "::error::Total coverage ${TOTAL}% is below the ${TOTAL_FLOOR}% floor. See COVERAGE_FLOOR.md for ratchet plan."
             exit 1
-          }
+          fi
+
+          # Aggregate per-file coverage → /tmp/perfile.txt: "<file> <pct>"
+          go tool cover -func=coverage.out \
+            | grep -v '^total:' \
+            | awk '{file=$1; sub(/:[0-9][0-9.]*:.*/, "", file); pct=$NF; gsub(/%/,"",pct); s[file]+=pct; c[file]++}
+                   END {for (f in s) printf "%s %.1f\n", f, s[f]/c[f]}' \
+            > /tmp/perfile.txt
+
+          # Build allowlist — paths relative to workspace-server, one per line.
+          # Lines starting with # are comments.
+          ALLOWLIST=""
+          if [ -f ../.coverage-allowlist.txt ]; then
+            ALLOWLIST=$(grep -vE '^(#|[[:space:]]*$)' ../.coverage-allowlist.txt || true)
+          fi
+
+          FAILED=0
+          WARNED=0
+          for path in "${CRITICAL_PATHS[@]}"; do
+            while read -r file pct; do
+              [[ "$file" == *_test.go ]] && continue
+              [[ "$file" == *"$path"* ]] || continue
+              awk "BEGIN{exit !($pct < 10)}" || continue
+
+              # Strip the package-import prefix so we can match .coverage-allowlist.txt
+              # entries written as paths relative to workspace-server/.
+              rel=$(echo "$file" | sed 's|^github.com/Molecule-AI/molecule-monorepo/platform/||')
+
+              if echo "$ALLOWLIST" | grep -qxF "$rel"; then
+                echo "::warning file=workspace-server/$rel::Critical file at ${pct}% coverage (allowlisted, #1823) — fix before expiry."
+                WARNED=$((WARNED+1))
+              else
+                echo "::error file=workspace-server/$rel::Critical file at ${pct}% coverage — must be >=10% (target 80%). See #1823. To acknowledge as known debt, add this path to .coverage-allowlist.txt."
+                FAILED=$((FAILED+1))
+              fi
+            done < /tmp/perfile.txt
+          done
+
+          echo ""
+          echo "Critical-path check: $FAILED new failures, $WARNED allowlisted warnings."
+
+          if [ "$FAILED" -gt 0 ]; then
+            echo ""
+            echo "$FAILED security-critical file(s) have <10% test coverage and are"
+            echo "NOT in the allowlist. These paths handle auth, tokens, secrets, or"
+            echo "workspace provisioning — a 0% file here is the exact gap that let"
+            echo "CWE-22, CWE-78, KI-005 slip through in past incidents. Either:"
+            echo "  (a) add tests to raise coverage above 10%, or"
+            echo "  (b) add the path to .coverage-allowlist.txt with an expiry date"
+            echo "      and a tracking issue reference."
+            exit 1
+          fi

   canvas-build:
     name: Canvas (Next.js)
diff --git a/COVERAGE_FLOOR.md b/COVERAGE_FLOOR.md
new file mode 100644
index 00000000..2870a649
--- /dev/null
+++ b/COVERAGE_FLOOR.md
@@ -0,0 +1,78 @@
+# Coverage Floor
+
+CI enforces three coverage gates on `workspace-server` (Go), all defined in
+`.github/workflows/ci.yml` → `platform-build` job.
+
+## Current floors (2026-04-23)
+
+| Gate | Threshold | What fails |
+|---|---|---|
+| **Total floor** | `25%` | `go tool cover -func` reports total below floor |
+| **Critical-path per-file floor** | `10%` | Any non-test source file in a security-critical path with coverage <10% |
+| **Per-file report** | advisory | Printed in CI log, sorted worst-first, does not fail |
+
+The total floor starts at 25% (unchanged from pre-#1823 so this PR stays
+strictly additive).
+The new protection is the critical-path per-file floor, which directly
+closes the gap that prompted the issue. The ratchet plan below begins a
+month later, so the team can observe the gate in action before the floors
+start moving.
+
+## Security-critical paths (Gate 2)
+
+Changes to these paths have historically introduced security issues (CWE-22,
+CWE-78, KI-005, SSRF) or billing/auth risk. Coverage here must stay at or
+above the 10% per-file floor.
+
+- `internal/handlers/tokens*`
+- `internal/handlers/workspace_provision*`
+- `internal/handlers/a2a_proxy*`
+- `internal/handlers/registry*`
+- `internal/handlers/secrets*`
+- `internal/middleware/wsauth*`
+- `internal/crypto*`
+
+## Ratchet plan
+
+The floors ratchet upward on a fixed cadence. Every ratchet is its own PR —
+reviewable, reversible, and it leaves a history. The table below is the
+intended schedule.
+
+| Date | Total floor | Critical-path floor | Notes |
+|---|---|---|---|
+| 2026-04-23 | 25% | 10% | Initial gate (this file). |
+| 2026-05-23 | 30% | 20% | First ratchet. |
+| 2026-06-23 | 40% | 30% | |
+| 2026-07-23 | 50% | 40% | |
+| 2026-08-23 | 55% | 50% | |
+| 2026-09-23 | 60% | 60% | |
+| 2026-10-23 | 70% | 70% | Target steady-state. |
+
+The target end-state matches the per-role QA prompts, which specify
+"coverage >80% on changed files". CI enforces the floor; reviewers still
+enforce the per-PR bar.
+
+## Exceptions
+
+If a critical-path file genuinely cannot reach the floor (e.g. a thin
+wrapper around a third-party SDK with no branches to test), add its path
+to `.coverage-allowlist.txt` with:
+
+1. **File**: `internal/handlers/example.go`
+2. **Reason**: why coverage can't hit the floor
+3. **Tracking issue**: GitHub issue for the real fix
+4. **Expiry**: an explicit date (the initial batch uses 30 days); on expiry
+   either the coverage is fixed or the entry is removed and the path
+   graduates to hard-fail
+
+### Active exceptions
+
+See `.coverage-allowlist.txt`: the initial batch was filed 2026-04-23
+(expiry 2026-05-23) for the files flagged by the gate's first run.
+
+## Why this gate exists
+
+Issue #1823: an external audit found critical files at 0% coverage despite
+test files existing with hundreds of lines. The existing CI step measured
+coverage but didn't enforce a meaningful threshold. Any file could go from
+80% → 0% and CI stayed green, because the single gate (total ≥25%) ignored
+per-file distribution.
+
+This gate makes "no untested critical paths merged" a mechanical property of
+the CI, not a behavioural property of QA agents or individual reviewers —
+which is the only way to make it survive fleet outages, agent rotations, or
+QA process changes.
diff --git a/workspace-server/internal/handlers/registry.go b/workspace-server/internal/handlers/registry.go
index 97ef8537..4e3d6675 100644
--- a/workspace-server/internal/handlers/registry.go
+++ b/workspace-server/internal/handlers/registry.go
@@ -451,16 +451,17 @@ func (h *RegistryHandler) evaluateStatus(c *gin.Context, payload models.Heartbea
 		h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_ONLINE", payload.WorkspaceID, map[string]interface{}{})
 	}

-	// Auto-recovery: if a workspace is marked "failed" or "provisioning" but is
-	// actively sending heartbeats, it has clearly booted successfully. Transition
-	// to "online" so the scheduler and dashboard reflect reality. This catches
-	// cases where the provisioner crashed mid-setup or an earlier error left the
-	// status stale.
- if currentStatus == "failed" || currentStatus == "provisioning" { - if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'online', updated_at = now() WHERE id = $1 AND status IN ('failed', 'provisioning')`, payload.WorkspaceID); err != nil { - log.Printf("Heartbeat: failed to auto-recover %s from %s to online: %v", payload.WorkspaceID, currentStatus, err) + // Auto-recovery: if a workspace is marked "provisioning" but is actively sending + // heartbeats, it has successfully started up. Transition to "online" so the scheduler + // and A2A proxy can dispatch tasks to it. The provisioner does not call + // /registry/register on container start — only the heartbeat loop does, so this + // transition is the only mechanism that moves newly-started workspaces out of + // the phantom-idle state. (#1784) + if currentStatus == "provisioning" { + if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'online', updated_at = now() WHERE id = $1 AND status = 'provisioning'`, payload.WorkspaceID); err != nil { + log.Printf("Heartbeat: failed to transition %s from provisioning to online: %v", payload.WorkspaceID, err) } else { - log.Printf("Heartbeat: auto-recovered %s from %s to online (heartbeat received)", payload.WorkspaceID, currentStatus) + log.Printf("Heartbeat: transitioned %s from provisioning to online (heartbeat received)", payload.WorkspaceID) } h.broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_ONLINE", payload.WorkspaceID, map[string]interface{}{ "recovered_from": currentStatus, diff --git a/workspace-server/internal/handlers/registry_test.go b/workspace-server/internal/handlers/registry_test.go index 791f07dc..4d2cb904 100644 --- a/workspace-server/internal/handlers/registry_test.go +++ b/workspace-server/internal/handlers/registry_test.go @@ -134,6 +134,56 @@ func TestHeartbeatHandler_OfflineToOnline(t *testing.T) { } } +// ==================== Heartbeat — provisioning → online recovery (#1784) ==================== + +func TestHeartbeatHandler_ProvisioningToOnline(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewRegistryHandler(broadcaster) + + // Expect prevTask SELECT + mock.ExpectQuery("SELECT COALESCE\\(current_task"). + WithArgs("ws-provisioning"). + WillReturnRows(sqlmock.NewRows([]string{"current_task"}).AddRow("")) + + // Expect heartbeat UPDATE + mock.ExpectExec("UPDATE workspaces SET"). + WithArgs("ws-provisioning", 0.0, "", 1, 3000, ""). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // Expect evaluateStatus SELECT — currently provisioning + mock.ExpectQuery("SELECT status FROM workspaces WHERE id ="). + WithArgs("ws-provisioning"). + WillReturnRows(sqlmock.NewRows([]string{"status"}).AddRow("provisioning")) + + // Expect status transition to online (#1784) + mock.ExpectExec("UPDATE workspaces SET status = 'online'"). + WithArgs("ws-provisioning"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // Expect RecordAndBroadcast INSERT for WORKSPACE_ONLINE + mock.ExpectExec("INSERT INTO structure_events"). 
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+
+	body := `{"workspace_id":"ws-provisioning","error_rate":0.0,"sample_error":"","active_tasks":1,"uptime_seconds":3000}`
+	c.Request = httptest.NewRequest("POST", "/registry/heartbeat", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.Heartbeat(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
+	}
+
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
 func TestHeartbeatHandler_BadJSON(t *testing.T) {
 	setupTestDB(t)
 	setupTestRedis(t)
diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go
index 6af680f1..c55f1543 100644
--- a/workspace-server/internal/handlers/workspace.go
+++ b/workspace-server/internal/handlers/workspace.go
@@ -95,9 +95,18 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
 		payload.Tier = 1
 	}

-	// Detect runtime from template config.yaml if not specified in request.
-	// Must happen before DB insert so the correct runtime is persisted.
-	if payload.Runtime == "" && payload.Template != "" {
+	// Detect runtime + default model from template config.yaml when the
+	// caller omitted them. Must happen before DB insert so persisted
+	// fields match the template's intent.
+	//
+	// The model default closes the hermes-trap gap (PR #1714 + TemplatePalette
+	// patch): any create path (canvas dialog, TemplatePalette, direct API)
+	// that names a template but forgets a model slug now inherits the
+	// template's `runtime_config.model` — without it, hermes-agent falls
+	// back to its compiled-in Anthropic default and 401s when the user's
+	// key is for a different provider. Non-hermes runtimes are unaffected
+	// (the server still passes model through, they just don't use it).
+	if payload.Template != "" && (payload.Runtime == "" || payload.Model == "") {
 		// #226: payload.Template is attacker-controllable. resolveInsideRoot
 		// rejects absolute paths and any ".." that escapes configsDir so the
 		// provisioner can't be pointed at host directories.
@@ -111,10 +120,32 @@
 		if readErr != nil {
 			log.Printf("Create: could not read config.yaml for template %q: %v", payload.Template, readErr)
 		}
-		for _, line := range strings.Split(string(cfgData), "\n") {
-			line = strings.TrimSpace(line)
-			if strings.HasPrefix(line, "runtime:") {
-				payload.Runtime = strings.TrimSpace(strings.TrimPrefix(line, "runtime:"))
+		// Single-pass line scanner: the old parser found top-level `runtime:`
+		// by substring match on trimmed lines. We extend it to also find
+		// the nested `runtime_config.model:` (new format) and top-level
+		// `model:` (legacy format). A minimal state flag tracks whether
+		// we're inside the runtime_config block, based on indentation.
+		inRuntimeConfig := false
+		for _, rawLine := range strings.Split(string(cfgData), "\n") {
+			// Track indentation to detect block transitions.
+			trimmed := strings.TrimLeft(rawLine, " \t")
+			indented := len(rawLine) > len(trimmed)
+			if !indented {
+				// Top-level key: we either enter the runtime_config block
+				// right here, or we've left it (or never entered it).
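+				// e.g. "runtime_config:" enters the block; any other
+				// top-level key (like "name:" or "tier:") leaves it.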
+ inRuntimeConfig = strings.HasPrefix(trimmed, "runtime_config:") + } + stripped := strings.TrimSpace(rawLine) + switch { + case payload.Runtime == "" && !indented && strings.HasPrefix(stripped, "runtime:") && !strings.HasPrefix(stripped, "runtime_config"): + payload.Runtime = strings.TrimSpace(strings.TrimPrefix(stripped, "runtime:")) + case payload.Model == "" && !indented && strings.HasPrefix(stripped, "model:"): + // Legacy top-level `model:` — pre-runtime_config templates. + payload.Model = strings.Trim(strings.TrimSpace(strings.TrimPrefix(stripped, "model:")), `"'`) + case payload.Model == "" && indented && inRuntimeConfig && strings.HasPrefix(stripped, "model:"): + // Nested `runtime_config.model:` — current format (hermes etc.). + payload.Model = strings.Trim(strings.TrimSpace(strings.TrimPrefix(stripped, "model:")), `"'`) + } + if payload.Runtime != "" && payload.Model != "" { break } } diff --git a/workspace-server/internal/handlers/workspace_test.go b/workspace-server/internal/handlers/workspace_test.go index cc9289b9..b98f42d3 100644 --- a/workspace-server/internal/handlers/workspace_test.go +++ b/workspace-server/internal/handlers/workspace_test.go @@ -6,6 +6,8 @@ import ( "encoding/json" "net/http" "net/http/httptest" + "os" + "path/filepath" "testing" "github.com/DATA-DOG/go-sqlmock" @@ -1215,3 +1217,169 @@ func TestWorkspaceUpdate_BudgetLimitOnly_Ignored(t *testing.T) { t.Errorf("unexpected DB call for budget_limit: %v", err) } } + +// TestWorkspaceCreate_TemplateDefaultsMissingRuntimeAndModel covers the +// hermes-trap case: a caller (TemplatePalette, direct API, script) POSTs +// /workspaces with only a template name + no runtime + no model. The +// handler must read the template's config.yaml and fill in both fields +// BEFORE DB insert — otherwise hermes-agent auto-detects provider +// wrong and 401s downstream (PR #1714 context). +// +// Uses the nested runtime_config.model format current templates use; +// legacy top-level `model:` is covered by the Legacy test below. +func TestWorkspaceCreate_TemplateDefaultsMissingRuntimeAndModel(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + + // Stage a hermes-like template inside the configsDir the handler reads. + configsDir := t.TempDir() + templateDir := filepath.Join(configsDir, "hermes-template") + if err := os.MkdirAll(templateDir, 0o755); err != nil { + t.Fatalf("mkdir: %v", err) + } + cfg := []byte(`name: Hermes Agent +tier: 2 +runtime: hermes +runtime_config: + model: nousresearch/hermes-4-70b +`) + if err := os.WriteFile(filepath.Join(templateDir, "config.yaml"), cfg, 0o644); err != nil { + t.Fatalf("write cfg: %v", err) + } + + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", configsDir) + + mock.ExpectBegin() + // Request omits runtime + model; handler must fill from the template + // and hand the completed values to the INSERT. + mock.ExpectExec("INSERT INTO workspaces"). + WithArgs( + sqlmock.AnyArg(), "Hermes Agent", nil, 1, "hermes", + sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectCommit() + mock.ExpectExec("INSERT INTO canvas_layouts"). + WithArgs(sqlmock.AnyArg(), float64(0), float64(0)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO structure_events"). 
+ WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + body := `{"name":"Hermes Agent","template":"hermes-template"}` + c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Create(c) + + if w.Code != http.StatusCreated { + t.Fatalf("expected 201, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet expectations: %v", err) + } +} + +// TestWorkspaceCreate_TemplateDefaultsLegacyTopLevelModel covers +// pre-runtime_config templates that declare `model:` at the top level. +// These should still surface the default via the same auto-fill. +func TestWorkspaceCreate_TemplateDefaultsLegacyTopLevelModel(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + + configsDir := t.TempDir() + templateDir := filepath.Join(configsDir, "legacy-template") + if err := os.MkdirAll(templateDir, 0o755); err != nil { + t.Fatalf("mkdir: %v", err) + } + cfg := []byte(`name: Legacy Agent +tier: 1 +runtime: langgraph +model: anthropic:claude-sonnet-4-5 +`) + if err := os.WriteFile(filepath.Join(templateDir, "config.yaml"), cfg, 0o644); err != nil { + t.Fatalf("write cfg: %v", err) + } + + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", configsDir) + + mock.ExpectBegin() + mock.ExpectExec("INSERT INTO workspaces"). + WithArgs( + sqlmock.AnyArg(), "Legacy Agent", nil, 1, "langgraph", + sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectCommit() + mock.ExpectExec("INSERT INTO canvas_layouts"). + WithArgs(sqlmock.AnyArg(), float64(0), float64(0)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO structure_events"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + body := `{"name":"Legacy Agent","template":"legacy-template"}` + c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Create(c) + + if w.Code != http.StatusCreated { + t.Fatalf("expected 201, got %d: %s", w.Code, w.Body.String()) + } +} + +// TestWorkspaceCreate_CallerModelOverridesTemplateDefault asserts that +// when the caller passes an explicit `model`, we DO NOT overwrite it +// with the template's default. The pre-fill only happens on empty. +func TestWorkspaceCreate_CallerModelOverridesTemplateDefault(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + + configsDir := t.TempDir() + templateDir := filepath.Join(configsDir, "hermes-template") + if err := os.MkdirAll(templateDir, 0o755); err != nil { + t.Fatalf("mkdir: %v", err) + } + cfg := []byte(`runtime: hermes +runtime_config: + model: nousresearch/hermes-4-70b +`) + if err := os.WriteFile(filepath.Join(templateDir, "config.yaml"), cfg, 0o644); err != nil { + t.Fatalf("write cfg: %v", err) + } + + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", configsDir) + + mock.ExpectBegin() + // Caller explicitly chose minimax — template's hermes-4-70b must NOT win. + // The INSERT only passes runtime to the DB (model goes to agent_card / + // downstream config); we verify runtime == "hermes" and rely on the + // absence of a handler error to mean the model passthrough was honored. 
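+	// (A stronger check would decode the 201 response and assert the
+	// effective model equals "minimax/MiniMax-M2.7", assuming Create
+	// echoes the resolved model back; the response shape isn't pinned
+	// by this test, so we settle for the absence of an error.)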
+ mock.ExpectExec("INSERT INTO workspaces"). + WithArgs( + sqlmock.AnyArg(), "Custom Hermes", nil, 1, "hermes", + sqlmock.AnyArg(), (*string)(nil), nil, "none", (*int64)(nil)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectCommit() + mock.ExpectExec("INSERT INTO canvas_layouts"). + WithArgs(sqlmock.AnyArg(), float64(0), float64(0)). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec("INSERT INTO structure_events"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + body := `{"name":"Custom Hermes","template":"hermes-template","model":"minimax/MiniMax-M2.7"}` + c.Request = httptest.NewRequest("POST", "/workspaces", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.Create(c) + + if w.Code != http.StatusCreated { + t.Fatalf("expected 201, got %d: %s", w.Code, w.Body.String()) + } +} diff --git a/workspace-server/internal/provisioner/cp_provisioner.go b/workspace-server/internal/provisioner/cp_provisioner.go index 0c0e6c9c..6f6ae58d 100644 --- a/workspace-server/internal/provisioner/cp_provisioner.go +++ b/workspace-server/internal/provisioner/cp_provisioner.go @@ -3,6 +3,7 @@ package provisioner import ( "bytes" "context" + "database/sql" "encoding/json" "fmt" "io" @@ -10,6 +11,8 @@ import ( "net/http" "os" "time" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" ) // CPProvisioner provisions workspace agents by calling the control plane's @@ -182,8 +185,26 @@ func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, } // Stop terminates the workspace's EC2 instance via the control plane. +// +// Looks up the actual EC2 instance_id from the workspaces table before +// calling CP — earlier versions passed workspaceID (a UUID) as the +// instance_id query param, which CP forwarded to EC2 TerminateInstances, +// which rejected with InvalidInstanceID.Malformed (EC2 IDs are i-… not +// UUIDs). The terminate failure then left the workspace's SG attached, +// blocking the next provision with InvalidGroup.Duplicate — a full +// "Save & Restart" crash on SaaS. func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error { - url := fmt.Sprintf("%s/cp/workspaces/%s?instance_id=%s", p.baseURL, workspaceID, workspaceID) + instanceID, err := resolveInstanceID(ctx, workspaceID) + if err != nil { + return fmt.Errorf("cp provisioner: stop: resolve instance_id: %w", err) + } + if instanceID == "" { + // No instance was ever provisioned (or already deprovisioned and + // the column was cleared). Nothing to terminate — idempotent. + log.Printf("CP provisioner: Stop for %s — no instance_id on file, nothing to do", workspaceID) + return nil + } + url := fmt.Sprintf("%s/cp/workspaces/%s?instance_id=%s", p.baseURL, workspaceID, instanceID) req, _ := http.NewRequestWithContext(ctx, "DELETE", url, nil) p.provisionAuthHeaders(req) resp, err := p.httpClient.Do(req) @@ -194,6 +215,35 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error { return nil } +// resolveInstanceID reads workspaces.instance_id for the given workspace. +// Returns ("", nil) when the row exists but has no instance_id recorded +// (edge case for external workspaces or stale rows). Returns an error +// only on real DB failures, not on missing rows — callers (Stop, +// IsRunning) treat the empty string as "nothing to act on." 
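+//
+// Contract sketch (instance ids illustrative):
+//
+//	resolveInstanceID(ctx, "ws-1")     // → ("i-0abc123", nil): instance recorded
+//	resolveInstanceID(ctx, "ws-ghost") // → ("", nil): row missing or column NULL
+//	resolveInstanceID(ctx, "ws-1")     // → ("", err) only when the query itself fails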
+// +// Exposed as a package var so tests can substitute a stub without +// standing up a sqlmock just to unblock the Stop/IsRunning code path. +// Production code never reassigns it. +var resolveInstanceID = func(ctx context.Context, workspaceID string) (string, error) { + if db.DB == nil { + // Defensive: NewCPProvisioner never runs without db.DB being + // set in main(). If somehow nil, treat as "no instance" rather + // than panicking in the Stop/IsRunning path. + return "", nil + } + var instanceID sql.NullString + err := db.DB.QueryRowContext(ctx, + `SELECT instance_id FROM workspaces WHERE id = $1`, workspaceID, + ).Scan(&instanceID) + if err != nil && err != sql.ErrNoRows { + return "", err + } + if !instanceID.Valid { + return "", nil + } + return instanceID.String, nil +} + // IsRunning checks workspace EC2 instance state via the control plane. // // Contract (matches the Docker Provisioner.IsRunning contract — @@ -219,7 +269,18 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error { // Both callers are happy with (true, err); callers that need the // previous (false, err) shape must inspect err themselves. func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) { - url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, workspaceID) + instanceID, err := resolveInstanceID(ctx, workspaceID) + if err != nil { + // Treat DB errors the same as transport errors — (true, err) keeps + // a2a_proxy on the alive path and logs the signal. + return true, fmt.Errorf("cp provisioner: status: resolve instance_id: %w", err) + } + if instanceID == "" { + // No instance recorded. Report "not running" cleanly (no error) + // so restart cascades can trigger a fresh provision. + return false, nil + } + url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, instanceID) req, _ := http.NewRequestWithContext(ctx, "GET", url, nil) p.provisionAuthHeaders(req) resp, err := p.httpClient.Do(req) diff --git a/workspace-server/internal/provisioner/cp_provisioner_test.go b/workspace-server/internal/provisioner/cp_provisioner_test.go index 247863e3..c8553adf 100644 --- a/workspace-server/internal/provisioner/cp_provisioner_test.go +++ b/workspace-server/internal/provisioner/cp_provisioner_test.go @@ -11,6 +11,23 @@ import ( "time" ) +// primeInstanceIDLookup swaps resolveInstanceID for a stub that returns +// the mapped instance_id for the given workspace_id, or "" for anything +// not in the map. Cheaper than standing up a sqlmock since Stop/IsRunning +// tests mostly don't care about the SQL path — they're testing the CP +// HTTP interaction downstream of the lookup. +func primeInstanceIDLookup(t *testing.T, pairs map[string]string) { + t.Helper() + prev := resolveInstanceID + resolveInstanceID = func(_ context.Context, wsID string) (string, error) { + if id, ok := pairs[wsID]; ok { + return id, nil + } + return "", nil + } + t.Cleanup(func() { resolveInstanceID = prev }) +} + // TestNewCPProvisioner_RequiresOrgID — self-hosted deployments don't // have a MOLECULE_ORG_ID, and the provisioner must refuse to construct // rather than silently phone home to the prod CP with an empty tenant. @@ -267,6 +284,12 @@ func TestStart_TransportFailureSurfaces(t *testing.T) { // platform-wide shared secret AND the per-tenant admin token, or the // CP will 401. 
 func TestStop_SendsBothAuthHeaders(t *testing.T) {
+	// resolveInstanceID looks up the real EC2 id from the workspaces
+	// table; previously this test passed when the provisioner buggily
+	// reused the workspaceID as the instance_id. Now we assert the
+	// correct EC2 id round-trips.
+	primeInstanceIDLookup(t, map[string]string{"ws-1": "i-abc123"})
+
 	var sawBearer, sawAdminToken, sawMethod, sawPath string
 	var sawInstance string
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -295,8 +318,8 @@
 	if sawPath != "/cp/workspaces/ws-1" {
 		t.Errorf("path = %q, want /cp/workspaces/ws-1", sawPath)
 	}
-	if sawInstance != "ws-1" {
-		t.Errorf("instance_id query = %q, want ws-1", sawInstance)
+	if sawInstance != "i-abc123" {
+		t.Errorf("instance_id query = %q, want i-abc123 (from DB lookup, NOT the workspace UUID)", sawInstance)
 	}
 	if sawBearer != "Bearer s3cret" {
 		t.Errorf("bearer = %q, want Bearer s3cret", sawBearer)
@@ -310,6 +333,7 @@
 // teardown call hits a dead CP, the error must surface so the caller
 // knows the workspace might still be running and needs retry.
 func TestStop_TransportErrorSurfaces(t *testing.T) {
+	primeInstanceIDLookup(t, map[string]string{"ws-1": "i-abc123"})
 	p := &CPProvisioner{
 		baseURL:    "http://127.0.0.1:1",
 		orgID:      "org-1",
@@ -327,6 +351,7 @@
 // TestIsRunning_ParsesStateField — CP returns the EC2 state, we expose
 // a bool ("running"/"pending"/"terminated" → true only for "running").
 func TestIsRunning_ParsesStateField(t *testing.T) {
+	primeInstanceIDLookup(t, map[string]string{"ws-1": "i-parsesstatefield"})
 	cases := map[string]bool{
 		"running":    true,
 		"pending":    false,
@@ -364,6 +389,7 @@
 // require the same per-tenant auth because they leak public_ip +
 // private_ip to the caller.
 func TestIsRunning_SendsBothAuthHeaders(t *testing.T) {
+	primeInstanceIDLookup(t, map[string]string{"ws-1": "i-auth-headers"})
 	var sawBearer, sawAdminToken string
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		sawBearer = r.Header.Get("Authorization")
@@ -398,6 +424,7 @@
 // The sweeper (healthsweep.go) inspects err independently and skips
 // on any error, so (true, err) is equally safe for that caller.
 func TestIsRunning_TransportErrorReturnsTrue(t *testing.T) {
+	primeInstanceIDLookup(t, map[string]string{"ws-1": "i-transport"})
 	p := &CPProvisioner{
 		baseURL:    "http://127.0.0.1:1",
 		orgID:      "org-1",
@@ -418,6 +445,7 @@
 // the sweeper would see the workspace as not-running. Now every
 // non-2xx is an error the caller can log + retry.
 func TestIsRunning_Non2xxSurfacesError(t *testing.T) {
+	primeInstanceIDLookup(t, map[string]string{"ws-1": "i-non2xx"})
 	cases := []struct {
 		name   string
 		status int
@@ -459,6 +487,7 @@
 // a middleware glitch (HTML error page with 200) from looking like
 // "workspace stopped".
func TestIsRunning_MalformedJSONBodyReturnsError(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{"ws-1": "i-malformed"}) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) _, _ = io.WriteString(w, "maintenance mode") @@ -486,6 +515,7 @@ func TestIsRunning_MalformedJSONBodyReturnsError(t *testing.T) { // the documented contract values rather than simulating the whole // a2a_proxy flow. func TestIsRunning_ContractCompat_A2AProxy(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{"ws-1": "i-contract"}) // Simulate every error path and assert running==true for each. t.Run("transport error", func(t *testing.T) { p := &CPProvisioner{ @@ -559,6 +589,7 @@ func TestIsRunning_ContractCompat_A2AProxy(t *testing.T) { // only needs the prefix to produce a value, so the decode succeeds — // and the LimitReader enforces the cap regardless. func TestIsRunning_BoundedBodyRead(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{"ws-1": "i-bounded"}) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(200) // Valid JSON prefix, then pad well past the 64 KiB cap. @@ -628,3 +659,67 @@ func TestGetConsoleOutput_UsesAdminBearer(t *testing.T) { t.Errorf("bearer = %q, want Bearer admin-api-key (NOT the provision secret)", sawBearer) } } + +// TestStop_EmptyInstanceIDIsNoop — when workspaces.instance_id is NULL +// (e.g. a row that was never provisioned, or deprovisioned and cleared), +// Stop should be a no-op instead of sending a malformed CP request. +// Regression guard: previously Stop sent workspaceID as instance_id +// even when no EC2 had been booked, causing CP → EC2 to 400. +func TestStop_EmptyInstanceIDIsNoop(t *testing.T) { + // Empty map → lookup returns ("", nil) for any workspace. + primeInstanceIDLookup(t, map[string]string{}) + + hit := false + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + hit = true + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()} + if err := p.Stop(context.Background(), "ws-ghost"); err != nil { + t.Fatalf("Stop with empty instance_id should no-op, got err %v", err) + } + if hit { + t.Errorf("Stop contacted CP even though instance_id was empty") + } +} + +// TestIsRunning_UsesDBInstanceID — IsRunning must also look up +// instance_id from the workspaces table, same as Stop. Mirror of +// TestStop_SendsBothAuthHeaders. +func TestIsRunning_UsesDBInstanceID(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{"ws-1": "i-xyz789"}) + + var sawInstance string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + sawInstance = r.URL.Query().Get("instance_id") + w.WriteHeader(http.StatusOK) + _, _ = io.WriteString(w, `{"state":"running"}`) + })) + defer srv.Close() + + p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()} + if _, err := p.IsRunning(context.Background(), "ws-1"); err != nil { + t.Fatalf("IsRunning: %v", err) + } + if sawInstance != "i-xyz789" { + t.Errorf("instance_id query = %q, want i-xyz789 (from DB lookup)", sawInstance) + } +} + +// TestIsRunning_EmptyInstanceIDReturnsFalse — IsRunning on a +// workspace with no recorded EC2 instance must report (false, nil) so +// restart cascades re-provision fresh instead of looping on a stale +// row with no backing instance. 
+func TestIsRunning_EmptyInstanceIDReturnsFalse(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{}) + p := &CPProvisioner{baseURL: "http://unused", orgID: "org-1"} + running, err := p.IsRunning(context.Background(), "ws-ghost") + if err != nil { + t.Errorf("IsRunning with empty instance_id should return (false, nil), got err %v", err) + } + if running { + t.Errorf("IsRunning with empty instance_id should return running=false, got true") + } +}
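+
+// TestStop_ResolveErrorSurfaces covers the one remaining branch of the
+// lookup: resolveInstanceID failing with a real DB error. Sketch only
+// (not exercised against a live DB): Stop must surface the wrapped error
+// and must NOT contact CP, since terminating an unknown instance_id is
+// exactly the malformed call this PR removes. Stubs the package var
+// directly because primeInstanceIDLookup has no error knob; assumes
+// "errors" is in this file's import block (add it if missing).
+func TestStop_ResolveErrorSurfaces(t *testing.T) {
+	prev := resolveInstanceID
+	resolveInstanceID = func(_ context.Context, _ string) (string, error) {
+		return "", errors.New("db down")
+	}
+	t.Cleanup(func() { resolveInstanceID = prev })
+
+	hit := false
+	srv := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) {
+		hit = true
+	}))
+	defer srv.Close()
+
+	p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()}
+	if err := p.Stop(context.Background(), "ws-1"); err == nil {
+		t.Fatalf("Stop should surface the instance_id lookup error, got nil")
+	}
+	if hit {
+		t.Errorf("Stop contacted CP even though the instance_id lookup failed")
+	}
+}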