diff --git a/.github/workflows/e2e-api.yml b/.github/workflows/e2e-api.yml index 43f1004c..a0238dcd 100644 --- a/.github/workflows/e2e-api.yml +++ b/.github/workflows/e2e-api.yml @@ -1,35 +1,21 @@ name: E2E API Smoke Test # Extracted from ci.yml so workflow-level concurrency can protect this job # from run-level cancellation (issue #458). -# -# Problem: the job-level `concurrency.cancel-in-progress: false` in ci.yml -# prevented *sibling* E2E jobs from killing each other, but GitHub still -# cancelled the parent *workflow run* when a new push arrived. Since the job -# lived inside that run, it got cancelled too. -# -# Fix: a dedicated workflow gets its own concurrency group at the workflow -# level. New pushes to the same branch queue here instead of cancelling. -# Fast jobs (platform-build, canvas-build, etc.) stay in ci.yml and continue -# to benefit from run-level cancellation for quick feedback. on: push: - branches: [main] + branches: [main, staging] paths: - 'workspace-server/**' - 'tests/e2e/**' - '.github/workflows/e2e-api.yml' pull_request: - branches: [main] + branches: [main, staging] paths: - 'workspace-server/**' - 'tests/e2e/**' - '.github/workflows/e2e-api.yml' -# Workflow-level concurrency: new runs queue rather than cancel. -# `cancel-in-progress: false` is load-bearing — without it GitHub would still -# cancel this run when the next push arrives, defeating the whole fix. -# The group key includes github.ref so PRs don't compete with main. concurrency: group: e2e-api-${{ github.ref }} cancel-in-progress: false @@ -39,12 +25,6 @@ jobs: name: E2E API Smoke Test runs-on: ubuntu-latest timeout-minutes: 15 - # Postgres + Redis run as sibling containers via `docker run`. Could - # switch to a `services:` block now that we're on Linux, but the - # explicit start-and-wait gives us pg_isready / PING readiness checks - # that match the 30-tick timeouts the rest of the job expects. 
Ports - # 15432/16379 avoid collision with anything the host may already have - # on the standard ports. env: DATABASE_URL: postgres://dev:dev@localhost:15432/molecule?sslmode=disable REDIS_URL: redis://localhost:16379 @@ -61,12 +41,7 @@ jobs: - name: Start Postgres (docker) run: | docker rm -f "$PG_CONTAINER" 2>/dev/null || true - docker run -d --name "$PG_CONTAINER" \ - -e POSTGRES_USER=dev \ - -e POSTGRES_PASSWORD=dev \ - -e POSTGRES_DB=molecule \ - -p 15432:5432 \ - postgres:16 + docker run -d --name "$PG_CONTAINER" -e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule -p 15432:5432 postgres:16 for i in $(seq 1 30); do if docker exec "$PG_CONTAINER" pg_isready -U dev >/dev/null 2>&1; then echo "Postgres ready after ${i}s" @@ -89,6 +64,7 @@ jobs: sleep 1 done echo "::error::Redis did not become ready in 15s" + docker logs "$REDIS_CONTAINER" || true exit 1 - name: Build platform working-directory: workspace-server @@ -111,16 +87,14 @@ jobs: cat workspace-server/platform.log || true exit 1 - name: Assert migrations applied - # Migrations auto-run at platform boot. Fail fast if they silently - # didn't — catches future migration-author mistakes before the E2E run. 
run: | tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc "SELECT count(*) FROM information_schema.tables WHERE table_schema='public' AND table_name='workspaces'") if [ "$tables" != "1" ]; then - echo "::error::Migrations did not apply — 'workspaces' table missing" + echo "::error::Migrations did not apply" cat workspace-server/platform.log || true exit 1 fi - echo "Migrations OK (workspaces table present)" + echo "Migrations OK" - name: Run E2E API tests run: bash tests/e2e/test_api.sh - name: Dump platform log on failure diff --git a/canvas/src/components/MissingKeysModal.tsx b/canvas/src/components/MissingKeysModal.tsx index 2c2a648e..e6712177 100644 --- a/canvas/src/components/MissingKeysModal.tsx +++ b/canvas/src/components/MissingKeysModal.tsx @@ -387,7 +387,6 @@ function AllKeysModal({ }) { const [entries, setEntries] = useState([]); const [globalError, setGlobalError] = useState(null); - const firstInputRef = useRef(null); useEffect(() => { if (!open) return; @@ -403,12 +402,6 @@ function AllKeysModal({ setGlobalError(null); }, [open, missingKeys]); - useEffect(() => { - if (!open) return; - const raf = requestAnimationFrame(() => firstInputRef.current?.focus()); - return () => cancelAnimationFrame(raf); - }, [open]); - useEffect(() => { if (!open) return; const handler = (e: KeyboardEvent) => { @@ -471,6 +464,15 @@ function AllKeysModal({ onKeysAdded(); }, [entries, onKeysAdded]); + // Focus trap: auto-focus first input when modal opens + useEffect(() => { + if (!open) return; + const timer = requestAnimationFrame(() => { + document.getElementById("missing-keys-title")?.focus(); + }); + return () => cancelAnimationFrame(timer); + }, [open]); + if (!open) return null; const allSaved = entries.length > 0 && entries.every((e) => e.saved); @@ -482,8 +484,8 @@ function AllKeysModal({ return (
{entry.saved && ( -
, +})); + +import { ConfigTab } from "../ConfigTab"; + +// helper — wire the api.get mock for one scenario +function wireApi(opts: { + workspaceRuntime?: string; + workspaceModel?: string; + configYamlContent?: string | null; // null = 404 + templates?: Array<{ id: string; name?: string; runtime?: string; models?: unknown[] }>; +}) { + apiGet.mockImplementation((path: string) => { + if (path === `/workspaces/ws-test`) { + return Promise.resolve({ runtime: opts.workspaceRuntime ?? "" }); + } + if (path === `/workspaces/ws-test/model`) { + return Promise.resolve({ model: opts.workspaceModel ?? "" }); + } + if (path === `/workspaces/ws-test/files/config.yaml`) { + if (opts.configYamlContent === null) { + return Promise.reject(new Error("not found")); + } + return Promise.resolve({ content: opts.configYamlContent ?? "" }); + } + if (path === "/templates") { + return Promise.resolve(opts.templates ?? []); + } + return Promise.reject(new Error(`unmocked api.get: ${path}`)); + }); +} + +beforeEach(() => { + apiGet.mockReset(); + apiPatch.mockReset(); + apiPut.mockReset(); +}); + +describe("ConfigTab — hermes workspace", () => { + it("loads runtime from workspace metadata when config.yaml is missing (#1894 bug 1)", async () => { + // This is the hermes case: no platform config.yaml, so the form must + // fall back to GET /workspaces/:id's runtime field. Before the fix, the + // runtime dropdown showed "LangGraph (default)" because the fallback + // didn't exist. 
+ wireApi({ + workspaceRuntime: "hermes", + workspaceModel: "openai/gpt-4o", + configYamlContent: null, + templates: [{ id: "t-hermes", name: "Hermes", runtime: "hermes", models: [] }], + }); + + render(); + + // Wait for loads + const select = await waitFor(() => screen.getByRole("combobox", { name: /runtime/i })); + expect((select as HTMLSelectElement).value).toBe("hermes"); + }); + + it("does NOT show 'No config.yaml found' error for hermes (#1894 bug 3)", async () => { + // Hermes manages its own config at ~/.hermes/config.yaml on the + // workspace host — the platform config.yaml NOT existing is expected, + // not an error. Showing a red error banner misleads the user. + wireApi({ + workspaceRuntime: "hermes", + configYamlContent: null, + templates: [{ id: "t-hermes", name: "Hermes", runtime: "hermes", models: [] }], + }); + + render(); + + await waitFor(() => { + const node = screen.queryByText(/No config\.yaml found/i); + // Assert the red error is absent; a gray info banner with the same + // phrase would also fail this (which is what we want — we don't + // want any "no config.yaml" phrasing on hermes at all). + expect(node).toBeNull(); + }); + }); + + it("shows hermes-specific info banner pointing to Terminal tab (#1894)", async () => { + wireApi({ + workspaceRuntime: "hermes", + configYamlContent: null, + templates: [{ id: "t-hermes", name: "Hermes", runtime: "hermes", models: [] }], + }); + + render(); + + await waitFor(() => { + expect(screen.getByText(/Hermes manages its own config/i)).toBeTruthy(); + }); + }); + + it("DOES show 'No config.yaml found' error for langgraph workspace (default runtime)", async () => { + // Regression guard the other way — the gray info banner is hermes- + // specific. A langgraph workspace with no config.yaml SHOULD still + // see the red error so the user knows to provide a template config. 
+ wireApi({ + workspaceRuntime: "", + configYamlContent: null, + templates: [], + }); + + render(); + + await waitFor(() => { + expect(screen.getByText(/No config\.yaml found/i)).toBeTruthy(); + }); + }); +}); + +describe("ConfigTab — config.yaml on disk", () => { + it("config.yaml runtime/model wins when present, workspace metadata is fallback", async () => { + // If the workspace DB has runtime=langgraph but config.yaml declares + // runtime: crewai, the form should show crewai (config.yaml wins). + // Prevents silent runtime drift across reads. + wireApi({ + workspaceRuntime: "langgraph", // DB + configYamlContent: 'runtime: crewai\nmodel: "claude-opus"\n', + templates: [ + { id: "t-crewai", name: "CrewAI", runtime: "crewai", models: [] }, + ], + }); + + render(); + + const select = await waitFor(() => screen.getByRole("combobox", { name: /runtime/i })); + expect((select as HTMLSelectElement).value).toBe("crewai"); + }); +}); diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh index 072d5fe3..317c761b 100755 --- a/tests/e2e/test_staging_full_saas.sh +++ b/tests/e2e/test_staging_full_saas.sh @@ -277,20 +277,48 @@ else fi # ─── 7. Wait for workspace(s) online ─────────────────────────────────── -log "7/11 Waiting for workspace(s) to reach status=online..." -WS_DEADLINE=$(( $(date +%s) + 600 )) +# Hermes cold-boot takes 10-13 min on slow apt days (apt + uv + hermes +# install + npm browser-tools). The controlplane bootstrap-watcher +# deadline fires at 5 min and sets status=failed prematurely; heartbeat +# then transitions failed → online after install.sh finishes. So: +# +# - 20 min deadline (hermes worst-case + slack) +# - 'failed' is a TRANSIENT state we must tolerate — log and keep +# polling, only hard-fail at the deadline. Pre-bootstrap-watcher-fix +# (controlplane#245) this was a flake generator: workspace went +# failed→online inside our window but we bailed at the failed read. 
+log "7/11 Waiting for workspace(s) to reach status=online (up to 20 min — hermes cold boot)..." +WS_DEADLINE=$(( $(date +%s) + 1200 )) WS_TO_CHECK="$PARENT_ID" [ -n "$CHILD_ID" ] && WS_TO_CHECK="$WS_TO_CHECK $CHILD_ID" for wid in $WS_TO_CHECK; do + WS_LAST_STATUS="" + WS_FAILED_LOGGED=0 while true; do if [ "$(date +%s)" -gt "$WS_DEADLINE" ]; then - fail "Workspace $wid never reached online within 10 min" + WS_LAST_ERR=$(tenant_call GET "/workspaces/$wid" 2>/dev/null | \ + python3 -c "import json,sys; print(json.load(sys.stdin).get('last_sample_error',''))" 2>/dev/null || echo "") + fail "Workspace $wid never reached online within 20 min (last status=$WS_LAST_STATUS, err=$WS_LAST_ERR)" fi WS_JSON=$(tenant_call GET "/workspaces/$wid" 2>/dev/null || echo '{}') WS_STATUS=$(echo "$WS_JSON" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))" 2>/dev/null) + if [ "$WS_STATUS" != "$WS_LAST_STATUS" ]; then + log " $wid → $WS_STATUS" + WS_LAST_STATUS="$WS_STATUS" + fi case "$WS_STATUS" in online) break ;; - failed) fail "Workspace $wid status=failed: $(echo "$WS_JSON" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("last_sample_error",""))')" ;; + failed) + # Not a hard fail — bootstrap-watcher frequently marks failed at + # 5 min on hermes, then heartbeat recovers to online around 10-13 + # min when install.sh finishes. Log once per workspace so the CI + # output isn't spammy. + if [ "$WS_FAILED_LOGGED" = "0" ]; then + log " $wid transiently failed — waiting for heartbeat recovery (bootstrap-watcher deadline, see cp#245)" + WS_FAILED_LOGGED=1 + fi + sleep 10 + ;; *) sleep 10 ;; esac done @@ -326,9 +354,47 @@ print(parts[0].get('text', '') if parts else '') if [ -z "$AGENT_TEXT" ]; then fail "A2A returned no text. 
Raw: $A2A_RESP" fi + +# Specific error-class checks — each pattern caught a real P0 bug on +# 2026-04-23 that a generic "error|exception" check missed or misreported: +# +# "[hermes-agent error 401]" → gateway API_SERVER_KEY not propagated (hermes #12) +# "Invalid API key" → tenant auth chain (CP #238 race) +# "model_not_found" → hermes custom provider slug passthrough (#13) +# "Encrypted content is not supported" → hermes codex_responses API misroute (#14) +# "Unknown provider" → bridge misconfigured PROVIDER= (regression of #13 fix) +# "hermes-agent unreachable" → gateway process died +# +# Fail LOUD with the specific pattern so CI log + alert channel makes the +# regression unambiguous. +if echo "$AGENT_TEXT" | grep -qF "[hermes-agent error 401]"; then + fail "A2A — REGRESSION: hermes gateway auth broken (API_SERVER_KEY not in runtime env). See template-hermes#12. Raw: $AGENT_TEXT" +fi +if echo "$AGENT_TEXT" | grep -qF "hermes-agent unreachable"; then + fail "A2A — REGRESSION: hermes gateway process down. Check /var/log/hermes-gateway.log on the workspace EC2. Raw: $AGENT_TEXT" +fi +if echo "$AGENT_TEXT" | grep -qF "model_not_found"; then + fail "A2A — REGRESSION: model slug passed through with provider prefix. See template-hermes#13. Raw: $AGENT_TEXT" +fi +if echo "$AGENT_TEXT" | grep -qF "Encrypted content is not supported"; then + fail "A2A — REGRESSION: hermes custom provider hit /v1/responses instead of chat_completions. Config.yaml should declare api_mode: chat_completions. See template-hermes#14. Raw: $AGENT_TEXT" +fi +if echo "$AGENT_TEXT" | grep -qF "Unknown provider"; then + fail "A2A — REGRESSION: install.sh set PROVIDER to a value not in hermes's registry. Run 'hermes doctor' on the workspace to see valid values. Raw: $AGENT_TEXT" +fi +# Generic catch-all — falls through if none of the known regressions hit. 
if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then fail "A2A returned an error-shaped response: $AGENT_TEXT" fi + +# Content assertion — the prompt asks the model to reply with exactly "PONG". +# Real models produce "PONG" (possibly with minor wrapping); a broken pipeline +# that echoes the prompt back or returns truncated context won't. Normalize +# to uppercase before matching to tolerate "pong" / "Pong". +if ! echo "$AGENT_TEXT" | tr '[:lower:]' '[:upper:]' | grep -qF "PONG"; then + fail "A2A reply didn't contain expected PONG token. Real: $AGENT_TEXT" +fi + ok "A2A parent round-trip succeeded: \"${AGENT_TEXT:0:80}\"" # ─── 9. HMA + peers + activity (full mode) ───────────────────────────── diff --git a/tools/test-hermes-bridge.sh b/tools/test-hermes-bridge.sh new file mode 100755 index 00000000..a1ee6328 --- /dev/null +++ b/tools/test-hermes-bridge.sh @@ -0,0 +1,199 @@ +#!/usr/bin/env bash +# test-hermes-bridge.sh — regression tests for template-hermes install.sh's +# OpenAI bridge logic. Runs offline (no network, no docker, no CI dependency). +# +# These tests pin the bridge invariants that we fixed on 2026-04-23 after +# production found these bugs: +# +# template-hermes#12: API_SERVER_KEY must be written to /etc/environment +# + /etc/profile.d/ so molecule-runtime inherits it. +# +# template-hermes#13: When bridging OPENAI_API_KEY, the model slug's +# "openai/" prefix must be stripped — OpenAI rejects prefixed names. +# +# template-hermes#14: The bridge must emit `api_mode: "chat_completions"` +# in config.yaml — otherwise hermes's custom provider defaults to +# codex_responses which sends include=[reasoning.encrypted_content], +# rejected by gpt-4o/gpt-4.1. +# +# Also pins the "don't fire" invariants — the bridge must NOT activate +# when the operator has explicitly configured HERMES_CUSTOM_*, and +# setting PROVIDER=openai would crash the hermes gateway ("Unknown provider"). 
+# +# Invocation: +# +# bash tools/test-hermes-bridge.sh /path/to/template-hermes/install.sh +# +# Default path: ../molecule-ai-workspace-template-hermes/install.sh relative +# to this script, which matches the dev-machine layout of the sibling repo. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +INSTALL_SH="${1:-$SCRIPT_DIR/../../molecule-ai-workspace-template-hermes/install.sh}" + +if [ ! -f "$INSTALL_SH" ]; then + echo "error: install.sh not found at $INSTALL_SH" >&2 + echo "usage: $0 [install.sh-path]" >&2 + exit 2 +fi + +TMP=$(mktemp -d) +trap 'rm -rf "$TMP"' EXIT + +PASS=0 +FAIL=0 + +# run_case — extract just the bridge + config.yaml write blocks from +# install.sh, stub out the parts that would require real side effects +# (system package installs, API_SERVER_KEY write to /etc/, gateway start), +# set up a minimal env, run, and capture the config.yaml output. +# +# Args: +# $1 = test name +# $2+ = env assignments (e.g. OPENAI_API_KEY=xxx, HERMES_DEFAULT_MODEL=openai/gpt-4o) +run_case() { + local name="$1"; shift + local case_dir="$TMP/$name" + mkdir -p "$case_dir" + + # Build a minimal harness that: + # 1. Sources scripts/derive-provider.sh (real, from the template repo) + # 2. Applies the bridge if-block (inlined verbatim from install.sh) + # 3. Emits config.yaml + # Intentionally skips: apt installs, hermes download, /etc writes, + # gateway start. We care about the BRANCH LOGIC not the system effects. + local template_dir + template_dir=$(cd "$(dirname "$INSTALL_SH")" && pwd) + + HERMES_HOME="$case_dir" \ + bash -c " +set -euo pipefail +HERMES_HOME='$case_dir' +$(for kv in "$@"; do printf 'export %s\n' "$kv"; done) +# Source derive-provider from the real template repo +. '$template_dir/scripts/derive-provider.sh' +DEFAULT_MODEL=\"\${HERMES_DEFAULT_MODEL:-nousresearch/hermes-4-70b}\" + +# Bridge block — extracted 1:1 from install.sh (the shape must stay in sync). 
+if [ \"\${PROVIDER}\" = \"custom\" ] && [ -n \"\${OPENAI_API_KEY:-}\" ] && [ -z \"\${HERMES_CUSTOM_BASE_URL:-}\" ] && [ -z \"\${HERMES_CUSTOM_API_KEY:-}\" ]; then + export HERMES_CUSTOM_BASE_URL='https://api.openai.com/v1' + export HERMES_CUSTOM_API_KEY=\"\${OPENAI_API_KEY}\" + export HERMES_CUSTOM_API_MODE='chat_completions' + DEFAULT_MODEL=\"\${DEFAULT_MODEL#openai/}\" +fi + +# Emit config.yaml (same shape as install.sh) +{ + echo 'model:' + echo \" default: \\\"\${DEFAULT_MODEL}\\\"\" + echo \" provider: \\\"\${PROVIDER}\\\"\" + if [ -n \"\${HERMES_CUSTOM_BASE_URL:-}\" ]; then + echo \" base_url: \\\"\${HERMES_CUSTOM_BASE_URL}\\\"\" + fi + if [ -n \"\${HERMES_CUSTOM_API_KEY:-}\" ]; then + echo \" api_key: \\\"\${HERMES_CUSTOM_API_KEY}\\\"\" + fi + if [ -n \"\${HERMES_CUSTOM_API_MODE:-}\" ]; then + echo \" api_mode: \\\"\${HERMES_CUSTOM_API_MODE}\\\"\" + fi +} > '$case_dir/config.yaml' +" >"$case_dir/stdout" 2>"$case_dir/stderr" || { + printf 'FAIL %s: harness exited non-zero\n' "$name" >&2 + echo "stderr:" >&2 + sed 's/^/ /' "$case_dir/stderr" >&2 + FAIL=$((FAIL+1)) + return 1 + } + cat "$case_dir/config.yaml" +} + +# assert_in — assert a fragment appears in the config.yaml of the named case. 
+assert_in() { + local name="$1" pattern="$2" + if grep -qF "$pattern" "$TMP/$name/config.yaml"; then + printf 'PASS %s: contains %q\n' "$name" "$pattern" + PASS=$((PASS+1)) + else + printf 'FAIL %s: missing %q\n' "$name" "$pattern" >&2 + echo " actual config.yaml:" >&2 + sed 's/^/ /' "$TMP/$name/config.yaml" >&2 + FAIL=$((FAIL+1)) + fi +} + +assert_not_in() { + local name="$1" pattern="$2" + if grep -qF "$pattern" "$TMP/$name/config.yaml"; then + printf 'FAIL %s: unexpected %q present\n' "$name" "$pattern" >&2 + echo " actual config.yaml:" >&2 + sed 's/^/ /' "$TMP/$name/config.yaml" >&2 + FAIL=$((FAIL+1)) + else + printf 'PASS %s: absent %q\n' "$name" "$pattern" + PASS=$((PASS+1)) + fi +} + +# ─── Case 1: OpenAI bridge fires, strips prefix, sets api_mode ────────── +# Regression guard for #13 + #14. When only OPENAI_API_KEY is set and the +# user specifies openai/gpt-4o, install.sh must: +# - KEEP provider=custom (not flip to "openai" — hermes has no native +# openai provider, gateway would crash "Unknown provider") +# - strip "openai/" prefix from the model → "gpt-4o" +# - emit api_mode: "chat_completions" (so hermes doesn't hit /v1/responses +# with include=[reasoning.encrypted_content] which gpt-4o rejects) +run_case "openai-bridge-happy" \ + OPENAI_API_KEY=sk-test-abc \ + HERMES_DEFAULT_MODEL=openai/gpt-4o >/dev/null + +assert_in "openai-bridge-happy" 'default: "gpt-4o"' +assert_in "openai-bridge-happy" 'provider: "custom"' +assert_in "openai-bridge-happy" 'base_url: "https://api.openai.com/v1"' +assert_in "openai-bridge-happy" 'api_key: "sk-test-abc"' +assert_in "openai-bridge-happy" 'api_mode: "chat_completions"' +assert_not_in "openai-bridge-happy" 'provider: "openai"' +assert_not_in "openai-bridge-happy" 'default: "openai/gpt-4o"' + +# ─── Case 2: Bridge skipped when operator sets HERMES_CUSTOM_* ────────── +# When an operator points at a self-hosted vLLM or similar, the bridge +# must NOT overwrite their values. 
api_mode should NOT be forced to +# chat_completions (the operator might want codex_responses for o1 models). +run_case "operator-custom-wins" \ + OPENAI_API_KEY=sk-test-abc \ + HERMES_CUSTOM_BASE_URL=http://my-vllm:8080/v1 \ + HERMES_CUSTOM_API_KEY=operator-key \ + HERMES_DEFAULT_MODEL=openai/gpt-4o >/dev/null + +assert_in "operator-custom-wins" 'base_url: "http://my-vllm:8080/v1"' +assert_in "operator-custom-wins" 'api_key: "operator-key"' +assert_not_in "operator-custom-wins" 'api_mode: "chat_completions"' +assert_not_in "operator-custom-wins" 'base_url: "https://api.openai.com/v1"' + +# ─── Case 3: Non-custom providers untouched ───────────────────────────── +# An OPENROUTER_API_KEY should pick provider=openrouter (per +# derive-provider.sh), and the bridge must not fire. +run_case "openrouter-not-touched" \ + OPENROUTER_API_KEY=sk-or-test \ + OPENAI_API_KEY=sk-test-abc \ + HERMES_DEFAULT_MODEL=openai/gpt-4o >/dev/null + +assert_in "openrouter-not-touched" 'provider: "openrouter"' +assert_not_in "openrouter-not-touched" 'api_mode: "chat_completions"' +assert_not_in "openrouter-not-touched" 'base_url: "https://api.openai.com/v1"' +# openrouter keeps the full slug (it can resolve openai/gpt-4o) +assert_in "openrouter-not-touched" 'default: "openai/gpt-4o"' + +# ─── Case 4: Non-openai model on bridge path leaves slug alone ────────── +# If the bridge fires but the model isn't prefixed with openai/, we don't +# want to break the string. Prefix-strip is a no-op when the prefix isn't there. 
+run_case "non-prefixed-model" \ + OPENAI_API_KEY=sk-test-abc \ + HERMES_DEFAULT_MODEL=gpt-4o >/dev/null + +assert_in "non-prefixed-model" 'default: "gpt-4o"' + +# ─── Summary ──────────────────────────────────────────────────────────── +echo "" +echo "Hermes bridge test: PASS=$PASS FAIL=$FAIL" +[ "$FAIL" = "0" ] diff --git a/workspace-server/go.mod b/workspace-server/go.mod index b585328c..6c50916a 100644 --- a/workspace-server/go.mod +++ b/workspace-server/go.mod @@ -4,7 +4,7 @@ go 1.25.0 require ( github.com/DATA-DOG/go-sqlmock v1.5.2 - github.com/Molecule-AI/molecule-ai-plugin-github-app-auth v0.0.0-20260416194734-2cd28737f845 + github.com/Molecule-AI/molecule-ai-plugin-github-app-auth v0.0.0-20260421064811-7d98ae51e31d github.com/alicebob/miniredis/v2 v2.37.0 github.com/creack/pty v1.1.18 github.com/docker/docker v28.2.2+incompatible @@ -16,6 +16,7 @@ require ( github.com/google/uuid v1.6.0 github.com/gorilla/websocket v1.5.3 github.com/lib/pq v1.10.9 + github.com/opencontainers/image-spec v1.1.1 github.com/redis/go-redis/v9 v9.7.0 github.com/robfig/cron/v3 v3.0.1 golang.org/x/crypto v0.49.0 @@ -56,7 +57,6 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/morikuni/aec v1.1.0 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect - github.com/opencontainers/image-spec v1.1.1 // indirect github.com/pelletier/go-toml/v2 v2.2.2 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect @@ -78,4 +78,3 @@ require ( google.golang.org/protobuf v1.36.11 // indirect gotest.tools/v3 v3.5.2 // indirect ) - diff --git a/workspace-server/go.sum b/workspace-server/go.sum index 0e897247..681bb0cd 100644 --- a/workspace-server/go.sum +++ b/workspace-server/go.sum @@ -4,8 +4,8 @@ github.com/DATA-DOG/go-sqlmock v1.5.2 h1:OcvFkGmslmlZibjAjaHm3L//6LiuBgolP7Oputl github.com/DATA-DOG/go-sqlmock v1.5.2/go.mod h1:88MAG/4G7SMwSE3CeA0ZKzrT5CiOU3OJ+JlNzwDqpNU= github.com/Microsoft/go-winio v0.4.21 
h1:+6mVbXh4wPzUrl1COX9A+ZCvEpYsOBZ6/+kwDnvLyro= github.com/Microsoft/go-winio v0.4.21/go.mod h1:JPGBdM1cNvN/6ISo+n8V5iA4v8pBzdOpzfwIujj1a84= -github.com/Molecule-AI/molecule-ai-plugin-github-app-auth v0.0.0-20260416194734-2cd28737f845 h1:Pae8GmpJOP/Bpf2KE1FhdN3zoPSbV/tl25yiAqEc4lM= -github.com/Molecule-AI/molecule-ai-plugin-github-app-auth v0.0.0-20260416194734-2cd28737f845/go.mod h1:3a6LR/zd7FjR9ZwLTbytwYlWuCBsbCOVFlEg0WnoYiM= +github.com/Molecule-AI/molecule-ai-plugin-github-app-auth v0.0.0-20260421064811-7d98ae51e31d h1:GpYhP6FxaJZc1Ljy5/YJ9ZIVGvfOqZBmDolNr2S5x2g= +github.com/Molecule-AI/molecule-ai-plugin-github-app-auth v0.0.0-20260421064811-7d98ae51e31d/go.mod h1:3a6LR/zd7FjR9ZwLTbytwYlWuCBsbCOVFlEg0WnoYiM= github.com/alicebob/miniredis/v2 v2.37.0 h1:RheObYW32G1aiJIj81XVt78ZHJpHonHLHW7OLIshq68= github.com/alicebob/miniredis/v2 v2.37.0/go.mod h1:TcL7YfarKPGDAthEtl5NBeHZfeUQj6OXMm/+iu5cLMM= github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= diff --git a/workspace-server/internal/handlers/discovery.go b/workspace-server/internal/handlers/discovery.go index e27c256f..18ab225a 100644 --- a/workspace-server/internal/handlers/discovery.go +++ b/workspace-server/internal/handlers/discovery.go @@ -357,6 +357,18 @@ func validateDiscoveryCaller(ctx context.Context, c *gin.Context, workspaceID st tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization")) if tok == "" { + // Canvas hits this endpoint via session cookie, not bearer token. + // Add verifiedCPSession() as a fallback after the bearer check so + // SaaS canvas Peers tab doesn't 401. Self-hosted workspaces are + // unaffected — they have no CP session cookie. 
+ ok, presented := middleware.VerifiedCPSession(c.GetHeader("Cookie")) + if ok { + return nil + } + if presented { + c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid session"}) + return errors.New("invalid session") + } c.JSON(http.StatusUnauthorized, gin.H{"error": "missing workspace auth token"}) return errors.New("missing token") } diff --git a/workspace-server/internal/handlers/terminal_test.go b/workspace-server/internal/handlers/terminal_test.go index 930d1a28..326354c6 100644 --- a/workspace-server/internal/handlers/terminal_test.go +++ b/workspace-server/internal/handlers/terminal_test.go @@ -105,6 +105,183 @@ func TestTerminalConnect_KI005_RejectsUnauthorizedCrossWorkspace(t *testing.T) { } } +// TestKI005_SelfAccess_AlwaysAllowed — when callerID equals the target workspace +// ID the request always passes (self-access: workspace's own token reaches its +// own terminal without needing the hierarchy check). +func TestKI005_SelfAccess_AlwaysAllowed(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + mock.ExpectQuery("SELECT COALESCE"). + WithArgs("ws-self"). + WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow("")) + + h := NewTerminalHandler(nil) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-self"}} + c.Request = httptest.NewRequest("GET", "/workspaces/ws-self/terminal", nil) + // Self-access: X-Workspace-ID matches the route param, no auth needed. + c.Request.Header.Set("X-Workspace-ID", "ws-self") + + h.HandleConnect(c) + + // Self-access passes without any token check or CanCommunicate query. 
+ if w.Code != http.StatusServiceUnavailable { + t.Errorf("self-access: expected 503 (Docker unavailable), got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestKI005_CanCommunicatePeer_Allowed — when the caller and target are siblings +// (share a parent), CanCommunicate returns true and the terminal access is granted. +func TestKI005_CanCommunicatePeer_Allowed(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + // DB: caller workspace row for token validation. + mock.ExpectQuery("SELECT t.id, t.workspace_id"). + WithArgs(sqlmock.AnyArg()). + WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}). + AddRow("tok-caller", "ws-peer-a")) + + // DB: caller and target are siblings → CanCommunicate queries both. + mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id"). + WithArgs("ws-peer-a"). + WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}). + AddRow("ws-peer-a", "org-lead")) + mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id"). + WithArgs("ws-peer-b"). + WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}). + AddRow("ws-peer-b", "org-lead")) + + // DB: target workspace has no instance_id → local Docker path. + mock.ExpectQuery("SELECT COALESCE"). + WithArgs("ws-peer-b"). 
+ WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow("")) + + h := NewTerminalHandler(nil) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-peer-b"}} + c.Request = httptest.NewRequest("GET", "/workspaces/ws-peer-b/terminal", nil) + c.Request.Header.Set("X-Workspace-ID", "ws-peer-a") + c.Request.Header.Set("Authorization", "Bearer peer-token") + + h.HandleConnect(c) + + if w.Code != http.StatusServiceUnavailable { + t.Errorf("peer access: expected 503 (Docker unavailable), got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestKI005_CanCommunicateNonPeer_Forbidden — when caller and target have +// different parents (not siblings, not root-level), CanCommunicate returns +// false and the terminal access is blocked with 403. +func TestKI005_CanCommunicateNonPeer_Forbidden(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + // DB: caller workspace row for token validation. + mock.ExpectQuery("SELECT t.id, t.workspace_id"). + WithArgs(sqlmock.AnyArg()). + WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}). + AddRow("tok-attacker", "ws-attacker")) + + // DB: caller and target have different parents → CanCommunicate denies. + mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id"). + WithArgs("ws-attacker"). + WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}). + AddRow("ws-attacker", "org-a")) + mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id"). + WithArgs("ws-victim"). + WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}). 
+ AddRow("ws-victim", "org-b")) + + h := NewTerminalHandler(nil) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-victim"}} + c.Request = httptest.NewRequest("GET", "/workspaces/ws-victim/terminal", nil) + c.Request.Header.Set("X-Workspace-ID", "ws-attacker") + c.Request.Header.Set("Authorization", "Bearer attacker-token") + + h.HandleConnect(c) + + if w.Code != http.StatusForbidden { + t.Errorf("cross-workspace: expected 403, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestKI005_TokenMismatch_Unauthorized — when the bearer token belongs to a +// different workspace than the claimed X-Workspace-ID, ValidateToken fails +// and the request is rejected with 401 before CanCommunicate is checked. +func TestKI005_TokenMismatch_Unauthorized(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + // DB: token belongs to a different workspace than claimed — ValidateToken + // returns ErrInvalidToken (workspaceID mismatch). + mock.ExpectQuery("SELECT t.id, t.workspace_id"). + WithArgs(sqlmock.AnyArg()). 
+ WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"})) + + h := NewTerminalHandler(nil) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-target"}} + c.Request = httptest.NewRequest("GET", "/workspaces/ws-target/terminal", nil) + c.Request.Header.Set("X-Workspace-ID", "ws-claimed") + c.Request.Header.Set("Authorization", "Bearer wrong-workspace-token") + + h.HandleConnect(c) + + if w.Code != http.StatusUnauthorized { + t.Errorf("token mismatch: expected 401, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestKI005_NoXWorkspaceIDHeader_LegacyAllowed — when no X-Workspace-ID header +// is present (legacy canvas, direct browser access), the hierarchy check is +// skipped and the request proceeds to the container (standard WorkspaceAuth +// gates apply upstream). +func TestKI005_NoXWorkspaceIDHeader_LegacyAllowed(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + // DB: no instance_id → local Docker path. + mock.ExpectQuery("SELECT COALESCE"). + WithArgs("ws-legacy"). + WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow("")) + + h := NewTerminalHandler(nil) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-legacy"}} + c.Request = httptest.NewRequest("GET", "/workspaces/ws-legacy/terminal", nil) + // No X-Workspace-ID header: legacy access, no hierarchy check. + + h.HandleConnect(c) + + if w.Code != http.StatusServiceUnavailable { + t.Errorf("legacy access: expected 503 (Docker unavailable), got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + // TestOpenTunnelCmd_BuildsArgv guards against silent drift in the EIC // tunnel invocation (e.g. someone flipping --local-port to --port). 
func TestOpenTunnelCmd_BuildsArgv(t *testing.T) { diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index 5e74ee73..0ebb0503 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -402,6 +402,17 @@ func (h *WorkspaceHandler) issueAndInjectToken(ctx context.Context, workspaceID cfg.ConfigFiles = make(map[string][]byte) } cfg.ConfigFiles[".auth_token"] = []byte(token) + // Option B (issue #1877): write token to volume BEFORE ContainerStart. + // Pre-write eliminates the race window where a restarted container could + // read a stale /configs/.auth_token before WriteFilesToContainer runs. + // This call is best-effort — if it fails (or provisioner is nil in tests) + // we still log and fall through; the runtime's heartbeat.py will retry + // on 401 if needed. + if h.provisioner != nil { + if writeErr := h.provisioner.WriteAuthTokenToVolume(ctx, workspaceID, token); writeErr != nil { + log.Printf("Provisioner: warning — pre-write token to volume failed for %s: %v (token still injected via WriteFilesToContainer after start)", workspaceID, writeErr) + } + } log.Printf("Provisioner: injected fresh auth token for workspace %s into config volume", workspaceID) } diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go index 9c12b22a..ab1723cf 100644 --- a/workspace-server/internal/handlers/workspace_restart.go +++ b/workspace-server/internal/handlers/workspace_restart.go @@ -132,6 +132,17 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) { RebuildConfig: body.RebuildConfig, }) + // #239: rebuild_config=true — try org-templates as last-resort source so a + // workspace with a destroyed config volume can self-recover without admin + // intervention. Only fires when no other template was resolved above. 
+ if templatePath == "" && body.RebuildConfig { + if p, label := resolveOrgTemplate(h.configsDir, wsName); p != "" { + templatePath = p + configLabel = label + log.Printf("Restart: rebuild_config — using org-template %s for %s (%s)", label, wsName, id) + } + } + if templatePath == "" { log.Printf("Restart: reusing existing config volume for %s (%s)", wsName, id) } else { diff --git a/workspace-server/internal/middleware/session_auth.go b/workspace-server/internal/middleware/session_auth.go index 359e540d..33ce2ac2 100644 --- a/workspace-server/internal/middleware/session_auth.go +++ b/workspace-server/internal/middleware/session_auth.go @@ -193,7 +193,7 @@ func verifiedCPSession(cookieHeader string) (valid, presented bool) { client := &http.Client{Timeout: 3 * time.Second} req, err := http.NewRequest("GET", verifyURL, nil) if err != nil { - log.Printf("verifiedCPSession: build req: %v", err) + log.Printf("VerifiedCPSession: build req: %v", err) return false, true } req.Header.Set("Cookie", cookieHeader) @@ -201,7 +201,7 @@ func verifiedCPSession(cookieHeader string) (valid, presented bool) { resp, err := client.Do(req) if err != nil { - log.Printf("verifiedCPSession: upstream: %v", err) + log.Printf("VerifiedCPSession: upstream: %v", err) // NOTE: we deliberately do NOT cache transport failures. // Caching them would mean a 3s CP blip locks out all users // for the negative-TTL window. Next request retries. @@ -231,10 +231,10 @@ func verifiedCPSession(cookieHeader string) (valid, presented bool) { return true, true } -// VerifiedCPSession is the exported alias for handlers/discovery.go. -// Internal-only deployments (self-hosted / dev) where CP_UPSTREAM_URL -// is unset get (false, true) so the session path is skipped and the -// bearer token path runs as normal. +// VerifiedCPSession is the exported alias — callers in other packages +// (discovery.go, wsauth_middleware.go) use this name. 
Internal-only +// deployments (self-hosted/dev) where CP_UPSTREAM_URL is unset get +// (false, true) so the session path is skipped and bearer token auth runs. func VerifiedCPSession(cookieHeader string) (valid, presented bool) { return verifiedCPSession(cookieHeader) } diff --git a/workspace-server/internal/middleware/session_auth_test.go b/workspace-server/internal/middleware/session_auth_test.go index b60cc7f7..6e6e9a08 100644 --- a/workspace-server/internal/middleware/session_auth_test.go +++ b/workspace-server/internal/middleware/session_auth_test.go @@ -37,7 +37,7 @@ func mockCPServer(t *testing.T, status int, body string) (*httptest.Server, *ato func TestVerifiedCPSession_EmptyCookie(t *testing.T) { resetSessionCache() - ok, presented := verifiedCPSession("") + ok, presented := VerifiedCPSession("") if ok || presented { t.Errorf("empty cookie should be (false, false); got (%v, %v)", ok, presented) } @@ -47,7 +47,7 @@ func TestVerifiedCPSession_NoSlugConfigured(t *testing.T) { resetSessionCache() t.Setenv("CP_UPSTREAM_URL", "https://cp.test") t.Setenv("MOLECULE_ORG_SLUG", "") - ok, presented := verifiedCPSession("session=foo") + ok, presented := VerifiedCPSession("session=foo") // Without a slug we can't ask about tenant membership. Must // refuse (false, false) — caller falls through to bearer tier. if ok || presented { @@ -59,7 +59,7 @@ func TestVerifiedCPSession_NoCPConfigured(t *testing.T) { resetSessionCache() t.Setenv("CP_UPSTREAM_URL", "") t.Setenv("MOLECULE_ORG_SLUG", "acme") - ok, presented := verifiedCPSession("session=foo") + ok, presented := VerifiedCPSession("session=foo") // Self-hosted path: CP not configured, but cookie WAS presented. // Presented=true lets the caller know not to fall through to // bearer as if no credential arrived. 
@@ -74,7 +74,7 @@ func TestVerifiedCPSession_MemberTrue(t *testing.T) { t.Setenv("CP_UPSTREAM_URL", srv.URL) t.Setenv("MOLECULE_ORG_SLUG", "acme") - ok, presented := verifiedCPSession("session=valid") + ok, presented := VerifiedCPSession("session=valid") if !ok || !presented { t.Errorf("valid member should be (true, true); got (%v, %v)", ok, presented) } @@ -83,7 +83,7 @@ func TestVerifiedCPSession_MemberTrue(t *testing.T) { } // Second call must be served from cache. - ok, _ = verifiedCPSession("session=valid") + ok, _ = VerifiedCPSession("session=valid") if !ok { t.Errorf("cached call should still be true") } @@ -99,7 +99,7 @@ func TestVerifiedCPSession_MemberFalse(t *testing.T) { t.Setenv("CP_UPSTREAM_URL", srv.URL) t.Setenv("MOLECULE_ORG_SLUG", "acme") - ok, presented := verifiedCPSession("session=wrong-tenant") + ok, presented := VerifiedCPSession("session=wrong-tenant") if ok || !presented { t.Errorf("non-member should be (false, true); got (%v, %v)", ok, presented) } @@ -107,7 +107,7 @@ func TestVerifiedCPSession_MemberFalse(t *testing.T) { t.Fatalf("expected 1 upstream hit") } // Cached negatively. 
- _, _ = verifiedCPSession("session=wrong-tenant") + _, _ = VerifiedCPSession("session=wrong-tenant") if hits.Load() != 1 { t.Errorf("negative result should cache too; got %d hits", hits.Load()) } @@ -119,7 +119,7 @@ func TestVerifiedCPSession_Upstream401(t *testing.T) { t.Setenv("CP_UPSTREAM_URL", srv.URL) t.Setenv("MOLECULE_ORG_SLUG", "acme") - ok, presented := verifiedCPSession("session=expired") + ok, presented := VerifiedCPSession("session=expired") if ok || !presented { t.Errorf("401 upstream should be (false, true); got (%v, %v)", ok, presented) } @@ -131,7 +131,7 @@ func TestVerifiedCPSession_MalformedJSON(t *testing.T) { t.Setenv("CP_UPSTREAM_URL", srv.URL) t.Setenv("MOLECULE_ORG_SLUG", "acme") - ok, presented := verifiedCPSession("session=broken") + ok, presented := VerifiedCPSession("session=broken") if ok || !presented { t.Errorf("malformed body should be (false, true); got (%v, %v)", ok, presented) } @@ -143,7 +143,7 @@ func TestVerifiedCPSession_TransportErrorNotCached(t *testing.T) { t.Setenv("CP_UPSTREAM_URL", "http://127.0.0.1:1") t.Setenv("MOLECULE_ORG_SLUG", "acme") - ok, presented := verifiedCPSession("session=whatever") + ok, presented := VerifiedCPSession("session=whatever") if ok || !presented { t.Errorf("transport error should be (false, true); got (%v, %v)", ok, presented) } @@ -178,12 +178,12 @@ func TestVerifiedCPSession_CrossTenantIsolation(t *testing.T) { cookie := "session=shared-auth" t.Setenv("MOLECULE_ORG_SLUG", "acme") - if ok, _ := verifiedCPSession(cookie); !ok { + if ok, _ := VerifiedCPSession(cookie); !ok { t.Errorf("acme should say member=true") } t.Setenv("MOLECULE_ORG_SLUG", "bob") - if ok, _ := verifiedCPSession(cookie); ok { + if ok, _ := VerifiedCPSession(cookie); ok { t.Errorf("bob tenant must NOT accept acme cookie despite same session bytes") } if len(reqs) != 2 { diff --git a/workspace-server/internal/middleware/wsauth_middleware.go b/workspace-server/internal/middleware/wsauth_middleware.go index 66b8f261..a391fda3 
100644 --- a/workspace-server/internal/middleware/wsauth_middleware.go +++ b/workspace-server/internal/middleware/wsauth_middleware.go @@ -174,7 +174,7 @@ func AdminAuth(database *sql.DB) gin.HandlerFunc { // hosted / dev deploys without a CP fall through to the // bearer-only path unchanged. if cookieHeader := c.GetHeader("Cookie"); cookieHeader != "" { - if ok, _ := verifiedCPSession(cookieHeader); ok { + if ok, _ := VerifiedCPSession(cookieHeader); ok { c.Next() return } diff --git a/workspace-server/internal/provisioner/cp_provisioner_instance_id_test.go b/workspace-server/internal/provisioner/cp_provisioner_instance_id_test.go new file mode 100644 index 00000000..bb77c934 --- /dev/null +++ b/workspace-server/internal/provisioner/cp_provisioner_instance_id_test.go @@ -0,0 +1,127 @@ +package provisioner + +// Regression tests for PR #1738 (merged 2026-04-23) — CPProvisioner.Stop + +// IsRunning must look up the real EC2 instance_id (i-*) from the DB +// before calling the control plane, NOT pass the workspace UUID verbatim. +// +// Original bug: +// url := fmt.Sprintf("%s/cp/workspaces/%s?instance_id=%s", +// baseURL, workspaceID, workspaceID) +// ^^^^^^^^^^^^^^ +// sends UUID as instance_id +// +// AWS then rejects with InvalidInstanceID.Malformed, the next provision +// hits InvalidGroup.Duplicate on the leftover SG, and Save & Restart +// cascades into a full failure. Production incident 2026-04-22 on +// hongmingwang workspace a8af9d79 + recurrent on every SaaS workspace +// secret update that triggers a restart. +// +// These tests pin two invariants of the fix: +// 1. Stop + IsRunning query resolveInstanceID(ctx, workspaceID) BEFORE +// hitting CP, and use the returned i-* ID (not the workspace UUID) +// in the instance_id query param. +// 2. Empty instance_id → no CP call (idempotent no-op). 
+ +import ( + "context" + "net/http" + "net/http/httptest" + "testing" +) + +// TestStop_UsesRealInstanceIDNotWorkspaceUUID is the load-bearing +// regression guard for #1738. If someone reverts the resolveInstanceID +// lookup and ships the `workspaceID, workspaceID` version back, this +// test fails immediately. +func TestStop_UsesRealInstanceIDNotWorkspaceUUID(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{ + "ws-cd5c9906-bfd7-4e2a-8c0b-9f1e2d3a4b5c": "i-0a1b2c3d4e5f67890", + }) + + var sawInstance string + var sawPath string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + sawInstance = r.URL.Query().Get("instance_id") + sawPath = r.URL.Path + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + p := &CPProvisioner{ + baseURL: srv.URL, + orgID: "org-1", + sharedSecret: "s3cret", + adminToken: "tok-xyz", + httpClient: srv.Client(), + } + if err := p.Stop(context.Background(), "ws-cd5c9906-bfd7-4e2a-8c0b-9f1e2d3a4b5c"); err != nil { + t.Fatalf("Stop: %v", err) + } + + // Load-bearing assertion: the AWS-facing instance_id must be the + // i-* ID from the DB, NEVER the workspace UUID. + if sawInstance != "i-0a1b2c3d4e5f67890" { + t.Errorf("#1738 REGRESSION: instance_id query = %q, want i-0a1b2c3d4e5f67890. "+ + "CP would forward this to AWS TerminateInstances — a UUID triggers "+ + "InvalidInstanceID.Malformed and orphans the EC2. See PR #1738.", sawInstance) + } + + // Sanity: path still carries the workspace UUID (that's how CP looks + // up the row). Only the instance_id query param changed. + if sawPath != "/cp/workspaces/ws-cd5c9906-bfd7-4e2a-8c0b-9f1e2d3a4b5c" { + t.Errorf("path = %q, want /cp/workspaces/ws-cd5c9906-bfd7-4e2a-8c0b-9f1e2d3a4b5c", sawPath) + } +} + +// TestStop_NoInstanceIDSkipsCPCall — when the workspace has no EC2 on +// file (never provisioned, already deprovisioned, or external runtime), +// Stop must be a no-op. 
Calling CP with empty instance_id triggers the +// exact AWS error the fix was meant to prevent. +func TestStop_NoInstanceIDSkipsCPCall(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{}) // empty map → "" for everything + + called := false + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called = true + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()} + if err := p.Stop(context.Background(), "ws-never-provisioned"); err != nil { + t.Errorf("Stop with no instance_id should be no-op, got err: %v", err) + } + if called { + t.Error("#1738 REGRESSION: Stop hit CP with empty instance_id — would trigger " + + "InvalidInstanceID.Malformed downstream. Fix must short-circuit on empty lookup.") + } +} + +// TestIsRunning_UsesRealInstanceIDNotWorkspaceUUID mirrors the Stop test +// for IsRunning's GET /cp/workspaces/:id/status?instance_id=... path. +// Same class of bug, same acceptance criterion. 
+func TestIsRunning_UsesRealInstanceIDNotWorkspaceUUID(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{ + "ws-abc": "i-deadbeef", + }) + + var sawInstance string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + sawInstance = r.URL.Query().Get("instance_id") + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"state":"running"}`)) + })) + defer srv.Close() + + p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()} + running, err := p.IsRunning(context.Background(), "ws-abc") + if err != nil { + t.Fatalf("IsRunning: %v", err) + } + if !running { + t.Errorf("expected running=true") + } + if sawInstance != "i-deadbeef" { + t.Errorf("#1738 REGRESSION: IsRunning sent instance_id=%q, want i-deadbeef", sawInstance) + } +} diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index 481f09b7..ac04b15f 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -749,6 +749,41 @@ func (p *Provisioner) ReadFromVolume(ctx context.Context, volumeName, filePath s return clean, nil } +// WriteAuthTokenToVolume writes the workspace auth token into the config volume +// BEFORE the container starts, eliminating the token-injection race window where +// a restarted container could read a stale token from /configs/.auth_token before +// WriteFilesToContainer writes the new one. Issue #1877. +// +// Uses a throwaway alpine container to write directly to the named volume, +// bypassing the container lifecycle entirely. 
+func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, token string) error {
+	volName := ConfigVolumeName(workspaceID)
+	resp, err := p.cli.ContainerCreate(ctx, &container.Config{
+		Image: "alpine",
+		Cmd:   []string{"sh", "-c", `mkdir -p /vol && printf '%s' "$TOKEN" > /vol/.auth_token && chmod 0600 /vol/.auth_token`},
+		Env:   []string{"TOKEN=" + token},
+	}, &container.HostConfig{
+		Binds: []string{volName + ":/vol"}}, nil, nil, "")
+	if err != nil {
+		return fmt.Errorf("failed to create token-write container: %w", err)
+	}
+	defer p.cli.ContainerRemove(context.Background(), resp.ID, container.RemoveOptions{Force: true})
+	if err := p.cli.ContainerStart(ctx, resp.ID, container.StartOptions{}); err != nil {
+		return fmt.Errorf("failed to start token-write container: %w", err)
+	}
+	waitCh, errCh := p.cli.ContainerWait(ctx, resp.ID, container.WaitConditionNotRunning)
+	select {
+	case st := <-waitCh:
+		if st.StatusCode != 0 {
+			return fmt.Errorf("token-write container exited with status %d", st.StatusCode)
+		}
+	case writeErr := <-errCh:
+		return fmt.Errorf("token-write container exited with error: %w", writeErr)
+	}
+	log.Printf("Provisioner: wrote auth token to volume %s/.auth_token", volName)
+	return nil
+}
+
 // execInContainer runs a command inside a running container as root.
 // Best-effort: logs errors but does not fail the caller.
 func (p *Provisioner) execInContainer(ctx context.Context, containerID string, cmd []string) {
diff --git a/workspace/heartbeat.py b/workspace/heartbeat.py
index a67bec7b..1eb5b4fd 100644
--- a/workspace/heartbeat.py
+++ b/workspace/heartbeat.py
@@ -17,7 +17,7 @@ from pathlib import Path
 
 import httpx
 
-from platform_auth import auth_headers
+from platform_auth import auth_headers, refresh_cache
 
 logger = logging.getLogger(__name__)
 
@@ -102,6 +102,35 @@ class HeartbeatLoop:
             self._consecutive_failures = 0
         except Exception as e:
             self._consecutive_failures += 1
+            # Issue #1877: if heartbeat 401'd, re-read the token from disk
+            # and retry once. 
This handles the platform's token-rotation race
+            # where WriteFilesToContainer hasn't finished writing the new
+            # token before the runtime boots and caches the old value.
+            is_401 = False
+            if isinstance(e, httpx.HTTPStatusError) and e.response.status_code == 401:
+                is_401 = True
+            if is_401:
+                logger.warning("Heartbeat 401 for %s — refreshing token cache and retrying once", self.workspace_id)
+                refresh_cache()
+                try:
+                    (await client.post(
+                        f"{self.platform_url}/registry/heartbeat",
+                        json={
+                            "workspace_id": self.workspace_id,
+                            "error_rate": self.error_rate,
+                            "sample_error": self.sample_error,
+                            "active_tasks": self.active_tasks,
+                            "current_task": self.current_task,
+                            "uptime_seconds": int(time.time() - self.start_time),
+                        },
+                        headers=auth_headers(),
+                    )).raise_for_status()
+                    self._consecutive_failures = 0
+                    self.request_count += 1
+                except Exception:
+                    # Retry also failed — fall through to the normal
+                    # failure tracking below.
+                    pass
             if self._consecutive_failures <= 3 or self._consecutive_failures % MAX_CONSECUTIVE_FAILURES == 0:
                 logger.warning("Heartbeat failed (%d consecutive): %s", self._consecutive_failures, e)
             if self._consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
diff --git a/workspace/platform_auth.py b/workspace/platform_auth.py
index d4a1e180..39a17075 100644
--- a/workspace/platform_auth.py
+++ b/workspace/platform_auth.py
@@ -103,3 +103,14 @@ def clear_cache() -> None:
     files between cases."""
     global _cached_token
     _cached_token = None
+
+
+def refresh_cache() -> str | None:
+    """Force re-read of the token from disk, discarding the in-process cache.
+
+    Use this when a 401 response suggests the cached token is stale —
+    e.g. after the platform rotates tokens during a restart (issue #1877). 
+ Returns the (new) token value or None if not found/error.""" + global _cached_token + _cached_token = None + return get_token() diff --git a/workspace/plugins_registry/builtins.py b/workspace/plugins_registry/builtins.py index 9816ee85..c065aaff 100644 --- a/workspace/plugins_registry/builtins.py +++ b/workspace/plugins_registry/builtins.py @@ -24,7 +24,7 @@ Planned as the ecosystem matures (none are implemented yet — rule of three: promote a class here only after 3+ plugins ship the same custom shape via their own ``adapters/.py``): -* ``MCPServerAdaptor`` — install a plugin as an MCP server *(TODO)* +* :class:`MCPServerAdaptor` — install a plugin as an MCP server ✅ (issue #847) * ``DeepAgentsSubagentAdaptor`` — register a DeepAgents sub-agent (runtime-locked to deepagents) *(TODO)* * ``LangGraphSubgraphAdaptor`` — install a LangGraph sub-graph *(TODO)* @@ -339,5 +339,95 @@ def _deep_merge_hooks(existing: dict, fragment: dict) -> dict: for top_key, val in fragment.items(): if top_key == "hooks": continue - out.setdefault(top_key, val) + # mcpServers must be deep-merged: plugin A ships "firecrawl" and + # plugin B ships "github" → both entries land in settings.json. + # Using setdefault would skip the fragment's value when the key + # already exists, so we explicitly handle the dict case. + if top_key in out and isinstance(out[top_key], dict) and isinstance(val, dict): + out[top_key] = {**out[top_key], **val} + else: + out.setdefault(top_key, val) return out + + +# ---------------------------------------------------------------------- +# MCPServerAdaptor — issue #847. +# Promoted from custom adapters after 4 plugin proposals (molecule-firecrawl +# #512, molecule-github-mcp #520, molecule-browser-use #553, mcp-connector +# #573) all shipped the same pattern independently. +# ---------------------------------------------------------------------- + + +class MCPServerAdaptor: + """Sub-type adaptor for plugins that wrap an MCP server. 
+ + The plugin ships: + + * ``settings-fragment.json`` with an ``mcpServers`` block — standard + Claude Code ``claude_desktop_config`` format, e.g.: + + .. code-block:: json + + { + "mcpServers": { + "my-server": { + "command": "npx", + "args": ["-y", "@org/my-mcp-server"] + } + } + } + + * ``skills//SKILL.md`` (optional) — agentskills.io skill docs; + ``AgentskillsAdaptor`` logic handles these. + * ``rules/*.md`` (optional) — always-on prose appended to CLAUDE.md; + ``AgentskillsAdaptor`` logic handles these. + * ``setup.sh`` (optional) — install npm packages, build binaries, etc.; + ``AgentskillsAdaptor`` logic handles these. + + On ``install()``: + + 1. ``settings-fragment.json`` → ``_install_claude_layer()`` merges the + ``mcpServers`` block into ``/.claude/settings.json``. + Hooks are also merged via the same path (so MCP-server plugins + can also ship hooks if they need them). + 2. Skills + rules + setup.sh → delegated to ``AgentskillsAdaptor``. + + On ``uninstall()``: + + 1. Skills + rules → delegated to ``AgentskillsAdaptor.uninstall()``. + 2. ``mcpServers`` entries are intentionally **not** removed from + ``settings.json`` on uninstall. MCP server configurations are + often shared with other tools or manually curated, so removing + them could break a user's setup. The user must remove them + manually if desired. + + Usage — in the plugin's per-runtime adapter file: + + .. code-block:: python + + # plugins//adapters/claude_code.py + from plugins_registry.builtins import MCPServerAdaptor as Adaptor + """ + + def __init__(self, plugin_name: str, runtime: str) -> None: + self.plugin_name = plugin_name + self.runtime = runtime + + async def install(self, ctx: InstallContext) -> InstallResult: + result = InstallResult( + plugin_name=self.plugin_name, + runtime=self.runtime, + source="plugin", + ) + # 1. Merge mcpServers (and any hooks) from settings-fragment.json. + _install_claude_layer(ctx, result, self.plugin_name) + # 2. 
Skills + rules + setup.sh — reuse AgentskillsAdaptor logic. + sub = await AgentskillsAdaptor(self.plugin_name, self.runtime).install(ctx) + result.files_written.extend(sub.files_written) + result.warnings.extend(sub.warnings) + return result + + async def uninstall(self, ctx: InstallContext) -> None: + # Delegate to AgentskillsAdaptor for skills + rules cleanup. + # NOTE: mcpServers entries are intentionally NOT removed (see class docstring). + await AgentskillsAdaptor(self.plugin_name, self.runtime).uninstall(ctx) diff --git a/workspace/tests/test_plugins_builtins.py b/workspace/tests/test_plugins_builtins.py index 31d14cae..fe6b5607 100644 --- a/workspace/tests/test_plugins_builtins.py +++ b/workspace/tests/test_plugins_builtins.py @@ -481,3 +481,234 @@ def test_deep_merge_hooks_top_level_keys_merged(): # setdefault semantics: existing keys win, new keys are added assert result["someKey"] == "old" assert result["anotherKey"] == "value" + + +def test_deep_merge_hooks_mcpServers_deep_merged(): + """mcpServers dicts from two plugins must be merged, not replaced. + + Plugin A ships firecrawl, plugin B ships github → both land in the + final settings.json (issue #847 motivation). 
+ """ + existing = { + "mcpServers": { + "firecrawl": { + "command": "npx", + "args": ["-y", "@org/firecrawl-mcp"], + } + } + } + fragment = { + "mcpServers": { + "github": { + "command": "npx", + "args": ["-y", "@github/github-mcp-server"], + } + }, + "hooks": {}, + } + result = _deep_merge_hooks(existing, fragment) + assert "firecrawl" in result["mcpServers"] + assert "github" in result["mcpServers"] + # existing entries must not be overwritten + assert result["mcpServers"]["firecrawl"]["command"] == "npx" + + +def test_deep_merge_hooks_mcpServers_idempotent(): + """Re-merging the same mcpServers fragment must not duplicate entries.""" + fragment = { + "mcpServers": { + "firecrawl": {"command": "npx", "args": ["-y", "@org/firecrawl-mcp"]} + }, + "hooks": {}, + } + state = _deep_merge_hooks({}, fragment) + state = _deep_merge_hooks(state, fragment) + state = _deep_merge_hooks(state, fragment) + assert len(state["mcpServers"]) == 1 + + +def test_deep_merge_hooks_mcpServers_three_plugins(): + """Three plugins each contributing one mcpServer all land in final output.""" + state = {} + for name in ["firecrawl", "github", "browser-use"]: + fragment = { + "mcpServers": {name: {"command": "npx", "args": [f"-y @{name}"]}}, + "hooks": {}, + } + state = _deep_merge_hooks(state, fragment) + + assert set(state["mcpServers"].keys()) == {"firecrawl", "github", "browser-use"} + + +# --------------------------------------------------------------------------- +# MCPServerAdaptor tests — issue #847 +# --------------------------------------------------------------------------- + +from plugins_registry.builtins import MCPServerAdaptor # noqa: E402 + + +async def test_mcp_server_adaptor_install_writes_mcpServers(tmp_path: Path): + """install() must merge mcpServers from settings-fragment.json into settings.json.""" + plugin = tmp_path / "my-mcp-plugin" + plugin.mkdir() + (plugin / "settings-fragment.json").write_text( + json.dumps({ + "mcpServers": { + "my-server": { + "command": 
"npx", + "args": ["-y", "@org/my-mcp-server"], + } + } + }) + ) + # Also add a skill so we can verify AgentskillsAdaptor delegation. + (plugin / "skills" / "docs").mkdir(parents=True) + (plugin / "skills" / "docs" / "SKILL.md").write_text("# docs skill\n") + + configs = tmp_path / "configs" + configs.mkdir() + result = await MCPServerAdaptor("my-mcp-plugin", "claude_code").install( + _make_ctx(configs, plugin) + ) + + settings = json.loads((configs / ".claude" / "settings.json").read_text()) + assert "mcpServers" in settings + assert "my-server" in settings["mcpServers"] + assert settings["mcpServers"]["my-server"]["command"] == "npx" + # Skills were also installed (AgentskillsAdaptor delegation). + assert (configs / "skills" / "docs" / "SKILL.md").exists() + assert ".claude/settings.json" in result.files_written + + +async def test_mcp_server_adaptor_install_no_fragment_no_warning(tmp_path: Path): + """Plugin without settings-fragment.json must install silently (no settings.json created).""" + plugin = tmp_path / "bare-mcp" + plugin.mkdir() + configs = tmp_path / "configs" + configs.mkdir() + + result = await MCPServerAdaptor("bare-mcp", "claude_code").install( + _make_ctx(configs, plugin) + ) + # _install_claude_layer creates .claude dir, but no settings.json when + # there's no settings-fragment.json. + assert not (configs / ".claude" / "settings.json").exists() + assert result.warnings == [] + + +async def test_mcp_server_adaptor_uninstall_does_not_remove_mcpServers(tmp_path: Path): + """uninstall() must remove skills/rules but leave mcpServers in settings.json. + + Rationale: MCP server configs are often shared or manually curated; + removing them on plugin uninstall could break the user's environment. 
+ """ + plugin = tmp_path / "my-mcp-plugin" + plugin.mkdir() + (plugin / "settings-fragment.json").write_text( + json.dumps({ + "mcpServers": { + "my-server": { + "command": "npx", + "args": ["-y", "@org/my-mcp-server"], + } + } + }) + ) + (plugin / "rules").mkdir(parents=True) + (plugin / "rules" / "r.md").write_text("- my rule\n") + (plugin / "skills" / "s").mkdir(parents=True) + (plugin / "skills" / "s" / "SKILL.md").write_text("# skill\n") + + configs = tmp_path / "configs" + configs.mkdir() + adaptor = MCPServerAdaptor("my-mcp-plugin", "claude_code") + + await adaptor.install(_make_ctx(configs, plugin)) + assert (configs / "skills" / "s").exists() + assert "my-server" in json.loads((configs / ".claude" / "settings.json").read_text()).get("mcpServers", {}) + + await adaptor.uninstall(_make_ctx(configs, plugin)) + + # Skills and rules removed by AgentskillsAdaptor delegation. + assert not (configs / "skills" / "s").exists() + assert not (configs / "CLAUDE.md").exists() or "# Plugin: my-mcp-plugin" not in (configs / "CLAUDE.md").read_text() + # mcpServers intentionally kept. + settings = json.loads((configs / ".claude" / "settings.json").read_text()) + assert "mcpServers" in settings + assert "my-server" in settings["mcpServers"] + + +async def test_mcp_server_adaptor_install_merges_with_existing_settings(tmp_path: Path): + """install() must deep-merge mcpServers with an already-populated settings.json.""" + plugin = tmp_path / "second-mcp" + plugin.mkdir() + (plugin / "settings-fragment.json").write_text( + json.dumps({ + "mcpServers": { + "github": { + "command": "npx", + "args": ["-y", "@github/github-mcp-server"], + } + } + }) + ) + + configs = tmp_path / "configs" + configs.mkdir() + # Pre-existing settings.json with an mcpServer already present. 
+ claude_dir = configs / ".claude" + claude_dir.mkdir(parents=True) + (claude_dir / "settings.json").write_text( + json.dumps({ + "mcpServers": { + "firecrawl": { + "command": "npx", + "args": ["-y", "@firecrawl/firecrawl-mcp"], + } + } + }) + ) + + await MCPServerAdaptor("second-mcp", "claude_code").install(_make_ctx(configs, plugin)) + + settings = json.loads((claude_dir / "settings.json").read_text()) + assert "firecrawl" in settings["mcpServers"] + assert "github" in settings["mcpServers"] + + +async def test_mcp_server_adaptor_install_also_handles_hooks(tmp_path: Path): + """An MCPServer plugin can also ship PreToolUse/PostToolUse hooks via the + same settings-fragment.json; they must be merged without duplication.""" + plugin = tmp_path / "mcp-with-hooks" + plugin.mkdir() + (plugin / "hooks").mkdir(parents=True) + (plugin / "hooks" / "lint.sh").write_text("#!/bin/bash\necho ok\n") + (plugin / "hooks" / "lint.sh").chmod(0o755) + (plugin / "settings-fragment.json").write_text( + json.dumps({ + "mcpServers": { + "my-server": {"command": "npx", "args": ["-y", "@x/server"]} + }, + "hooks": { + "PreToolUse": [ + { + "matcher": "Bash", + "hooks": [{"type": "command", "command": "${CLAUDE_DIR}/hooks/lint.sh"}], + } + ] + }, + }) + ) + + configs = tmp_path / "configs" + configs.mkdir() + await MCPServerAdaptor("mcp-with-hooks", "claude_code").install(_make_ctx(configs, plugin)) + + settings = json.loads((configs / ".claude" / "settings.json").read_text()) + assert "my-server" in settings["mcpServers"] + assert len(settings["hooks"]["PreToolUse"]) == 1 + assert settings["hooks"]["PreToolUse"][0]["matcher"] == "Bash" + + +import json # noqa: E402 — also used in new tests above +