diff --git a/.github/workflows/auto-tag-runtime.yml b/.github/workflows/auto-tag-runtime.yml new file mode 100644 index 00000000..2b9070bc --- /dev/null +++ b/.github/workflows/auto-tag-runtime.yml @@ -0,0 +1,113 @@ +name: auto-tag-runtime + +# Auto-tag runtime releases on every merge to main that touches workspace/. +# This is the entry point of the runtime CD chain: +# +# merge PR → auto-tag-runtime (this) → publish-runtime → cascade → template +# image rebuilds → repull on hosts. +# +# Default bump is patch. Override via PR label `release:minor` or +# `release:major` BEFORE merging — the label is read off the merged PR +# associated with the push commit. +# +# Skips when: +# - The push isn't to main (other branches don't auto-release). +# - The merge commit message contains `[skip-release]` (escape hatch +# for cleanup PRs that touch workspace/ but shouldn't ship). + +on: + push: + branches: [main] + paths: + - "workspace/**" + - "scripts/build_runtime_package.py" + - ".github/workflows/auto-tag-runtime.yml" + - ".github/workflows/publish-runtime.yml" + +permissions: + contents: write # to push the new tag + pull-requests: read # to read labels off the merged PR + +concurrency: + # Serialize tag bumps so two near-simultaneous merges can't both think + # they're 0.1.6 and race to push the same tag. + group: auto-tag-runtime + cancel-in-progress: false + +jobs: + tag: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # need full tag history for `git describe` / sort + + - name: Skip when commit asks + id: skip + run: | + MSG=$(git log -1 --format=%B "${{ github.sha }}") + if echo "$MSG" | grep -qiE '\[skip-release\]|\[no-release\]'; then + echo "skip=true" >> "$GITHUB_OUTPUT" + echo "Commit message contains [skip-release] — no tag will be created." + else + echo "skip=false" >> "$GITHUB_OUTPUT" + fi + + - name: Determine bump kind from PR label + id: bump + if: steps.skip.outputs.skip != 'true' + env: + GH_TOKEN: ${{ github.token }} + run: | + # The merged PR for this push commit. `gh pr list --search` finds + # closed PRs whose merge commit matches; we take the first. + PR=$(gh pr list --state merged --search "${{ github.sha }}" --json number,labels --jq '.[0]' 2>/dev/null || echo "") + if [ -z "$PR" ] || [ "$PR" = "null" ]; then + echo "No merged PR found for ${{ github.sha }} — defaulting to patch bump." + echo "kind=patch" >> "$GITHUB_OUTPUT" + exit 0 + fi + LABELS=$(echo "$PR" | jq -r '.labels[].name') + if echo "$LABELS" | grep -qx 'release:major'; then + echo "kind=major" >> "$GITHUB_OUTPUT" + elif echo "$LABELS" | grep -qx 'release:minor'; then + echo "kind=minor" >> "$GITHUB_OUTPUT" + else + echo "kind=patch" >> "$GITHUB_OUTPUT" + fi + + - name: Compute next version from latest runtime-v* tag + id: version + if: steps.skip.outputs.skip != 'true' + run: | + # Find the highest runtime-vX.Y.Z tag. `sort -V` handles semver + # ordering; `grep` filters to the right tag prefix. + LATEST=$(git tag --list 'runtime-v*' | sort -V | tail -1) + if [ -z "$LATEST" ]; then + # No prior tag — start the runtime line at 0.1.0. + CURRENT="0.0.0" + else + CURRENT="${LATEST#runtime-v}" + fi + MAJOR=$(echo "$CURRENT" | cut -d. -f1) + MINOR=$(echo "$CURRENT" | cut -d. -f2) + PATCH=$(echo "$CURRENT" | cut -d. 
-f3) + case "${{ steps.bump.outputs.kind }}" in + major) MAJOR=$((MAJOR+1)); MINOR=0; PATCH=0;; + minor) MINOR=$((MINOR+1)); PATCH=0;; + patch) PATCH=$((PATCH+1));; + esac + NEW="$MAJOR.$MINOR.$PATCH" + echo "current=$CURRENT" >> "$GITHUB_OUTPUT" + echo "new=$NEW" >> "$GITHUB_OUTPUT" + echo "Bumping runtime $CURRENT → $NEW (${{ steps.bump.outputs.kind }})" + + - name: Push new tag + if: steps.skip.outputs.skip != 'true' + run: | + NEW_TAG="runtime-v${{ steps.version.outputs.new }}" + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git tag -a "$NEW_TAG" -m "runtime $NEW_TAG (auto-bump from ${{ steps.bump.outputs.kind }})" + git push origin "$NEW_TAG" + echo "Pushed $NEW_TAG — publish-runtime workflow will fire on the tag." diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2ee5fe5b..a9902658 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -283,7 +283,9 @@ jobs: cache: pip cache-dependency-path: workspace/requirements.txt - run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov - - run: python -m pytest --tb=short -q --cov=. --cov-report=term-missing + # Coverage flags + fail-under floor moved into workspace/pytest.ini + # (issue #1817) so local `pytest` and CI use identical config. + - run: python -m pytest --tb=short # SDK + plugin validation moved to standalone repo: # github.com/Molecule-AI/molecule-sdk-python diff --git a/.github/workflows/publish-runtime.yml b/.github/workflows/publish-runtime.yml new file mode 100644 index 00000000..61054f8a --- /dev/null +++ b/.github/workflows/publish-runtime.yml @@ -0,0 +1,161 @@ +name: publish-runtime + +# Publishes molecule-ai-workspace-runtime to PyPI from monorepo workspace/. +# Monorepo workspace/ is the only source-of-truth for runtime code; this +# workflow is the bridge from monorepo edits to the PyPI artifact that +# the 8 workspace-template-* repos depend on. +# +# Triggered by: +# - Pushing a tag matching `runtime-vX.Y.Z` (the version is derived from +# the tag — `runtime-v0.1.6` publishes `0.1.6`). +# - Manual workflow_dispatch with an explicit `version` input (useful for +# dev/test releases without tagging the repo). +# +# The workflow: +# 1. Runs scripts/build_runtime_package.py to copy workspace/ → +# build/molecule_runtime/ with imports rewritten (`a2a_client` → +# `molecule_runtime.a2a_client`). +# 2. Builds wheel + sdist with `python -m build`. +# 3. Publishes to PyPI via twine + repo secret PYPI_TOKEN. +# +# After publish: the 8 template repos pick up the new version on their +# next image rebuild (their requirements.txt pin +# `molecule-ai-workspace-runtime>=0.1.0`, so any new release is eligible). +# To force-pull immediately, bump the pin in each template repo's +# requirements.txt and merge — that triggers their own publish-image.yml. + +on: + push: + tags: + - "runtime-v*" + workflow_dispatch: + inputs: + version: + description: "Version to publish (e.g. 0.1.6). Required for manual dispatch." + required: true + type: string + +permissions: + contents: read + +jobs: + publish: + runs-on: ubuntu-latest + environment: pypi-publish + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + + - name: Derive version from tag or input + id: version + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + VERSION="${{ inputs.version }}" + else + # Tag is `runtime-vX.Y.Z` — strip the prefix. 
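+          # e.g. GITHUB_REF_NAME=runtime-v0.1.6 → VERSION=0.1.6 (per the header: the tag IS the version).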
+ VERSION="${GITHUB_REF_NAME#runtime-v}" + fi + if ! echo "$VERSION" | grep -qE '^[0-9]+\.[0-9]+\.[0-9]+(\.dev[0-9]+|rc[0-9]+|a[0-9]+|b[0-9]+|\.post[0-9]+)?$'; then + echo "::error::version $VERSION does not match PEP 440" + exit 1 + fi + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + echo "Publishing molecule-ai-workspace-runtime $VERSION" + + - name: Install build tooling + run: pip install build twine + + - name: Build package from workspace/ + run: | + python scripts/build_runtime_package.py \ + --version "${{ steps.version.outputs.version }}" \ + --out "${{ runner.temp }}/runtime-build" + + - name: Build wheel + sdist + working-directory: ${{ runner.temp }}/runtime-build + run: python -m build + + - name: Verify package contents (sanity) + working-directory: ${{ runner.temp }}/runtime-build + run: | + python -m twine check dist/* + # Smoke-import the built wheel to catch import-rewrite mistakes + # before they hit PyPI. The package depends on a2a-sdk + httpx + # via pyproject; install those so the smoke import resolves. + python -m venv /tmp/smoke + /tmp/smoke/bin/pip install --quiet dist/*.whl + WORKSPACE_ID=00000000-0000-0000-0000-000000000000 \ + PLATFORM_URL=http://localhost:8080 \ + /tmp/smoke/bin/python -c " + from molecule_runtime import a2a_client, a2a_tools + from molecule_runtime.builtin_tools import memory + from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig + assert a2a_client._A2A_QUEUED_PREFIX, 'queued prefix missing — chat-leak fix not in build' + print('✓ smoke import passed') + " + + - name: Publish to PyPI + working-directory: ${{ runner.temp }}/runtime-build + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + run: python -m twine upload dist/* + + cascade: + # After PyPI accepts the upload, fan out a repository_dispatch to each + # template repo so they rebuild their image against the new runtime. + # Each template's `runtime-published.yml` receiver picks up the event, + # pulls the new PyPI version (their requirements.txt pin is `>=`), and + # republishes ghcr.io/molecule-ai/workspace-template-:latest. + # + # Soft-fail per repo: if one template's dispatch fails (perms missing, + # repo archived, etc.) we still try the others and surface the failures + # in the workflow summary instead of aborting the whole cascade. + needs: publish + runs-on: ubuntu-latest + steps: + - name: Fan out repository_dispatch + env: + # Fine-grained PAT with `actions:write` on the 8 template repos. + # GITHUB_TOKEN can't fire dispatches across repos — needs an explicit + # token. Stored as a repo secret; rotate per the standard schedule. + DISPATCH_TOKEN: ${{ secrets.TEMPLATE_DISPATCH_TOKEN }} + RUNTIME_VERSION: ${{ needs.publish.outputs.version || steps.version.outputs.version }} + run: | + set +e # don't abort on a single repo failure — collect them all + if [ -z "$DISPATCH_TOKEN" ]; then + echo "::warning::TEMPLATE_DISPATCH_TOKEN secret not set — skipping cascade. PyPI was published; templates will pick up the new version on their own next rebuild." + exit 0 + fi + # Re-derive version from the tag here too (in case publish job + # didn't expose an output the previous step's reference reads). 
+ VERSION="${GITHUB_REF_NAME#runtime-v}" + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + VERSION="${{ inputs.version }}" + fi + TEMPLATES="claude-code langgraph crewai autogen deepagents hermes gemini-cli openclaw" + FAILED="" + for tpl in $TEMPLATES; do + REPO="Molecule-AI/molecule-ai-workspace-template-$tpl" + STATUS=$(curl -sS -o /tmp/dispatch.out -w "%{http_code}" \ + -X POST "https://api.github.com/repos/$REPO/dispatches" \ + -H "Authorization: Bearer $DISPATCH_TOKEN" \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + -d "{\"event_type\":\"runtime-published\",\"client_payload\":{\"runtime_version\":\"$VERSION\"}}") + if [ "$STATUS" = "204" ]; then + echo "✓ dispatched $tpl ($VERSION)" + else + echo "::warning::✗ failed to dispatch $tpl: HTTP $STATUS — $(cat /tmp/dispatch.out)" + FAILED="$FAILED $tpl" + fi + done + if [ -n "$FAILED" ]; then + echo "::warning::Cascade incomplete. Failed templates:$FAILED" + # Don't fail the whole job — PyPI publish already succeeded; + # operators can retry the failed templates manually. + fi diff --git a/.github/workflows/runtime-pin-compat.yml b/.github/workflows/runtime-pin-compat.yml new file mode 100644 index 00000000..976c3194 --- /dev/null +++ b/.github/workflows/runtime-pin-compat.yml @@ -0,0 +1,77 @@ +name: Runtime Pin Compatibility + +# CI gate that prevents the 5-hour staging outage from 2026-04-24 from +# recurring (controlplane#253). The original failure mode: +# 1. molecule-ai-workspace-runtime 0.1.13 declared `a2a-sdk<1.0` in its +# requires_dist metadata (incorrect — it actually imports +# a2a.server.routes which only exists in a2a-sdk 1.0+) +# 2. `pip install molecule-ai-workspace-runtime` resolved cleanly +# 3. `from molecule_runtime.main import main_sync` raised ImportError +# 4. Every tenant workspace crashed; the canary tenant caught it but +# only after 5 hours of degraded staging +# +# This workflow installs the runtime in a fresh Python venv from PyPI +# and tries the same import the EC2 user-data does. If pip resolution +# silently produces a broken combo, this gate fails before the tenant +# image gets published. + +on: + push: + branches: [main, staging] + paths: + - 'workspace/requirements.txt' + - '.github/workflows/runtime-pin-compat.yml' + pull_request: + branches: [main, staging] + paths: + - 'workspace/requirements.txt' + - '.github/workflows/runtime-pin-compat.yml' + # Daily catch for upstream PyPI publishes that break the pin combo + # without any change in our repo (e.g. someone re-yanks an a2a-sdk + # release or molecule-ai-workspace-runtime publishes a bad bump). + schedule: + - cron: '0 13 * * *' # 06:00 PT + workflow_dispatch: + # Required-check support: when this becomes a branch-protection gate, + # merge_group runs let the queue green-check this in addition to PRs. + merge_group: + types: [checks_requested] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + default-install: + name: Default install + import smoke + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: pip + cache-dependency-path: workspace/requirements.txt + - name: Install runtime + workspace requirements + # Install order is load-bearing: install the runtime FIRST so pip + # honors whatever a2a-sdk constraint the runtime metadata declares + # (this is the surface that broke in 2026-04-24 — runtime declared + # `a2a-sdk<1.0` but actually needed >=1.0). 
The follow-up install + # of workspace/requirements.txt then upgrades a2a-sdk to the + # constraint our runtime image actually pins. The import smoke + # below verifies the upgraded combination is consistent. + run: | + python -m venv /tmp/venv + /tmp/venv/bin/pip install --upgrade pip + /tmp/venv/bin/pip install molecule-ai-workspace-runtime + /tmp/venv/bin/pip install -r workspace/requirements.txt + /tmp/venv/bin/pip show molecule-ai-workspace-runtime a2a-sdk \ + | grep -E '^(Name|Version):' + - name: Smoke import — fail if metadata declares deps that don't satisfy real imports + # WORKSPACE_ID is validated at import time by platform_auth.py — EC2 + # user-data sets it from the cloud-init template; set a placeholder + # here so the import smoke doesn't trip on the env-var guard. + env: + WORKSPACE_ID: 00000000-0000-0000-0000-000000000001 + run: | + /tmp/venv/bin/python -c "from molecule_runtime.main import main_sync; print('runtime imports OK')" diff --git a/.github/workflows/sweep-cf-orphans.yml b/.github/workflows/sweep-cf-orphans.yml new file mode 100644 index 00000000..7fb35328 --- /dev/null +++ b/.github/workflows/sweep-cf-orphans.yml @@ -0,0 +1,124 @@ +name: Sweep stale Cloudflare DNS records + +# Janitor for Cloudflare DNS records whose backing tenant/workspace no +# longer exists. Without this loop, every short-lived E2E or canary +# leaves a CF record on the moleculesai.app zone — the zone has a +# 200-record quota (controlplane#239 hit it 2026-04-23+) and provisions +# start failing with code 81045 once exhausted. +# +# Why a separate workflow vs sweep-stale-e2e-orgs.yml: +# - That workflow operates at the CP layer (DELETE /cp/admin/tenants/:slug +# drives the cascade). It assumes CP has the org row to drive the +# deprovision from. It doesn't catch records left behind when CP +# itself never knew about the tenant (canary scratch, manual ops +# experiments) or when the cascade's CF-delete branch failed. +# - sweep-cf-orphans.sh enumerates the CF zone directly and matches +# each record against live CP slugs + AWS EC2 names. It catches +# leaks the CP-driven sweep can't. +# +# Safety: the script's own MAX_DELETE_PCT gate refuses to nuke more +# than 50% of records in a single run. If something has gone weird +# (CP admin endpoint returns no orgs → every tenant looks orphan) the +# gate halts before damage. Decision-function unit tests in +# scripts/ops/test_sweep_cf_decide.py (#2027) cover the rule +# classifier. + +on: + schedule: + # Hourly. Mirrors sweep-stale-e2e-orgs cadence so the two janitors + # converge on the same tick. CF API rate budget is generous (1200 + # req/5min); a single sweep makes ~1 list + N deletes (N<=quota/2). + - cron: '15 * * * *' # offset from sweep-stale-e2e-orgs (top of hour) + workflow_dispatch: + inputs: + dry_run: + description: "Dry run only — list what would be deleted, no deletion" + required: false + type: boolean + default: true + max_delete_pct: + description: "Override safety gate (default 50, set higher only for major cleanup)" + required: false + default: "50" + # No `merge_group:` trigger on purpose. This is a janitor — it doesn't + # need to gate merges, and including it as written before #2088 fired + # the full sweep job (or its secret-check) on every PR going through + # the merge queue, generating one red CI run per merge-queue eval. 
If + # this workflow is ever wired up as a required check, re-add + # merge_group: { types: [checks_requested] } + # AND gate the sweep step with `if: github.event_name != 'merge_group'` + # so merge-queue evals report success without actually running. + +# Don't let two sweeps race the same zone. workflow_dispatch during a +# scheduled run would otherwise issue duplicate DELETE calls. +concurrency: + group: sweep-cf-orphans + cancel-in-progress: false + +permissions: + contents: read + +jobs: + sweep: + name: Sweep CF orphans + runs-on: ubuntu-latest + # 3 min surfaces hangs (CF API stall, AWS describe-instances stuck) + # within one cron interval instead of burning a full tick. Realistic + # worst case is ~2 min: 4 sequential curls + 1 aws + N×CF-DELETE + # each individually capped at 10s by the script's curl -m flag. + timeout-minutes: 3 + env: + CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }} + CF_ZONE_ID: ${{ secrets.CF_ZONE_ID }} + CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }} + CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-east-2 + MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }} + + steps: + - uses: actions/checkout@v4 + + - name: Verify required secrets present + id: verify + # Soft skip when secrets aren't configured. The 6 secrets have + # to be set on the repo manually before this workflow can do + # real work; until they are, the schedule is a no-op rather + # than a recurring red CI run. workflow_dispatch surfaces a + # warning so an operator running it ad-hoc sees the gap. + run: | + missing=() + for var in CF_API_TOKEN CF_ZONE_ID CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do + if [ -z "${!var:-}" ]; then + missing+=("$var") + fi + done + if [ ${#missing[@]} -gt 0 ]; then + echo "::warning::skipping sweep — secrets not yet configured: ${missing[*]}" + echo "skip=true" >> "$GITHUB_OUTPUT" + exit 0 + fi + echo "All required secrets present ✓" + echo "skip=false" >> "$GITHUB_OUTPUT" + + - name: Run sweep + if: steps.verify.outputs.skip != 'true' + # Schedule-vs-dispatch dry-run asymmetry (intentional): + # - Scheduled runs: github.event.inputs.dry_run is empty → + # defaults to "false" below → script runs with --execute + # (the whole point of an hourly janitor). + # - Manual workflow_dispatch: input default is true (line 38) + # so an ad-hoc operator-triggered run is dry-run by default; + # they have to flip the toggle to actually delete. + # The script's MAX_DELETE_PCT gate (default 50%) is the second + # line of defense regardless of mode. + run: | + set -euo pipefail + if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then + echo "Running in dry-run mode — no deletions" + bash scripts/ops/sweep-cf-orphans.sh + else + echo "Running with --execute — will delete identified orphans" + bash scripts/ops/sweep-cf-orphans.sh --execute + fi diff --git a/.github/workflows/test-ops-scripts.yml b/.github/workflows/test-ops-scripts.yml new file mode 100644 index 00000000..9a3a5fa3 --- /dev/null +++ b/.github/workflows/test-ops-scripts.yml @@ -0,0 +1,36 @@ +name: Ops Scripts Tests + +# Runs the unittest suite for scripts/ops/ on every PR + push that touches +# the directory. Kept separate from the main CI so a script-only change +# doesn't trigger the heavier Go/Canvas/Python pipelines. 
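+#
+# Local equivalent of the CI job below (same discovery pattern it runs):
+#   cd scripts/ops && python -m unittest discover -p 'test_*.py' -v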
+ +on: + push: + branches: [main, staging] + paths: + - 'scripts/ops/**' + - '.github/workflows/test-ops-scripts.yml' + pull_request: + branches: [main, staging] + paths: + - 'scripts/ops/**' + - '.github/workflows/test-ops-scripts.yml' + merge_group: + types: [checks_requested] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + test: + name: Ops scripts (unittest) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Run unittest + working-directory: scripts/ops + run: python -m unittest discover -p 'test_*.py' -v diff --git a/canvas/e2e/staging-setup.ts b/canvas/e2e/staging-setup.ts index 963f9ccb..122efdf9 100644 --- a/canvas/e2e/staging-setup.ts +++ b/canvas/e2e/staging-setup.ts @@ -46,7 +46,16 @@ const TENANT_DOMAIN = process.env.STAGING_TENANT_DOMAIN || "staging.moleculesai. // were blocking staging→main syncs on 2026-04-24. const PROVISION_TIMEOUT_MS = 20 * 60 * 1000; const WORKSPACE_ONLINE_TIMEOUT_MS = 20 * 60 * 1000; -const TLS_TIMEOUT_MS = 3 * 60 * 1000; + +// TLS readiness depends on (1) Cloudflare DNS propagation through the +// edge, (2) the tenant's CF Tunnel registering the new hostname, (3) +// CF's edge ACME cert provisioning + cache. Each of these layers can +// add 1-3 min on its own under heavy staging load. The original 3-min +// cap blocked four cycles of staging→main PRs across 2026-04-24+. +// 10 min stays inside the 20-min PROVISION_TIMEOUT envelope (so a +// genuinely-stuck tenant still fails-loud at the provision step) but +// absorbs the realistic worst case for a one-shot tenant TLS handshake. +const TLS_TIMEOUT_MS = 10 * 60 * 1000; async function jsonFetch( url: string, diff --git a/canvas/src/components/ProvisioningTimeout.tsx b/canvas/src/components/ProvisioningTimeout.tsx index 1c09fa3b..2f2ee564 100644 --- a/canvas/src/components/ProvisioningTimeout.tsx +++ b/canvas/src/components/ProvisioningTimeout.tsx @@ -71,15 +71,22 @@ export function ProvisioningTimeout({ // Runtime included so the timeout threshold can be resolved per-node // (hermes cold-boot legitimately takes 8-13 min vs 30-90s for docker // runtimes — a single threshold would false-alarm on one or the other). - // Separator: `|` between fields, `,` between nodes. Names may contain - // anything the user typed; strip `|` and `,` so serialization round-trips. + // provisionTimeoutMs added by #2054 — server-declared per-workspace + // override that wins over the runtime profile when present. + // Separator: `|` between fields, `,` between nodes. Only `name` is + // user-typed (gets sanitized below); the other fields are + // primitive-typed (id is a UUID, runtime is a [a-z-]+ slug, + // provisionTimeoutMs is numeric). If a future field is string-typed, + // extend the sanitize step to strip `|` + `,` from it too. + // Empty-string sentinels for missing values so split/index stays positional. const provisioningNodes = useCanvasStore((s) => { const result = s.nodes .filter((n) => n.data.status === "provisioning") .map((n) => { const safeName = (n.data.name ?? "").replace(/[|,]/g, " "); const runtime = n.data.runtime ?? ""; - return `${n.id}|${safeName}|${runtime}`; + const provisionTimeoutMs = n.data.provisionTimeoutMs ?? ""; + return `${n.id}|${safeName}|${runtime}|${provisionTimeoutMs}`; }); return result.join(","); }); @@ -87,8 +94,14 @@ export function ProvisioningTimeout({ () => provisioningNodes ? 
provisioningNodes.split(",").map((entry) => { - const [id, name, runtime] = entry.split("|"); - return { id, name, runtime }; + const [id, name, runtime, provisionTimeoutMs] = entry.split("|"); + const ptms = provisionTimeoutMs ? Number(provisionTimeoutMs) : undefined; + return { + id, + name, + runtime, + provisionTimeoutMs: Number.isFinite(ptms) ? ptms : undefined, + }; }) : [], [provisioningNodes], @@ -138,10 +151,19 @@ export function ProvisioningTimeout({ // default), then scales by concurrent-provisioning count. A // hermes workspace in a batch alongside two langgraph workspaces // gets hermes's 12-min base, not langgraph's 2-min base. + // + // Resolution priority (most specific wins): + // 1. node.provisionTimeoutMs — server-declared per-workspace + // override (#2054, sourced from template manifest) + // 2. timeoutMs prop — single-threshold test override + // 3. runtime profile in @/lib/runtimeProfiles + // 4. DEFAULT_RUNTIME_PROFILE for (const node of parsedProvisioningNodes) { const startedAt = tracking.get(node.id); if (!startedAt) continue; - const base = timeoutMs ?? provisionTimeoutForRuntime(node.runtime); + const base = provisionTimeoutForRuntime(node.runtime, { + provisionTimeoutMs: node.provisionTimeoutMs ?? timeoutMs, + }); const effective = effectiveTimeoutMs( base, parsedProvisioningNodes.length, diff --git a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx index 2424ea49..dedb1fb3 100644 --- a/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx +++ b/canvas/src/components/__tests__/ProvisioningTimeout.test.tsx @@ -287,5 +287,60 @@ describe("ProvisioningTimeout", () => { ); }); }); + + // #2054 — per-workspace server override threading from socket + // payload through node-data into ProvisioningTimeout's resolver. + // Doesn't render the component; verifies the data path lands the + // value where ProvisioningTimeout reads it from. + describe("server-side per-workspace override (#2054)", () => { + it("hydrate carries provision_timeout_ms onto node.data.provisionTimeoutMs", () => { + useCanvasStore.getState().hydrate([ + makeWS({ + id: "ws-slow", + name: "Slow", + status: "provisioning", + runtime: "future-runtime", + provision_timeout_ms: 600_000, + }), + ]); + const node = useCanvasStore + .getState() + .nodes.find((n) => n.id === "ws-slow"); + expect(node?.data.provisionTimeoutMs).toBe(600_000); + }); + + it("absent provision_timeout_ms hydrates to null (falls through to runtime profile)", () => { + useCanvasStore.getState().hydrate([ + makeWS({ id: "ws-default", name: "Default", status: "provisioning", runtime: "hermes" }), + ]); + const node = useCanvasStore + .getState() + .nodes.find((n) => n.id === "ws-default"); + expect(node?.data.provisionTimeoutMs).toBeNull(); + // And the resolver still returns hermes' profile value when + // no override is supplied — proves the fall-through stays intact. + expect( + provisionTimeoutForRuntime("hermes", { + provisionTimeoutMs: node?.data.provisionTimeoutMs ?? undefined, + }), + ).toBe(RUNTIME_PROFILES.hermes.provisionTimeoutMs); + }); + + it("server override wins over runtime profile via the resolver path the component uses", () => { + // Mirrors ProvisioningTimeout.tsx:144 where node.provisionTimeoutMs + // is passed as overrides — verifies the resolver respects it + // even when the runtime has its own profile entry. 
+ const override = 30_000; + expect( + provisionTimeoutForRuntime("hermes", { + provisionTimeoutMs: override, + }), + ).toBe(override); + // Sanity — the runtime profile would have been much larger. + expect(RUNTIME_PROFILES.hermes.provisionTimeoutMs).toBeGreaterThan( + override, + ); + }); + }); }); }); diff --git a/canvas/src/store/canvas-topology.ts b/canvas/src/store/canvas-topology.ts index 9c1cb25f..fbd02601 100644 --- a/canvas/src/store/canvas-topology.ts +++ b/canvas/src/store/canvas-topology.ts @@ -478,6 +478,9 @@ export function buildNodesAndEdges( needsRestart: false, budgetLimit: ws.budget_limit ?? null, budgetUsed: ws.budget_used ?? null, + // #2054 — server-declared per-workspace provisioning timeout. + // Falls through to the runtime profile when null/absent. + provisionTimeoutMs: ws.provision_timeout_ms ?? null, }, }; if (hasParent) { diff --git a/canvas/src/store/canvas.ts b/canvas/src/store/canvas.ts index 2cec82ea..02f93b25 100644 --- a/canvas/src/store/canvas.ts +++ b/canvas/src/store/canvas.ts @@ -92,6 +92,12 @@ export interface WorkspaceNodeData extends Record { budgetLimit: number | null; /** Cumulative USD spend. Present when the platform tracks spend (issue #541). */ budgetUsed?: number | null; + /** Per-workspace provisioning-timeout override in milliseconds (#2054). + * Sourced server-side from the workspace's template manifest at provision + * time. null/absent = fall through to runtime profile + default in + * @/lib/runtimeProfiles. Lets a slow runtime declare its cold-boot + * expectation without a canvas release. */ + provisionTimeoutMs?: number | null; } export type PanelTab = "details" | "skills" | "chat" | "terminal" | "config" | "schedule" | "channels" | "files" | "memory" | "traces" | "events" | "activity" | "audit"; diff --git a/canvas/src/store/socket.ts b/canvas/src/store/socket.ts index f350c4d7..858fc875 100644 --- a/canvas/src/store/socket.ts +++ b/canvas/src/store/socket.ts @@ -122,6 +122,13 @@ export interface WorkspaceData { budget_limit: number | null; /** Cumulative USD spend for this workspace. Present when the platform tracks spend. */ budget_used?: number | null; + /** Server-declared provisioning-timeout override in milliseconds (#2054). + * Sourced from the workspace's template manifest at provision time — + * lets a slow runtime declare its cold-boot expectation without a + * canvas release. Falls through to the per-runtime profile in + * `@/lib/runtimeProfiles` when absent (the default behavior for any + * template that hasn't yet declared the field). */ + provision_timeout_ms?: number | null; } let socket: ReconnectingSocket | null = null; diff --git a/docker-compose.yml b/docker-compose.yml index c9c88d7c..2be0d3f6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -126,6 +126,13 @@ services: REDIS_URL: redis://redis:6379 PORT: "${PLATFORM_PORT:-8080}" PLATFORM_URL: "http://platform:${PLATFORM_PORT:-8080}" + # Default MOLECULE_ENV=development so the WorkspaceAuth / AdminAuth + # middleware fail-open path activates when ADMIN_TOKEN is unset — + # otherwise the canvas (which runs without a bearer in pure local + # dev) gets 401 "missing workspace auth token" on every request. + # Override to "production" for SaaS/staged deploys; in those modes + # ADMIN_TOKEN must also be set or every request rejects. 
+ MOLECULE_ENV: "${MOLECULE_ENV:-development}" CORS_ORIGINS: ${CORS_ORIGINS:-http://localhost:${CANVAS_PUBLISH_PORT:-3000},http://127.0.0.1:${CANVAS_PUBLISH_PORT:-3000},http://localhost:3001} RATE_LIMIT: "${RATE_LIMIT:-1000}" CONFIGS_DIR: /configs @@ -153,6 +160,24 @@ services: HIBERNATION_IDLE_MINUTES: "${HIBERNATION_IDLE_MINUTES:-}" # Plugin supply chain hardening (issue #768 / PR #775). Never set in production. PLUGIN_ALLOW_UNPINNED: "${PLUGIN_ALLOW_UNPINNED:-}" + # Force ImagePull/ContainerCreate to request linux/amd64 manifests + # for the workspace-template-* images. The templates ship single-arch + # amd64 today; without this override, an arm64 host (Apple Silicon) + # asks the daemon for linux/arm64/v8, which doesn't match the manifest + # and the pull fails with "no matching manifest". Apple Silicon will + # run the amd64 image under Rosetta — slower (~2-3×) but functional. + # Override to "" or another platform when the templates start shipping + # multi-arch (then this hardcoded amd64 becomes unnecessary). + MOLECULE_IMAGE_PLATFORM: "${MOLECULE_IMAGE_PLATFORM:-linux/amd64}" + # GHCR auth for the workspace-images refresh endpoint + # (POST /admin/workspace-images/refresh). When set, the platform's + # Docker SDK ImagePull on private workspace-template-* images + # succeeds without per-host `docker login`. GHCR_USER is the GitHub + # username; GHCR_TOKEN is a fine-grained PAT with `read:packages` + # on the Molecule-AI org. Both unset → endpoint can only pull + # public images (current state for all 8 templates). + GHCR_USER: "${GHCR_USER:-}" + GHCR_TOKEN: "${GHCR_TOKEN:-}" volumes: - ./workspace-configs-templates:/configs - ./org-templates:/org-templates:ro diff --git a/docs/workspace-runtime-package.md b/docs/workspace-runtime-package.md index a24ce42b..aed86368 100644 --- a/docs/workspace-runtime-package.md +++ b/docs/workspace-runtime-package.md @@ -2,29 +2,67 @@ ## Overview -The shared workspace runtime infrastructure lives in two places: +The shared workspace runtime infrastructure has **one editable source** and +**one published artifact**: -1. **Source of truth (monorepo):** `workspace/` — this is where all development happens -2. **Published package:** [`molecule-ai-workspace-runtime`](https://pypi.org/project/molecule-ai-workspace-runtime/) on PyPI +1. **Source of truth (monorepo, editable):** `workspace/` — every runtime + change lands here. Edit it like any other monorepo code. +2. **Published artifact (PyPI, generated):** [`molecule-ai-workspace-runtime`](https://pypi.org/project/molecule-ai-workspace-runtime/) + — produced by `.github/workflows/publish-runtime.yml` on every + `runtime-vX.Y.Z` tag push. Do NOT edit this independently — it gets + overwritten on every publish. + +The legacy sibling repo `molecule-ai-workspace-runtime` (the GitHub repo, as +distinct from the PyPI package) is no longer the source-of-truth and should +be treated as a publish artifact only. It can be archived or used as a +read-only mirror. + +## Why this shape + +The 8 workspace template repos (claude-code, langgraph, hermes, etc.) each +build their own Docker image and `pip install molecule-ai-workspace-runtime` +from PyPI. PyPI is the right distribution channel — semver, reproducible +builds, no submodule dance per-repo. But the runtime ALSO needs to evolve +in lock-step with the platform's wire protocol (queue shape, A2A metadata, +event payloads). 
Shipping cross-cutting protocol changes as separate +runtime + platform PRs in two repos creates ordering pain and broken +intermediate states. + +The monorepo + auto-publish split gives both: edit cross-cutting changes +in one PR, publish the runtime artifact via a tag. ## What's in the package -Everything in `workspace/` except adapter-specific code: +Everything in `workspace/*.py` plus the `adapters/`, `builtin_tools/`, +`plugins_registry/`, `policies/`, `skill_loader/` subpackages. Build +artifacts (`Dockerfile`, `*.sh`, `pytest.ini`, `requirements.txt`) are +excluded. -- `molecule_runtime/` — all shared `.py` files (main.py, config.py, heartbeat.py, etc.) -- `molecule_runtime/adapters/` — `BaseAdapter`, `AdapterConfig`, `SetupResult`, `shared_runtime` -- `molecule_runtime/builtin_tools/` — delegation, memory, approvals, sandbox, telemetry -- `molecule_runtime/skill_loader/` — skill loading + hot-reload -- `molecule_runtime/plugins_registry/` — plugin discovery and install pipeline -- `molecule_runtime/policies/` — namespace routing policies -- Console script: `molecule-runtime` → `molecule_runtime.main:main_sync` +The build script rewrites bare imports so the published package is a +proper Python namespace: + +``` +# In monorepo workspace/: +from a2a_client import discover_peer +from builtin_tools.memory import store + +# In published molecule_runtime/ (auto-rewritten at publish time): +from molecule_runtime.a2a_client import discover_peer +from molecule_runtime.builtin_tools.memory import store +``` + +The closed allowlist of rewritten module names lives in +`scripts/build_runtime_package.py` (`TOP_LEVEL_MODULES` + `SUBPACKAGES`). +Add a new top-level module to workspace/? Add it to the allowlist in the +same PR. ## Adapter repos -Each of the 8 adapter repos now contains: +Each of the 8 adapter template repos contains: - `adapter.py` — runtime-specific `Adapter` class -- `requirements.txt` — `molecule-ai-workspace-runtime>=0.1.0` + adapter deps -- `Dockerfile` — standalone image (no longer extends workspace-template:base) +- `requirements.txt` — `molecule-ai-workspace-runtime>=0.1.X` + adapter deps +- `Dockerfile` — standalone image with `ENV ADAPTER_MODULE=adapter` and + `ENTRYPOINT ["molecule-runtime"]` | Adapter | Repo | |---------|------| @@ -39,8 +77,8 @@ Each of the 8 adapter repos now contains: ## Adapter discovery (ADAPTER_MODULE) -Standalone adapter repos set `ENV ADAPTER_MODULE=adapter` in their Dockerfile. -The runtime's `get_adapter()` checks this env var first: +Standalone adapter repos set `ENV ADAPTER_MODULE=adapter` in their +Dockerfile. The runtime's `get_adapter()` checks this env var first: ```python # In molecule_runtime/adapters/__init__.py @@ -49,25 +87,104 @@ def get_adapter(runtime: str) -> type[BaseAdapter]: if adapter_module: mod = importlib.import_module(adapter_module) return getattr(mod, "Adapter") - # Fall back to built-in subdirectory scan (monorepo local dev) - ... + raise KeyError(...) ``` ## Publishing a new version ```bash -cd workspace-template -# 1. Bump version in pyproject.toml -# 2. Sync to molecule-ai-workspace-runtime repo -# 3. 
Tag and push — CI publishes to PyPI via PYPI_TOKEN secret +# From any local checkout of monorepo, after merging your runtime change: +git tag runtime-v0.1.6 +git push origin runtime-v0.1.6 ``` -Or manually: -```bash -cd workspace-template -python -m build -python -m twine upload dist/* +The `publish-runtime` workflow takes over — checks out the tag, runs +`scripts/build_runtime_package.py --version 0.1.6`, builds wheel + sdist, +runs a smoke import to catch broken rewrites, and uploads to PyPI via +the `PYPI_TOKEN` repo secret. + +For dev/test releases without tagging, dispatch the workflow manually +with an explicit version (e.g. `0.1.6.dev1` — PEP 440 dev/rc/post forms +are accepted). + +After publish, the 8 template repos pick up the new version on their +next `:latest` rebuild. To force-pull immediately, bump the pin in each +template's `requirements.txt`. + +## End-to-end CD chain + +The full chain from monorepo merge → workspace containers running new code: + ``` +1. Merge PR with workspace/ changes to main + ↓ +2. .github/workflows/auto-tag-runtime.yml fires + ↓ reads PR labels (release:major/minor) or defaults to patch + ↓ pushes runtime-vX.Y.Z tag + ↓ +3. .github/workflows/publish-runtime.yml fires (on the tag) + ↓ builds wheel via scripts/build_runtime_package.py + ↓ smoke-imports the wheel + ↓ uploads to PyPI + ↓ cascade job fires repository_dispatch (event-type: runtime-published) + ↓ to all 8 workspace-template-* repos + ↓ +4. Each template's publish-image.yml fires (on repository_dispatch) + ↓ rebuilds Dockerfile (which pip-installs the new PyPI version) + ↓ pushes ghcr.io/molecule-ai/workspace-template-:latest + ↓ +5. Production hosts run scripts/refresh-workspace-images.sh + OR an operator hits POST /admin/workspace-images/refresh on the platform + ↓ docker pull all 8 :latest tags + ↓ remove + force-recreate any running ws-* containers using a refreshed image + ↓ canvas re-provisions the workspaces on next interaction +``` + +Steps 1-4 are fully automated. Step 5 is one-click: a single curl or shell +command. SaaS deployments typically wire step 5 into their normal deploy +pipeline (every release pulls fresh images on every host); local dev fires +it manually after a runtime release lands. + +### Required secrets + +| Secret | Where | Why | +|---|---|---| +| `PYPI_TOKEN` | molecule-core repo | Twine upload auth (PyPI) | +| `TEMPLATE_DISPATCH_TOKEN` | molecule-core repo | Fine-grained PAT with `actions:write` on the 8 template repos. Without it the `cascade` job warns and exits clean — PyPI still publishes; templates just don't auto-rebuild. | + +### Step 5 specifics + +**Local dev (compose stack):** +```bash +bash scripts/refresh-workspace-images.sh # all runtimes +bash scripts/refresh-workspace-images.sh --runtime claude-code +bash scripts/refresh-workspace-images.sh --no-recreate # pull only, leave containers +``` + +**Via platform admin endpoint (any deploy):** +```bash +curl -X POST "$PLATFORM/admin/workspace-images/refresh" +curl -X POST "$PLATFORM/admin/workspace-images/refresh?runtime=claude-code" +curl -X POST "$PLATFORM/admin/workspace-images/refresh?recreate=false" +``` + +The endpoint pulls + recreates from inside the platform container, so it +needs Docker socket access (the compose stack mounts +`/var/run/docker.sock` already) AND GHCR auth on the host's docker config +(`docker login ghcr.io` once per host). On a fresh host without GHCR auth, +the pull step warns per runtime and the response surfaces the failures. 
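+
+On a fresh host, satisfying that GHCR requirement is a one-time login plus the
+refresh call — a sketch, assuming `GHCR_USER`/`GHCR_TOKEN` are the account +
+`read:packages` PAT described in docker-compose.yml (only needed while the
+template images are private):
+
+```bash
+echo "$GHCR_TOKEN" | docker login ghcr.io -u "$GHCR_USER" --password-stdin
+curl -X POST "$PLATFORM/admin/workspace-images/refresh"   # pull all 8 :latest tags + recreate
+```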
+ +## Local dev (build the package without publishing) + +```bash +python3 scripts/build_runtime_package.py --version 0.1.0-local --out /tmp/runtime-build +cd /tmp/runtime-build +python -m build # produces dist/*.whl + dist/*.tar.gz +pip install dist/*.whl # install into a venv to test locally +``` + +This is the same pipeline CI runs. Use it to validate import-rewrite +correctness before pushing a `runtime-v*` tag. ## Writing a new adapter @@ -75,5 +192,18 @@ python -m twine upload dist/* 2. Copy `adapter.py` pattern from any existing adapter repo 3. Change imports: `from molecule_runtime.adapters.base import BaseAdapter, AdapterConfig` 4. Create `requirements.txt` with `molecule-ai-workspace-runtime>=0.1.0` + your deps -5. Create `Dockerfile` with `ENV ADAPTER_MODULE=adapter` and `ENTRYPOINT ["molecule-runtime"]` +5. Create `Dockerfile` with `ENV ADAPTER_MODULE=adapter` and + `ENTRYPOINT ["molecule-runtime"]` 6. Register the runtime name in the platform's known runtimes list + +## Migration note + +Prior to this workflow, the runtime was duplicated across monorepo +`workspace/` AND a sibling repo `molecule-ai-workspace-runtime`, with no +sync mechanism. That caused 30+ files to drift between the two trees and +tonight's chat-leak / queued-classification fixes existed only in the +monorepo copy until manually ported. + +If you have an old local checkout of `molecule-ai-workspace-runtime`, treat +it as outdated. The monorepo `workspace/` is now authoritative; the PyPI +artifact is rebuilt from it on every `runtime-v*` tag. diff --git a/scripts/build_runtime_package.py b/scripts/build_runtime_package.py new file mode 100755 index 00000000..91e121b2 --- /dev/null +++ b/scripts/build_runtime_package.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python3 +"""Build the molecule-ai-workspace-runtime PyPI package from monorepo workspace/. + +Monorepo workspace/ is the single source-of-truth for runtime code. The PyPI +package is a publish-time mirror produced by this script, NOT a parallel +editable copy. Anyone editing the runtime should edit workspace/, never the +sibling molecule-ai-workspace-runtime repo. + +What this does +-------------- +1. Copies workspace/ source into build/molecule_runtime/ (note the rename: + bare modules become a real Python package). +2. Rewrites top-level imports so e.g. `from a2a_client import X` becomes + `from molecule_runtime.a2a_client import X`. The rewrite is regex-based + on a closed allowlist of modules — third-party imports like `from a2a.X` + (the a2a-sdk package) are left alone because the regex is anchored on + exact module names. +3. Writes a pyproject.toml with the requested version + the README + the + py.typed marker. +4. Leaves the build dir ready for `python -m build` to produce a wheel/sdist. + +Usage +----- + scripts/build_runtime_package.py --version 0.1.6 --out /tmp/runtime-build + cd /tmp/runtime-build && python -m build + python -m twine upload dist/* + +The publish workflow (.github/workflows/publish-runtime.yml) drives this +on every `runtime-v*` tag push. +""" + +from __future__ import annotations + +import argparse +import re +import shutil +import sys +from pathlib import Path + +# Top-level Python modules in workspace/ that become molecule_runtime.X. +# Anything imported as `from import` or `import ` (where +# matches one of these) gets rewritten to use the package prefix. +# +# Closed list (not "every .py we copy") because a typo in workspace/ would +# otherwise leak into a wrong rewrite. 
Update this when adding a new +# top-level module to workspace/. +TOP_LEVEL_MODULES = { + "a2a_cli", + "a2a_client", + "a2a_executor", + "a2a_mcp_server", + "a2a_tools", + "adapter_base", + "agent", + "agents_md", + "claude_sdk_executor", + "cli_executor", + "config", + "consolidation", + "coordinator", + "events", + "executor_helpers", + "heartbeat", + "hermes_executor", + "initial_prompt", + "main", + "molecule_ai_status", + "platform_auth", + "plugins", + "preflight", + "prompt", + "shared_runtime", +} + +# Subdirectory packages — these are already real packages (they have or will +# have __init__.py) so the rewrite is `from ` → `from molecule_runtime.`. +SUBPACKAGES = { + "adapters", + "builtin_tools", + "plugins_registry", + "policies", + "skill_loader", +} + +# Files in workspace/ NOT included in the published package. These are +# build artifacts, dev scripts, or monorepo-only scaffolding. +EXCLUDE_FILES = { + "Dockerfile", + "build-all.sh", + "rebuild-runtime-images.sh", + "entrypoint.sh", + "pytest.ini", + "requirements.txt", + # Note: adapter_base.py, agents_md.py, hermes_executor.py, shared_runtime.py + # are kept (referenced by adapters/__init__.py and other modules); they get + # their imports rewritten via TOP_LEVEL_MODULES. Excluding them broke the + # smoke-test install with `ModuleNotFoundError: adapter_base`. +} + +EXCLUDE_DIRS = { + "__pycache__", + "tests", + "lib", + "molecule_audit", + "scripts", +} + + +def build_import_rewriter() -> re.Pattern: + """Compile a single regex matching all import statements that need + rewriting. The match groups capture the keyword + module name so the + replacement preserves whitespace and trailing punctuation. + + Modules included: TOP_LEVEL_MODULES ∪ SUBPACKAGES. + + The negative-lookahead on `\\.` in the suffix prevents matching + `from a2a.server.X import Y` against bare `a2a` (which isn't in our + set, but the principle matters for any future short module name that + happens to be a prefix of a real package name). + """ + names = sorted(TOP_LEVEL_MODULES | SUBPACKAGES) + alt = "|".join(re.escape(n) for n in names) + # Matches: + # from (\.|\s|import) + # import (\s|$|,) + # And captures the keyword + name so we can re-emit with prefix. + pattern = ( + r"(?m)^(?P\s*)" # leading whitespace (preserved) + r"(?Pfrom|import)\s+" # 'from' or 'import' + r"(?P" + alt + r")" # the module name + r"(?P[\s.,]|$)" # what follows: '.subpath', ' import …', ',', whitespace, EOL + ) + return re.compile(pattern) + + +def rewrite_imports(text: str, regex: re.Pattern) -> str: + """Replace bare imports with package-prefixed ones. + + `import X` → `import molecule_runtime.X as X` (preserve binding) + `from X import Y` → `from molecule_runtime.X import Y` + `from X.sub import Y` → `from molecule_runtime.X.sub import Y` + """ + def repl(m: re.Match) -> str: + indent, kw, mod, rest = m.group("indent"), m.group("kw"), m.group("mod"), m.group("rest") + if kw == "from": + # `from X` or `from X.sub` — always safe to prefix. + return f"{indent}from molecule_runtime.{mod}{rest}" + # `import X` — preserve the binding name `X` (callers do `X.foo`) + # by aliasing. `import X.sub` is uncommon for our modules and would + # need a different binding form, but isn't used in workspace/ today. + if rest.startswith("."): + # `import X.sub` — rewrite as `import molecule_runtime.X.sub` and + # leave the trailing dot pattern intact for the rest of the line. + return f"{indent}import molecule_runtime.{mod}{rest}" + # Plain `import X` — alias preserves the local name. 
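+        # e.g. `import config` → `import molecule_runtime.config as config`,
+        # so existing `config.foo` call sites keep resolving (illustrative module name).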
+ return f"{indent}import molecule_runtime.{mod} as {mod}{rest}" + return regex.sub(repl, text) + + +def copy_tree_filtered(src: Path, dst: Path) -> list[Path]: + """Copy src/ → dst/ skipping EXCLUDE_FILES + EXCLUDE_DIRS. Returns the + list of .py files copied so the caller can run the import rewrite over + them in one pass.""" + py_files: list[Path] = [] + if dst.exists(): + shutil.rmtree(dst) + dst.mkdir(parents=True) + for entry in src.iterdir(): + if entry.is_dir(): + if entry.name in EXCLUDE_DIRS: + continue + sub_py = copy_tree_filtered(entry, dst / entry.name) + py_files.extend(sub_py) + else: + if entry.name in EXCLUDE_FILES: + continue + shutil.copy2(entry, dst / entry.name) + if entry.suffix == ".py": + py_files.append(dst / entry.name) + return py_files + + +PYPROJECT_TEMPLATE = """\ +[build-system] +requires = ["setuptools>=68.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "molecule-ai-workspace-runtime" +version = "{version}" +description = "Molecule AI workspace runtime — shared infrastructure for all agent adapters" +requires-python = ">=3.11" +license = {{text = "BSL-1.1"}} +readme = "README.md" +dependencies = [ + "a2a-sdk[http-server]>=1.0.0,<2.0", + "httpx>=0.27.0", + "uvicorn>=0.30.0", + "starlette>=0.38.0", + "websockets>=12.0", + "pyyaml>=6.0", + "langchain-core>=0.3.0", + "opentelemetry-api>=1.24.0", + "opentelemetry-sdk>=1.24.0", + "opentelemetry-exporter-otlp-proto-http>=1.24.0", + "temporalio>=1.7.0", +] + +[project.scripts] +molecule-runtime = "molecule_runtime.main:main_sync" + +[tool.setuptools.packages.find] +where = ["."] +include = ["molecule_runtime*"] + +[tool.setuptools.package-data] +"molecule_runtime" = ["py.typed"] +""" + + +README_TEMPLATE = """\ +# molecule-ai-workspace-runtime + +Shared workspace runtime for [Molecule AI](https://github.com/Molecule-AI/molecule-core) +agent adapters. Installed by every workspace template image +(`workspace-template-claude-code`, `-langgraph`, `-hermes`, etc.) to provide +A2A delegation, heartbeat, memory, plugin loading, and skill management. + +This package is **published from the molecule-core monorepo `workspace/` +directory** by the `publish-runtime` GitHub Actions workflow on every +`runtime-v*` tag push. **Do not edit this package directly** — edit +`workspace/` in the monorepo. + +See [`docs/workspace-runtime-package.md`](https://github.com/Molecule-AI/molecule-core/blob/main/docs/workspace-runtime-package.md) +for the publish flow and architecture. +""" + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--version", required=True, help="Package version, e.g. 0.1.6") + parser.add_argument("--out", required=True, type=Path, help="Build output directory (will be wiped)") + parser.add_argument("--source", type=Path, default=Path(__file__).resolve().parent.parent / "workspace", + help="Path to monorepo workspace/ directory (default: ../workspace from this script)") + args = parser.parse_args() + + src = args.source.resolve() + out = args.out.resolve() + if not src.is_dir(): + print(f"error: source not a directory: {src}", file=sys.stderr) + return 2 + + pkg_dir = out / "molecule_runtime" + print(f"[build] source: {src}") + print(f"[build] output: {out}") + print(f"[build] package: {pkg_dir}") + + if out.exists(): + shutil.rmtree(out) + out.mkdir(parents=True) + + py_files = copy_tree_filtered(src, pkg_dir) + print(f"[build] copied {len(py_files)} .py files") + + # Ensure top-level package marker exists. 
workspace/ doesn't have one + # (it's not a package in monorepo), but the published artifact must. + init = pkg_dir / "__init__.py" + if not init.exists(): + init.write_text('"""Molecule AI workspace runtime."""\n') + + # Touch py.typed so type-checkers in adapter consumers see the package + # as typed. Empty file is the convention. + (pkg_dir / "py.typed").touch() + + # Rewrite imports in every .py file we copied + the new __init__.py. + regex = build_import_rewriter() + rewrites = 0 + for f in [*py_files, init]: + original = f.read_text() + rewritten = rewrite_imports(original, regex) + if rewritten != original: + f.write_text(rewritten) + rewrites += 1 + print(f"[build] rewrote imports in {rewrites} files") + + # Emit pyproject.toml + README at build root. + (out / "pyproject.toml").write_text(PYPROJECT_TEMPLATE.format(version=args.version)) + (out / "README.md").write_text(README_TEMPLATE) + + print(f"[build] done. To publish:") + print(f" cd {out}") + print(f" python -m build") + print(f" python -m twine upload dist/*") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/ops/sweep-cf-orphans.sh b/scripts/ops/sweep-cf-orphans.sh index 5e757b79..569bcbcf 100755 --- a/scripts/ops/sweep-cf-orphans.sh +++ b/scripts/ops/sweep-cf-orphans.sh @@ -117,62 +117,69 @@ log " CF records: $TOTAL_CF" # 5. Anything else → keep (we only sweep patterns we understand). export PROD_SLUGS STAGING_SLUGS EC2_NAMES TOTAL_CF +# Edits inside the CANONICAL DECIDE block below must mirror +# scripts/ops/sweep_cf_decide.py — the parity test in +# test_sweep_cf_decide.py asserts they match byte-for-byte. DECISIONS=$(echo "$CF_JSON" | python3 -c ' import json, os, re, sys d = json.load(sys.stdin) -prod = set(os.environ["PROD_SLUGS"].split()) -staging = set(os.environ["STAGING_SLUGS"].split()) -all_slugs = prod | staging +prod_slugs = set(os.environ["PROD_SLUGS"].split()) +staging_slugs = set(os.environ["STAGING_SLUGS"].split()) +all_slugs = prod_slugs | staging_slugs ec2_names = set(n for n in os.environ["EC2_NAMES"].split() if n) -def decide(r): +_PLATFORM_CORE_NAMES = { + "api.moleculesai.app", "app.moleculesai.app", "doc.moleculesai.app", + "send.moleculesai.app", "status.moleculesai.app", "www.moleculesai.app", + "staging-api.moleculesai.app", +} +_WS_RE = re.compile(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$") +_E2E_RE = re.compile(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$") +_TENANT_RE = re.compile(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$") + +# CANONICAL DECIDE BEGIN +def decide(r, all_slugs, ec2_names): n = r["name"] rid = r["id"] typ = r["type"] - # Rule 1: platform core — leave alone if n == "moleculesai.app": return ("keep", "apex", rid, n, typ) if n.startswith("_") or n.endswith("._domainkey.moleculesai.app"): return ("keep", "verification/key", rid, n, typ) - if n in {"api.moleculesai.app","app.moleculesai.app","doc.moleculesai.app", - "send.moleculesai.app","status.moleculesai.app","www.moleculesai.app", - "staging-api.moleculesai.app"}: + if n in _PLATFORM_CORE_NAMES: return ("keep", "platform-core", rid, n, typ) - # Rule 3: ws--.(staging.)moleculesai.app - m = re.match(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$", n) + m = _WS_RE.match(n) if m: prefix = m.group(1) - # Live EC2 names are like "ws-d3605ef2-f7d" — same shape as DNS subdomain. + # Live EC2 names share the ws-- shape with the DNS subdomain. 
for ename in ec2_names: if ename.startswith(prefix): return ("keep", "live-ec2", rid, n, typ) return ("delete", "orphan-ws", rid, n, typ) - # Rule 4: e2e-* tenants (includes canary, canvas variants) - m = re.match(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$", n) + m = _E2E_RE.match(n) if m: slug = m.group(1) if slug in all_slugs: return ("keep", "live-e2e-tenant", rid, n, typ) return ("delete", "orphan-e2e-tenant", rid, n, typ) - # Rule 2: any other tenant subdomain (slug.moleculesai.app or slug.staging.moleculesai.app) - m = re.match(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$", n) + m = _TENANT_RE.match(n) if m: slug = m.group(1) if slug in all_slugs: return ("keep", "live-tenant", rid, n, typ) - # Only flag as orphan if name looks like a tenant (not a one-off like "hermes-final-*") - # To avoid false-positive nukes on ad-hoc records, we KEEP anything that - # does not match a known pattern. Orphan only for explicit tenant-shaped names. + # KEEP unknown tenant-shaped names — avoid false-positive nukes on + # ad-hoc records (e.g. hermes-final-*) that do not match a known slug. return ("keep", "unknown-subdomain-kept-for-safety", rid, n, typ) return ("keep", "not-a-pattern-we-sweep", rid, n, typ) +# CANONICAL DECIDE END for r in d["result"]: - action, reason, rid, name, typ = decide(r) + action, reason, rid, name, typ = decide(r, all_slugs, ec2_names) print(json.dumps({"action": action, "reason": reason, "id": rid, "name": name, "type": typ})) ') diff --git a/scripts/ops/sweep_cf_decide.py b/scripts/ops/sweep_cf_decide.py new file mode 100644 index 00000000..23cfb2fd --- /dev/null +++ b/scripts/ops/sweep_cf_decide.py @@ -0,0 +1,94 @@ +"""Decision logic extracted from sweep-cf-orphans.sh for unit testing (#2027). + +The bash script embeds the same logic inline as a python heredoc — this +module is a verbatim copy used by test_sweep_cf_decide.py. The parity +test (TestParityWithBashScript) reads the bash script and asserts the +canonical block in this file is present byte-for-byte, so the two +cannot drift apart silently. + +If you change the rules: edit BOTH this file AND the inline block in +``scripts/ops/sweep-cf-orphans.sh`` (the canonical block runs from +``# CANONICAL DECIDE BEGIN`` to ``# CANONICAL DECIDE END`` markers in +both files; the parity check compares those slices). + +Inputs to ``decide(record, all_slugs, ec2_names)``: + record Cloudflare DNS record dict {name, id, type} + all_slugs set of CP org slugs (prod ∪ staging) — caller computes the + union once instead of per-record (decide is hot-path: 100s + to 1000s of records per sweep) + ec2_names set of live EC2 Name tags (e.g. ``ws-d3605ef2-f7d``) + +Returns ``(action, reason, id, name, type)`` matching the bash heredoc. +""" +from __future__ import annotations + +import re + + +# Pre-compile per-record regexes once at module load — saves the per-call +# pattern-cache lookup across 1000s of CF records per sweep. Mirrored at +# the same scope in sweep-cf-orphans.sh's heredoc. 
+_PLATFORM_CORE_NAMES = { + "api.moleculesai.app", "app.moleculesai.app", "doc.moleculesai.app", + "send.moleculesai.app", "status.moleculesai.app", "www.moleculesai.app", + "staging-api.moleculesai.app", +} +_WS_RE = re.compile(r"^(ws-[a-f0-9]{8}-[a-f0-9]+)(?:\.staging)?\.moleculesai\.app$") +_E2E_RE = re.compile(r"^(e2e-[^.]+)(?:\.staging)?\.moleculesai\.app$") +_TENANT_RE = re.compile(r"^([a-z0-9][a-z0-9-]*)(?:\.staging)?\.moleculesai\.app$") + + +# CANONICAL DECIDE BEGIN +def decide(r, all_slugs, ec2_names): + n = r["name"] + rid = r["id"] + typ = r["type"] + + if n == "moleculesai.app": + return ("keep", "apex", rid, n, typ) + if n.startswith("_") or n.endswith("._domainkey.moleculesai.app"): + return ("keep", "verification/key", rid, n, typ) + if n in _PLATFORM_CORE_NAMES: + return ("keep", "platform-core", rid, n, typ) + + m = _WS_RE.match(n) + if m: + prefix = m.group(1) + # Live EC2 names share the ws-- shape with the DNS subdomain. + for ename in ec2_names: + if ename.startswith(prefix): + return ("keep", "live-ec2", rid, n, typ) + return ("delete", "orphan-ws", rid, n, typ) + + m = _E2E_RE.match(n) + if m: + slug = m.group(1) + if slug in all_slugs: + return ("keep", "live-e2e-tenant", rid, n, typ) + return ("delete", "orphan-e2e-tenant", rid, n, typ) + + m = _TENANT_RE.match(n) + if m: + slug = m.group(1) + if slug in all_slugs: + return ("keep", "live-tenant", rid, n, typ) + # KEEP unknown tenant-shaped names — avoid false-positive nukes on + # ad-hoc records (e.g. hermes-final-*) that do not match a known slug. + return ("keep", "unknown-subdomain-kept-for-safety", rid, n, typ) + + return ("keep", "not-a-pattern-we-sweep", rid, n, typ) +# CANONICAL DECIDE END + + +def safety_gate(total: int, delete_count: int, max_delete_pct: int = 50) -> bool: + """Return True iff the sweep is safe to execute. + + Mirrors the shell-side gate: if the deletion fraction exceeds + ``max_delete_pct`` the sweep refuses to run. Same integer arithmetic + as the bash script (``DELETE_COUNT*100/TOTAL``) so a future threshold + tweak only needs to land in one semantic place. + """ + if total <= 0: + return True + pct = delete_count * 100 // total + return pct <= max_delete_pct diff --git a/scripts/ops/test_sweep_cf_decide.py b/scripts/ops/test_sweep_cf_decide.py new file mode 100644 index 00000000..930ba3c0 --- /dev/null +++ b/scripts/ops/test_sweep_cf_decide.py @@ -0,0 +1,225 @@ +"""Tests for the sweep-cf-orphans.sh decision function (#2027). + +Run locally: ``python3 -m unittest scripts/ops/test_sweep_cf_decide.py -v`` + +Why this exists: the inline Python heredoc in sweep-cf-orphans.sh decides +which Cloudflare DNS records to delete. A misclassification could nuke a +live workspace's DNS record. These tests cover the rule priority order + +the safety gate, plus a parity check that asserts the inline block in the +shell script matches the importable module byte-for-byte (so the two +cannot drift silently). +""" +from __future__ import annotations + +import os +import unittest + +import sweep_cf_decide as M + + +# Caller responsibility (per the new decide signature): compute the union once. 
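+# An illustrative caller-side sketch (hypothetical variable names; the sweep
+# script's heredoc does the equivalent inline): build the union once, then
+# classify every Cloudflare record against the same sets.
+#
+#   all_slugs = prod_slugs | staging_slugs
+#   for record in cf_records:
+#       action, reason, rid, name, typ = decide(record, all_slugs, ec2_names)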
+ALL_SLUGS = {"acme", "globex", "initech", "e2e-test-runner", "soak", "playground"} +LIVE_EC2 = {"ws-d3605ef2-f7d", "ws-aaaaaaaa-bbb", "ws-cafef00d-dec"} + + +def rec(name: str, rid: str = "rid-x", typ: str = "A") -> dict: + return {"name": name, "id": rid, "type": typ} + + +def call(record: dict) -> tuple: + return M.decide(record, ALL_SLUGS, LIVE_EC2) + + +class TestPlatformCore(unittest.TestCase): + """Apex, www, api, app, _verification keys must NEVER be touched.""" + + def test_apex_kept(self): + action, reason, *_ = call(rec("moleculesai.app")) + self.assertEqual((action, reason), ("keep", "apex")) + + def test_underscore_records_kept(self): + for n in ("_vercel.moleculesai.app", "_railway-verify.moleculesai.app"): + with self.subTest(name=n): + action, reason, *_ = call(rec(n)) + self.assertEqual((action, reason), ("keep", "verification/key")) + + def test_dkim_kept(self): + action, reason, *_ = call(rec("send._domainkey.moleculesai.app")) + self.assertEqual((action, reason), ("keep", "verification/key")) + + def test_platform_subdomains_kept(self): + for n in ( + "api.moleculesai.app", + "app.moleculesai.app", + "doc.moleculesai.app", + "send.moleculesai.app", + "status.moleculesai.app", + "www.moleculesai.app", + "staging-api.moleculesai.app", + ): + with self.subTest(name=n): + action, reason, *_ = call(rec(n)) + self.assertEqual((action, reason), ("keep", "platform-core")) + + +class TestWsRule(unittest.TestCase): + """ws-* DNS records keep iff a live EC2 with the same prefix exists.""" + + def test_live_ws_kept(self): + action, reason, *_ = call(rec("ws-d3605ef2-f7d.moleculesai.app")) + self.assertEqual((action, reason), ("keep", "live-ec2")) + + def test_live_ws_kept_on_staging(self): + action, reason, *_ = call(rec("ws-aaaaaaaa-bbb.staging.moleculesai.app")) + self.assertEqual((action, reason), ("keep", "live-ec2")) + + def test_dead_ws_deleted(self): + action, reason, *_ = call(rec("ws-deadbeef-fff.moleculesai.app")) + self.assertEqual((action, reason), ("delete", "orphan-ws")) + + def test_dead_ws_on_staging_deleted(self): + action, reason, *_ = call(rec("ws-deadbeef-fff.staging.moleculesai.app")) + self.assertEqual((action, reason), ("delete", "orphan-ws")) + + +class TestE2ERule(unittest.TestCase): + def test_live_e2e_kept(self): + action, reason, *_ = call(rec("e2e-test-runner.staging.moleculesai.app")) + self.assertEqual((action, reason), ("keep", "live-e2e-tenant")) + + def test_dead_e2e_deleted(self): + action, reason, *_ = call(rec("e2e-ghost-1234.staging.moleculesai.app")) + self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant")) + + def test_dead_e2e_on_prod_deleted(self): + action, reason, *_ = call(rec("e2e-ghost.moleculesai.app")) + self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant")) + + +class TestTenantSubdomainRule(unittest.TestCase): + def test_live_prod_tenant_kept(self): + action, reason, *_ = call(rec("acme.moleculesai.app")) + self.assertEqual((action, reason), ("keep", "live-tenant")) + + def test_live_staging_tenant_kept(self): + action, reason, *_ = call(rec("soak.staging.moleculesai.app")) + self.assertEqual((action, reason), ("keep", "live-tenant")) + + def test_unknown_subdomain_kept_for_safety(self): + action, reason, *_ = call(rec("hermes-final-2.moleculesai.app")) + self.assertEqual((action, reason), ("keep", "unknown-subdomain-kept-for-safety")) + + +class TestNotASweepPattern(unittest.TestCase): + def test_external_domain_kept(self): + # Domain-spoofing attempt — must NOT match any of the moleculesai.app rules. 
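+        # All three sweep patterns are anchored (^...\.moleculesai\.app$), so a
+        # name that merely contains the platform domain can never match them.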
+ action, reason, *_ = call(rec("api.openai.com.evil.internal")) + self.assertEqual((action, reason), ("keep", "not-a-pattern-we-sweep")) + + def test_unrelated_apex_kept(self): + action, reason, *_ = call(rec("example.com")) + self.assertEqual((action, reason), ("keep", "not-a-pattern-we-sweep")) + + +class TestRulePriority(unittest.TestCase): + """Platform-core check must precede the tenant-subdomain regex — + e.g. ``api.moleculesai.app`` matches the tenant pattern but must + classify as platform-core.""" + + def test_api_subdomain_classified_as_platform_not_tenant(self): + action, reason, *_ = call(rec("api.moleculesai.app")) + self.assertEqual(reason, "platform-core") + + def test_underscore_record_classified_before_tenant(self): + action, reason, *_ = call(rec("_vercel.moleculesai.app")) + self.assertEqual(reason, "verification/key") + + +class TestSafetyGate(unittest.TestCase): + """The bash gate refuses to delete >MAX_DELETE_PCT (default 50%).""" + + def test_under_threshold_passes(self): + self.assertTrue(M.safety_gate(total=100, delete_count=49)) + self.assertTrue(M.safety_gate(total=100, delete_count=50)) + + def test_over_threshold_fails(self): + self.assertFalse(M.safety_gate(total=100, delete_count=51)) + self.assertFalse(M.safety_gate(total=10, delete_count=10)) + + def test_zero_total_passes_trivially(self): + # No records → nothing to delete → gate trivially OK (no div-by-zero). + self.assertTrue(M.safety_gate(total=0, delete_count=0)) + + def test_custom_threshold(self): + self.assertTrue(M.safety_gate(total=100, delete_count=70, max_delete_pct=75)) + self.assertFalse(M.safety_gate(total=100, delete_count=76, max_delete_pct=75)) + + +class TestEmptyLiveSets(unittest.TestCase): + """If the CP admin API returns no orgs (auth broken, network blip), + every tenant-shaped record looks orphan. decide() alone has no + defense — that's safety_gate's job. This test pins the contract so + a future "make decide() defensive" change doesn't silently bypass + the gate.""" + + def test_dead_e2e_orphans_when_live_set_empty(self): + action, reason, *_ = M.decide( + rec("e2e-test-runner.staging.moleculesai.app"), + set(), set(), + ) + self.assertEqual((action, reason), ("delete", "orphan-e2e-tenant")) + + def test_live_ws_still_kept_when_ec2_set_empty(self): + action, reason, *_ = M.decide( + rec("ws-cafef00d-dec.moleculesai.app"), + ALL_SLUGS, set(), + ) + self.assertEqual((action, reason), ("delete", "orphan-ws")) + + +class TestParityWithBashScript(unittest.TestCase): + """The decision logic exists in two places: the canonical block in + sweep_cf_decide.py and the inline heredoc in sweep-cf-orphans.sh. + This test asserts the two match between the + ``# CANONICAL DECIDE BEGIN`` / ``# CANONICAL DECIDE END`` markers, + so an edit to one without the other fails CI loudly. The mirror- + reminder comment lives OUTSIDE the markers in the .sh file so we + don't need to special-case it here.""" + + @staticmethod + def _slice_canonical(text: str) -> list[str]: + """Return the lines between the canonical markers, exclusive. 
+ Markers are matched line-anchored (a stripped-line literal match) + so the docstring's prose mention is ignored.""" + lines = text.splitlines() + begin_idx = end_idx = None + for i, line in enumerate(lines): + stripped = line.strip() + if begin_idx is None and stripped == "# CANONICAL DECIDE BEGIN": + begin_idx = i + elif begin_idx is not None and stripped == "# CANONICAL DECIDE END": + end_idx = i + break + if begin_idx is None or end_idx is None: + raise AssertionError( + "missing CANONICAL DECIDE BEGIN/END markers — " + "first 30 lines were:\n" + "\n".join(lines[:30]) + ) + return lines[begin_idx + 1:end_idx] + + def test_blocks_match(self): + here = os.path.dirname(__file__) + with open(os.path.join(here, "sweep_cf_decide.py"), "r", encoding="utf-8") as f: + py_block = self._slice_canonical(f.read()) + with open(os.path.join(here, "sweep-cf-orphans.sh"), "r", encoding="utf-8") as f: + sh_block = self._slice_canonical(f.read()) + self.assertEqual( + py_block, + sh_block, + "CANONICAL DECIDE block has drifted between sweep_cf_decide.py " + "and sweep-cf-orphans.sh — re-sync them.", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/scripts/refresh-workspace-images.sh b/scripts/refresh-workspace-images.sh new file mode 100755 index 00000000..ec9ea0ba --- /dev/null +++ b/scripts/refresh-workspace-images.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# refresh-workspace-images.sh — pull the latest workspace template images +# from GHCR and recreate any running ws-* containers against the new digest. +# +# This is the local-dev / single-host equivalent of step 5 of the runtime CD +# chain (see docs/workspace-runtime-package.md). On a SaaS deployment the +# host's deploy pipeline does the pull on every release; this script is +# what to run on a local docker-compose host after a runtime release lands. +# +# Usage: +# bash scripts/refresh-workspace-images.sh # pull all 8 + recreate running ws-* +# bash scripts/refresh-workspace-images.sh --runtime claude-code # pull just one template +# bash scripts/refresh-workspace-images.sh --no-recreate # pull only, leave containers +# +# Behavior: +# - Always pulls fresh; docker is a no-op if local matches remote, so +# repeated runs are cheap. +# - Recreate is "kill + remove + let the next canvas interaction re- +# provision" — simpler than `docker stop / docker run` because the +# platform owns the run flags. Workspaces re-register on next probe. +# - If a container is mid-conversation, the kill cancels in-flight work. +# Run during a quiet window OR add --no-recreate and recreate manually +# via canvas Restart buttons. + +set -euo pipefail + +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +RED='\033[0;31m' +NC='\033[0m' +log() { echo -e "${GREEN}[refresh]${NC} $1" >&2; } +warn() { echo -e "${YELLOW}[refresh]${NC} $1" >&2; } +err() { echo -e "${RED}[refresh]${NC} $1" >&2; } + +ALL_RUNTIMES=(claude-code langgraph crewai autogen deepagents hermes gemini-cli openclaw) +RUNTIMES=("${ALL_RUNTIMES[@]}") +RECREATE=true + +while [ $# -gt 0 ]; do + case "$1" in + --runtime) RUNTIMES=("$2"); shift 2;; + --no-recreate) RECREATE=false; shift;; + -h|--help) sed -n '2,30p' "$0"; exit 0;; + *) err "unknown arg: $1"; exit 2;; + esac +done + +# 1. Pull fresh tags. Soft-fail per runtime — one missing image (e.g., a +# template that hasn't been published yet) shouldn't abort the others. 
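+# For reference, the manual equivalent for a single template is roughly:
+#   docker pull ghcr.io/molecule-ai/workspace-template-claude-code:latest
+# The loop below does that per runtime and tallies pulled vs failed.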
+log "pulling latest images for: ${RUNTIMES[*]}" +PULLED=() +FAILED=() +for rt in "${RUNTIMES[@]}"; do + IMG="ghcr.io/molecule-ai/workspace-template-$rt:latest" + if docker pull "$IMG" >/dev/null 2>&1; then + log " ✓ $rt" + PULLED+=("$rt") + else + warn " ✗ $rt (pull failed — image may not exist or auth missing)" + FAILED+=("$rt") + fi +done + +if [ "$RECREATE" = "false" ]; then + log "skip-recreate set — leaving containers untouched" + log "done. pulled=${#PULLED[@]} failed=${#FAILED[@]}" + exit 0 +fi + +# 2. Find ws-* containers whose image is one of the runtimes we pulled. +# `docker inspect` exposes the image tag/digest each was created from. +log "scanning ws-* containers for stale images..." +TO_RECREATE=() +for cn in $(docker ps -a --filter "name=ws-" --format "{{.Names}}"); do + IMG=$(docker inspect "$cn" --format '{{.Config.Image}}' 2>/dev/null || echo "") + for rt in "${PULLED[@]}"; do + if [[ "$IMG" == *"workspace-template-$rt"* ]]; then + TO_RECREATE+=("$cn") + break + fi + done +done + +if [ "${#TO_RECREATE[@]}" -eq 0 ]; then + log "no running ws-* containers using a refreshed image — nothing to recreate" + exit 0 +fi + +# 3. Kill + remove. Canvas next-interaction will re-provision. +log "recreating ${#TO_RECREATE[@]} containers (canvas will re-provision on next interaction)" +for cn in "${TO_RECREATE[@]}"; do + docker rm -f "$cn" >/dev/null 2>&1 && log " removed $cn" || warn " failed to remove $cn" +done + +log "done. open the canvas and the workspaces will re-provision against the new image." diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh index ba0fc7a9..ce3be39b 100755 --- a/tests/e2e/test_staging_full_saas.sh +++ b/tests/e2e/test_staging_full_saas.sh @@ -195,14 +195,21 @@ TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(js ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})" # ─── 4. Wait for tenant TLS / DNS propagation ────────────────────────── +# 10 min — same envelope as canvas/e2e/staging-setup.ts TLS_TIMEOUT_MS. +# CF DNS propagation + tunnel hostname registration + ACME cert + edge +# cache routinely take 5-7 min under staging load; the original 3-min +# cap blocked multiple staging→main PRs across 2026-04-24+. Stays +# inside the parent provision envelope so a genuinely-stuck tenant +# still fails loud at the earlier provision step rather than masquerading +# as a TLS issue. log "4/11 Waiting for tenant TLS / DNS propagation..." -TLS_DEADLINE=$(( $(date +%s) + 180 )) +TLS_DEADLINE=$(( $(date +%s) + 600 )) while true; do if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then break fi if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then - fail "Tenant URL never responded 2xx on /health within 3 min" + fail "Tenant URL never responded 2xx on /health within 10 min" fi sleep 5 done @@ -403,6 +410,13 @@ fi if echo "$AGENT_TEXT" | grep -qF "Unknown provider"; then fail "A2A — REGRESSION: install.sh set PROVIDER to a value not in hermes's registry. Run 'hermes doctor' on the workspace to see valid values. Raw: $AGENT_TEXT" fi +# "Invalid API key" — the comment block lists this as a CP #238 race +# (tenant auth chain) signal but the grep was missing. Caller-side +# 401's containing this exact phrase don't match the generic +# "error|exception" catch-all below, so they'd slip through. +if echo "$AGENT_TEXT" | grep -qF "Invalid API key"; then + fail "A2A — REGRESSION: tenant auth chain returned 'Invalid API key'. 
Likely CP boot-event 401 race (CP #238) or stale OPENAI_API_KEY in the runtime env. Raw: $AGENT_TEXT" +fi # Generic catch-all — falls through if none of the known regressions hit. if echo "$AGENT_TEXT" | grep -qiE "error|exception"; then fail "A2A returned an error-shaped response: $AGENT_TEXT" diff --git a/workspace-server/internal/events/broadcaster.go b/workspace-server/internal/events/broadcaster.go index 514d9781..53427010 100644 --- a/workspace-server/internal/events/broadcaster.go +++ b/workspace-server/internal/events/broadcaster.go @@ -15,6 +15,29 @@ import ( const broadcastChannel = "events:broadcast" +// EventEmitter is the contract handler code needs from a broadcaster. +// Defining it here lets tests substitute a capture-only stub instead +// of standing up the full Redis + WebSocket hub topology that the +// concrete *Broadcaster builds (and that previously blocked +// TestProvisionWorkspace_* regression tests on issue #1814). +// +// Includes BroadcastOnly because the activity-log + A2A-response paths +// inside the handler package fan out via that method — narrowing +// further would force production callers back to the concrete type. +// +// *Broadcaster satisfies this interface trivially. Production code that +// needs the wider surface (SubscribeSSE, Subscribe) keeps using the +// concrete *Broadcaster type — sse.go + cmd/server/main.go are the +// only such call sites today. +type EventEmitter interface { + RecordAndBroadcast(ctx context.Context, eventType string, workspaceID string, payload interface{}) error + BroadcastOnly(workspaceID string, eventType string, payload interface{}) +} + +// Compile-time assertion: a renamed/reshaped Broadcaster method that +// silently broke this interface would fail to build here. +var _ EventEmitter = (*Broadcaster)(nil) + // sseSubscription is a single in-process SSE subscriber. // deliverToSSE writes to ch; StreamEvents reads from it. type sseSubscription struct { diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go index 13c46641..934af511 100644 --- a/workspace-server/internal/handlers/a2a_proxy.go +++ b/workspace-server/internal/handlers/a2a_proxy.go @@ -483,6 +483,16 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) { // canvas (callerID == "") = 5 min, agent-to-agent = 30 min. Callers can // override via the X-Timeout header (applied to ctx upstream in ProxyA2A). func (h *WorkspaceHandler) dispatchA2A(ctx context.Context, agentURL string, body []byte, callerID string) (*http.Response, context.CancelFunc, error) { + // #1483 SSRF defense-in-depth: the primary call path through + // proxyA2ARequest → resolveAgentURL already validates via isSafeURL + // (a2a_proxy.go:424), but adding the check here closes the gap for + // any future code path that calls dispatchA2A directly without + // going through resolveAgentURL. Wrapping as proxyDispatchBuildError + // keeps the caller's error-classification path unchanged — the same + // shape it already produces a 500 for. 
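+	// Per the accompanying regression test (TestDispatchA2A_RejectsUnsafeURL),
+	// link-local metadata addresses such as 169.254.169.254 are rejected here
+	// before any request is built; routable agent URLs pass through unchanged.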
+ if err := isSafeURL(agentURL); err != nil { + return nil, nil, &proxyDispatchBuildError{err: err} + } forwardCtx := context.WithoutCancel(ctx) var cancel context.CancelFunc if _, hasDeadline := ctx.Deadline(); !hasDeadline { diff --git a/workspace-server/internal/handlers/a2a_proxy_test.go b/workspace-server/internal/handlers/a2a_proxy_test.go index dcad98e2..81733ad5 100644 --- a/workspace-server/internal/handlers/a2a_proxy_test.go +++ b/workspace-server/internal/handlers/a2a_proxy_test.go @@ -1155,6 +1155,39 @@ func TestDispatchA2A_ContextDeadline_NoCancelAdded(t *testing.T) { } } +// TestDispatchA2A_RejectsUnsafeURL is the #1483 defense-in-depth +// regression. setupTestDB disables SSRF for normal tests so existing +// dispatchA2A unit tests can hit httptest.NewServer (loopback) — we +// re-enable it here to verify the new in-function isSafeURL guard. +// Production callers go through resolveAgentURL which already +// validates; this test pins that dispatchA2A is now safe even when +// called directly by a future caller that skips resolveAgentURL. +func TestDispatchA2A_RejectsUnsafeURL(t *testing.T) { + setupTestDB(t) + setupTestRedis(t) + restoreSSRF := setSSRFCheckForTest(true) + t.Cleanup(restoreSSRF) + handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir()) + + // Cloud metadata IP — must be rejected before any HTTP call goes out. + _, cancel, err := handler.dispatchA2A( + context.Background(), + "http://169.254.169.254/latest/meta-data/", + []byte(`{}`), + "", + ) + if cancel != nil { + cancel() + t.Error("cancel must be nil when the URL is rejected pre-request") + } + if err == nil { + t.Fatal("expected SSRF rejection error, got nil") + } + if _, ok := err.(*proxyDispatchBuildError); !ok { + t.Errorf("expected *proxyDispatchBuildError (caller maps to 500), got %T: %v", err, err) + } +} + // --- handleA2ADispatchError --- func TestHandleA2ADispatchError_ContextDeadline(t *testing.T) { diff --git a/workspace-server/internal/handlers/a2a_queue.go b/workspace-server/internal/handlers/a2a_queue.go index b747fac4..fb0e3b80 100644 --- a/workspace-server/internal/handlers/a2a_queue.go +++ b/workspace-server/internal/handlers/a2a_queue.go @@ -288,7 +288,7 @@ func (h *WorkspaceHandler) DrainQueueForWorkspace(ctx context.Context, workspace } // logActivity=false: the original EnqueueA2A callsite already logged // the dispatch attempt; re-logging here would double-count events. - status, _, proxyErr := h.proxyA2ARequest(ctx, workspaceID, item.Body, callerID, false) + status, respBody, proxyErr := h.proxyA2ARequest(ctx, workspaceID, item.Body, callerID, false) // 202 Accepted = the dispatch was itself queued again (target still busy). // That's not a failure — the queued item just stays queued naturally on @@ -321,4 +321,89 @@ func (h *WorkspaceHandler) DrainQueueForWorkspace(ctx context.Context, workspace MarkQueueItemCompleted(ctx, item.ID) log.Printf("A2AQueue drain: dispatched %s to workspace %s (attempt=%d)", item.ID, workspaceID, item.Attempts) + + // Stitch the response back to the originating delegation row, if this + // queue item was a delegation. Without this, check_task_status would + // see status='queued' (set by the executeDelegation queued-branch) and + // the LLM would think the work was never done. We embed delegation_id + // in params.message.metadata at Delegate-handler time; pull it out + // here and UPDATE the delegate_result row so the original caller can + // observe the real reply. 
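+	// The key travels at params.message.metadata.delegation_id; an abridged
+	// body shape is {"params":{"message":{"metadata":{"delegation_id":"abc-123"}}}}.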
+ if delegationID := extractDelegationIDFromBody(item.Body); delegationID != "" { + h.stitchDrainResponseToDelegation(ctx, callerID, item.WorkspaceID, delegationID, respBody) + } +} + +// extractDelegationIDFromBody pulls params.message.metadata.delegation_id +// out of an A2A JSON-RPC body. Empty string when absent — drain treats +// that as "this queue item didn't originate from /workspaces/:id/delegate" +// and skips the stitch, so non-delegation queue uses (cross-workspace +// peer-direct A2A) aren't affected. +func extractDelegationIDFromBody(body []byte) string { + var envelope struct { + Params struct { + Message struct { + Metadata struct { + DelegationID string `json:"delegation_id"` + } `json:"metadata"` + } `json:"message"` + } `json:"params"` + } + if err := json.Unmarshal(body, &envelope); err != nil { + return "" + } + return envelope.Params.Message.Metadata.DelegationID +} + +// stitchDrainResponseToDelegation writes the drained response into the +// delegation's existing delegate_result row (created with status='queued' +// by executeDelegation when the proxy first returned queued). This is the +// other half of the loop that closes "queued → completed" so the LLM's +// check_task_status reflects ground truth. +// +// Errors are logged-only — drain is fire-and-forget from Heartbeat, and a +// stitch failure shouldn't block other queued items. The delegation will +// just remain stuck at 'queued' in this case, which is the pre-fix +// behaviour (no regression vs. shipping nothing). +func (h *WorkspaceHandler) stitchDrainResponseToDelegation(ctx context.Context, sourceID, targetID, delegationID string, respBody []byte) { + if sourceID == "" || delegationID == "" { + return + } + responseText := extractResponseText(respBody) + respJSON, _ := json.Marshal(map[string]interface{}{ + "text": responseText, + "delegation_id": delegationID, + }) + res, err := db.DB.ExecContext(ctx, ` + UPDATE activity_logs + SET status = 'completed', + summary = $1, + response_body = $2::jsonb + WHERE workspace_id = $3 + AND method = 'delegate_result' + AND target_id = $4 + AND response_body->>'delegation_id' = $5 + `, "Delegation completed ("+truncate(responseText, 80)+")", string(respJSON), + sourceID, targetID, delegationID) + if err != nil { + log.Printf("A2AQueue drain stitch: update failed for delegation %s: %v", delegationID, err) + return + } + if rows, _ := res.RowsAffected(); rows == 0 { + log.Printf("A2AQueue drain stitch: no delegate_result row for delegation %s (queued-row may not exist yet)", delegationID) + return + } + log.Printf("A2AQueue drain stitch: delegation %s queued → completed (%d chars)", delegationID, len(responseText)) + + // Broadcast DELEGATION_COMPLETE so the canvas chat feed flips the + // "⏸ queued" line to "✓ completed" in real time. Without this the + // transition only surfaces after the user reloads or polls activity. 
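+	// Only the EventEmitter surface (RecordAndBroadcast) is needed for this
+	// fan-out, which is what lets tests drive the path with a capture-only
+	// stub instead of a Redis-backed Broadcaster (#1814).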
+ if h.broadcaster != nil { + h.broadcaster.RecordAndBroadcast(ctx, "DELEGATION_COMPLETE", sourceID, map[string]interface{}{ + "delegation_id": delegationID, + "target_id": targetID, + "response_preview": truncate(responseText, 200), + "via": "queue_drain", + }) + } } diff --git a/workspace-server/internal/handlers/a2a_queue_test.go b/workspace-server/internal/handlers/a2a_queue_test.go index 940f7f2f..57000910 100644 --- a/workspace-server/internal/handlers/a2a_queue_test.go +++ b/workspace-server/internal/handlers/a2a_queue_test.go @@ -80,6 +80,39 @@ func TestExtractIdempotencyKey_emptyOnMissing(t *testing.T) { } } +func TestExtractDelegationIDFromBody(t *testing.T) { + cases := []struct { + name string + body string + want string + }{ + { + name: "delegation body — metadata.delegation_id present", + body: `{"method":"message/send","params":{"message":{"role":"user","messageId":"abc-123","metadata":{"delegation_id":"abc-123"},"parts":[{"type":"text","text":"hi"}]}}}`, + want: "abc-123", + }, + { + name: "non-delegation body — no metadata (peer-direct A2A)", + body: `{"method":"message/send","params":{"message":{"role":"user","messageId":"m-1","parts":[{"type":"text","text":"hi"}]}}}`, + want: "", + }, + { + name: "metadata present but no delegation_id key", + body: `{"params":{"message":{"metadata":{"trace_id":"t-1"}}}}`, + want: "", + }, + {"malformed JSON", `not json`, ""}, + {"empty body", ``, ""}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := extractDelegationIDFromBody([]byte(tc.body)); got != tc.want { + t.Errorf("extractDelegationIDFromBody = %q, want %q", got, tc.want) + } + }) + } +} + // ────────────────────────────────────────────────────────────────────────────── // DrainQueueForWorkspace — nil-safe error extraction regression tests // diff --git a/workspace-server/internal/handlers/activity.go b/workspace-server/internal/handlers/activity.go index 4d98e9fa..ba6d9f0f 100644 --- a/workspace-server/internal/handlers/activity.go +++ b/workspace-server/internal/handlers/activity.go @@ -286,6 +286,37 @@ func (h *ActivityHandler) Notify(c *gin.Context) { "name": wsName, }) + // Persist to activity_logs so the chat history loader restores this + // message after a page reload. Pre-fix, send_message_to_user pushes + // were broadcast-only — survived the WebSocket session but vanished + // when the user refreshed because nothing wrote them to the DB. + // + // Shape chosen to match the existing loader query + // (`type=a2a_receive&source=canvas`): + // - activity_type='a2a_receive' so it joins the same query path + // - source_id=NULL so the canvas-source filter accepts it + // - method='notify' to distinguish from real A2A receives in audits + // - request_body=NULL so the loader doesn't append a duplicate + // "user message" bubble for it + // - response_body={"result": ""} matches extractResponseText's + // simplest branch ({result: string} → take verbatim) + // + // Errors are logged-only — broadcast already succeeded, the user + // sees the message; persistence failure just means the message + // won't survive reload (pre-fix behavior). Don't fail the whole + // notify on a DB hiccup. 
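+	// Hypothetical sketch of the loader-side filter this row has to satisfy
+	// (the real query lives in the chat-history loader, not in this handler):
+	//   WHERE workspace_id = $1 AND activity_type = 'a2a_receive' AND source_id IS NULL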
+ respJSON, _ := json.Marshal(map[string]interface{}{"result": body.Message}) + preview := body.Message + if len(preview) > 80 { + preview = preview[:80] + "…" + } + if _, err := db.DB.ExecContext(c.Request.Context(), ` + INSERT INTO activity_logs (workspace_id, activity_type, method, summary, response_body, status) + VALUES ($1, 'a2a_receive', 'notify', $2, $3::jsonb, 'ok') + `, workspaceID, "Agent message: "+preview, string(respJSON)); err != nil { + log.Printf("Notify: failed to persist message for %s: %v", workspaceID, err) + } + c.JSON(http.StatusOK, gin.H{"status": "sent"}) } @@ -373,7 +404,9 @@ func (h *ActivityHandler) Report(c *gin.Context) { } // LogActivity inserts an activity log and optionally broadcasts via WebSocket. -func LogActivity(ctx context.Context, broadcaster *events.Broadcaster, params ActivityParams) { +// Takes events.EventEmitter (#1814) so callers passing a stub broadcaster +// in tests no longer need to construct the full *events.Broadcaster. +func LogActivity(ctx context.Context, broadcaster events.EventEmitter, params ActivityParams) { reqJSON, reqErr := json.Marshal(params.RequestBody) if reqErr != nil { log.Printf("LogActivity: failed to marshal request_body for %s: %v", params.WorkspaceID, reqErr) diff --git a/workspace-server/internal/handlers/activity_test.go b/workspace-server/internal/handlers/activity_test.go index 1780be3b..9cba5873 100644 --- a/workspace-server/internal/handlers/activity_test.go +++ b/workspace-server/internal/handlers/activity_test.go @@ -217,6 +217,86 @@ func TestActivityReport_RejectsUnknownType(t *testing.T) { } } +func TestNotify_PersistsToActivityLogsForReloadRecovery(t *testing.T) { + // Regression guard for the "responses gone on reload" bug. send_message_to_user + // pushes (which route through Notify) used to be broadcast-only — they + // rendered in the canvas but vanished on page reload because nothing + // wrote them to activity_logs. The chat history loader queries + // `type=a2a_receive&source=canvas`, so the persisted row must: + // - Use activity_type='a2a_receive' (loader's filter) + // - Have source_id NULL (canvas-source filter) + // - Carry the message text in response_body so extractResponseText + // can reconstruct the agent reply on reload + mockDB, mock, _ := sqlmock.New() + defer mockDB.Close() + db.DB = mockDB + + // Workspace existence check + mock.ExpectQuery(`SELECT name FROM workspaces`). + WithArgs("ws-notify"). + WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("DD")) + + // Persistence INSERT — verify shape + mock.ExpectExec(`INSERT INTO activity_logs`). + WithArgs( + "ws-notify", + sqlmock.AnyArg(), // summary + sqlmock.AnyArg(), // response_body JSON + ). 
+ WillReturnResult(sqlmock.NewResult(1, 1)) + + broadcaster := newTestBroadcaster() + handler := NewActivityHandler(broadcaster) + + gin.SetMode(gin.TestMode) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-notify"}} + body := `{"message":"agent reply that arrived after the sync POST timed out"}` + c.Request = httptest.NewRequest("POST", "/workspaces/ws-notify/notify", strings.NewReader(body)) + c.Request.Header.Set("Content-Type", "application/json") + handler.Notify(c) + + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("DB expectations not met: %v", err) + } +} + +func TestNotify_DBFailure_StillBroadcastsAnd200(t *testing.T) { + // Persistence is best-effort — a DB hiccup must NOT block the + // WebSocket push (which the user is already seeing in their open + // canvas). Pre-fix the WS push always succeeded; we don't want + // the new persistence step to regress that path. + mockDB, mock, _ := sqlmock.New() + defer mockDB.Close() + db.DB = mockDB + + mock.ExpectQuery(`SELECT name FROM workspaces`). + WithArgs("ws-x"). + WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("DD")) + mock.ExpectExec(`INSERT INTO activity_logs`). + WillReturnError(fmt.Errorf("simulated db hiccup")) + + broadcaster := newTestBroadcaster() + handler := NewActivityHandler(broadcaster) + + gin.SetMode(gin.TestMode) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-x"}} + body := `{"message":"hi"}` + c.Request = httptest.NewRequest("POST", "/workspaces/ws-x/notify", strings.NewReader(body)) + c.Request.Header.Set("Content-Type", "application/json") + handler.Notify(c) + + if w.Code != http.StatusOK { + t.Errorf("DB failure must not break the response; got %d", w.Code) + } +} + // ==================== Direct unit tests for SessionSearch helpers ==================== // --- parseSessionSearchParams --- diff --git a/workspace-server/internal/handlers/admin_workspace_images.go b/workspace-server/internal/handlers/admin_workspace_images.go new file mode 100644 index 00000000..147bf8ad --- /dev/null +++ b/workspace-server/internal/handlers/admin_workspace_images.go @@ -0,0 +1,227 @@ +package handlers + +import ( + "context" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "os" + "strings" + "time" + + "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/filters" + dockerimage "github.com/docker/docker/api/types/image" + dockerclient "github.com/docker/docker/client" + "github.com/gin-gonic/gin" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" +) + +// AdminWorkspaceImagesHandler serves POST /admin/workspace-images/refresh — the +// production-side end of the runtime CD chain. Operators (or post-publish +// automation) hit this to (1) pull the latest workspace template images from +// GHCR via the Docker SDK and (2) recreate any running ws-* containers so +// they adopt the new image. Without this, a freshly-published runtime sits +// in the registry but containers keep running the old image until the next +// manual restart. +// +// On a SaaS deployment the deploy pipeline already pulls on every release, +// so the pull step is a no-op there; the recreate step is still the way to +// make running workspaces adopt the new image without a full host restart. 
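+//
+// A typical operator invocation looks roughly like this (host, port, and any
+// admin auth are deployment-specific):
+//
+//	curl -X POST "http://localhost:8080/admin/workspace-images/refresh?runtime=claude-code&recreate=false"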
+// +// POST /admin/workspace-images/refresh +// +// ?runtime=claude-code (optional; default = all 8 templates) +// &recreate=true|false (default true; false = pull only) +// +// Returns JSON {pulled: [...], failed: [...], recreated: [...]} +type AdminWorkspaceImagesHandler struct { + docker *dockerclient.Client +} + +func NewAdminWorkspaceImagesHandler(docker *dockerclient.Client) *AdminWorkspaceImagesHandler { + return &AdminWorkspaceImagesHandler{docker: docker} +} + +// allRuntimes is the canonical list mirroring docs/workspace-runtime-package.md. +// Update both when a new template is added. +var allRuntimes = []string{ + "claude-code", "langgraph", "crewai", "autogen", + "deepagents", "hermes", "gemini-cli", "openclaw", +} + +type refreshResult struct { + Pulled []string `json:"pulled"` + Failed []string `json:"failed"` + Recreated []string `json:"recreated"` +} + +// ghcrAuthHeader returns the base64-encoded JSON auth payload Docker's +// ImagePull expects in PullOptions.RegistryAuth, or empty string when no +// GHCR_USER/GHCR_TOKEN env is set (lets public images pull through). +// +// The Docker SDK doesn't read ~/.docker/config.json — every authenticated +// pull needs an explicit RegistryAuth string. Format per the Docker +// engine API: {"username":"…","password":"…","serveraddress":"ghcr.io"} +// → base64-encoded JSON with no trailing padding stripped (engine handles +// either form). +func ghcrAuthHeader() string { + user := strings.TrimSpace(os.Getenv("GHCR_USER")) + token := strings.TrimSpace(os.Getenv("GHCR_TOKEN")) + if user == "" || token == "" { + return "" + } + payload := map[string]string{ + "username": user, + "password": token, + "serveraddress": "ghcr.io", + } + js, err := json.Marshal(payload) + if err != nil { + // Should be unreachable for a static map[string]string. Log so a + // future contributor adding a non-marshallable field notices. + log.Printf("workspace-images: failed to marshal GHCR auth: %v", err) + return "" + } + return base64.URLEncoding.EncodeToString(js) +} + +func (h *AdminWorkspaceImagesHandler) Refresh(c *gin.Context) { + runtimes := allRuntimes + if r := c.Query("runtime"); r != "" { + // Accept a single runtime; reject anything not in the canonical list + // so a typo doesn't silently no-op. + found := false + for _, known := range allRuntimes { + if known == r { + found = true + break + } + } + if !found { + c.JSON(http.StatusBadRequest, gin.H{ + "error": fmt.Sprintf("unknown runtime: %s", r), + "known_runtimes": allRuntimes, + }) + return + } + runtimes = []string{r} + } + recreate := c.DefaultQuery("recreate", "true") == "true" + + res := refreshResult{Pulled: []string{}, Failed: []string{}, Recreated: []string{}} + auth := ghcrAuthHeader() + + // 1. Pull each template image via the Docker SDK. Soft-fail per-runtime + // so one missing image (e.g. unpublished template) doesn't abort + // the others. Each pull's progress stream is drained to completion + // — the engine treats early-close as "abandon", leaving partial + // layers around with no reference. 
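+	// Each line of the pull stream is a JSON progress message, roughly of the
+	// shape {"status":"Pulling from molecule-ai/workspace-template-claude-code","id":"latest"};
+	// nothing in this handler renders it, so the drain below simply discards it.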
+ pullCtx, cancel := context.WithTimeout(c.Request.Context(), 5*time.Minute) + defer cancel() + for _, rt := range runtimes { + image := fmt.Sprintf("ghcr.io/molecule-ai/workspace-template-%s:latest", rt) + opts := dockerimage.PullOptions{Platform: provisioner.DefaultImagePlatform()} + if auth != "" { + opts.RegistryAuth = auth + } + rc, err := h.docker.ImagePull(pullCtx, image, opts) + if err != nil { + log.Printf("workspace-images/refresh: pull %s failed: %v", rt, err) + res.Failed = append(res.Failed, rt) + continue + } + // Drain to completion. We discard progress payload because no + // caller renders it; the platform log already records pulled/failed + // per runtime. If a future caller wants live progress, decode the + // JSON-line stream into events here. + if _, err := io.Copy(io.Discard, rc); err != nil { + rc.Close() + log.Printf("workspace-images/refresh: drain %s failed: %v", rt, err) + res.Failed = append(res.Failed, rt) + continue + } + rc.Close() + res.Pulled = append(res.Pulled, rt) + } + + if !recreate { + c.JSON(http.StatusOK, res) + return + } + + // 2. Find ws-* containers running an image we just pulled. Recreate + // them — kill+remove and let the platform's normal provisioning + // flow re-create on next canvas interaction. + listCtx, listCancel := context.WithTimeout(c.Request.Context(), 30*time.Second) + defer listCancel() + containers, err := h.docker.ContainerList(listCtx, container.ListOptions{ + All: true, + Filters: filters.NewArgs(filters.Arg("name", "ws-")), + }) + if err != nil { + log.Printf("workspace-images/refresh: container list failed: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "container list failed", "partial_result": res}) + return + } + + pulledSet := map[string]struct{}{} + for _, rt := range res.Pulled { + pulledSet[rt] = struct{}{} + } + for _, ctr := range containers { + // ContainerList's ctr.Image is the *resolved digest* (sha256:…), + // not the human-readable tag. Use ContainerInspect to get the + // original Config.Image (e.g. "ghcr.io/molecule-ai/workspace- + // template-claude-code:latest") so we can match against the + // pulled-runtime set. The cost is one extra round-trip per + // ws-* container — there are at most 8 typically, so this is + // well below any UX threshold. + inspectCtx, inspectCancel := context.WithTimeout(c.Request.Context(), 10*time.Second) + full, err := h.docker.ContainerInspect(inspectCtx, ctr.ID) + inspectCancel() + if err != nil { + log.Printf("workspace-images/refresh: inspect %s failed: %v", ctr.ID[:12], err) + continue + } + imageRef := "" + if full.Config != nil { + imageRef = full.Config.Image + } + matched := "" + for rt := range pulledSet { + if strings.Contains(imageRef, "workspace-template-"+rt) { + matched = rt + break + } + } + if matched == "" { + continue + } + name := strings.TrimPrefix(ctr.Names[0], "/") + // Remove with force — the workspace will re-provision on the next + // canvas interaction. This drops in-flight conversations on the + // removed container; document via the response so callers can + // schedule the refresh during a quiet window. 
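+		// SDK equivalent of `docker rm -f <name>`: the same step that
+		// scripts/refresh-workspace-images.sh performs on a local host.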
+ rmCtx, rmCancel := context.WithTimeout(c.Request.Context(), 30*time.Second) + err = h.docker.ContainerRemove(rmCtx, ctr.ID, container.RemoveOptions{Force: true}) + rmCancel() + if err != nil { + log.Printf("workspace-images/refresh: remove %s failed: %v", name, err) + continue + } + res.Recreated = append(res.Recreated, name) + } + + authStatus := "no GHCR auth (public images only)" + if auth != "" { + authStatus = "GHCR_USER/GHCR_TOKEN auth" + } + log.Printf("workspace-images/refresh: pulled=%d failed=%d recreated=%d (%s)", + len(res.Pulled), len(res.Failed), len(res.Recreated), authStatus) + c.JSON(http.StatusOK, res) +} diff --git a/workspace-server/internal/handlers/admin_workspace_images_test.go b/workspace-server/internal/handlers/admin_workspace_images_test.go new file mode 100644 index 00000000..26e61f95 --- /dev/null +++ b/workspace-server/internal/handlers/admin_workspace_images_test.go @@ -0,0 +1,73 @@ +package handlers + +import ( + "encoding/base64" + "encoding/json" + "testing" +) + +func TestGHCRAuthHeader_NoEnvReturnsEmpty(t *testing.T) { + t.Setenv("GHCR_USER", "") + t.Setenv("GHCR_TOKEN", "") + if got := ghcrAuthHeader(); got != "" { + t.Errorf("expected empty (no auth → public-only), got %q", got) + } +} + +func TestGHCRAuthHeader_PartialEnvReturnsEmpty(t *testing.T) { + // Both must be set — defensive against half-configured env. + t.Setenv("GHCR_USER", "alice") + t.Setenv("GHCR_TOKEN", "") + if got := ghcrAuthHeader(); got != "" { + t.Errorf("user-only env should disable auth, got %q", got) + } + t.Setenv("GHCR_USER", "") + t.Setenv("GHCR_TOKEN", "fake-tok-xxx") + if got := ghcrAuthHeader(); got != "" { + t.Errorf("token-only env should disable auth, got %q", got) + } +} + +func TestGHCRAuthHeader_EncodesDockerEnginePayload(t *testing.T) { + t.Setenv("GHCR_USER", "alice") + t.Setenv("GHCR_TOKEN", "fake-tok-value") + got := ghcrAuthHeader() + if got == "" { + t.Fatal("expected non-empty auth header") + } + raw, err := base64.URLEncoding.DecodeString(got) + if err != nil { + t.Fatalf("auth header is not valid base64-url: %v", err) + } + var payload map[string]string + if err := json.Unmarshal(raw, &payload); err != nil { + t.Fatalf("decoded auth is not valid JSON: %v (raw=%s)", err, raw) + } + if payload["username"] != "alice" { + t.Errorf("username: got %q, want alice", payload["username"]) + } + if payload["password"] != "fake-tok-value" { + t.Errorf("password: got %q, want fake-tok-value", payload["password"]) + } + if payload["serveraddress"] != "ghcr.io" { + t.Errorf("serveraddress: got %q, want ghcr.io", payload["serveraddress"]) + } +} + +func TestGHCRAuthHeader_TrimsWhitespace(t *testing.T) { + // .env lines often have trailing newlines or accidental spaces. Without + // trimming, a stray space would produce an auth payload the engine + // rejects with a confusing 401. 
+ t.Setenv("GHCR_USER", " alice ") + t.Setenv("GHCR_TOKEN", "\tfake-tok-value\n") + got := ghcrAuthHeader() + raw, _ := base64.URLEncoding.DecodeString(got) + var payload map[string]string + _ = json.Unmarshal(raw, &payload) + if payload["username"] != "alice" { + t.Errorf("username not trimmed: got %q", payload["username"]) + } + if payload["password"] != "fake-tok-value" { + t.Errorf("password not trimmed: got %q", payload["password"]) + } +} diff --git a/workspace-server/internal/handlers/delegation.go b/workspace-server/internal/handlers/delegation.go index 8c0d681f..59da198c 100644 --- a/workspace-server/internal/handlers/delegation.go +++ b/workspace-server/internal/handlers/delegation.go @@ -78,13 +78,21 @@ func (h *DelegationHandler) Delegate(c *gin.Context) { // reason (logged); we still dispatch the A2A request and surface the // warning in the response. - // Build A2A payload + // Build A2A payload. Embedding delegation_id in metadata gives the + // queue drain path a way to look up the originating delegation row + // when stitching the response back (issue: previously the drain + // dispatched successfully but discarded the response, so + // check_task_status returned status='queued' forever even after a + // real reply landed). messageId mirrors delegation_id so the + // platform's idempotency-key extraction also keys off the same id. a2aBody, _ := json.Marshal(map[string]interface{}{ "method": "message/send", "params": map[string]interface{}{ "message": map[string]interface{}{ - "role": "user", - "parts": []map[string]interface{}{{"type": "text", "text": body.Task}}, + "role": "user", + "messageId": delegationID, + "parts": []map[string]interface{}{{"type": "text", "text": body.Task}}, + "metadata": map[string]interface{}{"delegation_id": delegationID}, }, }, }) @@ -284,6 +292,40 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s return } + // 202 + {queued: true} means the target was busy and the proxy + // enqueued the request for the next drain tick — NOT a completion. + // Treat it as such: write a clean 'queued' activity row with no + // JSON-as-text leakage into the summary, broadcast a status update, + // and return. The eventual drain doesn't (yet) feed a result back + // into this delegation, so callers polling check_task_status will + // see status='queued' and know to retry instead of believing the + // queued JSON is the agent's reply. Fixes the chat-leak where the + // LLM echoed "Delegation completed (workspace agent busy ...)" to + // the user. + if status == http.StatusAccepted && isQueuedProxyResponse(respBody) { + log.Printf("Delegation %s: target %s busy — queued for drain", delegationID, targetID) + h.updateDelegationStatus(sourceID, delegationID, "queued", "") + // Store delegation_id in response_body so DrainQueueForWorkspace's + // stitch step can find this row by JSON-path key after the queued + // dispatch eventually succeeds. Without the key, the drain finds + // the row by (workspace_id, target_id, method) but can't tell + // multiple-queued-delegations-to-same-target apart. 
+ queuedJSON, _ := json.Marshal(map[string]interface{}{ + "delegation_id": delegationID, + "queued": true, + }) + if _, err := db.DB.ExecContext(ctx, ` + INSERT INTO activity_logs (workspace_id, activity_type, method, source_id, target_id, summary, response_body, status) + VALUES ($1, 'delegation', 'delegate_result', $2, $3, $4, $5::jsonb, 'queued') + `, sourceID, sourceID, targetID, "Delegation queued — target at capacity", string(queuedJSON)); err != nil { + log.Printf("Delegation %s: failed to insert queued log: %v", delegationID, err) + } + h.broadcaster.RecordAndBroadcast(ctx, "DELEGATION_STATUS", sourceID, map[string]interface{}{ + "delegation_id": delegationID, "target_id": targetID, "status": "queued", + }) + return + } + // A2A returned 200 — target received and processed the task // Status: dispatched → received → completed (we don't have a separate "received" signal from the target yet) responseText := extractResponseText(respBody) @@ -517,6 +559,21 @@ func isTransientProxyError(err *proxyA2AError) bool { return false } +// isQueuedProxyResponse reports whether the proxy returned a body shaped like +// `{"queued": true, "queue_id": ..., "queue_depth": ..., "message": ...}` — +// the busy-target enqueue path in a2a_proxy_helpers.go. Caller checks this +// alongside HTTP 202 to distinguish a successful agent reply from a deferred +// dispatch; without the distinction we'd write the queued-message JSON into +// the delegation result row and the LLM would surface it as agent output. +func isQueuedProxyResponse(body []byte) bool { + var resp map[string]interface{} + if json.Unmarshal(body, &resp) != nil { + return false + } + queued, _ := resp["queued"].(bool) + return queued +} + func extractResponseText(body []byte) string { var resp map[string]interface{} if json.Unmarshal(body, &resp) != nil { diff --git a/workspace-server/internal/handlers/delegation_test.go b/workspace-server/internal/handlers/delegation_test.go index caa5118d..21cc3a90 100644 --- a/workspace-server/internal/handlers/delegation_test.go +++ b/workspace-server/internal/handlers/delegation_test.go @@ -376,6 +376,39 @@ func TestIsTransientProxyError_RetriesOnRestartRaceStatuses(t *testing.T) { } } +func TestIsQueuedProxyResponse(t *testing.T) { + // Regression guard for the chat-leak bug: when the proxy returns + // 202 with a queued-shape body, executeDelegation must classify it + // as "queued" — not "completed". Mis-classifying it causes the + // queued JSON to land in activity_logs.summary, which the LLM then + // echoes verbatim into the agent chat as + // "Delegation completed: Delegation completed (workspace agent + // busy — request queued, will dispatch...)". 
+ cases := []struct { + name string + body string + want bool + }{ + { + name: "real proxy busy-enqueue body", + body: `{"queued":true,"queue_id":"d0993390-5f5a-4f5d-90a2-66639e53e3c9","queue_depth":1,"message":"workspace agent busy — request queued, will dispatch when capacity available"}`, + want: true, + }, + {"queued false explicitly", `{"queued":false}`, false}, + {"queued field absent (real A2A reply)", `{"jsonrpc":"2.0","id":"1","result":{"kind":"message","parts":[{"kind":"text","text":"hi"}]}}`, false}, + {"non-bool queued value (defensive)", `{"queued":"true"}`, false}, + {"malformed JSON", `not-json`, false}, + {"empty body", ``, false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := isQueuedProxyResponse([]byte(tc.body)); got != tc.want { + t.Errorf("isQueuedProxyResponse(%q) = %v, want %v", tc.body, got, tc.want) + } + }) + } +} + func TestDelegationRetryDelay_IsSaneWindow(t *testing.T) { // Regression guard: the retry delay must be long enough for the // reactive URL refresh in proxyA2ARequest to kick in (which involves diff --git a/workspace-server/internal/handlers/discovery.go b/workspace-server/internal/handlers/discovery.go index d8edf814..79315016 100644 --- a/workspace-server/internal/handlers/discovery.go +++ b/workspace-server/internal/handlers/discovery.go @@ -102,6 +102,19 @@ func discoverHostPeer(ctx context.Context, c *gin.Context, targetID string) { return } + // #1484 SSRF defense-in-depth: the URL came from the workspaces table + // without any per-write validation (workspace runtimes register their + // own URLs via /registry/register, and a misbehaving / compromised + // runtime could register a 169.254.169.254 metadata URL). Validate + // before handing it to the caller, who might dispatch HTTP against it. + // Currently gated by the bearer-required Discover handler, but this + // guard makes discoverHostPeer safe regardless of upstream auth shape. + if err := isSafeURL(url.String); err != nil { + log.Printf("Discovery: rejecting unsafe registered URL for %s (#1484): %v", resolvedID, err) + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace URL failed safety check", "status": status}) + return + } + db.CacheURL(ctx, resolvedID, url.String) c.JSON(http.StatusOK, gin.H{ "id": resolvedID, @@ -172,11 +185,31 @@ func writeExternalWorkspaceURL(ctx context.Context, c *gin.Context, callerID, ta outURL = strings.Replace(outURL, "127.0.0.1", "host.docker.internal", 1) outURL = strings.Replace(outURL, "localhost", "host.docker.internal", 1) } + + // #1484 SSRF defense-in-depth — same rationale as discoverHostPeer. + // We validate the post-rewrite URL because the rewrite changes which + // host the caller would dispatch against (host.docker.internal is + // only reachable inside a docker network; isSafeURL accepts it but + // blocks a metadata IP that survived the rewrite untouched). + if err := isSafeURL(outURL); err != nil { + log.Printf("Discovery: rejecting unsafe external workspace URL for %s (#1484): %v", targetID, err) + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace URL failed safety check"}) + return true + } + c.JSON(http.StatusOK, gin.H{"id": targetID, "url": outURL, "name": wsName}) return true } // Peers handles GET /registry/:id/peers +// +// Optional ``?q=`` filters the result by case-insensitive +// substring match against ``name`` or ``role`` (#1038). 
Filtering is done +// in Go after the DB read — keeps the SQL identical to the no-filter path +// (no injection risk, no DB-driver collation surprises) at the cost of +// loading the unfiltered set first. Acceptable because the peer set is +// always bounded by the small fanout of a single workspace's parent + +// children + siblings (typically <50 rows). func (h *DiscoveryHandler) Peers(c *gin.Context) { workspaceID := c.Param("id") ctx := c.Request.Context() @@ -241,12 +274,34 @@ func (h *DiscoveryHandler) Peers(c *gin.Context) { peers = append(peers, parent...) } + peers = filterPeersByQuery(peers, c.Query("q")) + if peers == nil { peers = make([]map[string]interface{}, 0) } c.JSON(http.StatusOK, peers) } +// filterPeersByQuery returns peers whose name or role case-insensitively +// contains q. Whitespace-trimmed empty q is a no-op (returns input unchanged). +func filterPeersByQuery(peers []map[string]interface{}, q string) []map[string]interface{} { + q = strings.TrimSpace(q) + if q == "" { + return peers + } + needle := strings.ToLower(q) + out := make([]map[string]interface{}, 0, len(peers)) + for _, p := range peers { + name := p["name"].(string) + role := p["role"].(string) + if strings.Contains(strings.ToLower(name), needle) || + strings.Contains(strings.ToLower(role), needle) { + out = append(out, p) + } + } + return out +} + // queryPeerMaps returns clean JSON-serializable maps instead of Workspace structs. func queryPeerMaps(query string, args ...interface{}) ([]map[string]interface{}, error) { rows, err := db.DB.Query(query, args...) diff --git a/workspace-server/internal/handlers/discovery_test.go b/workspace-server/internal/handlers/discovery_test.go index 7c90005e..892a1f0a 100644 --- a/workspace-server/internal/handlers/discovery_test.go +++ b/workspace-server/internal/handlers/discovery_test.go @@ -7,6 +7,7 @@ import ( "encoding/json" "net/http" "net/http/httptest" + "strings" "testing" "github.com/DATA-DOG/go-sqlmock" @@ -258,6 +259,149 @@ func TestPeers_RootWorkspace_NoPeers(t *testing.T) { } } +// ==================== Peers — ?q= filter (#1038) ==================== + +// peersFilterFixture mocks the 4 SQL reads (parent_id lookup + siblings + +// children + parent) with a known 3-peer set so each q-filter test can +// focus on the post-fetch substring-match behaviour. Returns the handler +// and the live mock so callers can assert ExpectationsWereMet at the end. +func peersFilterFixture(t *testing.T) (*DiscoveryHandler, sqlmock.Sqlmock) { + t.Helper() + mock := setupTestDB(t) + setupTestRedis(t) + + mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id ="). + WithArgs("ws-self"). + WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow("ws-pm")) + + cols := []string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"} + + mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2"). + WithArgs("ws-pm", "ws-self"). + WillReturnRows(sqlmock.NewRows(cols). + AddRow("ws-alpha", "Alpha Researcher", "researcher", 1, "online", []byte("null"), "http://a", "ws-pm", 0). + AddRow("ws-beta", "Beta Designer", "designer", 1, "online", []byte("null"), "http://b", "ws-pm", 0)) + + mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.status"). + WithArgs("ws-self"). + WillReturnRows(sqlmock.NewRows(cols)) + + mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.status"). + WithArgs("ws-pm"). + WillReturnRows(sqlmock.NewRows(cols). 
+ AddRow("ws-pm", "PM Workspace", "manager", 2, "online", []byte("null"), "http://pm", nil, 1)) + + return NewDiscoveryHandler(), mock +} + +// runPeersWithQuery invokes Peers and returns BOTH the parsed peers and +// the raw response body. The raw body is needed by TestPeers_Q_NoMatches +// to distinguish JSON `[]` (intended) from `null` (regression of the +// nil-guard at line 254-256) — once unmarshalled, both collapse to +// len==0 and re-marshal to `[]`, so checking only the parsed value is +// tautological. +func runPeersWithQuery(t *testing.T, handler *DiscoveryHandler, q string) ([]map[string]interface{}, []byte) { + t.Helper() + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-self"}} + url := "/registry/ws-self/peers" + if q != "" { + url += "?q=" + q + } + c.Request = httptest.NewRequest("GET", url, nil) + handler.Peers(c) + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + body := w.Body.Bytes() + var peers []map[string]interface{} + if err := json.Unmarshal(body, &peers); err != nil { + t.Fatalf("parse response: %v", err) + } + return peers, body +} + +// peerIDSet returns the set of peer ids — order-independent comparison +// avoids fragile peers[0]["id"] asserts that would silently mask a future +// sort/order change. +func peerIDSet(peers []map[string]interface{}) map[string]struct{} { + out := make(map[string]struct{}, len(peers)) + for _, p := range peers { + out[p["id"].(string)] = struct{}{} + } + return out +} + +// TestPeers_QFilter covers the rule classifier — append-order +// independent (uses set membership) so a future sort regression on the +// production code can't slip through. NoMatches has its own raw-body +// check (see TestPeers_Q_NoMatches_RawBodyIsArrayNotNull below) because +// the `[]` vs `null` distinction collapses after json.Unmarshal. +func TestPeers_QFilter(t *testing.T) { + cases := []struct { + name string + q string + wantIDs []string + }{ + {"no-q returns all", "", []string{"ws-alpha", "ws-beta", "ws-pm"}}, + {"name match", "alpha", []string{"ws-alpha"}}, + {"name match case-insensitive", "ALPHA", []string{"ws-alpha"}}, + {"role match", "design", []string{"ws-beta"}}, + {"no matches", "nonexistent", nil}, + {"whitespace-only is no-op", "%20%20", []string{"ws-alpha", "ws-beta", "ws-pm"}}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + handler, mock := peersFilterFixture(t) + peers, _ := runPeersWithQuery(t, handler, tc.q) + + got := peerIDSet(peers) + want := make(map[string]struct{}, len(tc.wantIDs)) + for _, id := range tc.wantIDs { + want[id] = struct{}{} + } + if len(got) != len(want) { + t.Fatalf("len: got %d %v, want %d %v", len(got), keysOf(got), len(want), tc.wantIDs) + } + for id := range want { + if _, ok := got[id]; !ok { + t.Errorf("missing id %q (got %v, want %v)", id, keysOf(got), tc.wantIDs) + } + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } + }) + } +} + +// TestPeers_Q_NoMatches_RawBodyIsArrayNotNull verifies the `peers = make(...)` +// nil-guard at the end of Peers — when filtering reduces the slice to +// non-nil-but-empty AND the original was nil, JSON must be `[]` not `null`. +// This is the assertion the previous TestPeers_Q_NoMatches falsely claimed +// to make: re-encoding an unmarshalled []map collapses null and [] both +// to []. The fix here checks the recorder body bytes BEFORE unmarshal. 
+func TestPeers_Q_NoMatches_RawBodyIsArrayNotNull(t *testing.T) { + handler, mock := peersFilterFixture(t) + _, body := runPeersWithQuery(t, handler, "nonexistent") + got := strings.TrimSpace(string(body)) + if got != "[]" { + t.Errorf("raw body: got %q, want []", got) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +func keysOf(m map[string]struct{}) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + return out +} + // ==================== CheckAccess ==================== func TestCheckAccess_BadJSON(t *testing.T) { @@ -527,6 +671,94 @@ func TestWriteExternalWorkspaceURL_RewritesLocalhostForDockerCaller(t *testing.T } } +// --- #1484 SSRF defense-in-depth regression tests --- + +// TestDiscoverHostPeer_RejectsMetadataIPURL pins the #1484 guard: +// even though discoverHostPeer is currently gated by a bearer-required +// Discover handler, the function MUST refuse to hand back a URL that +// resolves into the cloud-metadata range. setupTestDB disables SSRF +// for normal tests, so we re-enable it here for the duration of the +// case and provide a literal IP so the check doesn't depend on DNS. +func TestDiscoverHostPeer_RejectsMetadataIPURL(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + restoreSSRF := setSSRFCheckForTest(true) + t.Cleanup(restoreSSRF) + + mock.ExpectQuery(`SELECT url, status, forwarded_to FROM workspaces WHERE id =`). + WithArgs("ws-bad"). + WillReturnRows(sqlmock.NewRows([]string{"url", "status", "forwarded_to"}). + AddRow("http://169.254.169.254/latest/meta-data/", "online", nil)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/x", nil) + + discoverHostPeer(context.Background(), c, "ws-bad") + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503 for metadata-IP URL, got %d: %s", w.Code, w.Body.String()) + } + if !strings.Contains(w.Body.String(), "safety check") { + t.Errorf("response should mention 'safety check', got %s", w.Body.String()) + } +} + +// TestDiscoverHostPeer_AcceptsPublicURL is the positive counterpart — +// a routable hostname (literal public-range IP, no DNS dependency) +// passes through the new guard and returns 200. Without it, the +// rejection test above could pass by falsely failing every URL. +func TestDiscoverHostPeer_AcceptsPublicURL(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + restoreSSRF := setSSRFCheckForTest(true) + t.Cleanup(restoreSSRF) + + mock.ExpectQuery(`SELECT url, status, forwarded_to FROM workspaces WHERE id =`). + WithArgs("ws-good"). + WillReturnRows(sqlmock.NewRows([]string{"url", "status", "forwarded_to"}). + AddRow("http://8.8.8.8/a2a", "online", nil)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/x", nil) + + discoverHostPeer(context.Background(), c, "ws-good") + if w.Code != http.StatusOK { + t.Errorf("expected 200 for public-IP URL, got %d: %s", w.Code, w.Body.String()) + } +} + +// TestWriteExternalWorkspaceURL_RejectsMetadataIPURL is the parallel +// guard for the external-runtime path. Same #1484 rationale as the +// host-peer test above; covers writeExternalWorkspaceURL specifically. 
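// ---- Editor's aside (illustrative sketch, not part of the patch) ----
// Approximate shape of the guard these #1484 tests re-enable. The real
// isSafeURL in ssrf.go does considerably more (hostname resolution, SaaS/dev
// relaxation, TEST-NET/CGNAT handling); this stripped-down sketch shows only
// the literal-IP rejection that the metadata-URL cases above and below rely
// on. The function name is invented; the stdlib calls (net/url, net) are real.
func rejectUnroutableLiteralIP(raw string) error {
    u, err := url.Parse(raw)
    if err != nil {
        return fmt.Errorf("unparseable URL: %w", err)
    }
    ip := net.ParseIP(u.Hostname())
    if ip == nil {
        return nil // hostname rather than an IP literal: the real guard resolves it first
    }
    if ip.IsLinkLocalUnicast() || ip.IsLoopback() || ip.IsUnspecified() {
        // 169.254.169.254 (cloud metadata) falls in the link-local range.
        return fmt.Errorf("address %s is not publicly routable", ip)
    }
    return nil
}
// ----------------------------------------------------------------------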
+func TestWriteExternalWorkspaceURL_RejectsMetadataIPURL(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + restoreSSRF := setSSRFCheckForTest(true) + t.Cleanup(restoreSSRF) + + mock.ExpectQuery(`SELECT COALESCE\(url,''\) FROM workspaces WHERE id =`). + WithArgs("ws-ext"). + WillReturnRows(sqlmock.NewRows([]string{"url"}). + AddRow("http://169.254.169.254/computeMetadata/v1/")) + // callerRuntime lookup happens before isSafeURL — must mock it. + mock.ExpectQuery(`SELECT COALESCE\(runtime,'langgraph'\) FROM workspaces WHERE id =`). + WithArgs("ws-caller"). + WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("langgraph")) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/x", nil) + + handled := writeExternalWorkspaceURL(context.Background(), c, "ws-caller", "ws-ext", "Bad WS") + if !handled { + t.Fatal("expected handled=true (the function did write a 503)") + } + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503 for metadata-IP URL, got %d: %s", w.Code, w.Body.String()) + } +} + // --- discoverHostPeer smoke (currently unreachable via Discover) --- func TestDiscoverHostPeer_Smoke_CacheHit(t *testing.T) { diff --git a/workspace-server/internal/handlers/runtime_provision_timeouts.go b/workspace-server/internal/handlers/runtime_provision_timeouts.go new file mode 100644 index 00000000..9e922fb9 --- /dev/null +++ b/workspace-server/internal/handlers/runtime_provision_timeouts.go @@ -0,0 +1,102 @@ +package handlers + +import ( + "log" + "os" + "path/filepath" + "sync" + + "gopkg.in/yaml.v3" +) + +// runtimeProvisionTimeoutsCache holds the per-runtime provision-timeout +// values declared in template config.yaml manifests (#2054 phase 2). +// Lazy-init so the first workspace API request after process start pays +// the read cost (a few KB of yaml across ~50 templates) and every +// subsequent one is a map lookup. +// +// Cache lifetime = process lifetime. Templates only change on container +// rebuild + workspace-server restart, which already invalidates the +// in-memory state. A future template-hot-reload feature would need to +// refresh this cache; today there is no such hook. +// +// NOT SAFE for package-level reuse — sync.Once cannot be reset, so a +// shared singleton would lock tests out of trying multiple template +// fixtures. Tests construct fresh struct values; production code holds +// one per WorkspaceHandler instance. +type runtimeProvisionTimeoutsCache struct { + once sync.Once + m map[string]int // runtime → seconds +} + +func (c *runtimeProvisionTimeoutsCache) get(configsDir string, runtime string) int { + c.once.Do(func() { + c.m = loadRuntimeProvisionTimeouts(configsDir) + }) + return c.m[runtime] +} + +// walkTemplateConfigs invokes fn(id, configBytes) for every immediate +// subdirectory of configsDir that has a readable config.yaml. Bad reads +// and empty dirs are silently skipped — the caller decides whether +// missing fields warrant the slot. A bad configsDir logs once and +// returns without invoking fn, matching the "start clean with no +// templates" semantics. +// +// Shared between the templates.List handler (which decodes the full +// templateSummary) and loadRuntimeProvisionTimeouts (which decodes the +// narrow runtime-timeout subset). Centralising the walk means a future +// template-discovery rule (subdir naming convention, README sentinel, +// etc.) lands in one place. 
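// ---- Editor's aside (illustrative sketch, not part of the patch) ----
// Usage shape for any future narrow-decode consumer of walkTemplateConfigs,
// the same pattern loadRuntimeProvisionTimeouts follows below. The function
// name and the decoded field are invented for illustration.
func collectDeclaredRuntimes(configsDir string) map[string]bool {
    out := map[string]bool{}
    walkTemplateConfigs(configsDir, func(_ string, data []byte) {
        var raw struct {
            Runtime string `yaml:"runtime"`
        }
        if yaml.Unmarshal(data, &raw) == nil && raw.Runtime != "" {
            out[raw.Runtime] = true
        }
    })
    return out
}
// ----------------------------------------------------------------------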
+func walkTemplateConfigs(configsDir string, fn func(id string, configBytes []byte)) { + entries, err := os.ReadDir(configsDir) + if err != nil { + log.Printf("walkTemplateConfigs: read configsDir %s: %v", configsDir, err) + return + } + for _, e := range entries { + if !e.IsDir() { + continue + } + data, err := os.ReadFile(filepath.Join(configsDir, e.Name(), "config.yaml")) + if err != nil { + continue + } + fn(e.Name(), data) + } +} + +// loadRuntimeProvisionTimeouts returns a map of runtime → seconds for +// templates that declared `runtime_config.provision_timeout_seconds`. +// +// Templates without the field aren't represented (lookup returns zero, +// which the caller treats as "fall through to canvas runtime profile"). +// +// Multiple templates with the same runtime: take the MAX timeout — a +// slow template's threshold should win over a fast template's so users +// of either template see a true-positive timeout signal rather than a +// false alarm. Same-runtime divergence is rare in practice (typically +// one canonical template-{runtime} per runtime) but the rule is the +// safer default. +func loadRuntimeProvisionTimeouts(configsDir string) map[string]int { + out := map[string]int{} + walkTemplateConfigs(configsDir, func(_ string, data []byte) { + var raw struct { + Runtime string `yaml:"runtime"` + RuntimeConfig struct { + ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"` + } `yaml:"runtime_config"` + } + if err := yaml.Unmarshal(data, &raw); err != nil { + return + } + secs := raw.RuntimeConfig.ProvisionTimeoutSeconds + if secs <= 0 || raw.Runtime == "" { + return + } + if existing, ok := out[raw.Runtime]; !ok || secs > existing { + out[raw.Runtime] = secs + } + }) + return out +} diff --git a/workspace-server/internal/handlers/runtime_provision_timeouts_test.go b/workspace-server/internal/handlers/runtime_provision_timeouts_test.go new file mode 100644 index 00000000..a6cd3b52 --- /dev/null +++ b/workspace-server/internal/handlers/runtime_provision_timeouts_test.go @@ -0,0 +1,128 @@ +package handlers + +import ( + "os" + "path/filepath" + "testing" +) + +// writeTemplate is a tiny test fixture: drop a config.yaml under +// tmp//config.yaml with the given content. Mirrors the real +// configsDir layout (one subdir per template, each with its own +// config.yaml). +func writeTemplate(t *testing.T, dir, name, content string) { + t.Helper() + p := filepath.Join(dir, name) + if err := os.MkdirAll(p, 0o755); err != nil { + t.Fatalf("mkdir %s: %v", p, err) + } + if err := os.WriteFile(filepath.Join(p, "config.yaml"), []byte(content), 0o644); err != nil { + t.Fatalf("write config.yaml: %v", err) + } +} + +func TestLoadRuntimeProvisionTimeouts_HappyPath(t *testing.T) { + dir := t.TempDir() + writeTemplate(t, dir, "template-hermes", ` +name: Hermes +runtime: hermes +runtime_config: + provision_timeout_seconds: 720 +`) + writeTemplate(t, dir, "template-claude-code", ` +name: Claude +runtime: claude-code +runtime_config: + model: anthropic:claude-opus +`) + got := loadRuntimeProvisionTimeouts(dir) + if got["hermes"] != 720 { + t.Errorf("hermes: got %d, want 720", got["hermes"]) + } + // claude-code didn't declare a timeout — must not appear in the map + // (zero-value lookup is the no-override signal). 
+ if _, ok := got["claude-code"]; ok { + t.Errorf("claude-code: present without declaration: %d", got["claude-code"]) + } +} + +func TestLoadRuntimeProvisionTimeouts_MaxOnDuplicateRuntime(t *testing.T) { + dir := t.TempDir() + writeTemplate(t, dir, "template-hermes-fast", ` +runtime: hermes +runtime_config: + provision_timeout_seconds: 300 +`) + writeTemplate(t, dir, "template-hermes-slow", ` +runtime: hermes +runtime_config: + provision_timeout_seconds: 900 +`) + got := loadRuntimeProvisionTimeouts(dir) + // Max wins so the slowest template's threshold doesn't false-alarm + // when both templates use the same runtime. + if got["hermes"] != 900 { + t.Errorf("max-on-duplicate: got %d, want 900", got["hermes"]) + } +} + +func TestLoadRuntimeProvisionTimeouts_SkipsBadInputs(t *testing.T) { + dir := t.TempDir() + // Missing runtime field — has timeout but no key to map under. + writeTemplate(t, dir, "template-no-runtime", ` +runtime_config: + provision_timeout_seconds: 600 +`) + // Zero/negative timeout — same as no declaration. + writeTemplate(t, dir, "template-zero", ` +runtime: zero-runtime +runtime_config: + provision_timeout_seconds: 0 +`) + // Malformed yaml — must not crash. + writeTemplate(t, dir, "template-bad", "not: valid: yaml: at: all:") + // Loose file at the top level (not a dir) — must be ignored. + if err := os.WriteFile(filepath.Join(dir, "stray.txt"), []byte("ignore me"), 0o644); err != nil { + t.Fatal(err) + } + got := loadRuntimeProvisionTimeouts(dir) + if len(got) != 0 { + t.Errorf("expected empty map for skip cases, got %v", got) + } +} + +func TestLoadRuntimeProvisionTimeouts_MissingDirReturnsEmpty(t *testing.T) { + got := loadRuntimeProvisionTimeouts("/nonexistent/path/should/not/exist/12345") + if len(got) != 0 { + t.Errorf("expected empty map on missing dir, got %v", got) + } +} + +func TestRuntimeProvisionTimeoutsCache_LazyInitAndCached(t *testing.T) { + dir := t.TempDir() + writeTemplate(t, dir, "template-hermes", ` +runtime: hermes +runtime_config: + provision_timeout_seconds: 720 +`) + c := runtimeProvisionTimeoutsCache{} + + // First call populates. + if got := c.get(dir, "hermes"); got != 720 { + t.Errorf("first call: got %d, want 720", got) + } + // Second call hits cache — even if the underlying file changed we + // still see the original value (sync.Once contract). + if err := os.WriteFile(filepath.Join(dir, "template-hermes", "config.yaml"), + []byte("runtime: hermes\nruntime_config:\n provision_timeout_seconds: 60\n"), 0o644); err != nil { + t.Fatal(err) + } + if got := c.get(dir, "hermes"); got != 720 { + t.Errorf("cached call: got %d, want 720 (cache must not re-read)", got) + } + // Unknown runtime returns zero — caller's signal to fall through to + // the canvas runtime profile default. + if got := c.get(dir, "unknown"); got != 0 { + t.Errorf("unknown runtime: got %d, want 0", got) + } +} diff --git a/workspace-server/internal/handlers/ssrf.go b/workspace-server/internal/handlers/ssrf.go index a84426f1..2e795e90 100644 --- a/workspace-server/internal/handlers/ssrf.go +++ b/workspace-server/internal/handlers/ssrf.go @@ -127,7 +127,16 @@ var testAllowLoopback = false // container deployments the relaxation is off and every private range // stays blocked. func isPrivateOrMetadataIP(ip net.IP) bool { - saas := saasMode() + // MOLECULE_ENV=development is the dev-host pattern: platform and + // workspace containers share a docker bridge network (172.18.0.0/16, + // RFC-1918). 
Treat that the same as SaaS for private-range relaxation + // — both share the "trusted intra-network routing" property. Without + // this, every workspace registration via docker-internal hostname + // resolves to 172.18.x.x and gets rejected as + // "workspace URL is not publicly routable", breaking the entire + // docker-compose dev loop. Always-blocked categories (metadata link- + // local, TEST-NET, CGNAT) remain blocked regardless. + saas := saasMode() || devModeAllowsLoopback() // IPv4 path. if ip4 := ip.To4(); ip4 != nil { diff --git a/workspace-server/internal/handlers/ssrf_test.go b/workspace-server/internal/handlers/ssrf_test.go index 37b2b358..8e246ebc 100644 --- a/workspace-server/internal/handlers/ssrf_test.go +++ b/workspace-server/internal/handlers/ssrf_test.go @@ -387,6 +387,37 @@ func TestIsSafeURL_SaaSMode_StillBlocksMetadataEtAl(t *testing.T) { } } +// TestIsSafeURL_DevMode_AllowsRFC1918 pins the dev-mode RFC-1918 + ULA +// relaxation that #2103 widened. The dev-host docker-compose pattern +// puts the platform + workspaces on the same docker bridge (172.18.0.0/16), +// so workspace registration via 172.18.x.x must NOT be rejected in dev. +// SaaS already allowed this; dev mode now matches via +// `saas := saasMode() || devModeAllowsLoopback()` in isPrivateOrMetadataIP. +// +// Without this test, a future refactor that quietly drops the +// `|| devModeAllowsLoopback()` from line 130 wouldn't trip any test — +// the existing `TestIsSafeURL_DevMode_StillBlocksOtherRanges` only +// pins the security floor (metadata / TEST-NET / CGNAT), not the +// behavior change. +func TestIsSafeURL_DevMode_AllowsRFC1918(t *testing.T) { + // Make sure saasMode() returns false so the test exercises the + // devModeAllowsLoopback() branch specifically — not a SaaS-mode pass. + t.Setenv("MOLECULE_DEPLOY_MODE", "self-hosted") + t.Setenv("MOLECULE_ORG_ID", "") + t.Setenv("MOLECULE_ENV", "development") + + for _, url := range []string{ + "http://10.1.2.3/agent", + "http://172.18.0.42:8000/a2a", // the docker-compose case from the issue + "http://192.168.1.100/agent", + "http://[fd00::1]/agent", // IPv6 ULA fd00::/8 also relaxed + } { + if err := isSafeURL(url); err != nil { + t.Errorf("isSafeURL(%q) in dev mode: got %v, want nil", url, err) + } + } +} + // TestIsSafeURL_StrictMode_BlocksRFC1918 is the strict-mode counterpart to // TestIsSafeURL_SaaSMode_AllowsRFC1918. In self-hosted / single-container // deployments there is no legitimate reason to reach RFC-1918 agents, so the diff --git a/workspace-server/internal/handlers/templates.go b/workspace-server/internal/handlers/templates.go index 38735830..e33c06d6 100644 --- a/workspace-server/internal/handlers/templates.go +++ b/workspace-server/internal/handlers/templates.go @@ -61,6 +61,13 @@ type templateSummary struct { RequiredEnv []string `json:"required_env,omitempty"` Skills []string `json:"skills"` SkillCount int `json:"skill_count"` + // ProvisionTimeoutSeconds lets a slow runtime declare its expected + // cold-boot duration in its template manifest. Canvas's + // ProvisioningTimeout banner respects this per-workspace via the + // `provision_timeout_ms` field in the workspace API response (#2054). + // 0 = template hasn't declared one, falls through to canvas's + // runtime-profile default. + ProvisionTimeoutSeconds int `json:"provision_timeout_seconds,omitempty"` } // resolveTemplateDir finds the template directory for a workspace on the host. 
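// ---- Editor's aside (illustrative sketch, not part of the patch) ----
// devModeAllowsLoopback(), referenced in the ssrf.go hunk above, is not shown
// in this diff. Judging from the MOLECULE_ENV=development comments and the env
// vars the new dev-mode test sets, its shape is roughly the following. This is
// a guess, not the authoritative helper in ssrf.go.
func devModeAllowsLoopbackSketch() bool {
    return strings.EqualFold(os.Getenv("MOLECULE_ENV"), "development")
}
// ----------------------------------------------------------------------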
@@ -80,24 +87,9 @@ func (h *TemplatesHandler) resolveTemplateDir(wsName string) string {
 // List handles GET /templates
 func (h *TemplatesHandler) List(c *gin.Context) {
-	entries, err := os.ReadDir(h.configsDir)
-	if err != nil {
-		c.JSON(http.StatusOK, []templateSummary{})
-		return
-	}
 	templates := make([]templateSummary, 0)
-	for _, entry := range entries {
-		if !entry.IsDir() {
-			continue
-		}
-
-		configPath := filepath.Join(h.configsDir, entry.Name(), "config.yaml")
-		data, err := os.ReadFile(configPath)
-		if err != nil {
-			continue
-		}
-
+	walkTemplateConfigs(h.configsDir, func(id string, data []byte) {
 		var raw struct {
 			Name        string `yaml:"name"`
 			Description string `yaml:"description"`
@@ -106,13 +98,14 @@ func (h *TemplatesHandler) List(c *gin.Context) {
 			Model       string   `yaml:"model"`
 			Skills      []string `yaml:"skills"`
 			RuntimeConfig struct {
-				Model       string      `yaml:"model"`
-				Models      []modelSpec `yaml:"models"`
-				RequiredEnv []string    `yaml:"required_env"`
+				Model                   string      `yaml:"model"`
+				Models                  []modelSpec `yaml:"models"`
+				RequiredEnv             []string    `yaml:"required_env"`
+				ProvisionTimeoutSeconds int         `yaml:"provision_timeout_seconds"`
 			} `yaml:"runtime_config"`
 		}
 		if err := yaml.Unmarshal(data, &raw); err != nil {
-			continue
+			return
 		}
 
 		// Model comes from either top-level (legacy) or runtime_config.model (current).
@@ -122,18 +115,19 @@ func (h *TemplatesHandler) List(c *gin.Context) {
 		}
 
 		templates = append(templates, templateSummary{
-			ID:          entry.Name(),
-			Name:        raw.Name,
-			Description: raw.Description,
-			Tier:        raw.Tier,
-			Runtime:     raw.Runtime,
-			Model:       model,
-			Models:      raw.RuntimeConfig.Models,
-			RequiredEnv: raw.RuntimeConfig.RequiredEnv,
-			Skills:      raw.Skills,
-			SkillCount:  len(raw.Skills),
+			ID:                      id,
+			Name:                    raw.Name,
+			Description:             raw.Description,
+			Tier:                    raw.Tier,
+			Runtime:                 raw.Runtime,
+			Model:                   model,
+			Models:                  raw.RuntimeConfig.Models,
+			RequiredEnv:             raw.RuntimeConfig.RequiredEnv,
+			Skills:                  raw.Skills,
+			SkillCount:              len(raw.Skills),
+			ProvisionTimeoutSeconds: raw.RuntimeConfig.ProvisionTimeoutSeconds,
 		})
-	}
+	})
 
 	c.JSON(http.StatusOK, templates)
 }
diff --git a/workspace-server/internal/handlers/tokens_sqlmock_test.go b/workspace-server/internal/handlers/tokens_sqlmock_test.go
new file mode 100644
index 00000000..b0166293
--- /dev/null
+++ b/workspace-server/internal/handlers/tokens_sqlmock_test.go
@@ -0,0 +1,331 @@
+package handlers
+
+// Sqlmock-backed coverage for tokens.go. Closes #1819.
+//
+// The existing tokens_test.go uses the real `db.DB` and t.Skip's when
+// the test DB isn't reachable — which is the default in CI, so the
+// file shows 0% coverage. This file substitutes the package-level
+// `db.DB` with a sqlmock instance so every code path (List, Create,
+// Revoke + their error branches) is exercised in `go test` without
+// any external dependency.
+//
+// What's covered:
+//   List   — happy path, empty rows, scan failure, query error
+//   Create — rate-limited, IssueToken DB error, happy path
+//   Revoke — happy path, not found, DB error
+//
+// What's NOT covered here (intentional):
+//   - Wsauth/middleware-level cross-tenant gating: those are exercised
+//     by middleware/wsauth_middleware_test.go. The handler-level code
+//     trusts WorkspaceAuth has already gated the request.
+//   - The full IssueToken path's correctness (random bytes,
+//     base64 encoding, prefix derivation): wsauth/tokens_test.go owns
+//     that. Here we only verify the handler hands off + reports
+//     errors correctly.
+ +import ( + "context" + "database/sql" + "database/sql/driver" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" + "github.com/gin-gonic/gin" +) + +func init() { gin.SetMode(gin.TestMode) } + +// withMockDB swaps `db.DB` for a sqlmock and returns the mock plus a +// restore func. Tests use this in place of setupTokenTestDB which +// skips on a missing real DB. +func withMockDB(t *testing.T) (sqlmock.Sqlmock, func()) { + t.Helper() + mock, m, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + prev := db.DB + db.DB = mock + cleanup := func() { + db.DB = prev + _ = mock.Close() + } + return m, cleanup +} + +// makeReq builds a recorder + Gin context with the given URL params, +// drives the handler, and returns the recorder. Centralised so each +// scenario is one-line setup + assertion. +func makeReq(t *testing.T, h gin.HandlerFunc, method, url string, params gin.Params) *httptest.ResponseRecorder { + t.Helper() + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest(method, url, nil) + c.Params = params + h(c) + return w +} + +// ---- List ------------------------------------------------------------ + +func TestTokenHandler_List_HappyPath(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + created := time.Date(2026, 4, 1, 12, 0, 0, 0, time.UTC) + last := created.Add(time.Hour) + mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at\s+FROM workspace_auth_tokens`). + WithArgs("ws-1", 50, 0). + WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "created_at", "last_used_at"}). + AddRow("tok-1", "abc12345", created, last). + AddRow("tok-2", "def67890", created, nil)) + + w := makeReq(t, NewTokenHandler().List, "GET", + "/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}}) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + var body struct { + Tokens []tokenListItem `json:"tokens"` + Count int `json:"count"` + } + if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil { + t.Fatalf("decode: %v", err) + } + if body.Count != 2 || len(body.Tokens) != 2 { + t.Fatalf("count=%d tokens=%d, want 2/2", body.Count, len(body.Tokens)) + } + if body.Tokens[0].ID != "tok-1" || body.Tokens[1].ID != "tok-2" { + t.Errorf("wrong order: %+v", body.Tokens) + } + if body.Tokens[1].LastUsed != nil { + t.Errorf("token-2 should have nil last_used_at, got %v", body.Tokens[1].LastUsed) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet: %v", err) + } +} + +func TestTokenHandler_List_EmptyResult(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at`). + WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "created_at", "last_used_at"})) + + w := makeReq(t, NewTokenHandler().List, "GET", + "/workspaces/ws-2/tokens", gin.Params{{Key: "id", Value: "ws-2"}}) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200 on empty list, got %d", w.Code) + } + var body struct { + Tokens []tokenListItem `json:"tokens"` + Count int `json:"count"` + } + _ = json.Unmarshal(w.Body.Bytes(), &body) + if body.Count != 0 || body.Tokens == nil { + // Tokens MUST be `[]` not `null` so callers iterating with + // `.length` or `for...of` don't NPE on JS. 
+ t.Errorf("empty: count=%d, tokens=%v (want 0 + non-nil)", body.Count, body.Tokens) + } +} + +func TestTokenHandler_List_QueryError(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at`). + WillReturnError(errors.New("connection refused")) + + w := makeReq(t, NewTokenHandler().List, "GET", + "/workspaces/ws-3/tokens", gin.Params{{Key: "id", Value: "ws-3"}}) + + if w.Code != http.StatusInternalServerError { + t.Errorf("query error must surface as 500, got %d", w.Code) + } +} + +func TestTokenHandler_List_RespectsLimit(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at`). + WithArgs("ws-1", 10, 5). + WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "created_at", "last_used_at"})) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/workspaces/ws-1/tokens?limit=10&offset=5", nil) + c.Params = gin.Params{{Key: "id", Value: "ws-1"}} + NewTokenHandler().List(c) + + if w.Code != http.StatusOK { + t.Errorf("limit/offset query: %d", w.Code) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("limit/offset args not bound correctly: %v", err) + } +} + +func TestTokenHandler_List_ScanError(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + // Inject a bad row that fails to Scan: pass non-time value where + // created_at expects time.Time. sqlmock surfaces this as a Scan err. + mock.ExpectQuery(`SELECT id, prefix, created_at, last_used_at`). + WillReturnRows(sqlmock.NewRows([]string{"id", "prefix", "created_at", "last_used_at"}). + AddRow("tok-1", "abc", "not-a-timestamp", nil)) + + w := makeReq(t, NewTokenHandler().List, "GET", + "/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}}) + + if w.Code != http.StatusInternalServerError { + t.Errorf("scan error must surface as 500, got %d: %s", w.Code, w.Body.String()) + } +} + +// ---- Create ---------------------------------------------------------- + +func TestTokenHandler_Create_RateLimited(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + // Count query returns 50 (== max) → 429. + mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`). + WithArgs("ws-1"). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(50)) + + w := makeReq(t, NewTokenHandler().Create, "POST", + "/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}}) + + if w.Code != http.StatusTooManyRequests { + t.Errorf("max active tokens should 429, got %d", w.Code) + } +} + +func TestTokenHandler_Create_IssueFails(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + // Count = 0 → fall through to IssueToken, which does an INSERT + // into workspace_auth_tokens. Mock the INSERT to fail; handler + // surfaces as 500. + mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + mock.ExpectExec(`INSERT INTO workspace_auth_tokens`). + WillReturnError(errors.New("disk full")) + + w := makeReq(t, NewTokenHandler().Create, "POST", + "/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}}) + + if w.Code != http.StatusInternalServerError { + t.Errorf("IssueToken DB error must 500, got %d", w.Code) + } +} + +func TestTokenHandler_Create_HappyPath(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectQuery(`SELECT COUNT\(\*\) FROM workspace_auth_tokens`). 
+ WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + mock.ExpectExec(`INSERT INTO workspace_auth_tokens`). + WillReturnResult(sqlmock.NewResult(1, 1)) + + w := makeReq(t, NewTokenHandler().Create, "POST", + "/workspaces/ws-1/tokens", gin.Params{{Key: "id", Value: "ws-1"}}) + + if w.Code != http.StatusCreated { + t.Fatalf("expected 201, got %d: %s", w.Code, w.Body.String()) + } + var body struct { + AuthToken string `json:"auth_token"` + WorkspaceID string `json:"workspace_id"` + } + if err := json.Unmarshal(w.Body.Bytes(), &body); err != nil { + t.Fatalf("decode: %v", err) + } + if body.AuthToken == "" { + t.Errorf("auth_token must be present and non-empty in response") + } + if body.WorkspaceID != "ws-1" { + t.Errorf("workspace_id mismatch: %q", body.WorkspaceID) + } +} + +// ---- Revoke ---------------------------------------------------------- + +func TestTokenHandler_Revoke_HappyPath(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectExec(`UPDATE workspace_auth_tokens\s+SET revoked_at = now\(\)`). + WithArgs("tok-1", "ws-1"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + w := makeReq(t, NewTokenHandler().Revoke, "DELETE", + "/workspaces/ws-1/tokens/tok-1", gin.Params{ + {Key: "id", Value: "ws-1"}, + {Key: "tokenId", Value: "tok-1"}, + }) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200 on revoke, got %d: %s", w.Code, w.Body.String()) + } +} + +func TestTokenHandler_Revoke_NotFound(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + // 0 rows affected → token not found OR already revoked. + mock.ExpectExec(`UPDATE workspace_auth_tokens`). + WithArgs("tok-ghost", "ws-1"). + WillReturnResult(sqlmock.NewResult(0, 0)) + + w := makeReq(t, NewTokenHandler().Revoke, "DELETE", + "/workspaces/ws-1/tokens/tok-ghost", gin.Params{ + {Key: "id", Value: "ws-1"}, + {Key: "tokenId", Value: "tok-ghost"}, + }) + + if w.Code != http.StatusNotFound { + t.Errorf("revoke missing token must 404, got %d", w.Code) + } +} + +func TestTokenHandler_Revoke_DBError(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + + mock.ExpectExec(`UPDATE workspace_auth_tokens`). + WillReturnError(errors.New("conn lost")) + + w := makeReq(t, NewTokenHandler().Revoke, "DELETE", + "/workspaces/ws-1/tokens/tok-1", gin.Params{ + {Key: "id", Value: "ws-1"}, + {Key: "tokenId", Value: "tok-1"}, + }) + + if w.Code != http.StatusInternalServerError { + t.Errorf("DB error must 500, got %d", w.Code) + } +} + +// Compile-time noise removal: the imports list pulls in the sql / +// driver packages and the silenced ctx so a future scenario that +// needs them doesn't have to re-add the import. Documented here so +// the apparent "unused import" dance isn't surprising. +var ( + _ context.Context = context.Background() + _ driver.Value = (sql.RawBytes)(nil) +) diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go index f5ef268e..5bd67e19 100644 --- a/workspace-server/internal/handlers/workspace.go +++ b/workspace-server/internal/handlers/workspace.go @@ -27,7 +27,13 @@ import ( ) type WorkspaceHandler struct { - broadcaster *events.Broadcaster + // broadcaster narrowed from `*events.Broadcaster` to the + // events.EventEmitter interface (#1814) so tests can substitute a + // capture-only stub without standing up the real Redis + WS-hub + // topology. 
Production callers still pass *events.Broadcaster, which + // satisfies the interface — see the compile-time assertion in + // internal/events/broadcaster.go. + broadcaster events.EventEmitter provisioner *provisioner.Provisioner cpProv *provisioner.CPProvisioner platformURL string @@ -41,9 +47,13 @@ type WorkspaceHandler struct { // calls made by HibernateWorkspace without requiring a running Docker daemon. // Always nil in production; the real provisioner path is used when nil. stopFnOverride func(ctx context.Context, workspaceID string) + // provisionTimeouts caches per-runtime provision-timeout values from + // template manifests (#2054 phase 2). Lazy-init on first scan; see + // runtime_provision_timeouts.go for the loader contract. + provisionTimeouts runtimeProvisionTimeoutsCache } -func NewWorkspaceHandler(b *events.Broadcaster, p *provisioner.Provisioner, platformURL, configsDir string) *WorkspaceHandler { +func NewWorkspaceHandler(b events.EventEmitter, p *provisioner.Provisioner, platformURL, configsDir string) *WorkspaceHandler { return &WorkspaceHandler{ broadcaster: b, provisioner: p, @@ -410,6 +420,17 @@ func (h *WorkspaceHandler) Create(c *gin.Context) { }) } +// addProvisionTimeoutMs decorates a workspace response map with the +// per-runtime provision-timeout override (#2054 phase 2) when one is +// declared in the runtime's template manifest. No-op when the runtime +// has no declared timeout — the canvas-side resolver falls through to +// its runtime-profile default. +func (h *WorkspaceHandler) addProvisionTimeoutMs(ws map[string]interface{}, runtime string) { + if secs := h.provisionTimeouts.get(h.configsDir, runtime); secs > 0 { + ws["provision_timeout_ms"] = secs * 1000 + } +} + // scanWorkspaceRow is a helper to scan workspace+layout rows into a clean JSON map. func scanWorkspaceRow(rows interface { Scan(dest ...interface{}) error @@ -508,6 +529,13 @@ func (h *WorkspaceHandler) List(c *gin.Context) { log.Printf("List scan error: %v", err) continue } + // #2054 phase 2: surface per-runtime provision-timeout for + // canvas's ProvisioningTimeout banner. Decorating per-row + // (vs map-once-and-reuse) keeps the helper self-contained; + // the cache hit is sub-microsecond. + if rt, _ := ws["runtime"].(string); rt != "" { + h.addProvisionTimeoutMs(ws, rt) + } workspaces = append(workspaces, ws) } if err := rows.Err(); err != nil { @@ -575,5 +603,11 @@ func (h *WorkspaceHandler) Get(c *gin.Context) { ws["last_outbound_at"] = nil } + // #2054 phase 2: per-runtime provision-timeout for canvas's + // ProvisioningTimeout banner. 
+ if rt, _ := ws["runtime"].(string); rt != "" { + h.addProvisionTimeoutMs(ws, rt) + } + c.JSON(http.StatusOK, ws) } diff --git a/workspace-server/internal/handlers/workspace_provision_test.go b/workspace-server/internal/handlers/workspace_provision_test.go index 65960071..5297fd15 100644 --- a/workspace-server/internal/handlers/workspace_provision_test.go +++ b/workspace-server/internal/handlers/workspace_provision_test.go @@ -9,7 +9,6 @@ import ( "testing" "github.com/DATA-DOG/go-sqlmock" - "github.com/Molecule-AI/molecule-monorepo/platform/internal/events" "github.com/Molecule-AI/molecule-monorepo/platform/internal/models" "github.com/Molecule-AI/molecule-monorepo/platform/internal/plugins" "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" @@ -1044,13 +1043,20 @@ var errInternalDB = fmt.Errorf("pq: connection refused") var errInternalOS = fmt.Errorf("operation failed: no such file or directory") // captureBroadcaster is a test broadcaster that captures the last data -// payload passed to RecordAndBroadcast so tests can inspect it. +// payload passed to RecordAndBroadcast so tests can inspect it. Now +// satisfies events.EventEmitter (#1814) directly — RecordAndBroadcast +// captures, BroadcastOnly is a no-op since none of the +// WorkspaceHandler paths under test call it. type captureBroadcaster struct { - events.Broadcaster // embed to satisfy the interface — only RecordAndBroadcast is overridden lastData map[string]interface{} lastErr error } +// BroadcastOnly is required to satisfy events.EventEmitter. None of the +// captureBroadcaster's exercising tests should land here — if a future +// test does, it'll need to add capture state for that channel. +func (c *captureBroadcaster) BroadcastOnly(_ string, _ string, _ interface{}) {} + func (c *captureBroadcaster) RecordAndBroadcast(_ context.Context, _, _ string, data interface{}) error { if m, ok := data.(map[string]interface{}); ok { // Shallow-copy so the caller can't mutate our capture. @@ -1107,14 +1113,14 @@ func containsUnsafeString(v interface{}) bool { // never leaks internal error details in WORKSPACE_PROVISION_FAILED broadcasts. // Regression test for issue #1206. func TestProvisionWorkspace_NoInternalErrorsInBroadcast(t *testing.T) { - t.Skip("TODO: captureBroadcaster type mismatch with WorkspaceHandler.broadcaster (*events.Broadcaster). Needs broadcaster interface refactor — currently blocking package compile on main (2026-04-21).") + t.Skip("TODO: write the test body. The interface blocker (#1814) is fixed — captureBroadcaster now satisfies events.EventEmitter and can be passed to NewWorkspaceHandler. The remaining work is constructing the provisionWorkspace failure path + asserting captured payload doesn't contain unsafeErrorStrings.") } // TestProvisionWorkspaceCP_NoInternalErrorsInBroadcast asserts that // provisionWorkspaceCP never leaks err.Error() in WORKSPACE_PROVISION_FAILED // broadcasts. Regression test for issue #1206. func TestProvisionWorkspaceCP_NoInternalErrorsInBroadcast(t *testing.T) { - t.Skip("TODO: captureBroadcaster type mismatch with WorkspaceHandler.broadcaster (*events.Broadcaster). Needs broadcaster interface refactor — currently blocking package compile on main (2026-04-21).") + t.Skip("TODO: write the test body. 
Same status as TestProvisionWorkspace_NoInternalErrorsInBroadcast — interface blocker fixed (#1814), need to construct the provisionWorkspaceCP failure path + assert payload sanitization.") } // mockEnvMutator is a provisionhook.Registry stub that always returns a fixed error. diff --git a/workspace-server/internal/middleware/wsauth_middleware_canvasorbearer_test.go b/workspace-server/internal/middleware/wsauth_middleware_canvasorbearer_test.go new file mode 100644 index 00000000..75ee87cb --- /dev/null +++ b/workspace-server/internal/middleware/wsauth_middleware_canvasorbearer_test.go @@ -0,0 +1,296 @@ +package middleware + +// Coverage tests for the CanvasOrBearer middleware and IsSameOriginCanvas +// per issue #1818. The existing wsauth_middleware_test.go covered the +// fail-open and Origin paths but missed the bearer-validate, admin-secret, +// same-origin-canvas, and IsSameOriginCanvas wrapper branches — leaving +// CanvasOrBearer at 50% and IsSameOriginCanvas at 0% line coverage. +// +// These tests target the gaps without re-asserting behavior already +// pinned by the existing suite. + +import ( + "crypto/sha256" + "net/http" + "net/http/httptest" + "testing" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/gin-gonic/gin" +) + +// ── CanvasOrBearer: bearer-token branches (#1818) ─────────────────────────── + +// TestCanvasOrBearer_ValidBearer_Passes exercises path 1 (the success +// branch of the bearer-token validation block). Without this test the +// only "OK" path covered was the Origin allowlist match — bearer +// success was never asserted. +func TestCanvasOrBearer_ValidBearer_Passes(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + defer mockDB.Close() + + bearer := "valid-bearer-for-canvas-or-bearer" + hash := sha256.Sum256([]byte(bearer)) + + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + mock.ExpectQuery(validateAnyTokenSelectQuery). + WithArgs(hash[:]). + WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("tok-1")) + mock.ExpectExec(validateTokenUpdateQuery). + WithArgs("tok-1"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + req.Header.Set("Authorization", "Bearer "+bearer) + r.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Errorf("valid bearer: got %d, want 200 (%s)", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock: %v", err) + } +} + +// TestCanvasOrBearer_InvalidBearer_Returns401 covers the rejection +// branch when a bearer is supplied but ValidateAnyToken fails. This +// is the auth-escape case the issue called out: an attacker with a +// revoked/expired token + matching Origin previously could bypass +// auth, so the code MUST reject on bad bearer instead of falling +// through to Origin. +func TestCanvasOrBearer_InvalidBearer_Returns401(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + defer mockDB.Close() + + bad := "expired-or-revoked-token" + hash := sha256.Sum256([]byte(bad)) + + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + mock.ExpectQuery(validateAnyTokenSelectQuery). + WithArgs(hash[:]). 
+ WillReturnRows(sqlmock.NewRows([]string{"id"})) // empty → ErrNoRows + + // Origin would otherwise grant access — assert it doesn't here. + t.Setenv("CORS_ORIGINS", "https://acme.moleculesai.app") + + handlerCalled := false + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + handlerCalled = true + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + req.Header.Set("Authorization", "Bearer "+bad) + req.Header.Set("Origin", "https://acme.moleculesai.app") + r.ServeHTTP(w, req) + + if w.Code != http.StatusUnauthorized { + t.Errorf("invalid bearer with matching origin: got %d, want 401", w.Code) + } + if handlerCalled { + t.Error("invalid bearer leaked to handler — Origin fallback bypassed bearer validation") + } +} + +// TestCanvasOrBearer_AdminTokenEnv_Passes exercises the ADMIN_TOKEN +// constant-time-compare branch. The env-secret short-circuit is what +// keeps the canvas dashboard usable when no DB-backed token rows are +// provisioned yet (Hyperion bootstrap path). +func TestCanvasOrBearer_AdminTokenEnv_Passes(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + defer mockDB.Close() + + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + // No ValidateAnyToken expectation — env match short-circuits before that. + + t.Setenv("ADMIN_TOKEN", "platform-admin-secret") + + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + req.Header.Set("Authorization", "Bearer platform-admin-secret") + r.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Errorf("admin env match: got %d, want 200 (%s)", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock: %v", err) + } +} + +// TestCanvasOrBearer_DBError_FailOpen pins the documented behavior on a +// HasAnyLiveTokenGlobal failure. The middleware logs and falls open so a +// flaky DB doesn't lock canvas users out of cosmetic routes. Hardcoded in +// the comment block; this is a reminder if anyone changes that semantic. +func TestCanvasOrBearer_DBError_FailOpen(t *testing.T) { + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + defer mockDB.Close() + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnError(http.ErrAbortHandler) // any non-nil error suffices + + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + r.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Errorf("DB error fail-open: got %d, want 200 (%s)", w.Code, w.Body.String()) + } +} + +// TestCanvasOrBearer_SameOriginCanvas_Passes exercises path 3: when no +// CORS Origin matches but the Referer/Host pair indicates the canvas is +// served from the same combined-tenant image. CANVAS_PROXY_URL gates +// this branch — without flipping canvasProxyActive the path is dead. 
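// ---- Editor's aside (illustrative sketch, not part of the patch) ----
// CanvasOrBearer's body is not part of this diff; the decision order implied
// by the tests in this file is roughly the sketch below. The real middleware
// first consults HasAnyLiveTokenGlobal and fails open on a DB error (pinned
// above); what a zero token count does is not pinned here, so it is omitted.
// validBearer and originOK stand in for the DB-backed ValidateAnyToken lookup
// and the CORS_ORIGINS allowlist check; both names are invented. Assumes
// crypto/subtle, net/http, os, strings, and gin are imported.
func canvasOrBearerSketch(validBearer func(string) bool, originOK func(*gin.Context) bool) gin.HandlerFunc {
    return func(c *gin.Context) {
        auth := c.GetHeader("Authorization")
        if strings.HasPrefix(auth, "Bearer ") {
            tok := strings.TrimPrefix(auth, "Bearer ")
            admin := os.Getenv("ADMIN_TOKEN")
            if (admin != "" && subtle.ConstantTimeCompare([]byte(tok), []byte(admin)) == 1) || validBearer(tok) {
                c.Next()
                return
            }
            // A bad bearer never falls through to the Origin checks.
            c.AbortWithStatusJSON(http.StatusUnauthorized, gin.H{"error": "unauthorized"})
            return
        }
        if originOK(c) || isSameOriginCanvas(c) {
            c.Next()
            return
        }
        c.AbortWithStatusJSON(http.StatusUnauthorized, gin.H{"error": "unauthorized"})
    }
}
// ----------------------------------------------------------------------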
+func TestCanvasOrBearer_SameOriginCanvas_Passes(t *testing.T) { + prev := canvasProxyActive + canvasProxyActive = true + defer func() { canvasProxyActive = prev }() + + mockDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock: %v", err) + } + defer mockDB.Close() + mock.ExpectQuery(hasAnyLiveTokenGlobalQuery). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + + r := gin.New() + r.PUT("/canvas/viewport", CanvasOrBearer(mockDB), func(c *gin.Context) { + c.JSON(http.StatusOK, gin.H{"ok": true}) + }) + + w := httptest.NewRecorder() + req, _ := http.NewRequest(http.MethodPut, "/canvas/viewport", nil) + req.Host = "tenant.moleculesai.app" + req.Header.Set("Referer", "https://tenant.moleculesai.app/dashboard") + r.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Errorf("same-origin canvas: got %d, want 200 (%s)", w.Code, w.Body.String()) + } +} + +// ── IsSameOriginCanvas: exported wrapper + branch coverage (#1818) ────────── + +// TestIsSameOriginCanvas_ExportedWrapper_DelegatesToInternal — IsSame... +// is at 0% per the audit because nothing in the test suite calls the +// exported variant. It's a one-line delegation but the auth-boundary +// surface must stay testable from outside the package. +func TestIsSameOriginCanvas_ExportedWrapper_DelegatesToInternal(t *testing.T) { + prev := canvasProxyActive + canvasProxyActive = true + defer func() { canvasProxyActive = prev }() + + c, _ := gin.CreateTestContext(httptest.NewRecorder()) + c.Request = httptest.NewRequest(http.MethodGet, "/x", nil) + c.Request.Host = "tenant.moleculesai.app" + c.Request.Header.Set("Referer", "https://tenant.moleculesai.app/x") + + if !IsSameOriginCanvas(c) { + t.Error("exported IsSameOriginCanvas should accept matching Referer") + } +} + +// TestIsSameOriginCanvas_DisabledByEnv — when CANVAS_PROXY_URL was unset +// at boot, canvasProxyActive is false and the wrapper must return false +// even on a perfect Referer match. Self-hosted / dev installs rely on +// this short-circuit. +func TestIsSameOriginCanvas_DisabledByEnv(t *testing.T) { + prev := canvasProxyActive + canvasProxyActive = false + defer func() { canvasProxyActive = prev }() + + c, _ := gin.CreateTestContext(httptest.NewRecorder()) + c.Request = httptest.NewRequest(http.MethodGet, "/x", nil) + c.Request.Host = "tenant.moleculesai.app" + c.Request.Header.Set("Referer", "https://tenant.moleculesai.app/x") + + if IsSameOriginCanvas(c) { + t.Error("CANVAS_PROXY_URL unset → IsSameOriginCanvas must return false") + } +} + +// TestIsSameOriginCanvas_BranchCoverage exercises every Referer/Origin +// combination isSameOriginCanvas accepts or rejects. Table-driven so +// adding a new edge case is one row. 
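// ---- Editor's aside (illustrative sketch, not part of the patch) ----
// isSameOriginCanvas itself is not in this diff. From the table below, its
// core check is that the Referer (or, for WebSocket upgrades, the Origin)
// parses to exactly the request's Host; the real implementation also consults
// canvasProxyActive first. Sketch only, with an invented name.
func sameOriginCanvasSketch(reqHost, referer, origin string) bool {
    if reqHost == "" {
        return false
    }
    src := referer
    if src == "" {
        src = origin // WS upgrades send Origin but usually no Referer
    }
    if src == "" {
        return false
    }
    u, err := url.Parse(src)
    if err != nil {
        return false
    }
    // Exact host equality: a prefix/suffix check would accept
    // h.example.com.evil.com, which the table below rejects.
    return u.Host == reqHost
}
// ----------------------------------------------------------------------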
+func TestIsSameOriginCanvas_BranchCoverage(t *testing.T) { + prev := canvasProxyActive + canvasProxyActive = true + defer func() { canvasProxyActive = prev }() + + cases := []struct { + name string + host string + referer string + origin string + want bool + }{ + // Referer accepts: + {"https referer matches host with path", "h.example.com", "https://h.example.com/dash", "", true}, + {"http referer matches host with path", "h.example.com", "http://h.example.com/dash", "", true}, + {"https referer matches host root no path", "h.example.com", "https://h.example.com", "", true}, + {"http referer matches host root no path", "h.example.com", "http://h.example.com", "", true}, + + // Origin fallback (no Referer, used by WS upgrade): + {"https origin only (no referer)", "h.example.com", "", "https://h.example.com", true}, + {"http origin only (no referer)", "h.example.com", "", "http://h.example.com", true}, + + // Reject paths — the security-critical ones: + {"empty host short-circuits", "", "https://h.example.com/", "", false}, + {"referer host suffix attack — h.example.com.evil.com", "h.example.com", "https://h.example.com.evil.com/", "", false}, + {"referer different host", "h.example.com", "https://other.example.com/", "", false}, + {"origin different host", "h.example.com", "", "https://other.example.com", false}, + {"no referer no origin", "h.example.com", "", "", false}, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + ctx, _ := gin.CreateTestContext(httptest.NewRecorder()) + ctx.Request = httptest.NewRequest(http.MethodGet, "/x", nil) + ctx.Request.Host = c.host + if c.referer != "" { + ctx.Request.Header.Set("Referer", c.referer) + } + if c.origin != "" { + ctx.Request.Header.Set("Origin", c.origin) + } + if got := isSameOriginCanvas(ctx); got != c.want { + t.Errorf("isSameOriginCanvas(host=%q, referer=%q, origin=%q) = %v, want %v", + c.host, c.referer, c.origin, got, c.want) + } + }) + } +} diff --git a/workspace-server/internal/provisioner/backend_contract_test.go b/workspace-server/internal/provisioner/backend_contract_test.go index 0c31daf0..a5948f54 100644 --- a/workspace-server/internal/provisioner/backend_contract_test.go +++ b/workspace-server/internal/provisioner/backend_contract_test.go @@ -38,6 +38,7 @@ package provisioner import ( "context" + "errors" "testing" ) @@ -138,23 +139,90 @@ func runBackendContract(t *testing.T, backend Backend) { } // TestDockerBackend_Contract feeds the Docker backend through the -// shared runner. Skipped pending hardening: the scaffold exposed a -// real bug — neither backend's Stop/IsRunning handles a nil underlying -// client gracefully (both panic). Filing that as a separate issue; -// once both backends return (*, error) instead of panicking, flip this -// to t.Run and the contract scenarios exercise the fix. +// shared runner. Was Skip'd pending hardening of Provisioner.{Stop, +// IsRunning} against nil Docker client — that hardening landed in +// fix/provisioner-nil-guards-1813, so the scenarios run now. A +// zero-valued *Provisioner returns ErrNoBackend from each method +// instead of panicking, satisfying the no-panic contract. func TestDockerBackend_Contract(t *testing.T) { - t.Skip("scaffolding only — unblock by hardening Provisioner.{Stop,IsRunning} against nil Docker client; see docs/architecture/backends.md drift risk #6") var p Provisioner runBackendContract(t, &p) } // TestCPProvisionerBackend_Contract — same story as the Docker variant. 
-// CPProvisioner panics on nil httpClient today; once that's hardened, -// remove the Skip and this runner exercises both backends through a -// single contract. +// Hardening landed in fix/provisioner-nil-guards-1813; zero-valued +// *CPProvisioner returns ErrNoBackend from Stop/IsRunning instead of +// panicking on nil httpClient. func TestCPProvisionerBackend_Contract(t *testing.T) { - t.Skip("scaffolding only — unblock by hardening CPProvisioner.{Stop,IsRunning} against nil httpClient; see docs/architecture/backends.md drift risk #6") var p CPProvisioner runBackendContract(t, &p) } + +// TestZeroValuedBackends_NoPanic pins the contract that motivated #1813: +// a Backend with no underlying client wired up must return cleanly, +// never panic. The orphan sweeper and shutdown paths both call these +// methods speculatively where the receiver could be unset. +// +// The exact error shape varies between backends: +// - Docker Provisioner has no DB-lookup layer; zero-valued always +// returns ErrNoBackend. +// - CPProvisioner threads through a package-level resolveInstanceID +// lookup; when the DB has no row for the workspace (or db.DB +// itself is nil), instance_id comes back empty and the method +// short-circuits to (false, nil). Only when there's a real +// instance_id to query does the missing-httpClient case surface +// as ErrNoBackend. +// +// Both shapes are acceptable — the contract is "no panic, error is +// nil or ErrNoBackend." Anything else is a bug. +func TestZeroValuedBackends_NoPanic(t *testing.T) { + acceptableErr := func(err error) bool { + return err == nil || errors.Is(err, ErrNoBackend) + } + t.Run("Provisioner.Stop", func(t *testing.T) { + var p Provisioner + if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) { + t.Errorf("zero-valued Provisioner.Stop: got %v, want ErrNoBackend", err) + } + }) + t.Run("Provisioner.IsRunning", func(t *testing.T) { + var p Provisioner + running, err := p.IsRunning(context.Background(), "any") + if !errors.Is(err, ErrNoBackend) { + t.Errorf("zero-valued Provisioner.IsRunning: got err=%v, want ErrNoBackend", err) + } + if running { + t.Errorf("zero-valued Provisioner.IsRunning: got running=true, want false") + } + }) + t.Run("CPProvisioner.Stop", func(t *testing.T) { + var p CPProvisioner + if err := p.Stop(context.Background(), "any"); !acceptableErr(err) { + t.Errorf("zero-valued CPProvisioner.Stop: got %v, want nil or ErrNoBackend", err) + } + }) + t.Run("CPProvisioner.IsRunning", func(t *testing.T) { + var p CPProvisioner + _, err := p.IsRunning(context.Background(), "any") + if !acceptableErr(err) { + t.Errorf("zero-valued CPProvisioner.IsRunning: got err=%v, want nil or ErrNoBackend", err) + } + }) + // Nil receivers must also be safe. The orphan sweeper code path can + // hit Stop on a *Provisioner that's nil (not just zero-valued) when + // the wiring path hasn't initialized at all. For nil receivers we + // always expect ErrNoBackend (the nil-check at the top fires before + // any DB lookup could absorb the case). 
+ t.Run("nil_Provisioner.Stop", func(t *testing.T) { + var p *Provisioner + if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) { + t.Errorf("nil *Provisioner.Stop: got %v, want ErrNoBackend", err) + } + }) + t.Run("nil_CPProvisioner.Stop", func(t *testing.T) { + var p *CPProvisioner + if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) { + t.Errorf("nil *CPProvisioner.Stop: got %v, want ErrNoBackend", err) + } + }) +} diff --git a/workspace-server/internal/provisioner/cp_provisioner.go b/workspace-server/internal/provisioner/cp_provisioner.go index 6f6ae58d..e9cfc8c9 100644 --- a/workspace-server/internal/provisioner/cp_provisioner.go +++ b/workspace-server/internal/provisioner/cp_provisioner.go @@ -194,6 +194,9 @@ func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, // blocking the next provision with InvalidGroup.Duplicate — a full // "Save & Restart" crash on SaaS. func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error { + if p == nil { + return ErrNoBackend + } instanceID, err := resolveInstanceID(ctx, workspaceID) if err != nil { return fmt.Errorf("cp provisioner: stop: resolve instance_id: %w", err) @@ -201,9 +204,17 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error { if instanceID == "" { // No instance was ever provisioned (or already deprovisioned and // the column was cleared). Nothing to terminate — idempotent. + // Reached even when httpClient is nil since the empty-instance + // path doesn't need HTTP — symmetric with IsRunning. log.Printf("CP provisioner: Stop for %s — no instance_id on file, nothing to do", workspaceID) return nil } + if p.httpClient == nil { + // HTTP wiring missing but we have an instance_id to terminate — + // can't make the DELETE call. Report ErrNoBackend so the + // orphan sweeper / shutdown path can branch. + return ErrNoBackend + } url := fmt.Sprintf("%s/cp/workspaces/%s?instance_id=%s", p.baseURL, workspaceID, instanceID) req, _ := http.NewRequestWithContext(ctx, "DELETE", url, nil) p.provisionAuthHeaders(req) @@ -269,6 +280,9 @@ var resolveInstanceID = func(ctx context.Context, workspaceID string) (string, e // Both callers are happy with (true, err); callers that need the // previous (false, err) shape must inspect err themselves. func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) { + if p == nil { + return false, ErrNoBackend + } instanceID, err := resolveInstanceID(ctx, workspaceID) if err != nil { // Treat DB errors the same as transport errors — (true, err) keeps @@ -277,9 +291,20 @@ func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool } if instanceID == "" { // No instance recorded. Report "not running" cleanly (no error) - // so restart cascades can trigger a fresh provision. + // so restart cascades can trigger a fresh provision. This path + // is reached even on a zero-valued provisioner (no httpClient + // wired) — that's intentional; the resolveInstanceID lookup + // goes through the package-level db var, not p.httpClient, so + // a no-instance workspace gets a clean answer regardless of + // HTTP wiring state. return false, nil } + if p.httpClient == nil { + // HTTP wiring missing but we have an instance_id to query — + // can't proceed without a client. Report ErrNoBackend so the + // caller can branch (a2a_proxy keeps alive, healthsweep skips). 
+ return false, ErrNoBackend + } url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, instanceID) req, _ := http.NewRequestWithContext(ctx, "GET", url, nil) p.provisionAuthHeaders(req) diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index ac04b15f..a3cd37d2 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -5,6 +5,7 @@ import ( "archive/tar" "bytes" "context" + "errors" "fmt" "io" "log" @@ -24,6 +25,15 @@ import ( ocispec "github.com/opencontainers/image-spec/specs-go/v1" ) +// ErrNoBackend is returned by lifecycle methods (Stop, IsRunning) when +// the receiver is zero-valued — i.e. no Docker daemon connection has +// been wired up. The orphan sweeper and the contract-test scaffolding +// both speculatively call these methods on a possibly-nil receiver; +// returning a typed error rather than panicking lets callers reason +// about the case explicitly. See docs/architecture/backends.md +// drift-risk #6. +var ErrNoBackend = errors.New("provisioner: no backend configured (zero-valued receiver)") + // RuntimeImages maps runtime names to their Docker image refs on GHCR. // Each standalone template repo publishes its image via the reusable // publish-template-image workflow in molecule-ci on every main merge. @@ -823,6 +833,9 @@ func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) erro // respawn the container before ContainerRemove runs, leaving a zombie // that re-registers via heartbeat after deletion. func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error { + if p == nil || p.cli == nil { + return ErrNoBackend + } name := ContainerName(workspaceID) // Force-remove kills and removes in one atomic operation, bypassing @@ -856,6 +869,9 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error { // so a real crash still surfaces via exec heartbeat or TTL, both of which // have narrower false-positive windows than daemon-inspect RPC. func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) { + if p == nil || p.cli == nil { + return false, ErrNoBackend + } name := ContainerName(workspaceID) info, err := p.cli.ContainerInspect(ctx, name) if err != nil { @@ -1066,6 +1082,13 @@ func pullImageAndDrain(ctx context.Context, cli dockerImageClient, ref, platform // // Tracked in issue #1875; remove this fallback once the template repos // publish multi-arch manifests. +// DefaultImagePlatform is the exported alias used by the admin +// workspace-images handler so its ImagePull picks the same platform as +// the provisioner's. Avoids duplicating the Apple-Silicon-needs-amd64 +// logic and keeps both call sites in sync if Docker manifest support +// changes (e.g., when the templates start shipping multi-arch). 
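Stepping back to the IsRunning change above: because resolveInstanceID is a package-level var, a same-package test can swap it out to drive the new httpClient-nil branch without a real DB. The following is a sketch only — the test name, fake instance id, and workspace id are made up, and it assumes the usual file-level imports (context, errors, testing) are present.

```go
// Sketch: force resolveInstanceID to report a stored instance_id while the
// zero-valued CPProvisioner has no httpClient, so IsRunning must surface
// ErrNoBackend rather than panic or answer (false, nil).
func TestCPProvisionerIsRunning_NilHTTPClientWithInstance(t *testing.T) {
	orig := resolveInstanceID
	resolveInstanceID = func(ctx context.Context, workspaceID string) (string, error) {
		return "i-0123456789abcdef0", nil // pretend the DB has a row for this workspace
	}
	defer func() { resolveInstanceID = orig }()

	var p CPProvisioner // zero-valued: baseURL and httpClient unset
	running, err := p.IsRunning(context.Background(), "ws-example")
	if !errors.Is(err, ErrNoBackend) {
		t.Fatalf("IsRunning: got err=%v, want ErrNoBackend", err)
	}
	if running {
		t.Fatalf("IsRunning: got running=true, want false")
	}
}
```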
+func DefaultImagePlatform() string { return defaultImagePlatform() } + func defaultImagePlatform() string { if v, ok := os.LookupEnv("MOLECULE_IMAGE_PLATFORM"); ok { return v diff --git a/workspace-server/internal/router/router.go b/workspace-server/internal/router/router.go index 6242067a..b337ef32 100644 --- a/workspace-server/internal/router/router.go +++ b/workspace-server/internal/router/router.go @@ -402,6 +402,17 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi r.POST("/admin/a2a-queue/drop-stale", middleware.AdminAuth(db.DB), qH.DropStale) } + // Admin — workspace template image refresh. Pulls latest images from GHCR + // and recreates running ws-* containers so they adopt the new image. + // Final step of the runtime CD chain — see docs/workspace-runtime-package.md. + // Operators (or post-publish automation) hit this after a runtime release. + // Reuses the provisioner's Docker client; no-op when prov is nil + // (test / non-Docker deploy). + if prov != nil { + imgH := handlers.NewAdminWorkspaceImagesHandler(prov.DockerClient()) + r.POST("/admin/workspace-images/refresh", middleware.AdminAuth(db.DB), imgH.Refresh) + } + // Admin — test token minting (issue #6). Hidden in production via TestTokensEnabled(). // NOT behind AdminAuth — this is the bootstrap endpoint E2E tests and // fresh installs use to obtain their first admin bearer. Adding AdminAuth diff --git a/workspace/.coveragerc b/workspace/.coveragerc new file mode 100644 index 00000000..b14f2f88 --- /dev/null +++ b/workspace/.coveragerc @@ -0,0 +1,13 @@ +# coverage.py config — consumed by `pytest --cov` via the pytest-cov +# plugin. Lives here (not in pytest.ini) because coverage.py only reads +# .coveragerc / setup.cfg / tox.ini / pyproject.toml — the [coverage:*] +# sections in pytest.ini are silently ignored. See issue #1817. +[run] +omit = + */tests/* + */__init__.py + plugins_registry/* + +[report] +# Skip files at 100% in the term-missing output to keep CI logs readable. +skip_covered = True diff --git a/workspace/a2a_executor.py b/workspace/a2a_executor.py index b550a350..69c56398 100644 --- a/workspace/a2a_executor.py +++ b/workspace/a2a_executor.py @@ -422,7 +422,14 @@ class LangGraphA2AExecutor(AgentExecutor): try: msg.metadata = {"tool_trace": tool_trace} except (AttributeError, TypeError): - pass + # `new_agent_text_message()` returns a plain string in + # MagicMock paths in tests, where assignment to + # .metadata raises despite hasattr being true (the + # mock has the attribute as a property). Suppression + # is intentional — production Message objects always + # accept the assignment. See #1787 + commit dcbcf19 + # for the original test-mock motivation. + logger.debug("metadata attach skipped (non-Message return from new_agent_text_message)") await event_queue.enqueue_event(msg) _result = final_text diff --git a/workspace/pytest.ini b/workspace/pytest.ini index 50e89f85..22fee4eb 100644 --- a/workspace/pytest.ini +++ b/workspace/pytest.ini @@ -3,4 +3,24 @@ testpaths = tests python_files = test_*.py python_functions = test_* asyncio_mode = auto -addopts = -q +# Coverage config moved here from .github/workflows/ci.yml so local +# `pytest` matches CI without operator-typed flags. cov-fail-under +# pins the floor at 86% — 5pp below the 91.11% measured on staging +# (run 24957664272, 2026-04-26). Floor exists so a regression that +# drops coverage doesn't sneak past CI; tightening above 86% should +# follow real measurement, not aspiration. See issue #1817. 
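The admin /admin/workspace-images/refresh handler registered in the router wiring above is not part of this excerpt, so the following is only a rough sketch of the shape that wiring implies: walk RuntimeImages and pull each ref on the same platform the provisioner would pick via DefaultImagePlatform(). The puller interface, function name, and error handling are assumptions; imports and the provisioner import path are elided.

```go
// imagePuller is an assumed minimal seam over the Docker client the
// handler receives from prov.DockerClient().
type imagePuller interface {
	Pull(ctx context.Context, ref, platform string) error
}

// refreshWorkspaceImages sketches the refresh loop: every runtime image
// is re-pulled on the provisioner's platform so both call sites stay in
// sync (see DefaultImagePlatform above).
func refreshWorkspaceImages(ctx context.Context, puller imagePuller) error {
	platform := provisioner.DefaultImagePlatform()
	for runtime, ref := range provisioner.RuntimeImages {
		if err := puller.Pull(ctx, ref, platform); err != nil {
			return fmt.Errorf("refresh %s (%s): %w", runtime, ref, err)
		}
	}
	return nil
}
```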
+# +# Why 86 not 92: the earlier 97% measurement was without the +# .coveragerc omit list. Once `*/__init__.py`, `*/tests/*`, and +# `plugins_registry/*` are excluded (the issue's prescribed omit +# set, more meaningful since those files don't carry behavior), +# the actual measurement of behavior-bearing code is 91.11% — and +# 86% sits at the issue's prescribed `current - 5pp` margin. +addopts = + -q + --cov=. + --cov-report=term-missing + --cov-fail-under=86 +# Coverage omit / report config lives in workspace/.coveragerc — coverage.py +# only reads .coveragerc / setup.cfg / tox.ini / pyproject.toml, NOT +# pytest.ini, so [coverage:*] sections here would be silently ignored. diff --git a/workspace/scripts/molecule-git-token-helper.sh b/workspace/scripts/molecule-git-token-helper.sh index 0faab0fc..125d5109 100755 --- a/workspace/scripts/molecule-git-token-helper.sh +++ b/workspace/scripts/molecule-git-token-helper.sh @@ -110,18 +110,45 @@ _read_cache() { } # _write_cache — atomically persist token + expiry. +# +# Hardened per #1552: +# - umask 077 around the writes so .tmp files are 600 from creation, +# closing the TOCTOU window where a concurrent reader could read +# the token while it was still mode 644 (between the create-with- +# default-umask and the later chmod 600). +# - Don't swallow chmod errors with `|| true`. A chmod failure leaves +# tokens potentially world-readable; surface it as a WARN line so +# ops can grep `[molecule-git-token-helper] WARN` and see real +# permission failures instead of silent 644 files. _write_cache() { local token="$1" mkdir -p "${CACHE_DIR}" - chmod 700 "${CACHE_DIR}" 2>/dev/null || true + if ! chmod 700 "${CACHE_DIR}" 2>/dev/null; then + echo "[molecule-git-token-helper] WARN: failed to chmod 700 ${CACHE_DIR} — cache dir may be world-readable" >&2 + fi now=$(_now_epoch) expiry=$((now + TOKEN_CACHE_TTL_SEC)) + + # Restrictive umask so the .tmp files are 600 from creation. Restored + # before return so callers' umask isn't perturbed. + local prev_umask + prev_umask=$(umask) + umask 077 + # Write atomically via tmp + mv to avoid partial reads. printf '%s' "${token}" > "${CACHE_TOKEN_FILE}.tmp" printf '%s' "${expiry}" > "${CACHE_EXPIRY_FILE}.tmp" mv -f "${CACHE_TOKEN_FILE}.tmp" "${CACHE_TOKEN_FILE}" mv -f "${CACHE_EXPIRY_FILE}.tmp" "${CACHE_EXPIRY_FILE}" - chmod 600 "${CACHE_TOKEN_FILE}" "${CACHE_EXPIRY_FILE}" 2>/dev/null || true + + umask "${prev_umask}" + + # Belt-and-suspenders chmod — umask 077 should make the files 600 + # already, but a chmod that fails on the post-rename file is itself + # a real signal worth surfacing. + if ! chmod 600 "${CACHE_TOKEN_FILE}" "${CACHE_EXPIRY_FILE}" 2>/dev/null; then + echo "[molecule-git-token-helper] WARN: chmod 600 failed on cache files — token may be world-readable" >&2 + fi } # _fetch_token_from_api — hit the platform endpoint. @@ -230,10 +257,21 @@ case "${ACTION}" in echo "[molecule-git-token-helper] _refresh_gh: gh auth login failed (non-fatal)" >&2 } # Also update GH_TOKEN file for scripts that source it. + # Same #1552 hardening as _write_cache — umask 077 around the + # write so the .tmp file is 600 from creation, and surface a + # WARN on chmod failure instead of swallowing it. gh_token_file="${HOME}/.gh_token" + # `local` is illegal here (top-level case branch, not a + # function); shadow with a uniquely-named global instead. 
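For comparison with the shell hardening above, the same idea — files created 0600 from the start, then renamed into place so readers never see a partial or world-readable token — looks like this as a Go sketch. The helper, directory layout, and file names are illustrative only (not part of this PR); imports assumed are os, path/filepath, and strconv.

```go
// writeTokenCache mirrors _write_cache's tmp-write + rename pattern:
// os.WriteFile creates the temp file with 0600 (never broader, regardless
// of umask), and os.Rename swaps it in atomically.
func writeTokenCache(dir, token string, expiryEpoch int64) error {
	if err := os.MkdirAll(dir, 0o700); err != nil {
		return err
	}
	write := func(name, contents string) error {
		tmp := filepath.Join(dir, name+".tmp")
		if err := os.WriteFile(tmp, []byte(contents), 0o600); err != nil {
			return err
		}
		return os.Rename(tmp, filepath.Join(dir, name))
	}
	if err := write("token", token); err != nil {
		return err
	}
	return write("expiry", strconv.FormatInt(expiryEpoch, 10))
}
```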
+ _gh_prev_umask=$(umask) + umask 077 printf '%s' "${api_token}" > "${gh_token_file}.tmp" mv -f "${gh_token_file}.tmp" "${gh_token_file}" - chmod 600 "${gh_token_file}" 2>/dev/null || true + umask "${_gh_prev_umask}" + unset _gh_prev_umask + if ! chmod 600 "${gh_token_file}" 2>/dev/null; then + echo "[molecule-git-token-helper] WARN: chmod 600 failed on ${gh_token_file} — token may be world-readable" >&2 + fi echo "[molecule-git-token-helper] _refresh_gh: token refreshed successfully" >&2 ;; _invalidate_cache) diff --git a/workspace/tests/test_config.py b/workspace/tests/test_config.py index 786bc8b7..2a805b3f 100644 --- a/workspace/tests/test_config.py +++ b/workspace/tests/test_config.py @@ -2,10 +2,12 @@ import os +import pytest import yaml from config import ( A2AConfig, + ComplianceConfig, DelegationConfig, SandboxConfig, WorkspaceConfig, @@ -244,3 +246,46 @@ def test_shared_context_from_yaml(tmp_path): cfg = load_config(str(tmp_path)) assert cfg.shared_context == ["guidelines.md", "architecture.md"] + + +# ===== Compliance default lock (#2059) ===== +# +# PR #2056 flipped ComplianceConfig.mode default from "" to "owasp_agentic" +# so every shipped template gets prompt-injection detection + PII redaction +# by default. These tests pin the new default at all four entry points so +# a silent revert (or a refactor that reintroduces the old no-op default) +# fails fast instead of shipping a workspace with compliance silently off. + + +def test_compliance_dataclass_default(): + """ComplianceConfig() — no args — must default to owasp_agentic + detect.""" + cfg = ComplianceConfig() + assert cfg.mode == "owasp_agentic" + assert cfg.prompt_injection == "detect" + + +@pytest.mark.parametrize( + "yaml_payload, expected_mode", + [ + # No `compliance:` key at all — full default path. + ({}, "owasp_agentic"), + # Explicit empty block — exercises load_config's + # `.get("mode", "owasp_agentic")` default-fill at config.py:377. + # Common shape during template editing. + ({"compliance": {}}, "owasp_agentic"), + # Documented opt-out: explicit `mode: ""` disables compliance. + ({"compliance": {"mode": ""}}, ""), + ], + ids=["yaml_omits_block", "yaml_block_empty", "yaml_explicit_optout"], +) +def test_compliance_default_via_load_config(tmp_path, yaml_payload, expected_mode): + """load_config honors the owasp_agentic default at every yaml shape and + still respects explicit opt-out.""" + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text(yaml.dump(yaml_payload)) + + cfg = load_config(str(tmp_path)) + assert cfg.compliance.mode == expected_mode + # prompt_injection was never overridden in any payload — must stay at + # the dataclass default regardless of the mode value. + assert cfg.compliance.prompt_injection == "detect"