diff --git a/.github/workflows/canary-verify.yml b/.github/workflows/canary-verify.yml new file mode 100644 index 00000000..16d06a70 --- /dev/null +++ b/.github/workflows/canary-verify.yml @@ -0,0 +1,113 @@ +name: canary-verify + +# Runs the canary smoke suite against the staging canary tenant fleet +# after a new :staging- image lands in GHCR. On green, promotes +# :staging- → :latest so the prod tenant fleet's 5-minute +# auto-updater picks up the verified digest. On red, :latest stays +# on the prior known-good digest and prod is untouched. +# +# Dependencies: +# - publish-workspace-server-image.yml publishes :staging- +# (NOT :latest) on main merge +# - canary tenants are configured to pull :staging- as their +# tenant image (set TENANT_IMAGE=ghcr.io/…:staging- on the +# canary provisioner code path OR rotate via an admin endpoint) +# - Repo secrets CANARY_TENANT_URLS / CANARY_ADMIN_TOKENS / +# CANARY_CP_SHARED_SECRET are populated + +on: + workflow_run: + workflows: ["publish-workspace-server-image"] + types: [completed] + workflow_dispatch: + +permissions: + contents: read + packages: write + actions: read + +env: + IMAGE_NAME: ghcr.io/molecule-ai/platform + TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant + +jobs: + canary-smoke: + # Skip when the upstream workflow failed — no image to test against. + if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }} + runs-on: ubuntu-latest + outputs: + sha: ${{ steps.compute.outputs.sha }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Compute sha + id: compute + run: echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT" + + - name: Wait for canary tenants to pick up :staging- + # Tenant auto-updater runs every 5 min. Sleep 6 min to give every + # canary time to pull + restart. Cheaper than polling. 
+ run: sleep 360 + + - name: Run canary smoke suite + env: + CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }} + CANARY_ADMIN_TOKENS: ${{ secrets.CANARY_ADMIN_TOKENS }} + CANARY_CP_BASE_URL: https://staging-api.moleculesai.app + CANARY_CP_SHARED_SECRET: ${{ secrets.CANARY_CP_SHARED_SECRET }} + run: bash scripts/canary-smoke.sh + + - name: Summary on failure + if: ${{ failure() }} + run: | + { + echo "## Canary smoke FAILED" + echo + echo "Canary tenants rejected image \`staging-${{ steps.compute.outputs.sha }}\`." + echo ":latest stays pinned to the prior good digest — prod is untouched." + echo + echo "Fix forward and merge again, or investigate the specific failed" + echo "assertions in the canary-smoke step log above." + } >> "$GITHUB_STEP_SUMMARY" + + promote-to-latest: + # On green, retag :staging- → :latest for BOTH images. + # crane is a lightweight registry client (no Docker daemon needed on + # the runner) that can retag remotely with a single API call each. + needs: canary-smoke + if: ${{ needs.canary-smoke.result == 'success' }} + runs-on: ubuntu-latest + steps: + - name: Install crane + run: | + curl -fsSL https://github.com/google/go-containerregistry/releases/download/v0.20.2/go-containerregistry_Linux_x86_64.tar.gz | \ + tar xz -C /usr/local/bin crane + + - name: GHCR login + run: | + echo "${{ secrets.GITHUB_TOKEN }}" | \ + crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin + + - name: Retag platform :staging- → :latest + run: | + crane tag \ + "${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \ + latest + + - name: Retag tenant :staging- → :latest + run: | + crane tag \ + "${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \ + latest + + - name: Summary + run: | + { + echo "## Canary verified — :latest promoted" + echo + echo "- \`${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${IMAGE_NAME}:latest\`" + echo "- \`${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → 
\`${TENANT_IMAGE_NAME}:latest\`" + echo + echo "Prod tenant fleet will pick up the new digest on its next 5-min auto-update cycle." + } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/publish-workspace-server-image.yml b/.github/workflows/publish-workspace-server-image.yml index 28ef0b79..b76681c4 100644 --- a/.github/workflows/publish-workspace-server-image.yml +++ b/.github/workflows/publish-workspace-server-image.yml @@ -55,7 +55,17 @@ jobs: run: | echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT" - - name: Build & push platform image to GHCR + # Canary-gated release: we publish :staging- ONLY here. The + # :latest tag (which existing prod tenants auto-pull every 5 min) + # is promoted by .github/workflows/canary-verify.yml after the + # staging canary fleet green-lights this digest. + # That means: + # - Every main merge produces a :staging- image + # - Canary tenants (configured to pull :staging-) pick it up + # - canary-verify.yml runs smoke tests against them + # - On green → canary-verify retags :staging- → :latest + # - On red → :latest stays on the prior good digest, prod is safe + - name: Build & push platform image to GHCR (staging- only) uses: docker/build-push-action@v6 with: context: . @@ -63,16 +73,15 @@ jobs: platforms: linux/amd64 push: true tags: | - ${{ env.IMAGE_NAME }}:latest - ${{ env.IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }} + ${{ env.IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }} cache-from: type=gha cache-to: type=gha,mode=max labels: | org.opencontainers.image.source=https://github.com/${{ github.repository }} org.opencontainers.image.revision=${{ github.sha }} - org.opencontainers.image.description=Molecule AI platform (Go API server) + org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify - - name: Build & push tenant image to GHCR + - name: Build & push tenant image to GHCR (staging- only) uses: docker/build-push-action@v6 with: context: . 
@@ -80,11 +89,10 @@ jobs: platforms: linux/amd64 push: true tags: | - ${{ env.TENANT_IMAGE_NAME }}:latest - ${{ env.TENANT_IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }} + ${{ env.TENANT_IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }} cache-from: type=gha cache-to: type=gha,mode=max labels: | org.opencontainers.image.source=https://github.com/${{ github.repository }} org.opencontainers.image.revision=${{ github.sha }} - org.opencontainers.image.description=Molecule AI tenant platform + canvas (one EC2 instance per org) + org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify diff --git a/canvas/src/app/orgs/page.tsx b/canvas/src/app/orgs/page.tsx new file mode 100644 index 00000000..955d3f46 --- /dev/null +++ b/canvas/src/app/orgs/page.tsx @@ -0,0 +1,278 @@ +"use client"; + +// /orgs — the post-signup landing page. +// +// The control plane's Callback handler (authorized via WorkOS) redirects +// every new session to APP_URL/orgs after login/signup succeeds. Before +// this route existed that redirect 404'd and new users were stranded. +// Now: +// - Signed-out browsers are bounced back to /cp/auth/login +// - Zero-org users see a slug-picker → POST /cp/orgs → refresh +// - `awaiting_payment` orgs get a "Complete payment" CTA → /pricing +// - `running` orgs show a link to the tenant URL +// - `provisioning` / `failed` surface the state so the user knows +// why their tenant isn't available yet +// +// Everything here is intentionally server-light: one GET /cp/orgs, +// zero WebSocket, no canvas store hydration — the whole point is a +// quick bounce between signup and either Checkout or the tenant UI. 
+ +import { useEffect, useState } from "react"; +import { fetchSession, redirectToLogin, type Session } from "@/lib/auth"; +import { PLATFORM_URL } from "@/lib/api"; + +type OrgStatus = "awaiting_payment" | "provisioning" | "running" | "failed" | string; + +interface Org { + id: string; + slug: string; + name: string; + plan: string; + status: OrgStatus; + created_at: string; + updated_at: string; +} + +export default function OrgsPage() { + const [session, setSession] = useState("loading"); + const [orgs, setOrgs] = useState(null); + const [error, setError] = useState(null); + + useEffect(() => { + let cancelled = false; + (async () => { + try { + const sess = await fetchSession(); + if (cancelled) return; + if (!sess) { + redirectToLogin(); + return; + } + setSession(sess); + const res = await fetch(`${PLATFORM_URL}/cp/orgs`, { + credentials: "include", + signal: AbortSignal.timeout(15_000), + }); + if (!res.ok) { + throw new Error(`GET /cp/orgs: ${res.status}`); + } + const body = (await res.json()) as { orgs?: Org[] } | Org[]; + const list = Array.isArray(body) ? body : body.orgs ?? []; + if (!cancelled) setOrgs(list); + } catch (err) { + if (!cancelled) { + setError(err instanceof Error ? err.message : String(err)); + } + } + })(); + return () => { + cancelled = true; + }; + }, []); + + if (session === "loading" || (orgs === null && error === null)) { + return

Loading…

; + } + if (error) { + return ( + +

Error: {error}

+ +
+ ); + } + if (!orgs || orgs.length === 0) { + return ; + } + return ( + +
    + {orgs.map((o) => ( + + ))} +
+
+ { + // Refresh the list so the new org appears + its CTA fires. + window.location.reload(); + void slug; + }} + /> +
+
+ ); +} + +function Shell({ children }: { children: React.ReactNode }) { + return ( +
+
+

Your organizations

+

+ Each org is an isolated Molecule workspace. +

+
{children}
+
+
+ ); +} + +function OrgRow({ org }: { org: Org }) { + return ( +
  • +
    +
    +
    {org.name}
    +
    + {org.slug} · · {org.plan || "free"} +
    +
    + +
    +
  • + ); +} + +function StatusLabel({ status }: { status: OrgStatus }) { + const cls = + status === "running" + ? "text-emerald-400" + : status === "awaiting_payment" + ? "text-amber-400" + : status === "failed" + ? "text-red-400" + : "text-sky-400"; + const label = + status === "awaiting_payment" + ? "awaiting payment" + : status; + return {label}; +} + +function OrgCTA({ org }: { org: Org }) { + if (org.status === "running") { + const host = typeof window !== "undefined" ? window.location.hostname : "moleculesai.app"; + const appDomain = host.endsWith(".moleculesai.app") + ? host.split(".").slice(-2).join(".") + : "moleculesai.app"; + const href = `https://${org.slug}.${appDomain}`; + return ( + + Open + + ); + } + if (org.status === "awaiting_payment") { + return ( + + Complete payment + + ); + } + if (org.status === "failed") { + return ( + + Contact support + + ); + } + // provisioning / unknown — non-interactive + return {org.status}…; +} + +function EmptyState() { + return ( + +

    + You don't have any organizations yet. Create one to get started — your + workspace spins up automatically once billing is set up. +

    +
    + { + window.location.reload(); + }} + /> +
    +
    + ); +} + +function CreateOrgForm({ onCreated }: { onCreated: (slug: string) => void }) { + const [slug, setSlug] = useState(""); + const [name, setName] = useState(""); + const [submitting, setSubmitting] = useState(false); + const [err, setErr] = useState(null); + + async function submit(e: React.FormEvent) { + e.preventDefault(); + setSubmitting(true); + setErr(null); + try { + const res = await fetch(`${PLATFORM_URL}/cp/orgs`, { + method: "POST", + credentials: "include", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ slug, name }), + signal: AbortSignal.timeout(15_000), + }); + if (!res.ok) { + const body = await res.text(); + throw new Error(`${res.status}: ${body}`); + } + onCreated(slug); + } catch (e) { + setErr(e instanceof Error ? e.message : String(e)); + setSubmitting(false); + } + } + + return ( +
    + + + {err &&

    {err}

    } + +
    + ); +} diff --git a/canvas/src/lib/api.ts b/canvas/src/lib/api.ts index 6bb091b1..3721dce6 100644 --- a/canvas/src/lib/api.ts +++ b/canvas/src/lib/api.ts @@ -8,6 +8,12 @@ import { getTenantSlug } from "./tenant"; export const PLATFORM_URL = process.env.NEXT_PUBLIC_PLATFORM_URL ?? "http://localhost:8080"; +// 15s is long enough for slow CP queries but short enough that a +// hung backend doesn't leave the UI spinning forever. The abort +// propagates through AbortController so React components can observe +// the error and render a retry affordance. +const DEFAULT_TIMEOUT_MS = 15_000; + async function request( method: string, path: string, @@ -28,6 +34,7 @@ async function request( headers, body: body ? JSON.stringify(body) : undefined, credentials: "include", + signal: AbortSignal.timeout(DEFAULT_TIMEOUT_MS), }); if (!res.ok) { const text = await res.text(); diff --git a/docs/architecture/canary-release.md b/docs/architecture/canary-release.md new file mode 100644 index 00000000..eb795eda --- /dev/null +++ b/docs/architecture/canary-release.md @@ -0,0 +1,79 @@ +# Canary release pipeline + +How a workspace-server code change reaches the prod tenant fleet — and how to stop it if something's wrong. + +## The loop + +``` +PR merged to staging → main + │ + ▼ +publish-workspace-server-image.yml ← pushes :staging- ONLY + │ (NOT :latest — prod is untouched) + ▼ +Canary tenants auto-update to :staging- + │ (5-min auto-updater cycle on each canary EC2) + ▼ +canary-verify.yml waits 6 min, runs scripts/canary-smoke.sh + │ + ├─► GREEN → crane tag :staging- → :latest + │ │ + │ ▼ + │ Prod tenants auto-update within 5 min + │ + └─► RED → :latest stays on prior good digest + GitHub Step Summary flags the rejected sha + Ops fixes forward OR rolls back manually +``` + +## Canary fleet + +Lives in a separate AWS account (`molecule-canary`, `004947743811`) via an assumed role (`MoleculeStagingProvisioner`). 
The CP's `is_canary` org flag routes provisioning there; every other org goes to the default staging account. See `docs/architecture/saas-prod-migration-2026-04-19.md` for the account bootstrap.
+
+Canary tenants are configured to pull `:staging-<sha>` (not `:latest`) via `TENANT_IMAGE` on their provisioner, so they ingest each new build before prod does.
+
+## Smoke suite
+
+`scripts/canary-smoke.sh` hits each canary tenant (URL + ADMIN_TOKEN pair) and asserts:
+
+- `/admin/liveness` returns a subsystems map (tenant booted, AdminAuth reachable)
+- `/workspaces` returns a JSON array (wsAuth + DB healthy)
+- `/memories/commit` + `/memories/search` round-trip (encryption + scrubber)
+- `/events` admin read (C4 fail-closed proof)
+- `/admin/liveness` without bearer → 401 (C4 regression gate)
+
+Expand by editing the script — each `check "name" "expected" "$response"` call is one line.
+
+## Adding a canary tenant
+
+1. `POST /cp/orgs` — create the org normally (is_canary defaults to false)
+2. `POST /cp/admin/orgs/<org_id>/canary` with `{"is_canary": true}` — admin only, refuses to flip if already provisioned
+3. Re-trigger provision (or delete + recreate if the org was already provisioned into staging) — the fresh EC2 lands in account `004947743811`
+
+Then set repo secrets:
+- `CANARY_TENANT_URLS` — append the new tenant's URL
+- `CANARY_ADMIN_TOKENS` — append its ADMIN_TOKEN in the same position
+
+## Rolling back `:latest`
+
+When canary was green but something surfaces post-promotion, retag `:latest` to a prior digest:
+
+```bash
+export GITHUB_TOKEN=ghp_...   # write:packages
+scripts/rollback-latest.sh 4c1d56e   # retags both platform + tenant images
+```
+
+`scripts/rollback-latest.sh` pre-checks that `:staging-<sha>` exists before moving `:latest`, and verifies the digest after the move. Prod tenants pick up the rolled-back image on their next 5-min auto-update.
+ +A post-mortem should always include: +- the commit sha that broke +- why canary didn't catch it (new code path the smoke suite doesn't exercise?) +- whether the smoke suite should grow a new check to prevent the same class of bug + +## What this gate doesn't catch + +- Bugs that only surface under prod-only data (customer workloads with scale or shape canary doesn't produce). Canary uses real traffic shapes but can't simulate weeks of accumulated state. +- Config drift between canary and prod (different env-var values, different feature flags). Keep canary's config deltas minimal and documented. +- Cross-tenant interactions — canary tenants run in their own AWS account, so a bug that only appears when two tenants compete for a shared resource won't reproduce here. + +When these miss, `rollback-latest.sh` is the escape hatch. diff --git a/docs/architecture/saas-prod-migration-2026-04-19.md b/docs/architecture/saas-prod-migration-2026-04-19.md new file mode 100644 index 00000000..05963f76 --- /dev/null +++ b/docs/architecture/saas-prod-migration-2026-04-19.md @@ -0,0 +1,72 @@ +# SaaS prod migration — 2026-04-19 + +Promoted staging → main on both `Molecule-AI/molecule-controlplane` and `Molecule-AI/molecule-core`. This note captures the prod cutover deltas so ops can cross-check against the running system. 
+
+## What changed
+
+Eleven PRs landed, split across the two repos:
+
+**Control plane (`molecule-controlplane`)**
+- PR #50 — C1/C2/C3: bearer auth on `/cp/workspaces/*`, shell-escape tenant user-data, per-tenant security group
+- PR #51 — H1/H2: crash-safe `SECRETS_ENCRYPTION_KEY` log, dropped `admin_token` from `/instance` SELECT
+- PR #52 — SSRF guard on `platform_url`
+- PR #53 — CP injects `MOLECULE_CP_SHARED_SECRET` + `MOLECULE_CP_URL` into tenant env
+- PR #54 — Stripe webhook body capped at 1 MiB
+
+**Core (`molecule-core` / this repo)**
+- PR #978 — H3/H4: LimitReader on Discord webhook + workspace config PATCH
+- PR #979 — C4: `AdminAuth` fail-closed on fresh install when `ADMIN_TOKEN` is set
+- PR #980 — log-scrub: dropped token prefix logging, stopped logging raw upstream response bodies
+- PR #981 — tenant `CPProvisioner` attaches the CP bearer on every outbound `/cp/workspaces/*` call
+- PR #982 — Canvas API fetch timeout (15s)
+- PR #984 — E2E smoke test sync for #966 (public GET no longer exposes `current_task`)
+
+## New prod env vars (Railway, project `molecule-platform`, env `production`)
+
+Set before the CP merge landed:
+
+| Variable | Value shape | Purpose |
+|---|---|---|
+| `PROVISION_SHARED_SECRET` | 32-byte hex | Gates `/cp/workspaces/*` on CP. Routes refuse to mount when unset — C1 fail-closed. |
+| `EC2_VPC_ID` | `vpc-…` | Enables per-tenant SG creation (C3). Shared-SG fallback emits a startup warning. |
+| `CP_BASE_URL` | `https://api.moleculesai.app` | Injected into newly-provisioned tenant containers as `MOLECULE_CP_URL`. |
+
+The live prod `PROVISION_SHARED_SECRET` value is held only in Railway; not committed anywhere. Rotate by `railway variables --set` + redeploy.
+
+## Existing-tenant migration (the sharp edge)
+
+Tenants provisioned **before** this cutover are still running the previous workspace-server image.
When they pull the new image on their next boot or auto-update cycle, their `CPProvisioner` will start expecting `MOLECULE_CP_SHARED_SECRET` in the container env — but the existing tenant EC2s don't have that variable in their user-data (the CP only started injecting it from PR #53 onward).
+
+**Symptom**: a pre-cutover tenant can still serve its users' existing workspaces, but any attempt to **provision a new workspace** from inside the tenant UI will hit the CP's new bearer gate and get `401` or `404` back, surfacing as "workspace provision failed" with a generic error.
+
+**Fix per existing tenant (pick one)**:
+
+1. **SSH in + add the env var**
+   - Copy `PROVISION_SHARED_SECRET` from Railway prod env.
+   - `ssh ubuntu@<tenant-ec2-ip>` and append to the running container's env (`docker stop && docker run … -e MOLECULE_CP_SHARED_SECRET='…' -e MOLECULE_CP_URL=https://api.moleculesai.app …`). Rolling this into an auto-update hook is follow-up work.
+
+2. **Re-provision the tenant**
+   - `DELETE /cp/orgs/:slug` → re-create via normal signup flow. Tenant-level data survives only if the tenant's own Postgres volume is preserved; workspace_id values change. This is the heavy hammer — only for tenants where existing data can be recreated easily.
+
+3. **Wait for the auto-update + user-data refresh cycle**
+   - Tenant auto-updater (cron, 5-minute cadence) pulls the new container image but **does not refresh env vars** — those are frozen from the initial user-data. So option 3 alone doesn't fix this; it still needs option 1 or 2.
+
+Script at `scripts/migrate-tenant-cp-secret.sh` (follow-up) will automate option 1 across all running tenants in the prod AWS account.
+ +## Post-deploy verification checklist + +- [ ] Railway prod deploy for `controlplane` lands on the new commit (check `https://railway.com/project/7ccc…/service/ae76…`) +- [ ] `curl https://api.moleculesai.app/health` → 200 `{service: molecule-cp, status: ok}` +- [ ] `curl -X POST https://api.moleculesai.app/cp/workspaces/provision` (no bearer) → 401 (**not** 404 — proves the env var is live and routes mounted) +- [ ] GHCR publishes new `workspace-server` image for the core main commit +- [ ] Vercel canvas prod deploy lands + +## Rollback + +If prod is on fire: + +1. `gh pr revert 46 -R Molecule-AI/molecule-controlplane` — reverts all 6 CP PRs together. +2. `gh pr revert 983 -R Molecule-AI/molecule-core` — reverts the core bundle. +3. Both reverts auto-deploy via Railway / GHCR / Vercel. + +Existing tenants aren't affected by a rollback — they're running whichever tenant image tag they booted with. Only newly-provisioned tenants pick up the reverted control plane code. diff --git a/scripts/canary-smoke.sh b/scripts/canary-smoke.sh new file mode 100755 index 00000000..0d549de2 --- /dev/null +++ b/scripts/canary-smoke.sh @@ -0,0 +1,120 @@ +#!/bin/bash +# canary-smoke.sh — runs the post-deploy smoke suite against the +# staging canary tenant fleet. Called by the canary-verify.yml GitHub +# Actions workflow after a new workspace-server image gets pushed to +# GHCR; exits non-zero on any failure so the workflow can skip the +# :staging-sha → :latest retag that would otherwise release broken +# code to the prod tenant fleet. +# +# Environment: +# CANARY_TENANT_URLS space-sep list of canary tenant base URLs +# (e.g. "https://canary-pm.staging.moleculesai.app +# https://canary-mcp.staging.moleculesai.app") +# CANARY_ADMIN_TOKENS space-sep list of ADMIN_TOKENs, positionally +# matched to CANARY_TENANT_URLS. Canary tenants +# are provisioned with known ADMIN_TOKENs so CI +# can hit their admin-gated endpoints. 
+# CANARY_CP_BASE_URL CP base URL the canaries call back to +# (https://staging-api.moleculesai.app) +# CANARY_CP_SHARED_SECRET matches CP's PROVISION_SHARED_SECRET so this +# script can also exercise /cp/workspaces/* via +# the canary's own CPProvisioner identity. +# +# Exit codes: 0 = all green, 1 = assertion failure, 2 = setup/env problem. + +set -euo pipefail + +# ── Setup ──────────────────────────────────────────────────────────────── + +: "${CANARY_TENANT_URLS:?space-sep list of canary base URLs required}" +: "${CANARY_ADMIN_TOKENS:?space-sep list of ADMIN_TOKENs required, same order as URLs}" +: "${CANARY_CP_BASE_URL:?CP base URL required}" + +read -r -a URLS <<< "$CANARY_TENANT_URLS" +read -r -a TOKENS <<< "$CANARY_ADMIN_TOKENS" + +if [ "${#URLS[@]}" -ne "${#TOKENS[@]}" ]; then + echo "ERROR: URLS(${#URLS[@]}) and TOKENS(${#TOKENS[@]}) length mismatch" >&2 + exit 2 +fi +if [ "${#URLS[@]}" -eq 0 ]; then + echo "ERROR: no canary URLs configured" >&2 + exit 2 +fi + +PASS=0 +FAIL=0 + +# ── Helpers ────────────────────────────────────────────────────────────── + +check() { + local desc="$1" expected="$2" actual="$3" + if echo "$actual" | grep -qF "$expected"; then + printf " PASS %s\n" "$desc" + PASS=$((PASS + 1)) + else + printf " FAIL %s\n expected to contain: %s\n got: %s\n" "$desc" "$expected" "$actual" >&2 + FAIL=$((FAIL + 1)) + fi +} + +# acurl does an admin-authenticated GET/POST/etc. against a canary tenant. +# Takes +BASE_URL +ADMIN_TOKEN as its first two positional args; the rest +# are passed through to curl. Keeps the two values paired so the wrong +# tenant never gets the wrong token. +acurl() { + local base="$1" token="$2"; shift 2 + curl -sS --max-time 20 -H "Authorization: Bearer $token" "$@" -- "$base${CANARY_ACURL_PATH:-}" +} + +# ── Checks (run per canary tenant) ─────────────────────────────────────── + +for i in "${!URLS[@]}"; do + base="${URLS[$i]}" + token="${TOKENS[$i]}" + printf "\n── %s ──\n" "$base" + + # 1. 
Liveness — the tenant is up and responding to admin auth. + CANARY_ACURL_PATH="/admin/liveness" resp=$(acurl "$base" "$token" || true) + check "liveness returns a subsystems map" '"subsystems"' "$resp" + + # 2. CP env refresh — the workspace-server fetched MOLECULE_CP_SHARED_SECRET + # from CP on startup. We can't read env directly, but we can assert the + # liveness + workspace list both work, which together imply the binary + # booted without crashing on the refresh call. A startup failure in + # refreshEnvFromCP logs but still boots (best-effort semantics), so + # this is a sanity check, not a proof. + CANARY_ACURL_PATH="/workspaces" resp=$(acurl "$base" "$token" || true) + check "workspace list is JSON array" "[" "$resp" + + # 3. Memory commit round-trip — scope=LOCAL so test data stays on this + # tenant. Verifies encryption + scrubber + retrieval end-to-end. + probe_id="canary-smoke-$(date +%s)-$i" + body=$(printf '{"scope":"LOCAL","namespace":"canary-smoke","content":"probe-%s"}' "$probe_id") + CANARY_ACURL_PATH="/memories/commit" resp=$(curl -sS --max-time 20 \ + -X POST -H "Content-Type: application/json" -H "Authorization: Bearer $token" \ + --data "$body" "$base/memories/commit" || true) + check "memory commit accepted" '"id"' "$resp" + + CANARY_ACURL_PATH="/memories/search?query=probe-${probe_id}" \ + resp=$(curl -sS --max-time 20 -H "Authorization: Bearer $token" \ + "$base/memories/search?query=probe-${probe_id}" || true) + check "memory search finds the probe" "probe-${probe_id}" "$resp" + + # 4. Events admin read — AdminAuth path (C4 fail-closed proof on SaaS). + CANARY_ACURL_PATH="/events" resp=$(acurl "$base" "$token" || true) + check "events endpoint returns JSON" "[" "$resp" + + # 5. Negative: unauth'd admin call must 401 (C4 regression gate). 
+ unauth_code=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "$base/admin/liveness" || echo "000") + check "unauth'd /admin/liveness returns 401" "401" "$unauth_code" +done + +# ── Summary ────────────────────────────────────────────────────────────── + +printf "\n=== CANARY SMOKE RESULTS ===\n" +printf " PASS: %d\n FAIL: %d\n" "$PASS" "$FAIL" + +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi diff --git a/scripts/rollback-latest.sh b/scripts/rollback-latest.sh new file mode 100755 index 00000000..ade2051b --- /dev/null +++ b/scripts/rollback-latest.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# rollback-latest.sh — moves the :latest tag on ghcr.io/molecule-ai/platform +# (and the matching tenant image) back to a prior :staging- digest +# without rebuilding anything. Prod tenants auto-pull :latest every 5 +# min, so this is the fast path when a canary-verified image turns out +# to have a runtime regression that canary didn't catch. +# +# Usage: +# scripts/rollback-latest.sh +# scripts/rollback-latest.sh 4c1d56e +# +# Prereqs: +# - crane on $PATH (brew install crane OR download from +# https://github.com/google/go-containerregistry/releases) +# - GHCR token exported as GITHUB_TOKEN with write:packages scope +# +# What it does (per image — platform + tenant): +# crane digest ghcr.io/…: # verify the target sha exists +# crane tag ghcr.io/…: latest # retag remotely, single API call +# crane digest ghcr.io/…:latest # confirm the move +# +# Exit codes: 0 = both retagged, 1 = tag missing / crane error, 2 = bad args. + +set -euo pipefail + +if [ "${1:-}" = "" ]; then + echo "usage: $0 " >&2 + echo " e.g. $0 4c1d56e — retags :latest to :staging-4c1d56e" >&2 + exit 2 +fi + +TARGET_SHA="$1" +PLATFORM=ghcr.io/molecule-ai/platform +TENANT=ghcr.io/molecule-ai/platform-tenant + +if ! command -v crane >/dev/null; then + echo "ERROR: crane not installed. brew install crane" >&2 + exit 1 +fi +if [ -z "${GITHUB_TOKEN:-}" ]; then + echo "ERROR: GITHUB_TOKEN unset. 
export it with write:packages scope." >&2 + exit 1 +fi + +# Log in once. crane stores creds in a config file keyed by registry; +# re-running is cheap. +printf '%s\n' "$GITHUB_TOKEN" | crane auth login ghcr.io -u "${GITHUB_ACTOR:-$(whoami)}" --password-stdin >/dev/null + +roll() { + local image="$1" + local src="$image:staging-$TARGET_SHA" + local dst="$image:latest" + + echo "→ $image" + # Abort rollout if the target tag doesn't exist in the registry. + # Otherwise crane tag would error anyway, but a pre-check gives a + # clearer message for ops. + if ! crane digest "$src" >/dev/null 2>&1; then + echo " FAIL: $src not found in registry. Did you type the wrong sha?" >&2 + return 1 + fi + src_digest=$(crane digest "$src") + + crane tag "$src" latest + new_digest=$(crane digest "$dst") + + if [ "$new_digest" != "$src_digest" ]; then + echo " FAIL: $dst digest $new_digest does not match expected $src_digest" >&2 + return 1 + fi + echo " OK $dst → $new_digest" +} + +roll "$PLATFORM" +roll "$TENANT" + +echo +echo "=== ROLLBACK COMPLETE ===" +echo "Both images now point :latest at staging-$TARGET_SHA." +echo "Prod tenants will pick up the rollback within their 5-min auto-update cycle." diff --git a/workspace-server/.gitignore b/workspace-server/.gitignore index 254defdd..3f67c92f 100644 --- a/workspace-server/.gitignore +++ b/workspace-server/.gitignore @@ -1 +1,2 @@ -server +# The compiled binary, not the cmd/server package. +/server diff --git a/workspace-server/cmd/server/cp_config.go b/workspace-server/cmd/server/cp_config.go new file mode 100644 index 00000000..ff3f24e0 --- /dev/null +++ b/workspace-server/cmd/server/cp_config.go @@ -0,0 +1,107 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "os" + "time" +) + +// refreshEnvFromCP pulls the tenant's current config-plane env vars +// from the control plane and applies them via os.Setenv BEFORE any +// other code calls os.Getenv on them. 
+// +// Why: +// - user-data on the tenant EC2 bakes env vars into `docker run` at +// provision time. Those values are frozen. When we rotate a secret +// on CP (e.g. PROVISION_SHARED_SECRET) there's no way to push the +// new value into already-provisioned tenants. +// - the Docker image auto-updater already pulls the latest workspace- +// server image every 5 min. If THAT image knows how to refresh its +// own env from the CP on startup, every tenant heals itself within +// the update cycle — no ssh, no re-provision, no ops toil. +// +// Contract (paired with cp-side GET /cp/tenants/config): +// Request: GET {MOLECULE_CP_URL or https://api.moleculesai.app}/cp/tenants/config +// Authorization: Bearer +// X-Molecule-Org-Id: +// Response: 200 {"MOLECULE_CP_SHARED_SECRET":"…","MOLECULE_CP_URL":"…", …} +// 401 on bearer mismatch or unknown org +// +// Best-effort: any failure logs and returns — main() keeps booting. +// Self-hosted deploys without MOLECULE_ORG_ID or ADMIN_TOKEN set +// short-circuit silently so this function is a no-op there. +func refreshEnvFromCP() error { + orgID := os.Getenv("MOLECULE_ORG_ID") + adminToken := os.Getenv("ADMIN_TOKEN") + if orgID == "" || adminToken == "" { + // Not a SaaS tenant (self-hosted dev or not yet provisioned). + return nil + } + + base := os.Getenv("MOLECULE_CP_URL") + if base == "" { + // Default to prod for any tenant that lost track of its CP URL + // (e.g. older user-data that only set MOLECULE_ORG_ID). 
+ base = "https://api.moleculesai.app" + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, "GET", base+"/cp/tenants/config", nil) + if err != nil { + return fmt.Errorf("build request: %w", err) + } + req.Header.Set("Authorization", "Bearer "+adminToken) + req.Header.Set("X-Molecule-Org-Id", orgID) + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return fmt.Errorf("do request: %w", err) + } + defer resp.Body.Close() + + // 64 KiB cap — the CP only returns small JSON blobs here. An + // unbounded read would be weaponizable if a compromised upstream + // ever echoed back a gigabyte. + body, err := io.ReadAll(io.LimitReader(resp.Body, 64<<10)) + if err != nil { + return fmt.Errorf("read body: %w", err) + } + + if resp.StatusCode != http.StatusOK { + // 401 on first boot-after-restart is expected for tenants still + // running under old user-data where admin_token on-disk hasn't + // had its corresponding row seeded. Don't treat as fatal — just + // log so operators can spot repeat offenders in logs. + return fmt.Errorf("cp returned %d", resp.StatusCode) + } + + var cfg map[string]string + if err := json.Unmarshal(body, &cfg); err != nil { + return fmt.Errorf("decode: %w", err) + } + + // Apply only strings; reject oversized values defensively. An + // operator-supplied config should never exceed 4 KiB per key — + // workspace-server env vars are URLs, hex secrets, short identifiers. 
+ const maxValueBytes = 4 << 10 + applied := 0 + for k, v := range cfg { + if k == "" || len(v) > maxValueBytes { + continue + } + if err := os.Setenv(k, v); err != nil { + log.Printf("CP env refresh: setenv %s: %v", k, err) + continue + } + applied++ + } + log.Printf("CP env refresh: applied %d values from %s/cp/tenants/config", applied, base) + return nil +} diff --git a/workspace-server/cmd/server/cp_config_test.go b/workspace-server/cmd/server/cp_config_test.go new file mode 100644 index 00000000..fddcedde --- /dev/null +++ b/workspace-server/cmd/server/cp_config_test.go @@ -0,0 +1,100 @@ +package main + +import ( + "fmt" + "net/http" + "net/http/httptest" + "os" + "testing" +) + +// TestRefreshEnvFromCP_NoopWhenNotSaaS: without MOLECULE_ORG_ID or +// ADMIN_TOKEN, the function short-circuits silently — self-hosted dev +// must not fail or log spam here. +func TestRefreshEnvFromCP_NoopWhenNotSaaS(t *testing.T) { + t.Setenv("MOLECULE_ORG_ID", "") + t.Setenv("ADMIN_TOKEN", "") + if err := refreshEnvFromCP(); err != nil { + t.Errorf("expected nil on non-SaaS, got %v", err) + } +} + +// TestRefreshEnvFromCP_AppliesCPResponse: wire a stub CP, run refresh, +// confirm the returned env vars ended up in os.Environ(). 
+func TestRefreshEnvFromCP_AppliesCPResponse(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if got := r.Header.Get("Authorization"); got != "Bearer tenant-admin-token" { + t.Errorf("bearer: got %q", got) + } + if got := r.Header.Get("X-Molecule-Org-Id"); got != "org-abc" { + t.Errorf("org id header: got %q", got) + } + w.Header().Set("Content-Type", "application/json") + fmt.Fprint(w, `{"MOLECULE_CP_SHARED_SECRET":"new-secret","MOLECULE_CP_URL":"https://api.moleculesai.app"}`) + })) + defer srv.Close() + + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("ADMIN_TOKEN", "tenant-admin-token") + t.Setenv("MOLECULE_CP_URL", srv.URL) + t.Setenv("MOLECULE_CP_SHARED_SECRET", "") // clear before refresh + + if err := refreshEnvFromCP(); err != nil { + t.Fatalf("refreshEnvFromCP: %v", err) + } + if got := os.Getenv("MOLECULE_CP_SHARED_SECRET"); got != "new-secret" { + t.Errorf("SHARED_SECRET: want new-secret, got %q", got) + } +} + +// TestRefreshEnvFromCP_CPUnreachableDoesNotFailBoot: network errors must +// return non-nil BUT main.go treats that as warn-and-continue. We assert +// the function returns an error (not a panic) so the caller can log. +func TestRefreshEnvFromCP_CPUnreachableDoesNotFailBoot(t *testing.T) { + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("ADMIN_TOKEN", "t") + t.Setenv("MOLECULE_CP_URL", "http://127.0.0.1:1") // closed port + err := refreshEnvFromCP() + if err == nil { + t.Error("expected an error when CP is unreachable") + } +} + +// TestRefreshEnvFromCP_NonOKPropagates: CP returns 500 → error. 
+func TestRefreshEnvFromCP_NonOKPropagates(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "boom", http.StatusInternalServerError) + })) + defer srv.Close() + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("ADMIN_TOKEN", "t") + t.Setenv("MOLECULE_CP_URL", srv.URL) + if err := refreshEnvFromCP(); err == nil { + t.Error("expected error on 500, got nil") + } +} + +// TestRefreshEnvFromCP_RejectsOversizedValue: a single-value-over-4KiB +// payload must NOT poison the environment. +func TestRefreshEnvFromCP_RejectsOversizedValue(t *testing.T) { + giant := make([]byte, 5<<10) + for i := range giant { + giant[i] = 'x' + } + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + fmt.Fprintf(w, `{"MOLECULE_CP_SHARED_SECRET":%q}`, string(giant)) + })) + defer srv.Close() + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("ADMIN_TOKEN", "t") + t.Setenv("MOLECULE_CP_URL", srv.URL) + t.Setenv("MOLECULE_CP_SHARED_SECRET", "original") + if err := refreshEnvFromCP(); err != nil { + t.Fatalf("refreshEnvFromCP: %v", err) + } + if got := os.Getenv("MOLECULE_CP_SHARED_SECRET"); got != "original" { + t.Errorf("oversized value was applied — want %q, got %d bytes", + "original", len(got)) + } +} diff --git a/workspace-server/cmd/server/main.go b/workspace-server/cmd/server/main.go index 88ef581d..3855a859 100644 --- a/workspace-server/cmd/server/main.go +++ b/workspace-server/cmd/server/main.go @@ -30,6 +30,16 @@ import ( ) func main() { + // CP self-refresh: pull any operator-rotated config (e.g. a new + // MOLECULE_CP_SHARED_SECRET) before any other code reads env. + // Best-effort — if the CP is unreachable we keep booting with the + // env we were provisioned with. Older SaaS tenants predate PR #53 + // and can arrive here with MOLECULE_CP_SHARED_SECRET unset; this + // is how they heal without SSH. 
+ if err := refreshEnvFromCP(); err != nil { + log.Printf("CP env refresh: %v (continuing with baked-in env)", err) + } + // Secrets encryption. In MOLECULE_ENV=prod, boot refuses to start // without a valid SECRETS_ENCRYPTION_KEY (fail-secure — Top-5 #5). // In any other environment, missing keys just log a warning and diff --git a/workspace-server/internal/provisioner/cp_provisioner_test.go b/workspace-server/internal/provisioner/cp_provisioner_test.go new file mode 100644 index 00000000..ce49a352 --- /dev/null +++ b/workspace-server/internal/provisioner/cp_provisioner_test.go @@ -0,0 +1,150 @@ +package provisioner + +import ( + "context" + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +// TestNewCPProvisioner_RequiresOrgID — self-hosted deployments don't +// have a MOLECULE_ORG_ID, and the provisioner must refuse to construct +// rather than silently phone home to the prod CP with an empty tenant. +func TestNewCPProvisioner_RequiresOrgID(t *testing.T) { + t.Setenv("MOLECULE_ORG_ID", "") + if _, err := NewCPProvisioner(); err == nil { + t.Error("want error when MOLECULE_ORG_ID is unset, got nil") + } +} + +// TestNewCPProvisioner_FallsBackToProvisionSharedSecret — operators +// may set PROVISION_SHARED_SECRET on both sides of the wire with a +// single value; the tenant accepts that name as a fallback for +// MOLECULE_CP_SHARED_SECRET. The fallback is documented in +// NewCPProvisioner; this test is the regression gate. 
+func TestNewCPProvisioner_FallsBackToProvisionSharedSecret(t *testing.T) { + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("MOLECULE_CP_SHARED_SECRET", "") + t.Setenv("PROVISION_SHARED_SECRET", "from-fallback") + + p, err := NewCPProvisioner() + if err != nil { + t.Fatalf("NewCPProvisioner: %v", err) + } + if p.sharedSecret != "from-fallback" { + t.Errorf("sharedSecret = %q, want %q", p.sharedSecret, "from-fallback") + } +} + +// TestAuthHeader_NoopWhenSecretEmpty — the self-hosted path that +// doesn't gate /cp/workspaces/* must not add a stray Authorization +// header (bearer-like content would be surprising to non-bearer +// intermediaries). +func TestAuthHeader_NoopWhenSecretEmpty(t *testing.T) { + p := &CPProvisioner{sharedSecret: ""} + req := httptest.NewRequest("GET", "http://x/", nil) + p.authHeader(req) + if got := req.Header.Get("Authorization"); got != "" { + t.Errorf("Authorization set to %q with empty secret; want unset", got) + } +} + +// TestAuthHeader_SetsBearerWhenSecretSet — happy path. +func TestAuthHeader_SetsBearerWhenSecretSet(t *testing.T) { + p := &CPProvisioner{sharedSecret: "the-secret"} + req := httptest.NewRequest("GET", "http://x/", nil) + p.authHeader(req) + if got := req.Header.Get("Authorization"); got != "Bearer the-secret" { + t.Errorf("Authorization = %q, want %q", got, "Bearer the-secret") + } +} + +// TestStart_HappyPath — Start posts to the stubbed CP, passes the +// bearer, and parses the returned instance_id. 
+func TestStart_HappyPath(t *testing.T) { + var sawBearer string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + sawBearer = r.Header.Get("Authorization") + if r.URL.Path != "/cp/workspaces/provision" { + t.Errorf("unexpected path %s", r.URL.Path) + } + // Verify the request body round-trips our fields + var body cpProvisionRequest + _ = json.NewDecoder(r.Body).Decode(&body) + if body.WorkspaceID != "ws-1" || body.Runtime != "python" { + t.Errorf("body mismatch: %+v", body) + } + w.WriteHeader(http.StatusCreated) + _, _ = io.WriteString(w, `{"instance_id":"i-abc123","state":"pending"}`) + })) + defer srv.Close() + + p := &CPProvisioner{ + baseURL: srv.URL, + orgID: "org-1", + sharedSecret: "s3cret", + httpClient: srv.Client(), + } + + id, err := p.Start(context.Background(), WorkspaceConfig{ + WorkspaceID: "ws-1", Runtime: "python", Tier: 1, PlatformURL: "http://tenant", + }) + if err != nil { + t.Fatalf("Start: %v", err) + } + if id != "i-abc123" { + t.Errorf("instance id = %q, want i-abc123", id) + } + if sawBearer != "Bearer s3cret" { + t.Errorf("server saw Authorization = %q, want Bearer s3cret", sawBearer) + } +} + +// TestStart_Non201ReturnsStructuredError — when CP returns 401 with a +// structured {"error":"..."} body, Start surfaces that error message. +// Verifies the defense against log-leaking raw upstream bodies. 
+func TestStart_Non201ReturnsStructuredError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusUnauthorized) + _, _ = io.WriteString(w, `{"error":"invalid credentials"}`) + })) + defer srv.Close() + + p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()} + + _, err := p.Start(context.Background(), WorkspaceConfig{WorkspaceID: "ws-1", Runtime: "py"}) + if err == nil { + t.Fatal("expected error on 401, got nil") + } + if !strings.Contains(err.Error(), "invalid credentials") { + t.Errorf("error message %q should include upstream error field", err.Error()) + } +} + +// TestStart_NoStructuredErrorFallsBackToSize — the anti-leak path: +// when upstream returns non-JSON, we refuse to log the body and +// report only the byte count, preventing Authorization header echoes +// from landing in our logs. +func TestStart_NoStructuredErrorFallsBackToSize(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + _, _ = io.WriteString(w, "raw proxy error page, could contain echoed headers") + })) + defer srv.Close() + + p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()} + + _, err := p.Start(context.Background(), WorkspaceConfig{WorkspaceID: "ws-1", Runtime: "py"}) + if err == nil { + t.Fatal("expected error on 500, got nil") + } + if strings.Contains(err.Error(), "raw proxy error") { + t.Errorf("error leaked raw body: %q", err.Error()) + } + if !strings.Contains(err.Error(), "= 3 { lastStatus = "stale" lastError = fmt.Sprintf("empty response %d consecutive times — agent may be phantom-producing (#795)", consecEmpty)