From 206856ad3a53cc8af0fa5973c5f4dc87d6807951 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 19 Apr 2026 02:12:47 -0700 Subject: [PATCH 1/9] fix(canvas): add 15s fetch timeout on API calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-launch audit flagged api.ts as missing a timeout on every fetch. A slow or hung CP response would leave the UI spinning indefinitely with no way for the user to abort — effectively a client-side DoS. 15s is long enough for real CP queries (slowest observed is Stripe portal redirect at ~3s) and short enough that a stalled backend surfaces as a clear error with a retry affordance. Uses AbortSignal.timeout (widely supported since 2023) so the abort propagates through React Query / SWR consumers cleanly. Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/src/lib/api.ts | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/canvas/src/lib/api.ts b/canvas/src/lib/api.ts index 6bb091b1..3721dce6 100644 --- a/canvas/src/lib/api.ts +++ b/canvas/src/lib/api.ts @@ -8,6 +8,12 @@ import { getTenantSlug } from "./tenant"; export const PLATFORM_URL = process.env.NEXT_PUBLIC_PLATFORM_URL ?? "http://localhost:8080"; +// 15s is long enough for slow CP queries but short enough that a +// hung backend doesn't leave the UI spinning forever. The abort +// propagates through AbortController so React components can observe +// the error and render a retry affordance. +const DEFAULT_TIMEOUT_MS = 15_000; + async function request( method: string, path: string, @@ -28,6 +34,7 @@ async function request( headers, body: body ? 
JSON.stringify(body) : undefined, credentials: "include", + signal: AbortSignal.timeout(DEFAULT_TIMEOUT_MS), }); if (!res.ok) { const text = await res.text(); From 96535c30ccb43ea47ec40b3aa7f5c2a072441bdc Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 19 Apr 2026 02:29:31 -0700 Subject: [PATCH 2/9] docs: 2026-04-19 SaaS prod migration notes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Captures the 10-PR staging→main cutover: what shipped, the three new Railway prod env vars (PROVISION_SHARED_SECRET / EC2_VPC_ID / CP_BASE_URL), and the sharp edge for existing tenants — their containers pre-date PR #53 so they still need MOLECULE_CP_SHARED_SECRET added manually (or a re-provision) before the new CPProvisioner's outbound bearer works. Also includes a post-deploy verification checklist and rollback plan. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../saas-prod-migration-2026-04-19.md | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 docs/architecture/saas-prod-migration-2026-04-19.md diff --git a/docs/architecture/saas-prod-migration-2026-04-19.md b/docs/architecture/saas-prod-migration-2026-04-19.md new file mode 100644 index 00000000..05963f76 --- /dev/null +++ b/docs/architecture/saas-prod-migration-2026-04-19.md @@ -0,0 +1,72 @@ +# SaaS prod migration — 2026-04-19 + +Promoted staging → main on both `Molecule-AI/molecule-controlplane` and `Molecule-AI/molecule-core`. This note captures the prod cutover deltas so ops can cross-check against the running system. 
+ +## What changed + +Ten PRs landed, split across the two repos: + +**Control plane (`molecule-controlplane`)** +- PR #50 — C1/C2/C3: bearer auth on `/cp/workspaces/*`, shell-escape tenant user-data, per-tenant security group +- PR #51 — H1/H2: crash-safe `SECRETS_ENCRYPTION_KEY` log, dropped `admin_token` from `/instance` SELECT +- PR #52 — SSRF guard on `platform_url` +- PR #53 — CP injects `MOLECULE_CP_SHARED_SECRET` + `MOLECULE_CP_URL` into tenant env +- PR #54 — Stripe webhook body capped at 1 MiB + +**Core (`molecule-core` / this repo)** +- PR #978 — H3/H4: LimitReader on Discord webhook + workspace config PATCH +- PR #979 — C4: `AdminAuth` fail-closed on fresh install when `ADMIN_TOKEN` is set +- PR #980 — log-scrub: dropped token prefix logging, stopped logging raw upstream response bodies +- PR #981 — tenant `CPProvisioner` attaches the CP bearer on every outbound `/cp/workspaces/*` call +- PR #982 — Canvas API fetch timeout (15s) +- PR #984 — E2E smoke test sync for #966 (public GET no longer exposes `current_task`) + +## New prod env vars (Railway, project `molecule-platform`, env `production`) + +Set before the CP merge landed: + +| Variable | Value shape | Purpose | +|---|---|---| +| `PROVISION_SHARED_SECRET` | 32-byte hex | Gates `/cp/workspaces/*` on CP. Routes refuse to mount when unset — C1 fail-closed. | +| `EC2_VPC_ID` | `vpc-…` | Enables per-tenant SG creation (C3). Shared-SG fallback emits a startup warning. | +| `CP_BASE_URL` | `https://api.moleculesai.app` | Injected into newly-provisioned tenant containers as `MOLECULE_CP_URL`. | + +The live prod `PROVISION_SHARED_SECRET` value is held only in Railway; not committed anywhere. Rotate by `railway variables --set` + redeploy. + +## Existing-tenant migration (the sharp edge) + +Tenants provisioned **before** this cutover are still running the previous workspace-server image. 
When they pull the new image on their next boot or auto-update cycle, their `CPProvisioner` will start expecting `MOLECULE_CP_SHARED_SECRET` in the container env — but the existing tenant EC2s don't have that variable in their user-data (the CP only started injecting it from PR #53 onward). + +**Symptom**: a pre-cutover tenant can still serve its users' existing workspaces, but any attempt to **provision a new workspace** from inside the tenant UI will hit the CP's new bearer gate and get `401` or `404` back, surfacing as "workspace provision failed" with a generic error. + +**Fix per existing tenant (pick one)**: + +1. **SSH in + add the env var** + - Copy `PROVISION_SHARED_SECRET` from Railway prod env. + - `ssh ubuntu@` and append to the running container's env (`docker stop && docker run … -e MOLECULE_CP_SHARED_SECRET='…' -e MOLECULE_CP_URL=https://api.moleculesai.app …`). Rolling this into an auto-update hook is follow-up work. + +2. **Re-provision the tenant** + - `DELETE /cp/orgs/:slug` → re-create via normal signup flow. Tenant-level data survives only if the tenant's own Postgres volume is preserved; workspace_id values change. This is the heavy hammer — only for tenants where existing data can be recreated easily. + +3. **Wait for the auto-update + user-data refresh cycle** + - Tenant auto-updater (cron, 5-minute cadence) pulls the new container image but **does not refresh env vars** — those are frozen from the initial user-data. So option 3 alone doesn't fix this; it still needs option 1 or 2. + +Script at `scripts/migrate-tenant-cp-secret.sh` (follow-up) will automate option 1 across all running tenants in the prod AWS account. 
+ +## Post-deploy verification checklist + +- [ ] Railway prod deploy for `controlplane` lands on the new commit (check `https://railway.com/project/7ccc…/service/ae76…`) +- [ ] `curl https://api.moleculesai.app/health` → 200 `{service: molecule-cp, status: ok}` +- [ ] `curl -X POST https://api.moleculesai.app/cp/workspaces/provision` (no bearer) → 401 (**not** 404 — proves the env var is live and routes mounted) +- [ ] GHCR publishes new `workspace-server` image for the core main commit +- [ ] Vercel canvas prod deploy lands + +## Rollback + +If prod is on fire: + +1. `gh pr revert 46 -R Molecule-AI/molecule-controlplane` — reverts all 6 CP PRs together. +2. `gh pr revert 983 -R Molecule-AI/molecule-core` — reverts the core bundle. +3. Both reverts auto-deploy via Railway / GHCR / Vercel. + +Existing tenants aren't affected by a rollback — they're running whichever tenant image tag they booted with. Only newly-provisioned tenants pick up the reverted control plane code. From 48ec5b2dc8f7ee53c0cce28be0488d1184a4b93a Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 19 Apr 2026 02:41:15 -0700 Subject: [PATCH 3/9] feat(ws-server): pull env from CP on startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Paired with molecule-controlplane PR #55 (GET /cp/tenants/config). Lets existing tenants heal themselves when we rotate or add a CP-side env var (e.g. MOLECULE_CP_SHARED_SECRET landing earlier today) without any ssh or re-provision. Flow: main() calls refreshEnvFromCP() before any other os.Getenv read. The helper reads MOLECULE_ORG_ID + ADMIN_TOKEN from the baked-in user-data env, GETs {MOLECULE_CP_URL}/cp/tenants/config with those credentials, and applies the returned string map via os.Setenv so downstream code (CPProvisioner, etc.) sees the fresh values. 
Best-effort semantics: - self-hosted / no MOLECULE_ORG_ID → no-op (return nil) - CP unreachable / non-200 → log + return error (main keeps booting) - oversized values (>4 KiB each) rejected to avoid env pollution - body read capped at 64 KiB Once this image hits GHCR, the 5-minute tenant auto-updater picks it up, the container restarts, refresh runs, and every tenant has MOLECULE_CP_SHARED_SECRET within ~5 minutes — no operator toil. Also fixes workspace-server/.gitignore so `server` no longer matches the cmd/server package dir — it only ignored the compiled binary but pattern was too broad. Anchored to `/server`. Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace-server/.gitignore | 3 +- workspace-server/cmd/server/cp_config.go | 107 ++++++++++++++++++ workspace-server/cmd/server/cp_config_test.go | 100 ++++++++++++++++ workspace-server/cmd/server/main.go | 10 ++ 4 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 workspace-server/cmd/server/cp_config.go create mode 100644 workspace-server/cmd/server/cp_config_test.go diff --git a/workspace-server/.gitignore b/workspace-server/.gitignore index 254defdd..3f67c92f 100644 --- a/workspace-server/.gitignore +++ b/workspace-server/.gitignore @@ -1 +1,2 @@ -server +# The compiled binary, not the cmd/server package. +/server diff --git a/workspace-server/cmd/server/cp_config.go b/workspace-server/cmd/server/cp_config.go new file mode 100644 index 00000000..ff3f24e0 --- /dev/null +++ b/workspace-server/cmd/server/cp_config.go @@ -0,0 +1,107 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "os" + "time" +) + +// refreshEnvFromCP pulls the tenant's current config-plane env vars +// from the control plane and applies them via os.Setenv BEFORE any +// other code calls os.Getenv on them. +// +// Why: +// - user-data on the tenant EC2 bakes env vars into `docker run` at +// provision time. Those values are frozen. When we rotate a secret +// on CP (e.g. 
PROVISION_SHARED_SECRET) there's no way to push the +// new value into already-provisioned tenants. +// - the Docker image auto-updater already pulls the latest workspace- +// server image every 5 min. If THAT image knows how to refresh its +// own env from the CP on startup, every tenant heals itself within +// the update cycle — no ssh, no re-provision, no ops toil. +// +// Contract (paired with cp-side GET /cp/tenants/config): +// Request: GET {MOLECULE_CP_URL or https://api.moleculesai.app}/cp/tenants/config +// Authorization: Bearer +// X-Molecule-Org-Id: +// Response: 200 {"MOLECULE_CP_SHARED_SECRET":"…","MOLECULE_CP_URL":"…", …} +// 401 on bearer mismatch or unknown org +// +// Best-effort: any failure logs and returns — main() keeps booting. +// Self-hosted deploys without MOLECULE_ORG_ID or ADMIN_TOKEN set +// short-circuit silently so this function is a no-op there. +func refreshEnvFromCP() error { + orgID := os.Getenv("MOLECULE_ORG_ID") + adminToken := os.Getenv("ADMIN_TOKEN") + if orgID == "" || adminToken == "" { + // Not a SaaS tenant (self-hosted dev or not yet provisioned). + return nil + } + + base := os.Getenv("MOLECULE_CP_URL") + if base == "" { + // Default to prod for any tenant that lost track of its CP URL + // (e.g. older user-data that only set MOLECULE_ORG_ID). + base = "https://api.moleculesai.app" + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, "GET", base+"/cp/tenants/config", nil) + if err != nil { + return fmt.Errorf("build request: %w", err) + } + req.Header.Set("Authorization", "Bearer "+adminToken) + req.Header.Set("X-Molecule-Org-Id", orgID) + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return fmt.Errorf("do request: %w", err) + } + defer resp.Body.Close() + + // 64 KiB cap — the CP only returns small JSON blobs here. 
An + // unbounded read would be weaponizable if a compromised upstream + // ever echoed back a gigabyte. + body, err := io.ReadAll(io.LimitReader(resp.Body, 64<<10)) + if err != nil { + return fmt.Errorf("read body: %w", err) + } + + if resp.StatusCode != http.StatusOK { + // 401 on first boot-after-restart is expected for tenants still + // running under old user-data where admin_token on-disk hasn't + // had its corresponding row seeded. Don't treat as fatal — just + // log so operators can spot repeat offenders in logs. + return fmt.Errorf("cp returned %d", resp.StatusCode) + } + + var cfg map[string]string + if err := json.Unmarshal(body, &cfg); err != nil { + return fmt.Errorf("decode: %w", err) + } + + // Apply only strings; reject oversized values defensively. An + // operator-supplied config should never exceed 4 KiB per key — + // workspace-server env vars are URLs, hex secrets, short identifiers. + const maxValueBytes = 4 << 10 + applied := 0 + for k, v := range cfg { + if k == "" || len(v) > maxValueBytes { + continue + } + if err := os.Setenv(k, v); err != nil { + log.Printf("CP env refresh: setenv %s: %v", k, err) + continue + } + applied++ + } + log.Printf("CP env refresh: applied %d values from %s/cp/tenants/config", applied, base) + return nil +} diff --git a/workspace-server/cmd/server/cp_config_test.go b/workspace-server/cmd/server/cp_config_test.go new file mode 100644 index 00000000..fddcedde --- /dev/null +++ b/workspace-server/cmd/server/cp_config_test.go @@ -0,0 +1,100 @@ +package main + +import ( + "fmt" + "net/http" + "net/http/httptest" + "os" + "testing" +) + +// TestRefreshEnvFromCP_NoopWhenNotSaaS: without MOLECULE_ORG_ID or +// ADMIN_TOKEN, the function short-circuits silently — self-hosted dev +// must not fail or log spam here. 
+func TestRefreshEnvFromCP_NoopWhenNotSaaS(t *testing.T) { + t.Setenv("MOLECULE_ORG_ID", "") + t.Setenv("ADMIN_TOKEN", "") + if err := refreshEnvFromCP(); err != nil { + t.Errorf("expected nil on non-SaaS, got %v", err) + } +} + +// TestRefreshEnvFromCP_AppliesCPResponse: wire a stub CP, run refresh, +// confirm the returned env vars ended up in os.Environ(). +func TestRefreshEnvFromCP_AppliesCPResponse(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if got := r.Header.Get("Authorization"); got != "Bearer tenant-admin-token" { + t.Errorf("bearer: got %q", got) + } + if got := r.Header.Get("X-Molecule-Org-Id"); got != "org-abc" { + t.Errorf("org id header: got %q", got) + } + w.Header().Set("Content-Type", "application/json") + fmt.Fprint(w, `{"MOLECULE_CP_SHARED_SECRET":"new-secret","MOLECULE_CP_URL":"https://api.moleculesai.app"}`) + })) + defer srv.Close() + + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("ADMIN_TOKEN", "tenant-admin-token") + t.Setenv("MOLECULE_CP_URL", srv.URL) + t.Setenv("MOLECULE_CP_SHARED_SECRET", "") // clear before refresh + + if err := refreshEnvFromCP(); err != nil { + t.Fatalf("refreshEnvFromCP: %v", err) + } + if got := os.Getenv("MOLECULE_CP_SHARED_SECRET"); got != "new-secret" { + t.Errorf("SHARED_SECRET: want new-secret, got %q", got) + } +} + +// TestRefreshEnvFromCP_CPUnreachableDoesNotFailBoot: network errors must +// return non-nil BUT main.go treats that as warn-and-continue. We assert +// the function returns an error (not a panic) so the caller can log. +func TestRefreshEnvFromCP_CPUnreachableDoesNotFailBoot(t *testing.T) { + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("ADMIN_TOKEN", "t") + t.Setenv("MOLECULE_CP_URL", "http://127.0.0.1:1") // closed port + err := refreshEnvFromCP() + if err == nil { + t.Error("expected an error when CP is unreachable") + } +} + +// TestRefreshEnvFromCP_NonOKPropagates: CP returns 500 → error. 
+func TestRefreshEnvFromCP_NonOKPropagates(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "boom", http.StatusInternalServerError) + })) + defer srv.Close() + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("ADMIN_TOKEN", "t") + t.Setenv("MOLECULE_CP_URL", srv.URL) + if err := refreshEnvFromCP(); err == nil { + t.Error("expected error on 500, got nil") + } +} + +// TestRefreshEnvFromCP_RejectsOversizedValue: a single-value-over-4KiB +// payload must NOT poison the environment. +func TestRefreshEnvFromCP_RejectsOversizedValue(t *testing.T) { + giant := make([]byte, 5<<10) + for i := range giant { + giant[i] = 'x' + } + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + fmt.Fprintf(w, `{"MOLECULE_CP_SHARED_SECRET":%q}`, string(giant)) + })) + defer srv.Close() + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("ADMIN_TOKEN", "t") + t.Setenv("MOLECULE_CP_URL", srv.URL) + t.Setenv("MOLECULE_CP_SHARED_SECRET", "original") + if err := refreshEnvFromCP(); err != nil { + t.Fatalf("refreshEnvFromCP: %v", err) + } + if got := os.Getenv("MOLECULE_CP_SHARED_SECRET"); got != "original" { + t.Errorf("oversized value was applied — want %q, got %d bytes", + "original", len(got)) + } +} diff --git a/workspace-server/cmd/server/main.go b/workspace-server/cmd/server/main.go index 88ef581d..3855a859 100644 --- a/workspace-server/cmd/server/main.go +++ b/workspace-server/cmd/server/main.go @@ -30,6 +30,16 @@ import ( ) func main() { + // CP self-refresh: pull any operator-rotated config (e.g. a new + // MOLECULE_CP_SHARED_SECRET) before any other code reads env. + // Best-effort — if the CP is unreachable we keep booting with the + // env we were provisioned with. Older SaaS tenants predate PR #53 + // and can arrive here with MOLECULE_CP_SHARED_SECRET unset; this + // is how they heal without SSH. 
+ if err := refreshEnvFromCP(); err != nil { + log.Printf("CP env refresh: %v (continuing with baked-in env)", err) + } + // Secrets encryption. In MOLECULE_ENV=prod, boot refuses to start // without a valid SECRETS_ENCRYPTION_KEY (fail-secure — Top-5 #5). // In any other environment, missing keys just log a warning and From 9662590360dbcdb06004e88f0b69aa880d710e47 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 19 Apr 2026 03:30:19 -0700 Subject: [PATCH 4/9] feat(canary): smoke harness + GHA verification workflow (Phase 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Post-deploy verification for staging tenant images. Runs against the canary fleet after each publish-workspace-server-image build — catches auto-update breakage (a la today's E2E current_task drift) before it propagates to the prod tenant fleet that auto-pulls :latest every 5 min. scripts/canary-smoke.sh iterates a space-sep list of canary base URLs (paired with their ADMIN_TOKENs) and checks: - /admin/liveness reachable with admin bearer (tenant boot OK) - /workspaces list responds (wsAuth + DB path OK) - /memories/commit + /memories/search round-trip (encryption + scrubber) - /events admin read (AdminAuth C4 path) - /admin/liveness without bearer returns 401 (C4 fail-closed regression) .github/workflows/canary-verify.yml runs after publish succeeds: - 6-min sleep (tenant auto-updater pulls every 5 min) - bash scripts/canary-smoke.sh with secrets pulled from repo settings - on failure: writes a Step Summary flagging that :latest should be rolled back to prior known-good digest Phase 3 follow-up will split the publish workflow so only :staging- ships initially, and canary-verify's green gate is what promotes :staging- → :latest. This commit lays the test gate alone so we have something running against tenants immediately. 
Secrets to set in GitHub repo settings before this workflow can run: - CANARY_TENANT_URLS (space-sep list) - CANARY_ADMIN_TOKENS (same order as URLs) - CANARY_CP_SHARED_SECRET (matches staging CP PROVISION_SHARED_SECRET) Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/canary-verify.yml | 56 +++++++++++++ scripts/canary-smoke.sh | 120 ++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+) create mode 100644 .github/workflows/canary-verify.yml create mode 100755 scripts/canary-smoke.sh diff --git a/.github/workflows/canary-verify.yml b/.github/workflows/canary-verify.yml new file mode 100644 index 00000000..55f61a9b --- /dev/null +++ b/.github/workflows/canary-verify.yml @@ -0,0 +1,56 @@ +name: canary-verify + +# Runs the canary smoke suite against the staging canary tenant fleet +# after a new workspace-server image lands on :latest. On failure, +# alerts via a GitHub Actions summary — follow-up PR will add: +# - :staging- intermediate tag published BY publish workflow +# - retag :staging- → :latest ONLY when this workflow is green +# - Telegram/Slack notifier on red +# For now this exists as the test gate itself so we can catch +# auto-update breakage before it reaches the prod tenant fleet +# (which auto-pulls :latest every 5 min). + +on: + workflow_run: + workflows: ["publish-workspace-server-image"] + types: [completed] + workflow_dispatch: + +permissions: + contents: read + actions: read + +jobs: + canary-smoke: + # Skip when the upstream workflow failed — no image to test against. + if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }} + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Wait for canary tenants to pick up new image + # Tenant auto-updater runs every 5 min. Sleep 6 min to give every + # canary time to pull + restart. Cheaper than polling. 
+ run: sleep 360 + + - name: Run canary smoke suite + env: + CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }} + CANARY_ADMIN_TOKENS: ${{ secrets.CANARY_ADMIN_TOKENS }} + CANARY_CP_BASE_URL: https://staging-api.moleculesai.app + CANARY_CP_SHARED_SECRET: ${{ secrets.CANARY_CP_SHARED_SECRET }} + run: bash scripts/canary-smoke.sh + + - name: Summary on failure + if: ${{ failure() }} + run: | + { + echo "## Canary smoke FAILED" + echo + echo "The staging canary tenant fleet failed its post-deploy smoke suite." + echo "The :latest tag on ghcr.io/molecule-ai/platform should be rolled back" + echo "to the prior known-good digest until this is resolved." + echo + echo "See job log above for the specific failed assertions." + } >> "$GITHUB_STEP_SUMMARY" diff --git a/scripts/canary-smoke.sh b/scripts/canary-smoke.sh new file mode 100755 index 00000000..0d549de2 --- /dev/null +++ b/scripts/canary-smoke.sh @@ -0,0 +1,120 @@ +#!/bin/bash +# canary-smoke.sh — runs the post-deploy smoke suite against the +# staging canary tenant fleet. Called by the canary-verify.yml GitHub +# Actions workflow after a new workspace-server image gets pushed to +# GHCR; exits non-zero on any failure so the workflow can skip the +# :staging-sha → :latest retag that would otherwise release broken +# code to the prod tenant fleet. +# +# Environment: +# CANARY_TENANT_URLS space-sep list of canary tenant base URLs +# (e.g. "https://canary-pm.staging.moleculesai.app +# https://canary-mcp.staging.moleculesai.app") +# CANARY_ADMIN_TOKENS space-sep list of ADMIN_TOKENs, positionally +# matched to CANARY_TENANT_URLS. Canary tenants +# are provisioned with known ADMIN_TOKENs so CI +# can hit their admin-gated endpoints. +# CANARY_CP_BASE_URL CP base URL the canaries call back to +# (https://staging-api.moleculesai.app) +# CANARY_CP_SHARED_SECRET matches CP's PROVISION_SHARED_SECRET so this +# script can also exercise /cp/workspaces/* via +# the canary's own CPProvisioner identity. 
+# +# Exit codes: 0 = all green, 1 = assertion failure, 2 = setup/env problem. + +set -euo pipefail + +# ── Setup ──────────────────────────────────────────────────────────────── + +: "${CANARY_TENANT_URLS:?space-sep list of canary base URLs required}" +: "${CANARY_ADMIN_TOKENS:?space-sep list of ADMIN_TOKENs required, same order as URLs}" +: "${CANARY_CP_BASE_URL:?CP base URL required}" + +read -r -a URLS <<< "$CANARY_TENANT_URLS" +read -r -a TOKENS <<< "$CANARY_ADMIN_TOKENS" + +if [ "${#URLS[@]}" -ne "${#TOKENS[@]}" ]; then + echo "ERROR: URLS(${#URLS[@]}) and TOKENS(${#TOKENS[@]}) length mismatch" >&2 + exit 2 +fi +if [ "${#URLS[@]}" -eq 0 ]; then + echo "ERROR: no canary URLs configured" >&2 + exit 2 +fi + +PASS=0 +FAIL=0 + +# ── Helpers ────────────────────────────────────────────────────────────── + +check() { + local desc="$1" expected="$2" actual="$3" + if echo "$actual" | grep -qF "$expected"; then + printf " PASS %s\n" "$desc" + PASS=$((PASS + 1)) + else + printf " FAIL %s\n expected to contain: %s\n got: %s\n" "$desc" "$expected" "$actual" >&2 + FAIL=$((FAIL + 1)) + fi +} + +# acurl does an admin-authenticated GET/POST/etc. against a canary tenant. +# Takes +BASE_URL +ADMIN_TOKEN as its first two positional args; the rest +# are passed through to curl. Keeps the two values paired so the wrong +# tenant never gets the wrong token. +acurl() { + local base="$1" token="$2"; shift 2 + curl -sS --max-time 20 -H "Authorization: Bearer $token" "$@" -- "$base${CANARY_ACURL_PATH:-}" +} + +# ── Checks (run per canary tenant) ─────────────────────────────────────── + +for i in "${!URLS[@]}"; do + base="${URLS[$i]}" + token="${TOKENS[$i]}" + printf "\n── %s ──\n" "$base" + + # 1. Liveness — the tenant is up and responding to admin auth. + CANARY_ACURL_PATH="/admin/liveness" resp=$(acurl "$base" "$token" || true) + check "liveness returns a subsystems map" '"subsystems"' "$resp" + + # 2. 
CP env refresh — the workspace-server fetched MOLECULE_CP_SHARED_SECRET + # from CP on startup. We can't read env directly, but we can assert the + # liveness + workspace list both work, which together imply the binary + # booted without crashing on the refresh call. A startup failure in + # refreshEnvFromCP logs but still boots (best-effort semantics), so + # this is a sanity check, not a proof. + CANARY_ACURL_PATH="/workspaces" resp=$(acurl "$base" "$token" || true) + check "workspace list is JSON array" "[" "$resp" + + # 3. Memory commit round-trip — scope=LOCAL so test data stays on this + # tenant. Verifies encryption + scrubber + retrieval end-to-end. + probe_id="canary-smoke-$(date +%s)-$i" + body=$(printf '{"scope":"LOCAL","namespace":"canary-smoke","content":"probe-%s"}' "$probe_id") + CANARY_ACURL_PATH="/memories/commit" resp=$(curl -sS --max-time 20 \ + -X POST -H "Content-Type: application/json" -H "Authorization: Bearer $token" \ + --data "$body" "$base/memories/commit" || true) + check "memory commit accepted" '"id"' "$resp" + + CANARY_ACURL_PATH="/memories/search?query=probe-${probe_id}" \ + resp=$(curl -sS --max-time 20 -H "Authorization: Bearer $token" \ + "$base/memories/search?query=probe-${probe_id}" || true) + check "memory search finds the probe" "probe-${probe_id}" "$resp" + + # 4. Events admin read — AdminAuth path (C4 fail-closed proof on SaaS). + CANARY_ACURL_PATH="/events" resp=$(acurl "$base" "$token" || true) + check "events endpoint returns JSON" "[" "$resp" + + # 5. Negative: unauth'd admin call must 401 (C4 regression gate). 
+ unauth_code=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "$base/admin/liveness" || echo "000") + check "unauth'd /admin/liveness returns 401" "401" "$unauth_code" +done + +# ── Summary ────────────────────────────────────────────────────────────── + +printf "\n=== CANARY SMOKE RESULTS ===\n" +printf " PASS: %d\n FAIL: %d\n" "$PASS" "$FAIL" + +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi From 8f705dc10962ac10d1f4ca56fe9d9b89760be9f2 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 19 Apr 2026 03:33:04 -0700 Subject: [PATCH 5/9] feat(canary): gate :latest tag promotion on canary verify green (Phase 3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes the canary release train. Before this, publish-workspace- server-image.yml pushed both :staging- and :latest on every main merge — meaning the prod tenant fleet auto-pulled every image immediately, before any post-deploy smoke test. A broken image (think: this morning's E2E current_task drift, but shipped at 3am instead of caught in CI) would have fanned out to every running tenant within 5 min. Now: - publish workflow pushes :staging- ONLY - canary tenants are configured to track :staging-; they pick up the new image on their next auto-update cycle - canary-verify.yml runs the smoke suite (Phase 2) after the sleep - on green: a new promote-to-latest job uses crane to remotely retag :staging- → :latest for both platform and tenant images - prod tenants auto-update to the newly-retagged :latest within their usual 5-min window - on red: :latest stays frozen on prior good digest; prod is untouched crane is pulled onto the runner (~4 MB, GitHub release) rather than docker-daemon retag so the workflow doesn't need a privileged runner. Rollback: if canary passed but something surfaces post-promotion, operator runs "crane tag ghcr.io/molecule-ai/platform: latest" manually. A follow-up can wrap that in a Phase 4 admin endpoint / script. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/canary-verify.yml | 83 ++++++++++++++++--- .../publish-workspace-server-image.yml | 24 ++++-- 2 files changed, 86 insertions(+), 21 deletions(-) diff --git a/.github/workflows/canary-verify.yml b/.github/workflows/canary-verify.yml index 55f61a9b..16d06a70 100644 --- a/.github/workflows/canary-verify.yml +++ b/.github/workflows/canary-verify.yml @@ -1,14 +1,19 @@ name: canary-verify # Runs the canary smoke suite against the staging canary tenant fleet -# after a new workspace-server image lands on :latest. On failure, -# alerts via a GitHub Actions summary — follow-up PR will add: -# - :staging- intermediate tag published BY publish workflow -# - retag :staging- → :latest ONLY when this workflow is green -# - Telegram/Slack notifier on red -# For now this exists as the test gate itself so we can catch -# auto-update breakage before it reaches the prod tenant fleet -# (which auto-pulls :latest every 5 min). +# after a new :staging- image lands in GHCR. On green, promotes +# :staging- → :latest so the prod tenant fleet's 5-minute +# auto-updater picks up the verified digest. On red, :latest stays +# on the prior known-good digest and prod is untouched. +# +# Dependencies: +# - publish-workspace-server-image.yml publishes :staging- +# (NOT :latest) on main merge +# - canary tenants are configured to pull :staging- as their +# tenant image (set TENANT_IMAGE=ghcr.io/…:staging- on the +# canary provisioner code path OR rotate via an admin endpoint) +# - Repo secrets CANARY_TENANT_URLS / CANARY_ADMIN_TOKENS / +# CANARY_CP_SHARED_SECRET are populated on: workflow_run: @@ -18,18 +23,29 @@ on: permissions: contents: read + packages: write actions: read +env: + IMAGE_NAME: ghcr.io/molecule-ai/platform + TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant + jobs: canary-smoke: # Skip when the upstream workflow failed — no image to test against. 
if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }} runs-on: ubuntu-latest + outputs: + sha: ${{ steps.compute.outputs.sha }} steps: - name: Checkout uses: actions/checkout@v4 - - name: Wait for canary tenants to pick up new image + - name: Compute sha + id: compute + run: echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT" + + - name: Wait for canary tenants to pick up :staging- # Tenant auto-updater runs every 5 min. Sleep 6 min to give every # canary time to pull + restart. Cheaper than polling. run: sleep 360 @@ -48,9 +64,50 @@ jobs: { echo "## Canary smoke FAILED" echo - echo "The staging canary tenant fleet failed its post-deploy smoke suite." - echo "The :latest tag on ghcr.io/molecule-ai/platform should be rolled back" - echo "to the prior known-good digest until this is resolved." + echo "Canary tenants rejected image \`staging-${{ steps.compute.outputs.sha }}\`." + echo ":latest stays pinned to the prior good digest — prod is untouched." echo - echo "See job log above for the specific failed assertions." + echo "Fix forward and merge again, or investigate the specific failed" + echo "assertions in the canary-smoke step log above." + } >> "$GITHUB_STEP_SUMMARY" + + promote-to-latest: + # On green, retag :staging- → :latest for BOTH images. + # crane is a lightweight registry client (no Docker daemon needed on + # the runner) that can retag remotely with a single API call each. 
+ needs: canary-smoke + if: ${{ needs.canary-smoke.result == 'success' }} + runs-on: ubuntu-latest + steps: + - name: Install crane + run: | + curl -fsSL https://github.com/google/go-containerregistry/releases/download/v0.20.2/go-containerregistry_Linux_x86_64.tar.gz | \ + tar xz -C /usr/local/bin crane + + - name: GHCR login + run: | + echo "${{ secrets.GITHUB_TOKEN }}" | \ + crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin + + - name: Retag platform :staging- → :latest + run: | + crane tag \ + "${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \ + latest + + - name: Retag tenant :staging- → :latest + run: | + crane tag \ + "${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \ + latest + + - name: Summary + run: | + { + echo "## Canary verified — :latest promoted" + echo + echo "- \`${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${IMAGE_NAME}:latest\`" + echo "- \`${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${TENANT_IMAGE_NAME}:latest\`" + echo + echo "Prod tenant fleet will pick up the new digest on its next 5-min auto-update cycle." } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/publish-workspace-server-image.yml b/.github/workflows/publish-workspace-server-image.yml index 28ef0b79..b76681c4 100644 --- a/.github/workflows/publish-workspace-server-image.yml +++ b/.github/workflows/publish-workspace-server-image.yml @@ -55,7 +55,17 @@ jobs: run: | echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT" - - name: Build & push platform image to GHCR + # Canary-gated release: we publish :staging- ONLY here. The + # :latest tag (which existing prod tenants auto-pull every 5 min) + # is promoted by .github/workflows/canary-verify.yml after the + # staging canary fleet green-lights this digest. 
+ # That means: + # - Every main merge produces a :staging- image + # - Canary tenants (configured to pull :staging-) pick it up + # - canary-verify.yml runs smoke tests against them + # - On green → canary-verify retags :staging- → :latest + # - On red → :latest stays on the prior good digest, prod is safe + - name: Build & push platform image to GHCR (staging- only) uses: docker/build-push-action@v6 with: context: . @@ -63,16 +73,15 @@ jobs: platforms: linux/amd64 push: true tags: | - ${{ env.IMAGE_NAME }}:latest - ${{ env.IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }} + ${{ env.IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }} cache-from: type=gha cache-to: type=gha,mode=max labels: | org.opencontainers.image.source=https://github.com/${{ github.repository }} org.opencontainers.image.revision=${{ github.sha }} - org.opencontainers.image.description=Molecule AI platform (Go API server) + org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify - - name: Build & push tenant image to GHCR + - name: Build & push tenant image to GHCR (staging- only) uses: docker/build-push-action@v6 with: context: . 
@@ -80,11 +89,10 @@ jobs: platforms: linux/amd64 push: true tags: | - ${{ env.TENANT_IMAGE_NAME }}:latest - ${{ env.TENANT_IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }} + ${{ env.TENANT_IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }} cache-from: type=gha cache-to: type=gha,mode=max labels: | org.opencontainers.image.source=https://github.com/${{ github.repository }} org.opencontainers.image.revision=${{ github.sha }} - org.opencontainers.image.description=Molecule AI tenant platform + canvas (one EC2 instance per org) + org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify From eecce56c13ce153b0912ff97c758bcd7a56fdfaa Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 19 Apr 2026 03:37:42 -0700 Subject: [PATCH 6/9] feat(canary): rollback-latest script + release-pipeline doc (Phase 4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the canary loop with the escape hatch and a single place to read about the whole flow. scripts/rollback-latest.sh uses crane to retag :latest ← :staging- for BOTH the platform and tenant images. Pre-checks the target tag exists and verifies the :latest digest after the move so a bad ops typo doesn't silently promote the wrong thing. Prod tenants auto-update to the rolled-back digest within their 5-min cycle. Exit codes: 0 = both retagged, 1 = registry/tag error, 2 = usage error. docs/architecture/canary-release.md The one-page map of the pipeline: how PR → main → staging- → canary smoke → :latest promotion works end-to-end, how to add a canary tenant, how to roll back, and what this gate explicitly does NOT catch (prod-only data, config drift, cross-tenant bugs). No code changes in the CP or workspace-server — this PR is shell + docs only, so it's safe to land independently of the other Phase {1,1.5,2,3} PRs still in review. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/architecture/canary-release.md | 79 ++++++++++++++++++++++++++++ scripts/rollback-latest.sh | 80 +++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 docs/architecture/canary-release.md create mode 100755 scripts/rollback-latest.sh diff --git a/docs/architecture/canary-release.md b/docs/architecture/canary-release.md new file mode 100644 index 00000000..eb795eda --- /dev/null +++ b/docs/architecture/canary-release.md @@ -0,0 +1,79 @@ +# Canary release pipeline + +How a workspace-server code change reaches the prod tenant fleet — and how to stop it if something's wrong. + +## The loop + +``` +PR merged to staging → main + │ + ▼ +publish-workspace-server-image.yml ← pushes :staging- ONLY + │ (NOT :latest — prod is untouched) + ▼ +Canary tenants auto-update to :staging- + │ (5-min auto-updater cycle on each canary EC2) + ▼ +canary-verify.yml waits 6 min, runs scripts/canary-smoke.sh + │ + ├─► GREEN → crane tag :staging- → :latest + │ │ + │ ▼ + │ Prod tenants auto-update within 5 min + │ + └─► RED → :latest stays on prior good digest + GitHub Step Summary flags the rejected sha + Ops fixes forward OR rolls back manually +``` + +## Canary fleet + +Lives in a separate AWS account (`molecule-canary`, `004947743811`) via an assumed role (`MoleculeStagingProvisioner`). The CP's `is_canary` org flag routes provisioning there; every other org goes to the default staging account. See `docs/architecture/saas-prod-migration-2026-04-19.md` for the account bootstrap. + +Canary tenants are configured to pull `:staging-` (not `:latest`) via `TENANT_IMAGE` on their provisioner, so they ingest each new build before prod does. 
+ +## Smoke suite + +`scripts/canary-smoke.sh` hits each canary tenant (URL + ADMIN_TOKEN pair) and asserts: + +- `/admin/liveness` returns a subsystems map (tenant booted, AdminAuth reachable) +- `/workspaces` returns a JSON array (wsAuth + DB healthy) +- `/memories/commit` + `/memories/search` round-trip (encryption + scrubber) +- `/events` admin read (C4 fail-closed proof) +- `/admin/liveness` without bearer → 401 (C4 regression gate) + +Expand by editing the script — each `check "name" "expected" "$response"` call is one line. + +## Adding a canary tenant + +1. `POST /cp/orgs` — create the org normally (is_canary defaults to false) +2. `POST /cp/admin/orgs//canary` with `{"is_canary": true}` — admin only, refuses to flip if already provisioned +3. Re-trigger provision (or delete + recreate if the org was already provisioned into staging) — the fresh EC2 lands in account `004947743811` + +Then set repo secrets: +- `CANARY_TENANT_URLS` — append the new tenant's URL +- `CANARY_ADMIN_TOKENS` — append its ADMIN_TOKEN in the same position + +## Rolling back `:latest` + +When canary was green but something surfaces post-promotion, retag `:latest` to a prior digest: + +```bash +export GITHUB_TOKEN=ghp_... # write:packages +scripts/rollback-latest.sh 4c1d56e # retags both platform + tenant images +``` + +`scripts/rollback-latest.sh` pre-checks that `:staging-` exists before moving `:latest`, and verifies the digest after the move. Prod tenants pick up the rolled-back image on their next 5-min auto-update. + +A post-mortem should always include: +- the commit sha that broke +- why canary didn't catch it (new code path the smoke suite doesn't exercise?) +- whether the smoke suite should grow a new check to prevent the same class of bug + +## What this gate doesn't catch + +- Bugs that only surface under prod-only data (customer workloads with scale or shape canary doesn't produce). Canary uses real traffic shapes but can't simulate weeks of accumulated state. 
+- Config drift between canary and prod (different env-var values, different feature flags). Keep canary's config deltas minimal and documented. +- Cross-tenant interactions — canary tenants run in their own AWS account, so a bug that only appears when two tenants compete for a shared resource won't reproduce here. + +When these miss, `rollback-latest.sh` is the escape hatch. diff --git a/scripts/rollback-latest.sh b/scripts/rollback-latest.sh new file mode 100755 index 00000000..ade2051b --- /dev/null +++ b/scripts/rollback-latest.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# rollback-latest.sh — moves the :latest tag on ghcr.io/molecule-ai/platform +# (and the matching tenant image) back to a prior :staging- digest +# without rebuilding anything. Prod tenants auto-pull :latest every 5 +# min, so this is the fast path when a canary-verified image turns out +# to have a runtime regression that canary didn't catch. +# +# Usage: +# scripts/rollback-latest.sh +# scripts/rollback-latest.sh 4c1d56e +# +# Prereqs: +# - crane on $PATH (brew install crane OR download from +# https://github.com/google/go-containerregistry/releases) +# - GHCR token exported as GITHUB_TOKEN with write:packages scope +# +# What it does (per image — platform + tenant): +# crane digest ghcr.io/…: # verify the target sha exists +# crane tag ghcr.io/…: latest # retag remotely, single API call +# crane digest ghcr.io/…:latest # confirm the move +# +# Exit codes: 0 = both retagged, 1 = tag missing / crane error, 2 = bad args. + +set -euo pipefail + +if [ "${1:-}" = "" ]; then + echo "usage: $0 " >&2 + echo " e.g. $0 4c1d56e — retags :latest to :staging-4c1d56e" >&2 + exit 2 +fi + +TARGET_SHA="$1" +PLATFORM=ghcr.io/molecule-ai/platform +TENANT=ghcr.io/molecule-ai/platform-tenant + +if ! command -v crane >/dev/null; then + echo "ERROR: crane not installed. brew install crane" >&2 + exit 1 +fi +if [ -z "${GITHUB_TOKEN:-}" ]; then + echo "ERROR: GITHUB_TOKEN unset. export it with write:packages scope." 
>&2 + exit 1 +fi + +# Log in once. crane stores creds in a config file keyed by registry; +# re-running is cheap. +printf '%s\n' "$GITHUB_TOKEN" | crane auth login ghcr.io -u "${GITHUB_ACTOR:-$(whoami)}" --password-stdin >/dev/null + +roll() { + local image="$1" + local src="$image:staging-$TARGET_SHA" + local dst="$image:latest" + + echo "→ $image" + # Abort rollout if the target tag doesn't exist in the registry. + # Otherwise crane tag would error anyway, but a pre-check gives a + # clearer message for ops. + if ! crane digest "$src" >/dev/null 2>&1; then + echo " FAIL: $src not found in registry. Did you type the wrong sha?" >&2 + return 1 + fi + src_digest=$(crane digest "$src") + + crane tag "$src" latest + new_digest=$(crane digest "$dst") + + if [ "$new_digest" != "$src_digest" ]; then + echo " FAIL: $dst digest $new_digest does not match expected $src_digest" >&2 + return 1 + fi + echo " OK $dst → $new_digest" +} + +roll "$PLATFORM" +roll "$TENANT" + +echo +echo "=== ROLLBACK COMPLETE ===" +echo "Both images now point :latest at staging-$TARGET_SHA." +echo "Prod tenants will pick up the rollback within their 5-min auto-update cycle." From 5a28454ca4c17df901fcd308525170fb2234460b Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 19 Apr 2026 03:41:16 -0700 Subject: [PATCH 7/9] =?UTF-8?q?test(ws-server):=20cover=20CPProvisioner=20?= =?UTF-8?q?=E2=80=94=20auth,=20env=20fallback,=20error=20paths?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Post-merge audit flagged cp_provisioner.go as the only new file from the canary/C1 work without test coverage. Fills the gap: - NewCPProvisioner_RequiresOrgID — self-hosted without MOLECULE_ORG_ID refuses to construct (avoids silent phone-home to prod CP). - NewCPProvisioner_FallsBackToProvisionSharedSecret — the operator ergonomics of using one env-var name on both sides of the wire. - AuthHeader noop + happy path — bearer only set when secret is set. 
- Start_HappyPath — end-to-end POST to stubbed CP, bearer forwarded, instance_id parsed out of response. - Start_Non201ReturnsStructuredError — when CP returns structured {"error":"…"}, that message surfaces to the caller. - Start_NoStructuredErrorFallsBackToSize — regression gate for the anti-log-leak change from PR #980: raw upstream body must NOT appear in the error, only the byte count. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../provisioner/cp_provisioner_test.go | 150 ++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 workspace-server/internal/provisioner/cp_provisioner_test.go diff --git a/workspace-server/internal/provisioner/cp_provisioner_test.go b/workspace-server/internal/provisioner/cp_provisioner_test.go new file mode 100644 index 00000000..ce49a352 --- /dev/null +++ b/workspace-server/internal/provisioner/cp_provisioner_test.go @@ -0,0 +1,150 @@ +package provisioner + +import ( + "context" + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +// TestNewCPProvisioner_RequiresOrgID — self-hosted deployments don't +// have a MOLECULE_ORG_ID, and the provisioner must refuse to construct +// rather than silently phone home to the prod CP with an empty tenant. +func TestNewCPProvisioner_RequiresOrgID(t *testing.T) { + t.Setenv("MOLECULE_ORG_ID", "") + if _, err := NewCPProvisioner(); err == nil { + t.Error("want error when MOLECULE_ORG_ID is unset, got nil") + } +} + +// TestNewCPProvisioner_FallsBackToProvisionSharedSecret — operators +// may set PROVISION_SHARED_SECRET on both sides of the wire with a +// single value; the tenant accepts that name as a fallback for +// MOLECULE_CP_SHARED_SECRET. The fallback is documented in +// NewCPProvisioner; this test is the regression gate. 
+func TestNewCPProvisioner_FallsBackToProvisionSharedSecret(t *testing.T) { + t.Setenv("MOLECULE_ORG_ID", "org-abc") + t.Setenv("MOLECULE_CP_SHARED_SECRET", "") + t.Setenv("PROVISION_SHARED_SECRET", "from-fallback") + + p, err := NewCPProvisioner() + if err != nil { + t.Fatalf("NewCPProvisioner: %v", err) + } + if p.sharedSecret != "from-fallback" { + t.Errorf("sharedSecret = %q, want %q", p.sharedSecret, "from-fallback") + } +} + +// TestAuthHeader_NoopWhenSecretEmpty — the self-hosted path that +// doesn't gate /cp/workspaces/* must not add a stray Authorization +// header (bearer-like content would be surprising to non-bearer +// intermediaries). +func TestAuthHeader_NoopWhenSecretEmpty(t *testing.T) { + p := &CPProvisioner{sharedSecret: ""} + req := httptest.NewRequest("GET", "http://x/", nil) + p.authHeader(req) + if got := req.Header.Get("Authorization"); got != "" { + t.Errorf("Authorization set to %q with empty secret; want unset", got) + } +} + +// TestAuthHeader_SetsBearerWhenSecretSet — happy path. +func TestAuthHeader_SetsBearerWhenSecretSet(t *testing.T) { + p := &CPProvisioner{sharedSecret: "the-secret"} + req := httptest.NewRequest("GET", "http://x/", nil) + p.authHeader(req) + if got := req.Header.Get("Authorization"); got != "Bearer the-secret" { + t.Errorf("Authorization = %q, want %q", got, "Bearer the-secret") + } +} + +// TestStart_HappyPath — Start posts to the stubbed CP, passes the +// bearer, and parses the returned instance_id. 
+func TestStart_HappyPath(t *testing.T) { + var sawBearer string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + sawBearer = r.Header.Get("Authorization") + if r.URL.Path != "/cp/workspaces/provision" { + t.Errorf("unexpected path %s", r.URL.Path) + } + // Verify the request body round-trips our fields + var body cpProvisionRequest + _ = json.NewDecoder(r.Body).Decode(&body) + if body.WorkspaceID != "ws-1" || body.Runtime != "python" { + t.Errorf("body mismatch: %+v", body) + } + w.WriteHeader(http.StatusCreated) + _, _ = io.WriteString(w, `{"instance_id":"i-abc123","state":"pending"}`) + })) + defer srv.Close() + + p := &CPProvisioner{ + baseURL: srv.URL, + orgID: "org-1", + sharedSecret: "s3cret", + httpClient: srv.Client(), + } + + id, err := p.Start(context.Background(), WorkspaceConfig{ + WorkspaceID: "ws-1", Runtime: "python", Tier: 1, PlatformURL: "http://tenant", + }) + if err != nil { + t.Fatalf("Start: %v", err) + } + if id != "i-abc123" { + t.Errorf("instance id = %q, want i-abc123", id) + } + if sawBearer != "Bearer s3cret" { + t.Errorf("server saw Authorization = %q, want Bearer s3cret", sawBearer) + } +} + +// TestStart_Non201ReturnsStructuredError — when CP returns 401 with a +// structured {"error":"..."} body, Start surfaces that error message. +// Verifies the defense against log-leaking raw upstream bodies. 
+func TestStart_Non201ReturnsStructuredError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusUnauthorized) + _, _ = io.WriteString(w, `{"error":"invalid credentials"}`) + })) + defer srv.Close() + + p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()} + + _, err := p.Start(context.Background(), WorkspaceConfig{WorkspaceID: "ws-1", Runtime: "py"}) + if err == nil { + t.Fatal("expected error on 401, got nil") + } + if !strings.Contains(err.Error(), "invalid credentials") { + t.Errorf("error message %q should include upstream error field", err.Error()) + } +} + +// TestStart_NoStructuredErrorFallsBackToSize — the anti-leak path: +// when upstream returns non-JSON, we refuse to log the body and +// report only the byte count, preventing Authorization header echoes +// from landing in our logs. +func TestStart_NoStructuredErrorFallsBackToSize(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + _, _ = io.WriteString(w, "raw proxy error page, could contain echoed headers") + })) + defer srv.Close() + + p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()} + + _, err := p.Start(context.Background(), WorkspaceConfig{WorkspaceID: "ws-1", Runtime: "py"}) + if err == nil { + t.Fatal("expected error on 500, got nil") + } + if strings.Contains(err.Error(), "raw proxy error") { + t.Errorf("error leaked raw body: %q", err.Error()) + } + if !strings.Contains(err.Error(), " Date: Sun, 19 Apr 2026 03:44:48 -0700 Subject: [PATCH 8/9] perf(scheduler): collapse empty-run bump to single RETURNING query MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The phantom-producer detector (#795) was doing UPDATE + SELECT in two roundtrips — first incrementing consecutive_empty_runs, then re- reading to check the 
stale threshold. Switch to UPDATE ... RETURNING so the post-increment value comes back in one query. Called once per schedule per cron tick. At 100 tenants × dozens of schedules per tenant, the halved DB traffic on the empty-response path is measurable, not just cosmetic. Also now properly logs if the bump itself fails (previously it silent- swallowed the ExecContext error and still ran the SELECT, which would confuse debugging). Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace-server/internal/scheduler/scheduler.go | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/workspace-server/internal/scheduler/scheduler.go b/workspace-server/internal/scheduler/scheduler.go index 9c83e83a..0831796e 100644 --- a/workspace-server/internal/scheduler/scheduler.go +++ b/workspace-server/internal/scheduler/scheduler.go @@ -310,14 +310,20 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) { // consecutive empties and escalate to 'stale' after 3 in a row. isEmpty := isEmptyResponse(respBody) if lastStatus == "ok" && isEmpty { - db.DB.ExecContext(ctx, ` + // One query instead of UPDATE-then-SELECT: RETURNING hands back + // the post-increment value so the stale-threshold check doesn't + // cost a second roundtrip. This handler fires once per cron tick + // per schedule; at 100 tenants × dozens of schedules the saved + // query matters. 
+ var consecEmpty int + if err := db.DB.QueryRowContext(ctx, ` UPDATE workspace_schedules SET consecutive_empty_runs = consecutive_empty_runs + 1, updated_at = now() - WHERE id = $1`, sched.ID) - // Check if we've crossed the stale threshold - var consecEmpty int - db.DB.QueryRowContext(ctx, `SELECT consecutive_empty_runs FROM workspace_schedules WHERE id = $1`, sched.ID).Scan(&consecEmpty) + WHERE id = $1 + RETURNING consecutive_empty_runs`, sched.ID).Scan(&consecEmpty); err != nil { + log.Printf("Scheduler: '%s' empty-run bump failed: %v", sched.Name, err) + } if consecEmpty >= 3 { lastStatus = "stale" lastError = fmt.Sprintf("empty response %d consecutive times — agent may be phantom-producing (#795)", consecEmpty) From 6c23aada1eba0f5ae65ab2c3cd9e2273b57fd0a0 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 19 Apr 2026 04:13:54 -0700 Subject: [PATCH 9/9] feat(canvas): /orgs landing page for post-signup users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CP's Callback handler redirects every new WorkOS session to APP_URL/orgs, but canvas had no such route — new users hit the canvas Home component, which tries to call /workspaces on a tenant that doesn't exist yet, and saw a confusing error. This PR plugs that gap with a dedicated landing page that: - Bounces anonymous visitors back to /cp/auth/login - Zero-org users see a slug-picker (POST /cp/orgs, refresh) - For each existing org, shows status + CTA: * awaiting_payment → amber "Complete payment" → /pricing?org=… * running → emerald "Open" → https://.moleculesai.app * failed → "Contact support" → mailto * provisioning → read-only "provisioning…" - Surfaces errors inline with a Retry button Deliberately server-light: one GET /cp/orgs, no WebSocket, no canvas store hydration. Goal is to move the user from signup to either Stripe Checkout or their tenant URL with one click each. 
Closes the last UX gap between the BILLING_REQUIRED gate landing on the CP and real users being able to complete a signup today. Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/src/app/orgs/page.tsx | 278 +++++++++++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 canvas/src/app/orgs/page.tsx diff --git a/canvas/src/app/orgs/page.tsx b/canvas/src/app/orgs/page.tsx new file mode 100644 index 00000000..955d3f46 --- /dev/null +++ b/canvas/src/app/orgs/page.tsx @@ -0,0 +1,278 @@ +"use client"; + +// /orgs — the post-signup landing page. +// +// The control plane's Callback handler (authorized via WorkOS) redirects +// every new session to APP_URL/orgs after login/signup succeeds. Before +// this route existed that redirect 404'd and new users were stranded. +// Now: +// - Signed-out browsers are bounced back to /cp/auth/login +// - Zero-org users see a slug-picker → POST /cp/orgs → refresh +// - `awaiting_payment` orgs get a "Complete payment" CTA → /pricing +// - `running` orgs show a link to the tenant URL +// - `provisioning` / `failed` surface the state so the user knows +// why their tenant isn't available yet +// +// Everything here is intentionally server-light: one GET /cp/orgs, +// zero WebSocket, no canvas store hydration — the whole point is a +// quick bounce between signup and either Checkout or the tenant UI. 
+ +import { useEffect, useState } from "react"; +import { fetchSession, redirectToLogin, type Session } from "@/lib/auth"; +import { PLATFORM_URL } from "@/lib/api"; + +type OrgStatus = "awaiting_payment" | "provisioning" | "running" | "failed" | string; + +interface Org { + id: string; + slug: string; + name: string; + plan: string; + status: OrgStatus; + created_at: string; + updated_at: string; +} + +export default function OrgsPage() { + const [session, setSession] = useState("loading"); + const [orgs, setOrgs] = useState(null); + const [error, setError] = useState(null); + + useEffect(() => { + let cancelled = false; + (async () => { + try { + const sess = await fetchSession(); + if (cancelled) return; + if (!sess) { + redirectToLogin(); + return; + } + setSession(sess); + const res = await fetch(`${PLATFORM_URL}/cp/orgs`, { + credentials: "include", + signal: AbortSignal.timeout(15_000), + }); + if (!res.ok) { + throw new Error(`GET /cp/orgs: ${res.status}`); + } + const body = (await res.json()) as { orgs?: Org[] } | Org[]; + const list = Array.isArray(body) ? body : body.orgs ?? []; + if (!cancelled) setOrgs(list); + } catch (err) { + if (!cancelled) { + setError(err instanceof Error ? err.message : String(err)); + } + } + })(); + return () => { + cancelled = true; + }; + }, []); + + if (session === "loading" || (orgs === null && error === null)) { + return

Loading…

; + } + if (error) { + return ( + +

Error: {error}

+ <button onClick={() => window.location.reload()}>Retry</button> +
+ ); + } + if (!orgs || orgs.length === 0) { + return ; + } + return ( + +
    + {orgs.map((o) => ( + <OrgRow key={o.id} org={o} /> + ))} +
+
+ <CreateOrgForm onCreated={(slug) => { + // Refresh the list so the new org appears + its CTA fires. + window.location.reload(); + void slug; + }} + /> +
+
+ ); +} + +function Shell({ children }: { children: React.ReactNode }) { + return ( +
+
+

Your organizations

+

+ Each org is an isolated Molecule workspace. +

+
{children}
+
+
+ ); +} + +function OrgRow({ org }: { org: Org }) { + return ( +
  • +
    +
    +
    {org.name}
    +
    + {org.slug} · <StatusLabel status={org.status} /> · {org.plan || "free"} +
    +
    + <OrgCTA org={org} /> +
    +
  • + ); +} + +function StatusLabel({ status }: { status: OrgStatus }) { + const cls = + status === "running" + ? "text-emerald-400" + : status === "awaiting_payment" + ? "text-amber-400" + : status === "failed" + ? "text-red-400" + : "text-sky-400"; + const label = + status === "awaiting_payment" + ? "awaiting payment" + : status; + return {label}; +} + +function OrgCTA({ org }: { org: Org }) { + if (org.status === "running") { + const host = typeof window !== "undefined" ? window.location.hostname : "moleculesai.app"; + const appDomain = host.endsWith(".moleculesai.app") + ? host.split(".").slice(-2).join(".") + : "moleculesai.app"; + const href = `https://${org.slug}.${appDomain}`; + return ( + + Open + + ); + } + if (org.status === "awaiting_payment") { + return ( + + Complete payment + + ); + } + if (org.status === "failed") { + return ( + + Contact support + + ); + } + // provisioning / unknown — non-interactive + return {org.status}…; +} + +function EmptyState() { + return ( + +

    + You don't have any organizations yet. Create one to get started — your + workspace spins up automatically once billing is set up. +

    +
    + <CreateOrgForm onCreated={() => { + window.location.reload(); + }} + /> +
    +
    + ); +} + +function CreateOrgForm({ onCreated }: { onCreated: (slug: string) => void }) { + const [slug, setSlug] = useState(""); + const [name, setName] = useState(""); + const [submitting, setSubmitting] = useState(false); + const [err, setErr] = useState(null); + + async function submit(e: React.FormEvent) { + e.preventDefault(); + setSubmitting(true); + setErr(null); + try { + const res = await fetch(`${PLATFORM_URL}/cp/orgs`, { + method: "POST", + credentials: "include", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ slug, name }), + signal: AbortSignal.timeout(15_000), + }); + if (!res.ok) { + const body = await res.text(); + throw new Error(`${res.status}: ${body}`); + } + onCreated(slug); + } catch (e) { + setErr(e instanceof Error ? e.message : String(e)); + setSubmitting(false); + } + } + + return ( +
    + + + {err &&

    {err}

    } + +
    + ); +}