forked from molecule-ai/molecule-core
Merge pull request #993 from Molecule-AI/staging
promote: staging → main — canary infra + /orgs + env refresh + perf
This commit is contained in:
commit
43880f580b
113
.github/workflows/canary-verify.yml
vendored
Normal file
113
.github/workflows/canary-verify.yml
vendored
Normal file
@ -0,0 +1,113 @@
name: canary-verify

# Runs the canary smoke suite against the staging canary tenant fleet
# after a new :staging-<sha> image lands in GHCR. On green, promotes
# :staging-<sha> → :latest so the prod tenant fleet's 5-minute
# auto-updater picks up the verified digest. On red, :latest stays
# on the prior known-good digest and prod is untouched.
#
# Dependencies:
# - publish-workspace-server-image.yml publishes :staging-<sha>
#   (NOT :latest) on main merge
# - canary tenants are configured to pull :staging-<sha> as their
#   tenant image (set TENANT_IMAGE=ghcr.io/…:staging-<sha> on the
#   canary provisioner code path OR rotate via an admin endpoint)
# - Repo secrets CANARY_TENANT_URLS / CANARY_ADMIN_TOKENS /
#   CANARY_CP_SHARED_SECRET are populated

on:
  workflow_run:
    workflows: ["publish-workspace-server-image"]
    types: [completed]
  workflow_dispatch:

permissions:
  contents: read
  packages: write
  actions: read

env:
  IMAGE_NAME: ghcr.io/molecule-ai/platform
  TENANT_IMAGE_NAME: ghcr.io/molecule-ai/platform-tenant

jobs:
  canary-smoke:
    # Skip when the upstream workflow failed — no image to test against.
    if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
    runs-on: ubuntu-latest
    outputs:
      sha: ${{ steps.compute.outputs.sha }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Compute sha
        id: compute
        # FIX: on workflow_run triggers, GITHUB_SHA is the default branch's
        # head at event-delivery time — NOT necessarily the commit the
        # publish workflow actually built. If another merge lands while the
        # publish run is in flight, the old `${GITHUB_SHA::7}` expression
        # would smoke-test the canaries (which run the published sha) but
        # then try to promote a :staging-<sha> tag that was never pushed —
        # or, worse, a newer not-yet-verified one. Use the triggering run's
        # head_sha; fall back to GITHUB_SHA for manual workflow_dispatch.
        run: |
          if [ "${{ github.event_name }}" = "workflow_run" ]; then
            sha="${{ github.event.workflow_run.head_sha }}"
          else
            sha="$GITHUB_SHA"
          fi
          echo "sha=${sha:0:7}" >> "$GITHUB_OUTPUT"

      - name: Wait for canary tenants to pick up :staging-<sha>
        # Tenant auto-updater runs every 5 min. Sleep 6 min to give every
        # canary time to pull + restart. Cheaper than polling.
        run: sleep 360

      - name: Run canary smoke suite
        env:
          CANARY_TENANT_URLS: ${{ secrets.CANARY_TENANT_URLS }}
          CANARY_ADMIN_TOKENS: ${{ secrets.CANARY_ADMIN_TOKENS }}
          CANARY_CP_BASE_URL: https://staging-api.moleculesai.app
          CANARY_CP_SHARED_SECRET: ${{ secrets.CANARY_CP_SHARED_SECRET }}
        run: bash scripts/canary-smoke.sh

      - name: Summary on failure
        if: ${{ failure() }}
        run: |
          {
            echo "## Canary smoke FAILED"
            echo
            echo "Canary tenants rejected image \`staging-${{ steps.compute.outputs.sha }}\`."
            echo ":latest stays pinned to the prior good digest — prod is untouched."
            echo
            echo "Fix forward and merge again, or investigate the specific failed"
            echo "assertions in the canary-smoke step log above."
          } >> "$GITHUB_STEP_SUMMARY"

  promote-to-latest:
    # On green, retag :staging-<sha> → :latest for BOTH images.
    # crane is a lightweight registry client (no Docker daemon needed on
    # the runner) that can retag remotely with a single API call each.
    needs: canary-smoke
    if: ${{ needs.canary-smoke.result == 'success' }}
    runs-on: ubuntu-latest
    steps:
      - name: Install crane
        run: |
          curl -fsSL https://github.com/google/go-containerregistry/releases/download/v0.20.2/go-containerregistry_Linux_x86_64.tar.gz | \
            tar xz -C /usr/local/bin crane

      - name: GHCR login
        run: |
          echo "${{ secrets.GITHUB_TOKEN }}" | \
            crane auth login ghcr.io -u "${{ github.actor }}" --password-stdin

      - name: Retag platform :staging-<sha> → :latest
        run: |
          crane tag \
            "${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \
            latest

      - name: Retag tenant :staging-<sha> → :latest
        run: |
          crane tag \
            "${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}" \
            latest

      - name: Summary
        run: |
          {
            echo "## Canary verified — :latest promoted"
            echo
            echo "- \`${IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${IMAGE_NAME}:latest\`"
            echo "- \`${TENANT_IMAGE_NAME}:staging-${{ needs.canary-smoke.outputs.sha }}\` → \`${TENANT_IMAGE_NAME}:latest\`"
            echo
            echo "Prod tenant fleet will pick up the new digest on its next 5-min auto-update cycle."
          } >> "$GITHUB_STEP_SUMMARY"
||||
@ -55,7 +55,17 @@ jobs:
|
||||
run: |
|
||||
echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Build & push platform image to GHCR
|
||||
# Canary-gated release: we publish :staging-<sha> ONLY here. The
|
||||
# :latest tag (which existing prod tenants auto-pull every 5 min)
|
||||
# is promoted by .github/workflows/canary-verify.yml after the
|
||||
# staging canary fleet green-lights this digest.
|
||||
# That means:
|
||||
# - Every main merge produces a :staging-<sha> image
|
||||
# - Canary tenants (configured to pull :staging-<sha>) pick it up
|
||||
# - canary-verify.yml runs smoke tests against them
|
||||
# - On green → canary-verify retags :staging-<sha> → :latest
|
||||
# - On red → :latest stays on the prior good digest, prod is safe
|
||||
- name: Build & push platform image to GHCR (staging-<sha> only)
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
@ -63,16 +73,15 @@ jobs:
|
||||
platforms: linux/amd64
|
||||
push: true
|
||||
tags: |
|
||||
${{ env.IMAGE_NAME }}:latest
|
||||
${{ env.IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }}
|
||||
${{ env.IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
labels: |
|
||||
org.opencontainers.image.source=https://github.com/${{ github.repository }}
|
||||
org.opencontainers.image.revision=${{ github.sha }}
|
||||
org.opencontainers.image.description=Molecule AI platform (Go API server)
|
||||
org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify
|
||||
|
||||
- name: Build & push tenant image to GHCR
|
||||
- name: Build & push tenant image to GHCR (staging-<sha> only)
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
@ -80,11 +89,10 @@ jobs:
|
||||
platforms: linux/amd64
|
||||
push: true
|
||||
tags: |
|
||||
${{ env.TENANT_IMAGE_NAME }}:latest
|
||||
${{ env.TENANT_IMAGE_NAME }}:sha-${{ steps.tags.outputs.sha }}
|
||||
${{ env.TENANT_IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
labels: |
|
||||
org.opencontainers.image.source=https://github.com/${{ github.repository }}
|
||||
org.opencontainers.image.revision=${{ github.sha }}
|
||||
org.opencontainers.image.description=Molecule AI tenant platform + canvas (one EC2 instance per org)
|
||||
org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify
|
||||
|
||||
278
canvas/src/app/orgs/page.tsx
Normal file
278
canvas/src/app/orgs/page.tsx
Normal file
@ -0,0 +1,278 @@
|
||||
"use client";
|
||||
|
||||
// /orgs — the post-signup landing page.
|
||||
//
|
||||
// The control plane's Callback handler (authorized via WorkOS) redirects
|
||||
// every new session to APP_URL/orgs after login/signup succeeds. Before
|
||||
// this route existed that redirect 404'd and new users were stranded.
|
||||
// Now:
|
||||
// - Signed-out browsers are bounced back to /cp/auth/login
|
||||
// - Zero-org users see a slug-picker → POST /cp/orgs → refresh
|
||||
// - `awaiting_payment` orgs get a "Complete payment" CTA → /pricing
|
||||
// - `running` orgs show a link to the tenant URL
|
||||
// - `provisioning` / `failed` surface the state so the user knows
|
||||
// why their tenant isn't available yet
|
||||
//
|
||||
// Everything here is intentionally server-light: one GET /cp/orgs,
|
||||
// zero WebSocket, no canvas store hydration — the whole point is a
|
||||
// quick bounce between signup and either Checkout or the tenant UI.
|
||||
|
||||
import { useEffect, useState } from "react";
|
||||
import { fetchSession, redirectToLogin, type Session } from "@/lib/auth";
|
||||
import { PLATFORM_URL } from "@/lib/api";
|
||||
|
||||
type OrgStatus = "awaiting_payment" | "provisioning" | "running" | "failed" | string;
|
||||
|
||||
interface Org {
|
||||
id: string;
|
||||
slug: string;
|
||||
name: string;
|
||||
plan: string;
|
||||
status: OrgStatus;
|
||||
created_at: string;
|
||||
updated_at: string;
|
||||
}
|
||||
|
||||
export default function OrgsPage() {
  // "loading" sentinel distinguishes "haven't resolved the session yet"
  // from "resolved, signed out" (null) so the loading gate below doesn't
  // flash content before the redirect decision is made.
  const [session, setSession] = useState<Session | null | "loading">("loading");
  // null = not fetched yet; [] = fetched and the user has no orgs (empty state).
  const [orgs, setOrgs] = useState<Org[] | null>(null);
  const [error, setError] = useState<string | null>(null);

  // One-shot on mount: resolve the session, bounce signed-out users to
  // login, then fetch the org list. The `cancelled` flag guards every
  // setState so an unmount mid-flight can't update a dead component.
  useEffect(() => {
    let cancelled = false;
    (async () => {
      try {
        const sess = await fetchSession();
        if (cancelled) return;
        if (!sess) {
          // No session → back to the login flow; nothing to render here.
          redirectToLogin();
          return;
        }
        setSession(sess);
        const res = await fetch(`${PLATFORM_URL}/cp/orgs`, {
          credentials: "include",
          // Hard 15s cap: a hung control plane surfaces as an error +
          // Retry button instead of an infinite spinner.
          signal: AbortSignal.timeout(15_000),
        });
        if (!res.ok) {
          throw new Error(`GET /cp/orgs: ${res.status}`);
        }
        // Accept both response shapes: a bare array or { orgs: [...] }.
        const body = (await res.json()) as { orgs?: Org[] } | Org[];
        const list = Array.isArray(body) ? body : body.orgs ?? [];
        if (!cancelled) setOrgs(list);
      } catch (err) {
        if (!cancelled) {
          setError(err instanceof Error ? err.message : String(err));
        }
      }
    })();
    return () => {
      cancelled = true;
    };
  }, []);

  // Still resolving session or org list (and no error yet) → spinner.
  if (session === "loading" || (orgs === null && error === null)) {
    return <Shell><p className="text-zinc-400">Loading…</p></Shell>;
  }
  if (error) {
    return (
      <Shell>
        <p className="text-red-400">Error: {error}</p>
        <button
          onClick={() => window.location.reload()}
          className="mt-4 rounded bg-zinc-800 px-4 py-2 text-sm text-zinc-200 hover:bg-zinc-700"
        >
          Retry
        </button>
      </Shell>
    );
  }
  if (!orgs || orgs.length === 0) {
    return <EmptyState />;
  }
  return (
    <Shell>
      <ul className="space-y-3">
        {orgs.map((o) => (
          <OrgRow key={o.id} org={o} />
        ))}
      </ul>
      <div className="mt-8 border-t border-zinc-800 pt-6">
        <CreateOrgForm
          onCreated={(slug) => {
            // Refresh the list so the new org appears + its CTA fires.
            window.location.reload();
            // `slug` is intentionally unused on this path; `void` keeps the
            // callback signature without tripping unused-var lint.
            void slug;
          }}
        />
      </div>
    </Shell>
  );
}
|
||||
|
||||
// Shared page chrome for every /orgs render state (loading, error, list,
// empty): dark full-height background, centered column, page heading.
// Render-state-specific content is passed as `children`.
function Shell({ children }: { children: React.ReactNode }) {
  return (
    <main className="min-h-screen bg-zinc-950 text-zinc-100">
      <div className="mx-auto max-w-2xl px-6 pt-20 pb-12">
        <h1 className="text-3xl font-bold text-white">Your organizations</h1>
        <p className="mt-2 text-zinc-400">
          Each org is an isolated Molecule workspace.
        </p>
        <div className="mt-8">{children}</div>
      </div>
    </main>
  );
}
|
||||
|
||||
// One row per organization: name, slug · status · plan on the left,
// status-appropriate call-to-action (OrgCTA) on the right.
function OrgRow({ org }: { org: Org }) {
  return (
    <li className="rounded-lg border border-zinc-800 bg-zinc-900 p-4">
      <div className="flex items-center justify-between">
        <div>
          <div className="font-medium text-white">{org.name}</div>
          <div className="text-sm text-zinc-400">
            {/* Empty plan string falls back to the "free" label. */}
            {org.slug} · <StatusLabel status={org.status} /> · {org.plan || "free"}
          </div>
        </div>
        <OrgCTA org={org} />
      </div>
    </li>
  );
}
|
||||
|
||||
// Renders the org lifecycle state as a colour-coded inline label.
// running → green, awaiting_payment → amber, failed → red, anything
// else (e.g. provisioning) → blue. awaiting_payment is the only state
// whose display text differs from its wire value.
function StatusLabel({ status }: { status: OrgStatus }) {
  const colorByStatus: Record<string, string> = {
    running: "text-emerald-400",
    awaiting_payment: "text-amber-400",
    failed: "text-red-400",
  };
  const colorClass = colorByStatus[status] ?? "text-sky-400";
  const displayText = status === "awaiting_payment" ? "awaiting payment" : status;
  return <span className={colorClass}>{displayText}</span>;
}
|
||||
|
||||
function OrgCTA({ org }: { org: Org }) {
|
||||
if (org.status === "running") {
|
||||
const host = typeof window !== "undefined" ? window.location.hostname : "moleculesai.app";
|
||||
const appDomain = host.endsWith(".moleculesai.app")
|
||||
? host.split(".").slice(-2).join(".")
|
||||
: "moleculesai.app";
|
||||
const href = `https://${org.slug}.${appDomain}`;
|
||||
return (
|
||||
<a
|
||||
href={href}
|
||||
className="rounded bg-emerald-600 px-4 py-2 text-sm font-medium text-white hover:bg-emerald-500"
|
||||
>
|
||||
Open
|
||||
</a>
|
||||
);
|
||||
}
|
||||
if (org.status === "awaiting_payment") {
|
||||
return (
|
||||
<a
|
||||
href={`/pricing?org=${encodeURIComponent(org.slug)}`}
|
||||
className="rounded bg-amber-600 px-4 py-2 text-sm font-medium text-white hover:bg-amber-500"
|
||||
>
|
||||
Complete payment
|
||||
</a>
|
||||
);
|
||||
}
|
||||
if (org.status === "failed") {
|
||||
return (
|
||||
<a
|
||||
href="mailto:support@moleculesai.app"
|
||||
className="rounded bg-zinc-700 px-4 py-2 text-sm font-medium text-zinc-200 hover:bg-zinc-600"
|
||||
>
|
||||
Contact support
|
||||
</a>
|
||||
);
|
||||
}
|
||||
// provisioning / unknown — non-interactive
|
||||
return <span className="text-sm text-zinc-500">{org.status}…</span>;
|
||||
}
|
||||
|
||||
// Zero-org view: explanatory copy plus the create-org form. On a
// successful create we hard-reload so OrgsPage refetches and renders
// the new org's row + CTA.
function EmptyState() {
  return (
    <Shell>
      <p className="text-zinc-300">
        You don't have any organizations yet. Create one to get started — your
        workspace spins up automatically once billing is set up.
      </p>
      <div className="mt-6">
        <CreateOrgForm
          onCreated={() => {
            window.location.reload();
          }}
        />
      </div>
    </Shell>
  );
}
|
||||
|
||||
// Minimal create-org form: slug (URL-safe, lowercased as you type) +
// display name → POST /cp/orgs. Calls `onCreated(slug)` on success;
// renders the backend's error text inline on failure.
function CreateOrgForm({ onCreated }: { onCreated: (slug: string) => void }) {
  const [slug, setSlug] = useState("");
  const [name, setName] = useState("");
  const [submitting, setSubmitting] = useState(false);
  const [err, setErr] = useState<string | null>(null);

  async function submit(e: React.FormEvent) {
    e.preventDefault();
    setSubmitting(true);
    setErr(null);
    try {
      const res = await fetch(`${PLATFORM_URL}/cp/orgs`, {
        method: "POST",
        credentials: "include",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ slug, name }),
        // Same 15s cap as the list fetch — avoid a stuck "Creating…" button.
        signal: AbortSignal.timeout(15_000),
      });
      if (!res.ok) {
        // Surface the backend's error body verbatim (e.g. slug taken).
        const body = await res.text();
        throw new Error(`${res.status}: ${body}`);
      }
      onCreated(slug);
      // NOTE: `submitting` is deliberately left true on success — both
      // callers reload the page, so resetting it would just flicker.
    } catch (e) {
      setErr(e instanceof Error ? e.message : String(e));
      setSubmitting(false);
    }
  }

  return (
    <form onSubmit={submit} className="space-y-3">
      <label className="block">
        <span className="text-sm text-zinc-300">Slug (URL)</span>
        <input
          value={slug}
          onChange={(e) => setSlug(e.target.value.toLowerCase())}
          pattern="^[a-z][a-z0-9-]{2,31}$"
          placeholder="acme"
          required
          className="mt-1 w-full rounded border border-zinc-700 bg-zinc-800 px-3 py-2 text-sm text-zinc-100"
        />
      </label>
      <label className="block">
        <span className="text-sm text-zinc-300">Display name</span>
        <input
          value={name}
          onChange={(e) => setName(e.target.value)}
          placeholder="Acme Corp"
          required
          className="mt-1 w-full rounded border border-zinc-700 bg-zinc-800 px-3 py-2 text-sm text-zinc-100"
        />
      </label>
      {err && <p className="text-sm text-red-400">{err}</p>}
      <button
        type="submit"
        disabled={submitting}
        className="rounded bg-blue-600 px-4 py-2 text-sm font-medium text-white hover:bg-blue-500 disabled:opacity-50"
      >
        {submitting ? "Creating…" : "Create organization"}
      </button>
    </form>
  );
}
|
||||
@ -8,6 +8,12 @@ import { getTenantSlug } from "./tenant";
|
||||
export const PLATFORM_URL =
|
||||
process.env.NEXT_PUBLIC_PLATFORM_URL ?? "http://localhost:8080";
|
||||
|
||||
// 15s is long enough for slow CP queries but short enough that a
|
||||
// hung backend doesn't leave the UI spinning forever. The abort
|
||||
// propagates through AbortController so React components can observe
|
||||
// the error and render a retry affordance.
|
||||
const DEFAULT_TIMEOUT_MS = 15_000;
|
||||
|
||||
async function request<T>(
|
||||
method: string,
|
||||
path: string,
|
||||
@ -28,6 +34,7 @@ async function request<T>(
|
||||
headers,
|
||||
body: body ? JSON.stringify(body) : undefined,
|
||||
credentials: "include",
|
||||
signal: AbortSignal.timeout(DEFAULT_TIMEOUT_MS),
|
||||
});
|
||||
if (!res.ok) {
|
||||
const text = await res.text();
|
||||
|
||||
79
docs/architecture/canary-release.md
Normal file
79
docs/architecture/canary-release.md
Normal file
@ -0,0 +1,79 @@
|
||||
# Canary release pipeline
|
||||
|
||||
How a workspace-server code change reaches the prod tenant fleet — and how to stop it if something's wrong.
|
||||
|
||||
## The loop
|
||||
|
||||
```
|
||||
PR merged to staging → main
|
||||
│
|
||||
▼
|
||||
publish-workspace-server-image.yml ← pushes :staging-<sha> ONLY
|
||||
│ (NOT :latest — prod is untouched)
|
||||
▼
|
||||
Canary tenants auto-update to :staging-<sha>
|
||||
│ (5-min auto-updater cycle on each canary EC2)
|
||||
▼
|
||||
canary-verify.yml waits 6 min, runs scripts/canary-smoke.sh
|
||||
│
|
||||
├─► GREEN → crane tag :staging-<sha> → :latest
|
||||
│ │
|
||||
│ ▼
|
||||
│ Prod tenants auto-update within 5 min
|
||||
│
|
||||
└─► RED → :latest stays on prior good digest
|
||||
GitHub Step Summary flags the rejected sha
|
||||
Ops fixes forward OR rolls back manually
|
||||
```
|
||||
|
||||
## Canary fleet
|
||||
|
||||
Lives in a separate AWS account (`molecule-canary`, `004947743811`) via an assumed role (`MoleculeStagingProvisioner`). The CP's `is_canary` org flag routes provisioning there; every other org goes to the default staging account. See `docs/architecture/saas-prod-migration-2026-04-19.md` for the account bootstrap.
|
||||
|
||||
Canary tenants are configured to pull `:staging-<sha>` (not `:latest`) via `TENANT_IMAGE` on their provisioner, so they ingest each new build before prod does.
|
||||
|
||||
## Smoke suite
|
||||
|
||||
`scripts/canary-smoke.sh` hits each canary tenant (URL + ADMIN_TOKEN pair) and asserts:
|
||||
|
||||
- `/admin/liveness` returns a subsystems map (tenant booted, AdminAuth reachable)
|
||||
- `/workspaces` returns a JSON array (wsAuth + DB healthy)
|
||||
- `/memories/commit` + `/memories/search` round-trip (encryption + scrubber)
|
||||
- `/events` admin read (C4 fail-closed proof)
|
||||
- `/admin/liveness` without bearer → 401 (C4 regression gate)
|
||||
|
||||
Expand by editing the script — each `check "name" "expected" "$response"` call is one line.
|
||||
|
||||
## Adding a canary tenant
|
||||
|
||||
1. `POST /cp/orgs` — create the org normally (is_canary defaults to false)
|
||||
2. `POST /cp/admin/orgs/<slug>/canary` with `{"is_canary": true}` — admin only, refuses to flip if already provisioned
|
||||
3. Re-trigger provision (or delete + recreate if the org was already provisioned into staging) — the fresh EC2 lands in account `004947743811`
|
||||
|
||||
Then set repo secrets:
|
||||
- `CANARY_TENANT_URLS` — append the new tenant's URL
|
||||
- `CANARY_ADMIN_TOKENS` — append its ADMIN_TOKEN in the same position
|
||||
|
||||
## Rolling back `:latest`
|
||||
|
||||
When canary was green but something surfaces post-promotion, retag `:latest` to a prior digest:
|
||||
|
||||
```bash
|
||||
export GITHUB_TOKEN=ghp_... # write:packages
|
||||
scripts/rollback-latest.sh 4c1d56e # retags both platform + tenant images
|
||||
```
|
||||
|
||||
`scripts/rollback-latest.sh` pre-checks that `:staging-<sha>` exists before moving `:latest`, and verifies the digest after the move. Prod tenants pick up the rolled-back image on their next 5-min auto-update.
|
||||
|
||||
A post-mortem should always include:
|
||||
- the commit sha that broke
|
||||
- why canary didn't catch it (new code path the smoke suite doesn't exercise?)
|
||||
- whether the smoke suite should grow a new check to prevent the same class of bug
|
||||
|
||||
## What this gate doesn't catch
|
||||
|
||||
- Bugs that only surface under prod-only data (customer workloads with scale or shape canary doesn't produce). Canary uses real traffic shapes but can't simulate weeks of accumulated state.
|
||||
- Config drift between canary and prod (different env-var values, different feature flags). Keep canary's config deltas minimal and documented.
|
||||
- Cross-tenant interactions — canary tenants run in their own AWS account, so a bug that only appears when two tenants compete for a shared resource won't reproduce here.
|
||||
|
||||
When these miss, `rollback-latest.sh` is the escape hatch.
|
||||
72
docs/architecture/saas-prod-migration-2026-04-19.md
Normal file
72
docs/architecture/saas-prod-migration-2026-04-19.md
Normal file
@ -0,0 +1,72 @@
|
||||
# SaaS prod migration — 2026-04-19
|
||||
|
||||
Promoted staging → main on both `Molecule-AI/molecule-controlplane` and `Molecule-AI/molecule-core`. This note captures the prod cutover deltas so ops can cross-check against the running system.
|
||||
|
||||
## What changed
|
||||
|
||||
Ten PRs landed, split across the two repos:
|
||||
|
||||
**Control plane (`molecule-controlplane`)**
|
||||
- PR #50 — C1/C2/C3: bearer auth on `/cp/workspaces/*`, shell-escape tenant user-data, per-tenant security group
|
||||
- PR #51 — H1/H2: crash-safe `SECRETS_ENCRYPTION_KEY` log, dropped `admin_token` from `/instance` SELECT
|
||||
- PR #52 — SSRF guard on `platform_url`
|
||||
- PR #53 — CP injects `MOLECULE_CP_SHARED_SECRET` + `MOLECULE_CP_URL` into tenant env
|
||||
- PR #54 — Stripe webhook body capped at 1 MiB
|
||||
|
||||
**Core (`molecule-core` / this repo)**
|
||||
- PR #978 — H3/H4: LimitReader on Discord webhook + workspace config PATCH
|
||||
- PR #979 — C4: `AdminAuth` fail-closed on fresh install when `ADMIN_TOKEN` is set
|
||||
- PR #980 — log-scrub: dropped token prefix logging, stopped logging raw upstream response bodies
|
||||
- PR #981 — tenant `CPProvisioner` attaches the CP bearer on every outbound `/cp/workspaces/*` call
|
||||
- PR #982 — Canvas API fetch timeout (15s)
|
||||
- PR #984 — E2E smoke test sync for #966 (public GET no longer exposes `current_task`)
|
||||
|
||||
## New prod env vars (Railway, project `molecule-platform`, env `production`)
|
||||
|
||||
Set before the CP merge landed:
|
||||
|
||||
| Variable | Value shape | Purpose |
|
||||
|---|---|---|
|
||||
| `PROVISION_SHARED_SECRET` | 32-byte hex | Gates `/cp/workspaces/*` on CP. Routes refuse to mount when unset — C1 fail-closed. |
|
||||
| `EC2_VPC_ID` | `vpc-…` | Enables per-tenant SG creation (C3). Shared-SG fallback emits a startup warning. |
|
||||
| `CP_BASE_URL` | `https://api.moleculesai.app` | Injected into newly-provisioned tenant containers as `MOLECULE_CP_URL`. |
|
||||
|
||||
The live prod `PROVISION_SHARED_SECRET` value is held only in Railway; not committed anywhere. Rotate by `railway variables --set` + redeploy.
|
||||
|
||||
## Existing-tenant migration (the sharp edge)
|
||||
|
||||
Tenants provisioned **before** this cutover are still running the previous workspace-server image. When they pull the new image on their next boot or auto-update cycle, their `CPProvisioner` will start expecting `MOLECULE_CP_SHARED_SECRET` in the container env — but the existing tenant EC2s don't have that variable in their user-data (the CP only started injecting it from PR #53 onward).
|
||||
|
||||
**Symptom**: a pre-cutover tenant can still serve its users' existing workspaces, but any attempt to **provision a new workspace** from inside the tenant UI will hit the CP's new bearer gate and get `401` or `404` back, surfacing as "workspace provision failed" with a generic error.
|
||||
|
||||
**Fix per existing tenant (pick one)**:
|
||||
|
||||
1. **SSH in + add the env var**
|
||||
- Copy `PROVISION_SHARED_SECRET` from Railway prod env.
|
||||
- `ssh ubuntu@<tenant-ip>` and append to the running container's env (`docker stop && docker run … -e MOLECULE_CP_SHARED_SECRET='…' -e MOLECULE_CP_URL=https://api.moleculesai.app …`). Rolling this into an auto-update hook is follow-up work.
|
||||
|
||||
2. **Re-provision the tenant**
|
||||
- `DELETE /cp/orgs/:slug` → re-create via normal signup flow. Tenant-level data survives only if the tenant's own Postgres volume is preserved; workspace_id values change. This is the heavy hammer — only for tenants where existing data can be recreated easily.
|
||||
|
||||
3. **Wait for the auto-update + user-data refresh cycle**
|
||||
- Tenant auto-updater (cron, 5-minute cadence) pulls the new container image but **does not refresh env vars** — those are frozen from the initial user-data. So option 3 alone doesn't fix this; it still needs option 1 or 2.
|
||||
|
||||
Script at `scripts/migrate-tenant-cp-secret.sh` (follow-up) will automate option 1 across all running tenants in the prod AWS account.
|
||||
|
||||
## Post-deploy verification checklist
|
||||
|
||||
- [ ] Railway prod deploy for `controlplane` lands on the new commit (check `https://railway.com/project/7ccc…/service/ae76…`)
|
||||
- [ ] `curl https://api.moleculesai.app/health` → 200 `{service: molecule-cp, status: ok}`
|
||||
- [ ] `curl -X POST https://api.moleculesai.app/cp/workspaces/provision` (no bearer) → 401 (**not** 404 — proves the env var is live and routes mounted)
|
||||
- [ ] GHCR publishes new `workspace-server` image for the core main commit
|
||||
- [ ] Vercel canvas prod deploy lands
|
||||
|
||||
## Rollback
|
||||
|
||||
If prod is on fire:
|
||||
|
||||
1. `gh pr revert 46 -R Molecule-AI/molecule-controlplane` — reverts all 6 CP PRs together.
|
||||
2. `gh pr revert 983 -R Molecule-AI/molecule-core` — reverts the core bundle.
|
||||
3. Both reverts auto-deploy via Railway / GHCR / Vercel.
|
||||
|
||||
Existing tenants aren't affected by a rollback — they're running whichever tenant image tag they booted with. Only newly-provisioned tenants pick up the reverted control plane code.
|
||||
120
scripts/canary-smoke.sh
Executable file
120
scripts/canary-smoke.sh
Executable file
@ -0,0 +1,120 @@
|
||||
#!/bin/bash
# canary-smoke.sh — runs the post-deploy smoke suite against the
# staging canary tenant fleet. Called by the canary-verify.yml GitHub
# Actions workflow after a new workspace-server image gets pushed to
# GHCR; exits non-zero on any failure so the workflow can skip the
# :staging-sha → :latest retag that would otherwise release broken
# code to the prod tenant fleet.
#
# Environment:
#   CANARY_TENANT_URLS       space-sep list of canary tenant base URLs
#                            (e.g. "https://canary-pm.staging.moleculesai.app
#                            https://canary-mcp.staging.moleculesai.app")
#   CANARY_ADMIN_TOKENS      space-sep list of ADMIN_TOKENs, positionally
#                            matched to CANARY_TENANT_URLS. Canary tenants
#                            are provisioned with known ADMIN_TOKENs so CI
#                            can hit their admin-gated endpoints.
#   CANARY_CP_BASE_URL       CP base URL the canaries call back to
#                            (https://staging-api.moleculesai.app)
#   CANARY_CP_SHARED_SECRET  matches CP's PROVISION_SHARED_SECRET so this
#                            script can also exercise /cp/workspaces/* via
#                            the canary's own CPProvisioner identity.
#
# Exit codes: 0 = all green, 1 = assertion failure, 2 = setup/env problem.

set -euo pipefail

# ── Setup ────────────────────────────────────────────────────────────────

: "${CANARY_TENANT_URLS:?space-sep list of canary base URLs required}"
: "${CANARY_ADMIN_TOKENS:?space-sep list of ADMIN_TOKENs required, same order as URLs}"
# Required even though no check below exercises the CP directly yet — the
# CP round-trip checks are planned and the workflow already passes it in.
: "${CANARY_CP_BASE_URL:?CP base URL required}"

read -r -a URLS <<< "$CANARY_TENANT_URLS"
read -r -a TOKENS <<< "$CANARY_ADMIN_TOKENS"

if [ "${#URLS[@]}" -ne "${#TOKENS[@]}" ]; then
  echo "ERROR: URLS(${#URLS[@]}) and TOKENS(${#TOKENS[@]}) length mismatch" >&2
  exit 2
fi
if [ "${#URLS[@]}" -eq 0 ]; then
  echo "ERROR: no canary URLs configured" >&2
  exit 2
fi

PASS=0
FAIL=0

# ── Helpers ──────────────────────────────────────────────────────────────

# check DESC EXPECTED ACTUAL — substring assertion. Fixed-string grep so
# EXPECTED needs no regex escaping. Tallies PASS/FAIL for the summary.
check() {
  local desc="$1" expected="$2" actual="$3"
  if echo "$actual" | grep -qF "$expected"; then
    printf "  PASS  %s\n" "$desc"
    PASS=$((PASS + 1))
  else
    printf "  FAIL  %s\n    expected to contain: %s\n    got: %s\n" "$desc" "$expected" "$actual" >&2
    FAIL=$((FAIL + 1))
  fi
}

# acurl BASE TOKEN PATH [curl args...] — admin-authenticated request
# against a canary tenant. BASE and TOKEN travel together so the wrong
# tenant never gets the wrong token; PATH is an explicit argument.
#
# FIX: the previous design smuggled the path in via
#   CANARY_ACURL_PATH="/x" resp=$(acurl ...)
# which is an all-assignment command: with no command word to scope it
# to, the "prefix" assignment PERSISTS in the shell after the line runs,
# silently leaking the last path into any later call that forgets to set
# it. Passing the path positionally removes that footgun and lets every
# request (including the POST below) share one helper.
acurl() {
  local base="$1" token="$2" path="$3"; shift 3
  curl -sS --max-time 20 -H "Authorization: Bearer $token" "$@" -- "$base$path"
}

# ── Checks (run per canary tenant) ───────────────────────────────────────

for i in "${!URLS[@]}"; do
  base="${URLS[$i]}"
  token="${TOKENS[$i]}"
  printf "\n── %s ──\n" "$base"

  # 1. Liveness — the tenant is up and responding to admin auth.
  resp=$(acurl "$base" "$token" "/admin/liveness" || true)
  check "liveness returns a subsystems map" '"subsystems"' "$resp"

  # 2. CP env refresh — the workspace-server fetched MOLECULE_CP_SHARED_SECRET
  #    from CP on startup. We can't read env directly, but we can assert the
  #    liveness + workspace list both work, which together imply the binary
  #    booted without crashing on the refresh call. A startup failure in
  #    refreshEnvFromCP logs but still boots (best-effort semantics), so
  #    this is a sanity check, not a proof.
  resp=$(acurl "$base" "$token" "/workspaces" || true)
  check "workspace list is JSON array" "[" "$resp"

  # 3. Memory commit round-trip — scope=LOCAL so test data stays on this
  #    tenant. Verifies encryption + scrubber + retrieval end-to-end.
  probe_id="canary-smoke-$(date +%s)-$i"
  body=$(printf '{"scope":"LOCAL","namespace":"canary-smoke","content":"probe-%s"}' "$probe_id")
  resp=$(acurl "$base" "$token" "/memories/commit" \
    -X POST -H "Content-Type: application/json" --data "$body" || true)
  check "memory commit accepted" '"id"' "$resp"

  resp=$(acurl "$base" "$token" "/memories/search?query=probe-${probe_id}" || true)
  check "memory search finds the probe" "probe-${probe_id}" "$resp"

  # 4. Events admin read — AdminAuth path (C4 fail-closed proof on SaaS).
  resp=$(acurl "$base" "$token" "/events" || true)
  check "events endpoint returns JSON" "[" "$resp"

  # 5. Negative: unauth'd admin call must 401 (C4 regression gate).
  #    Deliberately NOT acurl — no Authorization header may be sent.
  unauth_code=$(curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "$base/admin/liveness" || echo "000")
  check "unauth'd /admin/liveness returns 401" "401" "$unauth_code"
done

# ── Summary ──────────────────────────────────────────────────────────────

printf "\n=== CANARY SMOKE RESULTS ===\n"
printf "  PASS: %d\n  FAIL: %d\n" "$PASS" "$FAIL"

if [ "$FAIL" -gt 0 ]; then
  exit 1
fi
||||
80
scripts/rollback-latest.sh
Executable file
80
scripts/rollback-latest.sh
Executable file
@ -0,0 +1,80 @@
|
||||
#!/bin/bash
# rollback-latest.sh — moves the :latest tag on ghcr.io/molecule-ai/platform
# (and the matching tenant image) back to a prior :staging-<sha> digest
# without rebuilding anything. Prod tenants auto-pull :latest every 5
# min, so this is the fast path when a canary-verified image turns out
# to have a runtime regression that canary didn't catch.
#
# Usage:
#   scripts/rollback-latest.sh <sha>
#   scripts/rollback-latest.sh 4c1d56e
#
# Prereqs:
#   - crane on $PATH (brew install crane OR download from
#     https://github.com/google/go-containerregistry/releases)
#   - GHCR token exported as GITHUB_TOKEN with write:packages scope
#
# What it does (per image — platform + tenant):
#   crane digest ghcr.io/…:<sha>       # pre-flight: verify BOTH tags exist
#   crane tag ghcr.io/…:<sha> latest   # retag remotely, single API call
#   crane digest ghcr.io/…:latest      # confirm the move
#
# Exit codes: 0 = both retagged, 1 = tag missing / crane error, 2 = bad args.

set -euo pipefail

if [ "${1:-}" = "" ]; then
  echo "usage: $0 <staging-sha>" >&2
  echo "  e.g. $0 4c1d56e — retags :latest to :staging-4c1d56e" >&2
  exit 2
fi

TARGET_SHA="$1"
PLATFORM=ghcr.io/molecule-ai/platform
TENANT=ghcr.io/molecule-ai/platform-tenant

if ! command -v crane >/dev/null; then
  echo "ERROR: crane not installed. brew install crane" >&2
  exit 1
fi
if [ -z "${GITHUB_TOKEN:-}" ]; then
  echo "ERROR: GITHUB_TOKEN unset. export it with write:packages scope." >&2
  exit 1
fi

# Log in once. crane stores creds in a config file keyed by registry;
# re-running is cheap.
printf '%s\n' "$GITHUB_TOKEN" | crane auth login ghcr.io -u "${GITHUB_ACTOR:-$(whoami)}" --password-stdin >/dev/null

# Pre-flight: confirm the target tag exists for an image WITHOUT
# retagging anything. Run against both images before the first retag —
# under `set -e` a mid-sequence failure would otherwise leave the
# platform image rolled back while the tenant image still points at the
# bad digest, i.e. the two fleets on mismatched versions.
preflight() {
  local image="$1"
  local src="$image:staging-$TARGET_SHA"
  if ! crane digest "$src" >/dev/null 2>&1; then
    echo "FAIL: $src not found in registry. Did you type the wrong sha?" >&2
    return 1
  fi
}

# Retag $image:staging-$TARGET_SHA as $image:latest and verify the
# registry now reports the expected digest for :latest.
roll() {
  local image="$1"
  local src="$image:staging-$TARGET_SHA"
  local dst="$image:latest"
  local src_digest new_digest

  echo "→ $image"
  src_digest=$(crane digest "$src")

  crane tag "$src" latest
  new_digest=$(crane digest "$dst")

  if [ "$new_digest" != "$src_digest" ]; then
    echo "  FAIL: $dst digest $new_digest does not match expected $src_digest" >&2
    return 1
  fi
  echo "  OK  $dst → $new_digest"
}

# Validate both targets up front, then retag both. This keeps the
# platform/tenant pair consistent even when one tag is missing.
preflight "$PLATFORM"
preflight "$TENANT"
roll "$PLATFORM"
roll "$TENANT"

echo
echo "=== ROLLBACK COMPLETE ==="
echo "Both images now point :latest at staging-$TARGET_SHA."
echo "Prod tenants will pick up the rollback within their 5-min auto-update cycle."
||||
3
workspace-server/.gitignore
vendored
3
workspace-server/.gitignore
vendored
@ -1 +1,2 @@
|
||||
server
|
||||
# The compiled binary, not the cmd/server package.
|
||||
/server
|
||||
|
||||
107
workspace-server/cmd/server/cp_config.go
Normal file
107
workspace-server/cmd/server/cp_config.go
Normal file
@ -0,0 +1,107 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
// refreshEnvFromCP pulls the tenant's current config-plane env vars
|
||||
// from the control plane and applies them via os.Setenv BEFORE any
|
||||
// other code calls os.Getenv on them.
|
||||
//
|
||||
// Why:
|
||||
// - user-data on the tenant EC2 bakes env vars into `docker run` at
|
||||
// provision time. Those values are frozen. When we rotate a secret
|
||||
// on CP (e.g. PROVISION_SHARED_SECRET) there's no way to push the
|
||||
// new value into already-provisioned tenants.
|
||||
// - the Docker image auto-updater already pulls the latest workspace-
|
||||
// server image every 5 min. If THAT image knows how to refresh its
|
||||
// own env from the CP on startup, every tenant heals itself within
|
||||
// the update cycle — no ssh, no re-provision, no ops toil.
|
||||
//
|
||||
// Contract (paired with cp-side GET /cp/tenants/config):
|
||||
// Request: GET {MOLECULE_CP_URL or https://api.moleculesai.app}/cp/tenants/config
|
||||
// Authorization: Bearer <ADMIN_TOKEN>
|
||||
// X-Molecule-Org-Id: <MOLECULE_ORG_ID>
|
||||
// Response: 200 {"MOLECULE_CP_SHARED_SECRET":"…","MOLECULE_CP_URL":"…", …}
|
||||
// 401 on bearer mismatch or unknown org
|
||||
//
|
||||
// Best-effort: any failure logs and returns — main() keeps booting.
|
||||
// Self-hosted deploys without MOLECULE_ORG_ID or ADMIN_TOKEN set
|
||||
// short-circuit silently so this function is a no-op there.
|
||||
func refreshEnvFromCP() error {
|
||||
orgID := os.Getenv("MOLECULE_ORG_ID")
|
||||
adminToken := os.Getenv("ADMIN_TOKEN")
|
||||
if orgID == "" || adminToken == "" {
|
||||
// Not a SaaS tenant (self-hosted dev or not yet provisioned).
|
||||
return nil
|
||||
}
|
||||
|
||||
base := os.Getenv("MOLECULE_CP_URL")
|
||||
if base == "" {
|
||||
// Default to prod for any tenant that lost track of its CP URL
|
||||
// (e.g. older user-data that only set MOLECULE_ORG_ID).
|
||||
base = "https://api.moleculesai.app"
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", base+"/cp/tenants/config", nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("build request: %w", err)
|
||||
}
|
||||
req.Header.Set("Authorization", "Bearer "+adminToken)
|
||||
req.Header.Set("X-Molecule-Org-Id", orgID)
|
||||
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("do request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// 64 KiB cap — the CP only returns small JSON blobs here. An
|
||||
// unbounded read would be weaponizable if a compromised upstream
|
||||
// ever echoed back a gigabyte.
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 64<<10))
|
||||
if err != nil {
|
||||
return fmt.Errorf("read body: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
// 401 on first boot-after-restart is expected for tenants still
|
||||
// running under old user-data where admin_token on-disk hasn't
|
||||
// had its corresponding row seeded. Don't treat as fatal — just
|
||||
// log so operators can spot repeat offenders in logs.
|
||||
return fmt.Errorf("cp returned %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var cfg map[string]string
|
||||
if err := json.Unmarshal(body, &cfg); err != nil {
|
||||
return fmt.Errorf("decode: %w", err)
|
||||
}
|
||||
|
||||
// Apply only strings; reject oversized values defensively. An
|
||||
// operator-supplied config should never exceed 4 KiB per key —
|
||||
// workspace-server env vars are URLs, hex secrets, short identifiers.
|
||||
const maxValueBytes = 4 << 10
|
||||
applied := 0
|
||||
for k, v := range cfg {
|
||||
if k == "" || len(v) > maxValueBytes {
|
||||
continue
|
||||
}
|
||||
if err := os.Setenv(k, v); err != nil {
|
||||
log.Printf("CP env refresh: setenv %s: %v", k, err)
|
||||
continue
|
||||
}
|
||||
applied++
|
||||
}
|
||||
log.Printf("CP env refresh: applied %d values from %s/cp/tenants/config", applied, base)
|
||||
return nil
|
||||
}
|
||||
100
workspace-server/cmd/server/cp_config_test.go
Normal file
100
workspace-server/cmd/server/cp_config_test.go
Normal file
@ -0,0 +1,100 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestRefreshEnvFromCP_NoopWhenNotSaaS: without MOLECULE_ORG_ID or
|
||||
// ADMIN_TOKEN, the function short-circuits silently — self-hosted dev
|
||||
// must not fail or log spam here.
|
||||
func TestRefreshEnvFromCP_NoopWhenNotSaaS(t *testing.T) {
|
||||
t.Setenv("MOLECULE_ORG_ID", "")
|
||||
t.Setenv("ADMIN_TOKEN", "")
|
||||
if err := refreshEnvFromCP(); err != nil {
|
||||
t.Errorf("expected nil on non-SaaS, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshEnvFromCP_AppliesCPResponse: wire a stub CP, run refresh,
|
||||
// confirm the returned env vars ended up in os.Environ().
|
||||
func TestRefreshEnvFromCP_AppliesCPResponse(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if got := r.Header.Get("Authorization"); got != "Bearer tenant-admin-token" {
|
||||
t.Errorf("bearer: got %q", got)
|
||||
}
|
||||
if got := r.Header.Get("X-Molecule-Org-Id"); got != "org-abc" {
|
||||
t.Errorf("org id header: got %q", got)
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
fmt.Fprint(w, `{"MOLECULE_CP_SHARED_SECRET":"new-secret","MOLECULE_CP_URL":"https://api.moleculesai.app"}`)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
t.Setenv("MOLECULE_ORG_ID", "org-abc")
|
||||
t.Setenv("ADMIN_TOKEN", "tenant-admin-token")
|
||||
t.Setenv("MOLECULE_CP_URL", srv.URL)
|
||||
t.Setenv("MOLECULE_CP_SHARED_SECRET", "") // clear before refresh
|
||||
|
||||
if err := refreshEnvFromCP(); err != nil {
|
||||
t.Fatalf("refreshEnvFromCP: %v", err)
|
||||
}
|
||||
if got := os.Getenv("MOLECULE_CP_SHARED_SECRET"); got != "new-secret" {
|
||||
t.Errorf("SHARED_SECRET: want new-secret, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshEnvFromCP_CPUnreachableDoesNotFailBoot: network errors must
|
||||
// return non-nil BUT main.go treats that as warn-and-continue. We assert
|
||||
// the function returns an error (not a panic) so the caller can log.
|
||||
func TestRefreshEnvFromCP_CPUnreachableDoesNotFailBoot(t *testing.T) {
|
||||
t.Setenv("MOLECULE_ORG_ID", "org-abc")
|
||||
t.Setenv("ADMIN_TOKEN", "t")
|
||||
t.Setenv("MOLECULE_CP_URL", "http://127.0.0.1:1") // closed port
|
||||
err := refreshEnvFromCP()
|
||||
if err == nil {
|
||||
t.Error("expected an error when CP is unreachable")
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshEnvFromCP_NonOKPropagates: CP returns 500 → error.
|
||||
func TestRefreshEnvFromCP_NonOKPropagates(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "boom", http.StatusInternalServerError)
|
||||
}))
|
||||
defer srv.Close()
|
||||
t.Setenv("MOLECULE_ORG_ID", "org-abc")
|
||||
t.Setenv("ADMIN_TOKEN", "t")
|
||||
t.Setenv("MOLECULE_CP_URL", srv.URL)
|
||||
if err := refreshEnvFromCP(); err == nil {
|
||||
t.Error("expected error on 500, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshEnvFromCP_RejectsOversizedValue: a single-value-over-4KiB
|
||||
// payload must NOT poison the environment.
|
||||
func TestRefreshEnvFromCP_RejectsOversizedValue(t *testing.T) {
|
||||
giant := make([]byte, 5<<10)
|
||||
for i := range giant {
|
||||
giant[i] = 'x'
|
||||
}
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
fmt.Fprintf(w, `{"MOLECULE_CP_SHARED_SECRET":%q}`, string(giant))
|
||||
}))
|
||||
defer srv.Close()
|
||||
t.Setenv("MOLECULE_ORG_ID", "org-abc")
|
||||
t.Setenv("ADMIN_TOKEN", "t")
|
||||
t.Setenv("MOLECULE_CP_URL", srv.URL)
|
||||
t.Setenv("MOLECULE_CP_SHARED_SECRET", "original")
|
||||
if err := refreshEnvFromCP(); err != nil {
|
||||
t.Fatalf("refreshEnvFromCP: %v", err)
|
||||
}
|
||||
if got := os.Getenv("MOLECULE_CP_SHARED_SECRET"); got != "original" {
|
||||
t.Errorf("oversized value was applied — want %q, got %d bytes",
|
||||
"original", len(got))
|
||||
}
|
||||
}
|
||||
@ -30,6 +30,16 @@ import (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// CP self-refresh: pull any operator-rotated config (e.g. a new
|
||||
// MOLECULE_CP_SHARED_SECRET) before any other code reads env.
|
||||
// Best-effort — if the CP is unreachable we keep booting with the
|
||||
// env we were provisioned with. Older SaaS tenants predate PR #53
|
||||
// and can arrive here with MOLECULE_CP_SHARED_SECRET unset; this
|
||||
// is how they heal without SSH.
|
||||
if err := refreshEnvFromCP(); err != nil {
|
||||
log.Printf("CP env refresh: %v (continuing with baked-in env)", err)
|
||||
}
|
||||
|
||||
// Secrets encryption. In MOLECULE_ENV=prod, boot refuses to start
|
||||
// without a valid SECRETS_ENCRYPTION_KEY (fail-secure — Top-5 #5).
|
||||
// In any other environment, missing keys just log a warning and
|
||||
|
||||
150
workspace-server/internal/provisioner/cp_provisioner_test.go
Normal file
150
workspace-server/internal/provisioner/cp_provisioner_test.go
Normal file
@ -0,0 +1,150 @@
|
||||
package provisioner
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestNewCPProvisioner_RequiresOrgID — self-hosted deployments don't
|
||||
// have a MOLECULE_ORG_ID, and the provisioner must refuse to construct
|
||||
// rather than silently phone home to the prod CP with an empty tenant.
|
||||
func TestNewCPProvisioner_RequiresOrgID(t *testing.T) {
|
||||
t.Setenv("MOLECULE_ORG_ID", "")
|
||||
if _, err := NewCPProvisioner(); err == nil {
|
||||
t.Error("want error when MOLECULE_ORG_ID is unset, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
// TestNewCPProvisioner_FallsBackToProvisionSharedSecret — operators
|
||||
// may set PROVISION_SHARED_SECRET on both sides of the wire with a
|
||||
// single value; the tenant accepts that name as a fallback for
|
||||
// MOLECULE_CP_SHARED_SECRET. The fallback is documented in
|
||||
// NewCPProvisioner; this test is the regression gate.
|
||||
func TestNewCPProvisioner_FallsBackToProvisionSharedSecret(t *testing.T) {
|
||||
t.Setenv("MOLECULE_ORG_ID", "org-abc")
|
||||
t.Setenv("MOLECULE_CP_SHARED_SECRET", "")
|
||||
t.Setenv("PROVISION_SHARED_SECRET", "from-fallback")
|
||||
|
||||
p, err := NewCPProvisioner()
|
||||
if err != nil {
|
||||
t.Fatalf("NewCPProvisioner: %v", err)
|
||||
}
|
||||
if p.sharedSecret != "from-fallback" {
|
||||
t.Errorf("sharedSecret = %q, want %q", p.sharedSecret, "from-fallback")
|
||||
}
|
||||
}
|
||||
|
||||
// TestAuthHeader_NoopWhenSecretEmpty — the self-hosted path that
|
||||
// doesn't gate /cp/workspaces/* must not add a stray Authorization
|
||||
// header (bearer-like content would be surprising to non-bearer
|
||||
// intermediaries).
|
||||
func TestAuthHeader_NoopWhenSecretEmpty(t *testing.T) {
|
||||
p := &CPProvisioner{sharedSecret: ""}
|
||||
req := httptest.NewRequest("GET", "http://x/", nil)
|
||||
p.authHeader(req)
|
||||
if got := req.Header.Get("Authorization"); got != "" {
|
||||
t.Errorf("Authorization set to %q with empty secret; want unset", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAuthHeader_SetsBearerWhenSecretSet — happy path.
|
||||
func TestAuthHeader_SetsBearerWhenSecretSet(t *testing.T) {
|
||||
p := &CPProvisioner{sharedSecret: "the-secret"}
|
||||
req := httptest.NewRequest("GET", "http://x/", nil)
|
||||
p.authHeader(req)
|
||||
if got := req.Header.Get("Authorization"); got != "Bearer the-secret" {
|
||||
t.Errorf("Authorization = %q, want %q", got, "Bearer the-secret")
|
||||
}
|
||||
}
|
||||
|
||||
// TestStart_HappyPath — Start posts to the stubbed CP, passes the
|
||||
// bearer, and parses the returned instance_id.
|
||||
func TestStart_HappyPath(t *testing.T) {
|
||||
var sawBearer string
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
sawBearer = r.Header.Get("Authorization")
|
||||
if r.URL.Path != "/cp/workspaces/provision" {
|
||||
t.Errorf("unexpected path %s", r.URL.Path)
|
||||
}
|
||||
// Verify the request body round-trips our fields
|
||||
var body cpProvisionRequest
|
||||
_ = json.NewDecoder(r.Body).Decode(&body)
|
||||
if body.WorkspaceID != "ws-1" || body.Runtime != "python" {
|
||||
t.Errorf("body mismatch: %+v", body)
|
||||
}
|
||||
w.WriteHeader(http.StatusCreated)
|
||||
_, _ = io.WriteString(w, `{"instance_id":"i-abc123","state":"pending"}`)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
p := &CPProvisioner{
|
||||
baseURL: srv.URL,
|
||||
orgID: "org-1",
|
||||
sharedSecret: "s3cret",
|
||||
httpClient: srv.Client(),
|
||||
}
|
||||
|
||||
id, err := p.Start(context.Background(), WorkspaceConfig{
|
||||
WorkspaceID: "ws-1", Runtime: "python", Tier: 1, PlatformURL: "http://tenant",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Start: %v", err)
|
||||
}
|
||||
if id != "i-abc123" {
|
||||
t.Errorf("instance id = %q, want i-abc123", id)
|
||||
}
|
||||
if sawBearer != "Bearer s3cret" {
|
||||
t.Errorf("server saw Authorization = %q, want Bearer s3cret", sawBearer)
|
||||
}
|
||||
}
|
||||
|
||||
// TestStart_Non201ReturnsStructuredError — when CP returns 401 with a
|
||||
// structured {"error":"..."} body, Start surfaces that error message.
|
||||
// Verifies the defense against log-leaking raw upstream bodies.
|
||||
func TestStart_Non201ReturnsStructuredError(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusUnauthorized)
|
||||
_, _ = io.WriteString(w, `{"error":"invalid credentials"}`)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()}
|
||||
|
||||
_, err := p.Start(context.Background(), WorkspaceConfig{WorkspaceID: "ws-1", Runtime: "py"})
|
||||
if err == nil {
|
||||
t.Fatal("expected error on 401, got nil")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "invalid credentials") {
|
||||
t.Errorf("error message %q should include upstream error field", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
// TestStart_NoStructuredErrorFallsBackToSize — the anti-leak path:
|
||||
// when upstream returns non-JSON, we refuse to log the body and
|
||||
// report only the byte count, preventing Authorization header echoes
|
||||
// from landing in our logs.
|
||||
func TestStart_NoStructuredErrorFallsBackToSize(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
_, _ = io.WriteString(w, "raw proxy error page, could contain echoed headers")
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()}
|
||||
|
||||
_, err := p.Start(context.Background(), WorkspaceConfig{WorkspaceID: "ws-1", Runtime: "py"})
|
||||
if err == nil {
|
||||
t.Fatal("expected error on 500, got nil")
|
||||
}
|
||||
if strings.Contains(err.Error(), "raw proxy error") {
|
||||
t.Errorf("error leaked raw body: %q", err.Error())
|
||||
}
|
||||
if !strings.Contains(err.Error(), "<unstructured body") {
|
||||
t.Errorf("expected byte-count fallback, got %q", err.Error())
|
||||
}
|
||||
}
|
||||
@ -310,14 +310,20 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
|
||||
// consecutive empties and escalate to 'stale' after 3 in a row.
|
||||
isEmpty := isEmptyResponse(respBody)
|
||||
if lastStatus == "ok" && isEmpty {
|
||||
db.DB.ExecContext(ctx, `
|
||||
// One query instead of UPDATE-then-SELECT: RETURNING hands back
|
||||
// the post-increment value so the stale-threshold check doesn't
|
||||
// cost a second roundtrip. This handler fires once per cron tick
|
||||
// per schedule; at 100 tenants × dozens of schedules the saved
|
||||
// query matters.
|
||||
var consecEmpty int
|
||||
if err := db.DB.QueryRowContext(ctx, `
|
||||
UPDATE workspace_schedules
|
||||
SET consecutive_empty_runs = consecutive_empty_runs + 1,
|
||||
updated_at = now()
|
||||
WHERE id = $1`, sched.ID)
|
||||
// Check if we've crossed the stale threshold
|
||||
var consecEmpty int
|
||||
db.DB.QueryRowContext(ctx, `SELECT consecutive_empty_runs FROM workspace_schedules WHERE id = $1`, sched.ID).Scan(&consecEmpty)
|
||||
WHERE id = $1
|
||||
RETURNING consecutive_empty_runs`, sched.ID).Scan(&consecEmpty); err != nil {
|
||||
log.Printf("Scheduler: '%s' empty-run bump failed: %v", sched.Name, err)
|
||||
}
|
||||
if consecEmpty >= 3 {
|
||||
lastStatus = "stale"
|
||||
lastError = fmt.Sprintf("empty response %d consecutive times — agent may be phantom-producing (#795)", consecEmpty)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user