From bfdb0e3b6996997f240ef3e1569bf94321e2f4cf Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Fri, 17 Apr 2026 23:58:55 -0700 Subject: [PATCH] docs: Cloudflare Tunnel migration report + track Worker source - Full session retrospective: tunnel E2E verified on prod + staging subdomains - Worker source tracked in infra/cloudflare-worker/ (was only in /tmp) - Worker changes: reserved slug passthrough + multi-level subdomain bypass - Known issues, follow-ups, cost impact, key learnings documented Co-Authored-By: Claude Opus 4.6 (1M context) --- .../2026-04-18-tunnel-migration.md | 223 ++++++++++++++ infra/cloudflare-worker/package.json | 11 + infra/cloudflare-worker/src/index.ts | 280 ++++++++++++++++++ infra/cloudflare-worker/wrangler.toml | 19 ++ 4 files changed, 533 insertions(+) create mode 100644 docs/retrospectives/2026-04-18-tunnel-migration.md create mode 100644 infra/cloudflare-worker/package.json create mode 100644 infra/cloudflare-worker/src/index.ts create mode 100644 infra/cloudflare-worker/wrangler.toml diff --git a/docs/retrospectives/2026-04-18-tunnel-migration.md b/docs/retrospectives/2026-04-18-tunnel-migration.md new file mode 100644 index 00000000..997bd94a --- /dev/null +++ b/docs/retrospectives/2026-04-18-tunnel-migration.md @@ -0,0 +1,223 @@ +# Cloudflare Tunnel Migration — Session Report (2026-04-18) + +> **Duration:** ~4 hours +> **Scope:** Replace Cloudflare Worker + wildcard DNS with per-tenant Cloudflare Tunnels +> **Issue:** #933 +> **Status:** Tunnel E2E verified on both production and staging subdomains. Ready for production tenant migration. + +--- + +## What Was Done + +### 1. 
PR Triage (15 PRs merged) + +Before tunnel work, cleared the PR backlog since CI runner was slow: + +| PR | Type | Description | +|----|------|-------------| +| #934 | docs | Staging environment design + Phase 36 plan | +| #849 | docs | Partner API Keys (Phase 34) — resolved PLAN.md conflict | +| #922 | docs | ANTHROPIC_API_KEY as required global secret | +| #880 | docs | SAFE-MCP internal advisory | +| #927 | docs | Ecosystem watch daily sweep | +| #923 | security | Slack OAuth state param — random nonce replaces workspace_id | +| #913 | security | Redact secrets from commit_memory before persistence | +| #925 | security | HITL audit log on approval grant/denial | +| #879 | fix | Canvas TypeScript fixture drift | +| #915 | feature | A2A topology overlay + hermes plugin declarations | +| #921 | feature | Audit trail visualization panel | +| #929 | feature | Temporal crash-resume checkpoints | +| #937 | fix | go vet errors + supply chain hardening (created + merged) | +| #938 | fix | Canvas a11y — TeamMemberChip keyboard nav (created + merged) | + +Also closed issue #920 (Slack OAuth) and commented on #889 (VULN-004 dead letter). + +### 2. Cloudflare API Token — Tunnel Permission + +**Problem:** The existing CF API token (`cfut_loLR...`) had DNS:Edit but NOT Cloudflare Tunnel:Edit permission. Tunnel create/list/delete calls returned `code 10000: Authentication error`. + +**Fix:** CEO added Account → Cloudflare Tunnel → Edit permission in Cloudflare Dashboard → API Tokens. + +### 3. Tunnel API Integration Tests + +Ran three progressively comprehensive tests: + +| Test | Result | What it proved | +|------|--------|----------------| +| API roundtrip | ✓ | Create tunnel → create DNS CNAME → delete both | +| DNS resolution | ✓ | CNAME resolves on first attempt (instant, zero propagation delay) | +| Full E2E with EC2 | ✓ | Tunnel + DNS + EC2 with cloudflared → HTTP 200 through subdomain | + +### 4. 
Worker Coexistence Fix + +**Problem:** The Cloudflare Worker route `*.moleculesai.app/*` intercepted tunnel CNAME requests before they could reach the tunnel origin. Tunnel subdomains got the Worker's "Organization not found" page instead of routing through the tunnel. + +**Fix (two changes to Worker):** + +```typescript +// 1. Reserved slugs now pass through instead of returning 404 +if (!slug || slug === host || RESERVED.has(slug) || slug.includes(".")) { + return fetch(request); // was: return new Response("Not found", { status: 404 }); +} + +// 2. Multi-level subdomains (*.staging.moleculesai.app) bypass Worker entirely +// slug.includes(".") catches "foo.staging" and passes to tunnel CNAME +``` + +Worker redeployed. Production tenants unaffected — they still route through the Worker. Tunnel-routed subdomains pass through to origin. + +### 5. SSL Certificate for Staging Subdomains + +**Problem:** Cloudflare's free Universal SSL only covers `*.moleculesai.app` (one wildcard level). `*.staging.moleculesai.app` (two levels) fails TLS handshake — no certificate. + +**Fix:** Ordered Advanced Certificate via Cloudflare Dashboard: +- Hostnames: `*.staging.moleculesai.app`, `staging.moleculesai.app` +- CA: Let's Encrypt +- Validity: 90 days, auto-renewal 30 days before expiry +- Cost: included in Cloudflare free plan (1 of 100 advanced certs) + +### 6. Staging Tunnel E2E — Full Pass + +Final test on `*.staging.moleculesai.app` (fully isolated from production): + +``` +1. Create Tunnel → OK (ea5aaa13...) +2. Configure ingress → OK (→ localhost:8080) +3. Create DNS CNAME → OK (tunnel-stg-test.staging.moleculesai.app) +4. Launch EC2 t3.micro → OK (cloudflared binary download) +5. Tunnel connected → OK (healthy in 30s) +6. HTTP 200 through tunnel → OK + Response: {"status":"ok","domain":"tunnel-stg-test.staging.moleculesai.app"} +7. Cleanup → OK (EC2 terminated, DNS + tunnel deleted) +``` + +### 7. 
Platform Build Verification + +After merging 15 PRs, verified everything still builds and passes: +- Go: `go test -race ./...` — 15/15 packages pass, 0 failures +- Go: `go vet ./...` — clean +- Canvas: `npm run build` — success +- Canvas: `vitest run` — 762/762 tests pass + +--- + +## Architecture: Before vs After + +### Before (Cloudflare Worker) + +``` +User → *.moleculesai.app (wildcard A record, proxied) + → Cloudflare Worker (extracts slug, looks up EC2 IP from CP API) + → Worker proxies to EC2 public IP:8080 + → EC2 must have public IP + open port 8080 +``` + +**Problems:** +- Edge cache poisoning when wildcard A record IP changes (2+ hour recovery) +- ADMIN_TOKEN transmitted in plaintext via Worker header injection +- EC2 requires public IP + open inbound ports (security surface) +- Worker is a single point of failure for all tenant routing +- KV cache stale-while-revalidate adds latency on cold starts + +### After (Cloudflare Tunnel) + +``` +User → slug.moleculesai.app (CNAME → tunnel-id.cfargotunnel.com, proxied) + → Cloudflare edge routes to tunnel + → cloudflared on EC2 (outbound-only connection) receives request + → cloudflared forwards to localhost:8080 + → EC2 needs NO public IP, NO open inbound ports +``` + +**Advantages:** +- No edge cache — CNAME resolves instantly via Cloudflare's anycast +- No plaintext secrets in transit — tunnel is encrypted end-to-end +- EC2 can be in private subnet (no public IP, no security group rules) +- Each tenant has its own tunnel (no single point of failure) +- No Worker maintenance, no KV cache management +- Faster provisioning — DNS works immediately, no cache warming + +--- + +## Known Issues & Risks + +### 1. Worker Must Stay Until All Tenants Migrate +The Worker route `*.moleculesai.app/*` still serves existing tenants (e.g., `hongmingwang.moleculesai.app`). Cannot delete until every tenant has a tunnel + CNAME. The Worker passthrough for reserved/multi-level slugs is the bridge. + +### 2. 
Worker Source Not in Version Control +During the session the Worker code lived only in `/tmp/molecule-tenant-proxy/` — not tracked in any repo. It is now committed alongside this report under `infra/cloudflare-worker/`; deploy from that directory going forward. Two changes were deployed and are included there: +- `fetch(request)` passthrough for reserved slugs (was `404`) +- `slug.includes(".")` bypass for multi-level subdomains + +### 3. cloudflared Binary Download at Boot +Current EC2 user-data downloads `cloudflared` from GitHub releases at boot time. This adds ~5 seconds and depends on GitHub availability. Pre-baked AMI would eliminate this dependency. + +### 4. Tunnel Token in User-Data +The `cloudflared` tunnel token is passed in EC2 user-data (base64 encoded). AWS user-data is accessible to anyone with EC2 instance metadata access. The token grants tunnel connection rights — if leaked, an attacker could impersonate the tenant's tunnel. Mitigation: use AWS Secrets Manager or SSM Parameter Store instead. + +### 5. Tunnel Cleanup on Org Delete +The `DeprovisionInstance` function has a TODO for tunnel deletion. When an org is deleted, the tunnel and DNS CNAME must be cleaned up. The tunnel ID is stored in EC2 tags (`TunnelID`), but needs to be persisted in `org_instances` table for reliable cleanup. + +### 6. No Health Check on Tunnel +If `cloudflared` crashes on the EC2 but the instance stays running, the tunnel goes inactive but the DNS CNAME still points to it. Need a health sweep that checks tunnel status via CF API and restarts `cloudflared` or the instance. + +### 7. Staging CP Uses Production Tenant Image +`TENANT_IMAGE` on staging is still `ghcr.io/molecule-ai/platform-tenant:latest` (production). Should be `:staging` once the staging image pipeline is set up. 
+ +--- + +## Follow-Up Tasks + +### Immediate (before next deploy) + +- [x] **Commit Worker code to repo** — done in this commit: source is tracked in the monorepo under `infra/cloudflare-worker/`, including the two passthrough changes +- [ ] **Persist tunnel ID in org_instances table** — add `tunnel_id` column so deprovision cascade can clean up tunnels reliably +- [ ] **Wire tunnel cleanup into DeprovisionInstance** — delete tunnel + DNS CNAME when org is deleted + +### Short-term (this week) + +- [ ] **Migrate `hongmingwang` tenant to tunnel** — create tunnel, add CNAME, update EC2 to run cloudflared, add slug to Worker RESERVED, verify, then remove old A record +- [ ] **Staging image pipeline** — publish `:staging` tag on main merge, `:latest` only on manual promote +- [ ] **Move tunnel token to SSM Parameter Store** — EC2 user-data is not secret-safe; retrieve token at boot via instance role + +### Medium-term (this month) + +- [ ] **Pre-baked AMI with cloudflared** — eliminate GitHub download dependency at boot +- [ ] **Tunnel health sweep** — periodic check of tunnel status via CF API, restart cloudflared if inactive +- [ ] **Delete Worker** — once all tenants are on tunnels, remove Worker + wildcard A record entirely +- [ ] **Private subnet for tenant EC2s** — with tunnels, EC2s don't need public IPs; move to private subnet with NAT gateway for outbound + +### Nice-to-have + +- [ ] **Cloudflare Access** — add zero-trust access policies on tunnel routes (IP allow-list, mTLS) +- [ ] **Tunnel metrics** — export tunnel connection count, latency, bandwidth to Prometheus/Grafana +- [ ] **Multi-region tunnels** — cloudflared connects to nearest Cloudflare edge; for multi-region deployments, each region's EC2 gets its own tunnel + +--- + +## Cost Impact + +| Item | Before | After | +|------|--------|-------| +| Cloudflare Worker | Free (100k req/day) | Eliminated | +| Workers KV | Free tier | Eliminated | +| Advanced SSL Cert | $0 | $0 (1 of 100 free) | +| EC2 public IPs | ~$3.65/mo 
per tenant | $0 (no public IP needed) | +| Cloudflare Tunnel | N/A | Free (unlimited tunnels) | +| **Net change** | | **Saves ~$3.65/tenant/mo** | + +--- + +## Key Learnings + +1. **Worker routes take priority over DNS CNAMEs** — even with a CNAME pointing to `cfargotunnel.com`, the Worker's wildcard route fires first. Must explicitly pass through via `fetch(request)`. + +2. **Free Universal SSL only covers one wildcard level** — `*.moleculesai.app` works, `*.staging.moleculesai.app` doesn't. Advanced Certificate (free, Let's Encrypt) solves this. + +3. **Let's Encrypt rejects mixed wildcard+parent certs** — can't put `*.moleculesai.app` and `*.staging.moleculesai.app` in the same cert. Issue separate certs for each level. + +4. **Tunnel connects in ~30 seconds** — from EC2 boot to tunnel healthy, including cloudflared binary download (~5s) + connection establishment (~25s). Faster than DNS propagation ever was. + +5. **DNS CNAME resolves instantly** — no propagation delay, no edge cache, no NXDOMAIN caching. This is the fundamental advantage over the wildcard A record approach. + +6. **cloudflared binary download is faster than apt** — `curl` from GitHub releases (~5s) vs `apt-get install cloudflared` (~30s). Use binary download in boot scripts. 
diff --git a/infra/cloudflare-worker/package.json b/infra/cloudflare-worker/package.json new file mode 100644 index 00000000..5981ecb8 --- /dev/null +++ b/infra/cloudflare-worker/package.json @@ -0,0 +1,11 @@ +{ + "name": "molecule-tenant-proxy", + "private": true, + "scripts": { + "dev": "wrangler dev", + "deploy": "wrangler deploy" + }, + "devDependencies": { + "wrangler": "^4.0.0" + } +} diff --git a/infra/cloudflare-worker/src/index.ts b/infra/cloudflare-worker/src/index.ts new file mode 100644 index 00000000..fedee0c3 --- /dev/null +++ b/infra/cloudflare-worker/src/index.ts @@ -0,0 +1,280 @@ +/** + * Molecule AI tenant proxy — Cloudflare Worker + * + * Routes *.moleculesai.app requests to the correct EC2 tenant instance. + * Replaces per-tenant DNS records with a single wildcard + edge routing. + * + * Cache strategy (3-tier): + * L1: in-memory Map (60s TTL, per-isolate) + * L2: Workers KV (5 min TTL, stale-while-revalidate) + * L3: CP API — GET /cp/orgs/:slug/instance + * Fallback: serve stale KV when CP is unreachable + */ + +export interface Env { + TENANT_CACHE: KVNamespace; + CP_API_URL: string; +} + +interface TenantInfo { + slug: string; + status: string; // "running" | "provisioning" | "failed" + ip: string | null; + org_id: string; + admin_token?: string; +} + +// L1: in-memory cache (per-isolate, 60s TTL) +const memCache = new Map(); +const MEM_TTL_MS = 60_000; +const KV_TTL_S = 300; // 5 min + +// Subdomains that are NOT tenants — handled by explicit DNS records +const RESERVED = new Set(["api", "app", "www", "docs", "doc", "status", "staging-api", "tunneltest"]); + +// Routes that go to platform (:8080) vs canvas (:3000) +const API_PREFIXES = [ + "/health", "/metrics", "/workspaces", "/registry", "/templates", + "/org", "/settings", "/plugins", "/events", "/bundles", "/channels", + "/webhooks", "/approvals", "/admin", "/canvas", "/ws", +]; + +export default { + async fetch(request: Request, env: Env): Promise { + const url = new URL(request.url); + 
const host = url.hostname; + + // Extract slug from hostname: "acme.moleculesai.app" → "acme" + const slug = host.replace(".moleculesai.app", ""); + if (!slug || slug === host || RESERVED.has(slug) || slug.includes(".")) { + // Pass through to origin (tunnel CNAME or explicit DNS record). + // slug.includes(".") catches multi-level subdomains like + // "foo.staging.moleculesai.app" which are routed via CF Tunnel. + return fetch(request); + } + + // Lookup tenant backend + const tenant = await resolveTenant(slug, env); + + if (!tenant) { + return notFoundPage(slug); + } + + if (tenant.status === "provisioning" || !tenant.ip) { + return provisioningPage(slug); + } + + if (tenant.status === "failed") { + return errorPage(slug); + } + + // Route ALL traffic to :8080 (Go platform). The platform proxies non-API + // routes to Canvas internally via CANVAS_PROXY_URL. We don't split traffic + // between :8080 and :3000 because Canvas may bind to 127.0.0.1 only + // (not externally reachable) while the platform is always on 0.0.0.0. + const backendUrl = `http://${tenant.ip}:8080${url.pathname}${url.search}`; + + // WebSocket upgrade + if (request.headers.get("Upgrade") === "websocket") { + return fetch(backendUrl, request); + } + + // Proxy the request + const headers = new Headers(request.headers); + headers.set("X-Molecule-Org-Id", tenant.org_id); + headers.set("Origin", `https://${slug}.moleculesai.app`); + headers.set("X-Forwarded-For", request.headers.get("CF-Connecting-IP") || ""); + headers.set("X-Forwarded-Proto", "https"); + headers.set("Host", `${slug}.moleculesai.app`); + // Inject ADMIN_TOKEN for AdminAuth — the tenant platform validates this + // as a dedicated admin credential (not a workspace token). 
+ if (tenant.admin_token) { + headers.set("Authorization", `Bearer ${tenant.admin_token}`); + } + + const proxyReq = new Request(backendUrl, { + method: request.method, + headers, + body: request.body, + redirect: "manual", + }); + + try { + const resp = await fetch(proxyReq); + // Strip backend hop headers, pass everything else through + const respHeaders = new Headers(resp.headers); + respHeaders.delete("transfer-encoding"); + return new Response(resp.body, { + status: resp.status, + statusText: resp.statusText, + headers: respHeaders, + }); + } catch { + return new Response("Backend unavailable", { status: 502 }); + } + }, +}; + +// --------------------------------------------------------------------------- +// 3-tier cache resolution +// --------------------------------------------------------------------------- + +async function resolveTenant( + slug: string, + env: Env, +): Promise { + // L1: in-memory + const mem = memCache.get(slug); + if (mem && Date.now() < mem.expires) { + return mem.data; + } + + // L2: KV (stale-while-revalidate) + let kvData: TenantInfo | null = null; + try { + const kvRaw = await env.TENANT_CACHE.get(slug); + if (kvRaw) { + kvData = JSON.parse(kvRaw) as TenantInfo; + // Populate L1 from KV + memCache.set(slug, { data: kvData, expires: Date.now() + MEM_TTL_MS }); + } + } catch { /* KV read failure — continue to L3 */ } + + // L3: CP API + try { + const resp = await fetch( + `${env.CP_API_URL}/cp/orgs/${encodeURIComponent(slug)}/instance`, + { headers: { "User-Agent": "molecule-tenant-proxy/1.0" } }, + ); + + if (resp.status === 404) { + // Org doesn't exist — cache the miss briefly to avoid hammering CP + memCache.set(slug, { + data: { slug, status: "not_found", ip: null, org_id: "" }, + expires: Date.now() + 10_000, // 10s negative cache + }); + return null; + } + + if (resp.ok) { + const data = (await resp.json()) as TenantInfo; + // Update both caches + memCache.set(slug, { data, expires: Date.now() + MEM_TTL_MS }); + await 
env.TENANT_CACHE.put(slug, JSON.stringify(data), { + expirationTtl: KV_TTL_S, + }).catch(() => {}); // KV write failure is non-fatal + return data; + } + } catch { + // CP unreachable — fall back to stale KV + } + + // Fallback: stale KV data (any age) is better than an error + return kvData; +} + +// --------------------------------------------------------------------------- +// Static response pages +// --------------------------------------------------------------------------- + +function provisioningPage(slug: string): Response { + return new Response( + ` + + + + + + ${slug} - Setting up | Molecule AI + + + +
+
+

Setting up your workspace

+

Your cloud instance is starting up. This usually takes 2-3 minutes.

+

This page refreshes automatically.

+
+ +`, + { + status: 202, + headers: { + "Content-Type": "text/html;charset=utf-8", + "Cache-Control": "no-cache", + "Retry-After": "5", + }, + }, + ); +} + +function notFoundPage(slug: string): Response { + return new Response( + ` + + + + + Not Found | Molecule AI + + + +
+

Organization not found

+

${slug}.moleculesai.app doesn't exist.

+

Go to Molecule AI

+
+ +`, + { status: 404, headers: { "Content-Type": "text/html;charset=utf-8" } }, + ); +} + +function errorPage(slug: string): Response { + return new Response( + ` + + + + + Error | Molecule AI + + + +
+

Provisioning failed

+

Something went wrong setting up ${slug}.

+

Return to dashboard

+
+ +`, + { status: 503, headers: { "Content-Type": "text/html;charset=utf-8" } }, + ); +} diff --git a/infra/cloudflare-worker/wrangler.toml b/infra/cloudflare-worker/wrangler.toml new file mode 100644 index 00000000..1a0a4272 --- /dev/null +++ b/infra/cloudflare-worker/wrangler.toml @@ -0,0 +1,19 @@ +name = "molecule-tenant-proxy" +main = "src/index.ts" +compatibility_date = "2024-09-23" + +account_id = "bfa4e604e168a938e565600b27e2828c" + +# KV namespace for caching org→IP mappings (L2 cache, 5 min TTL) +[[kv_namespaces]] +binding = "TENANT_CACHE" +id = "752aaa0783514143a1eda9f44a412d7d" + +# Route: every subdomain at any depth (wildcard). Worker routes fire before DNS, so explicit +# records (api, app, www) and tunnel CNAMEs also hit the Worker; src/index.ts passes them through. +[[routes]] +pattern = "*.moleculesai.app/*" +zone_id = "a034108eda16d131ef7f766b923ef464" + +[vars] +CP_API_URL = "https://api.moleculesai.app"