From bfdb0e3b6996997f240ef3e1569bf94321e2f4cf Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwang.rabbit@users.noreply.github.com>
Date: Fri, 17 Apr 2026 23:58:55 -0700
Subject: [PATCH] docs: Cloudflare Tunnel migration report + track Worker
 source

- Full session retrospective: tunnel E2E verified on prod + staging subdomains
- Worker source tracked in infra/cloudflare-worker/ (was only in /tmp)
- Worker changes: reserved slug passthrough + multi-level subdomain bypass
- Known issues, follow-ups, cost impact, key learnings documented

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../2026-04-18-tunnel-migration.md            | 223 ++++++++++++++
 infra/cloudflare-worker/package.json          |  11 +
 infra/cloudflare-worker/src/index.ts          | 280 ++++++++++++++++++
 infra/cloudflare-worker/wrangler.toml         |  19 ++
 4 files changed, 533 insertions(+)
 create mode 100644 docs/retrospectives/2026-04-18-tunnel-migration.md
 create mode 100644 infra/cloudflare-worker/package.json
 create mode 100644 infra/cloudflare-worker/src/index.ts
 create mode 100644 infra/cloudflare-worker/wrangler.toml

diff --git a/docs/retrospectives/2026-04-18-tunnel-migration.md b/docs/retrospectives/2026-04-18-tunnel-migration.md
new file mode 100644
index 00000000..997bd94a
--- /dev/null
+++ b/docs/retrospectives/2026-04-18-tunnel-migration.md
@@ -0,0 +1,223 @@
+# Cloudflare Tunnel Migration — Session Report (2026-04-18)
+
+> **Duration:** ~4 hours
+> **Scope:** Replace Cloudflare Worker + wildcard DNS with per-tenant Cloudflare Tunnels
+> **Issue:** #933
+> **Status:** Tunnel E2E verified on both production and staging subdomains. Ready for production tenant migration.
+
+---
+
+## What Was Done
+
+### 1. PR Triage (15 PRs merged)
+
+Before tunnel work, cleared the PR backlog since CI runner was slow:
+
+| PR | Type | Description |
+|----|------|-------------|
+| #934 | docs | Staging environment design + Phase 36 plan |
+| #849 | docs | Partner API Keys (Phase 34) — resolved PLAN.md conflict |
+| #922 | docs | ANTHROPIC_API_KEY as required global secret |
+| #880 | docs | SAFE-MCP internal advisory |
+| #927 | docs | Ecosystem watch daily sweep |
+| #923 | security | Slack OAuth state param — random nonce replaces workspace_id |
+| #913 | security | Redact secrets from commit_memory before persistence |
+| #925 | security | HITL audit log on approval grant/denial |
+| #879 | fix | Canvas TypeScript fixture drift |
+| #915 | feature | A2A topology overlay + hermes plugin declarations |
+| #921 | feature | Audit trail visualization panel |
+| #929 | feature | Temporal crash-resume checkpoints |
+| #937 | fix | go vet errors + supply chain hardening (created + merged) |
+| #938 | fix | Canvas a11y — TeamMemberChip keyboard nav (created + merged) |
+
+Also closed issue #920 (Slack OAuth) and commented on #889 (VULN-004 dead letter).
+
+### 2. Cloudflare API Token — Tunnel Permission
+
+**Problem:** The existing CF API token (`cfut_loLR...`) had DNS:Edit but NOT Cloudflare Tunnel:Edit permission. Tunnel create/list/delete calls returned `code 10000: Authentication error`.
+
+**Fix:** CEO added Account → Cloudflare Tunnel → Edit permission in Cloudflare Dashboard → API Tokens.
+
+### 3. Tunnel API Integration Tests
+
+Ran three progressively comprehensive tests:
+
+| Test | Result | What it proved |
+|------|--------|----------------|
+| API roundtrip | ✓ | Create tunnel → create DNS CNAME → delete both |
+| DNS resolution | ✓ | CNAME resolves on first attempt (instant, zero propagation delay) |
+| Full E2E with EC2 | ✓ | Tunnel + DNS + EC2 with cloudflared → HTTP 200 through subdomain |
+
+### 4. Worker Coexistence Fix
+
+**Problem:** The Cloudflare Worker route `*.moleculesai.app/*` intercepted tunnel CNAME requests before they could reach the tunnel origin. Tunnel subdomains got the Worker's "Organization not found" page instead of routing through the tunnel.
+
+**Fix (two changes to Worker):**
+
+```typescript
+// 1. Reserved slugs now pass through instead of returning 404
+if (!slug || slug === host || RESERVED.has(slug) || slug.includes(".")) {
+  return fetch(request);  // was: return new Response("Not found", { status: 404 });
+}
+
+// 2. Multi-level subdomains (*.staging.moleculesai.app) bypass Worker entirely
+// slug.includes(".") catches "foo.staging" and passes to tunnel CNAME
+```
+
+Worker redeployed. Production tenants unaffected — they still route through the Worker. Tunnel-routed subdomains pass through to origin.
+
+### 5. SSL Certificate for Staging Subdomains
+
+**Problem:** Cloudflare's free Universal SSL only covers `*.moleculesai.app` (one wildcard level). `*.staging.moleculesai.app` (two levels) fails TLS handshake — no certificate.
+
+**Fix:** Ordered Advanced Certificate via Cloudflare Dashboard:
+- Hostnames: `*.staging.moleculesai.app`, `staging.moleculesai.app`
+- CA: Let's Encrypt
+- Validity: 90 days, auto-renewal 30 days before expiry
+- Cost: included in Cloudflare free plan (1 of 100 advanced certs)
+
+### 6. Staging Tunnel E2E — Full Pass
+
+Final test on `*.staging.moleculesai.app` (fully isolated from production):
+
+```
+1. Create Tunnel           → OK (ea5aaa13...)
+2. Configure ingress       → OK (→ localhost:8080)
+3. Create DNS CNAME        → OK (tunnel-stg-test.staging.moleculesai.app)
+4. Launch EC2 t3.micro     → OK (cloudflared binary download)
+5. Tunnel connected        → OK (healthy in 30s)
+6. HTTP 200 through tunnel → OK
+   Response: {"status":"ok","domain":"tunnel-stg-test.staging.moleculesai.app"}
+7. Cleanup                 → OK (EC2 terminated, DNS + tunnel deleted)
+```
+
+### 7. Platform Build Verification
+
+After merging 15 PRs, verified everything still builds and passes:
+- Go: `go test -race ./...` — 15/15 packages pass, 0 failures
+- Go: `go vet ./...` — clean
+- Canvas: `npm run build` — success
+- Canvas: `vitest run` — 762/762 tests pass
+
+---
+
+## Architecture: Before vs After
+
+### Before (Cloudflare Worker)
+
+```
+User → *.moleculesai.app (wildcard A record, proxied)
+     → Cloudflare Worker (extracts slug, looks up EC2 IP from CP API)
+     → Worker proxies to EC2 public IP:8080
+     → EC2 must have public IP + open port 8080
+```
+
+**Problems:**
+- Edge cache poisoning when wildcard A record IP changes (2+ hour recovery)
+- ADMIN_TOKEN transmitted in plaintext via Worker header injection
+- EC2 requires public IP + open inbound ports (security surface)
+- Worker is a single point of failure for all tenant routing
+- KV cache stale-while-revalidate adds latency on cold starts
+
+### After (Cloudflare Tunnel)
+
+```
+User → slug.moleculesai.app (CNAME → tunnel-id.cfargotunnel.com, proxied)
+     → Cloudflare edge routes to tunnel
+     → cloudflared on EC2 (outbound-only connection) receives request
+     → cloudflared forwards to localhost:8080
+     → EC2 needs NO public IP, NO open inbound ports
+```
+
+**Advantages:**
+- No edge cache — CNAME resolves instantly via Cloudflare's anycast
+- No plaintext secrets in transit — tunnel is encrypted end-to-end
+- EC2 can be in private subnet (no public IP, no security group rules)
+- Each tenant has its own tunnel (no single point of failure)
+- No Worker maintenance, no KV cache management
+- Faster provisioning — DNS works immediately, no cache warming
+
+---
+
+## Known Issues & Risks
+
+### 1. Worker Must Stay Until All Tenants Migrate
+The Worker route `*.moleculesai.app/*` still serves existing tenants (e.g., `hongmingwang.moleculesai.app`). Cannot delete until every tenant has a tunnel + CNAME. The Worker passthrough for reserved/multi-level slugs is the bridge.
+
+### 2. Worker Source Was Not in Version Control
+During the session the Worker code lived only in `/tmp/molecule-tenant-proxy/`, untracked in any repo. It is now committed under `infra/cloudflare-worker/` in the same commit as this report. Two changes were deployed:
+- `fetch(request)` passthrough for reserved slugs (was `404`)
+- `slug.includes(".")` bypass for multi-level subdomains
+
+### 3. cloudflared Binary Download at Boot
+Current EC2 user-data downloads `cloudflared` from GitHub releases at boot time. This adds ~5 seconds and depends on GitHub availability. Pre-baked AMI would eliminate this dependency.
+
+### 4. Tunnel Token in User-Data
+The `cloudflared` tunnel token is passed in EC2 user-data (base64 encoded). AWS user-data is accessible to anyone with EC2 instance metadata access. The token grants tunnel connection rights — if leaked, an attacker could impersonate the tenant's tunnel. Mitigation: use AWS Secrets Manager or SSM Parameter Store instead.
+
+### 5. Tunnel Cleanup on Org Delete
+The `DeprovisionInstance` function has a TODO for tunnel deletion. When an org is deleted, the tunnel and DNS CNAME must be cleaned up. The tunnel ID is stored in EC2 tags (`TunnelID`), but needs to be persisted in `org_instances` table for reliable cleanup.
+
+### 6. No Health Check on Tunnel
+If `cloudflared` crashes on the EC2 but the instance stays running, the tunnel goes inactive but the DNS CNAME still points to it. Need a health sweep that checks tunnel status via CF API and restarts `cloudflared` or the instance.
+
+### 7. Staging CP Uses Production Tenant Image
+`TENANT_IMAGE` on staging is still `ghcr.io/molecule-ai/platform-tenant:latest` (production). Should be `:staging` once the staging image pipeline is set up.
+
+---
+
+## Follow-Up Tasks
+
+### Immediate (before next deploy)
+
+- [x] **Commit Worker code to repo** — resolved in the commit that adds this report: current state, including the two passthrough changes, is tracked in the monorepo under `infra/cloudflare-worker/`
+- [ ] **Persist tunnel ID in org_instances table** — add `tunnel_id` column so deprovision cascade can clean up tunnels reliably
+- [ ] **Wire tunnel cleanup into DeprovisionInstance** — delete tunnel + DNS CNAME when org is deleted
+
+### Short-term (this week)
+
+- [ ] **Migrate `hongmingwang` tenant to tunnel** — create tunnel, add CNAME, update EC2 to run cloudflared, add slug to Worker RESERVED, verify, then remove old A record
+- [ ] **Staging image pipeline** — publish `:staging` tag on main merge, `:latest` only on manual promote
+- [ ] **Move tunnel token to SSM Parameter Store** — EC2 user-data is not secret-safe; retrieve token at boot via instance role
+
+### Medium-term (this month)
+
+- [ ] **Pre-baked AMI with cloudflared** — eliminate GitHub download dependency at boot
+- [ ] **Tunnel health sweep** — periodic check of tunnel status via CF API, restart cloudflared if inactive
+- [ ] **Delete Worker** — once all tenants are on tunnels, remove Worker + wildcard A record entirely
+- [ ] **Private subnet for tenant EC2s** — with tunnels, EC2s don't need public IPs; move to private subnet with NAT gateway for outbound
+
+### Nice-to-have
+
+- [ ] **Cloudflare Access** — add zero-trust access policies on tunnel routes (IP allow-list, mTLS)
+- [ ] **Tunnel metrics** — export tunnel connection count, latency, bandwidth to Prometheus/Grafana
+- [ ] **Multi-region tunnels** — cloudflared connects to nearest Cloudflare edge; for multi-region deployments, each region's EC2 gets its own tunnel
+
+---
+
+## Cost Impact
+
+| Item | Before | After |
+|------|--------|-------|
+| Cloudflare Worker | Free (100k req/day) | Eliminated |
+| Workers KV | Free tier | Eliminated |
+| Advanced SSL Cert | $0 | $0 (1 of 100 free) |
+| EC2 public IPs | ~$3.65/mo per tenant | $0 (no public IP needed) |
+| Cloudflare Tunnel | N/A | Free (unlimited tunnels) |
+| **Net change** | | **Saves ~$3.65/tenant/mo** |
+
+---
+
+## Key Learnings
+
+1. **Worker routes take priority over DNS CNAMEs** — even with a CNAME pointing to `cfargotunnel.com`, the Worker's wildcard route fires first. Must explicitly pass through via `fetch(request)`.
+
+2. **Free Universal SSL only covers one wildcard level** — `*.moleculesai.app` works, `*.staging.moleculesai.app` doesn't. Advanced Certificate (free, Let's Encrypt) solves this.
+
+3. **Let's Encrypt rejects mixed wildcard+parent certs** — can't put `*.moleculesai.app` and `*.staging.moleculesai.app` in the same cert. Issue separate certs for each level.
+
+4. **Tunnel connects in ~30 seconds** — from EC2 boot to tunnel healthy, including cloudflared binary download (~5s) + connection establishment (~25s). Faster than DNS propagation ever was.
+
+5. **DNS CNAME resolves instantly** — no propagation delay, no edge cache, no NXDOMAIN caching. This is the fundamental advantage over the wildcard A record approach.
+
+6. **cloudflared binary download is faster than apt** — `curl` from GitHub releases (~5s) vs `apt-get install cloudflared` (~30s). Use binary download in boot scripts.
diff --git a/infra/cloudflare-worker/package.json b/infra/cloudflare-worker/package.json
new file mode 100644
index 00000000..5981ecb8
--- /dev/null
+++ b/infra/cloudflare-worker/package.json
@@ -0,0 +1,11 @@
+{
+  "name": "molecule-tenant-proxy",
+  "private": true,
+  "scripts": {
+    "dev": "wrangler dev",
+    "deploy": "wrangler deploy"
+  },
+  "devDependencies": {
+    "wrangler": "^4.0.0"
+  }
+}
diff --git a/infra/cloudflare-worker/src/index.ts b/infra/cloudflare-worker/src/index.ts
new file mode 100644
index 00000000..fedee0c3
--- /dev/null
+++ b/infra/cloudflare-worker/src/index.ts
@@ -0,0 +1,280 @@
+/**
+ * Molecule AI tenant proxy — Cloudflare Worker
+ *
+ * Routes *.moleculesai.app requests to the correct EC2 tenant instance.
+ * Replaces per-tenant DNS records with a single wildcard + edge routing.
+ *
+ * Cache strategy (3-tier):
+ *   L1: in-memory Map (60s TTL, per-isolate)
+ *   L2: Workers KV (5 min TTL; read on every L1 miss, returned only if L3 fails)
+ *   L3: CP API — GET /cp/orgs/:slug/instance
+ *   Fallback: serve stale KV when CP is unreachable
+ */
+
+export interface Env {
+  TENANT_CACHE: KVNamespace; // KV binding used as the L2 tenant cache (keyed by slug)
+  CP_API_URL: string; // control-plane base URL; "/cp/orgs/<slug>/instance" is appended to it
+}
+
+interface TenantInfo {
+  slug: string;
+  status: string; // "running" | "provisioning" | "failed"; "not_found" marks an L1 negative-cache entry
+  ip: string | null; // backend host proxied to on :8080; a null ip is never proxied
+  org_id: string; // forwarded to the backend as the X-Molecule-Org-Id header
+  admin_token?: string; // when set, sent to the backend as "Authorization: Bearer <token>"
+}
+
+// L1: in-memory cache (per-isolate, 60s TTL), keyed by tenant slug
+const memCache = new Map<string, { data: TenantInfo; expires: number }>();
+const MEM_TTL_MS = 60_000; // lifetime of a regular L1 entry, in milliseconds
+const KV_TTL_S = 300; // 5 min — expirationTtl passed to KV on write
+
+// Subdomains that are NOT tenants — the fetch handler passes these through to origin
+const RESERVED = new Set(["api", "app", "www", "docs", "doc", "status", "staging-api", "tunneltest"]);
+
+// Path prefixes for platform (:8080) vs canvas (:3000). Not referenced in this file: the handler sends every path to :8080.
+const API_PREFIXES = [
+  "/health", "/metrics", "/workspaces", "/registry", "/templates",
+  "/org", "/settings", "/plugins", "/events", "/bundles", "/channels",
+  "/webhooks", "/approvals", "/admin", "/canvas", "/ws",
+];
+
+export default {
+  /**
+   * Proxies a tenant request to its backend on :8080. Non-tenant hosts pass
+   * through to origin; missing, failed, or unready tenants get a static page.
+   */
+  async fetch(request: Request, env: Env): Promise<Response> {
+    const url = new URL(request.url);
+    const host = url.hostname;
+
+    // Extract slug from hostname: "acme.moleculesai.app" → "acme"
+    const slug = host.replace(".moleculesai.app", "");
+    if (!slug || slug === host || RESERVED.has(slug) || slug.includes(".")) {
+      // Pass through to origin (tunnel CNAME or explicit DNS record).
+      // slug.includes(".") catches multi-level subdomains like
+      // "foo.staging.moleculesai.app" which are routed via CF Tunnel.
+      return fetch(request);
+    }
+
+    // Lookup tenant backend; a cached miss (status "not_found") is also a 404
+    const tenant = await resolveTenant(slug, env);
+    if (!tenant || tenant.status === "not_found") return notFoundPage(slug);
+
+    // "failed" is checked before the no-IP test: a failed tenant whose ip is
+    // null would otherwise be served the auto-refreshing provisioning page.
+    if (tenant.status === "failed") return errorPage(slug);
+    if (tenant.status === "provisioning" || !tenant.ip) {
+      return provisioningPage(slug);
+    }
+
+    // Route ALL traffic to :8080 (Go platform). The platform proxies non-API
+    // routes to Canvas internally via CANVAS_PROXY_URL. We don't split traffic
+    // between :8080 and :3000 because Canvas may bind to 127.0.0.1 only
+    // (not externally reachable) while the platform is always on 0.0.0.0.
+    const backendUrl = `http://${tenant.ip}:8080${url.pathname}${url.search}`;
+
+    // WebSocket upgrade (the Upgrade token is compared case-insensitively)
+    if (request.headers.get("Upgrade")?.toLowerCase() === "websocket") {
+      return fetch(backendUrl, request);
+    }
+
+    // Proxy the request
+    const headers = new Headers(request.headers);
+    headers.set("X-Molecule-Org-Id", tenant.org_id);
+    headers.set("Origin", `https://${slug}.moleculesai.app`);
+    headers.set("X-Forwarded-For", request.headers.get("CF-Connecting-IP") || "");
+    headers.set("X-Forwarded-Proto", "https");
+    headers.set("Host", `${slug}.moleculesai.app`);
+    // Inject ADMIN_TOKEN for AdminAuth — the tenant platform validates this
+    // as a dedicated admin credential (not a workspace token).
+    if (tenant.admin_token) {
+      headers.set("Authorization", `Bearer ${tenant.admin_token}`);
+    }
+
+    const proxyReq = new Request(backendUrl, {
+      method: request.method,
+      headers,
+      body: request.body,
+      redirect: "manual",
+    });
+
+    try {
+      const resp = await fetch(proxyReq);
+      // Strip backend hop headers, pass everything else through
+      const respHeaders = new Headers(resp.headers);
+      respHeaders.delete("transfer-encoding");
+      return new Response(resp.body, {
+        status: resp.status,
+        statusText: resp.statusText,
+        headers: respHeaders,
+      });
+    } catch {
+      return new Response("Backend unavailable", { status: 502 });
+    }
+  },
+};
+
+// ---------------------------------------------------------------------------
+// 3-tier cache resolution
+// ---------------------------------------------------------------------------
+
+/**
+ * Resolves a slug via L1 memory → L2 KV → L3 CP API. Returns null on a CP 404
+ * (negatively cached for 10s) or when the CP lookup fails with no KV entry.
+ */
+async function resolveTenant(slug: string, env: Env): Promise<TenantInfo | null> {
+  // L1: in-memory
+  const mem = memCache.get(slug);
+  if (mem && Date.now() < mem.expires) {
+    // Negative-cache entries are placeholders, not tenants: report a miss.
+    return mem.data.status === "not_found" ? null : mem.data;
+  }
+
+  // L2: KV — read now, but only returned if the CP lookup below fails
+  let kvData: TenantInfo | null = null;
+  try {
+    const kvRaw = await env.TENANT_CACHE.get(slug);
+    if (kvRaw) {
+      kvData = JSON.parse(kvRaw) as TenantInfo;
+      // Populate L1 from KV
+      memCache.set(slug, { data: kvData, expires: Date.now() + MEM_TTL_MS });
+    }
+  } catch { /* KV read failure — continue to L3 */ }
+
+  // L3: CP API
+  try {
+    const resp = await fetch(
+      `${env.CP_API_URL}/cp/orgs/${encodeURIComponent(slug)}/instance`,
+      { headers: { "User-Agent": "molecule-tenant-proxy/1.0" } },
+    );
+
+    if (resp.status === 404) {
+      // Org doesn't exist — cache the miss briefly to avoid hammering CP
+      memCache.set(slug, {
+        data: { slug, status: "not_found", ip: null, org_id: "" },
+        expires: Date.now() + 10_000, // 10s negative cache
+      });
+      return null;
+    }
+
+    if (resp.ok) {
+      const data = (await resp.json()) as TenantInfo;
+      // Update both caches
+      memCache.set(slug, { data, expires: Date.now() + MEM_TTL_MS });
+      await env.TENANT_CACHE.put(slug, JSON.stringify(data), {
+        expirationTtl: KV_TTL_S,
+      }).catch(() => {}); // KV write failure is non-fatal
+      return data;
+    }
+  } catch { /* CP unreachable — fall back to the KV copy, if any */ }
+
+  // Fallback: a cached KV record (or null if none) instead of throwing
+  return kvData;
+}
+
+// ---------------------------------------------------------------------------
+// Static response pages
+// ---------------------------------------------------------------------------
+
+function provisioningPage(slug: string): Response { // "Setting up your workspace" page; slug appears only in the <title>
+  return new Response( // NOTE(review): slug is interpolated into the HTML unescaped
+    `<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width,initial-scale=1">
+  <meta http-equiv="refresh" content="5">
+  <title>${slug} - Setting up | Molecule AI</title>
+  <style>
+    *{margin:0;padding:0;box-sizing:border-box}
+    body{background:#09090b;color:#f4f4f5;font-family:-apple-system,BlinkMacSystemFont,sans-serif;
+         display:flex;align-items:center;justify-content:center;min-height:100vh}
+    .card{text-align:center;max-width:420px;padding:3rem 2rem}
+    .spinner{width:48px;height:48px;border:3px solid #27272a;border-top-color:#3b82f6;
+             border-radius:50%;animation:spin 1s linear infinite;margin:0 auto 1.5rem}
+    @keyframes spin{to{transform:rotate(360deg)}}
+    h1{font-size:1.25rem;font-weight:600;margin-bottom:.5rem}
+    p{font-size:.875rem;color:#a1a1aa;line-height:1.6}
+    .hint{margin-top:1.5rem;font-size:.75rem;color:#52525b}
+  </style>
+</head>
+<body>
+  <div class="card">
+    <div class="spinner"></div>
+    <h1>Setting up your workspace</h1>
+    <p>Your cloud instance is starting up. This usually takes 2-3 minutes.</p>
+    <p class="hint">This page refreshes automatically.</p>
+  </div>
+</body>
+</html>`,
+    {
+      status: 202, // paired with Retry-After and the 5s meta refresh in the markup
+      headers: {
+        "Content-Type": "text/html;charset=utf-8",
+        "Cache-Control": "no-cache",
+        "Retry-After": "5",
+      },
+    },
+  );
+}
+
+function notFoundPage(slug: string): Response { // 404 page naming the unknown <slug>.moleculesai.app host
+  return new Response( // NOTE(review): slug is interpolated into the HTML unescaped
+    `<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width,initial-scale=1">
+  <title>Not Found | Molecule AI</title>
+  <style>
+    *{margin:0;padding:0;box-sizing:border-box}
+    body{background:#09090b;color:#f4f4f5;font-family:-apple-system,BlinkMacSystemFont,sans-serif;
+         display:flex;align-items:center;justify-content:center;min-height:100vh}
+    .card{text-align:center;max-width:420px;padding:3rem 2rem}
+    h1{font-size:1.25rem;font-weight:600;margin-bottom:.5rem}
+    p{font-size:.875rem;color:#a1a1aa;line-height:1.6}
+    a{color:#3b82f6;text-decoration:none}a:hover{text-decoration:underline}
+  </style>
+</head>
+<body>
+  <div class="card">
+    <h1>Organization not found</h1>
+    <p><strong>${slug}.moleculesai.app</strong> doesn't exist.</p>
+    <p style="margin-top:1rem"><a href="https://app.moleculesai.app">Go to Molecule AI</a></p>
+  </div>
+</body>
+</html>`,
+    { status: 404, headers: { "Content-Type": "text/html;charset=utf-8" } },
+  );
+}
+
+function errorPage(slug: string): Response { // 503 "Provisioning failed" page naming the given slug
+  return new Response( // NOTE(review): slug is interpolated into the HTML unescaped
+    `<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width,initial-scale=1">
+  <title>Error | Molecule AI</title>
+  <style>
+    *{margin:0;padding:0;box-sizing:border-box}
+    body{background:#09090b;color:#f4f4f5;font-family:-apple-system,BlinkMacSystemFont,sans-serif;
+         display:flex;align-items:center;justify-content:center;min-height:100vh}
+    .card{text-align:center;max-width:420px;padding:3rem 2rem}
+    h1{font-size:1.25rem;font-weight:600;margin-bottom:.5rem;color:#ef4444}
+    p{font-size:.875rem;color:#a1a1aa;line-height:1.6}
+    a{color:#3b82f6;text-decoration:none}a:hover{text-decoration:underline}
+  </style>
+</head>
+<body>
+  <div class="card">
+    <h1>Provisioning failed</h1>
+    <p>Something went wrong setting up <strong>${slug}</strong>.</p>
+    <p style="margin-top:1rem"><a href="https://app.moleculesai.app">Return to dashboard</a></p>
+  </div>
+</body>
+</html>`,
+    { status: 503, headers: { "Content-Type": "text/html;charset=utf-8" } },
+  );
+}
diff --git a/infra/cloudflare-worker/wrangler.toml b/infra/cloudflare-worker/wrangler.toml
new file mode 100644
index 00000000..1a0a4272
--- /dev/null
+++ b/infra/cloudflare-worker/wrangler.toml
@@ -0,0 +1,19 @@
+name = "molecule-tenant-proxy"
+main = "src/index.ts"
+compatibility_date = "2024-09-23"
+
+account_id = "bfa4e604e168a938e565600b27e2828c"
+
+# KV namespace for caching org→IP mappings (L2 cache, 5 min TTL)
+[[kv_namespaces]]
+binding = "TENANT_CACHE"
+id = "752aaa0783514143a1eda9f44a412d7d"
+
+# Route: every subdomain (wildcard). The Worker route fires before DNS records
+# are consulted, so src/index.ts must pass reserved slugs (api, app, www, …) through.
+[[routes]]
+pattern = "*.moleculesai.app/*"
+zone_id = "a034108eda16d131ef7f766b923ef464"
+
+[vars]
+CP_API_URL = "https://api.moleculesai.app"