From 284ef6d33a1861968edabe8d27716428244f0be9 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 14 Apr 2026 15:20:33 -0700 Subject: [PATCH] =?UTF-8?q?feat(platform):=20TenantGuard=20middleware=20?= =?UTF-8?q?=E2=80=94=20public=20repo's=20only=20SaaS=20hook?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 32 foundation. The SaaS control plane (private molecule-controlplane repo) provisions one platform instance per customer org on Fly Machines and sets MOLECULE_ORG_ID=<uuid> on the machine. Its subdomain router forwards requests with X-Molecule-Org-Id=<uuid>. TenantGuard: - When MOLECULE_ORG_ID is set → every non-allowlisted request must carry a matching X-Molecule-Org-Id header. Mismatched/missing header → 404 (not 403 — don't leak tenant existence by letting probers distinguish "wrong org" from "route doesn't exist"). - When unset → passthrough. Self-hosted / dev / CI behavior unchanged. - Allowlist is exact-match, not prefix — /health and /metrics only. No orgs table, no signup, no billing, no Fly provisioning in this repo — all that lives in the private control plane. The public repo's SaaS surface is exactly this one middleware. 6 tests covering: unset-is-passthrough, matching header, mismatched header 404 (with empty body), missing header 404, allowlist bypass, and allowlist-is-exact-match. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- CLAUDE.md | 2 +- platform/internal/middleware/tenant_guard.go | 69 +++++++++++++ .../internal/middleware/tenant_guard_test.go | 99 +++++++++++++++++++ platform/internal/router/router.go | 7 ++ 4 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 platform/internal/middleware/tenant_guard.go create mode 100644 platform/internal/middleware/tenant_guard_test.go diff --git a/CLAUDE.md b/CLAUDE.md index bf10679a..b92b221c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -122,7 +122,7 @@ go run ./cmd/server # Run server (requires Postgres + Redis running) go build -o molecli ./cmd/cli # Build TUI dashboard ./molecli # Run TUI dashboard (requires platform running) ``` -Must run from `platform/` directory (not repo root). Env vars: `DATABASE_URL`, `REDIS_URL`, `PORT`, `PLATFORM_URL` (default `http://host.docker.internal:PORT` — passed to agent containers so they can reach the platform), `SECRETS_ENCRYPTION_KEY` (optional AES-256, 32 bytes), `CONFIGS_DIR` (auto-discovered), `PLUGINS_DIR` (deprecated — plugins are now installed per-workspace via API; the `plugins/` registry at repo root is auto-discovered), `ACTIVITY_RETENTION_DAYS` (default `7`), `ACTIVITY_CLEANUP_INTERVAL_HOURS` (default `6`), `CORS_ORIGINS` (comma-separated, default `http://localhost:3000,http://localhost:3001`), `RATE_LIMIT` (requests/min, default `600`), `WORKSPACE_DIR` (optional — global fallback host path for `/workspace` bind-mount; overridden by per-workspace `workspace_dir` column in DB; if neither is set, each workspace gets an isolated Docker named volume), `AWARENESS_URL` (optional — if set, injected into workspace containers along with a deterministic `AWARENESS_NAMESPACE` derived from workspace ID), `MOLECULE_IN_DOCKER` (optional — set to `1` when the platform itself runs inside Docker so the A2A proxy rewrites `127.0.0.1:` URLs to container hostnames; auto-detected via `/.dockerenv`), `MOLECULE_ENV` (optional — set to `production` to 
hide the `/admin/workspaces/:id/test-token` E2E helper endpoint; unset or any other value leaves it enabled), `MOLECULE_ENABLE_TEST_TOKENS` (optional — set to `1` to force-enable the test-token endpoint even when `MOLECULE_ENV=production`; intended for staging runs only). +Must run from `platform/` directory (not repo root). Env vars: `DATABASE_URL`, `REDIS_URL`, `PORT`, `PLATFORM_URL` (default `http://host.docker.internal:PORT` — passed to agent containers so they can reach the platform), `SECRETS_ENCRYPTION_KEY` (optional AES-256, 32 bytes), `CONFIGS_DIR` (auto-discovered), `PLUGINS_DIR` (deprecated — plugins are now installed per-workspace via API; the `plugins/` registry at repo root is auto-discovered), `ACTIVITY_RETENTION_DAYS` (default `7`), `ACTIVITY_CLEANUP_INTERVAL_HOURS` (default `6`), `CORS_ORIGINS` (comma-separated, default `http://localhost:3000,http://localhost:3001`), `RATE_LIMIT` (requests/min, default `600`), `WORKSPACE_DIR` (optional — global fallback host path for `/workspace` bind-mount; overridden by per-workspace `workspace_dir` column in DB; if neither is set, each workspace gets an isolated Docker named volume), `AWARENESS_URL` (optional — if set, injected into workspace containers along with a deterministic `AWARENESS_NAMESPACE` derived from workspace ID), `MOLECULE_IN_DOCKER` (optional — set to `1` when the platform itself runs inside Docker so the A2A proxy rewrites `127.0.0.1:` URLs to container hostnames; auto-detected via `/.dockerenv`), `MOLECULE_ENV` (optional — set to `production` to hide the `/admin/workspaces/:id/test-token` E2E helper endpoint; unset or any other value leaves it enabled), `MOLECULE_ENABLE_TEST_TOKENS` (optional — set to `1` to force-enable the test-token endpoint even when `MOLECULE_ENV=production`; intended for staging runs only), `MOLECULE_ORG_ID` (optional — the public repo's only SaaS hook. 
When set to a UUID, every non-allowlisted request must carry a matching `X-Molecule-Org-Id` header or gets a 404; when unset, the guard is a passthrough so self-hosted / dev / CI are unaffected. Set only by the private `molecule-controlplane` provisioner on Fly Machines tenant instances — never by self-hosters). **Workspace tier resource limits** (issue #14 — override the per-tier memory/CPU caps in `provisioner.ApplyTierConfig`; CPU_SHARES follows Docker's 1024 = 1 CPU convention, translated to NanoCPUs for a hard cap): - `TIER2_MEMORY_MB` / `TIER2_CPU_SHARES` — Standard tier (defaults `512` / `1024`) diff --git a/platform/internal/middleware/tenant_guard.go b/platform/internal/middleware/tenant_guard.go new file mode 100644 index 00000000..a48c0bbc --- /dev/null +++ b/platform/internal/middleware/tenant_guard.go @@ -0,0 +1,69 @@ +package middleware + +import ( + "os" + "strings" + + "github.com/gin-gonic/gin" +) + +// Tenant-mode guard — public repo's only SaaS hook. +// +// The SaaS control plane (private `molecule-controlplane` repo) provisions one +// platform instance per customer org on Fly Machines and sets: +// - MOLECULE_ORG_ID=<uuid> (env on the machine) +// - forwards requests with X-Molecule-Org-Id=<uuid> (control-plane router) +// +// TenantGuard wraps every non-allowlisted route so a mis-routed request from +// another org bounces with 404 (not 403 — don't leak existence). +// +// When MOLECULE_ORG_ID is unset (self-hosted / dev / CI), the guard is a +// passthrough — self-hosters see no behavior change. +// +// The guard intentionally knows nothing about orgs, signup, billing, or +// provisioning. Those live in the private control-plane repo. All this code +// does is: "am I the tenant for this request? if not, 404." + +// tenantOrgIDHeader is the HTTP header the control-plane router sets when it +// uses fly-replay to route a request to a tenant machine. Case-insensitive at +// the HTTP layer (net/http canonicalizes header names). 
+const tenantOrgIDHeader = "X-Molecule-Org-Id"
+
+// tenantGuardAllowlist holds the request paths that bypass the org-header
+// check in tenant mode (/health and /metrics). Lookups are exact-match map
+// hits on the request path — never prefix matches — so a nested path such
+// as "/health/debug/admin" stays guarded.
+var tenantGuardAllowlist = map[string]struct{}{
+	"/metrics": {},
+	"/health":  {},
+}
+
+// TenantGuard builds the middleware from the MOLECULE_ORG_ID environment
+// variable. The variable is read (and whitespace-trimmed) once, when this
+// constructor runs, so a later change to the environment is not picked up
+// by an already-built guard. Use TenantGuardWithOrgID to configure the guard
+// explicitly without touching the process environment.
+func TenantGuard() gin.HandlerFunc {
+	orgID := strings.TrimSpace(os.Getenv("MOLECULE_ORG_ID"))
+	return TenantGuardWithOrgID(orgID)
+}
+
+// TenantGuardWithOrgID builds the guard for an explicit org ID. An empty
+// configuredOrgID yields a handler that only calls Next. Otherwise a request
+// proceeds when its path is allowlisted or its X-Molecule-Org-Id header equals
+// configuredOrgID; anything else is aborted with a bare 404 (rather than 403,
+// so the response does not reveal that a tenant lives here).
+func TenantGuardWithOrgID(configuredOrgID string) gin.HandlerFunc {
+	passthrough := func(c *gin.Context) { c.Next() }
+	if len(configuredOrgID) == 0 {
+		return passthrough
+	}
+	return func(c *gin.Context) {
+		_, allowlisted := tenantGuardAllowlist[c.Request.URL.Path]
+		if !allowlisted && c.GetHeader(tenantOrgIDHeader) != configuredOrgID {
+			c.AbortWithStatus(404)
+			return
+		}
+		c.Next()
+	}
+}
diff --git a/platform/internal/middleware/tenant_guard_test.go b/platform/internal/middleware/tenant_guard_test.go
new file mode 100644
index 00000000..97c0679c
--- /dev/null
+++ b/platform/internal/middleware/tenant_guard_test.go
@@ -0,0 +1,99 @@
+package middleware
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"github.com/gin-gonic/gin"
+)
+
+// newGuardedRouter builds a test-mode engine guarded for orgID with GET /health, /metrics, /workspaces.
+func newGuardedRouter(orgID string) *gin.Engine {
+	gin.SetMode(gin.TestMode)
+	engine := gin.New()
+	engine.Use(TenantGuardWithOrgID(orgID))
+	engine.GET("/health", func(c *gin.Context) { c.String(http.StatusOK, "ok") })
+	engine.GET("/metrics", func(c *gin.Context) { c.String(http.StatusOK, "metrics") })
+	engine.GET("/workspaces", func(c *gin.Context) { c.String(http.StatusOK, "workspaces") })
+	return engine
+}
+
+// doRequest sends GET path through r, setting the org header only when orgIDHeader is non-empty.
+func doRequest(r *gin.Engine, path, orgIDHeader string) *httptest.ResponseRecorder {
+	rec := httptest.NewRecorder()
+	req := httptest.NewRequest(http.MethodGet, path, nil)
+	if len(orgIDHeader) > 0 {
+		req.Header.Set("X-Molecule-Org-Id", orgIDHeader)
+	}
+	r.ServeHTTP(rec, req)
+	return rec
+}
+
+// With no org ID configured the guard is a passthrough: every route answers 200.
+func TestTenantGuard_UnsetIsPassthrough(t *testing.T) {
+	router := newGuardedRouter("")
+	for _, path := range []string{"/health", "/metrics", "/workspaces"} {
+		if got := doRequest(router, path, "").Code; got != http.StatusOK {
+			t.Errorf("%s: expected 200 with guard disabled, got %d", path, got)
+		}
+	}
+}
+
+// A request carrying the configured org ID is let through (200).
+func TestTenantGuard_MatchingHeader(t *testing.T) {
+	router := newGuardedRouter("org-abc")
+	if got := doRequest(router, "/workspaces", "org-abc").Code; got != http.StatusOK {
+		t.Errorf("matching header: expected 200, got %d", got)
+	}
+}
+
+// Set + mismatching header → 404 (not 403 — don't leak tenant existence).
+func TestTenantGuard_MismatchedHeaderIs404(t *testing.T) {
+	// A foreign org ID must be answered with a 404 and nothing else.
+	resp := doRequest(newGuardedRouter("org-abc"), "/workspaces", "org-xyz")
+	if resp.Code != http.StatusNotFound {
+		t.Errorf("mismatched header: expected 404, got %d", resp.Code)
+	}
+	// The guard aborts with a status only; an empty body gives a prober no
+	// tenant fingerprint to work with.
+	if body := resp.Body.String(); body != "" {
+		t.Errorf("expected empty body on 404, got %q", body)
+	}
+}
+
+// A request that carries no org header at all gets the same 404.
+func TestTenantGuard_MissingHeaderIs404(t *testing.T) {
+	router := newGuardedRouter("org-abc")
+	if got := doRequest(router, "/workspaces", "").Code; got != http.StatusNotFound {
+		t.Errorf("missing header: expected 404, got %d", got)
+	}
+}
+
+// Allowlisted paths must stay reachable without the header in tenant mode —
+// health probes (Fly Machines checks) and the Prometheus scrape depend on it.
+func TestTenantGuard_AllowlistBypassesCheck(t *testing.T) {
+	router := newGuardedRouter("org-abc")
+	for _, path := range []string{"/health", "/metrics"} {
+		// Empty header argument: the request carries no org ID at all.
+		if got := doRequest(router, path, "").Code; got != http.StatusOK {
+			t.Errorf("%s: allowlisted path should return 200 without header, got %d", path, got)
+		}
+	}
+}
+
+// The allowlist is exact-match, not prefix. "/health/debug" must NOT bypass.
+func TestTenantGuard_AllowlistIsExactMatch(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	engine := gin.New()
+	engine.Use(TenantGuardWithOrgID("org-abc"))
+	engine.GET("/health/debug", func(c *gin.Context) { c.String(200, "debug") })
+
+	// No org header is sent: only an allowlist hit could let this through.
+	rec := httptest.NewRecorder()
+	engine.ServeHTTP(rec, httptest.NewRequest("GET", "/health/debug", nil))
+
+	if got := rec.Code; got != http.StatusNotFound {
+		t.Errorf("expected /health/debug to be guarded (404), got %d", got)
+	}
+}
diff --git a/platform/internal/router/router.go b/platform/internal/router/router.go
index 6bf08b3a..b8b9730b 100644
--- a/platform/internal/router/router.go
+++ b/platform/internal/router/router.go
@@ -51,6 +51,13 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
 	// Must be registered after rate limiter so aborted requests are also counted.
 	r.Use(metrics.Middleware())
 
+	// Tenant guard — the public repo's only SaaS hook. When MOLECULE_ORG_ID is
+	// set (only by the private molecule-controlplane provisioner on tenant Fly
+	// Machines), rejects requests whose X-Molecule-Org-Id header doesn't match.
+	// Unset (self-hosted / dev / CI) → no-op. Registered after metrics so
+	// rejected requests still land on the 4xx counter.
+	r.Use(middleware.TenantGuard())
+
 	// Health
 	r.GET("/health", func(c *gin.Context) {
 		c.JSON(200, gin.H{"status": "ok"})