From 644734bb7cf10ad143a98eaa3afb6bad75e55158 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 7 Jun 2026 21:31:29 -0700 Subject: [PATCH] fix(registry): allow pending-DNS platform tunnel URL at register (#36/#2421) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cross-cloud workspaces (e.g. Hetzner under a GCP tenant) register advertising their per-workspace Cloudflare tunnel hostname ws-<id>.<domain>. That DNS record is eventually-consistent, and a FAST-booting box (a Hetzner cpx reports 'workspace ready after ~1s') registers BEFORE it propagates → validateAgentURL's net.LookupIP fails → the handler returns 400 → and the runtime does NOT retry a 4xx → so agent_card never lands and the agent never comes online. AWS/GCP boot slowly enough to miss the race, which is why ONLY the fast cloud broke. Diagnosed live: faithful Hetzner repro boxes register against a warm tenant and still 400 with {"error":"hostname \"ws-...\" cannot be resolved (DNS error)..."} Fix: when DNS resolution fails, allow the hostname through in SaaS mode iff it is a platform-tunnel hostname (ws-<id> under the platform's own domain, MOLECULE_APP_DOMAIN default moleculesai.app). Such a hostname is NOT an SSRF vector — only the platform controls DNS there, so an attacker cannot point it at 169.254/127/private space, and the unconditional metadata/loopback blocks still apply once it resolves. Restores the pre-#1130 'let an unresolvable platform URL through' behaviour, scoped to the trusted tunnel domain. Self-hosted keeps the strict block. This is the register half of #36; the provision half (Hetzner location capacity failover) shipped in cp#619. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../internal/handlers/registry.go | 43 +++++++++++++++++-- .../internal/handlers/registry_test.go | 36 ++++++++++++++++ 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/workspace-server/internal/handlers/registry.go b/workspace-server/internal/handlers/registry.go index 2856e5ab0..2964f9e1f 100644 --- a/workspace-server/internal/handlers/registry.go +++ b/workspace-server/internal/handlers/registry.go @@ -261,9 +261,28 @@ func validateAgentURL(rawURL string) error { // the agent won't be reachable anyway, so blocking on DNS failure is safe. ips, lookupErr := net.LookupIP(hostname) if lookupErr != nil { - // DNS lookup failed — block the URL rather than allow a potentially- - // unreachable or intentionally-unresolvable hostname through. The - // platform has no use for a workspace it cannot reach. + // #36/#2421: a freshly-provisioned CROSS-CLOUD workspace advertises its + // per-workspace Cloudflare tunnel hostname (ws-<id>.<domain>). That DNS + // record is eventually-consistent, and a FAST-booting box (a Hetzner cpx + // reports "workspace ready after ~1s") registers BEFORE the record + // propagates → the lookup fails → 400 → and the runtime does not retry a + // 4xx → agent_card never lands and the agent never comes online. AWS boots + // slowly enough to miss the race, which is why only the fast cloud broke. + // + // Such a hostname is NOT an SSRF vector: it lives under the platform's own + // domain (only the platform can create records there, so it can't be + // pointed at 169.254/127/private space by an attacker), and it resolves to + // nothing right now. So in SaaS mode allow a platform-tunnel hostname + // through while its DNS settles; everything else stays blocked. The + // unconditional metadata/loopback blocks above still apply once it + // resolves. (Restores the pre-#1130 "let an unresolvable platform URL + // through" behaviour, scoped to the trusted tunnel domain.) 
+ if saasMode() && isPlatformTunnelHostname(hostname) { + log.Printf("Registry validateAgentURL: allowing not-yet-resolvable platform tunnel hostname %q (DNS still propagating)", hostname) + return nil + } + // DNS lookup failed for a non-platform hostname — block it. The platform + // has no use for a workspace it cannot reach. return fmt.Errorf("hostname %q cannot be resolved (DNS error): %w", hostname, lookupErr) } for _, ip := range ips { @@ -274,6 +293,24 @@ func validateAgentURL(rawURL string) error { return nil } +// isPlatformTunnelHostname reports whether h is a platform-provisioned per- +// workspace Cloudflare tunnel hostname — `ws-<id>.<domain>` under the +// platform's OWN domain. Only the platform controls DNS there, so a not-yet- +// resolvable such hostname is a pending-DNS tunnel (DNS propagation race), never +// an attacker-controlled SSRF URL. The domain defaults to moleculesai.app +// (covers prod `*.moleculesai.app` and staging `*.staging.moleculesai.app`) and +// is overridable via MOLECULE_APP_DOMAIN for other deployments. +func isPlatformTunnelHostname(h string) bool { + if !strings.HasPrefix(h, "ws-") { + return false + } + domain := strings.TrimSpace(os.Getenv("MOLECULE_APP_DOMAIN")) + if domain == "" { + domain = "moleculesai.app" + } + return strings.HasSuffix(h, "."+domain) +} + // Register handles POST /registry/register // Upserts workspace, sets Redis TTL, broadcasts WORKSPACE_ONLINE. 
func (h *RegistryHandler) Register(c *gin.Context) { diff --git a/workspace-server/internal/handlers/registry_test.go b/workspace-server/internal/handlers/registry_test.go index 51640051a..a4058241d 100644 --- a/workspace-server/internal/handlers/registry_test.go +++ b/workspace-server/internal/handlers/registry_test.go @@ -882,6 +882,42 @@ func TestValidateAgentURL_SaaSMode_AllowsRFC1918(t *testing.T) { } } +// TestValidateAgentURL_PendingPlatformTunnel (#36/#2421): a freshly-provisioned +// cross-cloud workspace advertises its per-workspace tunnel hostname +// (ws-<id>.<domain>) whose DNS has not propagated yet when a FAST box (Hetzner +// ~1s boot) registers. validateAgentURL must allow such a platform-tunnel +// hostname through in SaaS mode instead of 400 (which the runtime never retries +// → agent_card never lands). Non-platform unresolvable hostnames stay blocked. +func TestValidateAgentURL_PendingPlatformTunnel(t *testing.T) { + for _, tc := range []struct { + h string + want bool + }{ + {"ws-abc123.moleculesai.app", true}, + {"ws-abc123.staging.moleculesai.app", true}, + {"ws-abc123.evil.com", false}, // not under the platform domain + {"api.moleculesai.app", false}, // no ws- prefix + {"ws-x.fakemoleculesai.app", false}, // lookalike domain, not a subdomain + } { + if got := isPlatformTunnelHostname(tc.h); got != tc.want { + t.Errorf("isPlatformTunnelHostname(%q)=%v want %v", tc.h, got, tc.want) + } + } + t.Setenv("MOLECULE_ORG_ID", "") + t.Setenv("MOLECULE_DEPLOY_MODE", "saas") + // A platform tunnel hostname is allowed — whether or not its DNS has + // propagated (a resolved record is a public Cloudflare IP = allowed; an + // unresolved one is allowed by the pending-tunnel branch). + if err := validateAgentURL("https://ws-deadbeef0001.staging.moleculesai.app/a2a"); err != nil { + t.Errorf("SaaS: pending platform tunnel must be allowed, got %v", err) + } + // A NON-platform unresolvable hostname stays blocked even in SaaS + // (.invalid never resolves — RFC 2606). 
+ if err := validateAgentURL("https://ws-x.attacker.invalid/a2a"); err == nil { + t.Error("SaaS: non-platform unresolvable hostname must stay blocked") + } +} + // TestValidateAgentURL_SaaSMode_StillBlocksMetadataEtAl verifies that even in // SaaS mode the always-blocked ranges (metadata, loopback, TEST-NET, CGNAT, // non-fd00 ULA) stay blocked. -- 2.52.0