fix(registry): allow pending-DNS platform tunnel URL at register (#36 register half) #2425

Merged
devops-engineer merged 1 commits from fix/validate-agent-url-pending-tunnel into main 2026-06-08 04:44:04 +00:00
2 changed files with 76 additions and 3 deletions
+40 -3
View File
@@ -261,9 +261,28 @@ func validateAgentURL(rawURL string) error {
// the agent won't be reachable anyway, so blocking on DNS failure is safe.
ips, lookupErr := net.LookupIP(hostname)
if lookupErr != nil {
// DNS lookup failed — block the URL rather than allow a potentially-
// unreachable or intentionally-unresolvable hostname through. The
// platform has no use for a workspace it cannot reach.
// #36/#2421: a freshly-provisioned CROSS-CLOUD workspace advertises its
// per-workspace Cloudflare tunnel hostname (ws-<id>.<appDomain>). That DNS
// record is eventually-consistent, and a FAST-booting box (a Hetzner cpx
// reports "workspace ready after ~1s") registers BEFORE the record
// propagates → the lookup fails → 400 → and the runtime does not retry a
// 4xx → agent_card never lands and the agent never comes online. AWS boots
// slowly enough to miss the race, which is why only the fast cloud broke.
//
// Such a hostname is NOT an SSRF vector: it lives under the platform's own
// domain (only the platform can create records there, so it can't be
// pointed at 169.254/127/private space by an attacker), and it resolves to
// nothing right now. So in SaaS mode allow a platform-tunnel hostname
// through while its DNS settles; everything else stays blocked. The
// unconditional metadata/loopback blocks above still apply once it
// resolves. (Restores the pre-#1130 "let an unresolvable platform URL
// through" behaviour, scoped to the trusted tunnel domain.)
if saasMode() && isPlatformTunnelHostname(hostname) {
log.Printf("Registry validateAgentURL: allowing not-yet-resolvable platform tunnel hostname %q (DNS still propagating)", hostname)
return nil
}
// DNS lookup failed for a non-platform hostname — block it. The platform
// has no use for a workspace it cannot reach.
return fmt.Errorf("hostname %q cannot be resolved (DNS error): %w", hostname, lookupErr)
}
for _, ip := range ips {
@@ -274,6 +293,24 @@ func validateAgentURL(rawURL string) error {
return nil
}
// isPlatformTunnelHostname reports whether h is a platform-provisioned per-
// workspace Cloudflare tunnel hostname — `ws-<id>.<appDomain>` under the
// platform's OWN domain. Only the platform controls DNS there, so a not-yet-
// resolvable such hostname is a pending-DNS tunnel (DNS propagation race), never
// an attacker-controlled SSRF URL. The domain defaults to moleculesai.app
// (covers prod `*.moleculesai.app` and staging `*.staging.moleculesai.app`) and
// is overridable via MOLECULE_APP_DOMAIN for other deployments.
//
// Matching is case-insensitive and ignores a single trailing root dot on h,
// because DNS names compare case-insensitively and `name.` is the fully-
// qualified spelling of `name`. The configured domain is normalised the same
// way (surrounding whitespace and dots stripped, lower-cased), so a value such
// as ".MoleculesAI.app" behaves like "moleculesai.app" instead of silently
// never matching.
func isPlatformTunnelHostname(h string) bool {
	// Normalise the candidate: lower-case it and drop the optional root dot.
	host := strings.ToLower(strings.TrimSuffix(h, "."))
	if !strings.HasPrefix(host, "ws-") {
		return false
	}

	// Normalise the configured domain; fall back to the default when the
	// variable is unset or reduces to nothing after trimming.
	domain := strings.TrimSpace(os.Getenv("MOLECULE_APP_DOMAIN"))
	domain = strings.ToLower(strings.Trim(domain, "."))
	if domain == "" {
		domain = "moleculesai.app"
	}

	// The leading "." anchors the match on a label boundary, so a lookalike
	// such as "ws-x.fakemoleculesai.app" is rejected.
	return strings.HasSuffix(host, "."+domain)
}
// Register handles POST /registry/register
// Upserts workspace, sets Redis TTL, broadcasts WORKSPACE_ONLINE.
func (h *RegistryHandler) Register(c *gin.Context) {
@@ -882,6 +882,42 @@ func TestValidateAgentURL_SaaSMode_AllowsRFC1918(t *testing.T) {
}
}
// TestValidateAgentURL_PendingPlatformTunnel (#36/#2421): a freshly-provisioned
// cross-cloud workspace advertises its per-workspace tunnel hostname
// (ws-<id>.<appDomain>) whose DNS has not propagated yet when a FAST box (Hetzner
// ~1s boot) registers. validateAgentURL must allow such a platform-tunnel
// hostname through in SaaS mode instead of 400 (which the runtime never retries
// → agent_card never lands). Non-platform unresolvable hostnames stay blocked.
func TestValidateAgentURL_PendingPlatformTunnel(t *testing.T) {
	// Pin the app domain to the built-in default. isPlatformTunnelHostname reads
	// MOLECULE_APP_DOMAIN, so an ambient value in a developer or CI environment
	// would otherwise flip the expectations below.
	t.Setenv("MOLECULE_APP_DOMAIN", "")

	for _, tc := range []struct {
		h    string
		want bool
	}{
		{"ws-abc123.moleculesai.app", true},
		{"ws-abc123.staging.moleculesai.app", true},
		{"ws-abc123.evil.com", false},       // not under the platform domain
		{"api.moleculesai.app", false},      // no ws- prefix
		{"ws-x.fakemoleculesai.app", false}, // lookalike domain, not a subdomain
	} {
		if got := isPlatformTunnelHostname(tc.h); got != tc.want {
			t.Errorf("isPlatformTunnelHostname(%q)=%v want %v", tc.h, got, tc.want)
		}
	}

	// The override replaces the default rather than adding to it. Run it in a
	// subtest so the Setenv is undone before the validateAgentURL checks.
	t.Run("domain override", func(t *testing.T) {
		t.Setenv("MOLECULE_APP_DOMAIN", "example.test")
		if !isPlatformTunnelHostname("ws-abc123.example.test") {
			t.Error("override domain: ws-abc123.example.test must match")
		}
		if isPlatformTunnelHostname("ws-abc123.moleculesai.app") {
			t.Error("override domain: default domain must no longer match")
		}
	})

	t.Setenv("MOLECULE_ORG_ID", "")
	t.Setenv("MOLECULE_DEPLOY_MODE", "saas")

	// A platform tunnel hostname is allowed — whether or not its DNS has
	// propagated (a resolved record is a public Cloudflare IP = allowed; an
	// unresolved one is allowed by the pending-tunnel branch).
	if err := validateAgentURL("https://ws-deadbeef0001.staging.moleculesai.app/a2a"); err != nil {
		t.Errorf("SaaS: pending platform tunnel must be allowed, got %v", err)
	}

	// A NON-platform unresolvable hostname stays blocked even in SaaS
	// (.invalid never resolves — RFC 2606).
	if err := validateAgentURL("https://ws-x.attacker.invalid/a2a"); err == nil {
		t.Error("SaaS: non-platform unresolvable hostname must stay blocked")
	}
}
// TestValidateAgentURL_SaaSMode_StillBlocksMetadataEtAl verifies that even in
// SaaS mode the always-blocked ranges (metadata, loopback, TEST-NET, CGNAT,
// non-fd00 ULA) stay blocked.