diff --git a/workspace-server/internal/bundle/importer.go b/workspace-server/internal/bundle/importer.go index 3031f57c..dc1728a8 100644 --- a/workspace-server/internal/bundle/importer.go +++ b/workspace-server/internal/bundle/importer.go @@ -71,7 +71,7 @@ func Import( } } // Store runtime in DB - _ = db.DB.ExecContext(ctx, `UPDATE workspaces SET runtime = $1 WHERE id = $2`, bundleRuntime, wsID) + _, _ = db.DB.ExecContext(ctx, `UPDATE workspaces SET runtime = $1 WHERE id = $2`, bundleRuntime, wsID) // Provision the container if provisioner is available if prov != nil { diff --git a/workspace-server/internal/handlers/a2a_proxy.go b/workspace-server/internal/handlers/a2a_proxy.go index 785130c3..fd6cd50f 100644 --- a/workspace-server/internal/handlers/a2a_proxy.go +++ b/workspace-server/internal/handlers/a2a_proxy.go @@ -779,29 +779,78 @@ func isSafeURL(rawURL string) error { return nil } -// isPrivateOrMetadataIP returns true for RFC-1918 private, carrier-grade NAT, -// link-local, and cloud metadata ranges. +// isPrivateOrMetadataIP returns true for cloud-metadata / loopback / link-local +// ranges (always) and RFC-1918 / IPv6 ULA ranges (self-hosted only). +// +// In SaaS cross-EC2 mode (see saasMode() in registry.go) the tenant platform +// and its workspaces share a VPC, so workspaces register with their +// VPC-private IP — typically 172.31.x.x on AWS default VPCs. Blocking RFC-1918 +// unconditionally would reject every legitimate registration. Cloud metadata +// (169.254.0.0/16, fe80::/10), loopback, and TEST-NET ranges stay blocked in +// both modes; they are never a legitimate agent URL. +// +// Both IPv4 and IPv6 are checked. The previous implementation returned false +// for every non-IPv4 input, which meant a registered `[::1]` or `[fe80::…]` +// URL would bypass the SSRF gate entirely. func isPrivateOrMetadataIP(ip net.IP) bool { - var privateRanges = []net.IPNet{ - {IP: net.ParseIP("10.0.0.0"), Mask: net.CIDRMask(8, 32)}, - {IP: net.ParseIP("172.16.0.0"), Mask: net.CIDRMask(12, 32)}, - {IP: net.ParseIP("192.168.0.0"), Mask: net.CIDRMask(16, 32)}, - {IP: net.ParseIP("169.254.0.0"), Mask: net.CIDRMask(16, 32)}, - {IP: net.ParseIP("100.64.0.0"), Mask: net.CIDRMask(10, 32)}, - {IP: net.ParseIP("192.0.2.0"), Mask: net.CIDRMask(24, 32)}, - {IP: net.ParseIP("198.51.100.0"), Mask: net.CIDRMask(24, 32)}, - {IP: net.ParseIP("203.0.113.0"), Mask: net.CIDRMask(24, 32)}, + // Always blocked — IPv4 cloud metadata + network-test ranges. + metadataRangesV4 := []string{ + "169.254.0.0/16", // link-local / IMDSv1-v2 + "100.64.0.0/10", // CGNAT — reachable via some VPC configs, not a legit agent URL + "192.0.2.0/24", // TEST-NET-1 + "198.51.100.0/24", // TEST-NET-2 + "203.0.113.0/24", // TEST-NET-3 } - ip = ip.To4() - if ip == nil { + // Always blocked — IPv6 cloud-metadata / loopback equivalents. + metadataRangesV6 := []string{ + "::1/128", // loopback + "fe80::/10", // link-local (IMDS analogue) + "::ffff:0:0/96", // IPv4-mapped loopback (defence-in-depth; To4() below usually normalises first) + } + // RFC-1918 private — blocked in self-hosted, allowed in SaaS. + rfc1918RangesV4 := []string{ + "10.0.0.0/8", + "172.16.0.0/12", + "192.168.0.0/16", + } + // RFC-4193 ULA — IPv6 analogue of RFC-1918. Same SaaS-mode treatment. + ulaRangesV6 := []string{ + "fc00::/7", + } + + contains := func(cidrs []string, target net.IP) bool { + for _, c := range cidrs { + _, n, err := net.ParseCIDR(c) + if err != nil { + continue + } + if n.Contains(target) { + return true + } + } return false } - for _, r := range privateRanges { - if r.Contains(ip) { + + // Prefer IPv4 semantics when the input is an IPv4 address encoded in any + // form (raw v4, ::ffff:a.b.c.d, etc.) — To4() normalises all of them. + if ip4 := ip.To4(); ip4 != nil { + if contains(metadataRangesV4, ip4) { return true } + if saasMode() { + return false + } + return contains(rfc1918RangesV4, ip4) } - return false + + // True IPv6 path. + if contains(metadataRangesV6, ip) { + return true + } + if saasMode() { + return false + } + return contains(ulaRangesV6, ip) } // readUsageMap extracts input_tokens / output_tokens from the "usage" key of m. diff --git a/workspace-server/internal/handlers/mcp.go b/workspace-server/internal/handlers/mcp.go index ee662e8a..8d7ac598 100644 --- a/workspace-server/internal/handlers/mcp.go +++ b/workspace-server/internal/handlers/mcp.go @@ -27,9 +27,7 @@ import ( "fmt" "io" "log" - "net" "net/http" - "net/url" "os" "strings" "time" @@ -826,75 +824,10 @@ func (h *MCPHandler) toolRecallMemory(ctx context.Context, workspaceID string, a return string(b), nil } -// isSafeURL validates that a URL resolves to a publicly-routable address, -// preventing A2A requests from being redirected to internal/cloud-metadata -// infrastructure (SSRF, CWE-918). Workspace URLs come from DB/Redis caches -// so we validate before making any outbound HTTP call. -func isSafeURL(rawURL string) error { - u, err := url.Parse(rawURL) - if err != nil { - return fmt.Errorf("invalid URL: %w", err) - } - // Reject non-HTTP(S) schemes. - if u.Scheme != "http" && u.Scheme != "https" { - return fmt.Errorf("forbidden scheme: %s (only http/https allowed)", u.Scheme) - } - host := u.Hostname() - if host == "" { - return fmt.Errorf("empty hostname") - } - // Block direct IP addresses. - if ip := net.ParseIP(host); ip != nil { - if ip.IsLoopback() || ip.IsUnspecified() || ip.IsLinkLocalUnicast() { - return fmt.Errorf("forbidden loopback/unspecified IP: %s", ip) - } - if isPrivateOrMetadataIP(ip) { - return fmt.Errorf("forbidden private/metadata IP: %s", ip) - } - return nil - } - // For hostnames, resolve and validate each returned IP. - addrs, err := net.LookupHost(host) - if err != nil { - // DNS resolution failure — block it. Could be an internal hostname. - return fmt.Errorf("DNS resolution blocked for hostname: %s (%v)", host, err) - } - if len(addrs) == 0 { - return fmt.Errorf("DNS returned no addresses for: %s", host) - } - for _, addr := range addrs { - ip := net.ParseIP(addr) - if ip != nil && (ip.IsLoopback() || ip.IsUnspecified() || ip.IsLinkLocalUnicast() || isPrivateOrMetadataIP(ip)) { - return fmt.Errorf("hostname %s resolves to forbidden IP: %s", host, ip) - } - } - return nil -} - -// isPrivateOrMetadataIP returns true for RFC-1918 private, carrier-grade NAT, -// link-local, and cloud metadata ranges. -func isPrivateOrMetadataIP(ip net.IP) bool { - var privateRanges = []net.IPNet{ - {IP: net.ParseIP("10.0.0.0"), Mask: net.CIDRMask(8, 32)}, - {IP: net.ParseIP("172.16.0.0"), Mask: net.CIDRMask(12, 32)}, - {IP: net.ParseIP("192.168.0.0"), Mask: net.CIDRMask(16, 32)}, - {IP: net.ParseIP("169.254.0.0"), Mask: net.CIDRMask(16, 32)}, - {IP: net.ParseIP("100.64.0.0"), Mask: net.CIDRMask(10, 32)}, - {IP: net.ParseIP("192.0.2.0"), Mask: net.CIDRMask(24, 32)}, - {IP: net.ParseIP("198.51.100.0"), Mask: net.CIDRMask(24, 32)}, - {IP: net.ParseIP("203.0.113.0"), Mask: net.CIDRMask(24, 32)}, - } - ip = ip.To4() - if ip == nil { - return false - } - for _, r := range privateRanges { - if r.Contains(ip) { - return true - } - } - return false -} +// isSafeURL and isPrivateOrMetadataIP live in a2a_proxy.go -- same package, +// shared across MCP + A2A proxy call sites. Keeping a single copy avoids +// drift between the two SSRF gates when one is tightened and the other +// isn't. // ───────────────────────────────────────────────────────────────────────────── // Helpers diff --git a/workspace-server/internal/handlers/org_tokens.go b/workspace-server/internal/handlers/org_tokens.go index 19568ace..799f7f21 100644 --- a/workspace-server/internal/handlers/org_tokens.go +++ b/workspace-server/internal/handlers/org_tokens.go @@ -111,7 +111,7 @@ func (h *OrgTokenHandler) Revoke(c *gin.Context) { c.JSON(http.StatusNotFound, gin.H{"error": "token not found or already revoked"}) return } - actor := orgTokenActor(c) + actor, _ := orgTokenActor(c) log.Printf("orgtoken: revoked id=%s by=%s", id, actor) c.JSON(http.StatusOK, gin.H{"revoked": id}) } diff --git a/workspace-server/internal/handlers/registry.go b/workspace-server/internal/handlers/registry.go index 38772529..2cb37b67 100644 --- a/workspace-server/internal/handlers/registry.go +++ b/workspace-server/internal/handlers/registry.go @@ -8,6 +8,7 @@ import ( "net" "net/http" "net/url" + "os" "strings" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" @@ -17,6 +18,43 @@ import ( "github.com/gin-gonic/gin" ) +// blockedRange is a named CIDR block so the conditional blocklist in +// validateAgentURL reads as a slice of homogeneous values instead of +// repeated anonymous struct literals. +type blockedRange struct { + cidr string + label string +} + +// saasMode reports whether this tenant platform is running in SaaS cross-EC2 +// mode, where workspaces live on sibling EC2s in the same VPC and register +// themselves by their RFC-1918 VPC-private IP (typically 172.31.x.x on AWS +// default VPCs). In that shape, the SSRF hardening that blocks RFC-1918 +// addresses would reject every legitimate workspace registration — the +// control plane provisioned these instances, so their intra-VPC URLs are +// trusted by construction. +// +// Resolution order: +// 1. MOLECULE_DEPLOY_MODE — explicit operator-set flag ("saas" | "self-hosted"). +// Prefer this for new deployments: the signal is unambiguous and can't +// collide with some future non-SaaS path that legitimately needs +// MOLECULE_ORG_ID. +// 2. MOLECULE_ORG_ID presence — implicit legacy signal, kept so deployments +// that haven't yet adopted MOLECULE_DEPLOY_MODE keep working. +// +// Self-hosted / single-container deployments set neither and keep the strict +// blocklist. +func saasMode() bool { + mode := strings.ToLower(strings.TrimSpace(os.Getenv("MOLECULE_DEPLOY_MODE"))) + switch mode { + case "saas": + return true + case "self-hosted", "selfhosted", "standalone": + return false + } + return strings.TrimSpace(os.Getenv("MOLECULE_ORG_ID")) != "" +} + type RegistryHandler struct { broadcaster *events.Broadcaster } @@ -64,18 +102,27 @@ func validateAgentURL(rawURL string) error { } hostname := parsed.Hostname() - blockedRanges := []struct { - cidr string - label string - }{ + // Link-local / loopback / IPv6 metadata classes are blocked in every + // mode — they are never a legitimate agent URL and they cover the AWS/ + // GCP/Azure IMDS endpoints. RFC-1918 ranges are conditionally blocked: + // in SaaS mode workspaces register with their VPC-private IP and the + // control plane is the source of truth for which instances exist, so + // allowing 10/8, 172.16/12, 192.168/16 is safe. In self-hosted mode + // we keep the strict blocklist — those deployments have no legitimate + // reason to accept private-range URLs from agents. + blockedRanges := []blockedRange{ {"169.254.0.0/16", "link-local address (cloud metadata endpoint)"}, {"127.0.0.0/8", "loopback address"}, - {"10.0.0.0/8", "RFC-1918 private address"}, - {"172.16.0.0/12", "RFC-1918 private address"}, - {"192.168.0.0/16", "RFC-1918 private address"}, {"fe80::/10", "IPv6 link-local address (cloud metadata analogue)"}, {"::1/128", "IPv6 loopback address"}, - {"fc00::/7", "IPv6 ULA address (RFC-4193 private)"}, + } + if !saasMode() { + blockedRanges = append(blockedRanges, + blockedRange{"10.0.0.0/8", "RFC-1918 private address"}, + blockedRange{"172.16.0.0/12", "RFC-1918 private address"}, + blockedRange{"192.168.0.0/16", "RFC-1918 private address"}, + blockedRange{"fc00::/7", "IPv6 ULA address (RFC-4193 private)"}, + ) } // Helper: check a single IP against the blocklist. diff --git a/workspace-server/internal/handlers/ssrf_test.go b/workspace-server/internal/handlers/ssrf_test.go index 391d22c9..38359329 100644 --- a/workspace-server/internal/handlers/ssrf_test.go +++ b/workspace-server/internal/handlers/ssrf_test.go @@ -7,6 +7,114 @@ import ( // isSafeURL is defined in mcp.go. // isPrivateOrMetadataIP is defined in mcp.go. +// saasMode is defined in registry.go. + +// TestSaasMode covers the env-resolution ladder so a self-hosted +// operator can't accidentally flip into SaaS mode by leaving a stale +// MOLECULE_ORG_ID around, and an explicit MOLECULE_DEPLOY_MODE wins +// over the legacy implicit signal. +func TestSaasMode(t *testing.T) { + cases := []struct { + name string + deployMode string + orgID string + want bool + }{ + {"both unset", "", "", false}, + {"legacy org id only", "", "7b2179dc-8cc6-4581-a3c6-c8bff4481086", true}, + {"explicit saas", "saas", "", true}, + {"explicit saas overrides missing org", "SaaS", "", true}, // case-insensitive + {"explicit self-hosted wins over legacy org id", "self-hosted", "some-org", false}, + {"explicit selfhosted wins over legacy org id", "selfhosted", "some-org", false}, + {"explicit standalone wins over legacy org id", "standalone", "some-org", false}, + {"whitespace-only deploy mode falls through to legacy", " ", "some-org", true}, + {"whitespace-only org id falls through to false", "", " ", false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Setenv("MOLECULE_DEPLOY_MODE", tc.deployMode) + t.Setenv("MOLECULE_ORG_ID", tc.orgID) + if got := saasMode(); got != tc.want { + t.Errorf("saasMode() = %v, want %v (MOLECULE_DEPLOY_MODE=%q MOLECULE_ORG_ID=%q)", + got, tc.want, tc.deployMode, tc.orgID) + } + }) + } +} + +// TestIsPrivateOrMetadataIP_SaaSMode covers the SaaS-mode relaxation: +// RFC-1918 and ULA ranges are allowed, but metadata / loopback / TEST-NET +// classes stay blocked in every mode. Regression guard for the core +// SaaS provisioning fix (issue: workspaces register with their VPC +// private IP, which is 172.31.x.x on AWS default VPCs). +func TestIsPrivateOrMetadataIP_SaaSMode(t *testing.T) { + t.Setenv("MOLECULE_DEPLOY_MODE", "saas") + t.Setenv("MOLECULE_ORG_ID", "") + cases := []struct { + name string + ipStr string + want bool + }{ + // RFC-1918 must be ALLOWED in SaaS mode. + {"172.31 allowed in saas", "172.31.44.78", false}, + {"10/8 allowed in saas", "10.0.0.5", false}, + {"192.168 allowed in saas", "192.168.1.1", false}, + // IPv6 ULA must be ALLOWED in SaaS mode (AWS IPv6 VPC analogue). + {"fd00 ULA allowed in saas", "fd12:3456:789a::1", false}, + // Metadata / loopback stay BLOCKED even in SaaS mode. + {"169.254 still blocked", "169.254.169.254", true}, + {"127/8 still blocked", "127.0.0.1", true}, + {"::1 still blocked", "::1", true}, + {"fe80 still blocked", "fe80::1", true}, + // TEST-NET stays blocked. + {"192.0.2.x still blocked", "192.0.2.5", true}, + {"198.51.100.x still blocked", "198.51.100.5", true}, + {"203.0.113.x still blocked", "203.0.113.5", true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + ip := net.ParseIP(tc.ipStr) + if ip == nil { + t.Fatalf("ParseIP(%q) returned nil", tc.ipStr) + } + if got := isPrivateOrMetadataIP(ip); got != tc.want { + t.Errorf("isPrivateOrMetadataIP(%s) = %v, want %v", tc.ipStr, got, tc.want) + } + }) + } +} + +// TestIsPrivateOrMetadataIP_IPv6 covers the IPv6 gap the previous +// implementation had — it returned false for every IPv6 literal +// unconditionally, which would let a registered [::1] or [fe80::…] +// URL bypass the SSRF check entirely. +func TestIsPrivateOrMetadataIP_IPv6(t *testing.T) { + t.Setenv("MOLECULE_DEPLOY_MODE", "") + t.Setenv("MOLECULE_ORG_ID", "") + cases := []struct { + name string + ipStr string + want bool + }{ + {"::1 loopback blocked", "::1", true}, + {"fe80 link-local blocked", "fe80::1", true}, + {"fe80 link-local with mac blocked", "fe80::a00:27ff:fe00:1", true}, + {"fc00 ULA blocked (non-saas)", "fc00::1", true}, + {"fd00 ULA blocked (non-saas)", "fd12::1", true}, + {"public v6 allowed", "2606:4700:4700::1111", false}, // 1.1.1.1 v6 + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + ip := net.ParseIP(tc.ipStr) + if ip == nil { + t.Fatalf("ParseIP(%q) returned nil", tc.ipStr) + } + if got := isPrivateOrMetadataIP(ip); got != tc.want { + t.Errorf("isPrivateOrMetadataIP(%s) = %v, want %v", tc.ipStr, got, tc.want) + } + }) + } +} func TestIsPrivateOrMetadataIP(t *testing.T) { cases := []struct { diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index d364de54..45d4a3bb 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -243,10 +243,11 @@ func seedInitialMemories(ctx context.Context, workspaceID string, memories []mod log.Printf("seedInitialMemories: truncated memory content for %s (scope=%s) from %d to %d bytes", workspaceID, scope, len(mem.Content), maxMemoryContentLength) } + redactedContent, _ := redactSecrets(workspaceID, content) if _, err := db.DB.ExecContext(ctx, ` INSERT INTO agent_memories (workspace_id, content, scope, namespace) VALUES ($1, $2, $3, $4) - `, workspaceID, redactSecrets(workspaceID, content), scope, awarenessNamespace); err != nil { + `, workspaceID, redactedContent, scope, awarenessNamespace); err != nil { log.Printf("seedInitialMemories: failed to insert memory for %s (scope=%s): %v", workspaceID, scope, err) } } @@ -329,6 +330,20 @@ func (h *WorkspaceHandler) buildProvisionerConfig( // provisioning continues — the workspace will get 401 on its first heartbeat // and can recover on the next restart. func (h *WorkspaceHandler) issueAndInjectToken(ctx context.Context, workspaceID string, cfg *provisioner.WorkspaceConfig) { + // SaaS mode skip: the CP provisioner ships workspaces to a remote EC2 + // via user-data and does not carry cfg.ConfigFiles across the wire, so + // any token we mint here never reaches the workspace. Minting it anyway + // would leave a live token in the DB with no plaintext on disk — + // RegistryHandler.requireWorkspaceToken then 401s every /registry/register + // attempt because the workspace has a live token on file (which trips + // bootstrap-allowed) but no way to present it. Defer to the register + // handler's own bootstrap-issuance path, which mints a token only after + // a successful first register and returns the plaintext in the response + // for the runtime to persist locally. + if saasMode() { + return + } + // Revoke any existing live tokens. If this fails we bail out rather than // issuing a second live token whose plaintext we can't also deliver. if err := wsauth.RevokeAllForWorkspace(ctx, db.DB, workspaceID); err != nil { @@ -606,14 +621,13 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string } log.Printf("CPProvisioner: workspace %s started as machine %s via control plane", workspaceID, machineID) - // Issue token so the agent can authenticate on boot - token, tokenErr := wsauth.IssueToken(ctx, db.DB, workspaceID) - if tokenErr != nil { - log.Printf("CPProvisioner: failed to issue token for %s: %v", workspaceID, tokenErr) - } else { - // Don't log any prefix of the token. Earlier H1 regression showed - // this slice pattern (token[:8]) panics when a helper returns a - // short value. Length alone is enough to confirm a token issued. - log.Printf("CPProvisioner: issued auth token for workspace %s (len=%d)", workspaceID, len(token)) - } + // Token issuance is deliberately deferred to the workspace's first + // /registry/register call. Minting here without also delivering the + // plaintext to the workspace (via user-data or a follow-up callback) + // would leave a live token in DB that the workspace has no copy of — + // RegistryHandler.requireWorkspaceToken would then 401 every + // /registry/register attempt because the workspace is no longer in the + // "no live tokens → bootstrap-allowed" state. The register handler + // already mints a token on first successful register and returns it in + // the response body for the workspace to persist. } diff --git a/workspace-server/internal/handlers/workspace_provision_test.go b/workspace-server/internal/handlers/workspace_provision_test.go index 9c3e8659..ec48050a 100644 --- a/workspace-server/internal/handlers/workspace_provision_test.go +++ b/workspace-server/internal/handlers/workspace_provision_test.go @@ -9,6 +9,7 @@ import ( "testing" "github.com/DATA-DOG/go-sqlmock" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" "github.com/Molecule-AI/molecule-monorepo/platform/internal/events" "github.com/Molecule-AI/molecule-monorepo/platform/internal/models" "github.com/Molecule-AI/molecule-monorepo/platform/internal/plugins" @@ -901,9 +902,21 @@ func containsStr(s, substr string) bool { // // Each test injects a known-internal error and verifies the response body // or broadcast payload contains ONLY the generic prod-safe message. - largeContent := string(make([]byte, 100_001)) - copy([]byte(largeContent), "X") // fill with "X" so test is deterministic +// TestSeedInitialMemories_ContentAtLimitTruncates exercises the 100_000-byte +// content truncation guard in seedInitialMemories — a 100_001-byte input +// must persist as exactly 100_000 bytes of "X". +func TestSeedInitialMemories_ContentAtLimitTruncates(t *testing.T) { + origDB := db.DB + sqldb, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer sqldb.Close() + db.DB = sqldb + t.Cleanup(func() { db.DB = origDB }) + + largeContent := strings.Repeat("X", 100_001) memories := []models.MemorySeed{ {Content: largeContent, Scope: "LOCAL"}, } diff --git a/workspace-server/internal/scheduler/scheduler.go b/workspace-server/internal/scheduler/scheduler.go index dc18dece..4fa12880 100644 --- a/workspace-server/internal/scheduler/scheduler.go +++ b/workspace-server/internal/scheduler/scheduler.go @@ -255,6 +255,7 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) { if _, execErr := db.DB.ExecContext(context.Background(), `UPDATE workspace_schedules SET next_run_at=$1, updated_at=now() WHERE id=$2`, nextTime, sched.ID); execErr != nil { log.Printf("Scheduler: panic-recovery next_run_at UPDATE failed for schedule %s: %v", sched.ID, execErr) } + } } }()