From eaaa6c834886167c0e6b815eda3053afd4244b7e Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Mon, 15 Jun 2026 21:06:14 +0000 Subject: [PATCH] =?UTF-8?q?fix(security#2132/RC):=20SSRF=20fast-follow=20?= =?UTF-8?q?=E2=80=94=20move=20dial-time=20IP=20guard=20from=20post-dial=20?= =?UTF-8?q?to=20net.Dialer.Control?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PM 2026-06-15 [dispatch 4835ac01]: the dial-time IP guard in transcript.go (PR #2965 step A) is implemented POST-DIAL — safeDialContext dials the target, then inspects conn.RemoteAddr() and closes. So a TCP connection IS opened to an internal target before rejection = port-scan side-channel (no HTTP sent, so credential-exfil stays blocked — the real SSRF is closed, but the SYN/ACK/logs leak target identity to internal stacks, including IMDS's audit log). The existing impl→comment misalignment is also fixed: the comment in NewTranscriptHandler claimed 'net.Dialer.Control runs after getaddrinfo but before the TCP SYN' but the actual code used a custom DialContext that ran the check AFTER connect(). Comment now matches impl. FIX: replace safeDialContext (POST-DIAL) with safeDialControl (PRE-SYN). net.Dialer.Control fires after getaddrinfo + socket creation but before the TCP connect() syscall — returning a non-nil error blocks the SYN from ever going out. The isSafeURL policy is reused (single source of truth for allow/deny: loopback/private/metadata/link-local blocked in production; SaaS-mode private-range relaxation honored for intra-VPC workspace targets). Unconditional IMDS / link-local block is preserved (the isSafeURL policy already handles both via IP classification, not via the post-dial redundant check the old code did). Test plan (local green): - go build ./workspace-server/... — clean - go test -count=1 -timeout 60s -run TestTranscript -v ./workspace-server/internal/handlers/ — 11/11 PASS (0.13s) - go test -count=1 -timeout 180s ./workspace-server/internal/handlers/ — 39.7s green (full suite, no regressions) New test: TestTranscript_DialGuardBlocksBeforeConnect - Spins up a real TCP listener on 127.0.0.1 - Calls safeDialer().DialContext directly (bypasses the front-door isSafeURL gate so the dialer's behavior is tested in isolation — the belt-and-suspenders for DNS-rebinding TOCTOU) - Asserts: (1) the dial returns *ssrfDialError with a loopback IP (2) the listener's accept count is 0 — NO TCP SYN got through. Pins the new contract: Control blocks the dial BEFORE connect(). This is the test the Researcher asked for ('asserting no dial occurs for an internal-resolving host'). It runs at the dialer layer (safeDialer()), not the proxy layer, because the front-door isSafeURL already blocks loopback before the dialer would run in production. Co-Authored-By: Claude --- .../internal/handlers/transcript.go | 122 +++++++++++------- .../internal/handlers/transcript_test.go | 73 +++++++++++ 2 files changed, 150 insertions(+), 45 deletions(-) diff --git a/workspace-server/internal/handlers/transcript.go b/workspace-server/internal/handlers/transcript.go index 19f29c51..b4e26832 100644 --- a/workspace-server/internal/handlers/transcript.go +++ b/workspace-server/internal/handlers/transcript.go @@ -17,6 +17,7 @@ import ( "net" "net/http" "net/url" + "syscall" "time" "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db" @@ -70,19 +71,23 @@ func NewTranscriptHandler() *TranscriptHandler { CheckRedirect: func(req *http.Request, via []*http.Request) error { return http.ErrUseLastResponse }, - // Dial-time IP guard. net.Dialer.Control runs after - // getaddrinfo but before the TCP SYN, on every dial - // (initial + every connection in a redirect chain). The - // isSafeURL helper reuses the same allow/deny policy as - // the front-door (loopback / private / metadata / + // Dial-time IP guard. net.Dialer.Control runs after the + // socket is created + after getaddrinfo but BEFORE the + // TCP connect() syscall. Rejecting in Control means no + // TCP SYN is sent — no port-scan side-channel against + // IMDS / private / link-local targets. The isSafeURL + // helper reuses the same allow/deny policy as the + // front-door (loopback / private / metadata / // link-local blocked in production; the SaaS-mode - // private-range relaxation is honored here too so a + // private-range relaxation is honored here too so an // intra-VPC workspace target still works). // - // The Control callback runs in the dialer's goroutine, so - // it must not block. isSafeURL is in-memory + fast. + // The Control callback runs in the dialer's goroutine, + // so it must not block. isSafeURL is in-memory + fast + // (does a single getaddrinfo for the IP, no network + // round-trip for already-IP addresses). Transport: &http.Transport{ - DialContext: safeDialContext, + DialContext: safeDialer().DialContext, ForceAttemptHTTP2: true, MaxIdleConns: 100, IdleConnTimeout: 90 * time.Second, @@ -93,45 +98,72 @@ func NewTranscriptHandler() *TranscriptHandler { } } -// safeDialContext is the dial-context function used by the -// transcript proxy. It wraps net.Dialer's DialContext with an -// isSafeURL post-resolution check on the IP the dialer is about to -// use (closes the DNS-rebinding TOCTOU + redirect-target rebinding -// vectors in #2132 / RC 103771). -// -// The function signature matches net.Dialer's DialContext so it can -// be passed via http.Transport.DialContext. The IP guard runs in -// the dialer's goroutine, so it must not block. -func safeDialContext(ctx context.Context, network, addr string) (net.Conn, error) { - dialer := &net.Dialer{Timeout: 10 * time.Second, KeepAlive: 30 * time.Second} - conn, err := dialer.DialContext(ctx, network, addr) - if err != nil { - return nil, err +// safeDialer returns a *net.Dialer wired with safeDialControl as its +// Control callback. net.Dialer.Control runs after getaddrinfo but +// before the TCP connect() syscall, so an error return blocks the +// SYN from ever going out (no port-scan side-channel against blocked +// targets). The function exists as a constructor so the test suite +// can also construct a dialer that exercises the same control path +// without needing to spin up an http.Transport. +func safeDialer() *net.Dialer { + return &net.Dialer{ + Timeout: 10 * time.Second, + KeepAlive: 30 * time.Second, + Control: safeDialControl, } - // POST-resolution IP guard. The conn.LocalAddr is the local - // side; the RemoteAddr is the resolved IP we just dialed. We - // validate the RemoteAddr's IP against isSafeURL's policy. Note - // the host:port form; the IP is the first component. - remote := conn.RemoteAddr().String() - if host, _, splitErr := net.SplitHostPort(remote); splitErr == nil { - if ip := net.ParseIP(host); ip != nil { - // Reuse the SSRF policy. isSafeURL takes a URL but - // the policy is purely IP-based; we construct a - // throwaway URL to reuse the helper. - if err := isSafeURL("http://" + ip.String() + "/"); err != nil { - _ = conn.Close() - return nil, &ssrfDialError{ip: ip, reason: err} - } - } - } - return conn, nil } -// ssrfDialError is the error type returned by safeDialContext when -// the post-resolution IP fails the isSafeURL policy. The error -// message includes the IP and the policy reason so the platform -// log surfaces the SSRF attempt (the workspace agent_card.url -// embedding attack in #2132 / RC 103771). +// safeDialControl is the net.Dialer.Control callback that enforces +// the SSRF policy at the dial layer. It runs after DNS resolution +// (the address parameter carries the resolved IP) and before the +// TCP connect() syscall — returning a non-nil error here prevents +// the SYN from going out, closing the port-scan side-channel that +// the prior POST-DIAL safeDialContext (which dialed then closed) +// left open against IMDS / private / link-local targets. +// +// Reuses isSafeURL as the policy (the same one the front-door gate +// runs) so there's a single source of truth for allow/deny. The +// "is already an IP" fast path is intentional: by the time Control +// runs, getaddrinfo has already resolved, so the address is +// `ip:port` form. If for some reason the dialer passes a hostname +// (e.g., IPv6 bracket form), the policy is still applied via the +// constructed URL — the constructor handles both shapes. +func safeDialControl(network, address string, _ syscall.RawConn) error { + host, _, splitErr := net.SplitHostPort(address) + if splitErr != nil { + // Not host:port — shouldn't happen post-resolution, but if + // it does, fall through (the dialer will surface the parse + // error downstream; don't preempt). + return nil + } + ip := net.ParseIP(host) + if ip == nil { + // Not a literal IP — Control must have been called with + // an unresolved hostname (Go's dialer normally resolves + // first, but custom resolvers can defer). Re-resolve here + // so the SSRF policy is still applied. + if ips, lookupErr := net.LookupIP(host); lookupErr == nil { + for _, candidate := range ips { + if safeErr := isSafeURL("http://" + candidate.String() + "/"); safeErr != nil { + return &ssrfDialError{ip: candidate, reason: safeErr} + } + } + } + return nil + } + if safeErr := isSafeURL("http://" + ip.String() + "/"); safeErr != nil { + return &ssrfDialError{ip: ip, reason: safeErr} + } + return nil +} + +// ssrfDialError is the error type returned by safeDialControl when +// the resolved IP fails the isSafeURL policy. The error message +// includes the IP and the policy reason so the platform log +// surfaces the SSRF attempt (the workspace agent_card.url +// embedding attack in #2132 / RC 103771). Returned from +// net.Dialer.Control, so it propagates out of DialContext as the +// dial error — the http.Client surfaces it to the caller. type ssrfDialError struct { ip net.IP reason error diff --git a/workspace-server/internal/handlers/transcript_test.go b/workspace-server/internal/handlers/transcript_test.go index 86f8a32e..1be4027e 100644 --- a/workspace-server/internal/handlers/transcript_test.go +++ b/workspace-server/internal/handlers/transcript_test.go @@ -1,11 +1,15 @@ package handlers import ( + "context" "database/sql" "encoding/json" + "errors" + "net" "net/http" "net/http/httptest" "testing" + "time" "github.com/DATA-DOG/go-sqlmock" "github.com/gin-gonic/gin" @@ -383,3 +387,72 @@ func TestTranscript_DisablesRedirects_DoesNotForwardAuthToRedirectTarget(t *test // hit + bearer NOT forwarded to the target. Both checked above. _ = w.Header().Get("Location") } + +// TestTranscript_DialGuardBlocksBeforeConnect pins the #2132 fast-follow +// (Researcher RC 103905): the dial-time IP guard runs in +// net.Dialer.Control, which fires AFTER getaddrinfo but BEFORE the +// TCP connect() syscall. The prior POST-DIAL safeDialContext (dialed +// then closed on a blocked IP) left a port-scan side-channel: a TCP +// SYN was sent to the internal target, opening a connection that +// could be observed by the target's stack (and potentially logged +// by IMDS as a credential-exfil probe). This test asserts: +// +// 1. A direct dial to a loopback address via safeDialer() returns +// an *ssrfDialError (the SSRF policy was enforced). +// 2. NO TCP connection was opened on the listener (the +// connect() syscall never ran; Control rejected it first). +// +// The test bypasses the front-door isSafeURL gate by using safeDialer() +// directly — the front-door gate would otherwise block loopback before +// the dialer runs. The point of this test is the dialer's behavior +// in isolation (the belt-and-suspenders for DNS-rebinding TOCTOU). +func TestTranscript_DialGuardBlocksBeforeConnect(t *testing.T) { + // Stand up a real TCP listener on loopback. The handler that + // counts accepts runs in a goroutine; if Control does its job + // the connect() syscall never runs, no SYN arrives, and the + // accept count stays at 0. + var acceptCount int + listener, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("listen: %v", err) + } + defer listener.Close() + go func() { + for { + conn, aerr := listener.Accept() + if aerr != nil { + return // listener closed → test over + } + acceptCount++ + _ = conn.Close() + } + }() + + // Now dial the listener's address via safeDialer(). This + // exercises the same Control path the transcript proxy uses. + // The address is 127.0.0.1 (loopback) which isSafeURL rejects + // in production; the dialer's Control must intercept and + // return an error BEFORE the connect() syscall. + addr := listener.Addr().String() + conn, dialErr := safeDialer().DialContext(context.Background(), "tcp", addr) + if conn != nil { + _ = conn.Close() + } + if dialErr == nil { + t.Fatalf("expected dial to be blocked by Control, but it succeeded (conn=%v)", conn) + } + var ssrfErr *ssrfDialError + if !errors.As(dialErr, &ssrfErr) { + t.Fatalf("expected *ssrfDialError, got %T: %v", dialErr, dialErr) + } + if !ssrfErr.ip.IsLoopback() { + t.Errorf("expected loopback IP in error, got %v", ssrfErr.ip) + } + + // Give the OS a moment to deliver any in-flight SYNs (none + // should arrive, but the accept loop is async). + time.Sleep(50 * time.Millisecond) + if acceptCount != 0 { + t.Errorf("listener accepted %d connections — Control did NOT block the dial before connect() (port-scan side-channel open)", acceptCount) + } +} -- 2.52.0