fix(security#2132/RC): SSRF fast-follow — move dial-time IP guard from post-dial to net.Dialer.Control #2967

Merged
devops-engineer merged 1 commits from fix/2132-transcript-ssrf-control-pre-sy into main 2026-06-15 21:11:31 +00:00
2 changed files with 150 additions and 45 deletions
@@ -17,6 +17,7 @@ import (
"net"
"net/http"
"net/url"
"syscall"
"time"
"git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/db"
@@ -70,19 +71,23 @@ func NewTranscriptHandler() *TranscriptHandler {
CheckRedirect: func(req *http.Request, via []*http.Request) error {
return http.ErrUseLastResponse
},
// Dial-time IP guard. net.Dialer.Control runs after
// getaddrinfo but before the TCP SYN, on every dial
// (initial + every connection in a redirect chain). The
// isSafeURL helper reuses the same allow/deny policy as
// the front-door (loopback / private / metadata /
// Dial-time IP guard. net.Dialer.Control runs after the
// socket is created + after getaddrinfo but BEFORE the
// TCP connect() syscall. Rejecting in Control means no
// TCP SYN is sent — no port-scan side-channel against
// IMDS / private / link-local targets. The isSafeURL
// helper reuses the same allow/deny policy as the
// front-door (loopback / private / metadata /
// link-local blocked in production; the SaaS-mode
// private-range relaxation is honored here too so a
// private-range relaxation is honored here too so an
// intra-VPC workspace target still works).
//
// The Control callback runs in the dialer's goroutine, so
// it must not block. isSafeURL is in-memory + fast.
// The Control callback runs in the dialer's goroutine,
// so it must not block. isSafeURL is in-memory + fast
// (does a single getaddrinfo for the IP, no network
// round-trip for already-IP addresses).
Transport: &http.Transport{
DialContext: safeDialContext,
DialContext: safeDialer().DialContext,
ForceAttemptHTTP2: true,
MaxIdleConns: 100,
IdleConnTimeout: 90 * time.Second,
@@ -93,45 +98,72 @@ func NewTranscriptHandler() *TranscriptHandler {
}
}
// safeDialContext is the dial-context function used by the
// transcript proxy. It wraps net.Dialer's DialContext with an
// isSafeURL post-resolution check on the IP the dialer is about to
// use (closes the DNS-rebinding TOCTOU + redirect-target rebinding
// vectors in #2132 / RC 103771).
//
// The function signature matches net.Dialer's DialContext so it can
// be passed via http.Transport.DialContext. The IP guard runs in
// the dialer's goroutine, so it must not block.
func safeDialContext(ctx context.Context, network, addr string) (net.Conn, error) {
dialer := &net.Dialer{Timeout: 10 * time.Second, KeepAlive: 30 * time.Second}
conn, err := dialer.DialContext(ctx, network, addr)
if err != nil {
return nil, err
// safeDialer returns a *net.Dialer wired with safeDialControl as its
// Control callback. net.Dialer.Control runs after getaddrinfo but
// before the TCP connect() syscall, so an error return blocks the
// SYN from ever going out (no port-scan side-channel against blocked
// targets). The function exists as a constructor so the test suite
// can also construct a dialer that exercises the same control path
// without needing to spin up an http.Transport.
func safeDialer() *net.Dialer {
return &net.Dialer{
Timeout: 10 * time.Second,
KeepAlive: 30 * time.Second,
Control: safeDialControl,
}
// POST-resolution IP guard. The conn.LocalAddr is the local
// side; the RemoteAddr is the resolved IP we just dialed. We
// validate the RemoteAddr's IP against isSafeURL's policy. Note
// the host:port form; the IP is the first component.
remote := conn.RemoteAddr().String()
if host, _, splitErr := net.SplitHostPort(remote); splitErr == nil {
if ip := net.ParseIP(host); ip != nil {
// Reuse the SSRF policy. isSafeURL takes a URL but
// the policy is purely IP-based; we construct a
// throwaway URL to reuse the helper.
if err := isSafeURL("http://" + ip.String() + "/"); err != nil {
_ = conn.Close()
return nil, &ssrfDialError{ip: ip, reason: err}
}
}
}
return conn, nil
}
// ssrfDialError is the error type returned by safeDialContext when
// the post-resolution IP fails the isSafeURL policy. The error
// message includes the IP and the policy reason so the platform
// log surfaces the SSRF attempt (the workspace agent_card.url
// embedding attack in #2132 / RC 103771).
// safeDialControl is the net.Dialer.Control callback that enforces
// the SSRF policy at the dial layer. It runs after DNS resolution
// (the address parameter carries the resolved IP) and before the
// TCP connect() syscall — returning a non-nil error here prevents
// the SYN from going out, closing the port-scan side-channel that
// the prior POST-DIAL safeDialContext (which dialed then closed)
// left open against IMDS / private / link-local targets.
//
// Reuses isSafeURL as the policy (the same one the front-door gate
// runs) so there's a single source of truth for allow/deny. The
// "is already an IP" fast path is intentional: by the time Control
// runs, getaddrinfo has already resolved, so the address is
// `ip:port` form. If for some reason the dialer passes a hostname
// (e.g., IPv6 bracket form), the policy is still applied via the
// constructed URL — the constructor handles both shapes.
func safeDialControl(network, address string, _ syscall.RawConn) error {
host, _, splitErr := net.SplitHostPort(address)
if splitErr != nil {
// Not host:port — shouldn't happen post-resolution, but if
// it does, fall through (the dialer will surface the parse
// error downstream; don't preempt).
return nil
}
ip := net.ParseIP(host)
if ip == nil {
// Not a literal IP — Control must have been called with
// an unresolved hostname (Go's dialer normally resolves
// first, but custom resolvers can defer). Re-resolve here
// so the SSRF policy is still applied.
if ips, lookupErr := net.LookupIP(host); lookupErr == nil {
for _, candidate := range ips {
if safeErr := isSafeURL("http://" + candidate.String() + "/"); safeErr != nil {
return &ssrfDialError{ip: candidate, reason: safeErr}
}
}
}
return nil
}
if safeErr := isSafeURL("http://" + ip.String() + "/"); safeErr != nil {
return &ssrfDialError{ip: ip, reason: safeErr}
}
return nil
}
// ssrfDialError is the error type returned by safeDialControl when
// the resolved IP fails the isSafeURL policy. The error message
// includes the IP and the policy reason so the platform log
// surfaces the SSRF attempt (the workspace agent_card.url
// embedding attack in #2132 / RC 103771). Returned from
// net.Dialer.Control, so it propagates out of DialContext as the
// dial error — the http.Client surfaces it to the caller.
type ssrfDialError struct {
ip net.IP
reason error
@@ -1,11 +1,15 @@
package handlers
import (
"context"
"database/sql"
"encoding/json"
"errors"
"net"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/DATA-DOG/go-sqlmock"
"github.com/gin-gonic/gin"
@@ -383,3 +387,72 @@ func TestTranscript_DisablesRedirects_DoesNotForwardAuthToRedirectTarget(t *test
// hit + bearer NOT forwarded to the target. Both checked above.
_ = w.Header().Get("Location")
}
// TestTranscript_DialGuardBlocksBeforeConnect pins the #2132 fast-follow
// (Researcher RC 103905): the dial-time IP guard runs in
// net.Dialer.Control, which fires AFTER getaddrinfo but BEFORE the
// TCP connect() syscall. The prior POST-DIAL safeDialContext (dialed
// then closed on a blocked IP) left a port-scan side-channel: a TCP
// SYN was sent to the internal target, opening a connection that
// could be observed by the target's stack (and potentially logged
// by IMDS as a credential-exfil probe). This test asserts:
//
// 1. A direct dial to a loopback address via safeDialer() returns
// an *ssrfDialError (the SSRF policy was enforced).
// 2. NO TCP connection was opened on the listener (the
// connect() syscall never ran; Control rejected it first).
//
// The test bypasses the front-door isSafeURL gate by using safeDialer()
// directly — the front-door gate would otherwise block loopback before
// the dialer runs. The point of this test is the dialer's behavior
// in isolation (the belt-and-suspenders for DNS-rebinding TOCTOU).
func TestTranscript_DialGuardBlocksBeforeConnect(t *testing.T) {
// Stand up a real TCP listener on loopback. The handler that
// counts accepts runs in a goroutine; if Control does its job
// the connect() syscall never runs, no SYN arrives, and the
// accept count stays at 0.
var acceptCount int
listener, err := net.Listen("tcp", "127.0.0.1:0")
if err != nil {
t.Fatalf("listen: %v", err)
}
defer listener.Close()
go func() {
for {
conn, aerr := listener.Accept()
if aerr != nil {
return // listener closed → test over
}
acceptCount++
_ = conn.Close()
}
}()
// Now dial the listener's address via safeDialer(). This
// exercises the same Control path the transcript proxy uses.
// The address is 127.0.0.1 (loopback) which isSafeURL rejects
// in production; the dialer's Control must intercept and
// return an error BEFORE the connect() syscall.
addr := listener.Addr().String()
conn, dialErr := safeDialer().DialContext(context.Background(), "tcp", addr)
if conn != nil {
_ = conn.Close()
}
if dialErr == nil {
t.Fatalf("expected dial to be blocked by Control, but it succeeded (conn=%v)", conn)
}
var ssrfErr *ssrfDialError
if !errors.As(dialErr, &ssrfErr) {
t.Fatalf("expected *ssrfDialError, got %T: %v", dialErr, dialErr)
}
if !ssrfErr.ip.IsLoopback() {
t.Errorf("expected loopback IP in error, got %v", ssrfErr.ip)
}
// Give the OS a moment to deliver any in-flight SYNs (none
// should arrive, but the accept loop is async).
time.Sleep(50 * time.Millisecond)
if acceptCount != 0 {
t.Errorf("listener accepted %d connections — Control did NOT block the dial before connect() (port-scan side-channel open)", acceptCount)
}
}