diff --git a/.gitea/workflows/local-provision-e2e.yml b/.gitea/workflows/local-provision-e2e.yml index 2732c14a4..038270a4d 100644 --- a/.gitea/workflows/local-provision-e2e.yml +++ b/.gitea/workflows/local-provision-e2e.yml @@ -320,12 +320,13 @@ jobs: # even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA). MOLECULE_ENV: development SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!! - # act_runner runs the job inside a Docker container, so /.dockerenv exists - # and the platform auto-detects platformInDocker=true. But the job container - # is NOT on molecule-core-net, so it cannot resolve workspace container - # hostnames (ws-:8000). Force false so the proxy keeps using the - # host-mapped 127.0.0.1: URL, which IS reachable. - MOLECULE_IN_DOCKER: false + # #2851: the provisioner now makes the workspace advertise http://localhost:. + # When the platform itself runs inside Docker, the A2A proxy rewrites the + # stored http://127.0.0.1: URL to ws-:8000, so keep the job + # container on molecule-core-net and Docker-aware mode so that Docker-DNS + # name resolves. Dev mode already relaxes private-range SSRF checks, so SaaS + # deploy mode is no longer required. + MOLECULE_IN_DOCKER: true steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 @@ -351,6 +352,14 @@ jobs: # uses for token-write helper containers. docker pull alpine:3.20@sha256:c64c687cbea9300178b30c95835354e34c4e4febc4badfe27102879de0483b5e >/dev/null docker network create molecule-core-net >/dev/null 2>&1 || true + # #2851: connect the act_runner job container to molecule-core-net so the + # platform process can resolve ws-:8000 workspace URLs. + JOB_CID="$(hostname)" + if docker network connect molecule-core-net "$JOB_CID" >/dev/null 2>&1; then + echo "Connected job container ($JOB_CID) to molecule-core-net." + else + echo "WARN: could not connect job container to molecule-core-net (already connected or fallback)." + fi - name: Start Postgres (docker, ephemeral host port) run: | diff --git a/tests/e2e/test_local_provision_lifecycle_e2e.sh b/tests/e2e/test_local_provision_lifecycle_e2e.sh index 371defe35..b1ee0e086 100755 --- a/tests/e2e/test_local_provision_lifecycle_e2e.sh +++ b/tests/e2e/test_local_provision_lifecycle_e2e.sh @@ -441,6 +441,11 @@ if [ "$LIFECYCLE_LLM" = "minimax" ]; then echo " secret write (MODEL_PROVIDER): $(echo "$SECP" | head -c 120)" fi +# #2851: the provisioner now injects MOLECULE_WORKSPACE_URL automatically so the +# runtime advertises a host-reachable URL (http://localhost:), which +# the A2A proxy rewrites to ws-:8000 when the platform runs inside Docker. +# No manual workspace-secret injection is needed here. + # Seed config.yaml directly into the named config volume so the provision (and # every later restart) has a config source. Create's byok-no-cred abort never # wrote it, and this dev stack ships no claude-code template in the platform's @@ -515,6 +520,25 @@ for _ in $(seq 1 10); do sleep 1 done if [ -n "$RUN" ]; then pass "container running: $RUN"; else fail "no running ws-${WSID} container within 10s of online" "docker ps shows none"; fi + +# #2851: fail fast if the workspace advertised an unresolvable/unreachable URL. +# The provisioner now makes the runtime advertise http://localhost:, +# which the platform stores as http://127.0.0.1:. The A2A proxy +# rewrites that to ws-:8000 when the platform runs inside Docker, so the +# URL stored in the registry should always be a host-reachable loopback address. +# (The end-to-end proxy reach in Step 5 is the real reachability proof; this +# assertion just surfaces hostname/DNS misconfiguration early.) +WS_URL_AFTER=$(ws_field "$WS" "url") +if [ -n "$WS_URL_AFTER" ]; then + pass "workspace registered a non-empty URL: $WS_URL_AFTER" +else + fail "workspace URL is empty after reaching online" "registry row has no url" +fi +if echo "$WS_URL_AFTER" | grep -qE '^https?://(127\.0\.0\.1|localhost):'; then + pass "workspace registered a host-reachable loopback URL" +else + fail "workspace URL is not a host-reachable loopback address" "url=$WS_URL_AFTER" +fi echo "" # ---------------------------------------------------------------------------- diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index 71d08a3ac..2992435a9 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -11,6 +11,7 @@ import ( "fmt" "io" "log" + "net" "os" "path/filepath" "runtime" @@ -520,6 +521,34 @@ func InternalURL(workspaceID string) string { return fmt.Sprintf("http://%s:%s", ContainerName(workspaceID), DefaultPort) } +// allocateHostPort binds a temporary TCP listener on 127.0.0.1:0 and returns +// the allocated ephemeral port. The listener is closed before returning, so +// the port is free for Docker to bind. This lets the provisioner advertise a +// stable, host-reachable workspace URL (http://localhost:) before the +// container exists, eliminating the race where the runtime registers its +// unresolvable Docker container-id hostname. +func allocateHostPort() (string, error) { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + return "", fmt.Errorf("failed to allocate ephemeral host port: %w", err) + } + defer l.Close() + addr, ok := l.Addr().(*net.TCPAddr) + if !ok { + return "", fmt.Errorf("unexpected listener address type %T", l.Addr()) + } + return strconv.Itoa(addr.Port), nil +} + +// workspaceAdvertiseURL returns the URL the runtime should advertise at +// register time. localhost is accepted by registry validateAgentURL; the +// handler layer then stores the equivalent http://127.0.0.1: URL and +// the A2A proxy rewrites it to ws-:8000 when the platform itself runs +// inside Docker. +func workspaceAdvertiseURL(hostPort string) string { + return fmt.Sprintf("http://localhost:%s", hostPort) +} + // Start provisions and starts a workspace container. func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, error) { if p == nil || p.cli == nil { @@ -530,8 +559,18 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e // already exists, so pre-deploy workspace data is not orphaned. configVolume := p.resolveConfigVolumeName(ctx, cfg.WorkspaceID) + // #2851: allocate a stable host port BEFORE building container env so the + // runtime can advertise a host-reachable URL. The alternative — letting + // Docker pick an ephemeral port and inspecting after start — leaves the + // runtime guessing its own address and registering an unresolvable + // container-id hostname. + hostPort, err := allocateHostPort() + if err != nil { + return "", err + } + // Create named volume for configs (idempotent — no-op if already exists) - _, err := p.cli.VolumeCreate(ctx, volume.CreateOptions{ + _, err = p.cli.VolumeCreate(ctx, volume.CreateOptions{ Name: configVolume, Labels: managedLabels(), }) @@ -540,7 +579,12 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e } log.Printf("Provisioner: config volume %s ready", configVolume) - env := buildContainerEnv(cfg) + // #2851: tell the runtime exactly which URL to advertise. localhost is + // accepted by registry validateAgentURL; the handler layer then stores the + // equivalent http://127.0.0.1: URL and the A2A proxy rewrites it to + // ws-:8000 when the platform itself runs inside Docker. + advertiseURL := workspaceAdvertiseURL(hostPort) + env := append(buildContainerEnv(cfg), fmt.Sprintf("MOLECULE_WORKSPACE_URL=%s", advertiseURL)) image, imgErr := selectImage(cfg) if imgErr != nil { @@ -646,7 +690,7 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e RestartPolicy: container.RestartPolicy{Name: "unless-stopped"}, PortBindings: nat.PortMap{ nat.Port(DefaultPort + "/tcp"): []nat.PortBinding{ - {HostIP: "127.0.0.1", HostPort: ""}, // Ephemeral host port + {HostIP: "127.0.0.1", HostPort: hostPort}, // Pre-allocated stable host port (#2851) }, }, } @@ -761,30 +805,34 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e // /configs and /workspace, then drops to agent via gosu). No per-start // chown needed here. - // Resolve the host-mapped port. Retry inspect up to 3 times if Docker hasn't - // bound the ephemeral port yet (rare race under heavy load). - hostURL := InternalURL(cfg.WorkspaceID) // fallback to Docker-internal + // #2851: use the pre-allocated host port directly. The inspect loop below + // is kept as a verification that Docker bound the expected port, but the + // stable value comes from allocateHostPort above. + hostURL := fmt.Sprintf("http://127.0.0.1:%s", hostPort) for attempt := 0; attempt < 3; attempt++ { info, inspectErr := p.cli.ContainerInspect(ctx, resp.ID) if inspectErr != nil { break } portBindings := info.NetworkSettings.Ports[nat.Port(DefaultPort+"/tcp")] - if len(portBindings) > 0 { - hostPort := portBindings[0].HostPort - hostIP := portBindings[0].HostIP - if hostIP == "" { - hostIP = "127.0.0.1" - } - hostURL = fmt.Sprintf("http://%s:%s", hostIP, hostPort) + if len(portBindings) > 0 && portBindings[0].HostPort == hostPort { break } if attempt < 2 { time.Sleep(500 * time.Millisecond) // wait for Docker to bind the port + } else { + log.Printf("Provisioner: container %s did not bind expected host port %s; falling back to inspect value", name, hostPort) + if len(portBindings) > 0 { + boundIP := portBindings[0].HostIP + if boundIP == "" { + boundIP = "127.0.0.1" + } + hostURL = fmt.Sprintf("http://%s:%s", boundIP, portBindings[0].HostPort) + } } } - log.Printf("Provisioner: started container %s for workspace %s at %s (internal: %s)", name, cfg.WorkspaceID, hostURL, InternalURL(cfg.WorkspaceID)) + log.Printf("Provisioner: started container %s for workspace %s at %s (advertise: %s, internal: %s)", name, cfg.WorkspaceID, hostURL, advertiseURL, InternalURL(cfg.WorkspaceID)) return hostURL, nil } diff --git a/workspace-server/internal/provisioner/provisioner_test.go b/workspace-server/internal/provisioner/provisioner_test.go index 5dde27420..071da2500 100644 --- a/workspace-server/internal/provisioner/provisioner_test.go +++ b/workspace-server/internal/provisioner/provisioner_test.go @@ -5,6 +5,7 @@ import ( "context" "errors" "io" + "net" "os" "path/filepath" "strconv" @@ -1633,3 +1634,44 @@ func TestApplyTierResources(t *testing.T) { }) } } + +// ---------- #2851 host-port advertisement ---------- + +func TestAllocateHostPort(t *testing.T) { + port, err := allocateHostPort() + if err != nil { + t.Fatalf("allocateHostPort failed: %v", err) + } + if port == "" { + t.Fatal("allocateHostPort returned empty port") + } + if port == "0" { + t.Fatalf("allocateHostPort returned port 0") + } + + // Verify the port is actually numeric and in the ephemeral range. + n, err := strconv.Atoi(port) + if err != nil { + t.Fatalf("allocateHostPort returned non-numeric port %q: %v", port, err) + } + if n < 1024 || n > 65535 { + t.Fatalf("allocateHostPort returned out-of-range port %d", n) + } + + // Verify the port is genuinely free for Docker to bind. (We don't assert + // uniqueness across two calls — the OS may immediately reuse a just-closed + // ephemeral port under heavy load.) + l, err := net.Listen("tcp", "127.0.0.1:"+port) + if err != nil { + t.Fatalf("allocateHostPort returned port %s that is not bindable: %v", port, err) + } + l.Close() +} + +func TestWorkspaceAdvertiseURL(t *testing.T) { + got := workspaceAdvertiseURL("12345") + want := "http://localhost:12345" + if got != want { + t.Errorf("workspaceAdvertiseURL = %q, want %q", got, want) + } +}