fix(provisioner): #2851 inject MOLECULE_WORKSPACE_URL for local-provision real-image #2854

Merged
devops-engineer merged 4 commits from fix/2851-local-provision-real-image-hostname into main 2026-06-14 15:20:18 +00:00
4 changed files with 143 additions and 20 deletions
+15 -6
View File
@@ -320,12 +320,13 @@ jobs:
# even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA).
MOLECULE_ENV: development
SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!!
# act_runner runs the job inside a Docker container, so /.dockerenv exists
# and the platform auto-detects platformInDocker=true. But the job container
# is NOT on molecule-core-net, so it cannot resolve workspace container
# hostnames (ws-<id>:8000). Force false so the proxy keeps using the
# host-mapped 127.0.0.1:<ephemeral_port> URL, which IS reachable.
MOLECULE_IN_DOCKER: false
# #2851: the provisioner now makes the workspace advertise http://localhost:<host-port>.
# When the platform itself runs inside Docker, the A2A proxy rewrites the
# stored http://127.0.0.1:<host-port> URL to ws-<id>:8000, so keep the job
# container on molecule-core-net and in Docker-aware mode so that the Docker-DNS
# name resolves. Dev mode already relaxes private-range SSRF checks, so SaaS
# deploy mode is no longer required.
MOLECULE_IN_DOCKER: true
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
@@ -351,6 +352,14 @@ jobs:
# uses for token-write helper containers.
docker pull alpine:3.20@sha256:c64c687cbea9300178b30c95835354e34c4e4febc4badfe27102879de0483b5e >/dev/null
docker network create molecule-core-net >/dev/null 2>&1 || true
# #2851: connect the act_runner job container to molecule-core-net so the
# platform process can resolve ws-<id>:8000 workspace URLs.
JOB_CID="$(hostname)"
if docker network connect molecule-core-net "$JOB_CID" >/dev/null 2>&1; then
echo "Connected job container ($JOB_CID) to molecule-core-net."
else
echo "WARN: could not connect job container to molecule-core-net (already connected or fallback)."
fi
- name: Start Postgres (docker, ephemeral host port)
run: |
@@ -441,6 +441,11 @@ if [ "$LIFECYCLE_LLM" = "minimax" ]; then
echo " secret write (MODEL_PROVIDER): $(echo "$SECP" | head -c 120)"
fi
# #2851: the provisioner now injects MOLECULE_WORKSPACE_URL automatically so the
# runtime advertises a host-reachable URL (http://localhost:<host-port>), which
# the A2A proxy rewrites to ws-<id>:8000 when the platform runs inside Docker.
# No manual workspace-secret injection is needed here.
# Seed config.yaml directly into the named config volume so the provision (and
# every later restart) has a config source. Create's byok-no-cred abort never
# wrote it, and this dev stack ships no claude-code template in the platform's
@@ -515,6 +520,25 @@ for _ in $(seq 1 10); do
sleep 1
done
if [ -n "$RUN" ]; then pass "container running: $RUN"; else fail "no running ws-${WSID} container within 10s of online" "docker ps shows none"; fi
# #2851: fail fast if the workspace advertised an unresolvable/unreachable URL.
# The provisioner now makes the runtime advertise http://localhost:<host-port>,
# which the platform stores as http://127.0.0.1:<host-port>. The A2A proxy
# rewrites that to ws-<id>:8000 when the platform runs inside Docker, so the
# URL stored in the registry should always be a host-reachable loopback address.
# (The end-to-end proxy reach in Step 5 is the real reachability proof; this
# assertion just surfaces hostname/DNS misconfiguration early.)
# Read the "url" field out of the workspace record held in $WS.
WS_URL_AFTER=$(ws_field "$WS" "url")
# Check 1: the registry row must carry some URL at all.
if [ -n "$WS_URL_AFTER" ]; then
pass "workspace registered a non-empty URL: $WS_URL_AFTER"
else
fail "workspace URL is empty after reaching online" "registry row has no url"
fi
# Check 2: the URL must start with http:// or https://, followed by 127.0.0.1 or
# localhost and then a ':' (i.e. an explicit port). Note that an empty URL fails
# this check as well as Check 1, so that case reports two failures.
if echo "$WS_URL_AFTER" | grep -qE '^https?://(127\.0\.0\.1|localhost):'; then
pass "workspace registered a host-reachable loopback URL"
else
fail "workspace URL is not a host-reachable loopback address" "url=$WS_URL_AFTER"
fi
echo ""
# ----------------------------------------------------------------------------
@@ -11,6 +11,7 @@ import (
"fmt"
"io"
"log"
"net"
"os"
"path/filepath"
"runtime"
@@ -520,6 +521,34 @@ func InternalURL(workspaceID string) string {
return fmt.Sprintf("http://%s:%s", ContainerName(workspaceID), DefaultPort)
}
// allocateHostPort asks the OS for a free TCP port on the loopback interface
// by listening on 127.0.0.1:0, and returns that port number as a decimal
// string. The probe listener is closed on return, so the port is released
// again and can be handed to Docker as the container's host port. Knowing the
// port up front lets the provisioner advertise a host-reachable workspace URL
// (http://localhost:<port>) before the container exists.
//
// Nothing holds the port between this function returning and the caller
// binding it, so another process may claim it in the interim.
//
// An error is returned if the listen fails or if the listener reports an
// address that is not a *net.TCPAddr.
func allocateHostPort() (string, error) {
	listener, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		return "", fmt.Errorf("failed to allocate ephemeral host port: %w", err)
	}
	// Release the port on every return path; the caller only needs the number.
	defer listener.Close()

	switch bound := listener.Addr().(type) {
	case *net.TCPAddr:
		return strconv.Itoa(bound.Port), nil
	default:
		return "", fmt.Errorf("unexpected listener address type %T", bound)
	}
}
// workspaceAdvertiseURL builds the URL the runtime is told to advertise when
// it registers: http://localhost:<hostPort>. hostPort is used verbatim; no
// validation is performed here.
//
// Per the #2851 design notes, a localhost URL is accepted by the registry's
// validateAgentURL, the handler layer stores the equivalent
// http://127.0.0.1:<port> form, and the A2A proxy rewrites it to ws-<id>:8000
// when the platform itself runs inside Docker.
func workspaceAdvertiseURL(hostPort string) string {
	const advertiseHost = "localhost"
	return fmt.Sprintf("http://%s:%s", advertiseHost, hostPort)
}
// Start provisions and starts a workspace container.
func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, error) {
if p == nil || p.cli == nil {
@@ -530,8 +559,18 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
// already exists, so pre-deploy workspace data is not orphaned.
configVolume := p.resolveConfigVolumeName(ctx, cfg.WorkspaceID)
// #2851: allocate a stable host port BEFORE building container env so the
// runtime can advertise a host-reachable URL. The alternative — letting
// Docker pick an ephemeral port and inspecting after start — leaves the
// runtime guessing its own address and registering an unresolvable
// container-id hostname.
hostPort, err := allocateHostPort()
if err != nil {
return "", err
}
// Create named volume for configs (idempotent — no-op if already exists)
_, err := p.cli.VolumeCreate(ctx, volume.CreateOptions{
_, err = p.cli.VolumeCreate(ctx, volume.CreateOptions{
Name: configVolume,
Labels: managedLabels(),
})
@@ -540,7 +579,12 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
}
log.Printf("Provisioner: config volume %s ready", configVolume)
env := buildContainerEnv(cfg)
// #2851: tell the runtime exactly which URL to advertise. localhost is
// accepted by registry validateAgentURL; the handler layer then stores the
// equivalent http://127.0.0.1:<port> URL and the A2A proxy rewrites it to
// ws-<id>:8000 when the platform itself runs inside Docker.
advertiseURL := workspaceAdvertiseURL(hostPort)
env := append(buildContainerEnv(cfg), fmt.Sprintf("MOLECULE_WORKSPACE_URL=%s", advertiseURL))
image, imgErr := selectImage(cfg)
if imgErr != nil {
@@ -646,7 +690,7 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
RestartPolicy: container.RestartPolicy{Name: "unless-stopped"},
PortBindings: nat.PortMap{
nat.Port(DefaultPort + "/tcp"): []nat.PortBinding{
{HostIP: "127.0.0.1", HostPort: ""}, // Ephemeral host port
{HostIP: "127.0.0.1", HostPort: hostPort}, // Pre-allocated stable host port (#2851)
},
},
}
@@ -761,30 +805,34 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
// /configs and /workspace, then drops to agent via gosu). No per-start
// chown needed here.
// Resolve the host-mapped port. Retry inspect up to 3 times if Docker hasn't
// bound the ephemeral port yet (rare race under heavy load).
hostURL := InternalURL(cfg.WorkspaceID) // fallback to Docker-internal
// #2851: use the pre-allocated host port directly. The inspect loop below
// is kept as a verification that Docker bound the expected port, but the
// stable value comes from allocateHostPort above.
hostURL := fmt.Sprintf("http://127.0.0.1:%s", hostPort)
for attempt := 0; attempt < 3; attempt++ {
info, inspectErr := p.cli.ContainerInspect(ctx, resp.ID)
if inspectErr != nil {
break
}
portBindings := info.NetworkSettings.Ports[nat.Port(DefaultPort+"/tcp")]
if len(portBindings) > 0 {
hostPort := portBindings[0].HostPort
hostIP := portBindings[0].HostIP
if hostIP == "" {
hostIP = "127.0.0.1"
}
hostURL = fmt.Sprintf("http://%s:%s", hostIP, hostPort)
if len(portBindings) > 0 && portBindings[0].HostPort == hostPort {
break
}
if attempt < 2 {
time.Sleep(500 * time.Millisecond) // wait for Docker to bind the port
} else {
log.Printf("Provisioner: container %s did not bind expected host port %s; falling back to inspect value", name, hostPort)
if len(portBindings) > 0 {
boundIP := portBindings[0].HostIP
if boundIP == "" {
boundIP = "127.0.0.1"
}
hostURL = fmt.Sprintf("http://%s:%s", boundIP, portBindings[0].HostPort)
}
}
}
log.Printf("Provisioner: started container %s for workspace %s at %s (internal: %s)", name, cfg.WorkspaceID, hostURL, InternalURL(cfg.WorkspaceID))
log.Printf("Provisioner: started container %s for workspace %s at %s (advertise: %s, internal: %s)", name, cfg.WorkspaceID, hostURL, advertiseURL, InternalURL(cfg.WorkspaceID))
return hostURL, nil
}
@@ -5,6 +5,7 @@ import (
"context"
"errors"
"io"
"net"
"os"
"path/filepath"
"strconv"
@@ -1633,3 +1634,44 @@ func TestApplyTierResources(t *testing.T) {
})
}
}
// ---------- #2851 host-port advertisement ----------
// TestAllocateHostPort checks that allocateHostPort yields a usable port: a
// non-empty, non-zero decimal string in the non-privileged range 1024-65535
// that can be bound again on 127.0.0.1 once the helper has returned.
func TestAllocateHostPort(t *testing.T) {
	port, err := allocateHostPort()
	if err != nil {
		t.Fatalf("allocateHostPort failed: %v", err)
	}
	switch port {
	case "":
		t.Fatal("allocateHostPort returned empty port")
	case "0":
		t.Fatalf("allocateHostPort returned port 0")
	}

	// The port must parse as an integer and sit above the privileged range.
	portNum, convErr := strconv.Atoi(port)
	if convErr != nil {
		t.Fatalf("allocateHostPort returned non-numeric port %q: %v", port, convErr)
	}
	if !(portNum >= 1024 && portNum <= 65535) {
		t.Fatalf("allocateHostPort returned out-of-range port %d", portNum)
	}

	// The helper must have released the port so Docker can bind it. Uniqueness
	// across two calls is deliberately not asserted: the OS may hand a
	// just-closed ephemeral port straight back out under heavy load.
	probe, listenErr := net.Listen("tcp", net.JoinHostPort("127.0.0.1", port))
	if listenErr != nil {
		t.Fatalf("allocateHostPort returned port %s that is not bindable: %v", port, listenErr)
	}
	probe.Close()
}
// TestWorkspaceAdvertiseURL pins the advertised URL format: the given host
// port appended to http://localhost:.
func TestWorkspaceAdvertiseURL(t *testing.T) {
	const want = "http://localhost:12345"
	if got := workspaceAdvertiseURL("12345"); got != want {
		t.Errorf("workspaceAdvertiseURL = %q, want %q", got, want)
	}
}