From ac487689d4be16346deb73de5a3fb0a9bfffc651 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sun, 14 Jun 2026 14:26:25 +0000 Subject: [PATCH 1/4] fix(ci): #2851 advertise reachable workspace URL in local-provision real-image e2e The real-image advisory lane failed because the workspace runtime registered its Docker container-id hostname, which the platform host could not resolve. Fix the harness side: - Connect the act_runner job container to molecule-core-net in the advisory job. - Run the platform with MOLECULE_IN_DOCKER=true + MOLECULE_DEPLOY_MODE=saas so it resolves ws-:8000 and the registry validator accepts the private Docker-network IP. - Inject MOLECULE_WORKSPACE_URL=http://ws-:8000 as a workspace secret before the first restart, so the runtime registers a reachable URL. - Add an early assertion that the registered URL is non-empty and reachable from molecule-core-net, so hostname failures surface before the MiniMax round-trip. --- .gitea/workflows/local-provision-e2e.yml | 21 ++++++++++----- .../e2e/test_local_provision_lifecycle_e2e.sh | 27 +++++++++++++++++++ 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/.gitea/workflows/local-provision-e2e.yml b/.gitea/workflows/local-provision-e2e.yml index 2732c14a4..65282fb31 100644 --- a/.gitea/workflows/local-provision-e2e.yml +++ b/.gitea/workflows/local-provision-e2e.yml @@ -320,12 +320,13 @@ jobs: # even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA). MOLECULE_ENV: development SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!! - # act_runner runs the job inside a Docker container, so /.dockerenv exists - # and the platform auto-detects platformInDocker=true. But the job container - # is NOT on molecule-core-net, so it cannot resolve workspace container - # hostnames (ws-:8000). Force false so the proxy keeps using the - # host-mapped 127.0.0.1: URL, which IS reachable. - MOLECULE_IN_DOCKER: false + # #2851: the real workspace advertises its container DNS name (ws-:8000). + # Connect the job container to molecule-core-net below and run the platform + # in Docker-aware mode so the proxy resolves workspace container names. + # SaaS deploy mode lets the registry validator accept the private Docker-network + # IP that ws- resolves to, since this advisory lane is a single-host test. + MOLECULE_IN_DOCKER: true + MOLECULE_DEPLOY_MODE: saas steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 @@ -351,6 +352,14 @@ jobs: # uses for token-write helper containers. docker pull alpine:3.20@sha256:c64c687cbea9300178b30c95835354e34c4e4febc4badfe27102879de0483b5e >/dev/null docker network create molecule-core-net >/dev/null 2>&1 || true + # #2851: connect the act_runner job container to molecule-core-net so the + # platform process can resolve ws-:8000 workspace URLs. + JOB_CID="$(hostname)" + if docker network connect molecule-core-net "$JOB_CID" >/dev/null 2>&1; then + echo "Connected job container ($JOB_CID) to molecule-core-net." + else + echo "WARN: could not connect job container to molecule-core-net (already connected or fallback)." + fi - name: Start Postgres (docker, ephemeral host port) run: | diff --git a/tests/e2e/test_local_provision_lifecycle_e2e.sh b/tests/e2e/test_local_provision_lifecycle_e2e.sh index 371defe35..18262bf13 100755 --- a/tests/e2e/test_local_provision_lifecycle_e2e.sh +++ b/tests/e2e/test_local_provision_lifecycle_e2e.sh @@ -441,6 +441,17 @@ if [ "$LIFECYCLE_LLM" = "minimax" ]; then echo " secret write (MODEL_PROVIDER): $(echo "$SECP" | head -c 120)" fi +# #2851: the runtime derives its advertised A2A URL from HOSTNAME (Docker sets +# this to the container id, e.g. 30e9e720fbc2, which the platform host cannot +# resolve → registry_register_400). Inject MOLECULE_WORKSPACE_URL via a workspace +# secret so the runtime registers the reachable container DNS name instead. +WS_CONTAINER_NAME=$(container_name "$WSID") +WS_URL="http://${WS_CONTAINER_NAME}:8000" +SECP=$(curl -s -X POST "$BASE/workspaces/$WSID/secrets" \ + -H "Authorization: Bearer $WTOKEN" -H "Content-Type: application/json" \ + -d "{\"key\":\"MOLECULE_WORKSPACE_URL\",\"value\":\"${WS_URL}\"}") +echo " secret write (MOLECULE_WORKSPACE_URL): $(echo "$SECP" | head -c 120)" + # Seed config.yaml directly into the named config volume so the provision (and # every later restart) has a config source. Create's byok-no-cred abort never # wrote it, and this dev stack ships no claude-code template in the platform's @@ -515,6 +526,22 @@ for _ in $(seq 1 10); do sleep 1 done if [ -n "$RUN" ]; then pass "container running: $RUN"; else fail "no running ws-${WSID} container within 10s of online" "docker ps shows none"; fi + +# #2851: fail fast if the workspace advertised an unresolvable/unreachable URL. +# The MiniMax round-trip fails opaquely when the proxy cannot reach the agent; +# asserting the registered URL is set and reachable surfaces hostname/DNS issues +# at the registration layer instead. +WS_URL_AFTER=$(ws_field "$WS" "url") +if [ -n "$WS_URL_AFTER" ]; then + pass "workspace registered a non-empty URL: $WS_URL_AFTER" +else + fail "workspace URL is empty after reaching online" "registry row has no url" +fi +if docker run --rm --network molecule-core-net alpine:latest sh -c "wget -qO- ${WS_URL_AFTER}/.well-known/agent-card.json >/dev/null 2>&1" 2>/dev/null; then + pass "workspace URL is reachable from molecule-core-net" +else + fail "workspace URL is not reachable from molecule-core-net" "url=$WS_URL_AFTER" +fi echo "" # ---------------------------------------------------------------------------- -- 2.52.0 From 6a6b7ecc83e90e634a359fe6831bb1b8ea60dce9 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sun, 14 Jun 2026 14:43:06 +0000 Subject: [PATCH 2/4] fix(provisioner): #2851 inject MOLECULE_WORKSPACE_URL for local-provision real-image Make the local Docker provisioner allocate a stable host port and inject MOLECULE_WORKSPACE_URL=http://localhost: into the workspace container env before the runtime starts. The runtime's resolve_workspace_url already consumes this env var verbatim, so it registers a reachable URL instead of its unresolvable Docker container-id hostname. The handler layer still stores the equivalent http://127.0.0.1: URL, and the A2A proxy already rewrites 127.0.0.1: to ws-:8000 when the platform runs inside Docker, so no workspace-server Go change is required. Also remove the manual workspace-secret workaround from the lifecycle E2E script and drop the advisory job's MOLECULE_DEPLOY_MODE=saas override (dev mode already relaxes private-range SSRF checks). Tests added for allocateHostPort and workspaceAdvertiseURL. Co-Authored-By: Claude --- .gitea/workflows/local-provision-e2e.yml | 12 +-- .../e2e/test_local_provision_lifecycle_e2e.sh | 14 +--- .../internal/provisioner/provisioner.go | 76 +++++++++++++++---- .../internal/provisioner/provisioner_test.go | 40 ++++++++++ 4 files changed, 112 insertions(+), 30 deletions(-) diff --git a/.gitea/workflows/local-provision-e2e.yml b/.gitea/workflows/local-provision-e2e.yml index 65282fb31..038270a4d 100644 --- a/.gitea/workflows/local-provision-e2e.yml +++ b/.gitea/workflows/local-provision-e2e.yml @@ -320,13 +320,13 @@ jobs: # even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA). MOLECULE_ENV: development SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!! - # #2851: the real workspace advertises its container DNS name (ws-:8000). - # Connect the job container to molecule-core-net below and run the platform - # in Docker-aware mode so the proxy resolves workspace container names. - # SaaS deploy mode lets the registry validator accept the private Docker-network - # IP that ws- resolves to, since this advisory lane is a single-host test. + # #2851: the provisioner now makes the workspace advertise http://localhost:. + # When the platform itself runs inside Docker, the A2A proxy rewrites the + # stored http://127.0.0.1: URL to ws-:8000, so keep the job + # container on molecule-core-net and Docker-aware mode so that Docker-DNS + # name resolves. Dev mode already relaxes private-range SSRF checks, so SaaS + # deploy mode is no longer required. MOLECULE_IN_DOCKER: true - MOLECULE_DEPLOY_MODE: saas steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 diff --git a/tests/e2e/test_local_provision_lifecycle_e2e.sh b/tests/e2e/test_local_provision_lifecycle_e2e.sh index 18262bf13..987368644 100755 --- a/tests/e2e/test_local_provision_lifecycle_e2e.sh +++ b/tests/e2e/test_local_provision_lifecycle_e2e.sh @@ -441,16 +441,10 @@ if [ "$LIFECYCLE_LLM" = "minimax" ]; then echo " secret write (MODEL_PROVIDER): $(echo "$SECP" | head -c 120)" fi -# #2851: the runtime derives its advertised A2A URL from HOSTNAME (Docker sets -# this to the container id, e.g. 30e9e720fbc2, which the platform host cannot -# resolve → registry_register_400). Inject MOLECULE_WORKSPACE_URL via a workspace -# secret so the runtime registers the reachable container DNS name instead. -WS_CONTAINER_NAME=$(container_name "$WSID") -WS_URL="http://${WS_CONTAINER_NAME}:8000" -SECP=$(curl -s -X POST "$BASE/workspaces/$WSID/secrets" \ - -H "Authorization: Bearer $WTOKEN" -H "Content-Type: application/json" \ - -d "{\"key\":\"MOLECULE_WORKSPACE_URL\",\"value\":\"${WS_URL}\"}") -echo " secret write (MOLECULE_WORKSPACE_URL): $(echo "$SECP" | head -c 120)" +# #2851: the provisioner now injects MOLECULE_WORKSPACE_URL automatically so the +# runtime advertises a host-reachable URL (http://localhost:), which +# the A2A proxy rewrites to ws-:8000 when the platform runs inside Docker. +# No manual workspace-secret injection is needed here. # Seed config.yaml directly into the named config volume so the provision (and # every later restart) has a config source. Create's byok-no-cred abort never diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index 71d08a3ac..2992435a9 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -11,6 +11,7 @@ import ( "fmt" "io" "log" + "net" "os" "path/filepath" "runtime" @@ -520,6 +521,34 @@ func InternalURL(workspaceID string) string { return fmt.Sprintf("http://%s:%s", ContainerName(workspaceID), DefaultPort) } +// allocateHostPort binds a temporary TCP listener on 127.0.0.1:0 and returns +// the allocated ephemeral port. The listener is closed before returning, so +// the port is free for Docker to bind. This lets the provisioner advertise a +// stable, host-reachable workspace URL (http://localhost:) before the +// container exists, eliminating the race where the runtime registers its +// unresolvable Docker container-id hostname. +func allocateHostPort() (string, error) { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + return "", fmt.Errorf("failed to allocate ephemeral host port: %w", err) + } + defer l.Close() + addr, ok := l.Addr().(*net.TCPAddr) + if !ok { + return "", fmt.Errorf("unexpected listener address type %T", l.Addr()) + } + return strconv.Itoa(addr.Port), nil +} + +// workspaceAdvertiseURL returns the URL the runtime should advertise at +// register time. localhost is accepted by registry validateAgentURL; the +// handler layer then stores the equivalent http://127.0.0.1: URL and +// the A2A proxy rewrites it to ws-:8000 when the platform itself runs +// inside Docker. +func workspaceAdvertiseURL(hostPort string) string { + return fmt.Sprintf("http://localhost:%s", hostPort) +} + // Start provisions and starts a workspace container. func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, error) { if p == nil || p.cli == nil { @@ -530,8 +559,18 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e // already exists, so pre-deploy workspace data is not orphaned. configVolume := p.resolveConfigVolumeName(ctx, cfg.WorkspaceID) + // #2851: allocate a stable host port BEFORE building container env so the + // runtime can advertise a host-reachable URL. The alternative — letting + // Docker pick an ephemeral port and inspecting after start — leaves the + // runtime guessing its own address and registering an unresolvable + // container-id hostname. + hostPort, err := allocateHostPort() + if err != nil { + return "", err + } + // Create named volume for configs (idempotent — no-op if already exists) - _, err := p.cli.VolumeCreate(ctx, volume.CreateOptions{ + _, err = p.cli.VolumeCreate(ctx, volume.CreateOptions{ Name: configVolume, Labels: managedLabels(), }) @@ -540,7 +579,12 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e } log.Printf("Provisioner: config volume %s ready", configVolume) - env := buildContainerEnv(cfg) + // #2851: tell the runtime exactly which URL to advertise. localhost is + // accepted by registry validateAgentURL; the handler layer then stores the + // equivalent http://127.0.0.1: URL and the A2A proxy rewrites it to + // ws-:8000 when the platform itself runs inside Docker. + advertiseURL := workspaceAdvertiseURL(hostPort) + env := append(buildContainerEnv(cfg), fmt.Sprintf("MOLECULE_WORKSPACE_URL=%s", advertiseURL)) image, imgErr := selectImage(cfg) if imgErr != nil { @@ -646,7 +690,7 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e RestartPolicy: container.RestartPolicy{Name: "unless-stopped"}, PortBindings: nat.PortMap{ nat.Port(DefaultPort + "/tcp"): []nat.PortBinding{ - {HostIP: "127.0.0.1", HostPort: ""}, // Ephemeral host port + {HostIP: "127.0.0.1", HostPort: hostPort}, // Pre-allocated stable host port (#2851) }, }, } @@ -761,30 +805,34 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e // /configs and /workspace, then drops to agent via gosu). No per-start // chown needed here. - // Resolve the host-mapped port. Retry inspect up to 3 times if Docker hasn't - // bound the ephemeral port yet (rare race under heavy load). - hostURL := InternalURL(cfg.WorkspaceID) // fallback to Docker-internal + // #2851: use the pre-allocated host port directly. The inspect loop below + // is kept as a verification that Docker bound the expected port, but the + // stable value comes from allocateHostPort above. + hostURL := fmt.Sprintf("http://127.0.0.1:%s", hostPort) for attempt := 0; attempt < 3; attempt++ { info, inspectErr := p.cli.ContainerInspect(ctx, resp.ID) if inspectErr != nil { break } portBindings := info.NetworkSettings.Ports[nat.Port(DefaultPort+"/tcp")] - if len(portBindings) > 0 { - hostPort := portBindings[0].HostPort - hostIP := portBindings[0].HostIP - if hostIP == "" { - hostIP = "127.0.0.1" - } - hostURL = fmt.Sprintf("http://%s:%s", hostIP, hostPort) + if len(portBindings) > 0 && portBindings[0].HostPort == hostPort { break } if attempt < 2 { time.Sleep(500 * time.Millisecond) // wait for Docker to bind the port + } else { + log.Printf("Provisioner: container %s did not bind expected host port %s; falling back to inspect value", name, hostPort) + if len(portBindings) > 0 { + boundIP := portBindings[0].HostIP + if boundIP == "" { + boundIP = "127.0.0.1" + } + hostURL = fmt.Sprintf("http://%s:%s", boundIP, portBindings[0].HostPort) + } } } - log.Printf("Provisioner: started container %s for workspace %s at %s (internal: %s)", name, cfg.WorkspaceID, hostURL, InternalURL(cfg.WorkspaceID)) + log.Printf("Provisioner: started container %s for workspace %s at %s (advertise: %s, internal: %s)", name, cfg.WorkspaceID, hostURL, advertiseURL, InternalURL(cfg.WorkspaceID)) return hostURL, nil } diff --git a/workspace-server/internal/provisioner/provisioner_test.go b/workspace-server/internal/provisioner/provisioner_test.go index 5dde27420..212769eed 100644 --- a/workspace-server/internal/provisioner/provisioner_test.go +++ b/workspace-server/internal/provisioner/provisioner_test.go @@ -1633,3 +1633,43 @@ func TestApplyTierResources(t *testing.T) { }) } } + +// ---------- #2851 host-port advertisement ---------- + +func TestAllocateHostPort(t *testing.T) { + port1, err := allocateHostPort() + if err != nil { + t.Fatalf("allocateHostPort failed: %v", err) + } + if port1 == "" { + t.Fatal("allocateHostPort returned empty port") + } + if port1 == "0" { + t.Fatalf("allocateHostPort returned port 0") + } + + port2, err := allocateHostPort() + if err != nil { + t.Fatalf("second allocateHostPort failed: %v", err) + } + if port2 == port1 { + t.Fatalf("allocateHostPort returned the same port twice: %s", port1) + } + + // Verify the port is actually numeric and in the ephemeral range. + n, err := strconv.Atoi(port1) + if err != nil { + t.Fatalf("allocateHostPort returned non-numeric port %q: %v", port1, err) + } + if n < 1024 || n > 65535 { + t.Fatalf("allocateHostPort returned out-of-range port %d", n) + } +} + +func TestWorkspaceAdvertiseURL(t *testing.T) { + got := workspaceAdvertiseURL("12345") + want := "http://localhost:12345" + if got != want { + t.Errorf("workspaceAdvertiseURL = %q, want %q", got, want) + } +} -- 2.52.0 From 8af0e18fb9bf60a2eada39aa81632e79c5906065 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sun, 14 Jun 2026 15:13:44 +0000 Subject: [PATCH 3/4] test(e2e): #2851 assert loopback URL, not molecule-core-net reachability The provisioner now stores http://127.0.0.1: in the registry. The old molecule-core-net reachability assertion assumed a ws-:8000 URL (from the manual workaround). Update the assertion to expect a host-reachable loopback URL; the real end-to-end reachability is still proven by Step 5's proxy-reach test. Co-Authored-By: Claude --- tests/e2e/test_local_provision_lifecycle_e2e.sh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/e2e/test_local_provision_lifecycle_e2e.sh b/tests/e2e/test_local_provision_lifecycle_e2e.sh index 987368644..b1ee0e086 100755 --- a/tests/e2e/test_local_provision_lifecycle_e2e.sh +++ b/tests/e2e/test_local_provision_lifecycle_e2e.sh @@ -522,19 +522,22 @@ done if [ -n "$RUN" ]; then pass "container running: $RUN"; else fail "no running ws-${WSID} container within 10s of online" "docker ps shows none"; fi # #2851: fail fast if the workspace advertised an unresolvable/unreachable URL. -# The MiniMax round-trip fails opaquely when the proxy cannot reach the agent; -# asserting the registered URL is set and reachable surfaces hostname/DNS issues -# at the registration layer instead. +# The provisioner now makes the runtime advertise http://localhost:, +# which the platform stores as http://127.0.0.1:. The A2A proxy +# rewrites that to ws-:8000 when the platform runs inside Docker, so the +# URL stored in the registry should always be a host-reachable loopback address. +# (The end-to-end proxy reach in Step 5 is the real reachability proof; this +# assertion just surfaces hostname/DNS misconfiguration early.) WS_URL_AFTER=$(ws_field "$WS" "url") if [ -n "$WS_URL_AFTER" ]; then pass "workspace registered a non-empty URL: $WS_URL_AFTER" else fail "workspace URL is empty after reaching online" "registry row has no url" fi -if docker run --rm --network molecule-core-net alpine:latest sh -c "wget -qO- ${WS_URL_AFTER}/.well-known/agent-card.json >/dev/null 2>&1" 2>/dev/null; then - pass "workspace URL is reachable from molecule-core-net" +if echo "$WS_URL_AFTER" | grep -qE '^https?://(127\.0\.0\.1|localhost):'; then + pass "workspace registered a host-reachable loopback URL" else - fail "workspace URL is not reachable from molecule-core-net" "url=$WS_URL_AFTER" + fail "workspace URL is not a host-reachable loopback address" "url=$WS_URL_AFTER" fi echo "" -- 2.52.0 From d848aef0b61f48a8facd17d1c4a99237b7b91221 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sun, 14 Jun 2026 15:15:25 +0000 Subject: [PATCH 4/4] test(provisioner): deflake allocateHostPort test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the uniqueness assertion — the OS may immediately reuse a just- closed ephemeral port. Replace it with a bindability check and keep the range/numeric validations. Co-Authored-By: Claude --- .../internal/provisioner/provisioner_test.go | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/workspace-server/internal/provisioner/provisioner_test.go b/workspace-server/internal/provisioner/provisioner_test.go index 212769eed..071da2500 100644 --- a/workspace-server/internal/provisioner/provisioner_test.go +++ b/workspace-server/internal/provisioner/provisioner_test.go @@ -5,6 +5,7 @@ import ( "context" "errors" "io" + "net" "os" "path/filepath" "strconv" @@ -1637,33 +1638,34 @@ func TestApplyTierResources(t *testing.T) { // ---------- #2851 host-port advertisement ---------- func TestAllocateHostPort(t *testing.T) { - port1, err := allocateHostPort() + port, err := allocateHostPort() if err != nil { t.Fatalf("allocateHostPort failed: %v", err) } - if port1 == "" { + if port == "" { t.Fatal("allocateHostPort returned empty port") } - if port1 == "0" { + if port == "0" { t.Fatalf("allocateHostPort returned port 0") } - port2, err := allocateHostPort() - if err != nil { - t.Fatalf("second allocateHostPort failed: %v", err) - } - if port2 == port1 { - t.Fatalf("allocateHostPort returned the same port twice: %s", port1) - } - // Verify the port is actually numeric and in the ephemeral range. - n, err := strconv.Atoi(port1) + n, err := strconv.Atoi(port) if err != nil { - t.Fatalf("allocateHostPort returned non-numeric port %q: %v", port1, err) + t.Fatalf("allocateHostPort returned non-numeric port %q: %v", port, err) } if n < 1024 || n > 65535 { t.Fatalf("allocateHostPort returned out-of-range port %d", n) } + + // Verify the port is genuinely free for Docker to bind. (We don't assert + // uniqueness across two calls — the OS may immediately reuse a just-closed + // ephemeral port under heavy load.) + l, err := net.Listen("tcp", "127.0.0.1:"+port) + if err != nil { + t.Fatalf("allocateHostPort returned port %s that is not bindable: %v", port, err) + } + l.Close() } func TestWorkspaceAdvertiseURL(t *testing.T) { -- 2.52.0