diff --git a/.gitea/workflows/local-provision-e2e.yml b/.gitea/workflows/local-provision-e2e.yml index d927f9b71..e861b353d 100644 --- a/.gitea/workflows/local-provision-e2e.yml +++ b/.gitea/workflows/local-provision-e2e.yml @@ -324,13 +324,13 @@ jobs: # even if the runner's $GITHUB_ENV propagation is flaky (#2468 RCA). MOLECULE_ENV: development SECRETS_ENCRYPTION_KEY: lpe2e-test-encryption-key-32bytes!! - # #2851: the provisioner now makes the workspace advertise http://localhost:. - # When the platform itself runs inside Docker, the A2A proxy rewrites the - # stored http://127.0.0.1: URL to ws-:8000, so keep the job - # container on molecule-core-net and Docker-aware mode so that Docker-DNS - # name resolves. Dev mode already relaxes private-range SSRF checks, so SaaS - # deploy mode is no longer required. - MOLECULE_IN_DOCKER: true + # #2851 fix follow-up: act_runner executes this job container in host-network + # mode, so the platform process shares the host's loopback. The A2A proxy + # must therefore keep the stored http://127.0.0.1: URL as-is; + # enabling Docker-aware rewrite would synthesize ws-:8000, which the + # host-network platform cannot resolve. Leave MOLECULE_IN_DOCKER=false so + # the proxy uses the host-mapped port directly (same as lifecycle-stub). + MOLECULE_IN_DOCKER: false steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5 @@ -356,14 +356,10 @@ jobs: # uses for token-write helper containers. docker pull alpine:3.20@sha256:c64c687cbea9300178b30c95835354e34c4e4febc4badfe27102879de0483b5e >/dev/null docker network create molecule-core-net >/dev/null 2>&1 || true - # #2851: connect the act_runner job container to molecule-core-net so the - # platform process can resolve ws-:8000 workspace URLs. - JOB_CID="$(hostname)" - if docker network connect molecule-core-net "$JOB_CID" >/dev/null 2>&1; then - echo "Connected job container ($JOB_CID) to molecule-core-net." - else - echo "WARN: could not connect job container to molecule-core-net (already connected or fallback)." - fi + # Workspace containers join molecule-core-net. The act_runner job + # container runs host-network, so connecting it to the bridge is not + # possible, but the bridge itself must exist for sibling workspace + # containers to communicate. - name: Start Postgres (docker, ephemeral host port) run: | @@ -424,6 +420,11 @@ jobs: fi echo "PLATFORM_HOST_IP=${PLATFORM_HOST_IP}" echo "PLATFORM_URL=http://${PLATFORM_HOST_IP}:${PORT}" >> "$GITHUB_ENV" + # The workspace advertises localhost: (the default when + # MOLECULE_WORKSPACE_ADVERTISE_HOST is unset). Because this job runs + # host-network, the platform reaches the container via 127.0.0.1; no + # gateway-IP advertisement is needed here and keeping it unset lets + # /registry/register accept the localhost URL. T="lpe2e-real-admin-${{ github.run_id }}-${{ github.run_attempt }}" echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV" echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV" diff --git a/tests/e2e/stub-runtime/server.py b/tests/e2e/stub-runtime/server.py index 0081cb5d0..81275ae90 100644 --- a/tests/e2e/stub-runtime/server.py +++ b/tests/e2e/stub-runtime/server.py @@ -83,11 +83,15 @@ HOSTNAME = os.environ.get("HOSTNAME", "").strip() # docker sets this to the con # address (0.0.0.0:8000, reachable as ws-:8000 on the bridge) is what # the proxy actually hits — independent of the URL string we register. # -# Net: register a name-form localhost URL purely to satisfy push-mode's -# "url required + must pass SSRF check" and to get our auth_token. Routing is -# handled by the provisioner-stored 127.0.0.1 URL + the proxy rewrite. +# Net: register a host-reachable localhost URL. The provisioner injects the +# correct host-mapped port via MOLECULE_WORKSPACE_URL (#2851); prefer that and +# fall back to the legacy STUB_REGISTER_URL or the internal listen port. _short = WORKSPACE_ID[:12] if len(WORKSPACE_ID) > 12 else WORKSPACE_ID -SELF_URL = os.environ.get("STUB_REGISTER_URL", f"http://localhost:{PORT}") +SELF_URL = ( + os.environ.get("STUB_REGISTER_URL") + or os.environ.get("MOLECULE_WORKSPACE_URL") + or f"http://localhost:{PORT}" +) CONFIG_PATH = (os.environ.get("WORKSPACE_CONFIG_PATH") or "/configs").rstrip("/") AUTH_TOKEN_FILE = f"{CONFIG_PATH}/.auth_token" diff --git a/tests/e2e/test_local_provision_lifecycle_e2e.sh b/tests/e2e/test_local_provision_lifecycle_e2e.sh index b1ee0e086..9896da669 100755 --- a/tests/e2e/test_local_provision_lifecycle_e2e.sh +++ b/tests/e2e/test_local_provision_lifecycle_e2e.sh @@ -522,10 +522,12 @@ done if [ -n "$RUN" ]; then pass "container running: $RUN"; else fail "no running ws-${WSID} container within 10s of online" "docker ps shows none"; fi # #2851: fail fast if the workspace advertised an unresolvable/unreachable URL. -# The provisioner now makes the runtime advertise http://localhost:, -# which the platform stores as http://127.0.0.1:. The A2A proxy -# rewrites that to ws-:8000 when the platform runs inside Docker, so the -# URL stored in the registry should always be a host-reachable loopback address. +# The provisioner now makes the runtime advertise http://localhost: +# by default, which the platform stores as http://127.0.0.1:. The A2A +# proxy rewrites that to ws-:8000 when the platform runs inside Docker, so +# the URL stored in the registry should always be a host-reachable address. +# When the platform itself is containerized, MOLECULE_WORKSPACE_ADVERTISE_HOST +# points the runtime at the Docker gateway IP instead; accept that too. # (The end-to-end proxy reach in Step 5 is the real reachability proof; this # assertion just surfaces hostname/DNS misconfiguration early.) WS_URL_AFTER=$(ws_field "$WS" "url") @@ -534,10 +536,13 @@ if [ -n "$WS_URL_AFTER" ]; then else fail "workspace URL is empty after reaching online" "registry row has no url" fi -if echo "$WS_URL_AFTER" | grep -qE '^https?://(127\.0\.0\.1|localhost):'; then - pass "workspace registered a host-reachable loopback URL" +URL_HOST_AFTER="${WS_URL_AFTER#http://}" +URL_HOST_AFTER="${URL_HOST_AFTER#https://}" +URL_HOST_AFTER="${URL_HOST_AFTER%%:*}" +if [ "$URL_HOST_AFTER" = "127.0.0.1" ] || [ "$URL_HOST_AFTER" = "localhost" ] || [ "$URL_HOST_AFTER" = "${PLATFORM_HOST_IP:-}" ]; then + pass "workspace registered a host-reachable URL (host=$URL_HOST_AFTER)" else - fail "workspace URL is not a host-reachable loopback address" "url=$WS_URL_AFTER" + fail "workspace URL is not a host-reachable address" "url=$WS_URL_AFTER expected localhost/127.0.0.1 or PLATFORM_HOST_IP=${PLATFORM_HOST_IP:-}" fi echo "" diff --git a/workspace-server/internal/handlers/registry.go b/workspace-server/internal/handlers/registry.go index 44068c972..740ed4cf8 100644 --- a/workspace-server/internal/handlers/registry.go +++ b/workspace-server/internal/handlers/registry.go @@ -10,6 +10,7 @@ import ( "net/http" "net/url" "os" + "strconv" "strings" "sync" "time" @@ -21,6 +22,30 @@ import ( "github.com/gin-gonic/gin" ) +// isProvisionerHostPortURL reports whether u is a loopback URL with a port +// other than 8000. Such URLs are written by the provisioner (via +// MOLECULE_WORKSPACE_URL) and should be preserved across runtime registrations. +// The 8000 fallback is the runtime's own default and is allowed to be +// overwritten. +func isProvisionerHostPortURL(u string) bool { + if u == "" { + return false + } + if !strings.HasPrefix(u, "http://127.0.0.1:") && !strings.HasPrefix(u, "http://localhost:") { + return false + } + // Extract the trailing port. + port := u[strings.LastIndex(u, ":")+1:] + if port == "" { + return false + } + n, err := strconv.Atoi(port) + if err != nil { + return false + } + return n != 8000 +} + // blockedRange is a named CIDR block so the conditional blocklist in // validateAgentURL reads as a slice of homogeneous values instead of // repeated anonymous struct literals. @@ -578,8 +603,33 @@ func (h *RegistryHandler) Register(c *gin.Context) { INSERT INTO workspaces (id, name, url, agent_card, status, last_heartbeat_at, delivery_mode, kind) VALUES ($1, $2, $3, $4::jsonb, 'online', now(), $5, COALESCE(NULLIF($6, ''), 'workspace')) ON CONFLICT (id) DO UPDATE SET + -- Preserve the provisioner-set host-port URL. The provisioner + -- injects MOLECULE_WORKSPACE_URL= into the container + -- env (buildStartWorkspaceEnv in workspace-server), so the + -- runtime should register that same URL. The runtime's + -- resolve_workspace_url honors MOLECULE_WORKSPACE_URL at highest + -- precedence, so when the env propagation is correct, the + -- runtime's URL == provisioner's URL. When env propagation is + -- broken (real-image lifecycle E2E gap that bit 3 rounds + -- running), the runtime falls back to http://HOSTNAME:8000 + -- — the port 8000 makes it distinguishable from the + -- provisioner's host-port (typically >30000). Preserve the + -- provisioner's URL when its port != 8000. + -- + -- Researcher #11798: round-3 fix changed the provisioner + -- from http://127.0.0.1 to http://localhost (the + -- workspaceAdvertiseURL default), but the Register handler + -- only matched the legacy 127.0.0.1 prefix, so the upsert + -- overwrote the provisioner's URL with the runtime's 8000 + -- fallback. Generalize to match any host-prefixed + -- host-port URL whose port != 8000. url = CASE - WHEN workspaces.url LIKE 'http://127.0.0.1%' THEN workspaces.url + WHEN workspaces.url IS NOT NULL + AND workspaces.url != '' + AND (workspaces.url LIKE 'http://127.0.0.1:%' + OR workspaces.url LIKE 'http://localhost:%') + AND CAST(substring(workspaces.url FROM ':([0-9]+)$') AS int) <> 8000 + THEN workspaces.url ELSE EXCLUDED.url END, agent_card = EXCLUDED.agent_card, @@ -617,7 +667,7 @@ func (h *RegistryHandler) Register(c *gin.Context) { cachedURL := effectiveURL var dbURL string if err := db.DB.QueryRowContext(ctx, `SELECT url FROM workspaces WHERE id = $1`, payload.ID).Scan(&dbURL); err == nil { - if strings.HasPrefix(dbURL, "http://127.0.0.1") { + if isProvisionerHostPortURL(dbURL) { cachedURL = dbURL } } diff --git a/workspace-server/internal/handlers/registry_auth_integration_test.go b/workspace-server/internal/handlers/registry_auth_integration_test.go index 17f45d74f..59b071d37 100644 --- a/workspace-server/internal/handlers/registry_auth_integration_test.go +++ b/workspace-server/internal/handlers/registry_auth_integration_test.go @@ -157,6 +157,44 @@ func statusOf(t *testing.T, conn *sql.DB, id string) string { return status } +// insertWorkspaceWithURL creates a workspace row with an explicit URL +// (the provisioner-set host-port URL the CASE-preservation tests need +// pre-populated). Returns the DB-generated UUID. Mirrors insertWorkspace +// but additionally writes the `url` column on insert. +func insertWorkspaceWithURL(t *testing.T, conn *sql.DB, name, status, url string) string { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + var id string + err := conn.QueryRowContext(ctx, ` + INSERT INTO workspaces (name, status, parent_id, delivery_mode, url) + VALUES ($1, $2, NULL, 'push', NULLIF($3, '')) + RETURNING id + `, name, status, url).Scan(&id) + if err != nil { + t.Fatalf("insertWorkspaceWithURL(%s): %v", name, err) + } + return id +} + +// urlOf reads a workspace row's current url, failing the test if the row +// is gone or the url is NULL. Used by the CASE-preservation tests to +// verify the upsert preserved (or did not preserve) the expected URL. +func urlOf(t *testing.T, conn *sql.DB, id string) string { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + var url sql.NullString + err := conn.QueryRowContext(ctx, `SELECT url FROM workspaces WHERE id = $1`, id).Scan(&url) + if err != nil { + t.Fatalf("urlOf(%s): %v", id, err) + } + if !url.Valid { + t.Fatalf("urlOf(%s): row has NULL url", id) + } + return url.String +} + // --------------------------------------------------------------------------- // 1 — registry register/heartbeat row-state: the #73 tombstone guard. // @@ -176,8 +214,25 @@ const registerUpsertSQL = ` INSERT INTO workspaces (id, name, url, agent_card, status, last_heartbeat_at, delivery_mode) VALUES ($1, $2, $3, $4::jsonb, 'online', now(), $5) ON CONFLICT (id) DO UPDATE SET + -- Preserve the provisioner-set host-port URL. The provisioner + -- injects MOLECULE_WORKSPACE_URL= into the container + -- env (buildStartWorkspaceEnv in workspace-server), so the + -- runtime should register that same URL. The runtime's + -- resolve_workspace_url honors MOLECULE_WORKSPACE_URL at highest + -- precedence, so when the env propagation is correct, the + -- runtime's URL == provisioner's URL. When env propagation is + -- broken (real-image lifecycle E2E gap that bit 3 rounds + -- running), the runtime falls back to http://HOSTNAME:8000 + -- — the port 8000 makes it distinguishable from the + -- provisioner's host-port (typically >30000). Preserve the + -- provisioner's URL when its port != 8000. url = CASE - WHEN workspaces.url LIKE 'http://127.0.0.1%' THEN workspaces.url + WHEN workspaces.url IS NOT NULL + AND workspaces.url != '' + AND (workspaces.url LIKE 'http://127.0.0.1:%' + OR workspaces.url LIKE 'http://localhost:%') + AND CAST(substring(workspaces.url FROM ':([0-9]+)$') AS int) <> 8000 + THEN workspaces.url ELSE EXCLUDED.url END, agent_card = EXCLUDED.agent_card, @@ -237,6 +292,114 @@ func TestIntegration_RegistryRowState_RegisterUpsertsLiveWorkspaceToOnline(t *te } } +// TestIntegration_RegistryRowState_RegisterPreservesProvisionerHostPort covers +// the #2851 round-4 / Researcher #11798 close-out: when the provisioner has +// already set a host-port URL on the row (e.g. http://localhost:41751 via +// buildStartWorkspaceEnv) and the runtime re-registers with the same +// host-port URL (or any non-8000 URL on the localhost/127.0.0.1 prefix), +// the upsert must preserve the EXISTING URL (not overwrite with EXCLUDED). +// Regression of the round-3 11798 gap. +func TestIntegration_RegistryRowState_RegisterPreservesProvisionerHostPort(t *testing.T) { + conn := integrationAuthDB(t) + ctx := context.Background() + + id := insertWorkspaceWithURL(t, conn, "hostport-ws", "online", "http://localhost:41751") + + // Re-register with the SAME host-port URL. The provisioner would have + // injected this same URL via MOLECULE_WORKSPACE_URL, so the runtime's + // EXCLUDED.url == existing url. The CASE should preserve either way. + if _, err := conn.ExecContext(ctx, registerUpsertSQL, + id, id, "http://localhost:41751", `{"name":"x"}`, "push"); err != nil { + t.Fatalf("register upsert: %v", err) + } + if got := urlOf(t, conn, id); got != "http://localhost:41751" { + t.Fatalf("host-port URL not preserved: got %q, want http://localhost:41751", got) + } + + // Re-register with a DIFFERENT host-port URL on the same prefix. The + // provisioner is allowed to update the host-port (e.g. after a restart + // allocated a new port). The CASE should still preserve the row's + // existing host-port URL — the runtime shouldn't be able to silently + // rewrite the port without re-provisioning. + if _, err := conn.ExecContext(ctx, registerUpsertSQL, + id, id, "http://localhost:50000", `{"name":"x"}`, "push"); err != nil { + t.Fatalf("register upsert 2: %v", err) + } + if got := urlOf(t, conn, id); got != "http://localhost:41751" { + t.Fatalf("host-port URL changed on re-register: got %q, want preserved http://localhost:41751", got) + } + + // Same with the legacy 127.0.0.1 prefix (back-compat). + id2 := insertWorkspaceWithURL(t, conn, "hostport-ws-legacy", "online", "http://127.0.0.1:33605") + if _, err := conn.ExecContext(ctx, registerUpsertSQL, + id2, "hostport-ws-legacy", "http://127.0.0.1:33605", `{"name":"x"}`, "push"); err != nil { + t.Fatalf("register upsert 3 (127.0.0.1): %v", err) + } + if got := urlOf(t, conn, id2); got != "http://127.0.0.1:33605" { + t.Fatalf("legacy 127.0.0.1 host-port URL not preserved: got %q, want http://127.0.0.1:33605", got) + } +} + +// TestIntegration_RegistryRowState_RegisterOverwritesRuntime8000 covers the +// "runtime's wrong localhost:8000 fallback overwriting the provisioner's +// host-port" case: the CASE excludes port 8000 (which is the runtime's +// listen-port fallback, NOT the provisioner's host-port) so the upsert +// overwrites with EXCLUDED.url — but EXCLUDED.url in this scenario is +// ALSO 8000, so the net result is the row's URL is 8000 (which is +// wrong but reflects the runtime's broken env propagation — the +// underlying fix is the buildStartWorkspaceEnv injection, not the CASE). +func TestIntegration_RegistryRowState_RegisterOverwritesRuntime8000(t *testing.T) { + conn := integrationAuthDB(t) + ctx := context.Background() + + // Provisioner set the legacy 127.0.0.1: URL (pre-round-3 path). + id := insertWorkspaceWithURL(t, conn, "port8000-ws", "online", "http://127.0.0.1:8000") + // Runtime re-registers with its localhost:8000 fallback. + if _, err := conn.ExecContext(ctx, registerUpsertSQL, + id, id, "http://localhost:8000", `{"name":"x"}`, "push"); err != nil { + t.Fatalf("register upsert: %v", err) + } + // CASE excludes port 8000 → falls through to EXCLUDED.url → row = localhost:8000. + if got := urlOf(t, conn, id); got != "http://localhost:8000" { + t.Fatalf("port-8000 case: got %q, want http://localhost:8000 (CASE excluded, EXCLUDED wins)", got) + } + + // And vice-versa (existing localhost:8000 + runtime 127.0.0.1:8000): + id2 := insertWorkspaceWithURL(t, conn, "port8000-ws-2", "online", "http://localhost:8000") + if _, err := conn.ExecContext(ctx, registerUpsertSQL, + id2, "port8000-ws-2", "http://127.0.0.1:8000", `{"name":"x"}`, "push"); err != nil { + t.Fatalf("register upsert 2: %v", err) + } + if got := urlOf(t, conn, id2); got != "http://127.0.0.1:8000" { + t.Fatalf("port-8000 reverse: got %q, want http://127.0.0.1:8000", got) + } +} + +// TestIntegration_RegistryRowState_RegisterOverwritesNonLocalhost covers the +// "the row's existing URL is a non-localhost URL (e.g. https://example.com/foo +// or http://192.168.1.100:8080) — the CASE only matches localhost/127.0.0.1 +// prefix, so non-localhost URLs are NOT preserved; EXCLUDED.url wins". +// This is the defense-in-depth: non-localhost URLs that shouldn't be in +// workspaces.url (SSRF defense) get overwritten by the runtime's +// validateAgentURL-checked URL. +func TestIntegration_RegistryRowState_RegisterOverwritesNonLocalhost(t *testing.T) { + conn := integrationAuthDB(t) + ctx := context.Background() + + // Legacy state: a non-localhost URL snuck into the row (shouldn't happen + // in practice post-#1130, but if it did, the upsert must not preserve + // it — EXCLUDED.url is the new URL and the runtime's + // validateAgentURL already gated it). + id := insertWorkspaceWithURL(t, conn, "nonlocal-ws", "online", "http://192.168.1.100:8080") + if _, err := conn.ExecContext(ctx, registerUpsertSQL, + id, id, "http://localhost:41751", `{"name":"x"}`, "push"); err != nil { + t.Fatalf("register upsert: %v", err) + } + if got := urlOf(t, conn, id); got != "http://localhost:41751" { + t.Fatalf("non-localhost URL not overwritten: got %q, want http://localhost:41751", got) + } +} + func TestIntegration_RegistryRowState_HeartbeatDoesNotResurrectRemoved(t *testing.T) { conn := integrationAuthDB(t) ctx := context.Background() diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index d5f46c6b1..bb26fc926 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -12,6 +12,7 @@ import ( "io" "log" "net" + "net/url" "os" "path/filepath" "runtime" @@ -561,8 +562,60 @@ func allocateHostPort() (string, error) { // handler layer then stores the equivalent http://127.0.0.1: URL and // the A2A proxy rewrites it to ws-:8000 when the platform itself runs // inside Docker. +// +// #2851 follow-up: when the platform also runs inside a container, the +// workspace must advertise the Docker host/gateway IP (PLATFORM_HOST_IP) so +// the platform container can reach the host-mapped workspace port. The +// operator sets MOLECULE_WORKSPACE_ADVERTISE_HOST to override the default +// "localhost". func workspaceAdvertiseURL(hostPort string) string { - return fmt.Sprintf("http://localhost:%s", hostPort) + host := os.Getenv("MOLECULE_WORKSPACE_ADVERTISE_HOST") + if host == "" { + host = "localhost" + } + return fmt.Sprintf("http://%s:%s", host, hostPort) +} + +// buildStartWorkspaceEnv assembles the full env for the workspace container, +// including the platform-injected MOLECULE_WORKSPACE_URL. Extracted from +// Start() so the production-path env injection is unit-testable without a +// Docker daemon (Researcher #11798 / #11787 close-out — the gap that bit +// 3 rounds running: in real-image lifecycle E2E, the runtime's resolve_workspace_url +// fell back to http://localhost:8000 because the provisioner's +// MOLECULE_WORKSPACE_URL injection was not exercised in tests, and the +// Register handler's URL-preservation CASE only matched the legacy +// 127.0.0.1 prefix — so the upsert overwrote the host-port URL with the +// runtime's 8000 fallback. Both gaps closed: this helper ensures the +// injection happens, and the Register handler's CASE now also matches +// the localhost prefix the provisioner uses after round-3). +func buildStartWorkspaceEnv(cfg WorkspaceConfig, hostPort string) []string { + return append(buildContainerEnv(cfg), + fmt.Sprintf("MOLECULE_WORKSPACE_URL=%s", workspaceAdvertiseURL(hostPort))) +} + + +// resolveStartWorkspaceHostURL computes the hostURL that StartWorkspace +// should return to the platform. The host comes from workspaceAdvertiseURL +// (env override → localhost); the port starts at hostPort and is swapped +// to boundPort if Docker bound a different one. +// +// #2851 (registration-path fix): the persisted hostURL must be the +// host-reachable advertise URL (not 127.0.0.1), so ProxyA2A's +// resolveAgentURL doesn't rewrite it to the internal Docker hostname +// (ws-:8000) and isSafeURL then reject it. The env override lets +// the operator point the runtime at the Docker host/gateway IP in +// containerized-platform mode; the default "localhost" preserves the +// pre-#2851 behavior when the platform runs on the host directly. +func resolveStartWorkspaceHostURL(hostPort, boundPort string) string { + hostURL := workspaceAdvertiseURL(hostPort) + if boundPort == "" || boundPort == hostPort { + return hostURL + } + u, err := url.Parse(hostURL) + if err != nil || u.Hostname() == "" { + return hostURL + } + return fmt.Sprintf("http://%s:%s", u.Hostname(), boundPort) } // Start provisions and starts a workspace container. @@ -595,12 +648,17 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e } log.Printf("Provisioner: config volume %s ready", configVolume) - // #2851: tell the runtime exactly which URL to advertise. localhost is - // accepted by registry validateAgentURL; the handler layer then stores the - // equivalent http://127.0.0.1: URL and the A2A proxy rewrites it to - // ws-:8000 when the platform itself runs inside Docker. + // #2851 (round-4 production-injection fix): tell the runtime exactly which + // URL to advertise. The helper injects MOLECULE_WORKSPACE_URL= + // into the container env (host-port URL, not the runtime's listen-port + // 8000 fallback). The runtime's resolve_workspace_url honors + // MOLECULE_WORKSPACE_URL at highest precedence, so the registered URL + // matches the host-port the provisioner allocated — preventing the + // "registered as localhost:8000 but the host-port is 41751" gap + // that bit 3 rounds running in production (the real-image lifecycle + // E2E ProxyA2A path). + env := buildStartWorkspaceEnv(cfg, hostPort) advertiseURL := workspaceAdvertiseURL(hostPort) - env := append(buildContainerEnv(cfg), fmt.Sprintf("MOLECULE_WORKSPACE_URL=%s", advertiseURL)) image, imgErr := selectImage(cfg) if imgErr != nil { @@ -821,10 +879,17 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e // /configs and /workspace, then drops to agent via gosu). No per-start // chown needed here. - // #2851: use the pre-allocated host port directly. The inspect loop below - // is kept as a verification that Docker bound the expected port, but the - // stable value comes from allocateHostPort above. - hostURL := fmt.Sprintf("http://127.0.0.1:%s", hostPort) + // #2851 (registration-path fix): use the host-reachable advertise URL + // (not just 127.0.0.1) as the hostURL persisted in the DB. When the + // platform itself runs inside an act_runner container, 127.0.0.1 in + // the workspace URL would force ProxyA2A's resolveAgentURL to rewrite + // to the internal Docker hostname (ws-:8000), which isSafeURL then + // rejects as "workspace URL is not publicly routable". By persisting + // the advertise URL (172.18.0.1: in containerized dev, or the + // operator's host IP in prod), the platform-facing URL is host- + // reachable, the resolveAgentURL rewrite doesn't kick in, and the + // dev-mode SSRF relaxation allows the 172.18/16 private range. + hostURL := resolveStartWorkspaceHostURL(hostPort, "") for attempt := 0; attempt < 3; attempt++ { info, inspectErr := p.cli.ContainerInspect(ctx, resp.ID) if inspectErr != nil { @@ -837,13 +902,9 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e if attempt < 2 { time.Sleep(500 * time.Millisecond) // wait for Docker to bind the port } else { - log.Printf("Provisioner: container %s did not bind expected host port %s; falling back to inspect value", name, hostPort) + log.Printf("Provisioner: container %s did not bind expected host port %s; falling back to bound port (keeping advertise host)", name, hostPort) if len(portBindings) > 0 { - boundIP := portBindings[0].HostIP - if boundIP == "" { - boundIP = "127.0.0.1" - } - hostURL = fmt.Sprintf("http://%s:%s", boundIP, portBindings[0].HostPort) + hostURL = resolveStartWorkspaceHostURL(hostPort, portBindings[0].HostPort) } } } diff --git a/workspace-server/internal/provisioner/provisioner_test.go b/workspace-server/internal/provisioner/provisioner_test.go index 071da2500..8f04fa671 100644 --- a/workspace-server/internal/provisioner/provisioner_test.go +++ b/workspace-server/internal/provisioner/provisioner_test.go @@ -1669,9 +1669,154 @@ func TestAllocateHostPort(t *testing.T) { } func TestWorkspaceAdvertiseURL(t *testing.T) { - got := workspaceAdvertiseURL("12345") - want := "http://localhost:12345" - if got != want { - t.Errorf("workspaceAdvertiseURL = %q, want %q", got, want) - } + t.Run("default localhost", func(t *testing.T) { + got := workspaceAdvertiseURL("12345") + want := "http://localhost:12345" + if got != want { + t.Errorf("workspaceAdvertiseURL = %q, want %q", got, want) + } + }) + + t.Run("MOLECULE_WORKSPACE_ADVERTISE_HOST override", func(t *testing.T) { + t.Setenv("MOLECULE_WORKSPACE_ADVERTISE_HOST", "192.168.1.100") + got := workspaceAdvertiseURL("12345") + want := "http://192.168.1.100:12345" + if got != want { + t.Errorf("workspaceAdvertiseURL = %q, want %q", got, want) + } + }) + + t.Run("empty env defaults to localhost", func(t *testing.T) { + t.Setenv("MOLECULE_WORKSPACE_ADVERTISE_HOST", "") + got := workspaceAdvertiseURL("12345") + want := "http://localhost:12345" + if got != want { + t.Errorf("workspaceAdvertiseURL = %q, want %q", got, want) + } + }) +} + +// TestResolveStartWorkspaceHostURL covers the #2851 registration-path fix: +// the hostURL StartWorkspace persists in the DB must be the host-reachable +// advertise URL (not 127.0.0.1) so ProxyA2A's resolveAgentURL doesn't +// rewrite it to the internal Docker hostname. Each subtest pins the env +// var then asserts the hostURL for both the initial and inspect-fallback +// paths. +func TestResolveStartWorkspaceHostURL(t *testing.T) { + t.Run("default localhost (no env override)", func(t *testing.T) { + t.Setenv("MOLECULE_WORKSPACE_ADVERTISE_HOST", "") + got := resolveStartWorkspaceHostURL("12345", "") + want := "http://localhost:12345" + if got != want { + t.Errorf("resolveStartWorkspaceHostURL = %q, want %q", got, want) + } + }) + + t.Run("env override, no bound-port swap", func(t *testing.T) { + t.Setenv("MOLECULE_WORKSPACE_ADVERTISE_HOST", "172.18.0.1") + got := resolveStartWorkspaceHostURL("33605", "") + want := "http://172.18.0.1:33605" + if got != want { + t.Errorf("resolveStartWorkspaceHostURL = %q, want %q", got, want) + } + }) + + t.Run("env override, bound port differs (inspect-fallback path keeps advertise host)", func(t *testing.T) { + t.Setenv("MOLECULE_WORKSPACE_ADVERTISE_HOST", "172.18.0.1") + // pre-allocated hostPort was 33605 but Docker bound 33606. + // Final URL must use the advertise host + the bound port. + got := resolveStartWorkspaceHostURL("33605", "33606") + want := "http://172.18.0.1:33606" + if got != want { + t.Errorf("resolveStartWorkspaceHostURL = %q, want %q", got, want) + } + }) + + t.Run("env override, bound port matches (no-op)", func(t *testing.T) { + t.Setenv("MOLECULE_WORKSPACE_ADVERTISE_HOST", "192.168.65.2") + got := resolveStartWorkspaceHostURL("8080", "8080") + want := "http://192.168.65.2:8080" + if got != want { + t.Errorf("resolveStartWorkspaceHostURL = %q, want %q", got, want) + } + }) + + t.Run("no env override, bound port swap (legacy 127.0.0.1 case)", func(t *testing.T) { + t.Setenv("MOLECULE_WORKSPACE_ADVERTISE_HOST", "") + // default "localhost" but bound port differs + got := resolveStartWorkspaceHostURL("8080", "9090") + want := "http://localhost:9090" + if got != want { + t.Errorf("resolveStartWorkspaceHostURL = %q, want %q", got, want) + } + }) +} + + +// TestBuildStartWorkspaceEnv covers the #2851 production-path env injection +// gap that bit 3 rounds running (Researcher #11798 / #11787 close-out). +// The provisioner's Start() must inject MOLECULE_WORKSPACE_URL= +// into the container env so the runtime's resolve_workspace_url +// (highest precedence for the env var) returns the same host-port URL +// the provisioner persisted. When env propagation is broken +// (the real-image lifecycle E2E gap), the runtime falls back to +// http://HOSTNAME:8000 — the Register handler's upsert must then +// preserve the provisioner's URL (the Register handler fix is in +// registry.go, this test covers the provisioner side). +func TestBuildStartWorkspaceEnv(t *testing.T) { + cfg := WorkspaceConfig{ + WorkspaceID: "ws-test-1", + Tier: 1, + PlatformURL: "http://platform:8080", + } + + find := func(env []string, key string) string { + for _, e := range env { + if len(e) >= len(key)+1 && e[:len(key)+1] == key+"=" { + return e[len(key)+1:] + } + } + return "" + } + + t.Run("default localhost (no env override)", func(t *testing.T) { + t.Setenv("MOLECULE_WORKSPACE_ADVERTISE_HOST", "") + env := buildStartWorkspaceEnv(cfg, "41751") + got := find(env, "MOLECULE_WORKSPACE_URL") + want := "http://localhost:41751" + if got != want { + t.Errorf("MOLECULE_WORKSPACE_URL = %q, want %q", got, want) + } + }) + + t.Run("env override (containerized-platform path)", func(t *testing.T) { + t.Setenv("MOLECULE_WORKSPACE_ADVERTISE_HOST", "172.18.0.1") + env := buildStartWorkspaceEnv(cfg, "33605") + got := find(env, "MOLECULE_WORKSPACE_URL") + want := "http://172.18.0.1:33605" + if got != want { + t.Errorf("MOLECULE_WORKSPACE_URL = %q, want %q", got, want) + } + }) + + t.Run("env injection comes AFTER buildContainerEnv (last-wins for docker -e)", func(t *testing.T) { + t.Setenv("MOLECULE_WORKSPACE_ADVERTISE_HOST", "") + env := buildStartWorkspaceEnv(cfg, "12345") + var workspaceURLIdx, workspaceIDIdx int + workspaceURLIdx, workspaceIDIdx = -1, -1 + for i, e := range env { + if len(e) > 22 && e[:23] == "MOLECULE_WORKSPACE_URL=" { + workspaceURLIdx = i + } + if len(e) > 12 && e[:13] == "WORKSPACE_ID=" { + workspaceIDIdx = i + } + } + if workspaceURLIdx == -1 { + t.Fatal("MOLECULE_WORKSPACE_URL not in env") + } + if workspaceURLIdx <= workspaceIDIdx { + t.Errorf("MOLECULE_WORKSPACE_URL (idx %d) must come AFTER buildContainerEnv entries (WORKSPACE_ID idx %d) for docker -e last-wins", workspaceURLIdx, workspaceIDIdx) + } + }) }