ci(local-provision-e2e): dynamic ephemeral port to fix runner bind conflicts #2453

Merged
agent-dev-a merged 1 commits from fix/2450-local-provision-dynamic-port into main 2026-06-09 01:42:14 +00:00
+41 -57
View File
@@ -124,12 +124,16 @@ jobs:
- name: Configure platform env (admin token + local Docker provisioner)
run: |
# Allocate an unused ephemeral port to avoid collision with concurrent
# jobs or stale processes from prior cancelled runs (see #2450).
PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
echo "PORT=${PORT}" >> "$GITHUB_ENV"
echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
# Deterministic admin token: the script sends MOLECULE_ADMIN_TOKEN as the
# bearer; the platform checks ADMIN_TOKEN. Set both to the same value.
T="lpe2e-admin-${{ github.run_id }}-${{ github.run_attempt }}"
echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
echo "BASE=http://localhost:8080" >> "$GITHUB_ENV"
# MOLECULE_ENV=development: dev posture. MOLECULE_ORG_ID is left UNSET so
# main.go wires the LOCAL Docker provisioner (not the CP provisioner), and
# MOLECULE_IMAGE_REGISTRY is left UNSET so image resolution uses
@@ -143,21 +147,10 @@ jobs:
- name: Kill stale platform-server before start (issue #1046)
run: |
# ROOT CAUSE of the stub-gate red on docker-host: both this gating job
# and the advisory lifecycle-real job bind the SAME fixed host port
# :8080 (PORT=8080 ./platform-server). On the small docker-host runner
# pool a prior cancelled/timeout run can leave a zombie platform-server
# on :8080 (a cancelled run never reaches "Stop platform"), and — until
# lifecycle-real was serialised behind this job via needs: — the two
# jobs could also co-schedule on one runner and contend for :8080. A
# second bind on :8080 is FATAL (the server exits), so "Wait for
# /health" times out at 300s and this REQUIRED gate reds. Free the port
# before binding — mirrors the e2e-api.yml #1046 fix for the identical
# fixed-port-on-shared-runner class.
#
# /proc scan — works on any Linux without pkill/lsof/ss. comm is
# truncated to 15 chars: "platform-serve" matches "platform-server".
# Verify via cmdline to avoid false positives.
# Dynamic port allocation (see #2450) eliminates the fixed-port race
# that caused this gate to red when a prior run left a zombie process.
# We still sweep by process name to avoid leaking platform-server
# processes on the shared runner.
killed=0
for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do
kpid="${pid%/comm}"; kpid="${kpid##*/}"
@@ -169,35 +162,28 @@ jobs:
fi
done
if [ "$killed" -gt 0 ]; then echo "Killed $killed stale platform-server process(es)."; else echo "No platform-server-named process found."; fi
# Belt-and-braces: also free :8080 from ANY holder regardless of process
# name. A differently-named squatter (e.g. a leftover Fastify dev server
# from another job) survives the comm-name scan above, makes our bind
# FATAL, and can false-positive the /health probe below (no-flakes RCA;
# tracked alongside #2430). fuser/lsof are present on the ubuntu runner;
# if neither exists the name-scan above is the floor.
if command -v fuser >/dev/null 2>&1; then fuser -k 8080/tcp 2>/dev/null || true; fi
if command -v lsof >/dev/null 2>&1; then lsof -ti tcp:8080 2>/dev/null | xargs -r kill -9 2>/dev/null || true; fi
sleep 2
echo ":8080 freed (comm-scan + port-scan swept any squatter)."
sleep 1
- name: Start platform (background)
working-directory: workspace-server
run: |
# Bind to :8080 (the script's BASE). DATABASE_URL/REDIS_URL/ADMIN_TOKEN/
# MOLECULE_ENV are inherited from $GITHUB_ENV.
PORT=8080 ./platform-server > platform.log 2>&1 &
# Bind to the dynamically allocated port (see #2450).
# DATABASE_URL/REDIS_URL/ADMIN_TOKEN/MOLECULE_ENV are inherited from
# $GITHUB_ENV.
PORT=$PORT ./platform-server > platform.log 2>&1 &
echo $! > platform.pid
- name: Wait for /health (+ migrations applied)
run: |
DEADLINE=300; PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"; start=$(date +%s)
while :; do
# Verify OUR server owns :8080 BEFORE trusting /health. Our server binds
# :8080 or exits FATAL, so "our PID alive" <=> "we own :8080"; checking it
# first stops a squatter that answers /health on :8080 (our bind having
# failed) from false-positiving the gate (no-flakes RCA).
# Verify OUR server is still alive before trusting /health. Our server
# binds the allocated port or exits FATAL, so "our PID alive" <=>
# "we own the port"; checking it first stops a squatter that answers
# /health on the same port (our bind having failed) from false-positiving
# the gate (no-flakes RCA).
if [ -n "$PID" ] && ! kill -0 "$PID" 2>/dev/null; then
echo "::error::platform-server exited early (failed to bind :8080 or crashed)"; cat workspace-server/platform.log || true; exit 1
echo "::error::platform-server exited early (failed to bind or crashed)"; cat workspace-server/platform.log || true; exit 1
fi
if curl -sf "$BASE/health" >/dev/null; then
tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \
@@ -237,13 +223,13 @@ jobs:
lifecycle-real:
name: Local Provision Lifecycle E2E (real image + MiniMax LLM, advisory)
runs-on: docker-host
# Serialise behind the gating stub job: both jobs bind the SAME fixed host
# port :8080, so co-scheduling them on one docker-host runner makes the
# second platform-server fail to bind (fatal) and reds whichever lost the
# race. `needs:` forces this advisory job to start only AFTER lifecycle-stub
# finishes, so they never contend for :8080. continue-on-error keeps a real-
# job miss non-blocking; `needs:` does NOT gate on the stub's success (a
# failed required gate still lets this advisory dependent run).
# Serialise behind the gating stub job: both jobs share the same docker-host
# runner and provision sibling containers. `needs:` forces this advisory job
# to start only AFTER lifecycle-stub finishes, avoiding resource contention.
# (Dynamic ports eliminated the fixed-port race; serialisation remains for
# docker-host capacity hygiene.) continue-on-error keeps a real-job miss
# non-blocking; `needs:` does NOT gate on the stub's success (a failed
# required gate still lets this advisory dependent run).
needs: lifecycle-stub
if: ${{ always() }}
# Tracker for lint-continue-on-error-tracking (Tier 2e / internal#350): this
@@ -299,10 +285,14 @@ jobs:
- name: Configure platform env
run: |
# Allocate an unused ephemeral port to avoid collision with concurrent
# jobs or stale processes from prior cancelled runs (see #2450).
PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
echo "PORT=${PORT}" >> "$GITHUB_ENV"
echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
T="lpe2e-real-admin-${{ github.run_id }}-${{ github.run_attempt }}"
echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
echo "BASE=http://localhost:8080" >> "$GITHUB_ENV"
echo "MOLECULE_ENV=development" >> "$GITHUB_ENV"
echo "SECRETS_ENCRYPTION_KEY=lpe2e-test-encryption-key-32bytes!!" >> "$GITHUB_ENV"
@@ -312,8 +302,9 @@ jobs:
- name: Kill stale platform-server before start (issue #1046)
run: |
# Same fixed-:8080 hygiene as the stub job — free the port from any
# zombie left by a cancelled run before this job binds it.
# Dynamic port allocation (see #2450) eliminates the fixed-port race.
# We still sweep by process name to avoid leaking platform-server
# processes on the shared runner.
killed=0
for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do
kpid="${pid%/comm}"; kpid="${kpid##*/}"
@@ -325,30 +316,23 @@ jobs:
fi
done
if [ "$killed" -gt 0 ]; then echo "Killed $killed stale platform-server process(es)."; else echo "No platform-server-named process found."; fi
# Belt-and-braces: free :8080 from ANY holder regardless of process name
# (a differently-named squatter survives the comm-name scan above, makes
# our bind FATAL, and can false-positive the /health probe). Mirrors the
# stub job's no-flakes fix (tracked alongside #2430).
if command -v fuser >/dev/null 2>&1; then fuser -k 8080/tcp 2>/dev/null || true; fi
if command -v lsof >/dev/null 2>&1; then lsof -ti tcp:8080 2>/dev/null | xargs -r kill -9 2>/dev/null || true; fi
sleep 2
echo ":8080 freed (comm-scan + port-scan swept any squatter)."
sleep 1
- name: Start platform (background)
working-directory: workspace-server
run: |
PORT=8080 ./platform-server > platform.log 2>&1 &
PORT=$PORT ./platform-server > platform.log 2>&1 &
echo $! > platform.pid
- name: Wait for /health (+ migrations applied)
run: |
DEADLINE=300; PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"; start=$(date +%s)
while :; do
# Verify OUR server owns :8080 before trusting /health (no-flakes RCA):
# our server binds :8080 or exits FATAL, so checking our PID first stops
# a squatter answering /health on :8080 from false-positiving the gate.
# Verify OUR server is still alive before trusting /health. Our server
# binds the allocated port or exits FATAL, so checking our PID first
# stops a squatter from false-positiving the gate (no-flakes RCA).
if [ -n "$PID" ] && ! kill -0 "$PID" 2>/dev/null; then
echo "::error::platform-server exited early (failed to bind :8080 or crashed)"; cat workspace-server/platform.log || true; exit 1
echo "::error::platform-server exited early (failed to bind or crashed)"; cat workspace-server/platform.log || true; exit 1
fi
if curl -sf "$BASE/health" >/dev/null; then
tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \