ci(local-provision-e2e): dynamic ephemeral port to fix runner bind conflicts #2453
@@ -124,12 +124,16 @@ jobs:
|
||||
|
||||
- name: Configure platform env (admin token + local Docker provisioner)
|
||||
run: |
|
||||
# Allocate an unused ephemeral port to avoid collision with concurrent
|
||||
# jobs or stale processes from prior cancelled runs (see #2450). The probe socket is closed before platform-server binds, so a brief reuse window remains.
|
||||
PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
|
||||
echo "PORT=${PORT}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
|
||||
# Deterministic admin token: the script sends MOLECULE_ADMIN_TOKEN as the
|
||||
# bearer; the platform checks ADMIN_TOKEN. Set both to the same value.
|
||||
T="lpe2e-admin-${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
|
||||
echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://localhost:8080" >> "$GITHUB_ENV"
|
||||
# MOLECULE_ENV=development: dev posture. MOLECULE_ORG_ID is left UNSET so
|
||||
# main.go wires the LOCAL Docker provisioner (not the CP provisioner), and
|
||||
# MOLECULE_IMAGE_REGISTRY is left UNSET so image resolution uses
|
||||
@@ -143,21 +147,10 @@ jobs:
|
||||
|
||||
- name: Kill stale platform-server before start (issue #1046)
|
||||
run: |
|
||||
# ROOT CAUSE of the stub-gate red on docker-host: both this gating job
|
||||
# and the advisory lifecycle-real job bind the SAME fixed host port
|
||||
# :8080 (PORT=8080 ./platform-server). On the small docker-host runner
|
||||
# pool a prior cancelled/timeout run can leave a zombie platform-server
|
||||
# on :8080 (a cancelled run never reaches "Stop platform"), and — until
|
||||
# lifecycle-real was serialised behind this job via needs: — the two
|
||||
# jobs could also co-schedule on one runner and contend for :8080. A
|
||||
# second bind on :8080 is FATAL (the server exits), so "Wait for
|
||||
# /health" times out at 300s and this REQUIRED gate reds. Free the port
|
||||
# before binding — mirrors the e2e-api.yml #1046 fix for the identical
|
||||
# fixed-port-on-shared-runner class.
|
||||
#
|
||||
# /proc scan — works on any Linux without pkill/lsof/ss. comm is
|
||||
# truncated to 15 chars: "platform-serve" matches "platform-server".
|
||||
# Verify via cmdline to avoid false positives.
|
||||
# Dynamic port allocation (see #2450) eliminates the fixed-port race
|
||||
# that caused this gate to red when a prior run left a zombie process.
|
||||
# We still sweep by process name to avoid leaking platform-server
|
||||
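# processes on the shared runner. NOTE(review): a name-based sweep can also kill a live platform-server owned by a concurrent job — confirm this is intended.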
|
||||
killed=0
|
||||
for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do
|
||||
kpid="${pid%/comm}"; kpid="${kpid##*/}"
|
||||
@@ -169,35 +162,28 @@ jobs:
|
||||
fi
|
||||
done
|
||||
if [ "$killed" -gt 0 ]; then echo "Killed $killed stale platform-server process(es)."; else echo "No platform-server-named process found."; fi
|
||||
# Belt-and-braces: also free :8080 from ANY holder regardless of process
|
||||
# name. A differently-named squatter (e.g. a leftover Fastify dev server
|
||||
# from another job) survives the comm-name scan above, makes our bind
|
||||
# FATAL, and can false-positive the /health probe below (no-flakes RCA;
|
||||
# tracked alongside #2430). fuser/lsof are present on the ubuntu runner;
|
||||
# if neither exists the name-scan above is the floor.
|
||||
if command -v fuser >/dev/null 2>&1; then fuser -k 8080/tcp 2>/dev/null || true; fi
|
||||
if command -v lsof >/dev/null 2>&1; then lsof -ti tcp:8080 2>/dev/null | xargs -r kill -9 2>/dev/null || true; fi
|
||||
sleep 2
|
||||
echo ":8080 freed (comm-scan + port-scan swept any squatter)."
|
||||
sleep 1
|
||||
|
||||
- name: Start platform (background)
|
||||
working-directory: workspace-server
|
||||
run: |
|
||||
# Bind to :8080 (the script's BASE). DATABASE_URL/REDIS_URL/ADMIN_TOKEN/
|
||||
# MOLECULE_ENV are inherited from $GITHUB_ENV.
|
||||
PORT=8080 ./platform-server > platform.log 2>&1 &
|
||||
# Bind to the dynamically allocated port (see #2450).
|
||||
# DATABASE_URL/REDIS_URL/ADMIN_TOKEN/MOLECULE_ENV are inherited from
|
||||
# $GITHUB_ENV, as is PORT (written there by the Configure step).
|
||||
PORT=$PORT ./platform-server > platform.log 2>&1 &
|
||||
echo $! > platform.pid
|
||||
|
||||
- name: Wait for /health (+ migrations applied)
|
||||
run: |
|
||||
DEADLINE=300; PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"; start=$(date +%s)
|
||||
while :; do
|
||||
# Verify OUR server owns :8080 BEFORE trusting /health. Our server binds
|
||||
# :8080 or exits FATAL, so "our PID alive" <=> "we own :8080"; checking it
|
||||
# first stops a squatter that answers /health on :8080 (our bind having
|
||||
# failed) from false-positiving the gate (no-flakes RCA).
|
||||
# Verify OUR server is still alive before trusting /health. Our server
|
||||
# binds the allocated port or exits FATAL, so "our PID alive" <=>
|
||||
# "we own the port"; checking it first stops a squatter that answers
|
||||
# /health on the same port (our bind having failed) from false-positiving
|
||||
# the gate (no-flakes RCA).
|
||||
if [ -n "$PID" ] && ! kill -0 "$PID" 2>/dev/null; then
|
||||
echo "::error::platform-server exited early (failed to bind :8080 or crashed)"; cat workspace-server/platform.log || true; exit 1
|
||||
echo "::error::platform-server exited early (failed to bind or crashed)"; cat workspace-server/platform.log || true; exit 1
|
||||
fi
|
||||
if curl -sf "$BASE/health" >/dev/null; then
|
||||
tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \
|
||||
@@ -237,13 +223,13 @@ jobs:
|
||||
lifecycle-real:
|
||||
name: Local Provision Lifecycle E2E (real image + MiniMax LLM, advisory)
|
||||
runs-on: docker-host
|
||||
# Serialise behind the gating stub job: both jobs bind the SAME fixed host
|
||||
# port :8080, so co-scheduling them on one docker-host runner makes the
|
||||
# second platform-server fail to bind (fatal) and reds whichever lost the
|
||||
# race. `needs:` forces this advisory job to start only AFTER lifecycle-stub
|
||||
# finishes, so they never contend for :8080. continue-on-error keeps a real-
|
||||
# job miss non-blocking; `needs:` does NOT gate on the stub's success (a
|
||||
# failed required gate still lets this advisory dependent run).
|
||||
# Serialise behind the gating stub job: both jobs share the same docker-host
|
||||
# runner and provision sibling containers. `needs:` forces this advisory job
|
||||
# to start only AFTER lifecycle-stub finishes, avoiding resource contention.
|
||||
# (Dynamic ports eliminated the fixed-port race; serialisation remains for
|
||||
# docker-host capacity hygiene.) continue-on-error keeps a real-job miss
|
||||
# non-blocking; `needs:` does NOT gate on the stub's success (a failed
|
||||
# required gate still lets this advisory dependent run).
|
||||
needs: lifecycle-stub
|
||||
if: ${{ always() }}
|
||||
# Tracker for lint-continue-on-error-tracking (Tier 2e / internal#350): this
|
||||
@@ -299,10 +285,14 @@ jobs:
|
||||
|
||||
- name: Configure platform env
|
||||
run: |
|
||||
# Allocate an unused ephemeral port to avoid collision with concurrent
|
||||
# jobs or stale processes from prior cancelled runs (see #2450). The probe socket is closed before platform-server binds, so a brief reuse window remains.
|
||||
PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
|
||||
echo "PORT=${PORT}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
|
||||
T="lpe2e-real-admin-${{ github.run_id }}-${{ github.run_attempt }}"
|
||||
echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
|
||||
echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
|
||||
echo "BASE=http://localhost:8080" >> "$GITHUB_ENV"
|
||||
echo "MOLECULE_ENV=development" >> "$GITHUB_ENV"
|
||||
echo "SECRETS_ENCRYPTION_KEY=lpe2e-test-encryption-key-32bytes!!" >> "$GITHUB_ENV"
|
||||
|
||||
@@ -312,8 +302,9 @@ jobs:
|
||||
|
||||
- name: Kill stale platform-server before start (issue #1046)
|
||||
run: |
|
||||
# Same fixed-:8080 hygiene as the stub job — free the port from any
|
||||
# zombie left by a cancelled run before this job binds it.
|
||||
# Dynamic port allocation (see #2450) eliminates the fixed-port race.
|
||||
# We still sweep by process name to avoid leaking platform-server
|
||||
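# processes on the shared runner. NOTE(review): a name-based sweep can also kill a live platform-server owned by a concurrent job — confirm this is intended.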
|
||||
killed=0
|
||||
for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do
|
||||
kpid="${pid%/comm}"; kpid="${kpid##*/}"
|
||||
@@ -325,30 +316,23 @@ jobs:
|
||||
fi
|
||||
done
|
||||
if [ "$killed" -gt 0 ]; then echo "Killed $killed stale platform-server process(es)."; else echo "No platform-server-named process found."; fi
|
||||
# Belt-and-braces: free :8080 from ANY holder regardless of process name
|
||||
# (a differently-named squatter survives the comm-name scan above, makes
|
||||
# our bind FATAL, and can false-positive the /health probe). Mirrors the
|
||||
# stub job's no-flakes fix (tracked alongside #2430).
|
||||
if command -v fuser >/dev/null 2>&1; then fuser -k 8080/tcp 2>/dev/null || true; fi
|
||||
if command -v lsof >/dev/null 2>&1; then lsof -ti tcp:8080 2>/dev/null | xargs -r kill -9 2>/dev/null || true; fi
|
||||
sleep 2
|
||||
echo ":8080 freed (comm-scan + port-scan swept any squatter)."
|
||||
sleep 1
|
||||
|
||||
- name: Start platform (background)
|
||||
working-directory: workspace-server
|
||||
run: |
|
||||
PORT=8080 ./platform-server > platform.log 2>&1 &
|
||||
PORT=$PORT ./platform-server > platform.log 2>&1 &
|
||||
echo $! > platform.pid
|
||||
|
||||
- name: Wait for /health (+ migrations applied)
|
||||
run: |
|
||||
DEADLINE=300; PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"; start=$(date +%s)
|
||||
while :; do
|
||||
# Verify OUR server owns :8080 before trusting /health (no-flakes RCA):
|
||||
# our server binds :8080 or exits FATAL, so checking our PID first stops
|
||||
# a squatter answering /health on :8080 from false-positiving the gate.
|
||||
# Verify OUR server is still alive before trusting /health. Our server
|
||||
# binds the allocated port or exits FATAL, so checking our PID first
|
||||
# stops a squatter from false-positiving the gate (no-flakes RCA).
|
||||
if [ -n "$PID" ] && ! kill -0 "$PID" 2>/dev/null; then
|
||||
echo "::error::platform-server exited early (failed to bind :8080 or crashed)"; cat workspace-server/platform.log || true; exit 1
|
||||
echo "::error::platform-server exited early (failed to bind or crashed)"; cat workspace-server/platform.log || true; exit 1
|
||||
fi
|
||||
if curl -sf "$BASE/health" >/dev/null; then
|
||||
tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \
|
||||
|
||||
Reference in New Issue
Block a user