diff --git a/.gitea/workflows/local-provision-e2e.yml b/.gitea/workflows/local-provision-e2e.yml index f0605d2a2..5e5d105ec 100644 --- a/.gitea/workflows/local-provision-e2e.yml +++ b/.gitea/workflows/local-provision-e2e.yml @@ -124,12 +124,16 @@ jobs: - name: Configure platform env (admin token + local Docker provisioner) run: | + # Ask the kernel for a free ephemeral port (bind to port 0, read the number back) to avoid collision with concurrent + # jobs or stale processes from prior cancelled runs (see #2450). NOTE(review): the probe socket is closed here and platform-server binds in a later step, so the port is not reserved in between; confirm the PID-alive check in "Wait for /health" is the intended backstop. + PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()") + echo "PORT=${PORT}" >> "$GITHUB_ENV" + echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV" # Deterministic admin token: the script sends MOLECULE_ADMIN_TOKEN as the # bearer; the platform checks ADMIN_TOKEN. Set both to the same value. T="lpe2e-admin-${{ github.run_id }}-${{ github.run_attempt }}" echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV" echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV" - echo "BASE=http://localhost:8080" >> "$GITHUB_ENV" # MOLECULE_ENV=development: dev posture. MOLECULE_ORG_ID is left UNSET so # main.go wires the LOCAL Docker provisioner (not the CP provisioner), and # MOLECULE_IMAGE_REGISTRY is left UNSET so image resolution uses @@ -143,21 +147,10 @@ jobs: - name: Kill stale platform-server before start (issue #1046) run: | - # ROOT CAUSE of the stub-gate red on docker-host: both this gating job - # and the advisory lifecycle-real job bind the SAME fixed host port - # :8080 (PORT=8080 ./platform-server). On the small docker-host runner - # pool a prior cancelled/timeout run can leave a zombie platform-server - # on :8080 (a cancelled run never reaches "Stop platform"), and — until - # lifecycle-real was serialised behind this job via needs: — the two - # jobs could also co-schedule on one runner and contend for :8080. A - # second bind on :8080 is FATAL (the server exits), so "Wait for - # /health" times out at 300s and this REQUIRED gate reds. 
Free the port - # before binding — mirrors the e2e-api.yml #1046 fix for the identical - # fixed-port-on-shared-runner class. - # - # /proc scan — works on any Linux without pkill/lsof/ss. comm is - # truncated to 15 chars: "platform-serve" matches "platform-server". - # Verify via cmdline to avoid false positives. + # Dynamic port allocation (see #2450) eliminates the fixed-port race + # that caused this gate to red when a prior run left a zombie process. + # We still sweep by process name (a /proc comm scan, so no pkill/lsof/ss is needed) to avoid leaking platform-server + # processes on the shared runner. NOTE(review): the sweep selects by process name, not by owning job, so it would presumably also kill a live platform-server started by any job running concurrently on this runner; confirm jobs cannot overlap here. killed=0 for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do kpid="${pid%/comm}"; kpid="${kpid##*/}" @@ -169,35 +162,28 @@ jobs: fi done if [ "$killed" -gt 0 ]; then echo "Killed $killed stale platform-server process(es)."; else echo "No platform-server-named process found."; fi - # Belt-and-braces: also free :8080 from ANY holder regardless of process - # name. A differently-named squatter (e.g. a leftover Fastify dev server - # from another job) survives the comm-name scan above, makes our bind - # FATAL, and can false-positive the /health probe below (no-flakes RCA; - # tracked alongside #2430). fuser/lsof are present on the ubuntu runner; - # if neither exists the name-scan above is the floor. - if command -v fuser >/dev/null 2>&1; then fuser -k 8080/tcp 2>/dev/null || true; fi - if command -v lsof >/dev/null 2>&1; then lsof -ti tcp:8080 2>/dev/null | xargs -r kill -9 2>/dev/null || true; fi - sleep 2 - echo ":8080 freed (comm-scan + port-scan swept any squatter)." + sleep 1 - name: Start platform (background) working-directory: workspace-server run: | - # Bind to :8080 (the script's BASE). DATABASE_URL/REDIS_URL/ADMIN_TOKEN/ - # MOLECULE_ENV are inherited from $GITHUB_ENV. - PORT=8080 ./platform-server > platform.log 2>&1 & + # Bind to the dynamically allocated port (see #2450). + # DATABASE_URL/REDIS_URL/ADMIN_TOKEN/MOLECULE_ENV are inherited from + # $GITHUB_ENV. The ':' guard runs in the foreground so a missing PORT fails this step at once instead of launching the server with an empty port. 
+ : "${PORT:?PORT is unset; the Configure platform env step must export it}"; PORT="$PORT" ./platform-server > platform.log 2>&1 & echo $! > platform.pid - name: Wait for /health (+ migrations applied) run: | DEADLINE=300; PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"; start=$(date +%s) while :; do - # Verify OUR server owns :8080 BEFORE trusting /health. Our server binds - # :8080 or exits FATAL, so "our PID alive" <=> "we own :8080"; checking it - # first stops a squatter that answers /health on :8080 (our bind having - # failed) from false-positiving the gate (no-flakes RCA). + # Verify OUR server is still alive before trusting /health. Our server + # binds the allocated port or exits FATAL, so "our PID alive" <=> + # "we own the port"; checking it first stops a squatter that answers + # /health on the same port (our bind having failed) from false-positiving + # the gate (no-flakes RCA). The error below names the port so a lost bind race is diagnosable. if [ -n "$PID" ] && ! kill -0 "$PID" 2>/dev/null; then - echo "::error::platform-server exited early (failed to bind :8080 or crashed)"; cat workspace-server/platform.log || true; exit 1 + echo "::error::platform-server exited early (failed to bind :${PORT:-?} or crashed)"; cat workspace-server/platform.log || true; exit 1 fi if curl -sf "$BASE/health" >/dev/null; then tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \ @@ -237,13 +223,13 @@ jobs: lifecycle-real: name: Local Provision Lifecycle E2E (real image + MiniMax LLM, advisory) runs-on: docker-host - # Serialise behind the gating stub job: both jobs bind the SAME fixed host - # port :8080, so co-scheduling them on one docker-host runner makes the - # second platform-server fail to bind (fatal) and reds whichever lost the - # race. `needs:` forces this advisory job to start only AFTER lifecycle-stub - # finishes, so they never contend for :8080. continue-on-error keeps a real- - # job miss non-blocking; `needs:` does NOT gate on the stub's success (a - # failed required gate still lets this advisory dependent run). 
+ # Serialise behind the gating stub job: both jobs share the same docker-host + # runner and provision sibling containers. `needs:` forces this advisory job + # to start only AFTER lifecycle-stub finishes, avoiding resource contention. + # (Dynamic ports eliminated the fixed-port race; serialisation remains for + # docker-host capacity hygiene. NOTE(review): it also looks load-bearing for the name-based "Kill stale platform-server" sweep in each job, which would presumably kill the other job's live server if the two overlapped; confirm before removing `needs:`.) continue-on-error keeps a real-job miss + # non-blocking; `needs:` does NOT gate on the stub's success (a failed + # required gate still lets this advisory dependent run). needs: lifecycle-stub if: ${{ always() }} # Tracker for lint-continue-on-error-tracking (Tier 2e / internal#350): this @@ -299,10 +285,14 @@ jobs: - name: Configure platform env run: | + # Allocate an unused ephemeral port to avoid collision with concurrent + # jobs or stale processes from prior cancelled runs (see #2450). NOTE(review): the probe socket is closed before platform-server binds in a later step, so the port is not reserved in between. + PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()") + echo "PORT=${PORT}" >> "$GITHUB_ENV" + echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV" T="lpe2e-real-admin-${{ github.run_id }}-${{ github.run_attempt }}" echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV" echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV" - echo "BASE=http://localhost:8080" >> "$GITHUB_ENV" echo "MOLECULE_ENV=development" >> "$GITHUB_ENV" echo "SECRETS_ENCRYPTION_KEY=lpe2e-test-encryption-key-32bytes!!" >> "$GITHUB_ENV" @@ -312,8 +302,9 @@ jobs: - name: Kill stale platform-server before start (issue #1046) run: | + # Dynamic port allocation (see #2450) eliminates the fixed-port race. + # We still sweep by process name to avoid leaking platform-server + # processes on the shared runner. NOTE(review): the sweep selects by process name, not by owning job, so it would presumably also kill a live platform-server from a concurrently running job; confirm jobs cannot overlap here. 
killed=0 for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do kpid="${pid%/comm}"; kpid="${kpid##*/}" @@ -325,30 +316,23 @@ jobs: fi done if [ "$killed" -gt 0 ]; then echo "Killed $killed stale platform-server process(es)."; else echo "No platform-server-named process found."; fi - # Belt-and-braces: free :8080 from ANY holder regardless of process name - # (a differently-named squatter survives the comm-name scan above, makes - # our bind FATAL, and can false-positive the /health probe). Mirrors the - # stub job's no-flakes fix (tracked alongside #2430). - if command -v fuser >/dev/null 2>&1; then fuser -k 8080/tcp 2>/dev/null || true; fi - if command -v lsof >/dev/null 2>&1; then lsof -ti tcp:8080 2>/dev/null | xargs -r kill -9 2>/dev/null || true; fi - sleep 2 - echo ":8080 freed (comm-scan + port-scan swept any squatter)." + sleep 1 - name: Start platform (background) working-directory: workspace-server run: | - PORT=8080 ./platform-server > platform.log 2>&1 & + : "${PORT:?PORT is unset; the Configure platform env step must export it}"; PORT="$PORT" ./platform-server > platform.log 2>&1 & echo $! > platform.pid - name: Wait for /health (+ migrations applied) run: | DEADLINE=300; PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"; start=$(date +%s) while :; do - # Verify OUR server owns :8080 before trusting /health (no-flakes RCA): - # our server binds :8080 or exits FATAL, so checking our PID first stops - # a squatter answering /health on :8080 from false-positiving the gate. + # Verify OUR server is still alive before trusting /health. Our server + # binds the allocated port or exits FATAL, so checking our PID first + # stops a squatter from false-positiving the gate (no-flakes RCA). The error below names the port so a lost bind race is diagnosable. if [ -n "$PID" ] && ! 
kill -0 "$PID" 2>/dev/null; then - echo "::error::platform-server exited early (failed to bind :8080 or crashed)"; cat workspace-server/platform.log || true; exit 1 + echo "::error::platform-server exited early (failed to bind :${PORT:-?} or crashed)"; cat workspace-server/platform.log || true; exit 1 fi if curl -sf "$BASE/health" >/dev/null; then tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \