2026-06-09 01:42:14 +00:00
1 changed files with 41 additions and 57 deletions
@@ -124,12 +124,16 @@ jobs:

      - name: Configure platform env (admin token + local Docker provisioner)
        run: |
+          # Pick a currently-free ephemeral port so concurrent jobs or stale processes
+          # from cancelled runs cannot collide (#2450). Best-effort: freed before bind.
+          PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
+          echo "PORT=${PORT}" >> "$GITHUB_ENV"
+          echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
          # Deterministic admin token: the script sends MOLECULE_ADMIN_TOKEN as the
          # bearer; the platform checks ADMIN_TOKEN. Set both to the same value.
          T="lpe2e-admin-${{ github.run_id }}-${{ github.run_attempt }}"
          echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
          echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
-          echo "BASE=http://localhost:8080" >> "$GITHUB_ENV"
          # MOLECULE_ENV=development: dev posture. MOLECULE_ORG_ID is left UNSET so
          # main.go wires the LOCAL Docker provisioner (not the CP provisioner), and
          # MOLECULE_IMAGE_REGISTRY is left UNSET so image resolution uses
@@ -143,21 +147,10 @@ jobs:

      - name: Kill stale platform-server before start (issue #1046)
        run: |
-          # ROOT CAUSE of the stub-gate red on docker-host: both this gating job
-          # and the advisory lifecycle-real job bind the SAME fixed host port
-          # :8080 (PORT=8080 ./platform-server). On the small docker-host runner
-          # pool a prior cancelled/timeout run can leave a zombie platform-server
-          # on :8080 (a cancelled run never reaches "Stop platform"), and — until
-          # lifecycle-real was serialised behind this job via needs: — the two
-          # jobs could also co-schedule on one runner and contend for :8080. A
-          # second bind on :8080 is FATAL (the server exits), so "Wait for
-          # /health" times out at 300s and this REQUIRED gate reds. Free the port
-          # before binding — mirrors the e2e-api.yml #1046 fix for the identical
-          # fixed-port-on-shared-runner class.
-          #
-          # /proc scan — works on any Linux without pkill/lsof/ss. comm is
-          # truncated to 15 chars: "platform-serve" matches "platform-server".
-          # Verify via cmdline to avoid false positives.
+          # Dynamic port allocation (see #2450) eliminates the fixed-port race
+          # that made this gate go red when a prior run left a zombie process.
+          # We still sweep by name so stale platform-server processes do not pile
+          # up on the shared runner. NOTE(review): may also hit concurrent jobs.
          killed=0
          for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do
            kpid="${pid%/comm}"; kpid="${kpid##*/}"
@@ -169,35 +162,28 @@ jobs:
            fi
          done
          if [ "$killed" -gt 0 ]; then echo "Killed $killed stale platform-server process(es)."; else echo "No platform-server-named process found."; fi
-          # Belt-and-braces: also free :8080 from ANY holder regardless of process
-          # name. A differently-named squatter (e.g. a leftover Fastify dev server
-          # from another job) survives the comm-name scan above, makes our bind
-          # FATAL, and can false-positive the /health probe below (no-flakes RCA;
-          # tracked alongside #2430). fuser/lsof are present on the ubuntu runner;
-          # if neither exists the name-scan above is the floor.
-          if command -v fuser >/dev/null 2>&1; then fuser -k 8080/tcp 2>/dev/null || true; fi
-          if command -v lsof  >/dev/null 2>&1; then lsof -ti tcp:8080 2>/dev/null | xargs -r kill -9 2>/dev/null || true; fi
-          sleep 2
-          echo ":8080 freed (comm-scan + port-scan swept any squatter)."
+          sleep 1

      - name: Start platform (background)
        working-directory: workspace-server
        run: |
-          # Bind to :8080 (the script's BASE). DATABASE_URL/REDIS_URL/ADMIN_TOKEN/
-          # MOLECULE_ENV are inherited from $GITHUB_ENV.
-          PORT=8080 ./platform-server > platform.log 2>&1 &
+          # Bind to the dynamically allocated port (see #2450).
+          # DATABASE_URL/REDIS_URL/ADMIN_TOKEN/MOLECULE_ENV are inherited from
+          # $GITHUB_ENV.
+          PORT=$PORT ./platform-server > platform.log 2>&1 &
          echo $! > platform.pid

      - name: Wait for /health (+ migrations applied)
        run: |
          DEADLINE=300; PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"; start=$(date +%s)
          while :; do
-            # Verify OUR server owns :8080 BEFORE trusting /health. Our server binds
-            # :8080 or exits FATAL, so "our PID alive" <=> "we own :8080"; checking it
-            # first stops a squatter that answers /health on :8080 (our bind having
-            # failed) from false-positiving the gate (no-flakes RCA).
+            # Verify OUR server is still alive before trusting /health. Our server
+            # binds the allocated port or exits FATAL, so "our PID alive" <=>
+            # "we own the port"; checking it first stops a squatter that answers
+            # /health on the same port (our bind having failed) from false-positiving
+            # the gate (no-flakes RCA).
            if [ -n "$PID" ] && ! kill -0 "$PID" 2>/dev/null; then
-              echo "::error::platform-server exited early (failed to bind :8080 or crashed)"; cat workspace-server/platform.log || true; exit 1
+              echo "::error::platform-server exited early (failed to bind or crashed)"; cat workspace-server/platform.log || true; exit 1
            fi
            if curl -sf "$BASE/health" >/dev/null; then
              tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \
@@ -237,13 +223,13 @@ jobs:
  lifecycle-real:
    name: Local Provision Lifecycle E2E (real image + MiniMax LLM, advisory)
    runs-on: docker-host
-    # Serialise behind the gating stub job: both jobs bind the SAME fixed host
-    # port :8080, so co-scheduling them on one docker-host runner makes the
-    # second platform-server fail to bind (fatal) and reds whichever lost the
-    # race. `needs:` forces this advisory job to start only AFTER lifecycle-stub
-    # finishes, so they never contend for :8080. continue-on-error keeps a real-
-    # job miss non-blocking; `needs:` does NOT gate on the stub's success (a
-    # failed required gate still lets this advisory dependent run).
+    # Serialise behind the gating stub job: both jobs share the same docker-host
+    # runner and provision sibling containers. `needs:` forces this advisory job
+    # to start only AFTER lifecycle-stub finishes, avoiding resource contention.
+    # (Dynamic ports eliminated the fixed-port race; serialisation remains for
+    # docker-host capacity hygiene.) continue-on-error keeps a real-job miss
+    # non-blocking; `needs:` does NOT gate on the stub's success (a failed
+    # required gate still lets this advisory dependent run).
    needs: lifecycle-stub
    if: ${{ always() }}
    # Tracker for lint-continue-on-error-tracking (Tier 2e / internal#350): this
@@ -299,10 +285,14 @@ jobs:

      - name: Configure platform env
        run: |
+          # Pick a currently-free ephemeral port so concurrent jobs or stale processes
+          # from cancelled runs cannot collide (#2450). Best-effort: freed before bind.
+          PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")
+          echo "PORT=${PORT}" >> "$GITHUB_ENV"
+          echo "BASE=http://localhost:${PORT}" >> "$GITHUB_ENV"
          T="lpe2e-real-admin-${{ github.run_id }}-${{ github.run_attempt }}"
          echo "ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
          echo "MOLECULE_ADMIN_TOKEN=${T}" >> "$GITHUB_ENV"
-          echo "BASE=http://localhost:8080" >> "$GITHUB_ENV"
          echo "MOLECULE_ENV=development" >> "$GITHUB_ENV"
          echo "SECRETS_ENCRYPTION_KEY=lpe2e-test-encryption-key-32bytes!!" >> "$GITHUB_ENV"

@@ -312,8 +302,9 @@ jobs:

      - name: Kill stale platform-server before start (issue #1046)
        run: |
-          # Same fixed-:8080 hygiene as the stub job — free the port from any
-          # zombie left by a cancelled run before this job binds it.
+          # Dynamic port allocation (see #2450) eliminates the fixed-port race.
+          # We still sweep by name so stale platform-server processes do not pile
+          # up on the shared runner. NOTE(review): may also hit concurrent jobs.
          killed=0
          for pid in $(grep -l "platform-serve" /proc/[0-9]*/comm 2>/dev/null); do
            kpid="${pid%/comm}"; kpid="${kpid##*/}"
@@ -325,30 +316,23 @@ jobs:
            fi
          done
          if [ "$killed" -gt 0 ]; then echo "Killed $killed stale platform-server process(es)."; else echo "No platform-server-named process found."; fi
-          # Belt-and-braces: free :8080 from ANY holder regardless of process name
-          # (a differently-named squatter survives the comm-name scan above, makes
-          # our bind FATAL, and can false-positive the /health probe). Mirrors the
-          # stub job's no-flakes fix (tracked alongside #2430).
-          if command -v fuser >/dev/null 2>&1; then fuser -k 8080/tcp 2>/dev/null || true; fi
-          if command -v lsof  >/dev/null 2>&1; then lsof -ti tcp:8080 2>/dev/null | xargs -r kill -9 2>/dev/null || true; fi
-          sleep 2
-          echo ":8080 freed (comm-scan + port-scan swept any squatter)."
+          sleep 1

      - name: Start platform (background)
        working-directory: workspace-server
        run: |
-          PORT=8080 ./platform-server > platform.log 2>&1 &
+          PORT=$PORT ./platform-server > platform.log 2>&1 &
          echo $! > platform.pid

      - name: Wait for /health (+ migrations applied)
        run: |
          DEADLINE=300; PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"; start=$(date +%s)
          while :; do
-            # Verify OUR server owns :8080 before trusting /health (no-flakes RCA):
-            # our server binds :8080 or exits FATAL, so checking our PID first stops
-            # a squatter answering /health on :8080 from false-positiving the gate.
+            # Verify OUR server is still alive before trusting /health. Our server
+            # binds the allocated port or exits FATAL, so checking our PID first
+            # stops a squatter from false-positiving the gate (no-flakes RCA).
            if [ -n "$PID" ] && ! kill -0 "$PID" 2>/dev/null; then
-              echo "::error::platform-server exited early (failed to bind :8080 or crashed)"; cat workspace-server/platform.log || true; exit 1
+              echo "::error::platform-server exited early (failed to bind or crashed)"; cat workspace-server/platform.log || true; exit 1
            fi
            if curl -sf "$BASE/health" >/dev/null; then
              tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \