diff --git a/.gitea/workflows/e2e-api.yml b/.gitea/workflows/e2e-api.yml
index 5df6efff..336fbcdb 100644
--- a/.gitea/workflows/e2e-api.yml
+++ b/.gitea/workflows/e2e-api.yml
@@ -69,6 +69,13 @@ name: E2E API Smoke Test
 # 2318) shows Postgres ready in 3s, Redis in 1s, Platform in 1s when
 # they DO come up. Timeouts are not the bottleneck; not bumped.
 #
+# Item #1046 (fixed 2026-05-14): a stale platform-server from a cancelled
+# run lingers on :8080 because the "Stop platform" step is skipped when
+# the workflow is cancelled before reaching it. Added a "Kill stale
+# platform-server" step that scans /proc for leftover platform-server
+# processes and stops them. It runs after "Pick platform port" and before
+# "Start platform (background)", so the fresh server starts on a clean host.
+#
 # Item explicitly NOT fixed here: failing test `Status back online`
 # fails because the platform's langgraph workspace template image
 # (ghcr.io/molecule-ai/workspace-template-langgraph:latest) returns
@@ -283,6 +290,54 @@ jobs:
           echo "PORT=${PLATFORM_PORT}" >> "$GITHUB_ENV"
           echo "BASE=http://127.0.0.1:${PLATFORM_PORT}" >> "$GITHUB_ENV"
           echo "Platform host port: ${PLATFORM_PORT}"
+      - name: Kill stale platform-server before start (issue #1046)
+        if: needs.detect-changes.outputs.api == 'true'
+        run: |
+          # A cancelled or timed-out run never reaches the "Stop platform"
+          # step, so its platform-server can outlive the run on this
+          # host-network act_runner. Scan /proc for leftover
+          # platform-server processes and stop them before the
+          # "Start platform (background)" step launches a fresh one.
+          #
+          # NOTE(review): the match is by process name only, so this also
+          # stops a platform-server owned by another job running at the
+          # same time on this host. Confirm jobs are serialised per runner.
+          #
+          # Uses only bash builtins + grep + cut + tr + sleep, all present
+          # on any Ubuntu runner.
+          #
+          # `|| true` keeps an empty scan (grep exits 1) or a process that
+          # vanished mid-scan from failing the step under -e / pipefail.
+          stale_pids=$(grep -l 'platform-server' /proc/[0-9]*/comm 2>/dev/null \
+            | cut -d/ -f3 || true)
+          killed=""
+          for pid in $stale_pids; do
+            # Confirm against the full command line; an unreadable cmdline
+            # means the process already exited.
+            cmdline=$( { tr '\0' ' ' < "/proc/${pid}/cmdline"; } 2>/dev/null || true )
+            case "$cmdline" in
+              *platform-server*) ;;
+              *) continue ;;
+            esac
+            echo "Killing stale platform-server pid ${pid}: ${cmdline}"
+            if kill "$pid" 2>/dev/null; then
+              killed="${killed} ${pid}"
+            fi
+          done
+          # SIGTERM is asynchronous: wait up to 5s for each process to exit
+          # and escalate to SIGKILL if it is still alive, so the old server
+          # cannot keep its port when the new one starts.
+          for pid in $killed; do
+            for attempt in 1 2 3 4 5; do
+              kill -0 "$pid" 2>/dev/null || break
+              sleep 1
+            done
+            if kill -0 "$pid" 2>/dev/null; then
+              echo "pid ${pid} still alive after SIGTERM; sending SIGKILL"
+              kill -9 "$pid" 2>/dev/null || true
+            fi
+          done
+          echo "Stale platform-server cleanup complete."
       - name: Start platform (background)
         if: needs.detect-changes.outputs.api == 'true'
         working-directory: workspace-server