Merge pull request 'fix(ci): wait for platform /health on a migration-chain-proof budget (#2205)' (#2206) from fix/e2e-api-health-wait-migration-chain into main

2026-06-04 05:32:28 +00:00
parent 7a72516f7e 382a894f53
commit 7f25373309
4 changed files with 111 additions and 19 deletions
@@ -330,16 +330,40 @@ jobs:
      - name: Wait for /health
        if: needs.detect-changes.outputs.api == 'true'
        run: |
-          for i in $(seq 1 30); do
+          # Readiness signal: the platform binds /health only AFTER the full
+          # migration chain has been applied on cold start (it prints
+          # "Platform starting on :PORT" at that point). So a 200 from /health
+          # is the real "migrations done + server listening" signal.
+          #
+          # The migration chain grows every release, so a fixed ~30s budget is
+          # brittle by construction (it WILL be exceeded as migrations accrue).
+          # Use a generous wall-clock budget, polling fast, with every probe
+          # capped by --max-time so a hung connection cannot stall the loop
+          # indefinitely. This is robust to a growing chain WITHOUT masking a
+          # genuinely dead platform: if the background platform-server
+          # process has exited (e.g. a broken migration crashed it), we stop
+          # and fail loudly at once instead of waiting out the whole budget.
+          DEADLINE_SECS=180          # cold-start + full migration chain headroom
+          PLATFORM_PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"
+          start=$(date +%s)
+          while :; do
-            if curl -sf "$BASE/health" > /dev/null; then
+            if curl -sf --max-time 5 "$BASE/health" > /dev/null; then
-              echo "Platform up after ${i}s"
+              echo "Platform healthy after $(( $(date +%s) - start ))s"
              exit 0
            fi
+            # Fast-fail: if the platform process died, /health will never come.
+            if [ -n "$PLATFORM_PID" ] && ! kill -0 "$PLATFORM_PID" 2>/dev/null; then
+              echo "::error::platform-server (pid ${PLATFORM_PID}) exited before /health became reachable — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
+            if [ "$(( $(date +%s) - start ))" -ge "$DEADLINE_SECS" ]; then
+              echo "::error::Platform did not become healthy within ${DEADLINE_SECS}s — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
            sleep 1
          done
-          echo "::error::Platform did not become healthy in 30s"
-          cat workspace-server/platform.log || true
-          exit 1
      - name: Assert migrations applied
        if: needs.detect-changes.outputs.api == 'true'
        run: |
@@ -242,16 +242,36 @@ jobs:
      - name: Wait for /health
        if: needs.detect-changes.outputs.chat == 'true'
        run: |
-          for i in $(seq 1 30); do
+          # Readiness signal: the platform binds /health only AFTER the full
+          # migration chain has been applied on cold start (it prints
+          # "Platform starting on :PORT" at that point). So a 200 from /health
+          # is the real "migrations done + server listening" signal.
+          #
+          # The migration chain grows every release, so a fixed ~30s budget is
+          # brittle by construction. Use a generous wall-clock budget, polling
+          # fast; --max-time caps each probe so a hung connection cannot stall
+          # the loop indefinitely. A dead platform is not masked: if the
+          # background platform-server process has exited, fail loudly at once.
+          DEADLINE_SECS=180          # cold-start + full migration chain headroom
+          PLATFORM_PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"
+          start=$(date +%s)
+          while :; do
-            if curl -sf "http://127.0.0.1:${PLATFORM_PORT}/health" > /dev/null; then
+            if curl -sf --max-time 5 "http://127.0.0.1:${PLATFORM_PORT}/health" > /dev/null; then
-              echo "Platform up after ${i}s"
+              echo "Platform healthy after $(( $(date +%s) - start ))s"
              exit 0
            fi
+            if [ -n "$PLATFORM_PID" ] && ! kill -0 "$PLATFORM_PID" 2>/dev/null; then
+              echo "::error::platform-server (pid ${PLATFORM_PID}) exited before /health became reachable — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
+            if [ "$(( $(date +%s) - start ))" -ge "$DEADLINE_SECS" ]; then
+              echo "::error::Platform did not become healthy within ${DEADLINE_SECS}s — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
            sleep 1
          done
-          echo "::error::Platform did not become healthy in 30s"
-          cat workspace-server/platform.log || true
-          exit 1

      - name: Install canvas dependencies
        if: needs.detect-changes.outputs.chat == 'true'
@@ -130,13 +130,37 @@ jobs:
        run: |
          set -euo pipefail
          ./workspace-server/platform-server > workspace-server/platform.log 2>&1 &
-          echo $! > workspace-server/platform.pid
-          for i in $(seq 1 30); do
-            curl -sf "$BASE/health" >/dev/null && exit 0
+          PLATFORM_PID=$!
+          echo "$PLATFORM_PID" > workspace-server/platform.pid
+          # Readiness signal: the platform binds /health only AFTER the full
+          # migration chain has been applied on cold start (it prints
+          # "Platform starting on :PORT" at that point). So a 200 from /health
+          # is the real "migrations done + server listening" signal.
+          #
+          # The migration chain grows every release, so a fixed ~30s budget is
+          # brittle by construction. Use a generous wall-clock budget, polling
+          # fast; --max-time caps each probe so a hung connection cannot stall
+          # the loop indefinitely. A dead platform is not masked: if the
+          # background platform-server process has exited, fail loudly at once.
+          DEADLINE_SECS=180          # cold-start + full migration chain headroom
+          start=$(date +%s)
+          while :; do
+            if curl -sf --max-time 5 "$BASE/health" >/dev/null; then
+              echo "Platform healthy after $(( $(date +%s) - start ))s"
+              exit 0
+            fi
+            if ! kill -0 "$PLATFORM_PID" 2>/dev/null; then
+              echo "::error::platform-server (pid ${PLATFORM_PID}) exited before /health became reachable — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
+            if [ "$(( $(date +%s) - start ))" -ge "$DEADLINE_SECS" ]; then
+              echo "::error::Platform did not become healthy within ${DEADLINE_SECS}s — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
            sleep 1
          done
-          cat workspace-server/platform.log || true
-          exit 1

      - name: Run comprehensive E2E
        run: bash tests/e2e/test_comprehensive_e2e.sh
@@ -267,12 +267,36 @@ jobs:
          echo $! > platform.pid
      - name: Wait for /health
        run: |
-          for i in $(seq 1 30); do
-            curl -sf "$BASE/health" > /dev/null && { echo "Platform up after ${i}s"; exit 0; }
+          # Readiness signal: the platform binds /health only AFTER the full
+          # migration chain has been applied on cold start (it prints
+          # "Platform starting on :PORT" at that point). So a 200 from /health
+          # is the real "migrations done + server listening" signal.
+          #
+          # The migration chain grows every release, so a fixed ~30s budget is
+          # brittle by construction. Use a generous wall-clock budget, polling
+          # fast; --max-time caps each probe so a hung connection cannot stall
+          # the loop indefinitely. A dead platform is not masked: if the
+          # background platform-server process has exited, fail loudly at once.
+          DEADLINE_SECS=180          # cold-start + full migration chain headroom
+          PLATFORM_PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"
+          start=$(date +%s)
+          while :; do
+            if curl -sf --max-time 5 "$BASE/health" > /dev/null; then
+              echo "Platform healthy after $(( $(date +%s) - start ))s"
+              exit 0
+            fi
+            if [ -n "$PLATFORM_PID" ] && ! kill -0 "$PLATFORM_PID" 2>/dev/null; then
+              echo "::error::platform-server (pid ${PLATFORM_PID}) exited before /health became reachable — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
+            if [ "$(( $(date +%s) - start ))" -ge "$DEADLINE_SECS" ]; then
+              echo "::error::Platform did not become healthy within ${DEADLINE_SECS}s — see log below"
+              cat workspace-server/platform.log || true
+              exit 1
+            fi
            sleep 1
          done
-          echo "::error::Platform did not become healthy in 30s"
-          cat workspace-server/platform.log || true; exit 1
      - name: Run LOCAL fresh-provision peer-visibility E2E (literal MCP list_peers)
        # HONEST gate — NO continue-on-error. The local backend uses
        # external-mode workspaces so this context tests the literal MCP