From eaf1f5438e4e60c83e5dffa8704a7be80b3a6312 Mon Sep 17 00:00:00 2001
From: "Molecule AI Dev Engineer B (MiniMax)"
Date: Thu, 4 Jun 2026 04:54:18 +0000
Subject: [PATCH] =?UTF-8?q?fix(ci):=20e2e-api=20health-wait=20180s=E2=86=92?=
 =?UTF-8?q?300s=20+=20migration=20completion=20gate=20(resolves=20#2205)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issue #2205 reports E2E API Smoke health-wait times out while platform
migrations are still running. The previous step polled /health against a
180s deadline with no migration awareness, so it could exit 0 before the
DB was actually usable, causing downstream steps to flake on "no such
table".

Hybrid fix:

1. Raise DEADLINE_SECS 180→300 (1s sleep between probes, 5min ceiling —
   enough for the full migration chain on cold-cache runners).
2. Gate exit on the same workspaces-table existence check the downstream
   "Assert migrations applied" step uses. We now only declare success
   when both /health=200 AND the workspaces table is present. On
   timeout, the error reports the last table-probe result so it is
   clear which half of the gate failed.
3. The downstream "Assert migrations applied" step stays as a
   defense-in-depth final check; with the new gate it should always pass
   on a clean run.

Co-Authored-By: Claude Opus 4.7
---
 .gitea/workflows/e2e-api.yml | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/.gitea/workflows/e2e-api.yml b/.gitea/workflows/e2e-api.yml
index c1b9eea6e..d23572fa2 100644
--- a/.gitea/workflows/e2e-api.yml
+++ b/.gitea/workflows/e2e-api.yml
@@ -327,7 +327,12 @@ jobs:
           # start-redis steps point at this run's per-run host ports.
           ./platform-server > platform.log 2>&1 &
           echo $! > platform.pid
-      - name: Wait for /health
+      - name: Wait for /health (with migration completion gate)
+        # Issue #2205: the previous 180s budget is insufficient while the
+        # migration chain is still running, and a 200 from /health alone was
+        # treated as "DB ready", so later DB-touching steps could fail. Fix:
+        # raise the budget to 300s AND gate exit on the same workspaces-table
+        # existence check the downstream "Assert migrations applied" uses.
         if: needs.detect-changes.outputs.api == 'true'
         run: |
           # Readiness signal: the platform binds /health only AFTER the full
@@ -343,13 +348,21 @@ jobs:
           # background platform-server process has exited (e.g. a broken
           # migration crashed it), we stop and fail loudly at once instead of
           # waiting out the whole budget.
-          DEADLINE_SECS=180 # cold-start + full migration chain headroom
+          #
+          # Issue #2205: do not treat a 200 from /health as sufficient on its
+          # own. Exit only once the workspaces table also exists — the same
+          # check the downstream "Assert migrations applied" step uses.
+          DEADLINE_SECS=300 # cold-start + full migration chain headroom
           PLATFORM_PID="$(cat workspace-server/platform.pid 2>/dev/null || true)"
           start=$(date +%s)
           while :; do
             if curl -sf "$BASE/health" > /dev/null; then
-              echo "Platform healthy after $(( $(date +%s) - start ))s"
-              exit 0
+              tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc \
+                "SELECT count(*) FROM information_schema.tables WHERE table_schema='public' AND table_name='workspaces'" 2>/dev/null || echo "0")
+              if [ "$tables" = "1" ]; then
+                echo "Platform healthy + migrations applied after $(( $(date +%s) - start ))s"
+                exit 0
+              fi
             fi
             # Fast-fail: if the platform process died, /health will never come.
             if [ -n "$PLATFORM_PID" ] && ! kill -0 "$PLATFORM_PID" 2>/dev/null; then
@@ -358,12 +371,13 @@ jobs:
               exit 1
             fi
             if [ "$(( $(date +%s) - start ))" -ge "$DEADLINE_SECS" ]; then
-              echo "::error::Platform did not become healthy within ${DEADLINE_SECS}s — see log below"
+              echo "::error::Platform did not become healthy with migrations applied within ${DEADLINE_SECS}s (last workspaces-table probe: ${tables:-not run}) — see log below"
               cat workspace-server/platform.log || true
               exit 1
             fi
             sleep 1
           done
+
       - name: Assert migrations applied
         if: needs.detect-changes.outputs.api == 'true'
         run: |
-- 
2.52.0