diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh index 35786485b..a5d1320fd 100755 --- a/tests/e2e/test_staging_full_saas.sh +++ b/tests/e2e/test_staging_full_saas.sh @@ -25,6 +25,11 @@ # Optional env: # E2E_RUNTIME hermes (default) | claude-code | langgraph # E2E_PROVISION_TIMEOUT_SECS default 900 (15 min cold EC2 budget) +# E2E_WORKSPACE_ONLINE_TIMEOUT_SECS default 3600 (60 min — hermes +# cold-boot worst-case + slack). Raised from +# 1800 (#1646) because flaky tenant-provisioning +# latency (not a code regression) causes +# alternating pass/fail on identical SHAs. # E2E_KEEP_ORG 1 → skip teardown (debugging only) # E2E_RUN_ID Slug suffix; CI: ${GITHUB_RUN_ID} # E2E_MODE full (default) | smoke @@ -56,6 +61,7 @@ CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}" ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}" RUNTIME="${E2E_RUNTIME:-hermes}" PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}" +WORKSPACE_ONLINE_TIMEOUT_SECS="${E2E_WORKSPACE_ONLINE_TIMEOUT_SECS:-3600}" RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}" MODE="${E2E_MODE:-full}" # `canary` is a legacy alias for `smoke` retained for back-compat with @@ -363,7 +369,7 @@ print(s[:4000]) wait_workspaces_online_routable() { local label="$1"; shift - local deadline=$(( $(date +%s) + 1800 )) + local deadline=$(( $(date +%s) + WORKSPACE_ONLINE_TIMEOUT_SECS )) local wid ws_last_status ws_last_url ws_url_missing_logged ws_failed_logged local ws_json ws_status ws_url ws_last_err @@ -377,7 +383,7 @@ wait_workspaces_online_routable() { if [ "$(date +%s)" -gt "$deadline" ]; then ws_last_err=$(tenant_call GET "/workspaces/$wid" 2>/dev/null | \ python3 -c "import json,sys; print(json.load(sys.stdin).get('last_sample_error',''))" 2>/dev/null || echo "") - fail "Workspace $wid never reached online with a routable URL within 30 min (last status=$ws_last_status, url=$ws_last_url, err=$ws_last_err)" + fail 
"Workspace $wid never reached online with a routable URL within ${WORKSPACE_ONLINE_TIMEOUT_SECS}s (~$((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min) (last status=$ws_last_status, url=$ws_last_url, err=$ws_last_err)" fi ws_json=$(tenant_call GET "/workspaces/$wid" 2>/dev/null || echo '{}') ws_status=$(echo "$ws_json" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status') or '')" 2>/dev/null) @@ -526,14 +532,16 @@ fi # deadline fires at 5 min and sets status=failed prematurely; heartbeat # then transitions failed → online after install.sh finishes. So: # -# - 30 min deadline (hermes worst-case + slack) +# - WORKSPACE_ONLINE_TIMEOUT_SECS deadline (default 3600s = 60 min; +# hermes worst-case + slack). Configurable via +# E2E_WORKSPACE_ONLINE_TIMEOUT_SECS (#1646). # - 'failed' is a TRANSIENT state we must tolerate — log and keep # polling, only hard-fail at the deadline. Pre-bootstrap-watcher-fix # (controlplane#245) this was a flake generator: workspace went # failed→online inside our window but we bailed at the failed read. WS_TO_CHECK=("$PARENT_ID") [ -n "$CHILD_ID" ] && WS_TO_CHECK+=("$CHILD_ID") -wait_workspaces_online_routable "7/11 Waiting for workspace(s) to reach status=online (up to 30 min — hermes cold boot)..." "${WS_TO_CHECK[@]}" +wait_workspaces_online_routable "7/11 Waiting for workspace(s) to reach status=online (up to $((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min — hermes cold boot)..." "${WS_TO_CHECK[@]}" # ─── 7b. Canvas-terminal diagnose (EIC chain probe) ──────────────────── # This step exists because the canvas-terminal failure of 2026-05-03