2026-05-22 18:52:38 +00:00
1 changed files with 12 additions and 4 deletions
@@ -25,6 +25,11 @@
 # Optional env:
 #   E2E_RUNTIME                  hermes (default) | claude-code | langgraph
 #   E2E_PROVISION_TIMEOUT_SECS   default 900 (15 min cold EC2 budget)
+#   E2E_WORKSPACE_ONLINE_TIMEOUT_SECS  default 3600 (60 min — hermes
+#                                cold-boot worst-case + slack). Raised from
+#                                1800 (#1646) because flaky tenant-provisioning
+#                                latency (not a code regression) causes
+#                                alternating pass/fail on identical SHAs.
 #   E2E_KEEP_ORG                 1 → skip teardown (debugging only)
 #   E2E_RUN_ID                   Slug suffix; CI: ${GITHUB_RUN_ID}
 #   E2E_MODE                     full (default) | smoke
@@ -56,6 +61,7 @@ CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
 ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}"
 RUNTIME="${E2E_RUNTIME:-hermes}"
 PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
+WORKSPACE_ONLINE_TIMEOUT_SECS="${E2E_WORKSPACE_ONLINE_TIMEOUT_SECS:-3600}"
 RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
 MODE="${E2E_MODE:-full}"
 # `canary` is a legacy alias for `smoke` retained for back-compat with
@@ -363,7 +369,7 @@ print(s[:4000])

 wait_workspaces_online_routable() {
  local label="$1"; shift
-  local deadline=$(( $(date +%s) + 1800 ))
+  local deadline=$(( $(date +%s) + WORKSPACE_ONLINE_TIMEOUT_SECS ))
  local wid ws_last_status ws_last_url ws_url_missing_logged ws_failed_logged
  local ws_json ws_status ws_url ws_last_err

@@ -377,7 +383,7 @@ wait_workspaces_online_routable() {
      if [ "$(date +%s)" -gt "$deadline" ]; then
        ws_last_err=$(tenant_call GET "/workspaces/$wid" 2>/dev/null | \
          python3 -c "import json,sys; print(json.load(sys.stdin).get('last_sample_error',''))" 2>/dev/null || echo "")
-        fail "Workspace $wid never reached online with a routable URL within 30 min (last status=$ws_last_status, url=$ws_last_url, err=$ws_last_err)"
+        fail "Workspace $wid never reached online with a routable URL within ${WORKSPACE_ONLINE_TIMEOUT_SECS}s (~$((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min) (last status=$ws_last_status, url=$ws_last_url, err=$ws_last_err)"
      fi
      ws_json=$(tenant_call GET "/workspaces/$wid" 2>/dev/null || echo '{}')
      ws_status=$(echo "$ws_json" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status') or '')" 2>/dev/null)
@@ -526,14 +532,16 @@ fi
 # deadline fires at 5 min and sets status=failed prematurely; heartbeat
 # then transitions failed → online after install.sh finishes. So:
 #
-#   - 30 min deadline (hermes worst-case + slack)
+#   - WORKSPACE_ONLINE_TIMEOUT_SECS deadline (default 3600s = 60 min;
+#     hermes worst-case + slack). Configurable via
+#     E2E_WORKSPACE_ONLINE_TIMEOUT_SECS (#1646).
 #   - 'failed' is a TRANSIENT state we must tolerate — log and keep
 #     polling, only hard-fail at the deadline. Pre-bootstrap-watcher-fix
 #     (controlplane#245) this was a flake generator: workspace went
 #     failed→online inside our window but we bailed at the failed read.
 WS_TO_CHECK=("$PARENT_ID")
 [ -n "$CHILD_ID" ] && WS_TO_CHECK+=("$CHILD_ID")
-wait_workspaces_online_routable "7/11 Waiting for workspace(s) to reach status=online (up to 30 min — hermes cold boot)..." "${WS_TO_CHECK[@]}"
+wait_workspaces_online_routable "7/11 Waiting for workspace(s) to reach status=online (up to $((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min — hermes cold boot)..." "${WS_TO_CHECK[@]}"

 # ─── 7b. Canvas-terminal diagnose (EIC chain probe) ────────────────────
 # This step exists because the canvas-terminal failure of 2026-05-03