Merge pull request #1930 from Molecule-AI/fix/e2e-hermes-boot-timeout

fix(e2e): hermes cold-boot tolerance — 20min deadline + treat failed as transient
This commit is contained in:
Hongming Wang 2026-04-23 17:44:50 -07:00 committed by GitHub
commit 307b5b5408
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -277,20 +277,48 @@ else
fi
# ─── 7. Wait for workspace(s) online ───────────────────────────────────
log "7/11 Waiting for workspace(s) to reach status=online..."
WS_DEADLINE=$(( $(date +%s) + 600 ))
# Hermes cold-boot takes 10-13 min on slow apt days (apt + uv + hermes
# install + npm browser-tools). The controlplane bootstrap-watcher
# deadline fires at 5 min and sets status=failed prematurely; heartbeat
# then transitions failed → online after install.sh finishes. So:
#
# - 20 min deadline (hermes worst-case + slack)
# - 'failed' is a TRANSIENT state we must tolerate — log and keep
# polling, only hard-fail at the deadline. Pre-bootstrap-watcher-fix
# (controlplane#245) this was a flake generator: workspace went
# failed→online inside our window but we bailed at the failed read.
log "7/11 Waiting for workspace(s) to reach status=online (up to 20 min — hermes cold boot)..."
WS_DEADLINE=$(( $(date +%s) + 1200 ))
WS_TO_CHECK="$PARENT_ID"
[ -n "$CHILD_ID" ] && WS_TO_CHECK="$WS_TO_CHECK $CHILD_ID"
for wid in $WS_TO_CHECK; do
WS_LAST_STATUS=""
WS_FAILED_LOGGED=0
while true; do
if [ "$(date +%s)" -gt "$WS_DEADLINE" ]; then
fail "Workspace $wid never reached online within 10 min"
WS_LAST_ERR=$(tenant_call GET "/workspaces/$wid" 2>/dev/null | \
python3 -c "import json,sys; print(json.load(sys.stdin).get('last_sample_error',''))" 2>/dev/null || echo "")
fail "Workspace $wid never reached online within 20 min (last status=$WS_LAST_STATUS, err=$WS_LAST_ERR)"
fi
WS_JSON=$(tenant_call GET "/workspaces/$wid" 2>/dev/null || echo '{}')
WS_STATUS=$(echo "$WS_JSON" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))" 2>/dev/null)
if [ "$WS_STATUS" != "$WS_LAST_STATUS" ]; then
log " $wid$WS_STATUS"
WS_LAST_STATUS="$WS_STATUS"
fi
case "$WS_STATUS" in
online) break ;;
failed) fail "Workspace $wid status=failed: $(echo "$WS_JSON" | python3 -c 'import json,sys; print(json.load(sys.stdin).get("last_sample_error",""))')" ;;
failed)
# Not a hard fail — bootstrap-watcher frequently marks failed at
# 5 min on hermes, then heartbeat recovers to online around 10-13
# min when install.sh finishes. Log once per workspace so the CI
# output isn't spammy.
if [ "$WS_FAILED_LOGGED" = "0" ]; then
log " $wid transiently failed — waiting for heartbeat recovery (bootstrap-watcher deadline, see cp#245)"
WS_FAILED_LOGGED=1
fi
sleep 10
;;
*) sleep 10 ;;
esac
done