fix(e2e): restart-survival honors RESTART_TIMEOUT + exact-match target container (relates-to #2680) #2688
@@ -55,6 +55,14 @@ export ADMIN_TOKEN MOLECULE_ADMIN_TOKEN="${ADMIN_TOKEN}"
|
||||
ONLINE_TIMEOUT_EXPLICIT=0
|
||||
[ -n "${ONLINE_TIMEOUT:-}" ] && ONLINE_TIMEOUT_EXPLICIT=1
|
||||
ONLINE_TIMEOUT="${ONLINE_TIMEOUT:-90}" # seconds to wait for online
|
||||
|
||||
# Same pattern for RESTART_TIMEOUT (Step 4 restart-survival poll). Initialize
|
||||
# the _EXPLICIT flag and the default BEFORE the LIFECYCLE_LLM=minimax block
|
||||
# runs, so the minimax block can correctly see whether the caller pinned a
|
||||
# value and avoid clobbering it. (CR2 RC #11266 ordering fix.)
|
||||
RESTART_TIMEOUT_EXPLICIT=0
|
||||
[ -n "${RESTART_TIMEOUT:-}" ] && RESTART_TIMEOUT_EXPLICIT=1
|
||||
RESTART_TIMEOUT="${RESTART_TIMEOUT:-$ONLINE_TIMEOUT}"
|
||||
A2A_TIMEOUT="${A2A_TIMEOUT:-30}"
|
||||
STUB_DIR="$(cd "$(dirname "$0")/stub-runtime" && pwd)"
|
||||
RUNTIME="claude-code"
|
||||
@@ -133,8 +141,19 @@ if [ "$LIFECYCLE_LLM" = "minimax" ]; then
|
||||
# The real template boot is heavier than the stub; give it room (unless the
|
||||
# caller pinned ONLINE_TIMEOUT explicitly).
|
||||
[ "$ONLINE_TIMEOUT_EXPLICIT" -eq 0 ] && ONLINE_TIMEOUT=180
|
||||
# Step 4 (restart-survival) has to wait for the REAL-image cold start on top
|
||||
# of the same path — agent SDK boot + MiniMax LLM dial is the slowest leg.
|
||||
# 240s gives the wedge-detector a chance to clear once the agent finally
|
||||
# registers (registry.go's degraded→online path needs ~2-3 successful
|
||||
# heartbeats after the wedge window).
|
||||
[ "${RESTART_TIMEOUT_EXPLICIT:-0}" -eq 0 ] && RESTART_TIMEOUT=240
|
||||
fi
|
||||
|
||||
# RESTART_TIMEOUT governs Step 4 (restart-survival poll). The default
|
||||
# initialization + _EXPLICIT probe happen ABOVE this block (alongside
|
||||
# ONLINE_TIMEOUT), so the LIFECYCLE_LLM=minimax override above can
|
||||
# correctly see whether the caller pinned a value and avoid clobbering it.
|
||||
|
||||
# Image the provisioner should actually run. Default: build the stub. Override
|
||||
# to a real image (a pre-built tag) for the advisory lifecycle-only run.
|
||||
LIFECYCLE_RUNTIME_IMAGE="${LIFECYCLE_RUNTIME_IMAGE:-__BUILD_STUB__}"
|
||||
@@ -214,11 +233,19 @@ container_running() { # container_running <ws-id> -> echoes name if running
|
||||
|
||||
diagnose_provision() {
|
||||
local wsid="${1:-}"
|
||||
local target_name
|
||||
target_name=$(container_name "$wsid")
|
||||
# EXACT-match the target container name. The old `container_running` used
|
||||
# `docker ps --filter "name=ws-${wsid}"` which is a SUBSTRING match, so on a
|
||||
# shared dev daemon with many stale ws-* containers from other dev activity
|
||||
# it could return a non-target container (e.g. ws-${wsid}-stale) and dump
|
||||
# its logs in the diagnostic — obscuring the real failure. Exact match
|
||||
# fixes that (#2680).
|
||||
local container
|
||||
container=$(container_running "$wsid")
|
||||
echo "--- DIAGNOSE provisioning for $wsid ---"
|
||||
container=$(docker ps --format '{{.Names}}' 2>/dev/null | grep -Fx "$target_name" || true)
|
||||
echo "--- DIAGNOSE provisioning for $wsid (target=$target_name) ---"
|
||||
echo "last_sample_error: ${LAST:-<none>}"
|
||||
echo "container_running: ${container:-<none>}"
|
||||
echo "container_running (exact match): ${container:-<none>}"
|
||||
if [ -n "$container" ]; then
|
||||
echo "--- container logs ($container) ---"
|
||||
docker logs "$container" 2>&1 | tail -n 60 || true
|
||||
@@ -227,8 +254,10 @@ diagnose_provision() {
|
||||
echo "--- container reachability test ---"
|
||||
docker exec "$container" sh -c 'echo "platform_url=$PLATFORM_URL"; curl -sfS -m 5 "$PLATFORM_URL/health" 2>&1 || echo "WARN: curl probe failed (curl=$?)"' || true
|
||||
fi
|
||||
echo "--- all ws-* containers ---"
|
||||
docker ps --filter "name=ws-" --format '{{.Names}} {{.Status}}' 2>/dev/null || true
|
||||
# Other ws-* containers from sibling dev activity — clearly labelled as
|
||||
# NOT the target so the failure-mode readout isn't mis-attributed.
|
||||
echo "--- OTHER ws-* containers on this daemon (NOT the target) ---"
|
||||
# Filter on the NAME column only. Each line is "<name> <status...>", so a
# whole-line fixed match (grep -vFx) against the bare container name can never
# match and the target itself would be listed here as a "non-target" container.
# awk prints the placeholder when nothing else is listed; `|| true` keeps this
# diagnostic best-effort even if docker is unavailable.
docker ps --format '{{.Names}} {{.Status}}' 2>/dev/null | awk -v target="$target_name" '$1 ~ /^ws-/ && $1 != target { print; seen = 1 } END { if (!seen) print " (none)" }' || true
|
||||
echo "--- all ws-* volumes ---"
|
||||
docker volume ls -q 2>/dev/null | grep '^ws-' || true
|
||||
echo "--- end diagnose ---"
|
||||
@@ -507,8 +536,15 @@ else
|
||||
# template/configFiles passed) — so this passes ONLY if the config volume
|
||||
# survived the stop and still has config.yaml. A regression (volume reaped /
|
||||
# emptied) surfaces as status=failed with the "config volume is empty" error.
|
||||
#
|
||||
# Use RESTART_TIMEOUT (defaults to ONLINE_TIMEOUT, bumped to 240s in
|
||||
# LIFECYCLE_LLM=minimax mode — the real-image advisory lane). The wedge
|
||||
# detector can legitimately flip status to 'degraded' during the cold-start
|
||||
# window while heartbeats are still ramping up; that's NOT a failure here
|
||||
# (the agent hasn't finished booting yet), so we keep polling until online
|
||||
# OR failed OR the full RESTART_TIMEOUT.
|
||||
STATUS=""; LAST=""
|
||||
for _ in $(seq 1 "$ONLINE_TIMEOUT"); do
|
||||
for _ in $(seq 1 "$RESTART_TIMEOUT"); do
|
||||
WS=$(admin_curl "$BASE/workspaces/$WSID")
|
||||
STATUS=$(ws_field "$WS" "status")
|
||||
LAST=$(ws_field "$WS" "last_sample_error")
|
||||
|
||||
Reference in New Issue
Block a user