fix(e2e): restart-survival honors RESTART_TIMEOUT + exact-match target container (relates-to #2680) #2688

Merged
devops-engineer merged 2 commits from fix/restart-context-degraded-2680 into main 2026-06-13 01:37:58 +00:00
@@ -55,6 +55,14 @@ export ADMIN_TOKEN MOLECULE_ADMIN_TOKEN="${ADMIN_TOKEN}"
ONLINE_TIMEOUT_EXPLICIT=0
[ -n "${ONLINE_TIMEOUT:-}" ] && ONLINE_TIMEOUT_EXPLICIT=1
ONLINE_TIMEOUT="${ONLINE_TIMEOUT:-90}" # seconds to wait for online
# Same pattern for RESTART_TIMEOUT (Step 4 restart-survival poll). Initialize
# the _EXPLICIT flag and the default BEFORE the LIFECYCLE_LLM=minimax block
# runs, so the minimax block can correctly see whether the caller pinned a
# value and avoid clobbering it. (CR2 RC #11266 ordering fix.)
RESTART_TIMEOUT_EXPLICIT=0
[ -n "${RESTART_TIMEOUT:-}" ] && RESTART_TIMEOUT_EXPLICIT=1
RESTART_TIMEOUT="${RESTART_TIMEOUT:-$ONLINE_TIMEOUT}"
A2A_TIMEOUT="${A2A_TIMEOUT:-30}"
STUB_DIR="$(cd "$(dirname "$0")/stub-runtime" && pwd)"
RUNTIME="claude-code"
@@ -133,8 +141,19 @@ if [ "$LIFECYCLE_LLM" = "minimax" ]; then
# The real template boot is heavier than the stub; give it room (unless the
# caller pinned ONLINE_TIMEOUT explicitly).
[ "$ONLINE_TIMEOUT_EXPLICIT" -eq 0 ] && ONLINE_TIMEOUT=180
# Step 4 (restart-survival) has to wait for the REAL-image cold start on top
# of the same path — agent SDK boot + MiniMax LLM dial is the slowest leg.
# 240s gives the wedge-detector a chance to clear once the agent finally
# registers (registry.go's degraded→online path needs ~2-3 successful
# heartbeats after the wedge window).
[ "${RESTART_TIMEOUT_EXPLICIT:-0}" -eq 0 ] && RESTART_TIMEOUT=240
fi
# RESTART_TIMEOUT governs Step 4 (restart-survival poll). The default
# initialization + _EXPLICIT probe happen ABOVE this block (alongside
# ONLINE_TIMEOUT), so the LIFECYCLE_LLM=minimax override below can
# correctly see whether the caller pinned a value and avoid clobbering it.
# Image the provisioner should actually run. Default: build the stub. Override
# to a real image (a pre-built tag) for the advisory lifecycle-only run.
LIFECYCLE_RUNTIME_IMAGE="${LIFECYCLE_RUNTIME_IMAGE:-__BUILD_STUB__}"
@@ -214,11 +233,19 @@ container_running() { # container_running <ws-id> -> echoes name if running
diagnose_provision() {
local wsid="${1:-}"
local target_name
target_name=$(container_name "$wsid")
# EXACT-match the target container name. The old `container_running` used
# `docker ps --filter "name=ws-${wsid}"` which is a SUBSTRING match, so on a
# shared dev daemon with many stale ws-* containers from other dev activity
# it could return a non-target container (e.g. ws-${wsid}-stale) and dump
# its logs in the diagnostic — obscuring the real failure. Exact match
# fixes that (#2680).
local container
container=$(container_running "$wsid")
echo "--- DIAGNOSE provisioning for $wsid ---"
container=$(docker ps --format '{{.Names}}' 2>/dev/null | grep -Fx "$target_name" || true)
echo "--- DIAGNOSE provisioning for $wsid (target=$target_name) ---"
echo "last_sample_error: ${LAST:-<none>}"
echo "container_running: ${container:-<none>}"
echo "container_running (exact match): ${container:-<none>}"
if [ -n "$container" ]; then
echo "--- container logs ($container) ---"
docker logs "$container" 2>&1 | tail -n 60 || true
@@ -227,8 +254,10 @@ diagnose_provision() {
echo "--- container reachability test ---"
docker exec "$container" sh -c 'echo "platform_url=$PLATFORM_URL"; curl -sfS -m 5 "$PLATFORM_URL/health" 2>&1 || echo "WARN: curl probe failed (curl=$?)"' || true
fi
echo "--- all ws-* containers ---"
docker ps --filter "name=ws-" --format '{{.Names}} {{.Status}}' 2>/dev/null || true
# Other ws-* containers from sibling dev activity — clearly labelled as
# NOT the target so the failure-mode readout isn't mis-attributed.
echo "--- OTHER ws-* containers on this daemon (NOT the target) ---"
docker ps --format '{{.Names}} {{.Status}}' 2>/dev/null | grep -E '^ws-' | grep -vFx "$target_name" || echo " (none)"
echo "--- all ws-* volumes ---"
docker volume ls -q 2>/dev/null | grep '^ws-' || true
echo "--- end diagnose ---"
@@ -507,8 +536,15 @@ else
# template/configFiles passed) — so this passes ONLY if the config volume
# survived the stop and still has config.yaml. A regression (volume reaped /
# emptied) surfaces as status=failed with the "config volume is empty" error.
#
# Use RESTART_TIMEOUT (defaults to ONLINE_TIMEOUT, bumped to 240s in
# LIFECYCLE_LLM=minimax mode — the real-image advisory lane). The wedge
# detector can legitimately flip status to 'degraded' during the cold-start
# window while heartbeats are still ramping up; that's NOT a failure here
# (the agent hasn't finished booting yet), so we keep polling until online
# OR failed OR the full RESTART_TIMEOUT.
STATUS=""; LAST=""
for _ in $(seq 1 "$ONLINE_TIMEOUT"); do
for _ in $(seq 1 "$RESTART_TIMEOUT"); do
WS=$(admin_curl "$BASE/workspaces/$WSID")
STATUS=$(ws_field "$WS" "status")
LAST=$(ws_field "$WS" "last_sample_error")