From 89763a451f77e8c57ea0a72d470a06b9b39d7000 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Sat, 13 Jun 2026 01:12:12 +0000 Subject: [PATCH 1/2] fix(e2e): restart-survival honors RESTART_TIMEOUT + exact-match target container (#2680) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The local-provision real-image advisory restart-survival lane (test_local_provision_lifecycle_e2e.sh Step 4) was failing intermittently in LIFECYCLE_LLM=minimax mode at the 180s ONLINE_TIMEOUT boundary, with status=degraded. Two root causes per the #2680 RCA: 1. The Step 4 restart poll reused ONLINE_TIMEOUT, the initial-provision timeout (180s in MiniMax mode), instead of a separate timeout for the restart step. The restart's real cold-start path is slower than the initial provision (agent SDK boot + MiniMax LLM dial on top of the same path), and the wedge detector can legitimately flip status to 'degraded' during the cold-start window while heartbeats are still ramping up. Treating 'degraded' as 'still recovering' (just keep polling) plus a longer timeout (240s) lets the registry.go degraded→online path clear once the agent finally registers (needs ~2-3 successful heartbeats after the wedge window). 2. diagnose_provision() (via container_running) used docker ps --filter 'name=ws-${wsid}', which is a SUBSTRING match, so on a shared dev daemon with many stale ws-* containers from sibling dev activity, it could return a non-target container (e.g. ws-${wsid}-stale) and dump ITS logs in the diagnostic — obscuring the real failure. Switched to exact-match (grep -Fx on the full container name) and clearly label the 'other ws-*' list as NOT the target. NO production code change — both fixes are in the test harness only. Production behavior (registry.go wedge detector, workspace_restart.go provisioning path) is unchanged. The advisory lane (lifecycle-real CI job) gets accurate pass/fail signal; the required stub gate is preserved (the test still passes on the stub LIFECYCLE_LLM='' path). 
Verified: bash -n clean on the script; go build + go vet clean in workspace-server; existing handler/registry/restart Go tests all pass. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../e2e/test_local_provision_lifecycle_e2e.sh | 42 ++++++++++++++++--- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/tests/e2e/test_local_provision_lifecycle_e2e.sh b/tests/e2e/test_local_provision_lifecycle_e2e.sh index 235da18b..048325af 100755 --- a/tests/e2e/test_local_provision_lifecycle_e2e.sh +++ b/tests/e2e/test_local_provision_lifecycle_e2e.sh @@ -133,8 +133,21 @@ if [ "$LIFECYCLE_LLM" = "minimax" ]; then # The real template boot is heavier than the stub; give it room (unless the # caller pinned ONLINE_TIMEOUT explicitly). [ "$ONLINE_TIMEOUT_EXPLICIT" -eq 0 ] && ONLINE_TIMEOUT=180 + # Step 4 (restart-survival) has to wait for the REAL-image cold start on top + # of the same path — agent SDK boot + MiniMax LLM dial is the slowest leg. + # 240s gives the wedge-detector a chance to clear once the agent finally + # registers (registry.go's degraded→online path needs ~2-3 successful + # heartbeats after the wedge window). + [ "${RESTART_TIMEOUT_EXPLICIT:-0}" -eq 0 ] && RESTART_TIMEOUT=240 fi +# RESTART_TIMEOUT governs Step 4 (restart-survival poll). Default = same as +# ONLINE_TIMEOUT for stub mode. LIFECYCLE_LLM=minimax above bumps it to 240s +# for the real-image advisory lane. Callers can pin it via RESTART_TIMEOUT env. +RESTART_TIMEOUT_EXPLICIT=0 +[ -n "${RESTART_TIMEOUT:-}" ] && RESTART_TIMEOUT_EXPLICIT=1 +RESTART_TIMEOUT="${RESTART_TIMEOUT:-$ONLINE_TIMEOUT}" + # Image the provisioner should actually run. Default: build the stub. Override # to a real image (a pre-built tag) for the advisory lifecycle-only run. 
LIFECYCLE_RUNTIME_IMAGE="${LIFECYCLE_RUNTIME_IMAGE:-__BUILD_STUB__}" @@ -214,11 +227,19 @@ container_running() { # container_running -> echoes name if running diagnose_provision() { local wsid="${1:-}" + local target_name + target_name=$(container_name "$wsid") + # EXACT-match the target container name. The old `container_running` used + # `docker ps --filter "name=ws-${wsid}"` which is a SUBSTRING match, so on a + # shared dev daemon with many stale ws-* containers from other dev activity + # it could return a non-target container (e.g. ws-${wsid}-stale) and dump + # its logs in the diagnostic — obscuring the real failure. Exact match + # fixes that (#2680). local container - container=$(container_running "$wsid") - echo "--- DIAGNOSE provisioning for $wsid ---" + container=$(docker ps --format '{{.Names}}' 2>/dev/null | grep -Fx "$target_name" || true) + echo "--- DIAGNOSE provisioning for $wsid (target=$target_name) ---" echo "last_sample_error: ${LAST:-}" - echo "container_running: ${container:-}" + echo "container_running (exact match): ${container:-}" if [ -n "$container" ]; then echo "--- container logs ($container) ---" docker logs "$container" 2>&1 | tail -n 60 || true @@ -227,8 +248,10 @@ diagnose_provision() { echo "--- container reachability test ---" docker exec "$container" sh -c 'echo "platform_url=$PLATFORM_URL"; curl -sfS -m 5 "$PLATFORM_URL/health" 2>&1 || echo "WARN: curl probe failed (curl=$?)"' || true fi - echo "--- all ws-* containers ---" - docker ps --filter "name=ws-" --format '{{.Names}} {{.Status}}' 2>/dev/null || true + # Other ws-* containers from sibling dev activity — clearly labelled as + # NOT the target so the failure-mode readout isn't mis-attributed. 
+ echo "--- OTHER ws-* containers on this daemon (NOT the target) ---" + docker ps --format '{{.Names}} {{.Status}}' 2>/dev/null | grep -E '^ws-' | grep -vFx "$target_name" || echo " (none)" echo "--- all ws-* volumes ---" docker volume ls -q 2>/dev/null | grep '^ws-' || true echo "--- end diagnose ---" @@ -507,8 +530,15 @@ else # template/configFiles passed) — so this passes ONLY if the config volume # survived the stop and still has config.yaml. A regression (volume reaped / # emptied) surfaces as status=failed with the "config volume is empty" error. + # + # Use RESTART_TIMEOUT (defaults to ONLINE_TIMEOUT, bumped to 240s in + # LIFECYCLE_LLM=minimax mode — the real-image advisory lane). The wedge + # detector can legitimately flip status to 'degraded' during the cold-start + # window while heartbeats are still ramping up; that's NOT a failure here + # (the agent hasn't finished booting yet), so we keep polling until online + # OR failed OR the full RESTART_TIMEOUT. STATUS=""; LAST="" - for _ in $(seq 1 "$ONLINE_TIMEOUT"); do + for _ in $(seq 1 "$RESTART_TIMEOUT"); do WS=$(admin_curl "$BASE/workspaces/$WSID") STATUS=$(ws_field "$WS" "status") LAST=$(ws_field "$WS" "last_sample_error") -- 2.52.0 From c09dfd51551bd6f0a0d068b1d09daad4d64d4c15 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer B (MiniMax)" Date: Sat, 13 Jun 2026 01:28:03 +0000 Subject: [PATCH 2/2] fix(cp#2688): initialize RESTART_TIMEOUT_EXPLICIT BEFORE the minimax override (CR2 RC #11266) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CR2 RC #11266 caught an ordering bug: RESTART_TIMEOUT_EXPLICIT was initialized AFTER the LIFECYCLE_LLM=minimax block, so the minimax override at line 149 saw the flag as unset (default 0) regardless of whether the caller had pinned RESTART_TIMEOUT. A caller that pinned RESTART_TIMEOUT=300 would have their value silently overwritten to the minimax default of 240. 
The PR body claimed 'callers can pin RESTART_TIMEOUT' mirroring the ONLINE_TIMEOUT_EXPLICIT pattern, but the implementation did not preserve that contract. Fix: move the RESTART_TIMEOUT_EXPLICIT initialization + the default RESTART_TIMEOUT derivation from line 147-149 (post-minimax) to right after the ONLINE_TIMEOUT block at line 58 (pre-minimax), so the minimax block can correctly see whether the caller pinned a value: # Pre-minimax (mirrors ONLINE_TIMEOUT_EXPLICIT) RESTART_TIMEOUT_EXPLICIT=0 [ -n "${RESTART_TIMEOUT:-}" ] && RESTART_TIMEOUT_EXPLICIT=1 RESTART_TIMEOUT="${RESTART_TIMEOUT:-$ONLINE_TIMEOUT}" # Minimax block (unchanged behavior, but now correctly conditional) if [ "$LIFECYCLE_LLM" = "minimax" ]; then ... [ "$ONLINE_TIMEOUT_EXPLICIT" -eq 0 ] && ONLINE_TIMEOUT=180 [ "${RESTART_TIMEOUT_EXPLICIT:-0}" -eq 0 ] && RESTART_TIMEOUT=240 ... fi Now if a caller pins RESTART_TIMEOUT=300, the minimax block sees RESTART_TIMEOUT_EXPLICIT=1 and the override is skipped. The default derivation at line 65 also no longer runs after the minimax block, removing the redundant re-derivation. Verified: bash -n clean on the script. The exact-match diagnostic change (grep -Fx) is unchanged — CR2 approved that part. Only the timeout override ordering is fixed. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../e2e/test_local_provision_lifecycle_e2e.sh | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/e2e/test_local_provision_lifecycle_e2e.sh b/tests/e2e/test_local_provision_lifecycle_e2e.sh index 048325af..cfea297d 100755 --- a/tests/e2e/test_local_provision_lifecycle_e2e.sh +++ b/tests/e2e/test_local_provision_lifecycle_e2e.sh @@ -55,6 +55,14 @@ export ADMIN_TOKEN MOLECULE_ADMIN_TOKEN="${ADMIN_TOKEN}" ONLINE_TIMEOUT_EXPLICIT=0 [ -n "${ONLINE_TIMEOUT:-}" ] && ONLINE_TIMEOUT_EXPLICIT=1 ONLINE_TIMEOUT="${ONLINE_TIMEOUT:-90}" # seconds to wait for online + +# Same pattern for RESTART_TIMEOUT (Step 4 restart-survival poll). 
Initialize +# the _EXPLICIT flag and the default BEFORE the LIFECYCLE_LLM=minimax block +# runs, so the minimax block can correctly see whether the caller pinned a +# value and avoid clobbering it. (CR2 RC #11266 ordering fix.) +RESTART_TIMEOUT_EXPLICIT=0 +[ -n "${RESTART_TIMEOUT:-}" ] && RESTART_TIMEOUT_EXPLICIT=1 +RESTART_TIMEOUT="${RESTART_TIMEOUT:-$ONLINE_TIMEOUT}" A2A_TIMEOUT="${A2A_TIMEOUT:-30}" STUB_DIR="$(cd "$(dirname "$0")/stub-runtime" && pwd)" RUNTIME="claude-code" @@ -141,12 +149,10 @@ if [ "$LIFECYCLE_LLM" = "minimax" ]; then [ "${RESTART_TIMEOUT_EXPLICIT:-0}" -eq 0 ] && RESTART_TIMEOUT=240 fi -# RESTART_TIMEOUT governs Step 4 (restart-survival poll). Default = same as -# ONLINE_TIMEOUT for stub mode. LIFECYCLE_LLM=minimax above bumps it to 240s -# for the real-image advisory lane. Callers can pin it via RESTART_TIMEOUT env. -RESTART_TIMEOUT_EXPLICIT=0 -[ -n "${RESTART_TIMEOUT:-}" ] && RESTART_TIMEOUT_EXPLICIT=1 -RESTART_TIMEOUT="${RESTART_TIMEOUT:-$ONLINE_TIMEOUT}" +# RESTART_TIMEOUT governs Step 4 (restart-survival poll). Its default +# initialization + _EXPLICIT probe happen near the top of the script (alongside +# ONLINE_TIMEOUT), so the LIFECYCLE_LLM=minimax override above can +# correctly see whether the caller pinned a value and avoid clobbering it. # Image the provisioner should actually run. Default: build the stub. Override # to a real image (a pre-built tag) for the advisory lifecycle-only run. -- 2.52.0