From 5f99c29de3bd3cdb61420dc39531883d1822d273 Mon Sep 17 00:00:00 2001 From: core-devops Date: Thu, 4 Jun 2026 20:26:46 -0700 Subject: [PATCH] =?UTF-8?q?fix(e2e):=20reconciler=20e2e=20=E2=80=94=20fall?= =?UTF-8?q?=20back=20to=20AWS=20workspace=20tag=20when=20API=20omits=20ins?= =?UTF-8?q?tance=5Fid=20(core#2261=20review)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The online-wait loop only exited when status=online AND the tenant API surfaced instance_id — but staging never surfaces it (observed: the DB has it, the API response omits it). So the loop spun to the 900s deadline and failed with a misleading "never reached online", and the slug-tag fallback below was dead code (only reachable when instance_id was empty AFTER the loop, which never happened). Fix: once online, grace-wait (45s) for the API instance_id, then fall back to the AWS workspace-instance tag (ws-tenant-*) — the same approach the live proof used. The reconciler reads instance_id from the DB and acts on the real EC2 regardless of what the API surfaces, so the AWS-tag instance is the correct kill target. Makes the e2e actually able to reach the kill + reconciler-flip steps. core#2261 Co-Authored-By: Claude Opus 4.8 (1M context) --- ...st_reconciler_heals_terminated_instance.sh | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tests/e2e/test_reconciler_heals_terminated_instance.sh b/tests/e2e/test_reconciler_heals_terminated_instance.sh index 7b6850ac7..b1c791299 100755 --- a/tests/e2e/test_reconciler_heals_terminated_instance.sh +++ b/tests/e2e/test_reconciler_heals_terminated_instance.sh @@ -382,6 +382,10 @@ log " WS_ID=$WS_ID" log " Waiting for workspace to reach status=online (up to $((WORKSPACE_ONLINE_TIMEOUT_SECS/60)) min)..." 
ONLINE_DEADLINE=$(( $(date +%s) + WORKSPACE_ONLINE_TIMEOUT_SECS )) ORIGINAL_INSTANCE_ID="" +ONLINE_SINCE="" +# Grace before falling back to the AWS workspace tag when the tenant API +# does not surface instance_id (observed on staging). +INSTANCE_ID_GRACE_SECS="${E2E_INSTANCE_ID_GRACE_SECS:-45}" WS_LAST_STATUS="" while true; do if [ "$(date +%s)" -gt "$ONLINE_DEADLINE" ]; then @@ -394,11 +398,27 @@ while true; do WS_LAST_STATUS="$WS_STATUS" fi if [ "$WS_STATUS" = "online" ]; then + [ -z "$ONLINE_SINCE" ] && ONLINE_SINCE=$(date +%s) ORIGINAL_INSTANCE_ID=$(ws_field "$WS_ID" "instance_id") if [ -n "$ORIGINAL_INSTANCE_ID" ]; then break fi - # online but instance_id not surfaced yet — keep polling briefly. + # The workspace is online but the tenant API does not surface instance_id + # (observed on staging — the DB has it, the API response omits it). After a + # short grace, fall back to the AWS workspace-instance tag so the kill step + # can proceed. The reconciler reads instance_id from the DB and acts on the + # real EC2 regardless of what the API surfaces, so the AWS-tag instance is + # the correct kill target. Without this fallback the loop spins to the online + # deadline and fails with a misleading "never reached online". + if [ $(( $(date +%s) - ONLINE_SINCE )) -ge "$INSTANCE_ID_GRACE_SECS" ]; then + # ws-tenant-- is the workspace EC2 (vs tenant-). + ORIGINAL_INSTANCE_ID=$(e2e_ec2_instances_for_slug "$SLUG" 2>/dev/null \ + | awk '$2 ~ /^ws-tenant-/ {print $1}' | sort -u | head -1) + if [ -n "$ORIGINAL_INSTANCE_ID" ]; then + log " instance_id not surfaced by API after ${INSTANCE_ID_GRACE_SECS}s — using AWS workspace tag: $ORIGINAL_INSTANCE_ID" + break + fi + fi log " $WS_ID online but instance_id not populated yet — waiting" fi # 'failed' is transient on cold boot (bootstrap-watcher deadline vs heartbeat -- 2.52.0