From b87befdabe5ec073e00194e0b6d2ea25c6f49522 Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Sun, 26 Apr 2026 11:44:54 -0700 Subject: [PATCH] chore(simplify): trim SHA-rot comments + harden TENANT_HOST scheme/port stripping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify pass on top of the canary fix: - Drop the three CP commit SHAs from comments — issue #2090 covers the audit trail, SHAs would rot. - Pull the inline `900` into TLS_TIMEOUT_SEC=$((15 * 60)) so the bash mirrors the TS side (15 min) at a glance. - TENANT_HOST extraction now strips the http(s) scheme AND any port suffix, so getent doesn't silently fail on an https://host:443 style URL. - sed-redact Authorization/Cookie out of the curl -v dump, defensive against future callers adding an auth header to this probe. Pure cleanup; no behaviour change to the happy path. Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/e2e/staging-setup.ts | 12 +++++------ tests/e2e/test_staging_full_saas.sh | 31 +++++++++++++++-------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/canvas/e2e/staging-setup.ts b/canvas/e2e/staging-setup.ts index 89cbf61d..5fc39225 100644 --- a/canvas/e2e/staging-setup.ts +++ b/canvas/e2e/staging-setup.ts @@ -50,12 +50,12 @@ const WORKSPACE_ONLINE_TIMEOUT_MS = 20 * 60 * 1000; // TLS readiness depends on (1) Cloudflare DNS propagation through the // edge, (2) the tenant's CF Tunnel registering the new hostname, (3) // CF's edge ACME cert provisioning + cache. Each of these layers can -// add 1-3 min on its own under heavy staging load. Bumped from 10 to -// 15 min after #2090 (6 consecutive canary failures starting 2026-04-26 -// correlated with CP commits a3eb8be / ed70405 / 4ab339e). Stays below -// the 20-min PROVISION_TIMEOUT envelope so a genuinely-stuck tenant -// still fails-loud at the provision step rather than masquerading as -// a TLS issue. Kept aligned with tests/e2e/test_staging_full_saas.sh. 
+// add 1-3 min on its own under heavy staging load. Bumped 10→15 min +// after a burst of canary failures correlated with CP changes (#2090). +// Stays below the 20-min PROVISION_TIMEOUT envelope so a genuinely- +// stuck tenant fails-loud at the provision step rather than +// masquerading as a TLS issue. Kept aligned with +// tests/e2e/test_staging_full_saas.sh. const TLS_TIMEOUT_MS = 15 * 60 * 1000; async function jsonFetch( diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh index 25e08f81..e9d9da5c 100755 --- a/tests/e2e/test_staging_full_saas.sh +++ b/tests/e2e/test_staging_full_saas.sh @@ -195,22 +195,21 @@ TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(js ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})" # ─── 4. Wait for tenant TLS / DNS propagation ────────────────────────── -# 15 min — kept below the 20-min provision envelope so a genuinely-stuck -# tenant still fails loud at the earlier provision step rather than -# masquerading as a TLS issue. CF DNS propagation + tunnel hostname -# registration + ACME cert + edge cache run 5-7 min on a healthy day; the -# +5 min headroom over the previous 10-min cap covers the slower path -# observed in #2090 (6 consecutive canary failures starting 2026-04-26 -# correlated with CP commits a3eb8be / ed70405 / 4ab339e). +# Kept below the 20-min provision envelope so a genuinely-stuck tenant +# still fails loud at the earlier provision step rather than masquerading +# as a TLS issue. CF DNS propagation + tunnel hostname registration + +# ACME cert + edge cache run 5-7 min on a healthy day; +5 min headroom +# over the previous 10-min cap covers the slower path observed in #2090. # -# On timeout we print a diagnostic burst — last DNS lookup, last curl -# verbose handshake, last response headers — so the next failure tells -# us which layer (DNS, TLS, HTTP) is actually broken. 
Without this the -# only signal is "no 2xx in N min" which sent us in circles. +# On timeout, dump DNS + curl -v + headers so the next failure identifies +# the broken layer (DNS / TLS / HTTP). Authorization and Cookie are +# redacted defensively in case a future caller adds auth to this probe. log "4/11 Waiting for tenant TLS / DNS propagation..." -TLS_DEADLINE=$(( $(date +%s) + 900 )) -TENANT_HOST="${TENANT_URL#https://}" +TLS_TIMEOUT_SEC=$((15 * 60)) +TLS_DEADLINE=$(( $(date +%s) + TLS_TIMEOUT_SEC )) +TENANT_HOST="${TENANT_URL#http*://}" TENANT_HOST="${TENANT_HOST%%/*}" +TENANT_HOST="${TENANT_HOST%%:*}" while true; do if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then break @@ -220,9 +219,11 @@ while true; do log "DNS lookup ($TENANT_HOST):" getent hosts "$TENANT_HOST" 2>&1 || log " (no DNS resolution)" log "curl -v $TENANT_URL/health (last 40 lines):" - curl -kv --max-time 10 "$TENANT_URL/health" 2>&1 | tail -n 40 | sed 's/^/ /' || true + curl -kv --max-time 10 "$TENANT_URL/health" 2>&1 \ + | sed -E 's/(Authorization|Cookie):.*/\1: [redacted]/i' \ + | tail -n 40 | sed 's/^/ /' || true log "── END DIAGNOSTIC ──" - fail "Tenant URL never responded 2xx on /health within 15 min" + fail "Tenant URL never responded 2xx on /health within ${TLS_TIMEOUT_SEC}s" fi sleep 5 done