chore(simplify): trim SHA-rot comments + harden TENANT_HOST scheme/port stripping
Simplify pass on top of the canary fix: - Drop the three CP commit SHAs from comments — issue #2090 covers the audit trail, and SHAs would rot. - Pull the inline `900` into TLS_TIMEOUT_SEC=$((15 * 60)) so the bash mirrors the TS side (15 min) at a glance. - TENANT_HOST extraction now strips an http(s):// scheme AND any port suffix, so getent doesn't silently fail on an https://host:443 style URL. (Note: the `http*://` pattern only matches schemes beginning with "http"; a ws://host:443 URL is NOT stripped and would need `#*://` instead.) - sed-redact Authorization/Cookie out of the curl -v dump, as a defence against future callers adding an auth header to this probe. Pure cleanup; no behaviour change to the happy path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
af89d3fcbd
commit
b87befdabe
@ -50,12 +50,12 @@ const WORKSPACE_ONLINE_TIMEOUT_MS = 20 * 60 * 1000;
|
||||
// TLS readiness depends on (1) Cloudflare DNS propagation through the
|
||||
// edge, (2) the tenant's CF Tunnel registering the new hostname, (3)
|
||||
// CF's edge ACME cert provisioning + cache. Each of these layers can
|
||||
// add 1-3 min on its own under heavy staging load. Bumped from 10 to
|
||||
// 15 min after #2090 (6 consecutive canary failures starting 2026-04-26
|
||||
// correlated with CP commits a3eb8be / ed70405 / 4ab339e). Stays below
|
||||
// the 20-min PROVISION_TIMEOUT envelope so a genuinely-stuck tenant
|
||||
// still fails-loud at the provision step rather than masquerading as
|
||||
// a TLS issue. Kept aligned with tests/e2e/test_staging_full_saas.sh.
|
||||
// add 1-3 min on its own under heavy staging load. Bumped 10→15 min
|
||||
// after a burst of canary failures correlated with CP changes (#2090).
|
||||
// Stays below the 20-min PROVISION_TIMEOUT envelope so a genuinely-
|
||||
// stuck tenant fails-loud at the provision step rather than
|
||||
// masquerading as a TLS issue. Kept aligned with
|
||||
// tests/e2e/test_staging_full_saas.sh.
|
||||
const TLS_TIMEOUT_MS = 15 * 60 * 1000;
|
||||
|
||||
async function jsonFetch(
|
||||
|
||||
@ -195,22 +195,21 @@ TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(js
|
||||
ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})"
|
||||
|
||||
# ─── 4. Wait for tenant TLS / DNS propagation ──────────────────────────
|
||||
# 15 min — kept below the 20-min provision envelope so a genuinely-stuck
|
||||
# tenant still fails loud at the earlier provision step rather than
|
||||
# masquerading as a TLS issue. CF DNS propagation + tunnel hostname
|
||||
# registration + ACME cert + edge cache run 5-7 min on a healthy day; the
|
||||
# +5 min headroom over the previous 10-min cap covers the slower path
|
||||
# observed in #2090 (6 consecutive canary failures starting 2026-04-26
|
||||
# correlated with CP commits a3eb8be / ed70405 / 4ab339e).
|
||||
# Kept below the 20-min provision envelope so a genuinely-stuck tenant
|
||||
# still fails loud at the earlier provision step rather than masquerading
|
||||
# as a TLS issue. CF DNS propagation + tunnel hostname registration +
|
||||
# ACME cert + edge cache run 5-7 min on a healthy day; +5 min headroom
|
||||
# over the previous 10-min cap covers the slower path observed in #2090.
|
||||
#
|
||||
# On timeout we print a diagnostic burst — last DNS lookup, last curl
|
||||
# verbose handshake, last response headers — so the next failure tells
|
||||
# us which layer (DNS, TLS, HTTP) is actually broken. Without this the
|
||||
# only signal is "no 2xx in N min" which sent us in circles.
|
||||
# On timeout, dump DNS + curl -v + headers so the next failure identifies
|
||||
# the broken layer (DNS / TLS / HTTP). Authorization and Cookie are redacted
|
||||
# defensively in case a future caller adds an auth header to this probe.
|
||||
log "4/11 Waiting for tenant TLS / DNS propagation..."
|
||||
TLS_DEADLINE=$(( $(date +%s) + 900 ))
|
||||
TENANT_HOST="${TENANT_URL#https://}"
|
||||
TLS_TIMEOUT_SEC=$((15 * 60))
|
||||
TLS_DEADLINE=$(( $(date +%s) + TLS_TIMEOUT_SEC ))
|
||||
TENANT_HOST="${TENANT_URL#http*://}"
|
||||
TENANT_HOST="${TENANT_HOST%%/*}"
|
||||
TENANT_HOST="${TENANT_HOST%%:*}"
|
||||
while true; do
|
||||
if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then
|
||||
break
|
||||
@ -220,9 +219,11 @@ while true; do
|
||||
log "DNS lookup ($TENANT_HOST):"
|
||||
getent hosts "$TENANT_HOST" 2>&1 || log " (no DNS resolution)"
|
||||
log "curl -v $TENANT_URL/health (last 40 lines):"
|
||||
curl -kv --max-time 10 "$TENANT_URL/health" 2>&1 | tail -n 40 | sed 's/^/ /' || true
|
||||
curl -kv --max-time 10 "$TENANT_URL/health" 2>&1 \
|
||||
| sed -E 's/(Authorization|Cookie):.*/\1: [redacted]/i' \
|
||||
| tail -n 40 | sed 's/^/ /' || true
|
||||
log "── END DIAGNOSTIC ──"
|
||||
fail "Tenant URL never responded 2xx on /health within 15 min"
|
||||
fail "Tenant URL never responded 2xx on /health within ${TLS_TIMEOUT_SEC}s"
|
||||
fi
|
||||
sleep 5
|
||||
done
|
||||
|
||||
Loading…
Reference in New Issue
Block a user