chore(simplify): trim SHA-rot comments + harden TENANT_HOST scheme/port stripping

Simplify pass on top of the canary fix:

- Drop the three CP commit SHAs from comments — issue #2090 covers
  the audit trail, SHAs would rot.
- Pull the inline `900` into TLS_TIMEOUT_SEC=$((15 * 60)) so the
  bash mirrors the TS side (15 min) at a glance.
- TENANT_HOST extraction now strips http(s) AND any port suffix, so
  getent doesn't silently fail on an https://host:443 style URL.
- sed-redact Authorization/Cookie out of the curl -v dump, defensive
  against future callers adding an auth header to this probe.

Pure cleanup; no behaviour change to the happy path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
rabbitblood 2026-04-26 11:44:54 -07:00
parent af89d3fcbd
commit b87befdabe
2 changed files with 22 additions and 21 deletions

View File

@ -50,12 +50,12 @@ const WORKSPACE_ONLINE_TIMEOUT_MS = 20 * 60 * 1000;
// TLS readiness depends on (1) Cloudflare DNS propagation through the
// edge, (2) the tenant's CF Tunnel registering the new hostname, (3)
// CF's edge ACME cert provisioning + cache. Each of these layers can
// add 1-3 min on its own under heavy staging load. Bumped from 10 to
// 15 min after #2090 (6 consecutive canary failures starting 2026-04-26
// correlated with CP commits a3eb8be / ed70405 / 4ab339e). Stays below
// the 20-min PROVISION_TIMEOUT envelope so a genuinely-stuck tenant
// still fails-loud at the provision step rather than masquerading as
// a TLS issue. Kept aligned with tests/e2e/test_staging_full_saas.sh.
// add 1-3 min on its own under heavy staging load. Bumped 10→15 min
// after a burst of canary failures correlated with CP changes (#2090).
// Stays below the 20-min PROVISION_TIMEOUT envelope so a genuinely-
// stuck tenant fails-loud at the provision step rather than
// masquerading as a TLS issue. Kept aligned with
// tests/e2e/test_staging_full_saas.sh.
const TLS_TIMEOUT_MS = 15 * 60 * 1000;
async function jsonFetch(

View File

@ -195,22 +195,21 @@ TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(js
ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})"
# ─── 4. Wait for tenant TLS / DNS propagation ──────────────────────────
# 15 min — kept below the 20-min provision envelope so a genuinely-stuck
# tenant still fails loud at the earlier provision step rather than
# masquerading as a TLS issue. CF DNS propagation + tunnel hostname
# registration + ACME cert + edge cache run 5-7 min on a healthy day; the
# +5 min headroom over the previous 10-min cap covers the slower path
# observed in #2090 (6 consecutive canary failures starting 2026-04-26
# correlated with CP commits a3eb8be / ed70405 / 4ab339e).
# Kept below the 20-min provision envelope so a genuinely-stuck tenant
# still fails loud at the earlier provision step rather than masquerading
# as a TLS issue. CF DNS propagation + tunnel hostname registration +
# ACME cert + edge cache run 5-7 min on a healthy day; +5 min headroom
# over the previous 10-min cap covers the slower path observed in #2090.
#
# On timeout we print a diagnostic burst — last DNS lookup, last curl
# verbose handshake, last response headers — so the next failure tells
# us which layer (DNS, TLS, HTTP) is actually broken. Without this the
# only signal is "no 2xx in N min" which sent us in circles.
# On timeout, dump DNS + curl -v + headers so the next failure identifies
# the broken layer (DNS / TLS / HTTP). Authorization and Cookie headers
# are redacted defensively in case a future caller adds auth to this probe.
log "4/11 Waiting for tenant TLS / DNS propagation..."
TLS_DEADLINE=$(( $(date +%s) + 900 ))
TENANT_HOST="${TENANT_URL#https://}"
TLS_TIMEOUT_SEC=$((15 * 60))
TLS_DEADLINE=$(( $(date +%s) + TLS_TIMEOUT_SEC ))
TENANT_HOST="${TENANT_URL#http*://}"
TENANT_HOST="${TENANT_HOST%%/*}"
TENANT_HOST="${TENANT_HOST%%:*}"
while true; do
if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then
break
@ -220,9 +219,11 @@ while true; do
log "DNS lookup ($TENANT_HOST):"
getent hosts "$TENANT_HOST" 2>&1 || log " (no DNS resolution)"
log "curl -v $TENANT_URL/health (last 40 lines):"
curl -kv --max-time 10 "$TENANT_URL/health" 2>&1 | tail -n 40 | sed 's/^/ /' || true
curl -kv --max-time 10 "$TENANT_URL/health" 2>&1 \
| sed -E 's/(Authorization|Cookie):.*/\1: [redacted]/i' \
| tail -n 40 | sed 's/^/ /' || true
log "── END DIAGNOSTIC ──"
fail "Tenant URL never responded 2xx on /health within 15 min"
fail "Tenant URL never responded 2xx on /health within ${TLS_TIMEOUT_SEC}s"
fi
sleep 5
done