forked from molecule-ai/molecule-core
fix(e2e): bump tenant TLS timeout to 15m + diagnostic burst on failure (#2090)
Canary #2090 has been red for 6 consecutive runs over 4+ hours, all timing out at the TLS-readiness step exactly at the 10-min cap. Time window correlates with three CP commits that landed today/yesterday and changed EC2 boot behaviour:

- molecule-controlplane@a3eb8be — fix(ec2): force fresh clone of /opt/adapter
- molecule-controlplane@ed70405 — feat(sweep): wire up healthcheck loop
- molecule-controlplane@4ab339e — fix(provisioner): aggregate cleanup errors

Two changes here, both surgical:

1. Bump the bash-side TLS deadline from 600s to 900s, and the canvas TS mirror from 10m to 15m. Stays below the 20-min provision envelope (so a genuinely-stuck tenant still fails loud at the earlier provision step instead of masquerading as TLS).
2. On TLS-timeout, dump a diagnostic burst before exiting:
   - getent hosts $TENANT_HOST (DNS resolution state)
   - curl -kv $TENANT_URL/health (TLS handshake + HTTP layer)

The previous failure log was just "no 2xx in N min" with no signal for which layer was actually broken. After this, the next timeout tells us whether DNS, TLS handshake, or HTTP layer is the culprit so the CP root cause can be isolated without speculation.

This is the unblock; a separate molecule-controlplane issue tracks the suspected underlying regression.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
9b26144386
commit
af89d3fcbd
@ -50,12 +50,13 @@ const WORKSPACE_ONLINE_TIMEOUT_MS = 20 * 60 * 1000;
|
||||
// TLS readiness depends on (1) Cloudflare DNS propagation through the
|
||||
// edge, (2) the tenant's CF Tunnel registering the new hostname, (3)
|
||||
// CF's edge ACME cert provisioning + cache. Each of these layers can
|
||||
// add 1-3 min on its own under heavy staging load. The original 3-min
|
||||
// cap blocked four cycles of staging→main PRs across 2026-04-24+.
|
||||
// 10 min stays inside the 20-min PROVISION_TIMEOUT envelope (so a
|
||||
// genuinely-stuck tenant still fails-loud at the provision step) but
|
||||
// absorbs the realistic worst case for a one-shot tenant TLS handshake.
|
||||
const TLS_TIMEOUT_MS = 10 * 60 * 1000;
|
||||
// add 1-3 min on its own under heavy staging load. Bumped from 10 to
|
||||
// 15 min after #2090 (6 consecutive canary failures starting 2026-04-26
|
||||
// correlated with CP commits a3eb8be / ed70405 / 4ab339e). Stays below
|
||||
// the 20-min PROVISION_TIMEOUT envelope so a genuinely-stuck tenant
|
||||
// still fails-loud at the provision step rather than masquerading as
|
||||
// a TLS issue. Kept aligned with tests/e2e/test_staging_full_saas.sh.
|
||||
const TLS_TIMEOUT_MS = 15 * 60 * 1000;
|
||||
|
||||
async function jsonFetch(
|
||||
url: string,
|
||||
|
||||
@ -195,21 +195,34 @@ TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(js
|
||||
ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})"
|
||||
|
||||
# ─── 4. Wait for tenant TLS / DNS propagation ──────────────────────────
|
||||
# 10 min — same envelope as canvas/e2e/staging-setup.ts TLS_TIMEOUT_MS.
|
||||
# CF DNS propagation + tunnel hostname registration + ACME cert + edge
|
||||
# cache routinely take 5-7 min under staging load; the original 3-min
|
||||
# cap blocked multiple staging→main PRs across 2026-04-24+. Stays
|
||||
# inside the parent provision envelope so a genuinely-stuck tenant
|
||||
# still fails loud at the earlier provision step rather than masquerading
|
||||
# as a TLS issue.
|
||||
# 15 min — same cap as canvas/e2e/staging-setup.ts TLS_TIMEOUT_MS. Kept
|
||||
# below the 20-min provision envelope so a genuinely-stuck tenant still
|
||||
# fails loud at the earlier provision step rather than masquerading as
|
||||
# a TLS issue. CF DNS propagation + tunnel hostname
|
||||
# registration + ACME cert + edge cache run 5-7 min on a healthy day; the
|
||||
# +5 min headroom over the previous 10-min cap covers the slower path
|
||||
# observed in #2090 (6 consecutive canary failures starting 2026-04-26
|
||||
# correlated with CP commits a3eb8be / ed70405 / 4ab339e).
|
||||
#
|
||||
# On timeout we print a diagnostic burst — a fresh DNS lookup plus the
|
||||
# tail of a verbose curl (TLS handshake + response headers) — so the next
|
||||
# failure tells us which layer (DNS, TLS, HTTP) is actually broken. Without
|
||||
# this the only signal is "no 2xx in N min", which sent us in circles.
|
||||
log "4/11 Waiting for tenant TLS / DNS propagation..."
|
||||
TLS_DEADLINE=$(( $(date +%s) + 600 ))
|
||||
TLS_DEADLINE=$(( $(date +%s) + 900 ))
|
||||
TENANT_HOST="${TENANT_URL#https://}"
|
||||
TENANT_HOST="${TENANT_HOST%%/*}"
|
||||
while true; do
|
||||
if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then
|
||||
break
|
||||
fi
|
||||
if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then
|
||||
fail "Tenant URL never responded 2xx on /health within 10 min"
|
||||
log "── DIAGNOSTIC BURST (TLS-readiness timeout) ──"
|
||||
log "DNS lookup ($TENANT_HOST):"
|
||||
getent hosts "$TENANT_HOST" 2>&1 || log " (no DNS resolution)"
|
||||
log "curl -v $TENANT_URL/health (last 40 lines):"
|
||||
curl -kv --max-time 10 "$TENANT_URL/health" 2>&1 | tail -n 40 | sed 's/^/ /' || true
|
||||
log "── END DIAGNOSTIC ──"
|
||||
fail "Tenant URL never responded 2xx on /health within 15 min"
|
||||
fi
|
||||
sleep 5
|
||||
done
|
||||
|
||||
Loading…
Reference in New Issue
Block a user