From f9b1b3495631542c1f5fae4015afaf4a06b8768d Mon Sep 17 00:00:00 2001
From: rabbitblood
Date: Sun, 26 Apr 2026 08:21:18 -0700
Subject: [PATCH] =?UTF-8?q?fix(e2e):=20bump=20staging=20tenant=20TLS-readi?=
 =?UTF-8?q?ness=20timeout=203min=20=E2=86=92=2010min?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes a 4+ cycle Canvas tabs E2E flake pattern that's been blocking
staging→main PRs since 2026-04-24+ (#2096, #2094, #2055, #2079, ...).

Root cause: TLS_TIMEOUT_MS=180s (3 min) is too tight for the layered
realities of staging tenant TLS readiness:

  1. Cloudflare DNS propagation through the edge (1-2 min typical)
  2. Tenant CF Tunnel registering the new hostname (1-2 min)
  3. CF edge ACME cert provisioning + cache (1-3 min)

Each layer can add 1-3 min on its own under heavy staging load — the
realistic worst case is well past the 3-min cap.

Provision and workspace-online timeouts were already raised to 20 min
(staging-setup.ts:42-46 history). The TLS gate was the remaining
under-budgeted step. Bumping to 10 min keeps it inside the 20-min
PROVISION envelope so a genuinely-stuck tenant still fails loud at the
earlier provision step rather than masquerading as a TLS issue.

Both call sites raised together:
  - canvas/e2e/staging-setup.ts: TLS_TIMEOUT_MS = 10 * 60 * 1000
  - tests/e2e/test_staging_full_saas.sh: TLS_DEADLINE += 600

Each carries an inline rationale comment so the next reviewer sees the
layer-by-layer decomposition without re-reading the issue thread.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 canvas/e2e/staging-setup.ts         | 11 ++++++++++-
 tests/e2e/test_staging_full_saas.sh | 11 +++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/canvas/e2e/staging-setup.ts b/canvas/e2e/staging-setup.ts
index 963f9ccb..122efdf9 100644
--- a/canvas/e2e/staging-setup.ts
+++ b/canvas/e2e/staging-setup.ts
@@ -46,7 +46,16 @@ const TENANT_DOMAIN = process.env.STAGING_TENANT_DOMAIN || "staging.moleculesai.
 // were blocking staging→main syncs on 2026-04-24.
 const PROVISION_TIMEOUT_MS = 20 * 60 * 1000;
 const WORKSPACE_ONLINE_TIMEOUT_MS = 20 * 60 * 1000;
-const TLS_TIMEOUT_MS = 3 * 60 * 1000;
+
+// TLS readiness depends on (1) Cloudflare DNS propagation through the
+// edge, (2) the tenant's CF Tunnel registering the new hostname, (3)
+// CF's edge ACME cert provisioning + cache. Each of these layers can
+// add 1-3 min on its own under heavy staging load. The original 3-min
+// cap blocked four cycles of staging→main PRs across 2026-04-24+.
+// 10 min stays inside the 20-min PROVISION_TIMEOUT envelope (so a
+// genuinely-stuck tenant still fails-loud at the provision step) but
+// absorbs the realistic worst case for a one-shot tenant TLS handshake.
+const TLS_TIMEOUT_MS = 10 * 60 * 1000;
 
 async function jsonFetch(
   url: string,
diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh
index ba0fc7a9..e498ed46 100755
--- a/tests/e2e/test_staging_full_saas.sh
+++ b/tests/e2e/test_staging_full_saas.sh
@@ -195,14 +195,21 @@ TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(js
 ok "Tenant admin token retrieved (len=${#TENANT_TOKEN})"
 
 # ─── 4. Wait for tenant TLS / DNS propagation ──────────────────────────
+# 10 min — same envelope as canvas/e2e/staging-setup.ts TLS_TIMEOUT_MS.
+# CF DNS propagation + tunnel hostname registration + ACME cert + edge
+# cache routinely take 5-7 min under staging load; the original 3-min
+# cap blocked multiple staging→main PRs across 2026-04-24+. Stays
+# inside the parent provision envelope so a genuinely-stuck tenant
+# still fails loud at the earlier provision step rather than masquerading
+# as a TLS issue.
 log "4/11 Waiting for tenant TLS / DNS propagation..."
-TLS_DEADLINE=$(( $(date +%s) + 180 ))
+TLS_DEADLINE=$(( $(date +%s) + 600 ))
 while true; do
   if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then
     break
   fi
   if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then
-    fail "Tenant URL never responded 2xx on /health within 3 min"
+    fail "Tenant URL never responded 2xx on /health within 10 min"
   fi
   sleep 5
 done