From c3ba5df9ff356cb790b65d17e42379229a997f34 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 3 May 2026 13:06:25 -0700 Subject: [PATCH] test(e2e): add canvas-terminal diagnose probe to synth-E2E (catches EIC-chain regressions in <20 min) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Why: the 2026-05-03 SG-missing-port-22 bug was structurally invisible to local-dev — handleLocalConnect uses docker exec; only handleRemoteConnect exercises EIC. The CP provisioner shipped without the EIC ingress rule for ~6 months and nobody noticed until a paying tenant clicked Terminal. Continuous synth-E2E runs every 20 min; adding this probe means the same class of regression (CP provisioner ingress, EIC_ENDPOINT_SG_ID env, handleRemoteConnect chain, SDK source-group support) surfaces within ~20 min of merge instead of waiting for a user report. What: after Step 7 (workspace online), call GET /workspaces/$wid/terminal/diagnose for each workspace. The endpoint already exists in workspace-server (terminal_diagnose.go); it runs the full EIC + ssh chain from inside the tenant (which has AWS creds via its IAM profile) and returns {ok, first_failure, steps[]}. We just need to call it as the tenant — no AWS creds plumbed onto the GHA runner, no port-forwarding from CI. Local-docker workspaces (instance_id NULL) hit diagnoseLocal which probes docker.Ping + container exec; same ok=true contract, so the probe works on both production paths. This is a partial mitigation for task #269 (eliminate handleLocalConnect bypass — local must mimic prod terminal path). The architectural fix (refactor terminal.go so local docker also exercises an EIC-shaped sequence) remains pending; this PR is the "find out issues earlier" half of the user's directive. 
--- tests/e2e/test_staging_full_saas.sh | 36 +++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/e2e/test_staging_full_saas.sh b/tests/e2e/test_staging_full_saas.sh index ce7f1e29..330affcf 100755 --- a/tests/e2e/test_staging_full_saas.sh +++ b/tests/e2e/test_staging_full_saas.sh @@ -429,6 +429,42 @@ for wid in $WS_TO_CHECK; do ok " $wid online" done +# ─── 7b. Canvas-terminal diagnose (EIC chain probe) ──────────────────── +# This step exists because the canvas-terminal failure of 2026-05-03 +# was structurally invisible to local-dev (handleLocalConnect uses +# docker exec; handleRemoteConnect uses EIC + ssh). The CP provisioner +# shipped without the tcp/22 EIC ingress rule for ~6 months and nobody +# noticed until a paying tenant clicked Terminal in canvas. Probing the +# diagnose endpoint here at synth-E2E time means a regression in +# - tenantIngressRules / workspaceIngressRules (CP) +# - eicSSHIngressRule helper (CP) +# - AuthorizeIngress source-group support (CP awsapi) +# - EIC_ENDPOINT_SG_ID Railway env +# - handleRemoteConnect's send-ssh-public-key/open-tunnel/ssh chain +# surfaces within ~20 min of merge instead of waiting for a user report. +# +# The diagnose endpoint runs the full EIC + ssh probe from inside the +# tenant's workspace-server (which already has AWS creds via its IAM +# profile) and reports per-step status. We only need to call it as the +# tenant — no AWS creds needed on the GHA runner. Returns +# {"ok": bool, "first_failure": "name", "steps": [...]}. +# +# Local-docker workspaces (instance_id NULL) get diagnoseLocal which +# probes docker.Ping + container exec; we still expect ok=true there +# since local-docker is the alternative production path. +log "7b/11 Canvas-terminal EIC diagnose probe..." 
+for wid in $WS_TO_CHECK; do
+  DIAG_JSON=$(tenant_call GET "/workspaces/$wid/terminal/diagnose" 2>/dev/null || echo '{}')  # non-zero tenant_call exit degrades to '{}' so it is reported below as a failed probe instead of aborting here
+  DIAG_OK=$(printf '%s' "$DIAG_JSON" | python3 -c "import json,sys; d=json.load(sys.stdin); print('true' if d.get('ok') is True else 'false')" 2>/dev/null || echo "false")  # fail closed: only a literal JSON true passes; unparseable body counts as false
+  if [ "$DIAG_OK" = "true" ]; then
+    ok " $wid terminal-reachable (canvas terminal will work)"
+  else
+    DIAG_FAIL=$(printf '%s' "$DIAG_JSON" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('first_failure') or 'unknown')" 2>/dev/null || echo "unknown")  # 'or' also covers a null/empty first_failure, which .get(key, default) would print as None/blank
+    DIAG_DETAIL=$(printf '%s' "$DIAG_JSON" | python3 -c "import json,sys; d=json.load(sys.stdin); s=[x for x in (d.get('steps') or []) if not x.get('ok')]; print((s[0].get('error') or 'failing step reported no error text') if s else 'no failing step in response (diagnose endpoint unreachable or non-JSON reply?)')" 2>/dev/null || echo "diagnose response was not parseable JSON")  # tolerate steps=null; never leave the detail blank so an unreachable endpoint is distinguishable from an EIC-chain failure
+    fail "Workspace $wid terminal diagnose failed at step '$DIAG_FAIL': $DIAG_DETAIL — check tenant SG has tcp/22 from EIC endpoint SG (sg-0785d5c6138220523), EIC_ENDPOINT_SG_ID set in Railway, and EIC endpoint health"
+  fi
+done
+
 # ─── 8. A2A round-trip on parent ───────────────────────────────────────
 log "8/11 Sending A2A message to parent — expecting agent response..."
 # Smoke prompt phrasing — DO NOT trim back to the bare "Reply with exactly: PONG"