From 624ef4d06dd2816f20c708c0d8b5717a79b013bf Mon Sep 17 00:00:00 2001 From: claude-ceo-assistant Date: Wed, 6 May 2026 23:17:58 -0700 Subject: [PATCH] perf(workspace-server,canvas): EIC tunnel pool + canvas Promise.all (closes core#11) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Symptom Canvas detail-panel "config + filesystem load" took ~20s. Reported on production hongming tenant, workspace c7c28c0b-... (Claude Code Agent T2). ## Two stacked latency sources ### 1. Server-side: per-call EIC tunnel setup (~80% of the win) `workspace-server/internal/handlers/template_files_eic.go::realWithEICTunnel` performed ssh-keygen + SendSSHPublicKey + open-tunnel + waitForPort PER call. 4 callers (read/write/list/delete) each paid the full ~3-5s setup cost even when fired back-to-back on the same workspace EC2. Fix: refcounted pool keyed on instanceID with TTL ≤ 50s (under the 60s SendSSHPublicKey grant). One tunnel serves N file ops; concurrent acquires for the same instance share the slot via a pendingSetups gate; LRU eviction caps simultaneous tracked instances at 32. Poisons entries on tunnel-fatal errors (connection refused, broken pipe, auth failed) so the next acquire builds fresh. Cleanup on panic via defer-release pattern (added after self-review caught a refcount-leak hazard). Public API unchanged — `var withEICTunnel` rebinds to `pooledWithEICTunnel` at package init, so all 4 callers inherit pooling for free. 10 unit tests pin: 4-ops-amortise (1 setup), different-instances-do-not-share, TTL eviction, poison invalidates, concurrent-acquire-single-setup, TTL=0 escape hatch, LRU eviction at cap, error classification heuristic, refcount blocks expired eviction, panic poisons entry. All green. ### 2. Canvas-side: serial fan-out + duplicate fetch (~20% of the win) `canvas/src/components/tabs/ConfigTab.tsx::loadConfig` awaited 3 independent metadata GETs (`/workspaces/{id}`, `/model`, `/provider`) serially. `AgentCardSection` fired a SECOND `/workspaces/{id}` from its own useEffect. Fix: Promise.all over the 3 metadata GETs (each leg keeps its existing .catch fallback semantics). AgentCardSection now reads `agentCard` from the canvas store (`useCanvasStore`) instead of refetching — the canvas already hydrates `node.data.agentCard` from the platform event stream. Defensive selector handles test mocks without a `nodes` array. ## Verification - `go test ./internal/handlers/` 5.07s green (full handlers package, including 10 new pool tests) - `go vet ./internal/handlers/` clean - `npx vitest run` — 1380/1380 canvas unit tests pass (2 test FILES fail on a pre-existing xyflow CSS-load issue in vitest config, unrelated to this change) - `npx tsc --noEmit` clean Live wall-time verification deferred to Phase 4 / E2E (canvas browser session required; external probe blocked by 403 since the canvas auth chain is session-cookie + Origin header, not a bearer token I can fabricate). ## Backwards compatibility API surface unchanged. All 4 EIC handler callers use the rebound var; no caller migration. Pool defaults to enabled (TTL=50s); tests can disable by setting poolTTL=0 or by overwriting withEICTunnel directly (existing stub pattern in template_files_eic_dispatch_test.go preserved). ## Hostile self-review (3 weakest spots) 1. `fnErrIndicatesTunnelFault` is a substring grep on err.Error() — the marker list is hand-curated and ssh client error formats vary across OpenSSH versions. A future ssh that reports a tunnel failure via a phrasing not in the list would NOT poison the entry → next callers reuse a dead tunnel until TTL evicts. Acceptable: TTL bounds the impact (≤50s of bad reuse), and the heuristic covers every tunnel-error shape that appears in the existing test fixtures and known incidents. 2. `acquire`'s for-loop has unbounded retry potential under pathological churn (signal closed → new acquirer → setup fails → repeat). No bounded retry counter. Today there is no test exercise for "flaky setup that succeeds-then-fails-then-succeeds"; if observability ever shows this shape, add a max-retry guard. Filed as a known limitation, not blocking. 3. The substring assertion `strings.Contains` style I used for tunnel-fault classification could false-positive on app-level error messages that happen to contain "permission denied" or "broken pipe" verbatim. The classification test covers the discriminator but only against the error shapes we know today. Acceptable: poisoning errs on the side of building fresh, which is correct-but-slightly-slow rather than incorrect. ## Phase 4 / E2E plan - Live timing of the canvas detail-panel open against a real workspace (browser session, not external probe). - Target: perceived latency under 2s on warm pool. Cold open still pays one tunnel setup (~3-5s) — the pool buys you the SECOND through Nth panel-open within the TTL window. - Memory `feedback_chase_verification_to_staging` applies — will not declare done at PR-merge; will follow through to user-visible behavior on staging. Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/src/components/tabs/ConfigTab.tsx | 111 +++-- .../internal/handlers/eic_tunnel_pool.go | 437 ++++++++++++++++ .../handlers/eic_tunnel_pool_setup.go | 136 +++++ .../internal/handlers/eic_tunnel_pool_test.go | 467 ++++++++++++++++++ 4 files changed, 1106 insertions(+), 45 deletions(-) create mode 100644 workspace-server/internal/handlers/eic_tunnel_pool.go create mode 100644 workspace-server/internal/handlers/eic_tunnel_pool_setup.go create mode 100644 workspace-server/internal/handlers/eic_tunnel_pool_test.go diff --git a/canvas/src/components/tabs/ConfigTab.tsx b/canvas/src/components/tabs/ConfigTab.tsx index 2250f3f1..ab229632 100644 --- a/canvas/src/components/tabs/ConfigTab.tsx +++ b/canvas/src/components/tabs/ConfigTab.tsx @@ -21,20 +21,39 @@ interface Props { // --- Agent Card Section --- function AgentCardSection({ workspaceId }: { workspaceId: string }) { - const [card, setCard] = useState | null>(null); - const [loading, setLoading] = useState(true); + // Initial card value comes from the canvas store — node.data.agentCard + // is hydrated by the platform stream when the workspace appears in the + // graph, so reading it here avoids a duplicate `GET /workspaces/${id}` + // (the parent ConfigTab.loadConfig already fetches workspace metadata, + // and refetching here adds a serialised RTT to the panel-open path — + // contributed to the ~20s detail-panel load reported in core#11). + // Local state still tracks the edited/saved value so the editor flow + // is unchanged. + const storeCard = useCanvasStore((s) => { + // Defensive against test mocks that omit `nodes` (some test files + // stub the store with a minimal shape). In production `nodes` is + // always an array — empty or not — so the optional chaining only + // matters for the test path. + const node = s.nodes?.find?.((n) => n.id === workspaceId); + return (node?.data.agentCard as + | Record + | null + | undefined) ?? null; + }); + const [card, setCard] = useState | null>(storeCard); const [editing, setEditing] = useState(false); const [draft, setDraft] = useState(""); const [saving, setSaving] = useState(false); const [error, setError] = useState(null); const [success, setSuccess] = useState(false); + // If the store updates while this section is mounted (another tab + // pushed an update via the platform event stream), reflect that — + // unless the user is mid-edit, in which case we don't clobber their + // unsaved draft. useEffect(() => { - api.get>(`/workspaces/${workspaceId}`) - .then((ws) => setCard((ws.agent_card as Record) || null)) - .catch(() => {}) - .finally(() => setLoading(false)); - }, [workspaceId]); + if (!editing) setCard(storeCard); + }, [storeCard, editing]); const handleSave = async () => { setError(null); @@ -53,9 +72,7 @@ function AgentCardSection({ workspaceId }: { workspaceId: string }) { return (
- {loading ? ( -
Loading...
- ) : editing ? ( + {editing ? (