From f30b3d44764f939a699439e11bbc34729eb5a25f Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 20:16:41 -0700 Subject: [PATCH] fix(terminal): cap ssh handshake at 10s so hung sshd surfaces fast MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the workspace EC2's sshd is unresponsive (mid-restart, SG drop, AMI without ec2-instance-connect), the canvas's xterm shows the user's typed bytes echoed back by the workspace-server's *local* PTY (cooked + echo mode before ssh sets it raw post-handshake) and then closes silently when Cloudflare's idle WebSocket timer fires (~100s) — with no "Connection refused" or "Permission denied" output ever reaching the user. This is what hongmingwang's hermes terminal looked like on 2026-04-30 right after the heartbeat-fix redeploy: status="online" but the shell appeared dead. Caught reproducibly by holding a fresh /workspaces/<id>/terminal WebSocket open for 60s — server sent zero frames except the local-PTY echo of one keystroke typed at t=8s. ssh was hung at handshake; bash never saw the byte. Fix: add `-o ConnectTimeout=10` to ssh args. Now the failure surfaces as a real ssh error message in the terminal within 10s, instead of masquerading as a silently dead shell over the next ~100s. Doesn't diagnose *why* sshd isn't responding (separate investigation), but it does mean the user gets actionable feedback within seconds. Behavior-based regression test asserts `-o ConnectTimeout=N` is in the ssh argv — pins presence, not the literal value, so operators can tune without breaking the gate. Verified to FAIL on pre-fix code (matched the literal arg pair) and PASS on fix. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/terminal.go | 14 +++++ .../internal/handlers/terminal_test.go | 54 +++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/workspace-server/internal/handlers/terminal.go b/workspace-server/internal/handlers/terminal.go index 62fe74b4..434ae1f0 100644 --- a/workspace-server/internal/handlers/terminal.go +++ b/workspace-server/internal/handlers/terminal.go @@ -277,12 +277,26 @@ var openTunnelCmd = func(o eicSSHOptions) *exec.Cmd { // to 22; with CP provisioning today the workspace runs as a native // process under the ubuntu user, so landing at ubuntu's shell IS the // terminal experience. +// +// ConnectTimeout=10 is the user-experience guard — without it, ssh waits +// indefinitely for the remote sshd's banner. When the workspace EC2's +// sshd is unresponsive (mid-restart, SG drop, AMI without ec2-instance- +// connect installed) the canvas's xterm shows the user's typed bytes +// echoed back by the workspace-server's *local* PTY (cooked + echo mode +// before ssh finishes its handshake) and then closes silently when CF's +// idle WebSocket timer fires, with no "Connection refused" or "Permission +// denied" output ever reaching the user. Capping at 10s makes the failure +// surface as a real ssh error message in the terminal — caught 2026-04-30 +// when hongmingwang's hermes shell hung after the heartbeat-fix redeploy +// and a probe at /workspaces/<id>/terminal sat for 60s with the only +// frame being the local-PTY echo of a single 'X' typed mid-handshake. 
var sshCommandCmd = func(o eicSSHOptions) *exec.Cmd { return exec.Command( "ssh", "-i", o.PrivateKeyPath, "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null", + "-o", "ConnectTimeout=10", "-o", "ServerAliveInterval=30", "-o", "ServerAliveCountMax=3", "-p", fmt.Sprintf("%d", o.LocalPort), diff --git a/workspace-server/internal/handlers/terminal_test.go b/workspace-server/internal/handlers/terminal_test.go index 4a3f29fd..040196cb 100644 --- a/workspace-server/internal/handlers/terminal_test.go +++ b/workspace-server/internal/handlers/terminal_test.go @@ -490,3 +490,57 @@ func TestKI005_OrgToken_SkipsValidateToken(t *testing.T) { } } +// TestSSHCommandCmd_ConnectTimeoutPresent pins the user-experience guard +// against ssh-handshake-hang. Without ConnectTimeout, ssh waits forever +// for the remote sshd's banner — which masquerades as a "silently dead" +// shell to the user, because the workspace-server's local PTY is in +// cooked + echo mode before ssh finishes its handshake, so the canvas +// echoes the user's keystrokes back without ever reaching remote bash, +// and Cloudflare eventually closes the WebSocket on idle (~100s) with +// no error frame to surface what went wrong. +// +// Repro 2026-04-30: a 60s probe at hongmingwang's hermes /terminal +// endpoint after the heartbeat-fix redeploy showed only the local-PTY +// echo of a single 'X' typed mid-handshake. Workspace EC2 was up and +// heartbeating but its sshd was unresponsive; ssh hung indefinitely. +// +// Behavior-based: matches the literal `-o ConnectTimeout=N` arg pair so +// this stays pinned even if the rest of the args reorder. Does not pin +// the exact value — operators may tune it — but does pin presence. 
+func TestSSHCommandCmd_ConnectTimeoutPresent(t *testing.T) {
+	t.Parallel()
+
+	// Only the argv is inspected; the command is never started.
+	cmd := sshCommandCmd(eicSSHOptions{
+		InstanceID:     "i-test",
+		OSUser:         "ubuntu",
+		Region:         "us-east-2",
+		LocalPort:      2222,
+		PrivateKeyPath: "/tmp/test-key",
+	})
+
+	// Find the first `-o ConnectTimeout=<value>` pair; stopping at len-1 means
+	// a trailing bare "-o" (no value after it) is never indexed past the end.
+	const prefix = "ConnectTimeout="
+	args := cmd.Args
+	value, found := "", false
+	for i := 0; i+1 < len(args); i++ {
+		next := args[i+1]
+		if args[i] == "-o" && len(next) >= len(prefix) && next[:len(prefix)] == prefix {
+			value, found = next[len(prefix):], true
+			break
+		}
+	}
+	// Presence alone is not a cap: an empty value is malformed, and "0" or
+	// "none" presumably leave the handshake unbounded again (assumption from
+	// OpenSSH's ConnectTimeout parsing — confirm against the deployed ssh).
+	if !found || value == "" || value == "0" || value == "none" {
+		t.Errorf("sshCommandCmd is missing an effective `-o ConnectTimeout=N` — without it, "+
+			"ssh hangs forever when the workspace EC2's sshd is unresponsive "+
+			"and the canvas terminal silently dies on Cloudflare's idle WS "+
+			"timeout with no error message reaching the user. See terminal.go "+
+			"sshCommandCmd comment (2026-04-30 hongmingwang hermes). args=%v",
+			args)
+	}
+}
+