diff --git a/workspace-server/internal/handlers/terminal.go b/workspace-server/internal/handlers/terminal.go index 62fe74b4..434ae1f0 100644 --- a/workspace-server/internal/handlers/terminal.go +++ b/workspace-server/internal/handlers/terminal.go @@ -277,12 +277,26 @@ var openTunnelCmd = func(o eicSSHOptions) *exec.Cmd { // to 22; with CP provisioning today the workspace runs as a native // process under the ubuntu user, so landing at ubuntu's shell IS the // terminal experience. +// +// ConnectTimeout=10 is the user-experience guard — without it, ssh waits +// indefinitely for the remote sshd's banner. When the workspace EC2's +// sshd is unresponsive (mid-restart, SG drop, AMI without ec2-instance- +// connect installed) the canvas's xterm shows the user's typed bytes +// echoed back by the workspace-server's *local* PTY (cooked + echo mode +// before ssh finishes its handshake) and then closes silently when CF's +// idle WebSocket timer fires, with no "Connection refused" or "Permission +// denied" output ever reaching the user. Capping at 10s makes the failure +// surface as a real ssh error message in the terminal — caught 2026-04-30 +// when hongmingwang's hermes shell hung after the heartbeat-fix redeploy +// and a probe at /workspaces/<id>/terminal sat for 60s with the only +// frame being the local-PTY echo of a single 'X' typed mid-handshake. 
var sshCommandCmd = func(o eicSSHOptions) *exec.Cmd { return exec.Command( "ssh", "-i", o.PrivateKeyPath, "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null", + "-o", "ConnectTimeout=10", "-o", "ServerAliveInterval=30", "-o", "ServerAliveCountMax=3", "-p", fmt.Sprintf("%d", o.LocalPort), diff --git a/workspace-server/internal/handlers/terminal_test.go b/workspace-server/internal/handlers/terminal_test.go index 4a3f29fd..040196cb 100644 --- a/workspace-server/internal/handlers/terminal_test.go +++ b/workspace-server/internal/handlers/terminal_test.go @@ -490,3 +490,57 @@ func TestKI005_OrgToken_SkipsValidateToken(t *testing.T) { } } +// TestSSHCommandCmd_ConnectTimeoutPresent pins the user-experience guard +// against ssh-handshake-hang. Without ConnectTimeout, ssh waits forever +// for the remote sshd's banner — which masquerades as a "silently dead" +// shell to the user, because the workspace-server's local PTY is in +// cooked + echo mode before ssh finishes its handshake, so the canvas +// echoes the user's keystrokes back without ever reaching remote bash, +// and Cloudflare eventually closes the WebSocket on idle (~100s) with +// no error frame to surface what went wrong. +// +// Repro 2026-04-30: a 60s probe at hongmingwang's hermes /terminal +// endpoint after the heartbeat-fix redeploy showed only the local-PTY +// echo of a single 'X' typed mid-handshake. Workspace EC2 was up and +// heartbeating but its sshd was unresponsive; ssh hung indefinitely. +// +// Behavior-based: matches the literal `-o ConnectTimeout=N` arg pair so +// this stays pinned even if the rest of the args reorder. Does not pin +// the exact value — operators may tune it — but does pin presence. 
+func TestSSHCommandCmd_ConnectTimeoutPresent(t *testing.T) {
+	t.Parallel()
+
+	// Only the option name and '=' are matched; the timeout value itself is
+	// deliberately left unpinned so it can be retuned without touching this test.
+	const optPrefix = "ConnectTimeout="
+
+	// Build the command from fixed sample options; only its argv is inspected.
+	opts := eicSSHOptions{
+		InstanceID:     "i-test",
+		OSUser:         "ubuntu",
+		Region:         "us-east-2",
+		LocalPort:      2222,
+		PrivateKeyPath: "/tmp/test-key",
+	}
+	argv := sshCommandCmd(opts).Args
+
+	// Walk adjacent (flag, value) pairs. Stopping one short of the end means a
+	// trailing bare "-o" with no value is never indexed past.
+	for idx := 0; idx+1 < len(argv); idx++ {
+		opt, val := argv[idx], argv[idx+1]
+		if opt != "-o" || len(val) < len(optPrefix) {
+			continue
+		}
+		if val[:len(optPrefix)] == optPrefix {
+			return // option present: nothing to report
+		}
+	}
+
+	t.Errorf("sshCommandCmd is missing `-o ConnectTimeout=N` — without it, "+
+		"ssh hangs forever when the workspace EC2's sshd is unresponsive "+
+		"and the canvas terminal silently dies on Cloudflare's idle WS "+
+		"timeout with no error message reaching the user. See terminal.go "+
+		"sshCommandCmd comment (2026-04-30 hongmingwang hermes). args=%v",
+		argv)
+}
+