Merge pull request #2434 from Molecule-AI/fix/terminal-ssh-connect-timeout

fix(terminal): cap ssh handshake at 10s so hung sshd surfaces fast
This commit is contained in:
Hongming Wang 2026-05-01 03:26:41 +00:00 committed by GitHub
commit 4e39609ae0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 69 additions and 0 deletions

View File

@ -277,12 +277,26 @@ var openTunnelCmd = func(o eicSSHOptions) *exec.Cmd {
// to 22; with CP provisioning today the workspace runs as a native
// process under the ubuntu user, so landing at ubuntu's shell IS the
// terminal experience.
//
// ConnectTimeout=10 is the user-experience guard — without it, ssh waits
// indefinitely for the remote sshd's banner. When the workspace EC2's
// sshd is unresponsive (mid-restart, SG drop, AMI without ec2-instance-
// connect installed) the canvas's xterm shows the user's typed bytes
// echoed back by the workspace-server's *local* PTY (cooked + echo mode
// before ssh finishes its handshake) and then closes silently when CF's
// idle WebSocket timer fires, with no "Connection refused" or "Permission
// denied" output ever reaching the user. Capping at 10s makes the failure
// surface as a real ssh error message in the terminal — caught 2026-04-30
// when hongmingwang's hermes shell hung after the heartbeat-fix redeploy
// and a probe at /workspaces/<id>/terminal sat for 60s with the only
// frame being the local-PTY echo of a single 'X' typed mid-handshake.
var sshCommandCmd = func(o eicSSHOptions) *exec.Cmd {
return exec.Command(
"ssh",
"-i", o.PrivateKeyPath,
"-o", "StrictHostKeyChecking=no",
"-o", "UserKnownHostsFile=/dev/null",
"-o", "ConnectTimeout=10",
"-o", "ServerAliveInterval=30",
"-o", "ServerAliveCountMax=3",
"-p", fmt.Sprintf("%d", o.LocalPort),

View File

@ -320,6 +320,7 @@ func TestSSHCommandCmd_BuildsArgv(t *testing.T) {
"-i", "/tmp/k",
"-o", "StrictHostKeyChecking=no",
"-o", "UserKnownHostsFile=/dev/null",
"-o", "ConnectTimeout=10",
"-o", "ServerAliveInterval=30",
"-o", "ServerAliveCountMax=3",
"-p", "2222",
@ -490,3 +491,57 @@ func TestKI005_OrgToken_SkipsValidateToken(t *testing.T) {
}
}
// TestSSHCommandCmd_ConnectTimeoutPresent pins the user-experience guard
// against ssh-handshake-hang. Without ConnectTimeout, ssh waits forever
// for the remote sshd's banner — which masquerades as a "silently dead"
// shell to the user, because the workspace-server's local PTY is in
// cooked + echo mode before ssh finishes its handshake, so the canvas
// echoes the user's keystrokes back without ever reaching remote bash,
// and Cloudflare eventually closes the WebSocket on idle (~100s) with
// no error frame to surface what went wrong.
//
// Repro 2026-04-30: a 60s probe at hongmingwang's hermes /terminal
// endpoint after the heartbeat-fix redeploy showed only the local-PTY
// echo of a single 'X' typed mid-handshake. Workspace EC2 was up and
// heartbeating but its sshd was unresponsive; ssh hung indefinitely.
//
// Behavior-based: matches the literal `-o ConnectTimeout=N` arg pair so
// this stays pinned even if the rest of the args reorder. Does not pin
// the exact value — operators may tune it — but does pin presence.
func TestSSHCommandCmd_ConnectTimeoutPresent(t *testing.T) {
t.Parallel()
cmd := sshCommandCmd(eicSSHOptions{
InstanceID: "i-test",
OSUser: "ubuntu",
Region: "us-east-2",
LocalPort: 2222,
PrivateKeyPath: "/tmp/test-key",
})
args := cmd.Args
found := false
for i, a := range args {
if a != "-o" {
continue
}
if i+1 >= len(args) {
continue
}
val := args[i+1]
if len(val) >= len("ConnectTimeout=") &&
val[:len("ConnectTimeout=")] == "ConnectTimeout=" {
found = true
break
}
}
if !found {
t.Errorf("sshCommandCmd is missing `-o ConnectTimeout=N` — without it, "+
"ssh hangs forever when the workspace EC2's sshd is unresponsive "+
"and the canvas terminal silently dies on Cloudflare's idle WS "+
"timeout with no error message reaching the user. See terminal.go "+
"sshCommandCmd comment (2026-04-30 hongmingwang hermes). args=%v",
args)
}
}