From bca11fea9ff7b37a618f21069e4d6a1c81f6dde4 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 21 Apr 2026 18:39:00 -0700 Subject: [PATCH] fix(terminal): correct CP branch to SSH-only (no docker exec) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Proven by end-to-end testing against a live Hermes workspace EC2: CP-provisioned workspaces run the agent as a NATIVE process under the ubuntu user, not inside a Docker container. The earlier \`aws ec2-instance-connect ssh -- docker exec -it ws-X bash\` was doubly wrong: - aws-cli's \`ssh\` subcommand doesn't accept a trailing command - Even if it did, there's no container to exec into Replaced with a three-step pipeline that matches what actually works when run by hand: 1. ssh-keygen — ephemeral ed25519 per session 2. aws ec2-instance-connect send-ssh-public-key --instance-os-user ubuntu 3. aws ec2-instance-connect open-tunnel --local-port N (runs in background) 4. ssh -p N -i ubuntu@127.0.0.1 Infra prerequisites (verified in docs/infra/workspace-terminal.md): - EIC service-linked role created - EIC Endpoint in the workspace VPC (we created eice-08b035ec8789202f9) - Workspace SG allows 22/tcp from the EIC Endpoint's SG - molecule-cp IAM: ec2:DescribeInstances + ec2-instance-connect:* Changes in this commit: - eicSSHOptions struct carries session inputs between factories - openTunnelCmd + sshCommandCmd + sendSSHPublicKey are package vars so tests can stub them individually - Default OS user is \"ubuntu\" (Ubuntu 24.04 CP AMI). Override via WORKSPACE_EC2_OS_USER env var if the AMI changes - AWS_REGION env var respected; default us-east-2 matches current CP - pickFreePort + waitForPort helpers — no hardcoded ports, tolerates multiple concurrent sessions - Tests updated: two argv-shape regressions for open-tunnel + ssh (SSH shape was the silent-drift case that caused the first failure) Refs: #1528, #1531 Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/terminal.go | 204 +++++++++++++++--- .../internal/handlers/terminal_test.go | 85 ++++---- 2 files changed, 223 insertions(+), 66 deletions(-) diff --git a/workspace-server/internal/handlers/terminal.go b/workspace-server/internal/handlers/terminal.go index 67793acc..18b1b4cc 100644 --- a/workspace-server/internal/handlers/terminal.go +++ b/workspace-server/internal/handlers/terminal.go @@ -2,8 +2,10 @@ package handlers import ( "context" + "fmt" "io" "log" + "net" "net/http" "os" "os/exec" @@ -198,30 +200,83 @@ func (h *TerminalHandler) handleLocalConnect(c *gin.Context, workspaceID string) <-done } -// sshCommandFactory builds the argv that opens an interactive shell into a -// CP-provisioned workspace. Exposed as a var so tests can override it. -// Real builds invoke the AWS CLI's EIC shortcut, which handles ephemeral -// key generation, SendSSHPublicKey, OpenTunnel, and SSH in one command. -// Requires aws-cli v2 + openssh-client in the tenant image. -var sshCommandFactory = func(instanceID, osUser, containerName string) *exec.Cmd { +// eicSSHOptions bundles the per-session inputs for spawning the EIC tunnel +// and the ssh client that rides on top of it. Fields are plain data so +// tests can stub the two factories below without fighting exec.Cmd. +type eicSSHOptions struct { + InstanceID string + OSUser string + Region string + LocalPort int + PrivateKeyPath string +} + +// openTunnelCmd builds the argv that opens a TLS-tunneled TCP port from +// the local machine to the workspace EC2's sshd via the EIC Endpoint. +// Long-lived: stays up for the whole terminal session. +var openTunnelCmd = func(o eicSSHOptions) *exec.Cmd { + args := []string{ + "ec2-instance-connect", "open-tunnel", + "--instance-id", o.InstanceID, + "--local-port", fmt.Sprintf("%d", o.LocalPort), + } + if o.Region != "" { + args = append([]string{"--region", o.Region}, args...) + } + return exec.Command("aws", args...) +} + +// sshCommandCmd builds the argv for the interactive ssh client that rides +// on the open tunnel. The remote side is the workspace EC2's sshd bound +// to 22; with CP provisioning today the workspace runs as a native +// process under the ubuntu user, so landing at ubuntu's shell IS the +// terminal experience. +var sshCommandCmd = func(o eicSSHOptions) *exec.Cmd { return exec.Command( - "aws", "ec2-instance-connect", "ssh", - "--instance-id", instanceID, - "--connection-type", "eice", // via EIC Endpoint - "--os-user", osUser, - "--", - "docker", "exec", "-it", containerName, "/bin/bash", + "ssh", + "-i", o.PrivateKeyPath, + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "ServerAliveInterval=30", + "-o", "ServerAliveCountMax=3", + "-p", fmt.Sprintf("%d", o.LocalPort), + fmt.Sprintf("%s@127.0.0.1", o.OSUser), ) } -// handleRemoteConnect tunnels a terminal session to a workspace running on -// a separate EC2 via EC2 Instance Connect. Design: docs/infra/workspace-terminal.md. +// sendSSHPublicKey pushes an ephemeral public key to the EIC service so +// the workspace's sshd accepts the paired private key for the next 60s. +// Exposed as a var so tests can stub the AWS call. +var sendSSHPublicKey = func(ctx context.Context, region, instanceID, osUser, pubKey string) error { + cmd := exec.CommandContext(ctx, "aws", "ec2-instance-connect", "send-ssh-public-key", + "--region", region, + "--instance-id", instanceID, + "--instance-os-user", osUser, + "--ssh-public-key", pubKey) + out, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("send-ssh-public-key: %w (%s)", err, strings.TrimSpace(string(out))) + } + return nil +} + +// handleRemoteConnect opens a terminal session on a workspace EC2 using: +// +// aws ec2-instance-connect send-ssh-public-key (push ephemeral key) +// aws ec2-instance-connect open-tunnel (TLS tunnel to :22) +// ssh -p ubuntu@127.0.0.1 (interactive shell) +// +// CP-provisioned workspaces run as native processes under ubuntu, not +// Docker. Design: docs/infra/workspace-terminal.md. func (h *TerminalHandler) handleRemoteConnect(c *gin.Context, workspaceID, instanceID string) { osUser := os.Getenv("WORKSPACE_EC2_OS_USER") if osUser == "" { - osUser = "ec2-user" // AL2023 default; override via env if AMI changes + osUser = "ubuntu" // Ubuntu 24.04 AMI, default CP workspace runtime user + } + region := os.Getenv("AWS_REGION") + if region == "" { + region = "us-east-2" // CP default — override via env } - containerName := provisioner.ContainerName(workspaceID) conn, err := termUpgrader.Upgrade(c.Writer, c.Request, nil) if err != nil { @@ -230,22 +285,85 @@ func (h *TerminalHandler) handleRemoteConnect(c *gin.Context, workspaceID, insta } defer conn.Close() - // PTY so interactive bash works (prompts, line editing, colors). - // os/exec alone can't allocate a controlling terminal; creack/pty - // opens the pty pair and wires it to the child's stdin/stdout/stderr. ctx, cancel := context.WithCancel(c.Request.Context()) defer cancel() - cmd := sshCommandFactory(instanceID, osUser, containerName) - cmd.Env = os.Environ() // inherit AWS_REGION, AWS credentials, etc. + + // Ephemeral keypair — never hits disk after the session ends, and is + // only valid for <60s on the instance side regardless. + keyDir, err := os.MkdirTemp("", "molecule-terminal-*") + if err != nil { + log.Printf("Terminal keydir mkdir for ws=%s: %v", workspaceID, err) + _ = conn.WriteMessage(websocket.TextMessage, + []byte("Error: failed to allocate session keypair\r\n")) + return + } + defer func() { _ = os.RemoveAll(keyDir) }() + keyPath := keyDir + "/id" + keygen := exec.CommandContext(ctx, "ssh-keygen", "-t", "ed25519", "-f", keyPath, "-N", "", "-q", "-C", "molecule-terminal") + if out, kerr := keygen.CombinedOutput(); kerr != nil { + log.Printf("Terminal ssh-keygen for ws=%s: %v (%s)", workspaceID, kerr, out) + _ = conn.WriteMessage(websocket.TextMessage, + []byte("Error: failed to generate session keypair\r\n")) + return + } + pubKey, err := os.ReadFile(keyPath + ".pub") + if err != nil { + log.Printf("Terminal pubkey read for ws=%s: %v", workspaceID, err) + return + } + + // 1. Push public key — sshd accepts matching private for 60s. + if err := sendSSHPublicKey(ctx, region, instanceID, osUser, strings.TrimSpace(string(pubKey))); err != nil { + log.Printf("Terminal EIC send-key for ws=%s instance=%s: %v", workspaceID, instanceID, err) + _ = conn.WriteMessage(websocket.TextMessage, + []byte("Error: failed to push session key (check tenant IAM + see docs/infra/workspace-terminal.md)\r\n")) + return + } + + // 2. Open tunnel on an OS-picked free port; retry briefly because + // tunnel takes ~1-2s to start listening after exec. + localPort, err := pickFreePort() + if err != nil { + log.Printf("Terminal free port pick failed: %v", err) + return + } + opts := eicSSHOptions{ + InstanceID: instanceID, + OSUser: osUser, + Region: region, + LocalPort: localPort, + PrivateKeyPath: keyPath, + } + tunnel := openTunnelCmd(opts) + tunnel.Env = os.Environ() + if err := tunnel.Start(); err != nil { + log.Printf("Terminal tunnel start for ws=%s: %v", workspaceID, err) + _ = conn.WriteMessage(websocket.TextMessage, + []byte("Error: failed to open EIC tunnel (check EIC Endpoint + SG 22 from endpoint SG; see docs/infra/workspace-terminal.md)\r\n")) + return + } + defer func() { + if tunnel.Process != nil { + _ = tunnel.Process.Kill() + } + _ = tunnel.Wait() + }() + if err := waitForPort(ctx, "127.0.0.1", localPort, 10*time.Second); err != nil { + log.Printf("Terminal tunnel never listened for ws=%s: %v", workspaceID, err) + _ = conn.WriteMessage(websocket.TextMessage, + []byte("Error: EIC tunnel didn't come up in time\r\n")) + return + } + + // 3. SSH over the tunnel, pty-wrapped so bash behaves interactively. + cmd := sshCommandCmd(opts) + cmd.Env = os.Environ() ptmx, err := pty.Start(cmd) if err != nil { - // Most likely causes: aws CLI missing, EIC Endpoint not set up, - // IAM perms missing. Report a specific hint, not the raw error - // (avoids leaking account ids or ARNs to the client). - log.Printf("Terminal EIC start error for ws=%s instance=%s: %v", workspaceID, instanceID, err) + log.Printf("Terminal ssh pty.Start for ws=%s: %v", workspaceID, err) _ = conn.WriteMessage(websocket.TextMessage, - []byte("Error: failed to open EIC tunnel — check tenant aws CLI + IAM (see docs/infra/workspace-terminal.md)\r\n")) + []byte("Error: failed to launch ssh client\r\n")) return } defer func() { _ = ptmx.Close() }() @@ -301,3 +419,37 @@ func (h *TerminalHandler) handleRemoteConnect(c *gin.Context, workspaceID, insta } _ = cmd.Wait() } + +// pickFreePort asks the OS for an unused TCP port in the ephemeral range. +// There's an unavoidable TOCTOU window between close() and the EIC tunnel +// binding the port; in practice the window is short enough that we've +// never seen a collision in testing. +func pickFreePort() (int, error) { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + return 0, err + } + port := l.Addr().(*net.TCPAddr).Port + _ = l.Close() + return port, nil +} + +// waitForPort polls 127.0.0.1: until something is listening or the +// deadline passes. Used to wait for the EIC tunnel subprocess to bind +// its local port before we dial ssh at it. +func waitForPort(ctx context.Context, host string, port int, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + addr := fmt.Sprintf("%s:%d", host, port) + for time.Now().Before(deadline) { + if ctx.Err() != nil { + return ctx.Err() + } + c, err := net.DialTimeout("tcp", addr, 500*time.Millisecond) + if err == nil { + _ = c.Close() + return nil + } + time.Sleep(200 * time.Millisecond) + } + return fmt.Errorf("timed out waiting for %s", addr) +} diff --git a/workspace-server/internal/handlers/terminal_test.go b/workspace-server/internal/handlers/terminal_test.go index a660740d..8664467b 100644 --- a/workspace-server/internal/handlers/terminal_test.go +++ b/workspace-server/internal/handlers/terminal_test.go @@ -3,7 +3,6 @@ package handlers import ( "net/http" "net/http/httptest" - "os/exec" "testing" "github.com/DATA-DOG/go-sqlmock" @@ -11,50 +10,28 @@ import ( ) // TestHandleConnect_RoutesToRemote asserts HandleConnect picks the CP path -// when the workspace row carries an instance_id. We stub sshCommandFactory -// to capture the args rather than actually spawning aws-cli. +// when the workspace row carries an instance_id. The WS upgrade fails in +// a unit test (plain HTTP request, no ws handshake), but that's after the +// DB lookup — so unmet sqlmock expectations is the routing assertion. func TestHandleConnect_RoutesToRemote(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) - // DB: workspace row has instance_id set. mock.ExpectQuery("SELECT COALESCE"). WithArgs("ws-remote"). WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow("i-abc123")) - var capturedInstance, capturedOSUser, capturedContainer string - prev := sshCommandFactory - sshCommandFactory = func(instanceID, osUser, containerName string) *exec.Cmd { - capturedInstance = instanceID - capturedOSUser = osUser - capturedContainer = containerName - // `true` exits immediately so the handler tears down cleanly; we're - // asserting on the factory args, not on shell behavior here. - return exec.Command("true") - } - defer func() { sshCommandFactory = prev }() - - h := NewTerminalHandler(nil) // docker client irrelevant on remote path + h := NewTerminalHandler(nil) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) c.Params = gin.Params{{Key: "id", Value: "ws-remote"}} - // Plain HTTP request — WS upgrade will fail and the handler returns early - // before reaching the factory. Simulating a full WS upgrade in a unit - // test is heavy; we check routing (DB lookup + factory wiring) instead. c.Request = httptest.NewRequest("GET", "/workspaces/ws-remote/terminal", nil) h.HandleConnect(c) - // WS upgrade failure is expected here — the point is the router chose - // the remote branch, which we verify by the DB query being consumed. if err := mock.ExpectationsWereMet(); err != nil { t.Fatalf("unmet sqlmock expectations (router didn't hit CP branch): %v", err) } - // Factory isn't called because WS upgrade fails before pty.Start. That's - // fine — it's proven by the remote branch getting past the SELECT. - _ = capturedInstance - _ = capturedOSUser - _ = capturedContainer } // TestHandleConnect_RoutesToLocal asserts HandleConnect stays on the local @@ -86,23 +63,51 @@ func TestHandleConnect_RoutesToLocal(t *testing.T) { } } -// TestSshCommandFactory_BuildsEICCommand asserts the default factory -// produces the expected aws-cli argv. Prevents silent drift in the -// command shape (e.g. someone renaming --connection-type). -func TestSshCommandFactory_BuildsEICCommand(t *testing.T) { - cmd := sshCommandFactory("i-0abc", "ec2-user", "ws-deadbeef") - - // cmd.Args[0] is the argv[0] ("aws"); subsequent entries are flags. +// TestOpenTunnelCmd_BuildsArgv guards against silent drift in the EIC +// tunnel invocation (e.g. someone flipping --local-port to --port). +func TestOpenTunnelCmd_BuildsArgv(t *testing.T) { + cmd := openTunnelCmd(eicSSHOptions{ + InstanceID: "i-0abc", + Region: "us-east-2", + LocalPort: 2222, + }) want := []string{ - "aws", "ec2-instance-connect", "ssh", + "aws", "--region", "us-east-2", + "ec2-instance-connect", "open-tunnel", "--instance-id", "i-0abc", - "--connection-type", "eice", - "--os-user", "ec2-user", - "--", - "docker", "exec", "-it", "ws-deadbeef", "/bin/bash", + "--local-port", "2222", } if len(cmd.Args) != len(want) { - t.Fatalf("argv length: got %d (%v), want %d (%v)", len(cmd.Args), cmd.Args, len(want), want) + t.Fatalf("argv length: got %v want %v", cmd.Args, want) + } + for i := range want { + if cmd.Args[i] != want[i] { + t.Errorf("argv[%d] = %q, want %q", i, cmd.Args[i], want[i]) + } + } +} + +// TestSSHCommandCmd_BuildsArgv guards against drift in the ssh-client +// invocation — specifically the user@host shape and the inline options +// that defeat host-key + known_hosts friction. +func TestSSHCommandCmd_BuildsArgv(t *testing.T) { + cmd := sshCommandCmd(eicSSHOptions{ + OSUser: "ubuntu", + LocalPort: 2222, + PrivateKeyPath: "/tmp/k", + }) + want := []string{ + "ssh", + "-i", "/tmp/k", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "ServerAliveInterval=30", + "-o", "ServerAliveCountMax=3", + "-p", "2222", + "ubuntu@127.0.0.1", + } + if len(cmd.Args) != len(want) { + t.Fatalf("argv length: got %v want %v", cmd.Args, want) } for i := range want { if cmd.Args[i] != want[i] {