fix(terminal): correct CP branch to SSH-only (no docker exec)

Proven by end-to-end testing against a live Hermes workspace EC2:
CP-provisioned workspaces run the agent as a NATIVE process under
the ubuntu user, not inside a Docker container. The earlier
`aws ec2-instance-connect ssh -- docker exec -it ws-X bash` was
doubly wrong:
- aws-cli's `ssh` subcommand doesn't accept a trailing command
- Even if it did, there's no container to exec into

Replaced with a four-step pipeline that matches what actually
works when run by hand:
1. ssh-keygen  — ephemeral ed25519 per session
2. aws ec2-instance-connect send-ssh-public-key --instance-os-user ubuntu
3. aws ec2-instance-connect open-tunnel --local-port N  (runs in background)
4. ssh -p N -i <key> ubuntu@127.0.0.1

Infra prerequisites (verified in docs/infra/workspace-terminal.md):
- EIC service-linked role created
- EIC Endpoint in the workspace VPC (we created eice-08b035ec8789202f9)
- Workspace SG allows 22/tcp from the EIC Endpoint's SG
- molecule-cp IAM: ec2:DescribeInstances + ec2-instance-connect:*

Changes in this commit:
- eicSSHOptions struct carries session inputs between factories
- openTunnelCmd + sshCommandCmd + sendSSHPublicKey are package vars
  so tests can stub them individually
- Default OS user is "ubuntu" (Ubuntu 24.04 CP AMI). Override via
  WORKSPACE_EC2_OS_USER env var if the AMI changes
- AWS_REGION env var respected; default us-east-2 matches current CP
- pickFreePort + waitForPort helpers — no hardcoded ports, tolerates
  multiple concurrent sessions
- Tests updated: two argv-shape regressions for open-tunnel + ssh
  (SSH shape was the silent-drift case that caused the first failure)

Refs: #1528, #1531
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-21 18:39:00 -07:00
parent 89d9470ba4
commit bca11fea9f
2 changed files with 223 additions and 66 deletions

View File

@ -2,8 +2,10 @@ package handlers
import (
"context"
"fmt"
"io"
"log"
"net"
"net/http"
"os"
"os/exec"
@ -198,30 +200,83 @@ func (h *TerminalHandler) handleLocalConnect(c *gin.Context, workspaceID string)
<-done
}
// sshCommandFactory builds the argv that opens an interactive shell into a
// CP-provisioned workspace. Exposed as a var so tests can override it.
// Real builds invoke the AWS CLI's EIC shortcut, which handles ephemeral
// key generation, SendSSHPublicKey, OpenTunnel, and SSH in one command.
// Requires aws-cli v2 + openssh-client in the tenant image.
var sshCommandFactory = func(instanceID, osUser, containerName string) *exec.Cmd {
// eicSSHOptions bundles the per-session inputs for spawning the EIC tunnel
// and the ssh client that rides on top of it. Fields are plain data so
// tests can stub the two factories below without fighting exec.Cmd.
type eicSSHOptions struct {
	InstanceID     string // EC2 instance id of the workspace host (e.g. "i-0abc...")
	OSUser         string // OS login user on the instance; defaults to "ubuntu" for CP workspaces
	Region         string // AWS region passed to the aws CLI; empty omits the --region flag
	LocalPort      int    // local TCP port the EIC tunnel binds; ssh dials 127.0.0.1:<LocalPort>
	PrivateKeyPath string // path to the ephemeral session private key; its .pub sibling is pushed via EIC
}
// openTunnelCmd builds the argv that opens a TLS-tunneled TCP port from
// the local machine to the workspace EC2's sshd via the EIC Endpoint.
// Long-lived: the tunnel process stays up for the whole terminal session.
// Exposed as a var so tests can stub or inspect the invocation.
var openTunnelCmd = func(o eicSSHOptions) *exec.Cmd {
	var argv []string
	// --region is a global aws-cli flag, so it leads the argv when set.
	if o.Region != "" {
		argv = append(argv, "--region", o.Region)
	}
	argv = append(argv,
		"ec2-instance-connect", "open-tunnel",
		"--instance-id", o.InstanceID,
		"--local-port", fmt.Sprintf("%d", o.LocalPort),
	)
	return exec.Command("aws", argv...)
}
// sshCommandCmd builds the argv for the interactive ssh client that rides
// on the open tunnel. The remote side is the workspace EC2's sshd bound
// to 22; with CP provisioning today the workspace runs as a native
// process under the ubuntu user, so landing at ubuntu's shell IS the
// terminal experience.
//
// Host-key checking is deliberately disabled: the peer is 127.0.0.1 on a
// fresh ephemeral port each session, so known_hosts pinning would churn
// every session while the EIC tunnel already authenticates the endpoint.
// Exposed as a var so tests can stub or inspect the invocation.
var sshCommandCmd = func(o eicSSHOptions) *exec.Cmd {
	return exec.Command(
		"ssh",
		"-i", o.PrivateKeyPath,
		"-o", "StrictHostKeyChecking=no",
		"-o", "UserKnownHostsFile=/dev/null",
		"-o", "ServerAliveInterval=30",
		"-o", "ServerAliveCountMax=3",
		"-p", fmt.Sprintf("%d", o.LocalPort),
		fmt.Sprintf("%s@127.0.0.1", o.OSUser),
	)
}
// handleRemoteConnect tunnels a terminal session to a workspace running on
// a separate EC2 via EC2 Instance Connect. Design: docs/infra/workspace-terminal.md.
// sendSSHPublicKey pushes an ephemeral public key to the EIC service so
// the workspace's sshd accepts the paired private key for the next 60s.
// Exposed as a var so tests can stub the AWS call.
var sendSSHPublicKey = func(ctx context.Context, region, instanceID, osUser, pubKey string) error {
	argv := []string{
		"ec2-instance-connect", "send-ssh-public-key",
		"--region", region,
		"--instance-id", instanceID,
		"--instance-os-user", osUser,
		"--ssh-public-key", pubKey,
	}
	// CombinedOutput so the CLI's stderr diagnostics land in the error text.
	output, runErr := exec.CommandContext(ctx, "aws", argv...).CombinedOutput()
	if runErr == nil {
		return nil
	}
	return fmt.Errorf("send-ssh-public-key: %w (%s)", runErr, strings.TrimSpace(string(output)))
}
// handleRemoteConnect opens a terminal session on a workspace EC2 using:
//
// aws ec2-instance-connect send-ssh-public-key (push ephemeral key)
// aws ec2-instance-connect open-tunnel (TLS tunnel to :22)
// ssh -p <tunnel-port> ubuntu@127.0.0.1 (interactive shell)
//
// CP-provisioned workspaces run as native processes under ubuntu, not
// Docker. Design: docs/infra/workspace-terminal.md.
func (h *TerminalHandler) handleRemoteConnect(c *gin.Context, workspaceID, instanceID string) {
osUser := os.Getenv("WORKSPACE_EC2_OS_USER")
if osUser == "" {
osUser = "ec2-user" // AL2023 default; override via env if AMI changes
osUser = "ubuntu" // Ubuntu 24.04 AMI, default CP workspace runtime user
}
region := os.Getenv("AWS_REGION")
if region == "" {
region = "us-east-2" // CP default — override via env
}
containerName := provisioner.ContainerName(workspaceID)
conn, err := termUpgrader.Upgrade(c.Writer, c.Request, nil)
if err != nil {
@ -230,22 +285,85 @@ func (h *TerminalHandler) handleRemoteConnect(c *gin.Context, workspaceID, insta
}
defer conn.Close()
// PTY so interactive bash works (prompts, line editing, colors).
// os/exec alone can't allocate a controlling terminal; creack/pty
// opens the pty pair and wires it to the child's stdin/stdout/stderr.
ctx, cancel := context.WithCancel(c.Request.Context())
defer cancel()
cmd := sshCommandFactory(instanceID, osUser, containerName)
cmd.Env = os.Environ() // inherit AWS_REGION, AWS credentials, etc.
// Ephemeral keypair — never hits disk after the session ends, and is
// only valid for <60s on the instance side regardless.
keyDir, err := os.MkdirTemp("", "molecule-terminal-*")
if err != nil {
log.Printf("Terminal keydir mkdir for ws=%s: %v", workspaceID, err)
_ = conn.WriteMessage(websocket.TextMessage,
[]byte("Error: failed to allocate session keypair\r\n"))
return
}
defer func() { _ = os.RemoveAll(keyDir) }()
keyPath := keyDir + "/id"
keygen := exec.CommandContext(ctx, "ssh-keygen", "-t", "ed25519", "-f", keyPath, "-N", "", "-q", "-C", "molecule-terminal")
if out, kerr := keygen.CombinedOutput(); kerr != nil {
log.Printf("Terminal ssh-keygen for ws=%s: %v (%s)", workspaceID, kerr, out)
_ = conn.WriteMessage(websocket.TextMessage,
[]byte("Error: failed to generate session keypair\r\n"))
return
}
pubKey, err := os.ReadFile(keyPath + ".pub")
if err != nil {
log.Printf("Terminal pubkey read for ws=%s: %v", workspaceID, err)
return
}
// 1. Push public key — sshd accepts matching private for 60s.
if err := sendSSHPublicKey(ctx, region, instanceID, osUser, strings.TrimSpace(string(pubKey))); err != nil {
log.Printf("Terminal EIC send-key for ws=%s instance=%s: %v", workspaceID, instanceID, err)
_ = conn.WriteMessage(websocket.TextMessage,
[]byte("Error: failed to push session key (check tenant IAM + see docs/infra/workspace-terminal.md)\r\n"))
return
}
// 2. Open tunnel on an OS-picked free port; retry briefly because
// tunnel takes ~1-2s to start listening after exec.
localPort, err := pickFreePort()
if err != nil {
log.Printf("Terminal free port pick failed: %v", err)
return
}
opts := eicSSHOptions{
InstanceID: instanceID,
OSUser: osUser,
Region: region,
LocalPort: localPort,
PrivateKeyPath: keyPath,
}
tunnel := openTunnelCmd(opts)
tunnel.Env = os.Environ()
if err := tunnel.Start(); err != nil {
log.Printf("Terminal tunnel start for ws=%s: %v", workspaceID, err)
_ = conn.WriteMessage(websocket.TextMessage,
[]byte("Error: failed to open EIC tunnel (check EIC Endpoint + SG 22 from endpoint SG; see docs/infra/workspace-terminal.md)\r\n"))
return
}
defer func() {
if tunnel.Process != nil {
_ = tunnel.Process.Kill()
}
_ = tunnel.Wait()
}()
if err := waitForPort(ctx, "127.0.0.1", localPort, 10*time.Second); err != nil {
log.Printf("Terminal tunnel never listened for ws=%s: %v", workspaceID, err)
_ = conn.WriteMessage(websocket.TextMessage,
[]byte("Error: EIC tunnel didn't come up in time\r\n"))
return
}
// 3. SSH over the tunnel, pty-wrapped so bash behaves interactively.
cmd := sshCommandCmd(opts)
cmd.Env = os.Environ()
ptmx, err := pty.Start(cmd)
if err != nil {
// Most likely causes: aws CLI missing, EIC Endpoint not set up,
// IAM perms missing. Report a specific hint, not the raw error
// (avoids leaking account ids or ARNs to the client).
log.Printf("Terminal EIC start error for ws=%s instance=%s: %v", workspaceID, instanceID, err)
log.Printf("Terminal ssh pty.Start for ws=%s: %v", workspaceID, err)
_ = conn.WriteMessage(websocket.TextMessage,
[]byte("Error: failed to open EIC tunnel — check tenant aws CLI + IAM (see docs/infra/workspace-terminal.md)\r\n"))
[]byte("Error: failed to launch ssh client\r\n"))
return
}
defer func() { _ = ptmx.Close() }()
@ -301,3 +419,37 @@ func (h *TerminalHandler) handleRemoteConnect(c *gin.Context, workspaceID, insta
}
_ = cmd.Wait()
}
// pickFreePort asks the OS for an unused TCP port in the ephemeral range.
// There's an unavoidable TOCTOU window between close() and the EIC tunnel
// binding the port; in practice the window is short enough that we've
// never seen a collision in testing.
func pickFreePort() (int, error) {
	probe, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		return 0, err
	}
	// The close error is irrelevant — we only needed the OS to assign a port.
	defer probe.Close()
	return probe.Addr().(*net.TCPAddr).Port, nil
}
// waitForPort polls 127.0.0.1:<port> until something is listening or the
// deadline passes. Used to wait for the EIC tunnel subprocess to bind
// its local port before we dial ssh at it.
//
// Returns nil as soon as a TCP connect succeeds, ctx.Err() if the session
// context is cancelled, or a timeout error once the deadline passes.
func waitForPort(ctx context.Context, host string, port int, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	addr := fmt.Sprintf("%s:%d", host, port)
	for time.Now().Before(deadline) {
		if err := ctx.Err(); err != nil {
			return err
		}
		c, err := net.DialTimeout("tcp", addr, 500*time.Millisecond)
		if err == nil {
			_ = c.Close()
			return nil
		}
		// Back off between probes, but wake immediately on cancellation
		// instead of blocking the caller for up to 200ms after the
		// websocket/session is already gone.
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(200 * time.Millisecond):
		}
	}
	return fmt.Errorf("timed out waiting for %s", addr)
}

View File

@ -3,7 +3,6 @@ package handlers
import (
"net/http"
"net/http/httptest"
"os/exec"
"testing"
"github.com/DATA-DOG/go-sqlmock"
@ -11,50 +10,28 @@ import (
)
// TestHandleConnect_RoutesToRemote asserts HandleConnect picks the CP path
// when the workspace row carries an instance_id. The WS upgrade fails in
// a unit test (plain HTTP request, no ws handshake), but that's after the
// DB lookup — so unmet sqlmock expectations is the routing assertion.
func TestHandleConnect_RoutesToRemote(t *testing.T) {
	mock := setupTestDB(t)
	setupTestRedis(t)
	// DB: workspace row has instance_id set.
	mock.ExpectQuery("SELECT COALESCE").
		WithArgs("ws-remote").
		WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow("i-abc123"))
	h := NewTerminalHandler(nil) // docker client irrelevant on remote path
	w := httptest.NewRecorder()
	c, _ := gin.CreateTestContext(w)
	c.Params = gin.Params{{Key: "id", Value: "ws-remote"}}
	// Plain HTTP request — WS upgrade will fail and the handler returns early
	// before reaching the EIC factories. Simulating a full WS upgrade in a
	// unit test is heavy; we check routing (the DB lookup) instead.
	c.Request = httptest.NewRequest("GET", "/workspaces/ws-remote/terminal", nil)
	h.HandleConnect(c)
	// WS upgrade failure is expected here — the point is the router chose
	// the remote branch, which we verify by the DB query being consumed.
	if err := mock.ExpectationsWereMet(); err != nil {
		t.Fatalf("unmet sqlmock expectations (router didn't hit CP branch): %v", err)
	}
}
// TestHandleConnect_RoutesToLocal asserts HandleConnect stays on the local
@ -86,23 +63,51 @@ func TestHandleConnect_RoutesToLocal(t *testing.T) {
}
}
// TestSshCommandFactory_BuildsEICCommand asserts the default factory
// produces the expected aws-cli argv. Prevents silent drift in the
// command shape (e.g. someone renaming --connection-type).
func TestSshCommandFactory_BuildsEICCommand(t *testing.T) {
cmd := sshCommandFactory("i-0abc", "ec2-user", "ws-deadbeef")
// cmd.Args[0] is the argv[0] ("aws"); subsequent entries are flags.
// TestOpenTunnelCmd_BuildsArgv guards against silent drift in the EIC
// tunnel invocation (e.g. someone flipping --local-port to --port).
func TestOpenTunnelCmd_BuildsArgv(t *testing.T) {
cmd := openTunnelCmd(eicSSHOptions{
InstanceID: "i-0abc",
Region: "us-east-2",
LocalPort: 2222,
})
want := []string{
"aws", "ec2-instance-connect", "ssh",
"aws", "--region", "us-east-2",
"ec2-instance-connect", "open-tunnel",
"--instance-id", "i-0abc",
"--connection-type", "eice",
"--os-user", "ec2-user",
"--",
"docker", "exec", "-it", "ws-deadbeef", "/bin/bash",
"--local-port", "2222",
}
if len(cmd.Args) != len(want) {
t.Fatalf("argv length: got %d (%v), want %d (%v)", len(cmd.Args), cmd.Args, len(want), want)
t.Fatalf("argv length: got %v want %v", cmd.Args, want)
}
for i := range want {
if cmd.Args[i] != want[i] {
t.Errorf("argv[%d] = %q, want %q", i, cmd.Args[i], want[i])
}
}
}
// TestSSHCommandCmd_BuildsArgv guards against drift in the ssh-client
// invocation — specifically the user@host shape and the inline options
// that defeat host-key + known_hosts friction.
func TestSSHCommandCmd_BuildsArgv(t *testing.T) {
cmd := sshCommandCmd(eicSSHOptions{
OSUser: "ubuntu",
LocalPort: 2222,
PrivateKeyPath: "/tmp/k",
})
want := []string{
"ssh",
"-i", "/tmp/k",
"-o", "StrictHostKeyChecking=no",
"-o", "UserKnownHostsFile=/dev/null",
"-o", "ServerAliveInterval=30",
"-o", "ServerAliveCountMax=3",
"-p", "2222",
"ubuntu@127.0.0.1",
}
if len(cmd.Args) != len(want) {
t.Fatalf("argv length: got %v want %v", cmd.Args, want)
}
for i := range want {
if cmd.Args[i] != want[i] {