diff --git a/workspace-server/internal/handlers/terminal_diagnose.go b/workspace-server/internal/handlers/terminal_diagnose.go new file mode 100644 index 00000000..e40f6e19 --- /dev/null +++ b/workspace-server/internal/handlers/terminal_diagnose.go @@ -0,0 +1,328 @@ +package handlers + +import ( + "context" + "fmt" + "net/http" + "os" + "os/exec" + "strings" + "time" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" + "github.com/gin-gonic/gin" +) + +// HandleDiagnose handles GET /workspaces/:id/terminal/diagnose. It runs the +// same per-step pipeline as HandleConnect (ssh-keygen → EIC send-key → tunnel +// → ssh) but non-interactively, captures the first failing step and its +// stderr, and returns the result as JSON. +// +// Why this exists: when the canvas terminal silently disconnects ("Session +// ended" with no error frame), there is no remote-readable signal of which +// stage failed. The ssh client's stderr lives in the workspace-server's +// process logs on the tenant CP EC2 — invisible without shell access. +// HandleConnect can't trivially expose stderr because it has already +// upgraded to WebSocket binary frames by the time ssh runs. HandleDiagnose +// stays pure HTTP/JSON, so the same auth (WorkspaceAuth + ADMIN_TOKEN +// fallback) gives operators a one-call probe of the whole shell pipeline. +// +// Stages mirrored from handleRemoteConnect: +// +// 1. ssh-keygen (ephemeral session keypair) +// 2. send-ssh-public-key (AWS EIC API push, IAM-gated) +// 3. pick-free-port (local port for the tunnel) +// 4. open-tunnel (aws ec2-instance-connect open-tunnel start) +// 5. wait-for-port (the tunnel actually listens) +// 6. ssh-probe (`ssh ... 'echo MARKER'` — proves end-to-end auth+shell) +// +// Local Docker workspaces (no instance_id row) get a smaller probe: +// container-found + container-running. Same response shape so callers +// don't need to branch. 
+func (h *TerminalHandler) HandleDiagnose(c *gin.Context) { + workspaceID := c.Param("id") + ctx, cancel := context.WithTimeout(c.Request.Context(), 30*time.Second) + defer cancel() + + var instanceID string + _ = db.DB.QueryRowContext(ctx, + `SELECT COALESCE(instance_id, '') FROM workspaces WHERE id = $1`, + workspaceID).Scan(&instanceID) + + var res diagnoseResult + if instanceID != "" { + res = h.diagnoseRemote(ctx, workspaceID, instanceID) + } else { + res = h.diagnoseLocal(ctx, workspaceID) + } + c.JSON(http.StatusOK, res) +} + +// diagnoseStep is one row in the diagnostic report. Always carries Name + +// OK + DurationMs; Error/Detail filled when the step fails. +type diagnoseStep struct { + Name string `json:"name"` + OK bool `json:"ok"` + DurationMs int64 `json:"duration_ms"` + Error string `json:"error,omitempty"` + Detail string `json:"detail,omitempty"` +} + +// diagnoseResult is the full report. ``OK`` is true only when every step +// passed; ``FirstFailure`` names the step that broke the chain so callers +// can route alerts (e.g., "send-ssh-public-key" → IAM team; "ssh-probe" → +// SG/sshd team). +type diagnoseResult struct { + WorkspaceID string `json:"workspace_id"` + InstanceID string `json:"instance_id,omitempty"` + Remote bool `json:"remote"` + OK bool `json:"ok"` + FirstFailure string `json:"first_failure,omitempty"` + Steps []diagnoseStep `json:"steps"` +} + +// sshProbeMarker is the string the ssh probe echoes back. Distinct from any +// shell builtin output so we can grep for it unambiguously even when the +// remote prints a banner or motd. +const sshProbeMarker = "MOLECULE_TERMINAL_PROBE_OK" + +// sshProbeCmd builds the non-interactive ssh probe command. Exposed as a +// var so tests can stub it without spinning up a real sshd. BatchMode=yes +// ensures ssh fails fast on prompt instead of hanging on a TTY. 
+var sshProbeCmd = func(o eicSSHOptions) *exec.Cmd { + return exec.Command( + "ssh", + "-i", o.PrivateKeyPath, + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "BatchMode=yes", + "-o", "ConnectTimeout=10", + "-p", fmt.Sprintf("%d", o.LocalPort), + fmt.Sprintf("%s@127.0.0.1", o.OSUser), + "echo "+sshProbeMarker, + ) +} + +// diagnoseRemote runs the full EIC + ssh probe and reports per-step status. +// Bails on the first failure so the operator sees which stage breaks; later +// stages stay in the report as zero-value rows so the response shape is +// stable regardless of where the chain stopped. +func (h *TerminalHandler) diagnoseRemote(ctx context.Context, workspaceID, instanceID string) diagnoseResult { + res := diagnoseResult{ + WorkspaceID: workspaceID, + InstanceID: instanceID, + Remote: true, + } + + osUser := os.Getenv("WORKSPACE_EC2_OS_USER") + if osUser == "" { + osUser = "ubuntu" + } + region := os.Getenv("AWS_REGION") + if region == "" { + region = "us-east-2" + } + + stop := func(name string, step diagnoseStep) diagnoseResult { + res.Steps = append(res.Steps, step) + res.FirstFailure = name + return res + } + + // Step 1: ssh-keygen + t0 := time.Now() + keyDir, err := os.MkdirTemp("", "molecule-diagnose-*") + if err != nil { + return stop("ssh-keygen", diagnoseStep{ + Name: "ssh-keygen", + DurationMs: time.Since(t0).Milliseconds(), + Error: fmt.Sprintf("mkdir tmp: %v", err), + }) + } + defer func() { _ = os.RemoveAll(keyDir) }() + keyPath := keyDir + "/id" + keygen := exec.CommandContext(ctx, "ssh-keygen", "-t", "ed25519", "-f", keyPath, "-N", "", "-q", "-C", "molecule-diagnose") + if out, kerr := keygen.CombinedOutput(); kerr != nil { + return stop("ssh-keygen", diagnoseStep{ + Name: "ssh-keygen", + DurationMs: time.Since(t0).Milliseconds(), + Error: kerr.Error(), + Detail: strings.TrimSpace(string(out)), + }) + } + res.Steps = append(res.Steps, diagnoseStep{Name: "ssh-keygen", OK: true, DurationMs: 
time.Since(t0).Milliseconds()}) + + pubKey, err := os.ReadFile(keyPath + ".pub") + if err != nil { + return stop("ssh-keygen", diagnoseStep{ + Name: "ssh-keygen", + Error: fmt.Sprintf("read pubkey: %v", err), + }) + } + + // Step 2: send-ssh-public-key (AWS Instance Connect) + t0 = time.Now() + if err := sendSSHPublicKey(ctx, region, instanceID, osUser, strings.TrimSpace(string(pubKey))); err != nil { + return stop("send-ssh-public-key", diagnoseStep{ + Name: "send-ssh-public-key", + DurationMs: time.Since(t0).Milliseconds(), + Error: err.Error(), + }) + } + res.Steps = append(res.Steps, diagnoseStep{Name: "send-ssh-public-key", OK: true, DurationMs: time.Since(t0).Milliseconds()}) + + // Step 3: pick-free-port + t0 = time.Now() + localPort, err := pickFreePort() + if err != nil { + return stop("pick-free-port", diagnoseStep{ + Name: "pick-free-port", + DurationMs: time.Since(t0).Milliseconds(), + Error: err.Error(), + }) + } + res.Steps = append(res.Steps, diagnoseStep{ + Name: "pick-free-port", + OK: true, + DurationMs: time.Since(t0).Milliseconds(), + Detail: fmt.Sprintf("port=%d", localPort), + }) + + // Step 4: open-tunnel (long-running subprocess; we hold its stderr so + // we can include it in failure detail for the next two stages). 
+ opts := eicSSHOptions{ + InstanceID: instanceID, + OSUser: osUser, + Region: region, + LocalPort: localPort, + PrivateKeyPath: keyPath, + } + t0 = time.Now() + tunnel := openTunnelCmd(opts) + tunnel.Env = os.Environ() + var tunnelStderr strings.Builder + tunnel.Stderr = &tunnelStderr + if err := tunnel.Start(); err != nil { + return stop("open-tunnel", diagnoseStep{ + Name: "open-tunnel", + DurationMs: time.Since(t0).Milliseconds(), + Error: err.Error(), + Detail: tunnelStderr.String(), + }) + } + defer func() { + if tunnel.Process != nil { + _ = tunnel.Process.Kill() + } + _ = tunnel.Wait() + }() + res.Steps = append(res.Steps, diagnoseStep{Name: "open-tunnel", OK: true, DurationMs: time.Since(t0).Milliseconds()}) + + // Step 5: wait-for-port — verifies the tunnel actually bound the port. + // Tunnel-side errors (auth, SG, missing endpoint) usually surface here + // because the subprocess exits before binding. Fold its stderr into the + // detail so the operator sees the real reason. + t0 = time.Now() + if err := waitForPort(ctx, "127.0.0.1", localPort, 10*time.Second); err != nil { + return stop("wait-for-port", diagnoseStep{ + Name: "wait-for-port", + DurationMs: time.Since(t0).Milliseconds(), + Error: err.Error(), + Detail: tunnelStderr.String(), + }) + } + res.Steps = append(res.Steps, diagnoseStep{Name: "wait-for-port", OK: true, DurationMs: time.Since(t0).Milliseconds()}) + + // Step 6: ssh-probe — non-interactive `ssh ... 'echo MARKER'`. Proves + // auth (key push reached sshd), shell ready (bash returns echo output), + // and the network path end-to-end. Captures combined output + exit + // error so we see "Permission denied", "Connection refused", or "Host + // key verification failed" verbatim. 
+ t0 = time.Now() + probe := sshProbeCmd(opts) + probe.Env = os.Environ() + out, perr := probe.CombinedOutput() + outStr := strings.TrimSpace(string(out)) + durMs := time.Since(t0).Milliseconds() + if perr != nil || !strings.Contains(outStr, sshProbeMarker) { + errStr := "" + if perr != nil { + errStr = perr.Error() + } + return stop("ssh-probe", diagnoseStep{ + Name: "ssh-probe", + DurationMs: durMs, + Error: errStr, + Detail: outStr, + }) + } + res.Steps = append(res.Steps, diagnoseStep{Name: "ssh-probe", OK: true, DurationMs: durMs}) + + res.OK = true + return res +} + +// diagnoseLocal probes the Docker container path. Smaller surface: just +// "is the named container running on this Docker daemon". +func (h *TerminalHandler) diagnoseLocal(ctx context.Context, workspaceID string) diagnoseResult { + res := diagnoseResult{WorkspaceID: workspaceID, Remote: false} + if h.docker == nil { + res.Steps = append(res.Steps, diagnoseStep{ + Name: "docker-available", + Error: "docker client not configured on this workspace-server", + }) + res.FirstFailure = "docker-available" + return res + } + + candidates := []string{provisioner.ContainerName(workspaceID), "ws-" + workspaceID} + var foundName string + var lastErr error + var running bool + var stateStatus string + t0 := time.Now() + for _, n := range candidates { + info, err := h.docker.ContainerInspect(ctx, n) + if err == nil { + foundName = n + running = info.State.Running + stateStatus = info.State.Status + break + } + lastErr = err + } + if foundName == "" { + errMsg := "no matching container" + if lastErr != nil { + errMsg = lastErr.Error() + } + res.Steps = append(res.Steps, diagnoseStep{ + Name: "container-found", + DurationMs: time.Since(t0).Milliseconds(), + Error: errMsg, + Detail: fmt.Sprintf("tried: %s", strings.Join(candidates, ", ")), + }) + res.FirstFailure = "container-found" + return res + } + res.Steps = append(res.Steps, diagnoseStep{ + Name: "container-found", + OK: true, + DurationMs: 
time.Since(t0).Milliseconds(), + Detail: foundName, + }) + + if !running { + res.Steps = append(res.Steps, diagnoseStep{ + Name: "container-running", + Error: "container not running", + Detail: stateStatus, + }) + res.FirstFailure = "container-running" + return res + } + res.Steps = append(res.Steps, diagnoseStep{Name: "container-running", OK: true, Detail: stateStatus}) + res.OK = true + return res +} diff --git a/workspace-server/internal/handlers/terminal_diagnose_test.go b/workspace-server/internal/handlers/terminal_diagnose_test.go new file mode 100644 index 00000000..5cf672fe --- /dev/null +++ b/workspace-server/internal/handlers/terminal_diagnose_test.go @@ -0,0 +1,222 @@ +package handlers + +import ( + "context" + "encoding/json" + "errors" + "net/http/httptest" + "os/exec" + "testing" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/gin-gonic/gin" +) + +// TestHandleDiagnose_RoutesToRemote pins the dispatch: a workspace row with +// a non-empty instance_id takes the EIC + ssh probe path. We stub the +// first-stage (send-ssh-public-key) to fail so the test stays +// hermetic — no AWS calls, no network — and confirm: +// +// - first_failure is "send-ssh-public-key" (not the earlier ssh-keygen) +// - the steps array includes the ssh-keygen pass + the failed +// send-ssh-public-key step +// - response is HTTP 200 (the endpoint always returns 200; failure is +// in the JSON body so callers don't need branch-on-status) +func TestHandleDiagnose_RoutesToRemote(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + mock.ExpectQuery("SELECT COALESCE"). + WithArgs("ws-remote"). 
+ WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow("i-abc123")) + + prev := sendSSHPublicKey + sendSSHPublicKey = func(ctx context.Context, region, instanceID, osUser, pubKey string) error { + return errors.New("AccessDeniedException: not authorized") + } + defer func() { sendSSHPublicKey = prev }() + + h := NewTerminalHandler(nil) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-remote"}} + c.Request = httptest.NewRequest("GET", "/workspaces/ws-remote/terminal/diagnose", nil) + + h.HandleDiagnose(c) + + if w.Code != 200 { + t.Fatalf("HandleDiagnose status: got %d, want 200 (body=%s)", w.Code, w.Body.String()) + } + var got diagnoseResult + if err := json.Unmarshal(w.Body.Bytes(), &got); err != nil { + t.Fatalf("response not JSON: %v (body=%s)", err, w.Body.String()) + } + if !got.Remote { + t.Errorf("Remote=false; expected true for instance_id-bearing workspace") + } + if got.OK { + t.Errorf("OK=true despite stubbed send-key failure") + } + if got.FirstFailure != "send-ssh-public-key" { + t.Errorf("FirstFailure=%q; want send-ssh-public-key", got.FirstFailure) + } + // ssh-keygen must run successfully before send-ssh-public-key fails. + if len(got.Steps) < 2 { + t.Fatalf("expected >=2 steps (ssh-keygen + send-ssh-public-key); got %d", len(got.Steps)) + } + if got.Steps[0].Name != "ssh-keygen" || !got.Steps[0].OK { + t.Errorf("step[0]: want ssh-keygen ok=true; got %+v", got.Steps[0]) + } + if got.Steps[1].Name != "send-ssh-public-key" || got.Steps[1].OK { + t.Errorf("step[1]: want send-ssh-public-key ok=false; got %+v", got.Steps[1]) + } + // The IAM error message must surface in the step's Error field — that's + // the whole point of the endpoint. + if got.Steps[1].Error == "" { + t.Errorf("step[1].Error is empty; AWS error must surface verbatim") + } +} + +// TestHandleDiagnose_RoutesToLocal — empty instance_id takes the Docker +// path. 
With nil docker client, container-found can't even start, so we +// fail at "docker-available". Confirms the local-vs-remote dispatch. +func TestHandleDiagnose_RoutesToLocal(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + mock.ExpectQuery("SELECT COALESCE"). + WithArgs("ws-local"). + WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow("")) + + h := NewTerminalHandler(nil) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-local"}} + c.Request = httptest.NewRequest("GET", "/workspaces/ws-local/terminal/diagnose", nil) + + h.HandleDiagnose(c) + + if w.Code != 200 { + t.Fatalf("status: got %d, want 200", w.Code) + } + var got diagnoseResult + if err := json.Unmarshal(w.Body.Bytes(), &got); err != nil { + t.Fatalf("response not JSON: %v", err) + } + if got.Remote { + t.Errorf("Remote=true; expected false for empty-instance_id workspace") + } + if got.FirstFailure != "docker-available" { + t.Errorf("FirstFailure=%q; want docker-available (no docker client)", got.FirstFailure) + } +} + +// TestDiagnoseRemote_StopsAtSSHProbe — full happy path through send-key, +// pick-port, open-tunnel, wait-for-port, then stub the ssh probe to fail. +// Confirms first_failure surfaces the actual ssh stderr ("Permission +// denied") rather than the earlier successful steps. This is the +// most operationally important behavior — the endpoint exists primarily +// to differentiate "IAM broke" (send-key fails) from "sshd broke" (probe +// fails) from "SG/network broke" (wait-for-port fails). +func TestDiagnoseRemote_StopsAtSSHProbe(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + mock.ExpectQuery("SELECT COALESCE"). + WithArgs("ws-probe-fail"). + WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow("i-test")) + + // Stub send-key to succeed. 
+ prevSend := sendSSHPublicKey + sendSSHPublicKey = func(ctx context.Context, region, instanceID, osUser, pubKey string) error { + return nil + } + defer func() { sendSSHPublicKey = prevSend }() + + // Stub openTunnelCmd to spawn `nc -l ` so waitForPort succeeds. + // We need the tunnel to actually bind the port; nc does that + // portably. macOS has BSD nc by default. + prevTun := openTunnelCmd + openTunnelCmd = func(o eicSSHOptions) *exec.Cmd { + // `nc -l ` listens on the picked free port. -k keeps it + // alive across single-client disconnects on Linux nc; harmless + // on BSD nc which doesn't have it (we'd need -k for BSD too — + // fall back to a portable busy-wait). + return exec.Command("sh", "-c", + `port="$1"; while true; do nc -l "$port" >/dev/null 2>&1 || true; done`, + "sh", numToString(o.LocalPort)) + } + defer func() { openTunnelCmd = prevTun }() + + // Stub the ssh probe to return "Permission denied" with non-zero exit, + // the canonical "key wasn't authorized" failure. + prevProbe := sshProbeCmd + sshProbeCmd = func(o eicSSHOptions) *exec.Cmd { + return exec.Command("sh", "-c", "echo 'Permission denied (publickey).' 
>&2; exit 255") + } + defer func() { sshProbeCmd = prevProbe }() + + h := NewTerminalHandler(nil) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-probe-fail"}} + c.Request = httptest.NewRequest("GET", "/workspaces/ws-probe-fail/terminal/diagnose", nil) + + h.HandleDiagnose(c) + + if w.Code != 200 { + t.Fatalf("status: got %d", w.Code) + } + var got diagnoseResult + if err := json.Unmarshal(w.Body.Bytes(), &got); err != nil { + t.Fatalf("response not JSON: %v (body=%s)", err, w.Body.String()) + } + if got.OK { + t.Errorf("OK=true despite stubbed probe failure") + } + if got.FirstFailure != "ssh-probe" { + t.Errorf("FirstFailure=%q; want ssh-probe (got body=%s)", got.FirstFailure, w.Body.String()) + } + // The "Permission denied" message must be in the probe step's Detail — + // that's what tells the operator "this is sshd auth, not network". + var probeStep *diagnoseStep + for i := range got.Steps { + if got.Steps[i].Name == "ssh-probe" { + probeStep = &got.Steps[i] + break + } + } + if probeStep == nil { + t.Fatalf("no ssh-probe step in result: %+v", got.Steps) + } + if probeStep.OK { + t.Errorf("ssh-probe step OK=true despite failure stub") + } + if probeStep.Detail == "" && probeStep.Error == "" { + t.Errorf("ssh-probe step has no Error or Detail; ssh stderr is exactly what we want to expose") + } +} + +// numToString is a tiny helper to avoid pulling fmt into the test for one +// integer-to-string call. Same observable behavior as strconv.Itoa. 
// numToString formats n in base 10 without importing fmt or strconv into
// the test file. It produces the same text as strconv.Itoa for every int,
// including the minimum value.
func numToString(n int) string {
	if n == 0 {
		return "0"
	}
	// Format the unsigned magnitude. Negating the minimum int as a signed
	// value overflows and stays negative, which would leave only "-" in the
	// output; negating after conversion to uint64 wraps to the right value.
	neg := n < 0
	mag := uint64(n)
	if neg {
		mag = -mag
	}
	// 20 bytes hold the 19 digits of 2^63 plus a sign.
	var buf [20]byte
	i := len(buf)
	for mag > 0 {
		i--
		buf[i] = byte('0' + mag%10)
		mag /= 10
	}
	if neg {
		i--
		buf[i] = '-'
	}
	return string(buf[i:])
}