feat(terminal): add diagnose endpoint for SSH probe stages
GET /workspaces/:id/terminal/diagnose runs the same per-stage pipeline as
/terminal (ssh-keygen → EIC send-key → tunnel → ssh) but non-interactively
and returns JSON. Each stage reports {name, ok, duration_ms, error,
detail}, plus a top-level first_failure naming the broken stage.
Why: when the canvas terminal silently disconnects ("Session ended" with
no error frame — the user-reported failure mode on hongmingwang's hermes
workspace), there is no remote-readable signal of WHICH stage failed.
The ssh client's stderr lives only in the workspace-server's stdout on
the tenant CP EC2 — invisible without shell access. /terminal can't
expose stderr cleanly because it has already upgraded to WebSocket
binary frames by the time ssh runs. /terminal/diagnose stays pure
HTTP/JSON, so the same auth (WorkspaceAuth + ADMIN_TOKEN fallback) gives
operators a one-call probe that splits "IAM broke" (send-ssh-public-key
fails) from "tunnel/SG broke" (wait-for-port fails) from "sshd auth
broke" (ssh-probe gets Permission denied) from "shell broke" (probe
exits non-zero with stderr).
Stages mirrored from handleRemoteConnect in terminal.go:
1. ssh-keygen ephemeral session keypair
2. send-ssh-public-key AWS EIC API push, IAM-gated
3. pick-free-port local port for the tunnel
4. open-tunnel aws ec2-instance-connect open-tunnel start
5. wait-for-port the tunnel actually listens (folds tunnel
stderr into Detail when it doesn't)
6. ssh-probe non-interactive `ssh ... 'echo MARKER'` that
confirms auth + bash + the marker round-trip
(CombinedOutput captures stderr verbatim —
this is the whole reason the endpoint exists)
Local Docker workspaces (no instance_id) get a smaller probe:
container-found + container-running. Same response shape so callers
don't need to branch.
Tests stub sendSSHPublicKey / openTunnelCmd / sshProbeCmd via the
existing package-level vars (same pattern as TestSSHCommandCmd_*) so
the test suite stays hermetic — no AWS, no network. The three new
tests pin: (a) routing to remote on instance_id present,
(b) routing to local on empty instance_id, (c) the operationally
critical case — full success through wait-for-port then a probe
failure surfaces ssh stderr in the ssh-probe step's Error/Detail
with first_failure="ssh-probe".
Auth: rides on existing WorkspaceAuth middleware. Operators with the
tenant ADMIN_TOKEN (fetched via /cp/admin/orgs/:slug/admin-token) can
probe any workspace without per-workspace token; same admin path as
the canvas dashboard reads workspace activity.
The endpoint always returns HTTP 200 (both success and step failure are
reported in the JSON body), so callers don't need to branch on status
code — they just check whether first_failure is set.
Resolves task #200, supports task #193 (workspace EC2 sshd
unresponsive — without this endpoint we couldn't pin the failure
stage from outside the tenant CP EC2).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f46c471f9b
commit
d012a803e4
328
workspace-server/internal/handlers/terminal_diagnose.go
Normal file
328
workspace-server/internal/handlers/terminal_diagnose.go
Normal file
@ -0,0 +1,328 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// HandleDiagnose handles GET /workspaces/:id/terminal/diagnose. It runs the
|
||||
// same per-step pipeline as HandleConnect (ssh-keygen → EIC send-key → tunnel
|
||||
// → ssh) but non-interactively, captures the first failing step and its
|
||||
// stderr, and returns the result as JSON.
|
||||
//
|
||||
// Why this exists: when the canvas terminal silently disconnects ("Session
|
||||
// ended" with no error frame), there is no remote-readable signal of which
|
||||
// stage failed. The ssh client's stderr lives in the workspace-server's
|
||||
// process logs on the tenant CP EC2 — invisible without shell access.
|
||||
// HandleConnect can't trivially expose stderr because it has already
|
||||
// upgraded to WebSocket binary frames by the time ssh runs. HandleDiagnose
|
||||
// stays pure HTTP/JSON, so the same auth (WorkspaceAuth + ADMIN_TOKEN
|
||||
// fallback) gives operators a one-call probe of the whole shell pipeline.
|
||||
//
|
||||
// Stages mirrored from handleRemoteConnect:
|
||||
//
|
||||
// 1. ssh-keygen (ephemeral session keypair)
|
||||
// 2. send-ssh-public-key (AWS EIC API push, IAM-gated)
|
||||
// 3. pick-free-port (local port for the tunnel)
|
||||
// 4. open-tunnel (aws ec2-instance-connect open-tunnel start)
|
||||
// 5. wait-for-port (the tunnel actually listens)
|
||||
// 6. ssh-probe (`ssh ... 'echo MARKER'` — proves end-to-end auth+shell)
|
||||
//
|
||||
// Local Docker workspaces (no instance_id row) get a smaller probe:
|
||||
// container-found + container-running. Same response shape so callers
|
||||
// don't need to branch.
|
||||
func (h *TerminalHandler) HandleDiagnose(c *gin.Context) {
|
||||
workspaceID := c.Param("id")
|
||||
ctx, cancel := context.WithTimeout(c.Request.Context(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
var instanceID string
|
||||
_ = db.DB.QueryRowContext(ctx,
|
||||
`SELECT COALESCE(instance_id, '') FROM workspaces WHERE id = $1`,
|
||||
workspaceID).Scan(&instanceID)
|
||||
|
||||
var res diagnoseResult
|
||||
if instanceID != "" {
|
||||
res = h.diagnoseRemote(ctx, workspaceID, instanceID)
|
||||
} else {
|
||||
res = h.diagnoseLocal(ctx, workspaceID)
|
||||
}
|
||||
c.JSON(http.StatusOK, res)
|
||||
}
|
||||
|
||||
// diagnoseStep is one row in the diagnostic report. Always carries Name +
// OK + DurationMs; Error/Detail are filled only when the step fails.
type diagnoseStep struct {
	// Name identifies the pipeline stage, e.g. "ssh-keygen" or "ssh-probe".
	Name string `json:"name"`
	// OK is true when the stage completed successfully.
	OK bool `json:"ok"`
	// DurationMs is wall-clock time spent in the stage, in milliseconds.
	DurationMs int64 `json:"duration_ms"`
	// Error holds the Go-level error string when the stage failed.
	Error string `json:"error,omitempty"`
	// Detail carries supporting output for the stage (e.g. captured stderr,
	// the picked port, or a container name).
	Detail string `json:"detail,omitempty"`
}
|
||||
|
||||
// diagnoseResult is the full report. OK is true only when every step
// passed; FirstFailure names the step that broke the chain so callers
// can route alerts (e.g., "send-ssh-public-key" → IAM team; "ssh-probe" →
// SG/sshd team).
type diagnoseResult struct {
	// WorkspaceID echoes the :id path parameter the probe ran against.
	WorkspaceID string `json:"workspace_id"`
	// InstanceID is the EC2 instance probed; empty for local Docker workspaces.
	InstanceID string `json:"instance_id,omitempty"`
	// Remote is true when the EC2/EIC pipeline ran, false for the Docker probe.
	Remote bool `json:"remote"`
	// OK is true only when every step passed.
	OK bool `json:"ok"`
	// FirstFailure names the first failing step; empty when OK is true.
	FirstFailure string `json:"first_failure,omitempty"`
	// Steps lists per-stage results in execution order, stopping at the
	// first failure.
	Steps []diagnoseStep `json:"steps"`
}
|
||||
|
||||
// sshProbeMarker is the string the ssh probe echoes back. Distinct from any
|
||||
// shell builtin output so we can grep for it unambiguously even when the
|
||||
// remote prints a banner or motd.
|
||||
const sshProbeMarker = "MOLECULE_TERMINAL_PROBE_OK"
|
||||
|
||||
// sshProbeCmd builds the non-interactive ssh probe command. Exposed as a
|
||||
// var so tests can stub it without spinning up a real sshd. BatchMode=yes
|
||||
// ensures ssh fails fast on prompt instead of hanging on a TTY.
|
||||
var sshProbeCmd = func(o eicSSHOptions) *exec.Cmd {
|
||||
return exec.Command(
|
||||
"ssh",
|
||||
"-i", o.PrivateKeyPath,
|
||||
"-o", "StrictHostKeyChecking=no",
|
||||
"-o", "UserKnownHostsFile=/dev/null",
|
||||
"-o", "BatchMode=yes",
|
||||
"-o", "ConnectTimeout=10",
|
||||
"-p", fmt.Sprintf("%d", o.LocalPort),
|
||||
fmt.Sprintf("%s@127.0.0.1", o.OSUser),
|
||||
"echo "+sshProbeMarker,
|
||||
)
|
||||
}
|
||||
|
||||
// diagnoseRemote runs the full EIC + ssh probe and reports per-step status.
|
||||
// Bails on the first failure so the operator sees which stage breaks; later
|
||||
// stages stay in the report as zero-value rows so the response shape is
|
||||
// stable regardless of where the chain stopped.
|
||||
func (h *TerminalHandler) diagnoseRemote(ctx context.Context, workspaceID, instanceID string) diagnoseResult {
|
||||
res := diagnoseResult{
|
||||
WorkspaceID: workspaceID,
|
||||
InstanceID: instanceID,
|
||||
Remote: true,
|
||||
}
|
||||
|
||||
osUser := os.Getenv("WORKSPACE_EC2_OS_USER")
|
||||
if osUser == "" {
|
||||
osUser = "ubuntu"
|
||||
}
|
||||
region := os.Getenv("AWS_REGION")
|
||||
if region == "" {
|
||||
region = "us-east-2"
|
||||
}
|
||||
|
||||
stop := func(name string, step diagnoseStep) diagnoseResult {
|
||||
res.Steps = append(res.Steps, step)
|
||||
res.FirstFailure = name
|
||||
return res
|
||||
}
|
||||
|
||||
// Step 1: ssh-keygen
|
||||
t0 := time.Now()
|
||||
keyDir, err := os.MkdirTemp("", "molecule-diagnose-*")
|
||||
if err != nil {
|
||||
return stop("ssh-keygen", diagnoseStep{
|
||||
Name: "ssh-keygen",
|
||||
DurationMs: time.Since(t0).Milliseconds(),
|
||||
Error: fmt.Sprintf("mkdir tmp: %v", err),
|
||||
})
|
||||
}
|
||||
defer func() { _ = os.RemoveAll(keyDir) }()
|
||||
keyPath := keyDir + "/id"
|
||||
keygen := exec.CommandContext(ctx, "ssh-keygen", "-t", "ed25519", "-f", keyPath, "-N", "", "-q", "-C", "molecule-diagnose")
|
||||
if out, kerr := keygen.CombinedOutput(); kerr != nil {
|
||||
return stop("ssh-keygen", diagnoseStep{
|
||||
Name: "ssh-keygen",
|
||||
DurationMs: time.Since(t0).Milliseconds(),
|
||||
Error: kerr.Error(),
|
||||
Detail: strings.TrimSpace(string(out)),
|
||||
})
|
||||
}
|
||||
res.Steps = append(res.Steps, diagnoseStep{Name: "ssh-keygen", OK: true, DurationMs: time.Since(t0).Milliseconds()})
|
||||
|
||||
pubKey, err := os.ReadFile(keyPath + ".pub")
|
||||
if err != nil {
|
||||
return stop("ssh-keygen", diagnoseStep{
|
||||
Name: "ssh-keygen",
|
||||
Error: fmt.Sprintf("read pubkey: %v", err),
|
||||
})
|
||||
}
|
||||
|
||||
// Step 2: send-ssh-public-key (AWS Instance Connect)
|
||||
t0 = time.Now()
|
||||
if err := sendSSHPublicKey(ctx, region, instanceID, osUser, strings.TrimSpace(string(pubKey))); err != nil {
|
||||
return stop("send-ssh-public-key", diagnoseStep{
|
||||
Name: "send-ssh-public-key",
|
||||
DurationMs: time.Since(t0).Milliseconds(),
|
||||
Error: err.Error(),
|
||||
})
|
||||
}
|
||||
res.Steps = append(res.Steps, diagnoseStep{Name: "send-ssh-public-key", OK: true, DurationMs: time.Since(t0).Milliseconds()})
|
||||
|
||||
// Step 3: pick-free-port
|
||||
t0 = time.Now()
|
||||
localPort, err := pickFreePort()
|
||||
if err != nil {
|
||||
return stop("pick-free-port", diagnoseStep{
|
||||
Name: "pick-free-port",
|
||||
DurationMs: time.Since(t0).Milliseconds(),
|
||||
Error: err.Error(),
|
||||
})
|
||||
}
|
||||
res.Steps = append(res.Steps, diagnoseStep{
|
||||
Name: "pick-free-port",
|
||||
OK: true,
|
||||
DurationMs: time.Since(t0).Milliseconds(),
|
||||
Detail: fmt.Sprintf("port=%d", localPort),
|
||||
})
|
||||
|
||||
// Step 4: open-tunnel (long-running subprocess; we hold its stderr so
|
||||
// we can include it in failure detail for the next two stages).
|
||||
opts := eicSSHOptions{
|
||||
InstanceID: instanceID,
|
||||
OSUser: osUser,
|
||||
Region: region,
|
||||
LocalPort: localPort,
|
||||
PrivateKeyPath: keyPath,
|
||||
}
|
||||
t0 = time.Now()
|
||||
tunnel := openTunnelCmd(opts)
|
||||
tunnel.Env = os.Environ()
|
||||
var tunnelStderr strings.Builder
|
||||
tunnel.Stderr = &tunnelStderr
|
||||
if err := tunnel.Start(); err != nil {
|
||||
return stop("open-tunnel", diagnoseStep{
|
||||
Name: "open-tunnel",
|
||||
DurationMs: time.Since(t0).Milliseconds(),
|
||||
Error: err.Error(),
|
||||
Detail: tunnelStderr.String(),
|
||||
})
|
||||
}
|
||||
defer func() {
|
||||
if tunnel.Process != nil {
|
||||
_ = tunnel.Process.Kill()
|
||||
}
|
||||
_ = tunnel.Wait()
|
||||
}()
|
||||
res.Steps = append(res.Steps, diagnoseStep{Name: "open-tunnel", OK: true, DurationMs: time.Since(t0).Milliseconds()})
|
||||
|
||||
// Step 5: wait-for-port — verifies the tunnel actually bound the port.
|
||||
// Tunnel-side errors (auth, SG, missing endpoint) usually surface here
|
||||
// because the subprocess exits before binding. Fold its stderr into the
|
||||
// detail so the operator sees the real reason.
|
||||
t0 = time.Now()
|
||||
if err := waitForPort(ctx, "127.0.0.1", localPort, 10*time.Second); err != nil {
|
||||
return stop("wait-for-port", diagnoseStep{
|
||||
Name: "wait-for-port",
|
||||
DurationMs: time.Since(t0).Milliseconds(),
|
||||
Error: err.Error(),
|
||||
Detail: tunnelStderr.String(),
|
||||
})
|
||||
}
|
||||
res.Steps = append(res.Steps, diagnoseStep{Name: "wait-for-port", OK: true, DurationMs: time.Since(t0).Milliseconds()})
|
||||
|
||||
// Step 6: ssh-probe — non-interactive `ssh ... 'echo MARKER'`. Proves
|
||||
// auth (key push reached sshd), shell ready (bash returns echo output),
|
||||
// and the network path end-to-end. Captures combined output + exit
|
||||
// error so we see "Permission denied", "Connection refused", or "Host
|
||||
// key verification failed" verbatim.
|
||||
t0 = time.Now()
|
||||
probe := sshProbeCmd(opts)
|
||||
probe.Env = os.Environ()
|
||||
out, perr := probe.CombinedOutput()
|
||||
outStr := strings.TrimSpace(string(out))
|
||||
durMs := time.Since(t0).Milliseconds()
|
||||
if perr != nil || !strings.Contains(outStr, sshProbeMarker) {
|
||||
errStr := ""
|
||||
if perr != nil {
|
||||
errStr = perr.Error()
|
||||
}
|
||||
return stop("ssh-probe", diagnoseStep{
|
||||
Name: "ssh-probe",
|
||||
DurationMs: durMs,
|
||||
Error: errStr,
|
||||
Detail: outStr,
|
||||
})
|
||||
}
|
||||
res.Steps = append(res.Steps, diagnoseStep{Name: "ssh-probe", OK: true, DurationMs: durMs})
|
||||
|
||||
res.OK = true
|
||||
return res
|
||||
}
|
||||
|
||||
// diagnoseLocal probes the Docker container path. Smaller surface: just
|
||||
// "is the named container running on this Docker daemon".
|
||||
func (h *TerminalHandler) diagnoseLocal(ctx context.Context, workspaceID string) diagnoseResult {
|
||||
res := diagnoseResult{WorkspaceID: workspaceID, Remote: false}
|
||||
if h.docker == nil {
|
||||
res.Steps = append(res.Steps, diagnoseStep{
|
||||
Name: "docker-available",
|
||||
Error: "docker client not configured on this workspace-server",
|
||||
})
|
||||
res.FirstFailure = "docker-available"
|
||||
return res
|
||||
}
|
||||
|
||||
candidates := []string{provisioner.ContainerName(workspaceID), "ws-" + workspaceID}
|
||||
var foundName string
|
||||
var lastErr error
|
||||
var running bool
|
||||
var stateStatus string
|
||||
t0 := time.Now()
|
||||
for _, n := range candidates {
|
||||
info, err := h.docker.ContainerInspect(ctx, n)
|
||||
if err == nil {
|
||||
foundName = n
|
||||
running = info.State.Running
|
||||
stateStatus = info.State.Status
|
||||
break
|
||||
}
|
||||
lastErr = err
|
||||
}
|
||||
if foundName == "" {
|
||||
errMsg := "no matching container"
|
||||
if lastErr != nil {
|
||||
errMsg = lastErr.Error()
|
||||
}
|
||||
res.Steps = append(res.Steps, diagnoseStep{
|
||||
Name: "container-found",
|
||||
DurationMs: time.Since(t0).Milliseconds(),
|
||||
Error: errMsg,
|
||||
Detail: fmt.Sprintf("tried: %s", strings.Join(candidates, ", ")),
|
||||
})
|
||||
res.FirstFailure = "container-found"
|
||||
return res
|
||||
}
|
||||
res.Steps = append(res.Steps, diagnoseStep{
|
||||
Name: "container-found",
|
||||
OK: true,
|
||||
DurationMs: time.Since(t0).Milliseconds(),
|
||||
Detail: foundName,
|
||||
})
|
||||
|
||||
if !running {
|
||||
res.Steps = append(res.Steps, diagnoseStep{
|
||||
Name: "container-running",
|
||||
Error: "container not running",
|
||||
Detail: stateStatus,
|
||||
})
|
||||
res.FirstFailure = "container-running"
|
||||
return res
|
||||
}
|
||||
res.Steps = append(res.Steps, diagnoseStep{Name: "container-running", OK: true, Detail: stateStatus})
|
||||
res.OK = true
|
||||
return res
|
||||
}
|
||||
222
workspace-server/internal/handlers/terminal_diagnose_test.go
Normal file
222
workspace-server/internal/handlers/terminal_diagnose_test.go
Normal file
@ -0,0 +1,222 @@
|
||||
package handlers
|
||||
|
||||
import (
	"context"
	"encoding/json"
	"errors"
	"net/http/httptest"
	"os/exec"
	"strconv"
	"testing"

	"github.com/DATA-DOG/go-sqlmock"
	"github.com/gin-gonic/gin"
)
|
||||
|
||||
// TestHandleDiagnose_RoutesToRemote pins the dispatch: a workspace row with
|
||||
// a non-empty instance_id takes the EIC + ssh probe path. We stub the
|
||||
// first-stage (send-ssh-public-key) to fail so the test stays
|
||||
// hermetic — no AWS calls, no network — and confirm:
|
||||
//
|
||||
// - first_failure is "send-ssh-public-key" (not the earlier ssh-keygen)
|
||||
// - the steps array includes the ssh-keygen pass + the failed
|
||||
// send-ssh-public-key step
|
||||
// - response is HTTP 200 (the endpoint always returns 200; failure is
|
||||
// in the JSON body so callers don't need branch-on-status)
|
||||
func TestHandleDiagnose_RoutesToRemote(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE").
|
||||
WithArgs("ws-remote").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow("i-abc123"))
|
||||
|
||||
prev := sendSSHPublicKey
|
||||
sendSSHPublicKey = func(ctx context.Context, region, instanceID, osUser, pubKey string) error {
|
||||
return errors.New("AccessDeniedException: not authorized")
|
||||
}
|
||||
defer func() { sendSSHPublicKey = prev }()
|
||||
|
||||
h := NewTerminalHandler(nil)
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-remote"}}
|
||||
c.Request = httptest.NewRequest("GET", "/workspaces/ws-remote/terminal/diagnose", nil)
|
||||
|
||||
h.HandleDiagnose(c)
|
||||
|
||||
if w.Code != 200 {
|
||||
t.Fatalf("HandleDiagnose status: got %d, want 200 (body=%s)", w.Code, w.Body.String())
|
||||
}
|
||||
var got diagnoseResult
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &got); err != nil {
|
||||
t.Fatalf("response not JSON: %v (body=%s)", err, w.Body.String())
|
||||
}
|
||||
if !got.Remote {
|
||||
t.Errorf("Remote=false; expected true for instance_id-bearing workspace")
|
||||
}
|
||||
if got.OK {
|
||||
t.Errorf("OK=true despite stubbed send-key failure")
|
||||
}
|
||||
if got.FirstFailure != "send-ssh-public-key" {
|
||||
t.Errorf("FirstFailure=%q; want send-ssh-public-key", got.FirstFailure)
|
||||
}
|
||||
// ssh-keygen must run successfully before send-ssh-public-key fails.
|
||||
if len(got.Steps) < 2 {
|
||||
t.Fatalf("expected >=2 steps (ssh-keygen + send-ssh-public-key); got %d", len(got.Steps))
|
||||
}
|
||||
if got.Steps[0].Name != "ssh-keygen" || !got.Steps[0].OK {
|
||||
t.Errorf("step[0]: want ssh-keygen ok=true; got %+v", got.Steps[0])
|
||||
}
|
||||
if got.Steps[1].Name != "send-ssh-public-key" || got.Steps[1].OK {
|
||||
t.Errorf("step[1]: want send-ssh-public-key ok=false; got %+v", got.Steps[1])
|
||||
}
|
||||
// The IAM error message must surface in the step's Error field — that's
|
||||
// the whole point of the endpoint.
|
||||
if got.Steps[1].Error == "" {
|
||||
t.Errorf("step[1].Error is empty; AWS error must surface verbatim")
|
||||
}
|
||||
}
|
||||
|
||||
// TestHandleDiagnose_RoutesToLocal — empty instance_id takes the Docker
|
||||
// path. With nil docker client, container-found can't even start, so we
|
||||
// fail at "docker-available". Confirms the local-vs-remote dispatch.
|
||||
func TestHandleDiagnose_RoutesToLocal(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE").
|
||||
WithArgs("ws-local").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow(""))
|
||||
|
||||
h := NewTerminalHandler(nil)
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-local"}}
|
||||
c.Request = httptest.NewRequest("GET", "/workspaces/ws-local/terminal/diagnose", nil)
|
||||
|
||||
h.HandleDiagnose(c)
|
||||
|
||||
if w.Code != 200 {
|
||||
t.Fatalf("status: got %d, want 200", w.Code)
|
||||
}
|
||||
var got diagnoseResult
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &got); err != nil {
|
||||
t.Fatalf("response not JSON: %v", err)
|
||||
}
|
||||
if got.Remote {
|
||||
t.Errorf("Remote=true; expected false for empty-instance_id workspace")
|
||||
}
|
||||
if got.FirstFailure != "docker-available" {
|
||||
t.Errorf("FirstFailure=%q; want docker-available (no docker client)", got.FirstFailure)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDiagnoseRemote_StopsAtSSHProbe — full happy path through send-key,
|
||||
// pick-port, open-tunnel, wait-for-port, then stub the ssh probe to fail.
|
||||
// Confirms first_failure surfaces the actual ssh stderr ("Permission
|
||||
// denied") rather than the earlier successful steps. This is the
|
||||
// most operationally important behavior — the endpoint exists primarily
|
||||
// to differentiate "IAM broke" (send-key fails) from "sshd broke" (probe
|
||||
// fails) from "SG/network broke" (wait-for-port fails).
|
||||
func TestDiagnoseRemote_StopsAtSSHProbe(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
mock.ExpectQuery("SELECT COALESCE").
|
||||
WithArgs("ws-probe-fail").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow("i-test"))
|
||||
|
||||
// Stub send-key to succeed.
|
||||
prevSend := sendSSHPublicKey
|
||||
sendSSHPublicKey = func(ctx context.Context, region, instanceID, osUser, pubKey string) error {
|
||||
return nil
|
||||
}
|
||||
defer func() { sendSSHPublicKey = prevSend }()
|
||||
|
||||
// Stub openTunnelCmd to spawn `nc -l <port>` so waitForPort succeeds.
|
||||
// We need the tunnel to actually bind the port; nc does that
|
||||
// portably. macOS has BSD nc by default.
|
||||
prevTun := openTunnelCmd
|
||||
openTunnelCmd = func(o eicSSHOptions) *exec.Cmd {
|
||||
// `nc -l <port>` listens on the picked free port. -k keeps it
|
||||
// alive across single-client disconnects on Linux nc; harmless
|
||||
// on BSD nc which doesn't have it (we'd need -k for BSD too —
|
||||
// fall back to a portable busy-wait).
|
||||
return exec.Command("sh", "-c",
|
||||
`port="$1"; while true; do nc -l "$port" >/dev/null 2>&1 || true; done`,
|
||||
"sh", numToString(o.LocalPort))
|
||||
}
|
||||
defer func() { openTunnelCmd = prevTun }()
|
||||
|
||||
// Stub the ssh probe to return "Permission denied" with non-zero exit,
|
||||
// the canonical "key wasn't authorized" failure.
|
||||
prevProbe := sshProbeCmd
|
||||
sshProbeCmd = func(o eicSSHOptions) *exec.Cmd {
|
||||
return exec.Command("sh", "-c", "echo 'Permission denied (publickey).' >&2; exit 255")
|
||||
}
|
||||
defer func() { sshProbeCmd = prevProbe }()
|
||||
|
||||
h := NewTerminalHandler(nil)
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-probe-fail"}}
|
||||
c.Request = httptest.NewRequest("GET", "/workspaces/ws-probe-fail/terminal/diagnose", nil)
|
||||
|
||||
h.HandleDiagnose(c)
|
||||
|
||||
if w.Code != 200 {
|
||||
t.Fatalf("status: got %d", w.Code)
|
||||
}
|
||||
var got diagnoseResult
|
||||
if err := json.Unmarshal(w.Body.Bytes(), &got); err != nil {
|
||||
t.Fatalf("response not JSON: %v (body=%s)", err, w.Body.String())
|
||||
}
|
||||
if got.OK {
|
||||
t.Errorf("OK=true despite stubbed probe failure")
|
||||
}
|
||||
if got.FirstFailure != "ssh-probe" {
|
||||
t.Errorf("FirstFailure=%q; want ssh-probe (got body=%s)", got.FirstFailure, w.Body.String())
|
||||
}
|
||||
// The "Permission denied" message must be in the probe step's Detail —
|
||||
// that's what tells the operator "this is sshd auth, not network".
|
||||
var probeStep *diagnoseStep
|
||||
for i := range got.Steps {
|
||||
if got.Steps[i].Name == "ssh-probe" {
|
||||
probeStep = &got.Steps[i]
|
||||
break
|
||||
}
|
||||
}
|
||||
if probeStep == nil {
|
||||
t.Fatalf("no ssh-probe step in result: %+v", got.Steps)
|
||||
}
|
||||
if probeStep.OK {
|
||||
t.Errorf("ssh-probe step OK=true despite failure stub")
|
||||
}
|
||||
if probeStep.Detail == "" && probeStep.Error == "" {
|
||||
t.Errorf("ssh-probe step has no Error or Detail; ssh stderr is exactly what we want to expose")
|
||||
}
|
||||
}
|
||||
|
||||
// numToString converts n to its decimal string form, delegating to
// strconv.Itoa. The previous hand-rolled version negated n before the
// digit loop, which overflows for math.MinInt (negation of the minimum
// value stays negative), producing "-" instead of the number;
// strconv.Itoa handles the full int range. Behavior is otherwise
// identical for every input.
func numToString(n int) string {
	return strconv.Itoa(n)
}
|
||||
@ -470,6 +470,7 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
|
||||
}
|
||||
th := handlers.NewTerminalHandler(dockerCli)
|
||||
wsAuth.GET("/terminal", th.HandleConnect)
|
||||
wsAuth.GET("/terminal/diagnose", th.HandleDiagnose)
|
||||
|
||||
// Canvas Viewport — #166 + #168: GET stays fully open for bootstrap.
|
||||
// PUT uses CanvasOrBearer (accepts Origin-match OR bearer token) so the
|
||||
|
||||
Loading…
Reference in New Issue
Block a user