fix(terminal-diagnose): KI-005 hierarchy check + race-free stderr capture

Two fixes from /code-review-and-quality on PR #2445:

1. **KI-005 hierarchy check parity with /terminal**

   HandleConnect runs the KI-005 cross-workspace guard before dispatch
   (terminal.go:85-106): when X-Workspace-ID is set and != :id, validate
   the bearer's workspace binding then call canCommunicateCheck. Without
   this, an org-level token holder in tenant Foo can probe any
   workspace's diagnostic state by guessing the UUID — same enumeration
   vector KI-005 closed for /terminal in #1609. Per-workspace bearer
   tokens are URL-bound by WorkspaceAuth, so the gap is org tokens
   within the same tenant.

   Fix: copy the same gate into HandleDiagnose, before the
   instance_id SELECT.

   Test: TestHandleDiagnose_KI005_RejectsCrossWorkspace stubs
   canCommunicateCheck=false and confirms 403 fires before the DB
   lookup (sqlmock's ExpectationsWereMet pins that we never reached
   the SELECT COALESCE). Mirrors the existing
   TestTerminalConnect_KI005_RejectsUnauthorizedCrossWorkspace.

2. **Race-free tunnel stderr capture (syncBuf)**

   strings.Builder isn't goroutine-safe. os/exec spawns a background
   goroutine that copies the subprocess's stderr fd to cmd.Stderr's
   Write, so reading the buffer's String() from the request goroutine
   on wait-for-port timeout while the tunnel may still be writing is
   a data race that `go test -race` flags. Worst-case impact in
   production is a garbled Detail string (not a crash), but the fix
   is small.

   Fix: wrap bytes.Buffer in a sync.Mutex (syncBuf type). Same
   io.Writer interface, no API changes elsewhere.

3. **Nit cleanup**

   - read-pubkey failure now reports as its own step name instead of
     a duplicated "ssh-keygen" entry — disambiguates two different
     failure modes that previously shared a name.
   - Replaced numToString hand-rolled int-to-string with strconv.Itoa
     in the test (no import savings reason existed).

Suite: 4 diagnose tests pass with -race; full handlers suite passes
in 3.95s. go vet clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-30 21:19:18 -07:00
parent d012a803e4
commit b9311134cf
2 changed files with 104 additions and 27 deletions

View File

@ -1,19 +1,48 @@
package handlers
import (
"bytes"
"context"
"fmt"
"net/http"
"os"
"os/exec"
"strings"
"sync"
"time"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
"github.com/gin-gonic/gin"
)
// syncBuf is a goroutine-safe writer that wraps bytes.Buffer with a mutex.
// Used to capture subprocess stderr without racing the os/exec stderr-copy
// goroutine: ``cmd.Stderr = io.Writer`` spawns a background goroutine that
// reads from the subprocess's stderr fd and calls Write on our writer, so
// reading the buffer from another goroutine (e.g., on wait-for-port
// timeout while the tunnel may still be writing) without synchronization
// is a data race that ``go test -race`` would flag. ``strings.Builder``
// and bare ``bytes.Buffer`` aren't goroutine-safe; this tiny shim is the
// cheapest fix.
type syncBuf struct {
mu sync.Mutex
b bytes.Buffer
}
func (s *syncBuf) Write(p []byte) (int, error) {
s.mu.Lock()
defer s.mu.Unlock()
return s.b.Write(p)
}
func (s *syncBuf) String() string {
s.mu.Lock()
defer s.mu.Unlock()
return s.b.String()
}
// HandleDiagnose handles GET /workspaces/:id/terminal/diagnose. It runs the
// same per-step pipeline as HandleConnect (ssh-keygen → EIC send-key → tunnel
// → ssh) but non-interactively, captures the first failing step and its
@ -45,6 +74,29 @@ func (h *TerminalHandler) HandleDiagnose(c *gin.Context) {
ctx, cancel := context.WithTimeout(c.Request.Context(), 30*time.Second)
defer cancel()
// KI-005 hierarchy check — same shape as HandleConnect. Without this,
// an org-level token holder can probe any workspace in their tenant by
// guessing the UUID, learning its diagnostic state (which IAM call
// fails, what sshd says) even when they don't own it. Per-workspace
// bearer tokens are already URL-bound by WorkspaceAuth, so the gap is
// org tokens — same vector KI-005 closed for /terminal (#1609).
callerID := c.GetHeader("X-Workspace-ID")
if callerID != "" && callerID != workspaceID {
tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization"))
if tok != "" {
if err := wsauth.ValidateToken(ctx, db.DB, callerID, tok); err != nil {
if c.GetString("org_token_id") == "" {
c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid token for claimed workspace"})
return
}
}
}
if !canCommunicateCheck(callerID, workspaceID) {
c.JSON(http.StatusForbidden, gin.H{"error": "not authorized to diagnose this workspace's terminal"})
return
}
}
var instanceID string
_ = db.DB.QueryRowContext(ctx,
`SELECT COALESCE(instance_id, '') FROM workspaces WHERE id = $1`,
@ -155,8 +207,8 @@ func (h *TerminalHandler) diagnoseRemote(ctx context.Context, workspaceID, insta
pubKey, err := os.ReadFile(keyPath + ".pub")
if err != nil {
return stop("ssh-keygen", diagnoseStep{
Name: "ssh-keygen",
return stop("read-pubkey", diagnoseStep{
Name: "read-pubkey",
Error: fmt.Sprintf("read pubkey: %v", err),
})
}
@ -201,7 +253,7 @@ func (h *TerminalHandler) diagnoseRemote(ctx context.Context, workspaceID, insta
t0 = time.Now()
tunnel := openTunnelCmd(opts)
tunnel.Env = os.Environ()
var tunnelStderr strings.Builder
var tunnelStderr syncBuf
tunnel.Stderr = &tunnelStderr
if err := tunnel.Start(); err != nil {
return stop("open-tunnel", diagnoseStep{

View File

@ -6,6 +6,7 @@ import (
"errors"
"net/http/httptest"
"os/exec"
"strconv"
"testing"
"github.com/DATA-DOG/go-sqlmock"
@ -111,6 +112,53 @@ func TestHandleDiagnose_RoutesToLocal(t *testing.T) {
}
}
// TestHandleDiagnose_KI005_RejectsCrossWorkspace — the diagnostic endpoint
// has the same cross-workspace info-leak surface as /terminal had before
// #1609. Without KI-005, an org-level token holder could probe any
// workspace in their tenant by guessing the UUID, learning which IAM call
// fails or which sshd error fires. This test pins that HandleDiagnose
// applies the same hierarchy guard as HandleConnect (parity: ws-attacker
// claiming X-Workspace-ID against /workspaces/ws-victim/terminal/diagnose
// must 403, never reaching the SELECT COALESCE for instance_id).
func TestHandleDiagnose_KI005_RejectsCrossWorkspace(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
// Stub CanCommunicate to deny. Reset after — same pattern as the
// HandleConnect KI-005 tests.
prev := canCommunicateCheck
canCommunicateCheck = func(callerID, targetID string) bool { return false }
defer func() { canCommunicateCheck = prev }()
// Token validation: caller's bearer is bound to ws-attacker.
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id\s+FROM workspace_auth_tokens t`).
WithArgs(sqlmock.AnyArg()).
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-attacker"))
mock.ExpectExec(`UPDATE workspace_auth_tokens SET last_used_at`).
WithArgs(sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
h := NewTerminalHandler(nil)
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "ws-victim"}}
c.Request = httptest.NewRequest("GET", "/workspaces/ws-victim/terminal/diagnose", nil)
c.Request.Header.Set("X-Workspace-ID", "ws-attacker")
c.Request.Header.Set("Authorization", "Bearer attacker-token")
h.HandleDiagnose(c)
if w.Code != 403 {
t.Errorf("cross-workspace diagnose: got %d, want 403 (%s)", w.Code, w.Body.String())
}
// Critically: the SELECT COALESCE for instance_id must NOT have run —
// no expectation was set for it. ExpectationsWereMet ensures we
// rejected before reaching the DB lookup.
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations (rejection should fire before instance_id lookup): %v", err)
}
}
// TestDiagnoseRemote_StopsAtSSHProbe — full happy path through send-key,
// pick-port, open-tunnel, wait-for-port, then stub the ssh probe to fail.
// Confirms first_failure surfaces the actual ssh stderr ("Permission
@ -144,7 +192,7 @@ func TestDiagnoseRemote_StopsAtSSHProbe(t *testing.T) {
// fall back to a portable busy-wait).
return exec.Command("sh", "-c",
`port="$1"; while true; do nc -l "$port" >/dev/null 2>&1 || true; done`,
"sh", numToString(o.LocalPort))
"sh", strconv.Itoa(o.LocalPort))
}
defer func() { openTunnelCmd = prevTun }()
@ -197,26 +245,3 @@ func TestDiagnoseRemote_StopsAtSSHProbe(t *testing.T) {
}
}
// numToString is a tiny helper to avoid pulling fmt into the test for one
// integer-to-string call. Same observable behavior as strconv.Itoa.
func numToString(n int) string {
if n == 0 {
return "0"
}
var buf [20]byte
i := len(buf)
neg := n < 0
if neg {
n = -n
}
for n > 0 {
i--
buf[i] = byte('0' + n%10)
n /= 10
}
if neg {
i--
buf[i] = '-'
}
return string(buf[i:])
}