forked from molecule-ai/molecule-core
fix(terminal-diagnose): KI-005 hierarchy check + race-free stderr capture
Two fixes from /code-review-and-quality on PR #2445: 1. **KI-005 hierarchy check parity with /terminal** HandleConnect runs the KI-005 cross-workspace guard before dispatch (terminal.go:85-106): when X-Workspace-ID is set and != :id, validate the bearer's workspace binding then call canCommunicateCheck. Without this, an org-level token holder in tenant Foo can probe any workspace's diagnostic state by guessing the UUID — same enumeration vector KI-005 closed for /terminal in #1609. Per-workspace bearer tokens are URL-bound by WorkspaceAuth, so the gap is org tokens within the same tenant. Fix: copy the same gate into HandleDiagnose, before the instance_id SELECT. Test: TestHandleDiagnose_KI005_RejectsCrossWorkspace stubs canCommunicateCheck=false and confirms 403 fires before the DB lookup (sqlmock's ExpectationsWereMet pins that we never reached the SELECT COALESCE). Mirrors the existing TestTerminalConnect_KI005_RejectsUnauthorizedCrossWorkspace. 2. **Race-free tunnel stderr capture (syncBuf)** strings.Builder isn't goroutine-safe. os/exec spawns a background goroutine that copies the subprocess's stderr fd to cmd.Stderr's Write, so reading the buffer's String() from the request goroutine on wait-for-port timeout while the tunnel may still be writing is a data race that `go test -race` flags. Worst-case impact in production is a garbled Detail string (not a crash), but the fix is small. Fix: wrap bytes.Buffer in a sync.Mutex (syncBuf type). Same io.Writer interface, no API changes elsewhere. 3. **Nit cleanup** - read-pubkey failure now reports as its own step name instead of a duplicated "ssh-keygen" entry — disambiguates two different failure modes that previously shared a name. - Replaced numToString hand-rolled int-to-string with strconv.Itoa in the test (no import savings reason existed). Suite: 4 diagnose tests pass with -race; full handlers suite passes in 3.95s. go vet clean. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d012a803e4
commit
b9311134cf
@ -1,19 +1,48 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// syncBuf is a goroutine-safe writer that wraps bytes.Buffer with a mutex.
|
||||
// Used to capture subprocess stderr without racing the os/exec stderr-copy
|
||||
// goroutine: ``cmd.Stderr = io.Writer`` spawns a background goroutine that
|
||||
// reads from the subprocess's stderr fd and calls Write on our writer, so
|
||||
// reading the buffer from another goroutine (e.g., on wait-for-port
|
||||
// timeout while the tunnel may still be writing) without synchronization
|
||||
// is a data race that ``go test -race`` would flag. ``strings.Builder``
|
||||
// and bare ``bytes.Buffer`` aren't goroutine-safe; this tiny shim is the
|
||||
// cheapest fix.
|
||||
type syncBuf struct {
|
||||
mu sync.Mutex
|
||||
b bytes.Buffer
|
||||
}
|
||||
|
||||
func (s *syncBuf) Write(p []byte) (int, error) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
return s.b.Write(p)
|
||||
}
|
||||
|
||||
func (s *syncBuf) String() string {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
return s.b.String()
|
||||
}
|
||||
|
||||
// HandleDiagnose handles GET /workspaces/:id/terminal/diagnose. It runs the
|
||||
// same per-step pipeline as HandleConnect (ssh-keygen → EIC send-key → tunnel
|
||||
// → ssh) but non-interactively, captures the first failing step and its
|
||||
@ -45,6 +74,29 @@ func (h *TerminalHandler) HandleDiagnose(c *gin.Context) {
|
||||
ctx, cancel := context.WithTimeout(c.Request.Context(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// KI-005 hierarchy check — same shape as HandleConnect. Without this,
|
||||
// an org-level token holder can probe any workspace in their tenant by
|
||||
// guessing the UUID, learning its diagnostic state (which IAM call
|
||||
// fails, what sshd says) even when they don't own it. Per-workspace
|
||||
// bearer tokens are already URL-bound by WorkspaceAuth, so the gap is
|
||||
// org tokens — same vector KI-005 closed for /terminal (#1609).
|
||||
callerID := c.GetHeader("X-Workspace-ID")
|
||||
if callerID != "" && callerID != workspaceID {
|
||||
tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization"))
|
||||
if tok != "" {
|
||||
if err := wsauth.ValidateToken(ctx, db.DB, callerID, tok); err != nil {
|
||||
if c.GetString("org_token_id") == "" {
|
||||
c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid token for claimed workspace"})
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
if !canCommunicateCheck(callerID, workspaceID) {
|
||||
c.JSON(http.StatusForbidden, gin.H{"error": "not authorized to diagnose this workspace's terminal"})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
var instanceID string
|
||||
_ = db.DB.QueryRowContext(ctx,
|
||||
`SELECT COALESCE(instance_id, '') FROM workspaces WHERE id = $1`,
|
||||
@ -155,8 +207,8 @@ func (h *TerminalHandler) diagnoseRemote(ctx context.Context, workspaceID, insta
|
||||
|
||||
pubKey, err := os.ReadFile(keyPath + ".pub")
|
||||
if err != nil {
|
||||
return stop("ssh-keygen", diagnoseStep{
|
||||
Name: "ssh-keygen",
|
||||
return stop("read-pubkey", diagnoseStep{
|
||||
Name: "read-pubkey",
|
||||
Error: fmt.Sprintf("read pubkey: %v", err),
|
||||
})
|
||||
}
|
||||
@ -201,7 +253,7 @@ func (h *TerminalHandler) diagnoseRemote(ctx context.Context, workspaceID, insta
|
||||
t0 = time.Now()
|
||||
tunnel := openTunnelCmd(opts)
|
||||
tunnel.Env = os.Environ()
|
||||
var tunnelStderr strings.Builder
|
||||
var tunnelStderr syncBuf
|
||||
tunnel.Stderr = &tunnelStderr
|
||||
if err := tunnel.Start(); err != nil {
|
||||
return stop("open-tunnel", diagnoseStep{
|
||||
|
||||
@ -6,6 +6,7 @@ import (
|
||||
"errors"
|
||||
"net/http/httptest"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"testing"
|
||||
|
||||
"github.com/DATA-DOG/go-sqlmock"
|
||||
@ -111,6 +112,53 @@ func TestHandleDiagnose_RoutesToLocal(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestHandleDiagnose_KI005_RejectsCrossWorkspace — the diagnostic endpoint
|
||||
// has the same cross-workspace info-leak surface as /terminal had before
|
||||
// #1609. Without KI-005, an org-level token holder could probe any
|
||||
// workspace in their tenant by guessing the UUID, learning which IAM call
|
||||
// fails or which sshd error fires. This test pins that HandleDiagnose
|
||||
// applies the same hierarchy guard as HandleConnect (parity: ws-attacker
|
||||
// claiming X-Workspace-ID against /workspaces/ws-victim/terminal/diagnose
|
||||
// must 403, never reaching the SELECT COALESCE for instance_id).
|
||||
func TestHandleDiagnose_KI005_RejectsCrossWorkspace(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
// Stub CanCommunicate to deny. Reset after — same pattern as the
|
||||
// HandleConnect KI-005 tests.
|
||||
prev := canCommunicateCheck
|
||||
canCommunicateCheck = func(callerID, targetID string) bool { return false }
|
||||
defer func() { canCommunicateCheck = prev }()
|
||||
|
||||
// Token validation: caller's bearer is bound to ws-attacker.
|
||||
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id\s+FROM workspace_auth_tokens t`).
|
||||
WithArgs(sqlmock.AnyArg()).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-attacker"))
|
||||
mock.ExpectExec(`UPDATE workspace_auth_tokens SET last_used_at`).
|
||||
WithArgs(sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
h := NewTerminalHandler(nil)
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "ws-victim"}}
|
||||
c.Request = httptest.NewRequest("GET", "/workspaces/ws-victim/terminal/diagnose", nil)
|
||||
c.Request.Header.Set("X-Workspace-ID", "ws-attacker")
|
||||
c.Request.Header.Set("Authorization", "Bearer attacker-token")
|
||||
|
||||
h.HandleDiagnose(c)
|
||||
|
||||
if w.Code != 403 {
|
||||
t.Errorf("cross-workspace diagnose: got %d, want 403 (%s)", w.Code, w.Body.String())
|
||||
}
|
||||
// Critically: the SELECT COALESCE for instance_id must NOT have run —
|
||||
// no expectation was set for it. ExpectationsWereMet ensures we
|
||||
// rejected before reaching the DB lookup.
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations (rejection should fire before instance_id lookup): %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestDiagnoseRemote_StopsAtSSHProbe — full happy path through send-key,
|
||||
// pick-port, open-tunnel, wait-for-port, then stub the ssh probe to fail.
|
||||
// Confirms first_failure surfaces the actual ssh stderr ("Permission
|
||||
@ -144,7 +192,7 @@ func TestDiagnoseRemote_StopsAtSSHProbe(t *testing.T) {
|
||||
// fall back to a portable busy-wait).
|
||||
return exec.Command("sh", "-c",
|
||||
`port="$1"; while true; do nc -l "$port" >/dev/null 2>&1 || true; done`,
|
||||
"sh", numToString(o.LocalPort))
|
||||
"sh", strconv.Itoa(o.LocalPort))
|
||||
}
|
||||
defer func() { openTunnelCmd = prevTun }()
|
||||
|
||||
@ -197,26 +245,3 @@ func TestDiagnoseRemote_StopsAtSSHProbe(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// numToString is a tiny helper to avoid pulling fmt into the test for one
|
||||
// integer-to-string call. Same observable behavior as strconv.Itoa.
|
||||
func numToString(n int) string {
|
||||
if n == 0 {
|
||||
return "0"
|
||||
}
|
||||
var buf [20]byte
|
||||
i := len(buf)
|
||||
neg := n < 0
|
||||
if neg {
|
||||
n = -n
|
||||
}
|
||||
for n > 0 {
|
||||
i--
|
||||
buf[i] = byte('0' + n%10)
|
||||
n /= 10
|
||||
}
|
||||
if neg {
|
||||
i--
|
||||
buf[i] = '-'
|
||||
}
|
||||
return string(buf[i:])
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user