molecule-core/workspace-server/internal/handlers/discovery.go
Hongming Wang 2b76f7dfcb fix(discovery): isSafeURL guard on registered URLs (closes #1484)
#1484 flagged that discoverHostPeer() and writeExternalWorkspaceURL()
return URLs sourced from the workspaces table without an isSafeURL
check. Workspace runtimes register their own URLs via /registry/register
— a misbehaving / compromised runtime could register a metadata-IP URL.
Today both functions are gated by Phase 30.6 bearer-required Discover,
so exposure is theoretical. The fix makes them safe regardless of
upstream auth shape.

Changes:
- discoverHostPeer: isSafeURL on resolved URL before responding;
  503 + log on rejection.
- writeExternalWorkspaceURL: same guard applied to the post-rewrite
  outURL (so a host.docker.internal rewrite is checked AND a
  metadata-IP that survived the rewrite untouched is rejected).
- 3 new regression tests:
  * RejectsMetadataIPURL on host-peer path (169.254.169.254 → 503)
  * AcceptsPublicURL on host-peer path (8.8.8.8 → 200; positive
    counterpart so the rejection test can't pass via universal-fail)
  * RejectsMetadataIPURL on external-workspace path

setupTestDB already disables SSRF checks via setSSRFCheckForTest,
so the 16+ existing discovery tests remain untouched. Only the new
tests opt in to enabled SSRF.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 06:50:36 -07:00

436 lines
16 KiB
Go

package handlers
import (
"context"
"database/sql"
"encoding/json"
"errors"
"log"
"net/http"
"strings"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/middleware"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/registry"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
"github.com/gin-gonic/gin"
)
// DiscoveryHandler groups the registry discovery endpoints (Discover, Peers
// and CheckAccess). It holds no state of its own.
type DiscoveryHandler struct{}

// NewDiscoveryHandler returns a ready-to-use DiscoveryHandler.
func NewDiscoveryHandler() *DiscoveryHandler {
	return new(DiscoveryHandler)
}
// Discover handles GET /registry/discover/:id
//
// The caller identifies itself through the X-Workspace-ID header. The
// request is answered with 400 when that header is missing, with whatever
// validateDiscoveryCaller wrote when authentication fails, and with 403 when
// registry.CanCommunicate denies the caller/target pair. Otherwise the
// lookup is delegated to the peer-resolution helpers.
func (h *DiscoveryHandler) Discover(c *gin.Context) {
	ctx := c.Request.Context()
	targetID := c.Param("id")
	callerID := c.GetHeader("X-Workspace-ID")

	if callerID == "" {
		c.JSON(http.StatusBadRequest, gin.H{"error": "X-Workspace-ID header is required"})
		return
	}

	// Phase 30.6 — the caller's bearer token is verified before any peer URL
	// is revealed; otherwise any host that knows a workspace ID could
	// enumerate siblings. Legacy workspaces (no live tokens) are
	// grandfathered the same way heartbeat does it.
	if authErr := validateDiscoveryCaller(ctx, c, callerID); authErr != nil {
		return // response already written
	}

	if allowed := registry.CanCommunicate(callerID, targetID); !allowed {
		c.JSON(http.StatusForbidden, gin.H{"error": "not authorized to discover this workspace"})
		return
	}

	// Host/canvas branch. The header check above already rejected an empty
	// callerID, so this branch is kept only to mirror the original routing.
	if callerID == "" {
		discoverHostPeer(ctx, c, targetID)
		return
	}

	// Workspace-to-workspace: return the Docker-internal URL (containers
	// can't reach host ports). External targets get their registered URL,
	// with 127.0.0.1/localhost rewritten to host.docker.internal when the
	// caller is itself a Docker container.
	discoverWorkspacePeer(ctx, c, callerID, targetID)
}
// discoverHostPeer handles the canvas/external (no X-Workspace-ID) branch of
// Discover. It returns the host-accessible URL for `targetID`, following any
// forwarding chain (max 5 hops). Currently unreachable because Discover
// requires the X-Workspace-ID header up front, but kept to preserve the
// original code path in case the requirement is relaxed.
//
// Responses:
//   - 200 with {id, url} on a cache hit whose URL passes isSafeURL
//   - 200 with {id, url, status} after a successful database resolution
//   - 404 when the target row does not exist
//   - 500 when the initial lookup fails for any other reason
//   - 503 when the resolved workspace has no URL or the URL fails isSafeURL
//
// If a forwarding hop cannot be read, resolution stops at the last workspace
// that WAS read successfully, so the id, url and status in the response (and
// the cache entry) always describe the same row.
func discoverHostPeer(ctx context.Context, c *gin.Context, targetID string) {
	// Upper bound on forwarded_to hops so a cycle cannot loop forever.
	const maxForwardHops = 5
	const lookupSQL = `SELECT url, status, forwarded_to FROM workspaces WHERE id = $1`

	if cached, err := db.GetCachedURL(ctx, targetID); err == nil {
		// #1484: a cached URL gets the same guard as a freshly resolved one.
		// An entry that fails the check is ignored and the database path
		// below (which validates again) decides the response.
		safeErr := isSafeURL(cached)
		if safeErr == nil {
			c.JSON(http.StatusOK, gin.H{"id": targetID, "url": cached})
			return
		}
		log.Printf("Discovery: ignoring unsafe cached URL for %s (#1484): %v", targetID, safeErr)
	}

	var (
		wsURL       sql.NullString
		status      string
		forwardedTo sql.NullString
	)
	err := db.DB.QueryRowContext(ctx, lookupSQL, targetID).Scan(&wsURL, &status, &forwardedTo)
	if errors.Is(err, sql.ErrNoRows) {
		c.JSON(http.StatusNotFound, gin.H{"error": "workspace not found"})
		return
	}
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "lookup failed"})
		return
	}

	// Follow the forwarding chain. Each hop is scanned into temporaries and
	// only committed once the row was read, so a failed hop cannot leave
	// resolvedID pointing at a workspace whose URL we never saw.
	resolvedID := targetID
	for hop := 0; hop < maxForwardHops && forwardedTo.Valid && forwardedTo.String != ""; hop++ {
		nextID := forwardedTo.String
		var (
			nextURL     sql.NullString
			nextStatus  string
			nextForward sql.NullString
		)
		if hopErr := db.DB.QueryRowContext(ctx, lookupSQL, nextID).Scan(&nextURL, &nextStatus, &nextForward); hopErr != nil {
			log.Printf("Discovery: forwarding hop %s -> %s failed, answering for %s: %v", resolvedID, nextID, resolvedID, hopErr)
			break
		}
		resolvedID, wsURL, status, forwardedTo = nextID, nextURL, nextStatus, nextForward
	}

	if !wsURL.Valid || wsURL.String == "" {
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace has no URL", "status": status})
		return
	}

	// #1484 SSRF defense-in-depth: the URL came from the workspaces table
	// without any per-write validation (workspace runtimes register their
	// own URLs via /registry/register, and a misbehaving / compromised
	// runtime could register a 169.254.169.254 metadata URL). Validate
	// before handing it to the caller, who might dispatch HTTP against it.
	// Currently gated by the bearer-required Discover handler, but this
	// guard makes discoverHostPeer safe regardless of upstream auth shape.
	if err := isSafeURL(wsURL.String); err != nil {
		log.Printf("Discovery: rejecting unsafe registered URL for %s (#1484): %v", resolvedID, err)
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace URL failed safety check", "status": status})
		return
	}

	db.CacheURL(ctx, resolvedID, wsURL.String)
	c.JSON(http.StatusOK, gin.H{
		"id":     resolvedID,
		"url":    wsURL.String,
		"status": status,
	})
}
// discoverWorkspacePeer serves the workspace-to-workspace branch of Discover:
// it resolves a URL for `targetID` as seen from `callerID` and writes the JSON
// response itself.
//
// Resolution order:
//  1. external-runtime targets: their registered URL (via
//     writeExternalWorkspaceURL), if one is on file
//  2. the cached internal URL
//  3. a synthesized provisioner.InternalURL, only while the target's status
//     is "online" or "degraded"
//
// A target that exists but is not online/degraded yields 503; a failed status
// lookup yields 404.
func discoverWorkspacePeer(ctx context.Context, c *gin.Context, callerID, targetID string) {
	// The Scan error is deliberately not inspected: on failure both values
	// stay empty and the target is handled as a non-external workspace.
	var wsName, wsRuntime string
	db.DB.QueryRowContext(ctx, `SELECT COALESCE(name,''), COALESCE(runtime,'langgraph') FROM workspaces WHERE id = $1`, targetID).Scan(&wsName, &wsRuntime)

	// External workspaces: return their registered URL.
	// 127.0.0.1/localhost is rewritten to host.docker.internal ONLY when
	// the caller itself is a Docker container; a remote (external) caller
	// lives on the other side of the wire and needs the URL as-is.
	// Phase 30.6.
	if wsRuntime == "external" && writeExternalWorkspaceURL(ctx, c, callerID, targetID, wsName) {
		return
	}

	// Cached internal URL wins when present.
	if cached, cacheErr := db.GetCachedInternalURL(ctx, targetID); cacheErr == nil && cached != "" {
		c.JSON(http.StatusOK, gin.H{"id": targetID, "url": cached, "name": wsName})
		return
	}

	// Fallback: a URL is synthesized only for a workspace that exists and
	// is online/degraded.
	var wsStatus string
	dbErr := db.DB.QueryRowContext(ctx,
		`SELECT status FROM workspaces WHERE id = $1`, targetID,
	).Scan(&wsStatus)

	switch {
	case dbErr != nil:
		c.JSON(http.StatusNotFound, gin.H{"error": "workspace not found"})
	case wsStatus == "online" || wsStatus == "degraded":
		synthesized := provisioner.InternalURL(targetID)
		if cacheErr := db.CacheInternalURL(ctx, targetID, synthesized); cacheErr != nil {
			log.Printf("Discovery: failed to cache internal URL for %s: %v", targetID, cacheErr)
		}
		c.JSON(http.StatusOK, gin.H{"id": targetID, "url": synthesized, "name": wsName})
	default:
		// Workspace is not reachable — don't fall through to host URL path.
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace not available", "status": wsStatus})
	}
}
// writeExternalWorkspaceURL looks up the registered URL of an external-runtime
// target and answers the request with it.
//
// It reports true whenever it wrote a response — 200 with the URL, or 503
// when the URL fails isSafeURL — and false when the target has no URL on
// file, in which case the caller continues with the internal-URL path.
//
// NOTE(review): the loopback rewrite is a first-occurrence substring
// replacement over the whole URL string, not a host-aware rewrite.
func writeExternalWorkspaceURL(ctx context.Context, c *gin.Context, callerID, targetID, wsName string) bool {
	var registered string
	db.DB.QueryRowContext(ctx, `SELECT COALESCE(url,'') FROM workspaces WHERE id = $1`, targetID).Scan(&registered)
	if registered == "" {
		return false
	}

	var callerRuntime string
	db.DB.QueryRowContext(ctx, `SELECT COALESCE(runtime,'langgraph') FROM workspaces WHERE id = $1`, callerID).Scan(&callerRuntime)

	// Only non-external callers get loopback addresses pointed at
	// host.docker.internal; external callers receive the URL untouched.
	outURL := registered
	if callerRuntime != "external" {
		for _, loopback := range []string{"127.0.0.1", "localhost"} {
			outURL = strings.Replace(outURL, loopback, "host.docker.internal", 1)
		}
	}

	// #1484 SSRF defense-in-depth — same rationale as discoverHostPeer.
	// The post-rewrite URL is what gets validated, because the rewrite
	// changes which host the caller would dispatch against; a metadata IP
	// that survived the rewrite untouched is rejected here.
	if safeErr := isSafeURL(outURL); safeErr != nil {
		log.Printf("Discovery: rejecting unsafe external workspace URL for %s (#1484): %v", targetID, safeErr)
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace URL failed safety check"})
		return true
	}

	c.JSON(http.StatusOK, gin.H{"id": targetID, "url": outURL, "name": wsName})
	return true
}
// Peers handles GET /registry/:id/peers
//
// The response is a JSON array holding the workspace's siblings, children
// and parent (in that order), excluding rows whose status is 'removed'. A
// workspace without a parent treats the other parentless workspaces as its
// siblings.
//
// Optional “?q=<substring>“ narrows the result by case-insensitive
// substring match against “name“ or “role“ (#1038). The filter runs in
// Go after the DB read, so the SQL is the same with or without a filter;
// the cost is loading the unfiltered set first, which is acceptable because
// the peer set is bounded by one workspace's parent + children + siblings.
func (h *DiscoveryHandler) Peers(c *gin.Context) {
	// Shared head of every peer query; each call site appends its own
	// WHERE predicate.
	const peerSelect = "\n" +
		"SELECT w.id, w.name, COALESCE(w.role, ''), w.tier, w.status,\n" +
		"COALESCE(w.agent_card, 'null'::jsonb), COALESCE(w.url, ''),\n" +
		"w.parent_id, w.active_tasks\n" +
		"FROM workspaces w WHERE "

	ctx := c.Request.Context()
	workspaceID := c.Param("id")

	// Phase 30.6 — the peer list leaks sibling identities and URLs, so the
	// bearer token bound to `workspaceID` is required. The caller is taken
	// from the URL path param rather than a header because this route is
	// scoped to "my own peers".
	if authErr := validateDiscoveryCaller(ctx, c, workspaceID); authErr != nil {
		return // response already written
	}

	var parentID sql.NullString
	err := db.DB.QueryRowContext(ctx, `SELECT parent_id FROM workspaces WHERE id = $1`, workspaceID).
		Scan(&parentID)
	switch {
	case err == sql.ErrNoRows:
		c.JSON(http.StatusNotFound, gin.H{"error": "workspace not found"})
		return
	case err != nil:
		c.JSON(http.StatusInternalServerError, gin.H{"error": "lookup failed"})
		return
	}

	var peers []map[string]interface{}
	// gather appends whatever rows the query produced; the error from
	// queryPeerMaps is discarded here.
	gather := func(query string, args ...interface{}) {
		found, _ := queryPeerMaps(query, args...)
		peers = append(peers, found...)
	}

	hasParent := parentID.Valid

	// Siblings
	if hasParent {
		gather(peerSelect+"w.parent_id = $1 AND w.id != $2 AND w.status != 'removed'",
			parentID.String, workspaceID)
	} else {
		gather(peerSelect+"w.parent_id IS NULL AND w.id != $1 AND w.status != 'removed'",
			workspaceID)
	}

	// Children
	gather(peerSelect+"w.parent_id = $1 AND w.status != 'removed'", workspaceID)

	// Parent
	if hasParent {
		gather(peerSelect+"w.id = $1 AND w.status != 'removed'", parentID.String)
	}

	peers = filterPeersByQuery(peers, c.Query("q"))
	if peers == nil {
		// Encode "no peers" as [] rather than null.
		peers = make([]map[string]interface{}, 0)
	}
	c.JSON(http.StatusOK, peers)
}
// filterPeersByQuery returns the peers whose "name" or "role" contains q,
// compared case-insensitively.
//
// A q that is empty after trimming whitespace is a no-op: the input slice is
// returned unchanged (including nil). Otherwise a new, non-nil slice is
// returned, preserving input order.
//
// A "name" or "role" entry that is missing, nil, or not a string is treated
// as the empty string. queryPeerMaps stores nil for an empty role, so a bare
// type assertion here would panic on such peers.
func filterPeersByQuery(peers []map[string]interface{}, q string) []map[string]interface{} {
	needle := strings.ToLower(strings.TrimSpace(q))
	if needle == "" {
		return peers
	}

	out := make([]map[string]interface{}, 0, len(peers))
	for _, p := range peers {
		// Comma-ok form: a nil or non-string value yields "" instead of a panic.
		name, _ := p["name"].(string)
		role, _ := p["role"].(string)
		if strings.Contains(strings.ToLower(name), needle) ||
			strings.Contains(strings.ToLower(role), needle) {
			out = append(out, p)
		}
	}
	return out
}
// queryPeerMaps runs query with args and returns one JSON-serializable map per
// row instead of Workspace structs.
//
// The query must select, in order: id, name, role, tier, status, agent_card,
// url, parent_id, active_tasks. Each map carries the keys "id", "name",
// "tier", "status", "url", "parent_id", "active_tasks", "role" (nil when the
// role is empty) and "agent_card" (nil when the column is empty or JSON null,
// otherwise the raw JSON).
//
// Errors are logged. A failed query returns (nil, err). A row that fails to
// scan is skipped. If iteration itself stops early, the rows read so far are
// returned together with the error, so callers that discard the error keep
// the partial result.
func queryPeerMaps(query string, args ...interface{}) ([]map[string]interface{}, error) {
	rows, err := db.DB.Query(query, args...)
	if err != nil {
		log.Printf("queryPeerMaps error: %v", err)
		return nil, err
	}
	defer rows.Close()

	var result []map[string]interface{}
	for rows.Next() {
		var (
			id, name, role, status, url string
			tier, activeTasks           int
			parentID                    *string
			agentCard                   []byte
		)
		if err := rows.Scan(&id, &name, &role, &tier, &status, &agentCard, &url, &parentID, &activeTasks); err != nil {
			log.Printf("queryPeerMaps scan error: %v", err)
			continue
		}

		peer := map[string]interface{}{
			"id":           id,
			"name":         name,
			"tier":         tier,
			"status":       status,
			"url":          url,
			"parent_id":    parentID,
			"active_tasks": activeTasks,
			"role":         nil,
			"agent_card":   nil,
		}
		if role != "" {
			peer["role"] = role
		}
		if len(agentCard) > 0 && string(agentCard) != "null" {
			peer["agent_card"] = json.RawMessage(agentCard)
		}
		result = append(result, peer)
	}

	// rows.Next returning false can mean "done" or "failed"; only rows.Err
	// tells them apart.
	if err := rows.Err(); err != nil {
		log.Printf("queryPeerMaps iteration error: %v", err)
		return result, err
	}
	return result, nil
}
// CheckAccess handles POST /registry/check-access
//
// The JSON body must carry both caller_id and target_id; a body that fails
// to bind is answered with 400. Otherwise the response is
// {"allowed": <bool>} as decided by registry.CanCommunicate.
func (h *DiscoveryHandler) CheckAccess(c *gin.Context) {
	type checkAccessRequest struct {
		CallerID string `json:"caller_id" binding:"required"`
		TargetID string `json:"target_id" binding:"required"`
	}

	var req checkAccessRequest
	if bindErr := c.ShouldBindJSON(&req); bindErr != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request body"})
		return
	}

	c.JSON(http.StatusOK, gin.H{"allowed": registry.CanCommunicate(req.CallerID, req.TargetID)})
}
// validateDiscoveryCaller enforces the Phase 30.6 bearer-token contract
// on the discovery endpoints. Same lazy-bootstrap shape as the registry
// and secrets handlers: legacy workspaces with no tokens are grandfathered,
// workspaces with tokens must present a matching Bearer, token binding
// is strict (A's token cannot authenticate caller B).
//
// Fail-open on DB hiccups. Unlike secrets.Values (which returns plaintext
// secrets and must fail closed), discovery only exposes peer URLs that
// are already behind the existing `CanCommunicate` hierarchy check — a
// momentary DB outage shouldn't take agent-to-agent discovery offline.
//
// Return contract: nil means the request may proceed and nothing has been
// written to c. A non-nil error means a 401 JSON response has ALREADY been
// written to c, so the caller must simply return.
//
// Checks run in this order:
//  1. HasAnyLiveToken fails          -> allow (logged)
//  2. workspace has no live token    -> allow
//  3. middleware.IsDevModeFailOpen() -> allow
//  4. Cookie header with a presented session -> allow if valid, else 401
//  5. no bearer token                -> 401 (after one more session check)
//  6. bearer token                   -> allow if ValidateToken accepts it for
//     workspaceID, else 401
func validateDiscoveryCaller(ctx context.Context, c *gin.Context, workspaceID string) error {
hasLive, err := wsauth.HasAnyLiveToken(ctx, db.DB, workspaceID)
if err != nil {
// Fail open: a token-store error must not take discovery offline.
log.Printf("wsauth: discovery HasAnyLiveToken(%s) failed: %v — allowing request", workspaceID, err)
return nil
}
if !hasLive {
return nil // legacy / pre-upgrade
}
// Tier-1b dev-mode hatch — same escape hatch AdminAuth and
// WorkspaceAuth apply on a local Docker setup. Without this, the
// canvas Details tab can never load peers for a workspace that has
// registered its live token, producing the 401 the user sees.
// Gated by MOLECULE_ENV=development + empty ADMIN_TOKEN, so SaaS
// production stays strict.
if middleware.IsDevModeFailOpen() {
return nil
}
// Try session cookie auth first (SaaS canvas path).
// middleware.VerifiedCPSession returns (valid, presented):
// - (false, false) = no session cookie, fall through to bearer
// - (true, true) = valid session, allow
// - (false, true) = cookie presented but invalid, 401
if cookieHeader := c.GetHeader("Cookie"); cookieHeader != "" {
if ok, presented := middleware.VerifiedCPSession(cookieHeader); presented {
if ok {
return nil // session verified, allow
}
c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid session"})
return errors.New("invalid session")
}
}
tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization"))
if tok == "" {
// Canvas hits this endpoint via session cookie, not bearer token.
// middleware.VerifiedCPSession returns (valid, presented):
// - (false, false) = no session cookie, 401
// - (true, true) = valid session, allow
// - (false, true) = cookie presented but invalid, 401
//
// NOTE(review): this repeats the session check made above on the same
// Cookie header. Every presented session already returned up there, so
// this call presumably can only report presented == false here —
// confirm VerifiedCPSession is deterministic and side-effect free
// before removing the duplication.
if ok, presented := middleware.VerifiedCPSession(c.GetHeader("Cookie")); presented {
if ok {
return nil
}
c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid session"})
return errors.New("invalid session")
}
c.JSON(http.StatusUnauthorized, gin.H{"error": "missing workspace auth token"})
return errors.New("missing token")
}
// The token is validated against workspaceID specifically, so a token
// issued to another workspace is rejected.
if err := wsauth.ValidateToken(ctx, db.DB, workspaceID, tok); err != nil {
c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid workspace auth token"})
return err
}
return nil
}