fix(workspace-server): exclude self from peers + agent-readable 400 (#383) #1624

Merged
core-devops merged 2 commits from fix/self-delegation-peer-list-hardening into main 2026-05-21 00:12:39 +00:00
4 changed files with 265 additions and 22 deletions
@@ -122,8 +122,22 @@ func (h *DelegationHandler) Delegate(c *gin.Context) {
// #548 — prevent self-delegation: a workspace delegating to itself
// acquires _run_lock twice on the same mutex, deadlocking permanently.
//
// #383 — the error message is the agent-visible string when this 400
// fires on the SDK's _delegate_sync_via_polling path. The previous
// terse "self-delegation not permitted" was correct but indistinct
// from a transient rate-limit or auth failure, so the LLM would
// re-attempt every 2-3s in a tight loop (chloe-dong tenant external
// workspace, 2026-05-20). The expanded message is explicit about
// (a) what just happened, (b) why it cannot succeed, (c) what to do
// instead — so the agent's retry heuristic recognizes the path as
// terminal and stops.
if sourceID == body.TargetID {
c.JSON(http.StatusBadRequest, gin.H{"error": "self-delegation not permitted"})
c.JSON(http.StatusBadRequest, gin.H{
"error": "self-delegation not permitted",
"reason": "the source workspace and target workspace are the same; you cannot delegate a task to yourself",
"hint": "do the work yourself, or pick a different peer via list_peers — retrying with the same target_id will fail every time",
})
return
}
@@ -256,24 +256,43 @@ func (h *DiscoveryHandler) Peers(c *gin.Context) {
peers = append(peers, siblings...)
}
// Children
// Children — exclude self defensively. A child row whose parent_id
// equals the requesting workspaceID can never legitimately be the
// caller (a workspace can't be its own child), but a future data-
// integrity defect (e.g. self-loop introduced by a buggy register
// path) would otherwise smuggle the caller back into its own peer
// list. The agent then attempts `delegate_task(<own_id>)`, which
// either deadlocks _run_lock (sync path) or hits the platform's
// self-delegation 400 in a tight loop (#383). The `w.id != $2`
// clause makes self-delegation-via-peer-list impossible regardless
// of DB state.
children, _ := queryPeerMaps(`
SELECT w.id, w.name, COALESCE(w.role, ''), w.tier, w.status,
COALESCE(w.agent_card, 'null'::jsonb), COALESCE(w.url, ''),
w.parent_id, w.active_tasks
FROM workspaces w WHERE w.parent_id = $1 AND w.status != 'removed'`, workspaceID)
FROM workspaces w WHERE w.parent_id = $1 AND w.id != $2 AND w.status != 'removed'`,
workspaceID, workspaceID)
peers = append(peers, children...)
// Parent
// Parent — same defense-in-depth. A workspace whose parent_id points
// to itself is data corruption, but the peer-list endpoint must not
// propagate that corruption back to the agent as a "peer who is also
// you" entry.
if parentID.Valid {
parent, _ := queryPeerMaps(`
SELECT w.id, w.name, COALESCE(w.role, ''), w.tier, w.status,
COALESCE(w.agent_card, 'null'::jsonb), COALESCE(w.url, ''),
w.parent_id, w.active_tasks
FROM workspaces w WHERE w.id = $1 AND w.status != 'removed'`, parentID.String)
FROM workspaces w WHERE w.id = $1 AND w.id != $2 AND w.status != 'removed'`,
parentID.String, workspaceID)
peers = append(peers, parent...)
}
// #383 final-line defense: even if a future code path adds a query
// that doesn't filter self, strip the caller's own row before
// returning. Cheap O(n) over a peer set bounded at <50 rows.
peers = excludeSelfFromPeers(peers, workspaceID)
peers = filterPeersByQuery(peers, c.Query("q"))
if peers == nil {
@@ -282,6 +301,32 @@ func (h *DiscoveryHandler) Peers(c *gin.Context) {
c.JSON(http.StatusOK, peers)
}
// excludeSelfFromPeers strips any peer entry whose ``id`` equals
// ``workspaceID`` (the caller's own row). Final-line defense for #383
// (self-delegation 400-loop on external workspaces): a peer-list that
// includes the requester's own row is the root mechanism by which an
// agent ends up delegating to itself. The pre-DB filters in Peers
// already enforce `w.id != $caller` on each branch; this function
// guarantees the contract holds regardless of which query path
// returned the row, including future ones added without a self-filter.
//
// O(n) over a peer set bounded at <50 rows per `Peers` comment — well
// below the hot-path overhead of the existing filterPeersByQuery.
func excludeSelfFromPeers(peers []map[string]interface{}, workspaceID string) []map[string]interface{} {
if len(peers) == 0 {
return peers
}
out := make([]map[string]interface{}, 0, len(peers))
for _, p := range peers {
id, _ := p["id"].(string)
if id == workspaceID {
continue
}
out = append(out, p)
}
return out
}
// filterPeersByQuery returns peers whose name or role case-insensitively
// contains q. Whitespace-trimmed empty q is a no-op (returns input unchanged).
func filterPeersByQuery(peers []map[string]interface{}, q string) []map[string]interface{} {
@@ -125,14 +125,14 @@ func TestPeers_WithParent(t *testing.T) {
WillReturnRows(sqlmock.NewRows(peerCols).
AddRow("ws-sibling-2", "Sibling Two", "worker", 1, "online", []byte("null"), "http://localhost:8002", "ws-parent", 0))
// Expect children query
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.status").
WithArgs("ws-sibling-1").
// Expect children query — #383 added explicit `w.id != $2` self-filter
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status").
WithArgs("ws-sibling-1", "ws-sibling-1").
WillReturnRows(sqlmock.NewRows(peerCols))
// Expect parent query
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.status").
WithArgs("ws-parent").
// Expect parent query — #383 added explicit `w.id != $2` self-filter
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.id != \\$2 AND w.status").
WithArgs("ws-parent", "ws-sibling-1").
WillReturnRows(sqlmock.NewRows(peerCols).
AddRow("ws-parent", "Parent PM", "manager", 2, "online", []byte("null"), "http://localhost:8001", nil, 1))
@@ -228,9 +228,9 @@ func TestPeers_RootWorkspace_NoPeers(t *testing.T) {
WithArgs("ws-root-alone").
WillReturnRows(sqlmock.NewRows(peerCols))
// Children — none
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1").
WithArgs("ws-root-alone").
// Children — none. #383 added explicit `w.id != $2` self-filter.
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2").
WithArgs("ws-root-alone", "ws-root-alone").
WillReturnRows(sqlmock.NewRows(peerCols))
// No parent query since parent_id is NULL
@@ -282,12 +282,14 @@ func peersFilterFixture(t *testing.T) (*DiscoveryHandler, sqlmock.Sqlmock) {
AddRow("ws-alpha", "Alpha Researcher", "researcher", 1, "online", []byte("null"), "http://a", "ws-pm", 0).
AddRow("ws-beta", "Beta Designer", "designer", 1, "online", []byte("null"), "http://b", "ws-pm", 0))
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.status").
WithArgs("ws-self").
// #383 — children query gained explicit `w.id != $2` self-filter.
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status").
WithArgs("ws-self", "ws-self").
WillReturnRows(sqlmock.NewRows(cols))
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.status").
WithArgs("ws-pm").
// #383 — parent query gained explicit `w.id != $2` self-filter.
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.id != \\$2 AND w.status").
WithArgs("ws-pm", "ws-self").
WillReturnRows(sqlmock.NewRows(cols).
AddRow("ws-pm", "PM Workspace", "manager", 2, "online", []byte("null"), "http://pm", nil, 1))
@@ -966,8 +968,9 @@ func TestPeers_DevModeFailOpen_AllowsBearerlessRequest(t *testing.T) {
mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id IS NULL AND w.id").
WithArgs("ws-dev").
WillReturnRows(sqlmock.NewRows(peerCols))
mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id = \\$1 AND w.status").
WithArgs("ws-dev").
// #383 — children query gained explicit `w.id != $2` self-filter.
mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status").
WithArgs("ws-dev", "ws-dev").
WillReturnRows(sqlmock.NewRows(peerCols))
w := httptest.NewRecorder()
@@ -1030,3 +1033,183 @@ func TestPeers_DevModeFailOpen_ClosedInProduction(t *testing.T) {
t.Fatalf("expected 401 in production, got %d: %s", w.Code, w.Body.String())
}
}
// ==================== Peers — #383 self never appears in result ====================
// TestPeers_ExcludeSelf_DefenseInDepth verifies the final-line filter in
// Peers strips any row whose id matches the caller. The pre-DB SQL filters
// already do this, but a future code path that omits the `w.id != $caller`
// clause must not be able to smuggle a self-row through. This test
// simulates that future-defect case by mocking the children query to
// (incorrectly) return a row whose id matches the caller, and asserts the
// final filter still drops it.
//
// Root cause class for #383: an agent that sees its own row in /peers
// proceeds to delegate_task to itself, hitting the platform's
// self-delegation 400 in a tight loop. The fix in discovery.go is
// defense-in-depth: even if the SQL filter regresses, this handler-level
// filter prevents the 400-loop from materializing.
func TestPeers_ExcludeSelf_DefenseInDepth(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewDiscoveryHandler()
const selfID = "ws-xiaodong"
// parent_id lookup — workspace has a parent.
mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id =").
WithArgs(selfID).
WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow("ws-parent"))
peerCols := []string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"}
// Siblings — returns one legitimate sibling. The SQL filter excludes
// self at the source.
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2").
WithArgs("ws-parent", selfID).
WillReturnRows(sqlmock.NewRows(peerCols).
AddRow("ws-sibling", "Sibling", "worker", 1, "online", []byte("null"), "http://localhost:8002", "ws-parent", 0))
// Children — simulates the data-integrity defect class: the DB
// (incorrectly) returns the caller's own row in the children set.
// In real production this would require a workspace whose
// parent_id points to itself — corruption only, but the handler
// must not propagate it.
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status").
WithArgs(selfID, selfID).
WillReturnRows(sqlmock.NewRows(peerCols).
AddRow(selfID, "Self As Child", "worker", 1, "online", []byte("null"), "http://localhost:8001", selfID, 0).
AddRow("ws-child", "Real Child", "worker", 1, "online", []byte("null"), "http://localhost:8003", selfID, 0))
// Parent — explicit `w.id != $2` clause so the parent path is also
// self-filtered. parentID.String = "ws-parent" != selfID, so the
// row is included.
mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.id != \\$2 AND w.status").
WithArgs("ws-parent", selfID).
WillReturnRows(sqlmock.NewRows(peerCols).
AddRow("ws-parent", "Parent", "manager", 2, "online", []byte("null"), "http://localhost:8004", nil, 1))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: selfID}}
c.Request = httptest.NewRequest("GET", "/registry/"+selfID+"/peers", nil)
handler.Peers(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
}
var peers []map[string]interface{}
if err := json.Unmarshal(w.Body.Bytes(), &peers); err != nil {
t.Fatalf("failed to parse response: %v", err)
}
// The defense-in-depth filter must drop the self row even though
// the (mocked-defective) children query returned it.
for _, p := range peers {
if id, _ := p["id"].(string); id == selfID {
t.Fatalf("peer list contains caller's own id %q — self-delegation defense regressed; full list: %+v", selfID, peers)
}
}
// Sanity: the three legitimate peers (sibling, real child, parent)
// must all be present. Catches an over-aggressive filter that
// strips legitimate rows.
expectedIDs := map[string]bool{"ws-sibling": false, "ws-child": false, "ws-parent": false}
for _, p := range peers {
if id, _ := p["id"].(string); id != "" {
if _, ok := expectedIDs[id]; ok {
expectedIDs[id] = true
}
}
}
for id, found := range expectedIDs {
if !found {
t.Errorf("legitimate peer %q missing from response; got %+v", id, peers)
}
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestExcludeSelfFromPeers_Unit exercises the helper directly so the
// defense-in-depth contract is asserted independently of SQL mocking.
// Pure-function tests run in microseconds and pin the filter shape
// (empty input, no-match passthrough, single-row drop, multi-row drop,
// preserves order) so future edits to the helper can't silently
// regress to "returns input unchanged".
func TestExcludeSelfFromPeers_Unit(t *testing.T) {
t.Run("empty input returns empty slice", func(t *testing.T) {
out := excludeSelfFromPeers(nil, "ws-self")
if len(out) != 0 {
t.Errorf("expected empty, got %+v", out)
}
})
t.Run("no self in list passes through unchanged", func(t *testing.T) {
in := []map[string]interface{}{
{"id": "ws-a", "name": "A"},
{"id": "ws-b", "name": "B"},
}
out := excludeSelfFromPeers(in, "ws-self")
if len(out) != 2 {
t.Fatalf("expected 2, got %d (%+v)", len(out), out)
}
if out[0]["id"] != "ws-a" || out[1]["id"] != "ws-b" {
t.Errorf("order not preserved: %+v", out)
}
})
t.Run("self row dropped, others preserved", func(t *testing.T) {
in := []map[string]interface{}{
{"id": "ws-a", "name": "A"},
{"id": "ws-self", "name": "Me"},
{"id": "ws-b", "name": "B"},
}
out := excludeSelfFromPeers(in, "ws-self")
if len(out) != 2 {
t.Fatalf("expected 2, got %d (%+v)", len(out), out)
}
if out[0]["id"] != "ws-a" || out[1]["id"] != "ws-b" {
t.Errorf("expected [ws-a, ws-b], got %+v", out)
}
})
t.Run("multiple self rows all dropped", func(t *testing.T) {
// Pathological — should never happen, but the contract is
// "no row with id==workspaceID survives", not "at most one
// such row is dropped". Pin it.
in := []map[string]interface{}{
{"id": "ws-self", "name": "Me1"},
{"id": "ws-a", "name": "A"},
{"id": "ws-self", "name": "Me2"},
}
out := excludeSelfFromPeers(in, "ws-self")
if len(out) != 1 {
t.Fatalf("expected 1, got %d (%+v)", len(out), out)
}
if out[0]["id"] != "ws-a" {
t.Errorf("expected [ws-a], got %+v", out)
}
})
t.Run("row with missing id key is preserved (not a self-collision)", func(t *testing.T) {
// A peer row with no "id" key shouldn't be silently dropped
// by the self-filter — it's a malformed row class that
// belongs to a different defect.
in := []map[string]interface{}{
{"name": "no-id-row"},
{"id": "ws-self", "name": "Me"},
}
out := excludeSelfFromPeers(in, "ws-self")
if len(out) != 1 {
t.Fatalf("expected 1, got %d (%+v)", len(out), out)
}
if out[0]["name"] != "no-id-row" {
t.Errorf("expected no-id-row preserved, got %+v", out)
}
})
}
@@ -432,9 +432,10 @@ func TestExtended_Peers(t *testing.T) {
WillReturnRows(sqlmock.NewRows([]string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"}).
AddRow("ws-sibling", "Sibling Agent", "worker", 1, "online", []byte("null"), "http://localhost:9001", nil, 0))
// Expect children query (workspaces with parent_id = ws-peer)
// Expect children query (workspaces with parent_id = ws-peer, excluding self)
// Query now binds (parent_id, self_id) for the self-filter guard added in #383.
mock.ExpectQuery("SELECT w.id, w.name").
WithArgs("ws-peer").
WithArgs("ws-peer", "ws-peer").
WillReturnRows(sqlmock.NewRows([]string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"}))
// No parent query since workspace is root-level