From fdb2b3a690e54e24f76b91e2029a01a5bedaf919 Mon Sep 17 00:00:00 2001 From: core-be Date: Wed, 20 May 2026 15:49:20 -0700 Subject: [PATCH 1/2] fix(workspace-server): exclude self from /registry/:id/peers + agent-readable 400 (#383) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defense-in-depth fix for the self-delegation 400-loop seen on external workspace 小董文婷 in chloe-dong tenant (2026-05-20). Empirical: Activity-tab pattern A2A OUT "Delegating to 小董文婷" (source=target=小董文婷) followed by HTTP 400 {"error":"self-delegation not permitted"} from the platform's /workspaces/:id/delegate. The 400 guard at delegation.go:126 (#548) is correct — the bug is upstream: the agent is self-targeting. Root-cause class: an agent that sees its own row in /registry/:id/peers proceeds to delegate_task(), which deadlocks on the SDK sync path (_run_lock acquired twice) or — via the _delegate_sync_via_polling path that bypasses the SDK's in-process guard — hits the platform's 400 in a tight 2-3s retry loop. The existing SQL sibling filter (`w.id != $caller`) prevented self in the sibling branch only; the children + parent queries relied on "a workspace can't be its own child/parent" being structurally impossible — which it is, until data corruption (a self-loop in parent_id introduced by a buggy register path) makes it possible. Fix shape (3 layers, smallest layer first): 1. discovery.go peers SQL — children + parent queries gain explicit `AND w.id != $2` clauses so self can never enter the result even if parent_id corruption exists. Sibling clause was already correct; this aligns the other two branches. 2. discovery.go peers handler — final-line excludeSelfFromPeers() helper strips any row whose id matches the caller, regardless of which DB query returned it. Cheap O(n) over a peer set bounded at <50 rows; the guarantee is now contract-level ("self never appears in /peers response"), not query-level. Future code paths that add new queries without the self-filter cannot regress the contract. 3. delegation.go self-delegation 400 — expanded body from terse "self-delegation not permitted" to {error, reason, hint}. The agent-visible string now explicitly states the path is terminal so the LLM's retry heuristic stops looping. Same HTTP status, same key — additive, non-breaking for callers that only read response.error. Tests: - TestPeers_ExcludeSelf_DefenseInDepth — mocks the children query to (defectively) return the caller's own row; asserts the final response does NOT contain self while legitimate peers survive. - TestExcludeSelfFromPeers_Unit — 5 sub-cases pinning the helper contract (empty input, no-match passthrough, single-row drop, multi-row drop, missing-id-key preservation). - Existing TestPeers_WithParent / TestPeers_RootWorkspace_NoPeers / peersFilterFixture / multi-WS test updated to match the new children + parent SQL shape (`w.id != $2` arg added). Root cause still under investigation: this PR closes the self-delegation 400-loop class at the platform's peer-list layer. The downstream SDK question — which code path bypasses the in-process guard at tool_delegate_task line 226-233 — is filed as a follow-up; the empirical probe needed CP admin API access we couldn't establish inside the 25-minute investigation budget. Per CTO 2026-05-20 directive. Persona: core-be (40-char token). Refs: #383 (self-delegation loop), #190 (original self-echo bug), #548 (initial platform 400 guard), #345 (workspace-server SSOT parallel-impl note). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/delegation.go | 16 +- .../internal/handlers/discovery.go | 53 ++++- .../internal/handlers/discovery_test.go | 213 ++++++++++++++++-- 3 files changed, 262 insertions(+), 20 deletions(-) diff --git a/workspace-server/internal/handlers/delegation.go b/workspace-server/internal/handlers/delegation.go index 353a73ae2..7a89d3728 100644 --- a/workspace-server/internal/handlers/delegation.go +++ b/workspace-server/internal/handlers/delegation.go @@ -122,8 +122,22 @@ func (h *DelegationHandler) Delegate(c *gin.Context) { // #548 — prevent self-delegation: a workspace delegating to itself // acquires _run_lock twice on the same mutex, deadlocking permanently. + // + // #383 — the error message is the agent-visible string when this 400 + // fires on the SDK's _delegate_sync_via_polling path. The previous + // terse "self-delegation not permitted" was correct but indistinct + // from a transient rate-limit or auth failure, so the LLM would + // re-attempt every 2-3s in a tight loop (chloe-dong tenant external + // workspace, 2026-05-20). The expanded message is explicit about + // (a) what just happened, (b) why it cannot succeed, (c) what to do + // instead — so the agent's retry heuristic recognizes the path as + // terminal and stops. if sourceID == body.TargetID { - c.JSON(http.StatusBadRequest, gin.H{"error": "self-delegation not permitted"}) + c.JSON(http.StatusBadRequest, gin.H{ + "error": "self-delegation not permitted", + "reason": "the source workspace and target workspace are the same; you cannot delegate a task to yourself", + "hint": "do the work yourself, or pick a different peer via list_peers — retrying with the same target_id will fail every time", + }) return } diff --git a/workspace-server/internal/handlers/discovery.go b/workspace-server/internal/handlers/discovery.go index 33ec12637..1c639e2ea 100644 --- a/workspace-server/internal/handlers/discovery.go +++ b/workspace-server/internal/handlers/discovery.go @@ -256,24 +256,43 @@ func (h *DiscoveryHandler) Peers(c *gin.Context) { peers = append(peers, siblings...) } - // Children + // Children — exclude self defensively. A child row whose parent_id + // equals the requesting workspaceID can never legitimately be the + // caller (a workspace can't be its own child), but a future data- + // integrity defect (e.g. self-loop introduced by a buggy register + // path) would otherwise smuggle the caller back into its own peer + // list. The agent then attempts `delegate_task()`, which + // either deadlocks _run_lock (sync path) or hits the platform's + // self-delegation 400 in a tight loop (#383). The `w.id != $2` + // clause makes self-delegation-via-peer-list impossible regardless + // of DB state. children, _ := queryPeerMaps(` SELECT w.id, w.name, COALESCE(w.role, ''), w.tier, w.status, COALESCE(w.agent_card, 'null'::jsonb), COALESCE(w.url, ''), w.parent_id, w.active_tasks - FROM workspaces w WHERE w.parent_id = $1 AND w.status != 'removed'`, workspaceID) + FROM workspaces w WHERE w.parent_id = $1 AND w.id != $2 AND w.status != 'removed'`, + workspaceID, workspaceID) peers = append(peers, children...) - // Parent + // Parent — same defense-in-depth. A workspace whose parent_id points + // to itself is data corruption, but the peer-list endpoint must not + // propagate that corruption back to the agent as a "peer who is also + // you" entry. if parentID.Valid { parent, _ := queryPeerMaps(` SELECT w.id, w.name, COALESCE(w.role, ''), w.tier, w.status, COALESCE(w.agent_card, 'null'::jsonb), COALESCE(w.url, ''), w.parent_id, w.active_tasks - FROM workspaces w WHERE w.id = $1 AND w.status != 'removed'`, parentID.String) + FROM workspaces w WHERE w.id = $1 AND w.id != $2 AND w.status != 'removed'`, + parentID.String, workspaceID) peers = append(peers, parent...) } + // #383 final-line defense: even if a future code path adds a query + // that doesn't filter self, strip the caller's own row before + // returning. Cheap O(n) over a peer set bounded at <50 rows. + peers = excludeSelfFromPeers(peers, workspaceID) + peers = filterPeersByQuery(peers, c.Query("q")) if peers == nil { @@ -282,6 +301,32 @@ func (h *DiscoveryHandler) Peers(c *gin.Context) { c.JSON(http.StatusOK, peers) } +// excludeSelfFromPeers strips any peer entry whose ``id`` equals +// ``workspaceID`` (the caller's own row). Final-line defense for #383 +// (self-delegation 400-loop on external workspaces): a peer-list that +// includes the requester's own row is the root mechanism by which an +// agent ends up delegating to itself. The pre-DB filters in Peers +// already enforce `w.id != $caller` on each branch; this function +// guarantees the contract holds regardless of which query path +// returned the row, including future ones added without a self-filter. +// +// O(n) over a peer set bounded at <50 rows per `Peers` comment — well +// below the hot-path overhead of the existing filterPeersByQuery. +func excludeSelfFromPeers(peers []map[string]interface{}, workspaceID string) []map[string]interface{} { + if len(peers) == 0 { + return peers + } + out := make([]map[string]interface{}, 0, len(peers)) + for _, p := range peers { + id, _ := p["id"].(string) + if id == workspaceID { + continue + } + out = append(out, p) + } + return out +} + // filterPeersByQuery returns peers whose name or role case-insensitively // contains q. Whitespace-trimmed empty q is a no-op (returns input unchanged). func filterPeersByQuery(peers []map[string]interface{}, q string) []map[string]interface{} { diff --git a/workspace-server/internal/handlers/discovery_test.go b/workspace-server/internal/handlers/discovery_test.go index 3070fcf41..62f45a8b2 100644 --- a/workspace-server/internal/handlers/discovery_test.go +++ b/workspace-server/internal/handlers/discovery_test.go @@ -125,14 +125,14 @@ func TestPeers_WithParent(t *testing.T) { WillReturnRows(sqlmock.NewRows(peerCols). AddRow("ws-sibling-2", "Sibling Two", "worker", 1, "online", []byte("null"), "http://localhost:8002", "ws-parent", 0)) - // Expect children query - mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.status"). - WithArgs("ws-sibling-1"). + // Expect children query — #383 added explicit `w.id != $2` self-filter + mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status"). + WithArgs("ws-sibling-1", "ws-sibling-1"). WillReturnRows(sqlmock.NewRows(peerCols)) - // Expect parent query - mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.status"). - WithArgs("ws-parent"). + // Expect parent query — #383 added explicit `w.id != $2` self-filter + mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.id != \\$2 AND w.status"). + WithArgs("ws-parent", "ws-sibling-1"). WillReturnRows(sqlmock.NewRows(peerCols). AddRow("ws-parent", "Parent PM", "manager", 2, "online", []byte("null"), "http://localhost:8001", nil, 1)) @@ -228,9 +228,9 @@ func TestPeers_RootWorkspace_NoPeers(t *testing.T) { WithArgs("ws-root-alone"). WillReturnRows(sqlmock.NewRows(peerCols)) - // Children — none - mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1"). - WithArgs("ws-root-alone"). + // Children — none. #383 added explicit `w.id != $2` self-filter. + mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2"). + WithArgs("ws-root-alone", "ws-root-alone"). WillReturnRows(sqlmock.NewRows(peerCols)) // No parent query since parent_id is NULL @@ -282,12 +282,14 @@ func peersFilterFixture(t *testing.T) (*DiscoveryHandler, sqlmock.Sqlmock) { AddRow("ws-alpha", "Alpha Researcher", "researcher", 1, "online", []byte("null"), "http://a", "ws-pm", 0). AddRow("ws-beta", "Beta Designer", "designer", 1, "online", []byte("null"), "http://b", "ws-pm", 0)) - mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.status"). - WithArgs("ws-self"). + // #383 — children query gained explicit `w.id != $2` self-filter. + mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status"). + WithArgs("ws-self", "ws-self"). WillReturnRows(sqlmock.NewRows(cols)) - mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.status"). - WithArgs("ws-pm"). + // #383 — parent query gained explicit `w.id != $2` self-filter. + mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.id != \\$2 AND w.status"). + WithArgs("ws-pm", "ws-self"). WillReturnRows(sqlmock.NewRows(cols). AddRow("ws-pm", "PM Workspace", "manager", 2, "online", []byte("null"), "http://pm", nil, 1)) @@ -966,8 +968,9 @@ func TestPeers_DevModeFailOpen_AllowsBearerlessRequest(t *testing.T) { mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id IS NULL AND w.id"). WithArgs("ws-dev"). WillReturnRows(sqlmock.NewRows(peerCols)) - mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id = \\$1 AND w.status"). - WithArgs("ws-dev"). + // #383 — children query gained explicit `w.id != $2` self-filter. + mock.ExpectQuery("SELECT w.id.+WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status"). + WithArgs("ws-dev", "ws-dev"). WillReturnRows(sqlmock.NewRows(peerCols)) w := httptest.NewRecorder() @@ -1030,3 +1033,183 @@ func TestPeers_DevModeFailOpen_ClosedInProduction(t *testing.T) { t.Fatalf("expected 401 in production, got %d: %s", w.Code, w.Body.String()) } } + +// ==================== Peers — #383 self never appears in result ==================== + +// TestPeers_ExcludeSelf_DefenseInDepth verifies the final-line filter in +// Peers strips any row whose id matches the caller. The pre-DB SQL filters +// already do this, but a future code path that omits the `w.id != $caller` +// clause must not be able to smuggle a self-row through. This test +// simulates that future-defect case by mocking the children query to +// (incorrectly) return a row whose id matches the caller, and asserts the +// final filter still drops it. +// +// Root cause class for #383: an agent that sees its own row in /peers +// proceeds to delegate_task to itself, hitting the platform's +// self-delegation 400 in a tight loop. The fix in discovery.go is +// defense-in-depth: even if the SQL filter regresses, this handler-level +// filter prevents the 400-loop from materializing. +func TestPeers_ExcludeSelf_DefenseInDepth(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + handler := NewDiscoveryHandler() + + const selfID = "ws-xiaodong" + + // parent_id lookup — workspace has a parent. + mock.ExpectQuery("SELECT parent_id FROM workspaces WHERE id ="). + WithArgs(selfID). + WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow("ws-parent")) + + peerCols := []string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"} + + // Siblings — returns one legitimate sibling. The SQL filter excludes + // self at the source. + mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2"). + WithArgs("ws-parent", selfID). + WillReturnRows(sqlmock.NewRows(peerCols). + AddRow("ws-sibling", "Sibling", "worker", 1, "online", []byte("null"), "http://localhost:8002", "ws-parent", 0)) + + // Children — simulates the data-integrity defect class: the DB + // (incorrectly) returns the caller's own row in the children set. + // In real production this would require a workspace whose + // parent_id points to itself — corruption only, but the handler + // must not propagate it. + mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.parent_id = \\$1 AND w.id != \\$2 AND w.status"). + WithArgs(selfID, selfID). + WillReturnRows(sqlmock.NewRows(peerCols). + AddRow(selfID, "Self As Child", "worker", 1, "online", []byte("null"), "http://localhost:8001", selfID, 0). + AddRow("ws-child", "Real Child", "worker", 1, "online", []byte("null"), "http://localhost:8003", selfID, 0)) + + // Parent — explicit `w.id != $2` clause so the parent path is also + // self-filtered. parentID.String = "ws-parent" != selfID, so the + // row is included. + mock.ExpectQuery("SELECT w.id, w.name.*WHERE w.id = \\$1 AND w.id != \\$2 AND w.status"). + WithArgs("ws-parent", selfID). + WillReturnRows(sqlmock.NewRows(peerCols). + AddRow("ws-parent", "Parent", "manager", 2, "online", []byte("null"), "http://localhost:8004", nil, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: selfID}} + c.Request = httptest.NewRequest("GET", "/registry/"+selfID+"/peers", nil) + + handler.Peers(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + + var peers []map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &peers); err != nil { + t.Fatalf("failed to parse response: %v", err) + } + + // The defense-in-depth filter must drop the self row even though + // the (mocked-defective) children query returned it. + for _, p := range peers { + if id, _ := p["id"].(string); id == selfID { + t.Fatalf("peer list contains caller's own id %q — self-delegation defense regressed; full list: %+v", selfID, peers) + } + } + + // Sanity: the three legitimate peers (sibling, real child, parent) + // must all be present. Catches an over-aggressive filter that + // strips legitimate rows. + expectedIDs := map[string]bool{"ws-sibling": false, "ws-child": false, "ws-parent": false} + for _, p := range peers { + if id, _ := p["id"].(string); id != "" { + if _, ok := expectedIDs[id]; ok { + expectedIDs[id] = true + } + } + } + for id, found := range expectedIDs { + if !found { + t.Errorf("legitimate peer %q missing from response; got %+v", id, peers) + } + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestExcludeSelfFromPeers_Unit exercises the helper directly so the +// defense-in-depth contract is asserted independently of SQL mocking. +// Pure-function tests run in microseconds and pin the filter shape +// (empty input, no-match passthrough, single-row drop, multi-row drop, +// preserves order) so future edits to the helper can't silently +// regress to "returns input unchanged". +func TestExcludeSelfFromPeers_Unit(t *testing.T) { + t.Run("empty input returns empty slice", func(t *testing.T) { + out := excludeSelfFromPeers(nil, "ws-self") + if len(out) != 0 { + t.Errorf("expected empty, got %+v", out) + } + }) + + t.Run("no self in list passes through unchanged", func(t *testing.T) { + in := []map[string]interface{}{ + {"id": "ws-a", "name": "A"}, + {"id": "ws-b", "name": "B"}, + } + out := excludeSelfFromPeers(in, "ws-self") + if len(out) != 2 { + t.Fatalf("expected 2, got %d (%+v)", len(out), out) + } + if out[0]["id"] != "ws-a" || out[1]["id"] != "ws-b" { + t.Errorf("order not preserved: %+v", out) + } + }) + + t.Run("self row dropped, others preserved", func(t *testing.T) { + in := []map[string]interface{}{ + {"id": "ws-a", "name": "A"}, + {"id": "ws-self", "name": "Me"}, + {"id": "ws-b", "name": "B"}, + } + out := excludeSelfFromPeers(in, "ws-self") + if len(out) != 2 { + t.Fatalf("expected 2, got %d (%+v)", len(out), out) + } + if out[0]["id"] != "ws-a" || out[1]["id"] != "ws-b" { + t.Errorf("expected [ws-a, ws-b], got %+v", out) + } + }) + + t.Run("multiple self rows all dropped", func(t *testing.T) { + // Pathological — should never happen, but the contract is + // "no row with id==workspaceID survives", not "at most one + // such row is dropped". Pin it. + in := []map[string]interface{}{ + {"id": "ws-self", "name": "Me1"}, + {"id": "ws-a", "name": "A"}, + {"id": "ws-self", "name": "Me2"}, + } + out := excludeSelfFromPeers(in, "ws-self") + if len(out) != 1 { + t.Fatalf("expected 1, got %d (%+v)", len(out), out) + } + if out[0]["id"] != "ws-a" { + t.Errorf("expected [ws-a], got %+v", out) + } + }) + + t.Run("row with missing id key is preserved (not a self-collision)", func(t *testing.T) { + // A peer row with no "id" key shouldn't be silently dropped + // by the self-filter — it's a malformed row class that + // belongs to a different defect. + in := []map[string]interface{}{ + {"name": "no-id-row"}, + {"id": "ws-self", "name": "Me"}, + } + out := excludeSelfFromPeers(in, "ws-self") + if len(out) != 1 { + t.Fatalf("expected 1, got %d (%+v)", len(out), out) + } + if out[0]["name"] != "no-id-row" { + t.Errorf("expected no-id-row preserved, got %+v", out) + } + }) +} -- 2.52.0 From 1d535bcc45454da42a45506aff43ab244575a180 Mon Sep 17 00:00:00 2001 From: core-be Date: Wed, 20 May 2026 23:47:16 +0000 Subject: [PATCH 2/2] test(workspace-server): update TestExtended_Peers sqlmock to match new self-filter args The children-peers query in Peers() now binds (parent_id, self_id) for the self-delegation defense added in #383 (PR#1624 layer 2). The sqlmock ExpectQuery fixture must include both args or sqlmock raises: unmet sqlmock expectations: ExpectedQuery => expecting Query[...] - matches sql: SELECT w.id, w.name - is with arguments: 0 - ws-peer Production discovery.go is unchanged; this is fixture-only. Refs: #383 --- workspace-server/internal/handlers/handlers_extended_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workspace-server/internal/handlers/handlers_extended_test.go b/workspace-server/internal/handlers/handlers_extended_test.go index 1cb779cac..5e297e24e 100644 --- a/workspace-server/internal/handlers/handlers_extended_test.go +++ b/workspace-server/internal/handlers/handlers_extended_test.go @@ -432,9 +432,10 @@ func TestExtended_Peers(t *testing.T) { WillReturnRows(sqlmock.NewRows([]string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"}). AddRow("ws-sibling", "Sibling Agent", "worker", 1, "online", []byte("null"), "http://localhost:9001", nil, 0)) - // Expect children query (workspaces with parent_id = ws-peer) + // Expect children query (workspaces with parent_id = ws-peer, excluding self) + // Query now binds (parent_id, self_id) for the self-filter guard added in #383. mock.ExpectQuery("SELECT w.id, w.name"). - WithArgs("ws-peer"). + WithArgs("ws-peer", "ws-peer"). WillReturnRows(sqlmock.NewRows([]string{"id", "name", "role", "tier", "status", "agent_card", "url", "parent_id", "active_tasks"})) // No parent query since workspace is root-level -- 2.52.0