From 3b34dfefbc4495b4925b965e2b9b4106d8fe2b16 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 11:09:26 -0700 Subject: [PATCH] feat(workspace): surface peer-discovery failure reason instead of "may be isolated" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #2397. Today, every empty-peer condition (true empty, 401/403, 404, 5xx, network) collapses to a single message: "No peers available (this workspace may be isolated)". The user has no way to tell whether they need to provision more workspaces (true isolation), restart the workspace (auth), re-register (404), page on-call (5xx), or check network (timeout) — five different operator actions, one ambiguous string. Wire: - new helper get_peers_with_diagnostic() in a2a_client.py returns (peers, error_summary). error_summary is None on 200; a short actionable string on every other branch. - get_peers() now shims through it so non-tool callers (system-prompt formatters) keep the bare-list contract. - tool_list_peers() switches to the diagnostic helper and surfaces the actual reason. The "may be isolated" string is removed; true empty now reads "no peers in the platform registry." Tests: - TestGetPeersWithDiagnostic: 200, 200-empty, 401, 403, 404, 5xx, network exception, 200-but-non-list-body, and the bare-list-shim regression guard. - TestToolListPeers: each diagnostic branch surfaces its reason + explicit assertion that "may be isolated" is gone. Coverage 91.53% (floor 86%). 122 a2a tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace/a2a_client.py | 58 ++++++++-- workspace/a2a_tools.py | 12 ++- workspace/tests/test_a2a_client.py | 143 +++++++++++++++++++++++++ workspace/tests/test_a2a_tools_impl.py | 55 ++++++++-- 4 files changed, 252 insertions(+), 16 deletions(-) diff --git a/workspace/a2a_client.py b/workspace/a2a_client.py index 7c5e3d87..43882bd1 100644 --- a/workspace/a2a_client.py +++ b/workspace/a2a_client.py @@ -229,19 +229,61 @@ async def send_a2a_message(target_url: str, message: str) -> str: return _format_a2a_error(last_exc, target_url) -async def get_peers() -> list[dict]: - """Get this workspace's peers from the platform registry.""" +async def get_peers_with_diagnostic() -> tuple[list[dict], str | None]: + """Get this workspace's peers, returning (peers, diagnostic). + + diagnostic is None when the call succeeded (status 200, even if the list + is empty). When peers is [] for a non-trivial reason (auth failure, + workspace-id missing from registry, platform error, network error), + diagnostic is a short human-readable string explaining what went wrong + so callers can surface it instead of "may be isolated" — see #2397. + + The legacy get_peers() shim below preserves the bare-list contract for + non-tool callers. + """ + url = f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers" async with httpx.AsyncClient(timeout=10.0) as client: try: resp = await client.get( - f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers", + url, headers={"X-Workspace-ID": WORKSPACE_ID, **auth_headers()}, ) - if resp.status_code == 200: - return resp.json() - return [] - except Exception: - return [] + except Exception as e: + return [], f"Cannot reach platform at {PLATFORM_URL}: {e}" + + if resp.status_code == 200: + try: + data = resp.json() + except Exception as e: + return [], f"Platform returned 200 but body was not JSON: {e}" + if not isinstance(data, list): + return [], f"Platform returned 200 but body was not a list: {type(data).__name__}" + return data, None + + if resp.status_code in (401, 403): + return [], ( + f"Authentication to platform failed (HTTP {resp.status_code}). " + "The workspace bearer token may be invalid — restarting the workspace usually re-mints it." + ) + if resp.status_code == 404: + return [], ( + f"Workspace ID {WORKSPACE_ID} is not registered with the platform (HTTP 404). " + "Re-registration via the platform's /registry/register endpoint is needed." + ) + if 500 <= resp.status_code < 600: + return [], f"Platform error: HTTP {resp.status_code}." + return [], f"Unexpected platform response: HTTP {resp.status_code}." + + +async def get_peers() -> list[dict]: + """Get this workspace's peers from the platform registry. + + Bare-list shim over get_peers_with_diagnostic() — discards the diagnostic + so callers that don't care about the failure reason (e.g. system-prompt + bootstrap formatters) get the same shape they always had. + """ + peers, _ = await get_peers_with_diagnostic() + return peers async def get_workspace_info() -> dict: diff --git a/workspace/a2a_tools.py b/workspace/a2a_tools.py index 4939e254..d5be00bd 100644 --- a/workspace/a2a_tools.py +++ b/workspace/a2a_tools.py @@ -18,6 +18,7 @@ from a2a_client import ( _peer_names, discover_peer, get_peers, + get_peers_with_diagnostic, get_workspace_info, send_a2a_message, ) @@ -410,9 +411,16 @@ async def tool_send_message_to_user(message: str, attachments: list[str] | None async def tool_list_peers() -> str: """List all workspaces this agent can communicate with.""" - peers = await get_peers() + peers, diagnostic = await get_peers_with_diagnostic() if not peers: - return "No peers available (this workspace may be isolated)" + if diagnostic is not None: + # Non-trivial empty: auth failure / 404 / 5xx / network — surface + # the actual reason so the user/agent doesn't have to guess. #2397. + return f"No peers found. {diagnostic}" + return ( + "You have no peers in the platform registry. " + "(No parent, no children, no siblings registered.)" + ) lines = [] for p in peers: status = p.get("status", "unknown") diff --git a/workspace/tests/test_a2a_client.py b/workspace/tests/test_a2a_client.py index e105fb1e..1412c91f 100644 --- a/workspace/tests/test_a2a_client.py +++ b/workspace/tests/test_a2a_client.py @@ -577,6 +577,149 @@ class TestGetPeers: assert headers_sent.get("X-Workspace-ID") == a2a_client.WORKSPACE_ID +# --------------------------------------------------------------------------- +# get_peers_with_diagnostic — issue #2397 +# +# Pin: an empty peer list MUST come with an actionable diagnostic on every +# non-200 + every transport failure. The bug was that get_peers swallowed +# every failure mode behind `return []`, leaving the agent's tool wrapper +# with no way to distinguish "you have no peers" from "auth broke" / "404 +# from registry" / "platform 5xx" / "network timeout". Each of these +# requires a different operator action. +# --------------------------------------------------------------------------- + +class TestGetPeersWithDiagnostic: + + async def test_200_returns_peers_and_no_diagnostic(self): + """200 with valid list → (peers, None). diagnostic stays None on success.""" + import a2a_client + + peers = [{"id": "ws-1", "name": "Alpha"}] + resp = _make_response(200, peers) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == peers + assert diag is None + + async def test_200_empty_list_returns_no_diagnostic(self): + """200 with [] → (peers=[], diag=None). Truly no peers is success, not error.""" + import a2a_client + + resp = _make_response(200, []) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == [] + assert diag is None + + async def test_401_returns_auth_diagnostic(self): + """401 → diagnostic mentions auth + restart hint.""" + import a2a_client + + resp = _make_response(401, {"detail": "unauthorized"}) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == [] + assert diag is not None + assert "401" in diag + assert "Authentication" in diag or "authentication" in diag.lower() + + async def test_403_returns_auth_diagnostic(self): + """403 → same auth-failure diagnostic shape as 401.""" + import a2a_client + + resp = _make_response(403, {"detail": "forbidden"}) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == [] + assert diag is not None + assert "403" in diag + + async def test_404_returns_registration_diagnostic(self): + """404 → diagnostic tells operator the workspace ID is missing from the registry.""" + import a2a_client + + resp = _make_response(404, {"detail": "not found"}) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == [] + assert diag is not None + assert "404" in diag + assert "registered" in diag.lower() or "registration" in diag.lower() + + async def test_500_returns_platform_error_diagnostic(self): + """5xx → 'Platform error: HTTP .'""" + import a2a_client + + resp = _make_response(503, {"detail": "service unavailable"}) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == [] + assert diag is not None + assert "503" in diag + assert "Platform error" in diag or "platform error" in diag.lower() + + async def test_network_exception_returns_unreachable_diagnostic(self): + """httpx exception → diagnostic mentions PLATFORM_URL + the underlying error.""" + import a2a_client + + mock_client = _make_mock_client(get_exc=TimeoutError("connection timed out")) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == [] + assert diag is not None + assert "Cannot reach platform" in diag or "cannot reach" in diag.lower() + assert "timed out" in diag + + async def test_200_with_non_list_body_returns_diagnostic(self): + """200 but body is a dict → diagnostic flags shape mismatch (regression guard).""" + import a2a_client + + resp = _make_response(200, {"oops": "should have been a list"}) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result, diag = await a2a_client.get_peers_with_diagnostic() + + assert result == [] + assert diag is not None + assert "list" in diag.lower() + + async def test_get_peers_shim_preserves_bare_list_contract(self): + """get_peers() still returns just list[dict] — no API break for non-tool callers.""" + import a2a_client + + peers = [{"id": "ws-1", "name": "Alpha"}] + resp = _make_response(200, peers) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result = await a2a_client.get_peers() + + # Must be a list, not a tuple — bare-list shim contract. + assert isinstance(result, list) + assert result == peers + + # --------------------------------------------------------------------------- # get_workspace_info # --------------------------------------------------------------------------- diff --git a/workspace/tests/test_a2a_tools_impl.py b/workspace/tests/test_a2a_tools_impl.py index 22a49268..90d31560 100644 --- a/workspace/tests/test_a2a_tools_impl.py +++ b/workspace/tests/test_a2a_tools_impl.py @@ -536,11 +536,54 @@ class TestToolSendMessageToUser: class TestToolListPeers: - async def test_no_peers_returns_isolated_message(self): + async def test_true_empty_returns_no_peers_message_without_diagnostic(self): + """200 + empty list → 'no peers in the platform registry' (no failure).""" import a2a_tools - with patch("a2a_tools.get_peers", return_value=[]): + with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], None)): result = await a2a_tools.tool_list_peers() - assert "No peers available" in result + # The new wording explicitly says no peers exist (no parent/sibling/child). + # Avoids the misleading "may be isolated" hint when discovery succeeded. + assert "no peers" in result.lower() + assert "No peers found." not in result # diagnostic prefix should NOT appear on the success branch + assert "may be isolated" not in result + + async def test_auth_failure_surfaces_restart_hint(self): + """401/403 → tool_list_peers must surface the auth failure + restart hint, not 'isolated'.""" + import a2a_tools + diag = "Authentication to platform failed (HTTP 401). Restart the workspace to re-mint." + with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)): + result = await a2a_tools.tool_list_peers() + assert "401" in result + assert "Authentication" in result + # The "isolated" message was the bug — make sure the regression doesn't return. + assert "may be isolated" not in result + + async def test_404_surfaces_registration_hint(self): + """404 → tool_list_peers tells the user re-registration is needed.""" + import a2a_tools + diag = "Workspace ID ws-test is not registered with the platform (HTTP 404). Re-register." + with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)): + result = await a2a_tools.tool_list_peers() + assert "404" in result + assert "registered" in result.lower() + + async def test_5xx_surfaces_platform_error(self): + """5xx → 'Platform error' surfaced; agent / user can correctly route to oncall.""" + import a2a_tools + diag = "Platform error: HTTP 503." + with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)): + result = await a2a_tools.tool_list_peers() + assert "503" in result + assert "Platform error" in result + + async def test_network_error_surfaces_unreachable(self): + """Network error → operator can tell that the workspace can't reach the platform at all.""" + import a2a_tools + diag = "Cannot reach platform at http://platform.example: timed out" + with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)): + result = await a2a_tools.tool_list_peers() + assert "Cannot reach platform" in result + assert "timed out" in result async def test_peers_returned_formatted_lines(self): """Peers list is formatted as '- name (ID: ..., status: ..., role: ...)'.""" @@ -550,7 +593,7 @@ class TestToolListPeers: {"id": "ws-1", "name": "Alpha", "status": "online", "role": "worker"}, {"id": "ws-2", "name": "Beta", "status": "idle", "role": "analyst"}, ] - with patch("a2a_tools.get_peers", return_value=peers): + with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)): result = await a2a_tools.tool_list_peers() assert "Alpha" in result @@ -567,7 +610,7 @@ class TestToolListPeers: # Clear any prior cache entries for these IDs a2a_tools._peer_names.pop("ws-cache-test", None) peers = [{"id": "ws-cache-test", "name": "CacheMe", "status": "online", "role": "w"}] - with patch("a2a_tools.get_peers", return_value=peers): + with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)): await a2a_tools.tool_list_peers() assert a2a_tools._peer_names.get("ws-cache-test") == "CacheMe" @@ -577,7 +620,7 @@ class TestToolListPeers: import a2a_tools peers = [{"id": "ws-3", "name": "Gamma"}] # no status, no role - with patch("a2a_tools.get_peers", return_value=peers): + with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)): result = await a2a_tools.tool_list_peers() assert "Gamma" in result