feat(workspace): surface peer-discovery failure reason instead of "may be isolated"

Closes #2397. Today, every empty-peer condition (true empty, 401/403, 404,
5xx, network) collapses to a single message: "No peers available (this
workspace may be isolated)". The user has no way to tell whether they need
to provision more workspaces (true isolation), restart the workspace
(auth), re-register (404), page on-call (5xx), or check network (timeout) —
five different operator actions, one ambiguous string.

Wire:
  - new helper get_peers_with_diagnostic() in a2a_client.py returns
    (peers, error_summary). error_summary is None on 200; a short
    actionable string on every other branch.
  - get_peers() now shims through it so non-tool callers (system-prompt
    formatters) keep the bare-list contract.
  - tool_list_peers() switches to the diagnostic helper and surfaces the
    actual reason. The "may be isolated" string is removed; true empty
    now reads "no peers in the platform registry."

Tests:
  - TestGetPeersWithDiagnostic: 200, 200-empty, 401, 403, 404, 5xx,
    network exception, 200-but-non-list-body, and the bare-list-shim
    regression guard.
  - TestToolListPeers: each diagnostic branch surfaces its reason +
    explicit assertion that "may be isolated" is gone.

Coverage 91.53% (floor 86%). 122 a2a tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-30 11:09:26 -07:00
parent c06e2fec5e
commit 3b34dfefbc
4 changed files with 252 additions and 16 deletions

View File

@ -229,19 +229,61 @@ async def send_a2a_message(target_url: str, message: str) -> str:
return _format_a2a_error(last_exc, target_url)
async def get_peers() -> list[dict]:
"""Get this workspace's peers from the platform registry."""
async def get_peers_with_diagnostic() -> tuple[list[dict], str | None]:
"""Get this workspace's peers, returning (peers, diagnostic).
diagnostic is None when the call succeeded (status 200, even if the list
is empty). When peers is [] for a non-trivial reason (auth failure,
workspace-id missing from registry, platform error, network error),
diagnostic is a short human-readable string explaining what went wrong
so callers can surface it instead of "may be isolated" see #2397.
The legacy get_peers() shim below preserves the bare-list contract for
non-tool callers.
"""
url = f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers"
async with httpx.AsyncClient(timeout=10.0) as client:
try:
resp = await client.get(
f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers",
url,
headers={"X-Workspace-ID": WORKSPACE_ID, **auth_headers()},
)
if resp.status_code == 200:
return resp.json()
return []
except Exception:
return []
except Exception as e:
return [], f"Cannot reach platform at {PLATFORM_URL}: {e}"
if resp.status_code == 200:
try:
data = resp.json()
except Exception as e:
return [], f"Platform returned 200 but body was not JSON: {e}"
if not isinstance(data, list):
return [], f"Platform returned 200 but body was not a list: {type(data).__name__}"
return data, None
if resp.status_code in (401, 403):
return [], (
f"Authentication to platform failed (HTTP {resp.status_code}). "
"The workspace bearer token may be invalid — restarting the workspace usually re-mints it."
)
if resp.status_code == 404:
return [], (
f"Workspace ID {WORKSPACE_ID} is not registered with the platform (HTTP 404). "
"Re-registration via the platform's /registry/register endpoint is needed."
)
if 500 <= resp.status_code < 600:
return [], f"Platform error: HTTP {resp.status_code}."
return [], f"Unexpected platform response: HTTP {resp.status_code}."
async def get_peers() -> list[dict]:
"""Get this workspace's peers from the platform registry.
Bare-list shim over get_peers_with_diagnostic() discards the diagnostic
so callers that don't care about the failure reason (e.g. system-prompt
bootstrap formatters) get the same shape they always had.
"""
peers, _ = await get_peers_with_diagnostic()
return peers
async def get_workspace_info() -> dict:

View File

@ -18,6 +18,7 @@ from a2a_client import (
_peer_names,
discover_peer,
get_peers,
get_peers_with_diagnostic,
get_workspace_info,
send_a2a_message,
)
@ -410,9 +411,16 @@ async def tool_send_message_to_user(message: str, attachments: list[str] | None
async def tool_list_peers() -> str:
"""List all workspaces this agent can communicate with."""
peers = await get_peers()
peers, diagnostic = await get_peers_with_diagnostic()
if not peers:
return "No peers available (this workspace may be isolated)"
if diagnostic is not None:
# Non-trivial empty: auth failure / 404 / 5xx / network — surface
# the actual reason so the user/agent doesn't have to guess. #2397.
return f"No peers found. {diagnostic}"
return (
"You have no peers in the platform registry. "
"(No parent, no children, no siblings registered.)"
)
lines = []
for p in peers:
status = p.get("status", "unknown")

View File

@ -577,6 +577,149 @@ class TestGetPeers:
assert headers_sent.get("X-Workspace-ID") == a2a_client.WORKSPACE_ID
# ---------------------------------------------------------------------------
# get_peers_with_diagnostic — issue #2397
#
# Pin: an empty peer list MUST come with an actionable diagnostic on every
# non-200 + every transport failure. The bug was that get_peers swallowed
# every failure mode behind `return []`, leaving the agent's tool wrapper
# with no way to distinguish "you have no peers" from "auth broke" / "404
# from registry" / "platform 5xx" / "network timeout". Each of these
# requires a different operator action.
# ---------------------------------------------------------------------------
class TestGetPeersWithDiagnostic:
async def test_200_returns_peers_and_no_diagnostic(self):
"""200 with valid list → (peers, None). diagnostic stays None on success."""
import a2a_client
peers = [{"id": "ws-1", "name": "Alpha"}]
resp = _make_response(200, peers)
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == peers
assert diag is None
async def test_200_empty_list_returns_no_diagnostic(self):
"""200 with [] → (peers=[], diag=None). Truly no peers is success, not error."""
import a2a_client
resp = _make_response(200, [])
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == []
assert diag is None
async def test_401_returns_auth_diagnostic(self):
"""401 → diagnostic mentions auth + restart hint."""
import a2a_client
resp = _make_response(401, {"detail": "unauthorized"})
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == []
assert diag is not None
assert "401" in diag
assert "Authentication" in diag or "authentication" in diag.lower()
async def test_403_returns_auth_diagnostic(self):
"""403 → same auth-failure diagnostic shape as 401."""
import a2a_client
resp = _make_response(403, {"detail": "forbidden"})
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == []
assert diag is not None
assert "403" in diag
async def test_404_returns_registration_diagnostic(self):
"""404 → diagnostic tells operator the workspace ID is missing from the registry."""
import a2a_client
resp = _make_response(404, {"detail": "not found"})
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == []
assert diag is not None
assert "404" in diag
assert "registered" in diag.lower() or "registration" in diag.lower()
async def test_500_returns_platform_error_diagnostic(self):
"""5xx → 'Platform error: HTTP <code>.'"""
import a2a_client
resp = _make_response(503, {"detail": "service unavailable"})
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == []
assert diag is not None
assert "503" in diag
assert "Platform error" in diag or "platform error" in diag.lower()
async def test_network_exception_returns_unreachable_diagnostic(self):
"""httpx exception → diagnostic mentions PLATFORM_URL + the underlying error."""
import a2a_client
mock_client = _make_mock_client(get_exc=TimeoutError("connection timed out"))
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == []
assert diag is not None
assert "Cannot reach platform" in diag or "cannot reach" in diag.lower()
assert "timed out" in diag
async def test_200_with_non_list_body_returns_diagnostic(self):
"""200 but body is a dict → diagnostic flags shape mismatch (regression guard)."""
import a2a_client
resp = _make_response(200, {"oops": "should have been a list"})
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result, diag = await a2a_client.get_peers_with_diagnostic()
assert result == []
assert diag is not None
assert "list" in diag.lower()
async def test_get_peers_shim_preserves_bare_list_contract(self):
"""get_peers() still returns just list[dict] — no API break for non-tool callers."""
import a2a_client
peers = [{"id": "ws-1", "name": "Alpha"}]
resp = _make_response(200, peers)
mock_client = _make_mock_client(get_resp=resp)
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
result = await a2a_client.get_peers()
# Must be a list, not a tuple — bare-list shim contract.
assert isinstance(result, list)
assert result == peers
# ---------------------------------------------------------------------------
# get_workspace_info
# ---------------------------------------------------------------------------

View File

@ -536,11 +536,54 @@ class TestToolSendMessageToUser:
class TestToolListPeers:
async def test_no_peers_returns_isolated_message(self):
async def test_true_empty_returns_no_peers_message_without_diagnostic(self):
"""200 + empty list → 'no peers in the platform registry' (no failure)."""
import a2a_tools
with patch("a2a_tools.get_peers", return_value=[]):
with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], None)):
result = await a2a_tools.tool_list_peers()
assert "No peers available" in result
# The new wording explicitly says no peers exist (no parent/sibling/child).
# Avoids the misleading "may be isolated" hint when discovery succeeded.
assert "no peers" in result.lower()
assert "No peers found." not in result # diagnostic prefix should NOT appear on the success branch
assert "may be isolated" not in result
async def test_auth_failure_surfaces_restart_hint(self):
"""401/403 → tool_list_peers must surface the auth failure + restart hint, not 'isolated'."""
import a2a_tools
diag = "Authentication to platform failed (HTTP 401). Restart the workspace to re-mint."
with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)):
result = await a2a_tools.tool_list_peers()
assert "401" in result
assert "Authentication" in result
# The "isolated" message was the bug — make sure the regression doesn't return.
assert "may be isolated" not in result
async def test_404_surfaces_registration_hint(self):
"""404 → tool_list_peers tells the user re-registration is needed."""
import a2a_tools
diag = "Workspace ID ws-test is not registered with the platform (HTTP 404). Re-register."
with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)):
result = await a2a_tools.tool_list_peers()
assert "404" in result
assert "registered" in result.lower()
async def test_5xx_surfaces_platform_error(self):
"""5xx → 'Platform error' surfaced; agent / user can correctly route to oncall."""
import a2a_tools
diag = "Platform error: HTTP 503."
with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)):
result = await a2a_tools.tool_list_peers()
assert "503" in result
assert "Platform error" in result
async def test_network_error_surfaces_unreachable(self):
"""Network error → operator can tell that the workspace can't reach the platform at all."""
import a2a_tools
diag = "Cannot reach platform at http://platform.example: timed out"
with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)):
result = await a2a_tools.tool_list_peers()
assert "Cannot reach platform" in result
assert "timed out" in result
async def test_peers_returned_formatted_lines(self):
"""Peers list is formatted as '- name (ID: ..., status: ..., role: ...)'."""
@ -550,7 +593,7 @@ class TestToolListPeers:
{"id": "ws-1", "name": "Alpha", "status": "online", "role": "worker"},
{"id": "ws-2", "name": "Beta", "status": "idle", "role": "analyst"},
]
with patch("a2a_tools.get_peers", return_value=peers):
with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)):
result = await a2a_tools.tool_list_peers()
assert "Alpha" in result
@ -567,7 +610,7 @@ class TestToolListPeers:
# Clear any prior cache entries for these IDs
a2a_tools._peer_names.pop("ws-cache-test", None)
peers = [{"id": "ws-cache-test", "name": "CacheMe", "status": "online", "role": "w"}]
with patch("a2a_tools.get_peers", return_value=peers):
with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)):
await a2a_tools.tool_list_peers()
assert a2a_tools._peer_names.get("ws-cache-test") == "CacheMe"
@ -577,7 +620,7 @@ class TestToolListPeers:
import a2a_tools
peers = [{"id": "ws-3", "name": "Gamma"}] # no status, no role
with patch("a2a_tools.get_peers", return_value=peers):
with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)):
result = await a2a_tools.tool_list_peers()
assert "Gamma" in result