feat(workspace): surface peer-discovery failure reason instead of "may be isolated"
Closes #2397. Today, every empty-peer condition (true empty, 401/403, 404, 5xx, network) collapses to a single message: "No peers available (this workspace may be isolated)". The user has no way to tell whether they need to provision more workspaces (true isolation), restart the workspace (auth), re-register (404), page on-call (5xx), or check network (timeout) — five different operator actions, one ambiguous string. Wire: - new helper get_peers_with_diagnostic() in a2a_client.py returns (peers, error_summary). error_summary is None on 200; a short actionable string on every other branch. - get_peers() now shims through it so non-tool callers (system-prompt formatters) keep the bare-list contract. - tool_list_peers() switches to the diagnostic helper and surfaces the actual reason. The "may be isolated" string is removed; true empty now reads "no peers in the platform registry." Tests: - TestGetPeersWithDiagnostic: 200, 200-empty, 401, 403, 404, 5xx, network exception, 200-but-non-list-body, and the bare-list-shim regression guard. - TestToolListPeers: each diagnostic branch surfaces its reason + explicit assertion that "may be isolated" is gone. Coverage 91.53% (floor 86%). 122 a2a tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c06e2fec5e
commit
3b34dfefbc
@ -229,19 +229,61 @@ async def send_a2a_message(target_url: str, message: str) -> str:
|
||||
return _format_a2a_error(last_exc, target_url)
|
||||
|
||||
|
||||
async def get_peers() -> list[dict]:
|
||||
"""Get this workspace's peers from the platform registry."""
|
||||
async def get_peers_with_diagnostic() -> tuple[list[dict], str | None]:
|
||||
"""Get this workspace's peers, returning (peers, diagnostic).
|
||||
|
||||
diagnostic is None when the call succeeded (status 200, even if the list
|
||||
is empty). When peers is [] for a non-trivial reason (auth failure,
|
||||
workspace-id missing from registry, platform error, network error),
|
||||
diagnostic is a short human-readable string explaining what went wrong
|
||||
so callers can surface it instead of "may be isolated" — see #2397.
|
||||
|
||||
The legacy get_peers() shim below preserves the bare-list contract for
|
||||
non-tool callers.
|
||||
"""
|
||||
url = f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers"
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
try:
|
||||
resp = await client.get(
|
||||
f"{PLATFORM_URL}/registry/{WORKSPACE_ID}/peers",
|
||||
url,
|
||||
headers={"X-Workspace-ID": WORKSPACE_ID, **auth_headers()},
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
return resp.json()
|
||||
return []
|
||||
except Exception:
|
||||
return []
|
||||
except Exception as e:
|
||||
return [], f"Cannot reach platform at {PLATFORM_URL}: {e}"
|
||||
|
||||
if resp.status_code == 200:
|
||||
try:
|
||||
data = resp.json()
|
||||
except Exception as e:
|
||||
return [], f"Platform returned 200 but body was not JSON: {e}"
|
||||
if not isinstance(data, list):
|
||||
return [], f"Platform returned 200 but body was not a list: {type(data).__name__}"
|
||||
return data, None
|
||||
|
||||
if resp.status_code in (401, 403):
|
||||
return [], (
|
||||
f"Authentication to platform failed (HTTP {resp.status_code}). "
|
||||
"The workspace bearer token may be invalid — restarting the workspace usually re-mints it."
|
||||
)
|
||||
if resp.status_code == 404:
|
||||
return [], (
|
||||
f"Workspace ID {WORKSPACE_ID} is not registered with the platform (HTTP 404). "
|
||||
"Re-registration via the platform's /registry/register endpoint is needed."
|
||||
)
|
||||
if 500 <= resp.status_code < 600:
|
||||
return [], f"Platform error: HTTP {resp.status_code}."
|
||||
return [], f"Unexpected platform response: HTTP {resp.status_code}."
|
||||
|
||||
|
||||
async def get_peers() -> list[dict]:
|
||||
"""Get this workspace's peers from the platform registry.
|
||||
|
||||
Bare-list shim over get_peers_with_diagnostic() — discards the diagnostic
|
||||
so callers that don't care about the failure reason (e.g. system-prompt
|
||||
bootstrap formatters) get the same shape they always had.
|
||||
"""
|
||||
peers, _ = await get_peers_with_diagnostic()
|
||||
return peers
|
||||
|
||||
|
||||
async def get_workspace_info() -> dict:
|
||||
|
||||
@ -18,6 +18,7 @@ from a2a_client import (
|
||||
_peer_names,
|
||||
discover_peer,
|
||||
get_peers,
|
||||
get_peers_with_diagnostic,
|
||||
get_workspace_info,
|
||||
send_a2a_message,
|
||||
)
|
||||
@ -410,9 +411,16 @@ async def tool_send_message_to_user(message: str, attachments: list[str] | None
|
||||
|
||||
async def tool_list_peers() -> str:
|
||||
"""List all workspaces this agent can communicate with."""
|
||||
peers = await get_peers()
|
||||
peers, diagnostic = await get_peers_with_diagnostic()
|
||||
if not peers:
|
||||
return "No peers available (this workspace may be isolated)"
|
||||
if diagnostic is not None:
|
||||
# Non-trivial empty: auth failure / 404 / 5xx / network — surface
|
||||
# the actual reason so the user/agent doesn't have to guess. #2397.
|
||||
return f"No peers found. {diagnostic}"
|
||||
return (
|
||||
"You have no peers in the platform registry. "
|
||||
"(No parent, no children, no siblings registered.)"
|
||||
)
|
||||
lines = []
|
||||
for p in peers:
|
||||
status = p.get("status", "unknown")
|
||||
|
||||
@ -577,6 +577,149 @@ class TestGetPeers:
|
||||
assert headers_sent.get("X-Workspace-ID") == a2a_client.WORKSPACE_ID
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# get_peers_with_diagnostic — issue #2397
|
||||
#
|
||||
# Pin: an empty peer list MUST come with an actionable diagnostic on every
|
||||
# non-200 + every transport failure. The bug was that get_peers swallowed
|
||||
# every failure mode behind `return []`, leaving the agent's tool wrapper
|
||||
# with no way to distinguish "you have no peers" from "auth broke" / "404
|
||||
# from registry" / "platform 5xx" / "network timeout". Each of these
|
||||
# requires a different operator action.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestGetPeersWithDiagnostic:
|
||||
|
||||
async def test_200_returns_peers_and_no_diagnostic(self):
|
||||
"""200 with valid list → (peers, None). diagnostic stays None on success."""
|
||||
import a2a_client
|
||||
|
||||
peers = [{"id": "ws-1", "name": "Alpha"}]
|
||||
resp = _make_response(200, peers)
|
||||
mock_client = _make_mock_client(get_resp=resp)
|
||||
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result, diag = await a2a_client.get_peers_with_diagnostic()
|
||||
|
||||
assert result == peers
|
||||
assert diag is None
|
||||
|
||||
async def test_200_empty_list_returns_no_diagnostic(self):
|
||||
"""200 with [] → (peers=[], diag=None). Truly no peers is success, not error."""
|
||||
import a2a_client
|
||||
|
||||
resp = _make_response(200, [])
|
||||
mock_client = _make_mock_client(get_resp=resp)
|
||||
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result, diag = await a2a_client.get_peers_with_diagnostic()
|
||||
|
||||
assert result == []
|
||||
assert diag is None
|
||||
|
||||
async def test_401_returns_auth_diagnostic(self):
|
||||
"""401 → diagnostic mentions auth + restart hint."""
|
||||
import a2a_client
|
||||
|
||||
resp = _make_response(401, {"detail": "unauthorized"})
|
||||
mock_client = _make_mock_client(get_resp=resp)
|
||||
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result, diag = await a2a_client.get_peers_with_diagnostic()
|
||||
|
||||
assert result == []
|
||||
assert diag is not None
|
||||
assert "401" in diag
|
||||
assert "Authentication" in diag or "authentication" in diag.lower()
|
||||
|
||||
async def test_403_returns_auth_diagnostic(self):
|
||||
"""403 → same auth-failure diagnostic shape as 401."""
|
||||
import a2a_client
|
||||
|
||||
resp = _make_response(403, {"detail": "forbidden"})
|
||||
mock_client = _make_mock_client(get_resp=resp)
|
||||
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result, diag = await a2a_client.get_peers_with_diagnostic()
|
||||
|
||||
assert result == []
|
||||
assert diag is not None
|
||||
assert "403" in diag
|
||||
|
||||
async def test_404_returns_registration_diagnostic(self):
|
||||
"""404 → diagnostic tells operator the workspace ID is missing from the registry."""
|
||||
import a2a_client
|
||||
|
||||
resp = _make_response(404, {"detail": "not found"})
|
||||
mock_client = _make_mock_client(get_resp=resp)
|
||||
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result, diag = await a2a_client.get_peers_with_diagnostic()
|
||||
|
||||
assert result == []
|
||||
assert diag is not None
|
||||
assert "404" in diag
|
||||
assert "registered" in diag.lower() or "registration" in diag.lower()
|
||||
|
||||
async def test_500_returns_platform_error_diagnostic(self):
|
||||
"""5xx → 'Platform error: HTTP <code>.'"""
|
||||
import a2a_client
|
||||
|
||||
resp = _make_response(503, {"detail": "service unavailable"})
|
||||
mock_client = _make_mock_client(get_resp=resp)
|
||||
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result, diag = await a2a_client.get_peers_with_diagnostic()
|
||||
|
||||
assert result == []
|
||||
assert diag is not None
|
||||
assert "503" in diag
|
||||
assert "Platform error" in diag or "platform error" in diag.lower()
|
||||
|
||||
async def test_network_exception_returns_unreachable_diagnostic(self):
|
||||
"""httpx exception → diagnostic mentions PLATFORM_URL + the underlying error."""
|
||||
import a2a_client
|
||||
|
||||
mock_client = _make_mock_client(get_exc=TimeoutError("connection timed out"))
|
||||
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result, diag = await a2a_client.get_peers_with_diagnostic()
|
||||
|
||||
assert result == []
|
||||
assert diag is not None
|
||||
assert "Cannot reach platform" in diag or "cannot reach" in diag.lower()
|
||||
assert "timed out" in diag
|
||||
|
||||
async def test_200_with_non_list_body_returns_diagnostic(self):
|
||||
"""200 but body is a dict → diagnostic flags shape mismatch (regression guard)."""
|
||||
import a2a_client
|
||||
|
||||
resp = _make_response(200, {"oops": "should have been a list"})
|
||||
mock_client = _make_mock_client(get_resp=resp)
|
||||
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result, diag = await a2a_client.get_peers_with_diagnostic()
|
||||
|
||||
assert result == []
|
||||
assert diag is not None
|
||||
assert "list" in diag.lower()
|
||||
|
||||
async def test_get_peers_shim_preserves_bare_list_contract(self):
|
||||
"""get_peers() still returns just list[dict] — no API break for non-tool callers."""
|
||||
import a2a_client
|
||||
|
||||
peers = [{"id": "ws-1", "name": "Alpha"}]
|
||||
resp = _make_response(200, peers)
|
||||
mock_client = _make_mock_client(get_resp=resp)
|
||||
|
||||
with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
|
||||
result = await a2a_client.get_peers()
|
||||
|
||||
# Must be a list, not a tuple — bare-list shim contract.
|
||||
assert isinstance(result, list)
|
||||
assert result == peers
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# get_workspace_info
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@ -536,11 +536,54 @@ class TestToolSendMessageToUser:
|
||||
|
||||
class TestToolListPeers:
|
||||
|
||||
async def test_no_peers_returns_isolated_message(self):
|
||||
async def test_true_empty_returns_no_peers_message_without_diagnostic(self):
|
||||
"""200 + empty list → 'no peers in the platform registry' (no failure)."""
|
||||
import a2a_tools
|
||||
with patch("a2a_tools.get_peers", return_value=[]):
|
||||
with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], None)):
|
||||
result = await a2a_tools.tool_list_peers()
|
||||
assert "No peers available" in result
|
||||
# The new wording explicitly says no peers exist (no parent/sibling/child).
|
||||
# Avoids the misleading "may be isolated" hint when discovery succeeded.
|
||||
assert "no peers" in result.lower()
|
||||
assert "No peers found." not in result # diagnostic prefix should NOT appear on the success branch
|
||||
assert "may be isolated" not in result
|
||||
|
||||
async def test_auth_failure_surfaces_restart_hint(self):
|
||||
"""401/403 → tool_list_peers must surface the auth failure + restart hint, not 'isolated'."""
|
||||
import a2a_tools
|
||||
diag = "Authentication to platform failed (HTTP 401). Restart the workspace to re-mint."
|
||||
with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)):
|
||||
result = await a2a_tools.tool_list_peers()
|
||||
assert "401" in result
|
||||
assert "Authentication" in result
|
||||
# The "isolated" message was the bug — make sure the regression doesn't return.
|
||||
assert "may be isolated" not in result
|
||||
|
||||
async def test_404_surfaces_registration_hint(self):
|
||||
"""404 → tool_list_peers tells the user re-registration is needed."""
|
||||
import a2a_tools
|
||||
diag = "Workspace ID ws-test is not registered with the platform (HTTP 404). Re-register."
|
||||
with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)):
|
||||
result = await a2a_tools.tool_list_peers()
|
||||
assert "404" in result
|
||||
assert "registered" in result.lower()
|
||||
|
||||
async def test_5xx_surfaces_platform_error(self):
|
||||
"""5xx → 'Platform error' surfaced; agent / user can correctly route to oncall."""
|
||||
import a2a_tools
|
||||
diag = "Platform error: HTTP 503."
|
||||
with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)):
|
||||
result = await a2a_tools.tool_list_peers()
|
||||
assert "503" in result
|
||||
assert "Platform error" in result
|
||||
|
||||
async def test_network_error_surfaces_unreachable(self):
|
||||
"""Network error → operator can tell that the workspace can't reach the platform at all."""
|
||||
import a2a_tools
|
||||
diag = "Cannot reach platform at http://platform.example: timed out"
|
||||
with patch("a2a_tools.get_peers_with_diagnostic", return_value=([], diag)):
|
||||
result = await a2a_tools.tool_list_peers()
|
||||
assert "Cannot reach platform" in result
|
||||
assert "timed out" in result
|
||||
|
||||
async def test_peers_returned_formatted_lines(self):
|
||||
"""Peers list is formatted as '- name (ID: ..., status: ..., role: ...)'."""
|
||||
@ -550,7 +593,7 @@ class TestToolListPeers:
|
||||
{"id": "ws-1", "name": "Alpha", "status": "online", "role": "worker"},
|
||||
{"id": "ws-2", "name": "Beta", "status": "idle", "role": "analyst"},
|
||||
]
|
||||
with patch("a2a_tools.get_peers", return_value=peers):
|
||||
with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)):
|
||||
result = await a2a_tools.tool_list_peers()
|
||||
|
||||
assert "Alpha" in result
|
||||
@ -567,7 +610,7 @@ class TestToolListPeers:
|
||||
# Clear any prior cache entries for these IDs
|
||||
a2a_tools._peer_names.pop("ws-cache-test", None)
|
||||
peers = [{"id": "ws-cache-test", "name": "CacheMe", "status": "online", "role": "w"}]
|
||||
with patch("a2a_tools.get_peers", return_value=peers):
|
||||
with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)):
|
||||
await a2a_tools.tool_list_peers()
|
||||
|
||||
assert a2a_tools._peer_names.get("ws-cache-test") == "CacheMe"
|
||||
@ -577,7 +620,7 @@ class TestToolListPeers:
|
||||
import a2a_tools
|
||||
|
||||
peers = [{"id": "ws-3", "name": "Gamma"}] # no status, no role
|
||||
with patch("a2a_tools.get_peers", return_value=peers):
|
||||
with patch("a2a_tools.get_peers_with_diagnostic", return_value=(peers, None)):
|
||||
result = await a2a_tools.tool_list_peers()
|
||||
|
||||
assert "Gamma" in result
|
||||
|
||||
Loading…
Reference in New Issue
Block a user