- Drop unused `import time` from inbound.py and `import call` from test_inbound.py (caught by ruff in CI; would have caught locally if I'd run it before pushing). - Rewrite the misleading comment in PollDelivery.run_once: the cursor DOES advance past handler exceptions (poison-pill resilience). The previous comment claimed otherwise, which would have confused future readers. - Drop `_parse_activity_row` from inbound.py's `__all__`. The leading underscore signals "private helper"; exposing it via `__all__` contradicted the convention. Tests still import it directly via the module path. - Add `test_fetch_inbound_429_retries_via_get_with_retry` — the PR description claimed branch-coverage of the 429 path but no test exercised it. Closes the gap.
372 lines
14 KiB
Python
372 lines
14 KiB
Python
"""Poll-mode inbound delivery for remote agents that can't expose an HTTP endpoint.
|
|
|
|
The :class:`A2AServer` companion (Phase 30.8b) covers the case where an agent
|
|
can host a publicly reachable HTTP endpoint and the platform pushes work to it.
|
|
Many real adopters can't — laptops behind NAT, ephemeral CI runners, hermes
|
|
self-hosted on a developer machine. For those, the platform queues inbound
|
|
A2A messages on the workspace's ``activity_logs`` and the agent polls.
|
|
|
|
This module provides:
|
|
|
|
* :class:`InboundMessage` — typed view over an ``activity_logs`` row that
|
|
carries an ``a2a_receive`` event. Source is normalized to ``canvas_user``
|
|
vs ``peer_agent`` so the SDK can route replies without the caller having
|
|
to know which envelope to use.
|
|
* :class:`CursorLostError` — raised when the activity endpoint returns
|
|
410 Gone (the cursor's row was rotated out). Caller resets and re-polls.
|
|
* :class:`InboundDelivery` — protocol that ``run_agent_loop`` accepts; both
|
|
:class:`PollDelivery` and :class:`PushDelivery` satisfy it.
|
|
* :class:`PollDelivery` — the new poll-mode implementation.
|
|
* :class:`PushDelivery` — thin wrapper over :class:`A2AServer` so the same
|
|
``run_agent_loop`` works for push-mode agents that expose an inbound URL.
|
|
|
|
Big-tech prior art: Slack Socket Mode, Telegram getUpdates, AWS SQS long
|
|
polling, Stripe ``stripe listen``. Same shape — cursor-based poll, SDK-owned
|
|
loop, single handler callback, smart-reply hidden behind the SDK.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import inspect
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import (
|
|
Any,
|
|
Awaitable,
|
|
Callable,
|
|
Literal,
|
|
Protocol,
|
|
TYPE_CHECKING,
|
|
runtime_checkable,
|
|
)
|
|
|
|
if TYPE_CHECKING:
|
|
from .client import RemoteAgentClient
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
InboundSource = Literal["canvas_user", "peer_agent", "unknown"]
|
|
|
|
|
|
@dataclass
|
|
class InboundMessage:
|
|
"""One inbound A2A event the agent must handle.
|
|
|
|
The ``activity_id`` is the cursor — pass it as ``since_id`` on the next
|
|
fetch to avoid re-receiving this message.
|
|
|
|
``source`` is normalized so the SDK can pick the reply transport:
|
|
|
|
* ``canvas_user`` — a user typing in the canvas chat. Reply via
|
|
``POST /workspaces/:id/notify``.
|
|
* ``peer_agent`` — another workspace's agent. Reply via
|
|
``POST /workspaces/:peer_id/a2a`` with a JSON-RPC envelope and
|
|
``X-Source-Workspace-Id`` header.
|
|
* ``unknown`` — the activity row didn't carry a recognizable source.
|
|
:py:meth:`RemoteAgentClient.reply` raises ``ValueError`` rather than
|
|
guess.
|
|
"""
|
|
|
|
activity_id: str
|
|
source: InboundSource
|
|
source_id: str
|
|
text: str
|
|
raw: dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
class CursorLostError(Exception):
|
|
"""Raised when ``GET /workspaces/:id/activity`` returns 410 Gone.
|
|
|
|
The platform retires old activity rows on a fixed window (see
|
|
workspace-server's activity_logs retention policy). If the agent's
|
|
cursor points at a row that has been rotated out, the server replies
|
|
410. Callers should reset the cursor (``since_id=None``) and re-poll;
|
|
they will catch up on whatever's still in the window.
|
|
"""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Activity row → InboundMessage parsing
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _parse_activity_row(row: dict[str, Any]) -> InboundMessage | None:
|
|
"""Convert one ``activity_logs`` row into an :class:`InboundMessage`.
|
|
|
|
Returns ``None`` if the row is malformed or doesn't carry text we can
|
|
deliver — preferable to raising and aborting the whole poll batch.
|
|
|
|
Activity row shape (per workspace-server's handlers/activity.go):
|
|
``{"id": ..., "type": "a2a_receive", "source_id": ..., "data": {...}, ...}``
|
|
"""
|
|
aid = str(row.get("id") or "")
|
|
if not aid:
|
|
return None
|
|
|
|
data = row.get("data") if isinstance(row.get("data"), dict) else {}
|
|
source_kind = str(data.get("source") or row.get("source") or "")
|
|
source_id = str(row.get("source_id") or data.get("source_id") or "")
|
|
|
|
# Normalize source. The platform uses "canvas_user" / "peer_agent" /
|
|
# sometimes "user" (legacy). Anything else falls into "unknown" so we
|
|
# don't accidentally route a reply down the wrong transport.
|
|
source: InboundSource
|
|
if source_kind in ("canvas_user", "user"):
|
|
source = "canvas_user"
|
|
elif source_kind == "peer_agent":
|
|
source = "peer_agent"
|
|
elif source_id and source_id != "user":
|
|
# Heuristic: a non-empty source_id that isn't the "user" sentinel
|
|
# is almost certainly a peer workspace.
|
|
source = "peer_agent"
|
|
elif source_id == "user":
|
|
source = "canvas_user"
|
|
else:
|
|
source = "unknown"
|
|
|
|
text = str(data.get("text") or data.get("message") or "")
|
|
|
|
return InboundMessage(
|
|
activity_id=aid,
|
|
source=source,
|
|
source_id=source_id,
|
|
text=text,
|
|
raw=row,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Handler + delivery protocol
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# A handler receives the inbound message + the client (so it can reply, fetch
|
|
# secrets, call peers, etc.) and returns either a reply string or None.
|
|
# Sync OR async — :class:`PollDelivery` detects ``Awaitable`` results and
|
|
# awaits them, mirroring the pattern in :class:`A2AServer`.
|
|
MessageHandler = Callable[
|
|
["InboundMessage", "RemoteAgentClient"],
|
|
"str | None | Awaitable[str | None]",
|
|
]
|
|
|
|
|
|
@runtime_checkable
|
|
class InboundDelivery(Protocol):
|
|
"""The contract :py:meth:`RemoteAgentClient.run_agent_loop` calls into.
|
|
|
|
Two implementations ship with the SDK:
|
|
|
|
* :class:`PollDelivery` — for agents without a reachable URL.
|
|
* :class:`PushDelivery` — for agents that host an A2AServer.
|
|
|
|
Third parties can supply their own (e.g. WebSocket, gRPC streaming)
|
|
by satisfying this protocol.
|
|
"""
|
|
|
|
def run_once(self, handler: MessageHandler) -> int:
|
|
"""Drain one batch of inbound messages and dispatch to handler.
|
|
|
|
Returns the count of messages dispatched. The caller's outer loop
|
|
decides cadence / sleep.
|
|
"""
|
|
...
|
|
|
|
def stop(self) -> None:
|
|
"""Release any resources (close sockets, stop background threads)."""
|
|
...
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# PollDelivery — the new path
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Default poll cadence. 5s gives <5s p50 latency for canvas-user messages
|
|
# while keeping load on workspace-server modest (one GET per agent per 5s).
|
|
# Slack Socket Mode runs at ~1s, Telegram getUpdates with timeout=30 is the
|
|
# canonical long-poll. We don't have long-poll support server-side yet, so
|
|
# fixed 5s is the conservative choice. Tunable via constructor.
|
|
DEFAULT_POLL_INTERVAL = 5.0
|
|
|
|
|
|
class PollDelivery:
|
|
"""Poll ``GET /workspaces/:id/activity?type=a2a_receive&since_id=…``.
|
|
|
|
The cursor is process-memory by default; a restart re-polls from
|
|
scratch, which is harmless because handlers should be idempotent
|
|
(the platform makes no exactly-once guarantees on activity poll —
|
|
the same SDK-level convention as Slack Events API).
|
|
|
|
Pass ``cursor_file`` to persist the cursor across restarts:
|
|
|
|
PollDelivery(client, cursor_file=Path("~/.molecule/cursor"))
|
|
|
|
Cursor-loss (HTTP 410) is handled transparently — the cursor is
|
|
reset to ``None`` and the next poll starts fresh with whatever's in
|
|
the activity window.
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
client: "RemoteAgentClient",
|
|
interval: float = DEFAULT_POLL_INTERVAL,
|
|
type: str = "a2a_receive",
|
|
limit: int = 100,
|
|
cursor_file: Path | None = None,
|
|
) -> None:
|
|
self._client = client
|
|
self.interval = interval
|
|
self.type = type
|
|
self.limit = limit
|
|
self._cursor_file = cursor_file
|
|
self._cursor: str | None = self._load_cursor()
|
|
self._stopped = False
|
|
|
|
def _load_cursor(self) -> str | None:
|
|
if self._cursor_file is None or not self._cursor_file.exists():
|
|
return None
|
|
try:
|
|
cur = self._cursor_file.read_text().strip()
|
|
return cur or None
|
|
except OSError as exc:
|
|
logger.warning("could not read cursor file %s: %s", self._cursor_file, exc)
|
|
return None
|
|
|
|
def _save_cursor(self) -> None:
|
|
if self._cursor_file is None or self._cursor is None:
|
|
return
|
|
try:
|
|
self._cursor_file.parent.mkdir(parents=True, exist_ok=True)
|
|
self._cursor_file.write_text(self._cursor)
|
|
except OSError as exc:
|
|
logger.warning("could not write cursor file %s: %s", self._cursor_file, exc)
|
|
|
|
@property
|
|
def cursor(self) -> str | None:
|
|
"""Current cursor (``activity_id`` of the most recently dispatched
|
|
message). Useful for tests and observability."""
|
|
return self._cursor
|
|
|
|
def run_once(self, handler: MessageHandler) -> int:
|
|
"""Fetch one batch and dispatch each message to ``handler``.
|
|
|
|
Returns the number of messages dispatched. The cursor advances past
|
|
every dispatched row, including ones whose handler raised — a
|
|
poison-pill input shouldn't block the queue forever. The handler
|
|
is responsible for surfacing its own errors via logging or its own
|
|
observability. This matches Slack Events delivery and SQS DLQ
|
|
semantics; the platform makes no exactly-once guarantees on
|
|
activity poll, so handlers must be idempotent regardless.
|
|
"""
|
|
if self._stopped:
|
|
return 0
|
|
try:
|
|
batch = self._client.fetch_inbound(
|
|
since_id=self._cursor,
|
|
limit=self.limit,
|
|
type=self.type,
|
|
)
|
|
except CursorLostError:
|
|
logger.info("cursor %s lost (410 Gone) — resetting", self._cursor)
|
|
self._cursor = None
|
|
return 0
|
|
|
|
dispatched = 0
|
|
for msg in batch:
|
|
try:
|
|
self._dispatch(handler, msg)
|
|
except Exception as exc:
|
|
# Log + continue. We DO advance the cursor past this message
|
|
# so a poison-pill input doesn't block the queue forever —
|
|
# this matches how Slack Events delivers and how SQS DLQs
|
|
# work. The handler is expected to surface its own errors
|
|
# via logging or its own observability.
|
|
logger.exception("handler raised on activity %s: %s", msg.activity_id, exc)
|
|
self._cursor = msg.activity_id
|
|
dispatched += 1
|
|
|
|
if dispatched:
|
|
self._save_cursor()
|
|
return dispatched
|
|
|
|
def _dispatch(self, handler: MessageHandler, msg: "InboundMessage") -> None:
|
|
"""Invoke handler, await if async, send the reply if returned."""
|
|
result = handler(msg, self._client)
|
|
if inspect.isawaitable(result):
|
|
# Detect a running loop without using the deprecated
|
|
# asyncio.get_event_loop() (Py3.12+). If a loop is running we
|
|
# refuse — the caller is async and should await the handler
|
|
# themselves; we can't synchronously block on an awaitable
|
|
# without deadlocking the running loop.
|
|
try:
|
|
asyncio.get_running_loop()
|
|
except RuntimeError:
|
|
# No running loop — safe to spin up a fresh one. Mirrors
|
|
# A2AServer's pattern: build, run, close. asyncio.run is
|
|
# the modern equivalent of new_loop+run_until_complete+close
|
|
# and handles the close even on exception.
|
|
result = asyncio.run(result) # type: ignore[arg-type]
|
|
else:
|
|
raise RuntimeError(
|
|
"PollDelivery.run_once was called from inside a running "
|
|
"event loop with an async handler. Use a sync handler "
|
|
"here, or schedule run_once on a worker thread via "
|
|
"asyncio.to_thread()."
|
|
)
|
|
|
|
reply_text = result if isinstance(result, str) else None
|
|
if reply_text:
|
|
try:
|
|
self._client.reply(msg, reply_text)
|
|
except Exception as exc:
|
|
logger.warning("reply send failed for activity %s: %s", msg.activity_id, exc)
|
|
|
|
def stop(self) -> None:
|
|
self._stopped = True
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# PushDelivery — wraps the existing A2AServer
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class PushDelivery:
|
|
"""Adapt :class:`A2AServer` to the :class:`InboundDelivery` protocol.
|
|
|
|
Use this when the agent CAN expose a reachable HTTP endpoint. The
|
|
A2AServer runs in its own thread and dispatches to ``handler`` as
|
|
HTTP requests arrive — ``run_once`` is a no-op (the loop driver in
|
|
:py:meth:`RemoteAgentClient.run_agent_loop` simply sleeps and
|
|
keeps the heartbeat alive).
|
|
"""
|
|
|
|
def __init__(self, client: "RemoteAgentClient", server: Any) -> None:
|
|
# ``server`` typed Any to avoid a circular import; it's an A2AServer.
|
|
self._client = client
|
|
self._server = server
|
|
|
|
def run_once(self, handler: MessageHandler) -> int: # noqa: ARG002 — handler unused
|
|
# A2AServer dispatches synchronously on its own thread; nothing
|
|
# for the outer loop to do per-tick.
|
|
return 0
|
|
|
|
def stop(self) -> None:
|
|
try:
|
|
self._server.stop()
|
|
except Exception as exc:
|
|
logger.warning("PushDelivery stop: A2AServer.stop raised: %s", exc)
|
|
|
|
|
|
__all__ = [
|
|
"CursorLostError",
|
|
"DEFAULT_POLL_INTERVAL",
|
|
"InboundDelivery",
|
|
"InboundMessage",
|
|
"InboundSource",
|
|
"MessageHandler",
|
|
"PollDelivery",
|
|
"PushDelivery",
|
|
]
|