# molecule-sdk-python/molecule_agent/inbound.py

"""Poll-mode inbound delivery for remote agents that can't expose an HTTP endpoint.

The :class:`A2AServer` companion (Phase 30.8b) covers the case where an agent
can host a publicly reachable HTTP endpoint and the platform pushes work to it.
Many real adopters can't — laptops behind NAT, ephemeral CI runners, hermes
self-hosted on a developer machine. For those, the platform queues inbound
A2A messages on the workspace's ``activity_logs`` and the agent polls.

This module provides:

* :class:`InboundMessage` — typed view over an ``activity_logs`` row that
  carries an ``a2a_receive`` event. Source is normalized to ``canvas_user``
  vs ``peer_agent`` so the SDK can route replies without the caller having
  to know which envelope to use.
* :class:`CursorLostError` — raised when the activity endpoint returns
  410 Gone (the cursor's row was rotated out). Caller resets and re-polls.
* :class:`InboundDelivery` — protocol that ``run_agent_loop`` accepts; both
  :class:`PollDelivery` and :class:`PushDelivery` satisfy it.
* :class:`PollDelivery` — the new poll-mode implementation.
* :class:`PushDelivery` — thin wrapper over :class:`A2AServer` so the same
  ``run_agent_loop`` works for push-mode agents that expose an inbound URL.

Big-tech prior art: Slack Socket Mode, Telegram getUpdates, AWS SQS long
polling, Stripe ``stripe listen``. Same shape — cursor-based poll, SDK-owned
loop, single handler callback, smart-reply hidden behind the SDK.
"""
from __future__ import annotations

import asyncio
import inspect
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import (
    Any,
    Awaitable,
    Callable,
    Literal,
    Protocol,
    TYPE_CHECKING,
    runtime_checkable,
)

if TYPE_CHECKING:
    from .client import RemoteAgentClient

# Module-level logger named after this module's import path; every warning
# and error emitted by the delivery classes below goes through it.
logger = logging.getLogger(__name__)


# Normalized origin of an inbound message. ``"unknown"`` is the fallback used
# by ``_parse_activity_row`` when a row carries no recognizable source.
InboundSource = Literal["canvas_user", "peer_agent", "unknown"]


@dataclass
class InboundMessage:
    """A single inbound A2A event that the agent is expected to handle.

    ``activity_id`` doubles as the poll cursor: hand it back as ``since_id``
    on the following fetch so this message is not delivered again.

    ``source`` is a normalized label the SDK uses to choose how a reply is
    sent:

    * ``canvas_user`` — somebody typing in the canvas chat; the reply goes
      to ``POST /workspaces/:id/notify``.
    * ``peer_agent`` — an agent in another workspace; the reply goes to
      ``POST /workspaces/:peer_id/a2a`` wrapped in a JSON-RPC envelope and
      carrying an ``X-Source-Workspace-Id`` header.
    * ``unknown`` — no recognizable source on the activity row.
      :py:meth:`RemoteAgentClient.reply` raises ``ValueError`` instead of
      guessing a transport.
    """

    # Identifier of the activity row; also the cursor value.
    activity_id: str
    # Normalized origin label (see the class docstring).
    source: InboundSource
    # Sender identifier copied from the row; may be the empty string.
    source_id: str
    # Message body; may be the empty string.
    text: str
    # The activity row this message was built from; a fresh empty dict
    # per instance when not supplied.
    raw: dict[str, Any] = field(default_factory=dict)


class CursorLostError(Exception):
    """Signals a 410 Gone response from ``GET /workspaces/:id/activity``.

    Activity rows are retired by the platform on a fixed window (see the
    activity_logs retention policy in workspace-server). When the agent's
    cursor refers to a row that has already been rotated out, the server
    answers 410. The expected recovery is to drop the cursor
    (``since_id=None``) and poll again, which catches the agent up on
    whatever is still inside the window.
    """


# ---------------------------------------------------------------------------
# Activity row → InboundMessage parsing
# ---------------------------------------------------------------------------


def _parse_activity_row(row: dict[str, Any]) -> InboundMessage | None:
    """Convert one ``activity_logs`` row into an :class:`InboundMessage`.

    Returns ``None`` if the row is malformed or doesn't carry text we can
    deliver — preferable to raising and aborting the whole poll batch.

    Activity row shape (per workspace-server's handlers/activity.go):
    ``{"id": ..., "type": "a2a_receive", "source_id": ..., "data": {...}, ...}``
    """
    aid = str(row.get("id") or "")
    if not aid:
        return None

    data = row.get("data") if isinstance(row.get("data"), dict) else {}
    source_kind = str(data.get("source") or row.get("source") or "")
    source_id = str(row.get("source_id") or data.get("source_id") or "")

    # Normalize source. The platform uses "canvas_user" / "peer_agent" /
    # sometimes "user" (legacy). Anything else falls into "unknown" so we
    # don't accidentally route a reply down the wrong transport.
    source: InboundSource
    if source_kind in ("canvas_user", "user"):
        source = "canvas_user"
    elif source_kind == "peer_agent":
        source = "peer_agent"
    elif source_id and source_id != "user":
        # Heuristic: a non-empty source_id that isn't the "user" sentinel
        # is almost certainly a peer workspace.
        source = "peer_agent"
    elif source_id == "user":
        source = "canvas_user"
    else:
        source = "unknown"

    text = str(data.get("text") or data.get("message") or "")

    return InboundMessage(
        activity_id=aid,
        source=source,
        source_id=source_id,
        text=text,
        raw=row,
    )


# ---------------------------------------------------------------------------
# Handler + delivery protocol
# ---------------------------------------------------------------------------


# Signature of the user-supplied callback. It is called with the inbound
# message and the client (so it can reply, fetch secrets, call peers, etc.)
# and returns either a reply string or None. The callback may be sync OR
# async — :class:`PollDelivery` detects awaitable results and awaits them,
# mirroring the pattern in :class:`A2AServer`.
# The type arguments are written as strings so they are not evaluated at
# import time (``RemoteAgentClient`` is only imported under TYPE_CHECKING).
MessageHandler = Callable[
    ["InboundMessage", "RemoteAgentClient"],
    "str | None | Awaitable[str | None]",
]


@runtime_checkable
class InboundDelivery(Protocol):
    """Structural interface used by :py:meth:`RemoteAgentClient.run_agent_loop`.

    The SDK ships two classes that satisfy it:

    * :class:`PollDelivery` — for agents without a reachable URL.
    * :class:`PushDelivery` — for agents that host an A2AServer.

    Any other object exposing the same two methods (for instance a
    WebSocket or gRPC-streaming transport written by a third party) is
    accepted as well.
    """

    def run_once(self, handler: MessageHandler) -> int:
        """Drain a single batch of inbound messages through ``handler``.

        The return value is how many messages were dispatched. Cadence and
        sleeping between calls are the responsibility of the caller's
        outer loop.
        """

    def stop(self) -> None:
        """Free whatever the delivery holds (sockets, background threads)."""


# ---------------------------------------------------------------------------
# PollDelivery — the new path
# ---------------------------------------------------------------------------


# Default poll cadence. 5s gives <5s p50 latency for canvas-user messages
# while keeping load on workspace-server modest (one GET per agent per 5s).
# Slack Socket Mode runs at ~1s, Telegram getUpdates with timeout=30 is the
# canonical long-poll. We don't have long-poll support server-side yet, so
# fixed 5s is the conservative choice. Tunable via constructor.
DEFAULT_POLL_INTERVAL = 5.0


class PollDelivery:
    """Poll ``GET /workspaces/:id/activity?type=a2a_receive&since_id=…``.

    The cursor is process-memory by default; a restart re-polls from
    scratch, which is harmless because handlers should be idempotent
    (the platform makes no exactly-once guarantees on activity poll —
    the same SDK-level convention as Slack Events API).

    Pass ``cursor_file`` to persist the cursor across restarts:

        PollDelivery(client, cursor_file=Path("~/.molecule/cursor"))

    A leading ``~`` in ``cursor_file`` is expanded to the user's home
    directory.

    Cursor-loss (HTTP 410) is handled transparently — the cursor is
    reset to ``None`` and the next poll starts fresh with whatever's in
    the activity window.
    """

    def __init__(
        self,
        client: "RemoteAgentClient",
        interval: float = DEFAULT_POLL_INTERVAL,
        type: str = "a2a_receive",
        limit: int = 100,
        cursor_file: Path | str | None = None,
    ) -> None:
        """Set up the poller and load any persisted cursor.

        Args:
            client: Client used to fetch inbound messages and send replies.
            interval: Poll cadence in seconds. Stored for the caller's
                outer loop; ``run_once`` itself never sleeps.
            type: Activity type passed through to ``fetch_inbound``.
            limit: Maximum batch size passed through to ``fetch_inbound``.
            cursor_file: Optional file used to persist the cursor across
                restarts. ``None`` keeps the cursor in memory only.
        """
        self._client = client
        self.interval = interval
        self.type = type
        self.limit = limit
        self._cursor_file = self._resolve_cursor_file(cursor_file)
        self._cursor: str | None = self._load_cursor()
        self._stopped = False

    @staticmethod
    def _resolve_cursor_file(cursor_file: Path | str | None) -> Path | None:
        """Normalize ``cursor_file`` to a ``Path`` with ``~`` expanded."""
        if cursor_file is None:
            return None
        path = Path(cursor_file)
        try:
            # Without this, "~/.molecule/cursor" would create a literal
            # "~" directory under the current working directory.
            return path.expanduser()
        except RuntimeError as exc:
            # expanduser() raises when no home directory can be resolved;
            # fall back to the path exactly as given.
            logger.warning("could not expand cursor file %s: %s", path, exc)
            return path

    def _load_cursor(self) -> str | None:
        """Return the persisted cursor, or ``None`` if absent or unreadable."""
        if self._cursor_file is None or not self._cursor_file.exists():
            return None
        try:
            cur = self._cursor_file.read_text().strip()
            # An empty or whitespace-only file counts as "no cursor".
            return cur or None
        except OSError as exc:
            logger.warning("could not read cursor file %s: %s", self._cursor_file, exc)
            return None

    def _save_cursor(self) -> None:
        """Persist the current cursor; best-effort, never raises ``OSError``."""
        if self._cursor_file is None or self._cursor is None:
            return
        try:
            self._cursor_file.parent.mkdir(parents=True, exist_ok=True)
            self._cursor_file.write_text(self._cursor)
        except OSError as exc:
            logger.warning("could not write cursor file %s: %s", self._cursor_file, exc)

    @property
    def cursor(self) -> str | None:
        """Current cursor (``activity_id`` of the most recently dispatched
        message). Useful for tests and observability."""
        return self._cursor

    def run_once(self, handler: MessageHandler) -> int:
        """Fetch one batch and dispatch each message to ``handler``.

        Returns the number of messages dispatched. The cursor advances past
        every dispatched row, including ones whose handler raised — a
        poison-pill input shouldn't block the queue forever. The handler
        is responsible for surfacing its own errors via logging or its own
        observability. This matches Slack Events delivery and SQS DLQ
        semantics; the platform makes no exactly-once guarantees on
        activity poll, so handlers must be idempotent regardless.

        Returns 0 without fetching once :meth:`stop` has been called, and
        0 after resetting the cursor when the server reports it lost.
        """
        if self._stopped:
            return 0
        try:
            batch = self._client.fetch_inbound(
                since_id=self._cursor,
                limit=self.limit,
                type=self.type,
            )
        except CursorLostError:
            logger.info("cursor %s lost (410 Gone) — resetting", self._cursor)
            self._cursor = None
            return 0

        dispatched = 0
        for msg in batch:
            try:
                self._dispatch(handler, msg)
            except Exception as exc:
                # Log + continue. We DO advance the cursor past this message
                # so a poison-pill input doesn't block the queue forever —
                # this matches how Slack Events delivers and how SQS DLQs
                # work. The handler is expected to surface its own errors
                # via logging or its own observability.
                logger.exception("handler raised on activity %s: %s", msg.activity_id, exc)
            self._cursor = msg.activity_id
            dispatched += 1

        # Persist once per batch rather than once per message.
        if dispatched:
            self._save_cursor()
        return dispatched

    @staticmethod
    async def _await_result(awaitable: Any) -> Any:
        """Await ``awaitable`` from inside a real coroutine.

        ``asyncio.run`` only accepts coroutine objects; wrapping lets
        handlers return any awaitable (a Future, or an object defining
        ``__await__``).
        """
        return await awaitable

    def _dispatch(self, handler: MessageHandler, msg: "InboundMessage") -> None:
        """Invoke handler, await if async, send the reply if returned.

        Raises:
            RuntimeError: If the handler returned an awaitable while an
                event loop is already running in this thread.
        """
        result = handler(msg, self._client)
        if inspect.isawaitable(result):
            # Detect a running loop without using the deprecated
            # asyncio.get_event_loop() (Py3.12+).
            try:
                asyncio.get_running_loop()
            except RuntimeError:
                loop_running = False
            else:
                loop_running = True

            if loop_running:
                # We can't synchronously block on an awaitable without
                # deadlocking the running loop, so refuse. Close a
                # coroutine first so it doesn't emit a "never awaited"
                # RuntimeWarning when garbage-collected.
                close = getattr(result, "close", None)
                if callable(close):
                    close()
                raise RuntimeError(
                    "PollDelivery.run_once was called from inside a running "
                    "event loop with an async handler. Use a sync handler "
                    "here, or schedule run_once on a worker thread via "
                    "asyncio.to_thread()."
                )

            # No running loop — safe to spin up a fresh one. asyncio.run
            # builds, runs and closes the loop, even on exception.
            result = asyncio.run(self._await_result(result))

        # Only a non-empty string counts as a reply; None, "" and other
        # types mean "nothing to send".
        reply_text = result if isinstance(result, str) else None
        if reply_text:
            try:
                self._client.reply(msg, reply_text)
            except Exception as exc:
                logger.warning("reply send failed for activity %s: %s", msg.activity_id, exc)

    def stop(self) -> None:
        """Mark the delivery stopped; later ``run_once`` calls return 0."""
        self._stopped = True


# ---------------------------------------------------------------------------
# PushDelivery — wraps the existing A2AServer
# ---------------------------------------------------------------------------


class PushDelivery:
    """:class:`InboundDelivery` adapter around an :class:`A2AServer`.

    Intended for agents that CAN expose a reachable HTTP endpoint. The
    A2AServer runs in its own thread and hands requests to ``handler`` as
    they arrive, so ``run_once`` has nothing to do — the loop driver in
    :py:meth:`RemoteAgentClient.run_agent_loop` simply sleeps and keeps
    the heartbeat alive.
    """

    def __init__(self, client: "RemoteAgentClient", server: Any) -> None:
        # ``server`` is an A2AServer; it is annotated ``Any`` only to avoid
        # a circular import.
        self._server = server
        self._client = client

    def run_once(self, handler: MessageHandler) -> int:  # noqa: ARG002 — handler unused
        """Do nothing and report zero dispatched messages.

        Dispatch happens synchronously on the A2AServer's own thread, so
        there is no per-tick work for the outer loop.
        """
        return 0

    def stop(self) -> None:
        """Stop the wrapped server; a failure is logged, never raised."""
        try:
            self._server.stop()
        except Exception as err:
            logger.warning("PushDelivery stop: A2AServer.stop raised: %s", err)


# Names exported by ``from <this module> import *``, in alphabetical order.
# The underscore-prefixed ``_parse_activity_row`` helper is not exported.
__all__ = [
    "CursorLostError",
    "DEFAULT_POLL_INTERVAL",
    "InboundDelivery",
    "InboundMessage",
    "InboundSource",
    "MessageHandler",
    "PollDelivery",
    "PushDelivery",
]