molecule-sdk-python/molecule_agent/inbound.py
Hongming Wang 70d66cd814 feat: poll-mode inbound delivery + molecule connect CLI (Phase 30.8c)
External agents that can't expose a public HTTP endpoint (laptops behind
NAT, ephemeral CI runners, hermes self-hosted, codex et al) had to reverse-
engineer the activity-poll loop from molecule-mcp-claude-channel/server.ts
because the SDK only shipped the push-mode `A2AServer` (Phase 30.8b).

This adds the complementary path:

- `RemoteAgentClient.fetch_inbound(since_id=…)` — one-shot GET against
  `/workspaces/:id/activity?type=a2a_receive&since_id=…`. Cursor-loss (410)
  surfaces as `CursorLostError`; caller resets and re-polls.
- `RemoteAgentClient.reply(msg, text)` — smart-routes to `/notify` for
  canvas users, `/a2a` (JSON-RPC envelope + X-Source-Workspace-Id) for peer
  agents. Hides the reply-path bifurcation from connector authors.
- `PollDelivery` / `PushDelivery` / `InboundDelivery` protocol — same
  `MessageHandler` callback works for both transports.
- `RemoteAgentClient.run_agent_loop(handler, delivery=None)` — combined
  heartbeat + state-poll + inbound dispatch. Defaults to `PollDelivery`.
  Async handlers detected and `asyncio.run`'d (matches A2AServer pattern).
  Sleep cadence = min(heartbeat_interval, delivery.interval).
- `python -m molecule_agent connect` CLI — one-line bootstrap. Loads a
  user's `module:function` via importlib, registers, runs the loop until
  pause/delete or SIGTERM. All flags also read from environment variables.
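A connector author's side of this can be sketched in a few lines. The handler below is a minimal example of the `MessageHandler` shape described above ((msg, client) in, reply string or None out); the surrounding bootstrap (`RemoteAgentClient`, `run_agent_loop`) is assumed and not shown, and the names here are illustrative rather than the final SDK surface:

```python
# Hypothetical handler a connector author would pass to run_agent_loop.
# msg is anything with .source and .text, as InboundMessage provides;
# returning a string sends a reply via client.reply(), which smart-routes
# canvas vs peer transports. Returning None stays silent.
def handle(msg, client):
    """Return a reply string to send back, or None to stay silent."""
    if msg.source == "canvas_user" and msg.text.strip().lower() == "ping":
        return "pong"
    return None  # peer traffic and everything else: no auto-reply
```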

Tests: 50 new (test_inbound.py, test_cli_connect.py) covering every prod
branch — source normalization, cursor advancement, 410 reset, async/sync
handler dispatch, handler exception → log+continue+advance, smart-reply
routing for canvas vs peer vs unknown sources, run_agent_loop terminal
states, sleep-interval selection, CLI handler resolution failures.
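The 410-reset contract those tests exercise can be sketched self-containedly. `FakeClient` below is a stand-in for `RemoteAgentClient.fetch_inbound`; only the control flow (reset cursor on `CursorLostError`, re-poll fresh) is meant to match the SDK:

```python
# Sketch of the cursor-loss recovery loop: a 410 surfaces as
# CursorLostError, the caller resets the cursor to None, and the next
# poll starts from the head of the retention window.
class CursorLostError(Exception):
    pass

class FakeClient:
    def __init__(self):
        self.calls = []

    def fetch_inbound(self, since_id=None):
        self.calls.append(since_id)
        if since_id == "stale":          # simulate a rotated-out cursor
            raise CursorLostError()
        return [{"id": "n1"}, {"id": "n2"}]

def poll_once(client, cursor):
    """One iteration: returns (rows, new_cursor)."""
    try:
        rows = client.fetch_inbound(since_id=cursor)
    except CursorLostError:
        return [], None                  # reset; caller re-polls fresh
    return rows, (rows[-1]["id"] if rows else cursor)

client = FakeClient()
rows, cursor = poll_once(client, "stale")   # 410 -> cursor reset
assert cursor is None
rows, cursor = poll_once(client, cursor)    # fresh poll catches up
```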

Resolves #17.
2026-04-30 13:03:44 -07:00


"""Poll-mode inbound delivery for remote agents that can't expose an HTTP endpoint.
The :class:`A2AServer` companion (Phase 30.8b) covers the case where an agent
can host a publicly reachable HTTP endpoint and the platform pushes work to it.
Many real adopters can't — laptops behind NAT, ephemeral CI runners, hermes
self-hosted on a developer machine. For those, the platform queues inbound
A2A messages on the workspace's ``activity_logs`` and the agent polls.
This module provides:
* :class:`InboundMessage` — typed view over an ``activity_logs`` row that
carries an ``a2a_receive`` event. Source is normalized to ``canvas_user``
vs ``peer_agent`` so the SDK can route replies without the caller having
to know which envelope to use.
* :class:`CursorLostError` — raised when the activity endpoint returns
410 Gone (the cursor's row was rotated out). Caller resets and re-polls.
* :class:`InboundDelivery` — protocol that ``run_agent_loop`` accepts; both
:class:`PollDelivery` and :class:`PushDelivery` satisfy it.
* :class:`PollDelivery` — the new poll-mode implementation.
* :class:`PushDelivery` — thin wrapper over :class:`A2AServer` so the same
``run_agent_loop`` works for push-mode agents that expose an inbound URL.
Big-tech prior art: Slack Socket Mode, Telegram getUpdates, AWS SQS long
polling, Stripe ``stripe listen``. Same shape — cursor-based poll, SDK-owned
loop, single handler callback, smart-reply hidden behind the SDK.
"""
from __future__ import annotations

import asyncio
import inspect
import logging
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import (
    Any,
    Awaitable,
    Callable,
    Literal,
    Protocol,
    TYPE_CHECKING,
    runtime_checkable,
)

if TYPE_CHECKING:
    from .client import RemoteAgentClient

logger = logging.getLogger(__name__)

InboundSource = Literal["canvas_user", "peer_agent", "unknown"]


@dataclass
class InboundMessage:
    """One inbound A2A event the agent must handle.

    The ``activity_id`` is the cursor — pass it as ``since_id`` on the next
    fetch to avoid re-receiving this message.

    ``source`` is normalized so the SDK can pick the reply transport:

    * ``canvas_user`` — a user typing in the canvas chat. Reply via
      ``POST /workspaces/:id/notify``.
    * ``peer_agent`` — another workspace's agent. Reply via
      ``POST /workspaces/:peer_id/a2a`` with a JSON-RPC envelope and
      ``X-Source-Workspace-Id`` header.
    * ``unknown`` — the activity row didn't carry a recognizable source.
      :py:meth:`RemoteAgentClient.reply` raises ``ValueError`` rather than
      guess.
    """

    activity_id: str
    source: InboundSource
    source_id: str
    text: str
    raw: dict[str, Any] = field(default_factory=dict)

class CursorLostError(Exception):
    """Raised when ``GET /workspaces/:id/activity`` returns 410 Gone.

    The platform retires old activity rows on a fixed window (see
    workspace-server's activity_logs retention policy). If the agent's
    cursor points at a row that has been rotated out, the server replies
    410. Callers should reset the cursor (``since_id=None``) and re-poll;
    they will catch up on whatever's still in the window.
    """

# ---------------------------------------------------------------------------
# Activity row → InboundMessage parsing
# ---------------------------------------------------------------------------


def _parse_activity_row(row: dict[str, Any]) -> InboundMessage | None:
    """Convert one ``activity_logs`` row into an :class:`InboundMessage`.

    Returns ``None`` if the row is malformed or doesn't carry text we can
    deliver — preferable to raising and aborting the whole poll batch.

    Activity row shape (per workspace-server's handlers/activity.go)::

        {"id": ..., "type": "a2a_receive", "source_id": ..., "data": {...}, ...}
    """
    aid = str(row.get("id") or "")
    if not aid:
        return None
    data = row.get("data") if isinstance(row.get("data"), dict) else {}
    source_kind = str(data.get("source") or row.get("source") or "")
    source_id = str(row.get("source_id") or data.get("source_id") or "")
    # Normalize source. The platform uses "canvas_user" / "peer_agent" /
    # sometimes "user" (legacy). Anything else falls into "unknown" so we
    # don't accidentally route a reply down the wrong transport.
    source: InboundSource
    if source_kind in ("canvas_user", "user"):
        source = "canvas_user"
    elif source_kind == "peer_agent":
        source = "peer_agent"
    elif source_id and source_id != "user":
        # Heuristic: a non-empty source_id that isn't the "user" sentinel
        # is almost certainly a peer workspace.
        source = "peer_agent"
    elif source_id == "user":
        source = "canvas_user"
    else:
        source = "unknown"
    text = str(data.get("text") or data.get("message") or "")
    return InboundMessage(
        activity_id=aid,
        source=source,
        source_id=source_id,
        text=text,
        raw=row,
    )

# ---------------------------------------------------------------------------
# Handler + delivery protocol
# ---------------------------------------------------------------------------

# A handler receives the inbound message + the client (so it can reply, fetch
# secrets, call peers, etc.) and returns either a reply string or None.
# Sync OR async — :class:`PollDelivery` detects ``Awaitable`` results and
# awaits them, mirroring the pattern in :class:`A2AServer`.
MessageHandler = Callable[
    ["InboundMessage", "RemoteAgentClient"],
    "str | None | Awaitable[str | None]",
]


@runtime_checkable
class InboundDelivery(Protocol):
    """The contract :py:meth:`RemoteAgentClient.run_agent_loop` calls into.

    Two implementations ship with the SDK:

    * :class:`PollDelivery` — for agents without a reachable URL.
    * :class:`PushDelivery` — for agents that host an A2AServer.

    Third parties can supply their own (e.g. WebSocket, gRPC streaming)
    by satisfying this protocol.
    """

    def run_once(self, handler: MessageHandler) -> int:
        """Drain one batch of inbound messages and dispatch to handler.

        Returns the count of messages dispatched. The caller's outer loop
        decides cadence / sleep.
        """
        ...

    def stop(self) -> None:
        """Release any resources (close sockets, stop background threads)."""
        ...

# ---------------------------------------------------------------------------
# PollDelivery — the new path
# ---------------------------------------------------------------------------

# Default poll cadence. 5s gives <5s p50 latency for canvas-user messages
# while keeping load on workspace-server modest (one GET per agent per 5s).
# Slack Socket Mode runs at ~1s, Telegram getUpdates with timeout=30 is the
# canonical long-poll. We don't have long-poll support server-side yet, so
# fixed 5s is the conservative choice. Tunable via constructor.
DEFAULT_POLL_INTERVAL = 5.0


class PollDelivery:
    """Poll ``GET /workspaces/:id/activity?type=a2a_receive&since_id=…``.

    The cursor is process-memory by default; a restart re-polls from
    scratch, which is harmless because handlers should be idempotent
    (the platform makes no exactly-once guarantees on activity poll —
    the same SDK-level convention as Slack Events API).

    Pass ``cursor_file`` to persist the cursor across restarts::

        PollDelivery(client, cursor_file=Path("~/.molecule/cursor"))

    Cursor-loss (HTTP 410) is handled transparently — the cursor is
    reset to ``None`` and the next poll starts fresh with whatever's in
    the activity window.
    """

    def __init__(
        self,
        client: "RemoteAgentClient",
        interval: float = DEFAULT_POLL_INTERVAL,
        type: str = "a2a_receive",
        limit: int = 100,
        cursor_file: Path | None = None,
    ) -> None:
        self._client = client
        self.interval = interval
        self.type = type
        self.limit = limit
        self._cursor_file = cursor_file
        self._cursor: str | None = self._load_cursor()
        self._stopped = False

    def _load_cursor(self) -> str | None:
        if self._cursor_file is None or not self._cursor_file.exists():
            return None
        try:
            cur = self._cursor_file.read_text().strip()
            return cur or None
        except OSError as exc:
            logger.warning("could not read cursor file %s: %s", self._cursor_file, exc)
            return None

    def _save_cursor(self) -> None:
        if self._cursor_file is None or self._cursor is None:
            return
        try:
            self._cursor_file.parent.mkdir(parents=True, exist_ok=True)
            self._cursor_file.write_text(self._cursor)
        except OSError as exc:
            logger.warning("could not write cursor file %s: %s", self._cursor_file, exc)

    @property
    def cursor(self) -> str | None:
        """Current cursor (``activity_id`` of the most recently dispatched
        message). Useful for tests and observability."""
        return self._cursor

    def run_once(self, handler: MessageHandler) -> int:
        """Fetch one batch and dispatch each message to ``handler``.

        Returns the number of messages dispatched. A handler exception is
        logged but does not abort the batch; the cursor still advances past
        the failing row. Delivery is at-least-once overall — a crash before
        the cursor is persisted re-delivers rows on restart, so handlers
        must be idempotent.
        """
        if self._stopped:
            return 0
        try:
            batch = self._client.fetch_inbound(
                since_id=self._cursor,
                limit=self.limit,
                type=self.type,
            )
        except CursorLostError:
            logger.info("cursor %s lost (410 Gone) — resetting", self._cursor)
            self._cursor = None
            return 0
        dispatched = 0
        for msg in batch:
            try:
                self._dispatch(handler, msg)
            except Exception as exc:
                # Log + continue. We DO advance the cursor past this message
                # so a poison-pill input doesn't block the queue forever —
                # this matches how Slack Events delivers and how SQS DLQs
                # work. The handler is expected to surface its own errors
                # via logging or its own observability.
                logger.exception("handler raised on activity %s: %s", msg.activity_id, exc)
            self._cursor = msg.activity_id
            dispatched += 1
        if dispatched:
            self._save_cursor()
        return dispatched

    def _dispatch(self, handler: MessageHandler, msg: "InboundMessage") -> None:
        """Invoke handler, await if async, send the reply if returned."""
        result = handler(msg, self._client)
        if inspect.isawaitable(result):
            # Detect a running loop without using the deprecated
            # asyncio.get_event_loop() (Py3.12+). If a loop is running we
            # refuse — the caller is async and should await the handler
            # themselves; we can't synchronously block on an awaitable
            # without deadlocking the running loop.
            try:
                asyncio.get_running_loop()
            except RuntimeError:
                # No running loop — safe to spin up a fresh one. Mirrors
                # A2AServer's pattern: build, run, close. asyncio.run is
                # the modern equivalent of new_loop+run_until_complete+close
                # and handles the close even on exception.
                result = asyncio.run(result)  # type: ignore[arg-type]
            else:
                raise RuntimeError(
                    "PollDelivery.run_once was called from inside a running "
                    "event loop with an async handler. Use a sync handler "
                    "here, or schedule run_once on a worker thread via "
                    "asyncio.to_thread()."
                )
        reply_text = result if isinstance(result, str) else None
        if reply_text:
            try:
                self._client.reply(msg, reply_text)
            except Exception as exc:
                logger.warning("reply send failed for activity %s: %s", msg.activity_id, exc)

    def stop(self) -> None:
        self._stopped = True

# ---------------------------------------------------------------------------
# PushDelivery — wraps the existing A2AServer
# ---------------------------------------------------------------------------


class PushDelivery:
    """Adapt :class:`A2AServer` to the :class:`InboundDelivery` protocol.

    Use this when the agent CAN expose a reachable HTTP endpoint. The
    A2AServer runs in its own thread and dispatches to ``handler`` as
    HTTP requests arrive — ``run_once`` is a no-op (the loop driver in
    :py:meth:`RemoteAgentClient.run_agent_loop` simply sleeps and
    keeps the heartbeat alive).
    """

    def __init__(self, client: "RemoteAgentClient", server: Any) -> None:
        # ``server`` typed Any to avoid a circular import; it's an A2AServer.
        self._client = client
        self._server = server

    def run_once(self, handler: MessageHandler) -> int:  # noqa: ARG002 — handler unused
        # A2AServer dispatches synchronously on its own thread; nothing
        # for the outer loop to do per-tick.
        return 0

    def stop(self) -> None:
        try:
            self._server.stop()
        except Exception as exc:
            logger.warning("PushDelivery stop: A2AServer.stop raised: %s", exc)

__all__ = [
    "CursorLostError",
    "DEFAULT_POLL_INTERVAL",
    "InboundDelivery",
    "InboundMessage",
    "InboundSource",
    "MessageHandler",
    "PollDelivery",
    "PushDelivery",
    "_parse_activity_row",
]