External agents that can't expose a public HTTP endpoint (laptops behind NAT, ephemeral CI runners, hermes self-hosted, codex et al) had to reverse- engineer the activity-poll loop from molecule-mcp-claude-channel/server.ts because the SDK only shipped the push-mode `A2AServer` (Phase 30.8b). This adds the complementary path: - `RemoteAgentClient.fetch_inbound(since_id=…)` — one-shot GET against `/workspaces/:id/activity?type=a2a_receive&since_id=…`. Cursor-loss (410) surfaces as `CursorLostError`; caller resets and re-polls. - `RemoteAgentClient.reply(msg, text)` — smart-routes to `/notify` for canvas users, `/a2a` (JSON-RPC envelope + X-Source-Workspace-Id) for peer agents. Hides the reply-path bifurcation from connector authors. - `PollDelivery` / `PushDelivery` / `InboundDelivery` protocol — same `MessageHandler` callback works for both transports. - `RemoteAgentClient.run_agent_loop(handler, delivery=None)` — combined heartbeat + state-poll + inbound dispatch. Defaults to `PollDelivery`. Async handlers detected and `asyncio.run`'d (matches A2AServer pattern). Sleep cadence = min(heartbeat_interval, delivery.interval). - `python -m molecule_agent connect` CLI — one-line bootstrap. Loads a user's `module:function` via importlib, registers, runs the loop until pause/delete or SIGTERM. All flags also read from environment variables. Tests: 50 new (test_inbound.py, test_cli_connect.py) covering every prod branch — source normalization, cursor advancement, 410 reset, async/sync handler dispatch, handler exception → log+continue+advance, smart-reply routing for canvas vs peer vs unknown sources, run_agent_loop terminal states, sleep-interval selection, CLI handler resolution failures. Resolves #17.
1228 lines
50 KiB
Python
1228 lines
50 KiB
Python
"""RemoteAgentClient — blocking HTTP client for the Phase 30 remote-agent flow.
|
|
|
|
The client is deliberately dependency-light (``requests`` only) so a remote
|
|
agent author can drop it into any runtime. All methods correspond 1:1 to
|
|
a Phase 30 endpoint:
|
|
|
|
* :py:meth:`register` → ``POST /registry/register`` (30.1)
|
|
* :py:meth:`pull_secrets` → ``GET /workspaces/:id/secrets/values`` (30.2)
|
|
* :py:meth:`poll_state` → ``GET /workspaces/:id/state`` (30.4)
|
|
* :py:meth:`heartbeat` → ``POST /registry/heartbeat`` (30.1)
|
|
* :py:meth:`run_heartbeat_loop` — drives heartbeat + state-poll on a timer,
|
|
returns when the platform reports the workspace paused or deleted.
|
|
|
|
Two inbound-A2A options exist: push mode via
:class:`molecule_agent.a2a_server.A2AServer` (Phase 30.8b), which requires
hosting an HTTP endpoint the platform's proxy can reach, or poll mode via
:py:meth:`RemoteAgentClient.fetch_inbound` /
:py:meth:`RemoteAgentClient.run_agent_loop` (Phase 30.8c) for agents that
cannot expose a public endpoint.
|
"""
|
|
from __future__ import annotations
|
|
|
|
import fcntl
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import math
|
|
import os
|
|
import random
|
|
import subprocess
|
|
import tarfile
|
|
import time
|
|
import uuid
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING, Any
|
|
|
|
import requests
|
|
|
|
if TYPE_CHECKING:
|
|
from .inbound import InboundDelivery, InboundMessage, MessageHandler
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Polling cadence defaults. Chosen to align with the platform's 60-second
|
|
# Redis TTL — one heartbeat per minute keeps the TTL refreshed; state-poll
|
|
# at the same cadence is cheap (tiny GET) and gives ≤60s reaction time on
|
|
# pause / delete. Overridable via RemoteAgentClient constructor kwargs.
|
|
DEFAULT_HEARTBEAT_INTERVAL = 30.0 # seconds
|
|
DEFAULT_STATE_POLL_INTERVAL = 30.0 # seconds
|
|
|
|
# Phase 30.6 — sibling URL cache TTL. Cached URLs expire after this many
|
|
# seconds, forcing a re-discovery call. Short enough that a sibling that
|
|
# moved (restart with new port) is picked up quickly; long enough that
|
|
# we don't hit the discovery endpoint on every A2A call.
|
|
DEFAULT_URL_CACHE_TTL = 300.0 # 5 minutes
|
|
|
|
# Retry-on-429 defaults for idempotent GET calls.
|
|
# Matches the behaviour of the TypeScript MCP server's platformGet().
|
|
DEFAULT_GET_MAX_RETRIES = 3 # retry up to 3 times on 429
|
|
_RETRY_BASE_DELAY = 1.0 # seconds — first delay
|
|
_RETRY_MAX_DELAY = 30.0 # seconds — cap
|
|
_RETRY_JITTER_FRAC = 0.25 # ±25% jitter around base delay
|
|
|
|
# KI-002 — idempotency key granularity: round to the current minute so
|
|
# that concurrent restarts within the same 60-second window produce the
|
|
# same key, while distinct tasks or distinct minutes produce distinct keys.
|
|
_IDEMPOTENCY_ROUND_SECONDS = 60
|
|
|
|
|
|
def make_idempotency_key(task_text: str) -> str:
    """Derive a deterministic idempotency key for a delegation task.

    The key is ``SHA-256(f"{task_text}:{minute_floor}")`` where
    ``minute_floor`` is the current Unix time rounded down to the nearest
    minute (KI-002). Container restarts within the same minute that resend
    an identical task string therefore reproduce the key — the platform
    de-duplicates the delegation — while a different minute or a different
    task string yields a fresh key.

    Args:
        task_text: The task description string being delegated.

    Returns:
        A 64-character hex string (SHA-256 digest).
    """
    # Single time.time() read, floored to the minute bucket.
    now_s = int(time.time())
    bucket = now_s - (now_s % _IDEMPOTENCY_ROUND_SECONDS)
    return hashlib.sha256(f"{task_text}:{bucket}".encode("utf-8")).hexdigest()
|
|
|
|
|
|
def _safe_extract_tar(tf: tarfile.TarFile, dest: Path) -> None:
    """Extract ``tf`` into ``dest``, refusing path-escape entries.

    Defends against the classic "tar slip": members whose names contain
    ``..`` or absolute paths could otherwise write outside ``dest``. Each
    member's final location is resolved and checked to lie inside ``dest``
    before extraction, and members are extracted one at a time so a skip
    decision actually prevents the write (a bare ``extractall`` would not).

    Symlinks and hardlinks are dropped entirely — they could point outside
    the staged tree — and every skip emits a ``logger.warning`` so
    operators can audit what was removed.

    Raises:
        ValueError: when a member would resolve outside ``dest``.
    """
    root = dest.resolve()
    for entry in tf.getmembers():
        # Links can escape the tree regardless of name checks; skip + warn.
        if entry.issym() or entry.islnk():
            logger.warning(
                "skipping symlink in plugin tarball (not supported for security): %s -> %s",
                entry.name,
                entry.linkname,
            )
            continue
        resolved = (dest / entry.name).resolve()
        try:
            resolved.relative_to(root)
        except ValueError as exc:
            raise ValueError(
                f"refusing tar entry escaping dest: {entry.name!r} -> {resolved}"
            ) from exc
        tf.extract(entry, dest)
|
|
|
|
|
|
def _rmtree_quiet(path: Path) -> None:
    """Best-effort ``rm -rf path``.

    A missing path is silently ignored — the atomic-install rollback calls
    this on a staging dir that may never have been created. Any other
    failure is logged as a warning rather than raised.
    """
    import shutil

    try:
        shutil.rmtree(path)
    except FileNotFoundError:
        pass
    except Exception as exc:
        logger.warning("rmtree(%s) failed: %s", path, exc)
|
|
|
|
|
|
def _walk_files(root: Path) -> list[str]:
    """Return relative POSIX paths of all regular files under ``root``.

    Directories are excluded. Note this returns a fully materialised
    list, not a generator.
    """
    return [
        p.relative_to(root).as_posix()
        for p in root.rglob("*")
        if p.is_file()
    ]
|
|
|
|
|
|
def _sha256_file(path: Path) -> str:
    """Return the SHA-256 hex digest of ``path``, read in 64 KiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # Chunked read keeps memory flat for arbitrarily large files.
        while chunk := stream.read(65536):
            digest.update(chunk)
    return digest.hexdigest()
|
|
|
|
|
|
def verify_plugin_sha256(plugin_dir: Path, expected: str) -> bool:
    """Check ``plugin_dir``'s content against an expected SHA256.

    Computes a *content-addressed manifest hash*: the JSON dump of the
    sorted ``(relative_path, SHA256(file_bytes))`` pairs — excluding
    ``plugin.yaml`` itself — is hashed. Because the manifest is sorted,
    the result is independent of extraction order, timestamps, or
    directory ordering: two identical plugin trees always hash the same.

    ``plugin.yaml`` is excluded because it *contains* the sha256 field
    (hashing it would be circular); its content is still verified as part
    of plugin installation.

    Args:
        plugin_dir: Path to the unpacked plugin directory.
        expected: 64-character lowercase hex SHA256 of the content
            manifest, as stored under the ``sha256`` key of
            ``plugin.yaml`` (regenerate via ``molecule plugin hash`` or a
            CI step whenever content changes).

    Returns:
        ``True`` iff the computed manifest hash equals ``expected``.

    Raises:
        ValueError: if ``expected`` is not a valid 64-char hex string.
    """
    if not isinstance(expected, str) or len(expected) != 64 or not _is_hex(expected):
        raise ValueError(
            f"sha256 must be a 64-character lowercase hex string, got {expected!r}"
        )

    manifest = [
        (relpath, _sha256_file(plugin_dir / relpath))
        for relpath in sorted(_walk_files(plugin_dir))
        # plugin.yaml holds the sha256 field itself — excluded to avoid
        # the circular dependency while still covering all other content.
        if relpath != "plugin.yaml"
    ]
    manifest_bytes = json.dumps(manifest, sort_keys=True).encode()
    return hashlib.sha256(manifest_bytes).hexdigest() == expected
|
|
|
|
|
|
def _is_hex(value: str) -> bool:
    """Return True when ``value`` is a non-empty string of hex digits.

    Stricter than the previous ``int(value, 16)`` probe, which also
    accepted everything Python's int parser tolerates — ``0x`` prefixes,
    leading ``+``/``-`` signs, surrounding whitespace, and ``_`` digit
    separators — none of which are valid inside a SHA-256 hex digest.

    Args:
        value: Candidate string; non-``str`` inputs return False.

    Returns:
        True iff every character of a non-empty ``value`` is in
        ``[0-9a-fA-F]``.
    """
    if not isinstance(value, str) or not value:
        return False
    hex_digits = set("0123456789abcdefABCDEF")
    return all(ch in hex_digits for ch in value)
|
|
|
|
|
|
@dataclass
class WorkspaceState:
    """Point-in-time snapshot of a workspace's platform-side state."""

    # Workspace UUID this snapshot describes.
    workspace_id: str
    # Raw platform status string: "online" / "paused" / "degraded" /
    # "removed" / "offline" / ...
    status: str
    # True when the platform has paused the workspace.
    paused: bool
    # True when the workspace row has been hard-deleted.
    deleted: bool

    @property
    def should_stop(self) -> bool:
        """Whether the agent's run loop should exit.

        True once the platform pauses or hard-deletes the workspace —
        there is no point heartbeating against a dead row. A paused
        agent can be restarted later.
        """
        if self.paused:
            return True
        return self.deleted
|
|
|
|
|
|
@dataclass
class PeerInfo:
    """A sibling or parent workspace that this agent can communicate with."""
    # Workspace UUID of the peer (platform-side row id).
    id: str
    # Human-readable workspace name; empty string when the registry omits it.
    name: str
    # Peer's A2A endpoint URL. May be "" or a "remote://…" placeholder when
    # the peer exposes no reachable inbound endpoint — get_peers only seeds
    # the URL cache with http(s)-shaped values.
    url: str
    # Relationship of the peer to this workspace (registry-supplied string;
    # presumably "parent"/"sibling" — vocabulary is server-defined).
    role: str = ""
    # Hierarchy tier; defaults to 2 when the registry row omits or nulls it.
    tier: int = 2
    # Last known registry status, e.g. "online"; "unknown" when absent.
    status: str = "unknown"
    # Peer's A2A agent card (arbitrary JSON object); empty dict when absent.
    agent_card: dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
class RemoteAgentClient:
|
|
"""Blocking HTTP client for a Phase 30 remote agent.
|
|
|
|
Args:
|
|
workspace_id: UUID of the workspace this agent represents. The
|
|
workspace row must exist on the platform (created via
|
|
``POST /workspaces`` or ``POST /org/import``) — the agent
|
|
claims that identity when it registers.
|
|
platform_url: Base URL of the platform, e.g.
|
|
``https://molecule.example.com``. No trailing slash; the
|
|
client adds paths.
|
|
agent_card: A2A agent card payload. Minimal: ``{"name": "..."}``.
|
|
Full schema matches what an in-container agent would report
|
|
(skills list, capabilities, etc.).
|
|
reported_url: Optional externally-reachable URL at which siblings
|
|
can call this agent's A2A endpoint. If omitted, the agent is
|
|
reachable only via the platform's proxy (which won't be able
|
|
to initiate calls to the agent either — that's a limitation
|
|
of remote agents today, resolved by 30.6/30.7 or by exposing
|
|
an inbound endpoint yourself).
|
|
token_dir: Where to cache the workspace auth token on disk.
|
|
Defaults to ``~/.molecule/<workspace_id>/``. Created with
|
|
0700 permissions if missing.
|
|
heartbeat_interval: Seconds between heartbeats in the run loop.
|
|
state_poll_interval: Seconds between state polls in the run loop.
|
|
"""
|
|
|
|
def __init__(
    self,
    workspace_id: str,
    platform_url: str,
    agent_card: dict[str, Any] | None = None,
    reported_url: str = "",
    token_dir: Path | None = None,
    heartbeat_interval: float = DEFAULT_HEARTBEAT_INTERVAL,
    state_poll_interval: float = DEFAULT_STATE_POLL_INTERVAL,
    url_cache_ttl: float = DEFAULT_URL_CACHE_TTL,
    session: requests.Session | None = None,
) -> None:
    self.workspace_id = workspace_id
    # Strip any trailing slash once so endpoint builders can append paths.
    self.platform_url = platform_url.rstrip("/")
    # Minimal default card — the platform only requires a name.
    self.agent_card = agent_card or {"name": f"remote-agent-{workspace_id[:8]}"}
    self.reported_url = reported_url
    self.heartbeat_interval = heartbeat_interval
    self.state_poll_interval = state_poll_interval
    self.url_cache_ttl = url_cache_ttl
    # Phase 30.6 — in-memory sibling URL cache: id -> (url, expires_at in
    # Unix seconds). Deliberately not persisted: agent lifetimes are short
    # enough that re-discovery on restart is cheap.
    self._url_cache: dict[str, tuple[str, float]] = {}
    self._session = session or requests.Session()
    # Token cache location defaults to ~/.molecule/<workspace_id>/.
    self._token_dir = token_dir or (Path.home() / ".molecule" / workspace_id)
    self._token: str | None = None
    # Construction time — used to report uptime_seconds in heartbeats.
    self._start_time = time.time()
|
|
|
|
# ------------------------------------------------------------------
|
|
# Token persistence
|
|
# ------------------------------------------------------------------
|
|
|
|
@property
def token_file(self) -> Path:
    """Path of the on-disk cached auth token (``<token_dir>/.auth_token``)."""
    return self._token_dir / ".auth_token"
|
|
|
|
def load_token(self) -> str | None:
    """Return the workspace token, reading it from disk if needed.

    Fast path is the in-memory copy. Otherwise the file is read under a
    shared, non-blocking ``flock``: concurrent readers coexist, but if a
    writer currently holds the exclusive lock we return ``None``
    immediately instead of waiting. Any lock or read failure degrades to
    ``None`` (treated by callers as "no token"); a successful non-empty
    read populates the in-memory cache.
    """
    if self._token is not None:
        return self._token
    if not self.token_file.exists():
        return None

    try:
        with self.token_file.open("r") as fh:
            try:
                # Shared + non-blocking: fail fast if a writer holds EX.
                fcntl.flock(fh.fileno(), fcntl.LOCK_SH | fcntl.LOCK_NB)
            except OSError:
                # Contended — behave as if no token were cached.
                return None
            try:
                candidate = fh.read().strip()
            finally:
                fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
    except OSError as exc:
        logger.warning("failed to read %s: %s", self.token_file, exc)
        return None

    if not candidate:
        return None
    self._token = candidate
    return candidate
|
|
|
|
def save_token(self, token: str) -> None:
    """Persist a freshly-issued token to disk and cache it in memory.

    The parent directory is created with 0700 and the file chmod'd to
    0600 so the credential is unreadable to other users. Concurrent
    writers for the same workspace are serialised with an exclusive,
    non-blocking ``flock``: if another process holds the lock, the disk
    write is skipped (with a warning) but the in-memory token is still
    updated so this instance keeps working.

    Fix: the file is now opened with mode ``"a"`` (create-if-missing, no
    truncation) instead of ``"w"``. The old ``"w"`` open truncated the
    file *before* the lock was attempted, so losing the lock race wiped
    the existing on-disk token while claiming to have "skipped" the
    write.

    Args:
        token: Token string; surrounding whitespace is stripped.

    Raises:
        ValueError: if ``token`` is empty after stripping.
        OSError: if the file cannot be opened or created.
    """
    token = token.strip()
    if not token:
        raise ValueError("refusing to save empty token")
    self._token_dir.mkdir(parents=True, exist_ok=True)
    try:
        os.chmod(self._token_dir, 0o700)
    except OSError:
        pass  # non-fatal — best-effort on unusual filesystems
    # "a" creates the file if missing but does NOT truncate, so a failed
    # lock attempt leaves any existing token intact on disk.
    with self.token_file.open("a") as fh:
        try:
            fcntl.flock(fh.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except OSError:
            # Another process holds the lock; keep its on-disk copy and
            # only update this instance's in-memory token.
            logger.warning("token file %s is locked by another process; skipping write", self.token_file)
            self._token = token
            return
        try:
            # Lock held: now it is safe to replace the file content.
            fh.seek(0)
            fh.truncate()
            fh.write(token)
        finally:
            fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
    try:
        os.chmod(self.token_file, 0o600)
    except OSError:
        pass
    self._token = token
|
|
|
|
def _auth_headers(self) -> dict[str, str]:
    """Bearer-token header dict, or ``{}`` when no token is available."""
    token = self.load_token()
    return {"Authorization": f"Bearer {token}"} if token else {}
|
|
|
|
def _get_with_retry(
    self,
    url: str,
    headers: dict[str, str] | None = None,
    max_retries: int = DEFAULT_GET_MAX_RETRIES,
) -> requests.Response:
    """Issue a GET with automatic retry on 429 (Too Many Requests).

    Retries up to ``max_retries`` times. A numeric ``Retry-After`` header
    (delta-seconds, rounded up) is honoured when present; a non-numeric
    value (the HTTP-date form, or garbage) no longer raises ``ValueError``
    but falls back to the exponential schedule. Without the header, the
    delay is exponential back-off with ±25 % jitter scaled to the current
    base (1 s → 2 s → 4 s, capped at 30 s). The previous revision jittered
    by a fixed ±0.25 s regardless of the back-off step, contradicting the
    documented "±25 % jitter around base delay".

    After exhausting retries the final 429 response is returned — the
    caller's ``raise_for_status()`` turns it into an ``HTTPError``.

    Only use for idempotent GET calls. POST/DELETE callers must not
    retry, as the server may have already processed the request.
    """
    attempt = 0

    while True:
        resp = self._session.get(url, headers=headers or {}, timeout=10.0)
        if resp.status_code != 429 or attempt >= max_retries:
            return resp

        attempt += 1
        base = _RETRY_BASE_DELAY * (2 ** (attempt - 1))
        delay_s: float | None = None
        retry_after = resp.headers.get("Retry-After")
        if retry_after is not None:
            try:
                delay_s = math.ceil(float(retry_after))
            except ValueError:
                delay_s = None  # HTTP-date form — use back-off instead
        if delay_s is None:
            # ±25 % jitter proportional to the current back-off base.
            jitter = base * _RETRY_JITTER_FRAC * (random.random() * 2 - 1)
            delay_s = base + jitter
        delay_ms = min(int(delay_s * 1000), int(_RETRY_MAX_DELAY * 1000))
        time.sleep(delay_ms / 1000.0)
|
|
|
|
# ------------------------------------------------------------------
|
|
# Endpoints
|
|
# ------------------------------------------------------------------
|
|
|
|
def register(self) -> str:
    """Register with the platform and cache the issued auth token.

    Returns the token (also persisted to disk). When the platform has
    already issued a token for this workspace, registration still
    succeeds but the response carries no new ``auth_token`` — the client
    keeps using the on-disk copy, warning if none exists.

    Raises :class:`requests.HTTPError` on non-2xx responses.
    """
    # RegisterPayload requires a non-empty url. Agents without an inbound
    # A2A endpoint report the "remote://no-inbound" placeholder so the
    # platform can tell it apart from a real HTTP URL and never dial it.
    payload = {
        "id": self.workspace_id,
        "url": self.reported_url or "remote://no-inbound",
        "agent_card": self.agent_card,
    }
    resp = self._session.post(
        f"{self.platform_url}/registry/register",
        json=payload,
        timeout=10.0,
    )
    resp.raise_for_status()
    tok = resp.json().get("auth_token", "")
    if tok:
        self.save_token(tok)
        logger.info("registered and saved new token (prefix=%s…)", tok[:8])
    elif not self.load_token():
        # No new token AND nothing cached — every auth'd call will 401.
        logger.warning(
            "register returned no auth_token and no cached token exists — "
            "authenticated calls will 401 until a token is minted"
        )
    return self._token or ""
|
|
|
|
def pull_secrets(self) -> dict[str, str]:
    """Fetch the merged decrypted secrets via the 30.2 endpoint.

    Returns ``{}`` when no secrets are configured for this workspace.
    Raises on network errors and on 401 (missing/invalid token — call
    :py:meth:`register` first).
    """
    endpoint = f"{self.platform_url}/workspaces/{self.workspace_id}/secrets/values"
    resp = self._get_with_retry(endpoint, headers=self._auth_headers())
    resp.raise_for_status()
    body = resp.json()
    return body if body else {}
|
|
|
|
def poll_state(self) -> WorkspaceState | None:
    """Fetch the workspace's current state via the 30.4 endpoint.

    NOTE: despite the ``WorkspaceState | None`` annotation, the current
    implementation never returns ``None`` — a 404 (hard-deleted
    workspace) is mapped to a synthetic snapshot with
    ``status="removed"`` / ``deleted=True``, whose ``should_stop`` is
    True, so run loops still exit on it. The ``| None`` is retained for
    caller backward compatibility. Raises ``requests.HTTPError`` on any
    other non-2xx response.
    """
    resp = self._get_with_retry(
        f"{self.platform_url}/workspaces/{self.workspace_id}/state",
        headers=self._auth_headers(),
    )
    if resp.status_code == 404:
        # Platform signals hard-delete via 404 + deleted:true
        return WorkspaceState(
            workspace_id=self.workspace_id,
            status="removed",
            paused=False,
            deleted=True,
        )
    resp.raise_for_status()
    data = resp.json()
    # Missing fields default defensively: unknown status, not paused,
    # not deleted.
    return WorkspaceState(
        workspace_id=self.workspace_id,
        status=str(data.get("status", "unknown")),
        paused=bool(data.get("paused", False)),
        deleted=bool(data.get("deleted", False)),
    )
|
|
|
|
def heartbeat(
    self,
    current_task: str = "",
    active_tasks: int = 0,
    error_rate: float = 0.0,
    sample_error: str = "",
) -> None:
    """Send one heartbeat; safe to repeat (idempotent platform-side).

    Reports the caller-supplied task/error stats plus uptime since this
    client instance was constructed. Raises on non-2xx.
    """
    payload = {
        "workspace_id": self.workspace_id,
        "current_task": current_task,
        "active_tasks": active_tasks,
        "error_rate": error_rate,
        "sample_error": sample_error,
        "uptime_seconds": int(time.time() - self._start_time),
    }
    resp = self._session.post(
        f"{self.platform_url}/registry/heartbeat",
        headers=self._auth_headers(),
        json=payload,
        timeout=10.0,
    )
    resp.raise_for_status()
|
|
|
|
# ------------------------------------------------------------------
|
|
# Peer discovery + cache (Phase 30.6)
|
|
# ------------------------------------------------------------------
|
|
|
|
def get_peers(self) -> list[PeerInfo]:
    """Fetch the peer workspaces this agent can communicate with.

    Hits ``GET /registry/:id/peers`` with the bearer token; the result
    includes siblings (same parent) and, if applicable, the parent. Each
    peer with an http(s)-shaped URL is seeded into the local URL cache so
    a later :py:meth:`discover_peer` short-circuits without another
    platform round-trip.

    Raises on 401 (stale/missing token → call :py:meth:`register`) and
    other non-2xx.
    """
    resp = self._get_with_retry(
        f"{self.platform_url}/registry/{self.workspace_id}/peers",
        headers={**self._auth_headers(), "X-Workspace-ID": self.workspace_id},
    )
    resp.raise_for_status()
    rows = resp.json() or []
    now = time.time()
    result: list[PeerInfo] = []
    for entry in rows:
        peer_id = str(entry.get("id", ""))
        if not peer_id:
            continue  # malformed row — nothing to key on
        peer_url = str(entry.get("url", ""))
        result.append(
            PeerInfo(
                id=peer_id,
                name=str(entry.get("name", "")),
                url=peer_url,
                role=str(entry.get("role", "")),
                tier=int(entry.get("tier", 2) or 2),
                status=str(entry.get("status", "unknown")),
                agent_card=entry.get("agent_card") or {},
            )
        )
        # Cache only real HTTP(S) URLs — never "" or the
        # "remote://no-inbound" placeholder.
        if peer_url.startswith(("http://", "https://")):
            self._url_cache[peer_id] = (peer_url, now + self.url_cache_ttl)
    return result
|
|
|
|
def discover_peer(self, target_id: str) -> str | None:
    """Resolve a peer's URL, serving from the cache while it is fresh.

    Returns the URL string, or ``None`` when the platform has no usable
    (http/https) URL for the target. 401/403 responses propagate as
    ``HTTPError`` — re-authenticate or check the hierarchy rule.

    A cached entry is returned as long as its TTL has not expired;
    otherwise the platform's discover endpoint is queried and the cache
    refreshed. Use :py:meth:`invalidate_peer_url` after a failed direct
    call so the next discover re-fetches instead of replaying a dead URL.
    """
    entry = self._url_cache.get(target_id)
    if entry is not None:
        cached_url, expires_at = entry
        if time.time() < expires_at:
            return cached_url
        # TTL elapsed — drop and fall through to a fresh discover.
        self._url_cache.pop(target_id, None)

    resp = self._get_with_retry(
        f"{self.platform_url}/registry/discover/{target_id}",
        headers={**self._auth_headers(), "X-Workspace-ID": self.workspace_id},
    )
    if resp.status_code == 404:
        return None
    resp.raise_for_status()
    discovered = str((resp.json() or {}).get("url", ""))
    if not discovered.startswith(("http://", "https://")):
        return None  # placeholder / empty — not dialable
    self._url_cache[target_id] = (discovered, time.time() + self.url_cache_ttl)
    return discovered
|
|
|
|
def invalidate_peer_url(self, target_id: str) -> None:
    """Forget a peer's cached URL so the next call_peer re-discovers it.

    Call after a direct-call failure (connection error, 5xx).
    """
    self._url_cache.pop(target_id, None)
|
|
|
|
def call_peer(
    self,
    target_id: str,
    message: str,
    prefer_direct: bool = True,
) -> dict[str, Any]:
    """Send an A2A ``message/send`` to a peer.

    With ``prefer_direct=True`` (default) the target URL is resolved via
    :py:meth:`discover_peer` (cache-hot when the peer was seen before)
    and the JSON-RPC envelope is POSTed straight to it; any failure on
    that path invalidates the cached URL and falls back to the platform
    proxy, so a stale URL cannot brick inter-agent communication. With
    ``prefer_direct=False`` the call always routes through
    ``POST /workspaces/:id/a2a`` — useful when both agents sit behind
    NAT and only the platform can reach them.

    Returns:
        The full JSON-RPC response dict — callers inspect ``result`` vs
        ``error`` themselves; the envelope is not flattened.
    """
    envelope = {
        "jsonrpc": "2.0",
        "id": str(uuid.uuid4()),
        "method": "message/send",
        "params": {
            "message": {
                "role": "user",
                "messageId": str(uuid.uuid4()),
                "parts": [{"kind": "text", "text": message}],
            }
        },
    }
    headers = {
        **self._auth_headers(),
        "X-Workspace-ID": self.workspace_id,
        "Content-Type": "application/json",
    }

    if prefer_direct:
        direct_url = self.discover_peer(target_id)
        if direct_url:
            try:
                resp = self._session.post(
                    direct_url, json=envelope, headers=headers, timeout=30.0
                )
                resp.raise_for_status()
                return resp.json()
            except Exception as exc:
                # Stale URL, peer down, or bad payload — drop the cached
                # URL and let the proxy path below carry the message.
                logger.warning(
                    "direct A2A to %s (%s) failed: %s — invalidating cache, falling back to proxy",
                    target_id, direct_url, exc,
                )
                self.invalidate_peer_url(target_id)

    # Proxy path: reached on fallback or when prefer_direct=False.
    resp = self._session.post(
        f"{self.platform_url}/workspaces/{target_id}/a2a",
        json=envelope,
        headers=headers,
        timeout=30.0,
    )
    resp.raise_for_status()
    return resp.json()
|
|
|
|
# ------------------------------------------------------------------
|
|
# Inbound delivery (poll mode) — Phase 30.8c
|
|
# ------------------------------------------------------------------
|
|
|
|
def fetch_inbound(
    self,
    since_id: str | None = None,
    limit: int = 100,
    type: str = "a2a_receive",
) -> list["InboundMessage"]:
    """Fetch one batch of inbound A2A activity rows.

    Hits ``GET /workspaces/:id/activity?type=…&since_id=…&limit=…`` and
    returns the rows newer than ``since_id`` in oldest-first order,
    parsed into :class:`~molecule_agent.inbound.InboundMessage`. Used by
    :class:`~molecule_agent.inbound.PollDelivery`; most callers should
    drive this through :py:meth:`run_agent_loop` rather than polling
    manually.

    Args:
        since_id: Activity-id cursor — only newer rows are returned.
            Pass ``None`` for the initial fetch.
        limit: Max rows per batch (default 100; the server may cap it
            lower).
        type: Activity-row type filter. Default ``"a2a_receive"``; pass
            another type to consume different streams (e.g.
            ``"workspace_state_changed"``).

    Returns:
        List of :class:`InboundMessage`, oldest first. May be empty.

    Raises:
        :class:`~molecule_agent.inbound.CursorLostError`: on 410 Gone
            (cursor rotated out of the activity window) — including a
            410 returned by the 429-retry request, which the previous
            revision surfaced as a generic ``HTTPError``. Reset the
            cursor and re-poll.
        ``requests.HTTPError``: on other non-2xx responses (401, 5xx, …).
    """
    # Local import to avoid a circular dependency at module load — the
    # inbound module references RemoteAgentClient via TYPE_CHECKING.
    from .inbound import CursorLostError, _parse_activity_row

    params: dict[str, str] = {"type": type, "limit": str(int(limit))}
    if since_id:
        params["since_id"] = since_id
    url = f"{self.platform_url}/workspaces/{self.workspace_id}/activity"
    resp = self._session.get(
        url,
        headers=self._auth_headers(),
        params=params,
        timeout=15.0,
    )
    # 429: rebuild the URL with an encoded query string and route through
    # _get_with_retry (Retry-After + jittered back-off). Only 429 is
    # retried; everything else falls through.
    if resp.status_code == 429:
        from urllib.parse import urlencode
        resp = self._get_with_retry(
            url + "?" + urlencode(params),
            headers=self._auth_headers(),
        )
    # Cursor-loss check AFTER the retry path so a 410 from the retried
    # request also maps to CursorLostError instead of a bare HTTPError.
    if resp.status_code == 410:
        raise CursorLostError(
            f"cursor {since_id!r} no longer valid (410 Gone); reset and re-poll"
        )
    resp.raise_for_status()

    rows = resp.json() or []
    if not isinstance(rows, list):
        # Defensive: accept a {"items": […]} wrapper rather than silently
        # dropping data if the server ever changes shape.
        rows = rows.get("items", []) if isinstance(rows, dict) else []

    out: list["InboundMessage"] = []
    for row in rows:
        if not isinstance(row, dict):
            continue
        msg = _parse_activity_row(row)
        if msg is not None:
            out.append(msg)
    return out
|
|
|
|
def reply(self, message: "InboundMessage", text: str) -> None:
    """Reply to an inbound message, routing by ``message.source``.

    * ``canvas_user`` → ``POST /workspaces/:id/notify`` with
      ``{"message": text}``; the canvas surfaces the text to the user.
    * ``peer_agent`` → ``POST /workspaces/:peer_id/a2a`` carrying a
      JSON-RPC ``message/send`` envelope plus the
      ``X-Source-Workspace-Id`` header.
    * anything else → ``ValueError``. The SDK refuses to guess a
      transport; inspect ``message.raw`` and use :py:meth:`call_peer`
      or a direct HTTP call as appropriate.

    Args:
        message: The :class:`InboundMessage` being answered — selects
            the transport.
        text: Reply body. Blank strings raise ``ValueError`` to prevent
            accidental silent acks.

    Raises:
        ``ValueError``: on empty text, unknown source, or a peer message
            lacking a ``source_id`` to route back to.
        ``requests.HTTPError``: on non-2xx server response.
    """
    if not text or not text.strip():
        raise ValueError("reply text must be non-empty")

    if message.source == "canvas_user":
        notify_url = f"{self.platform_url}/workspaces/{self.workspace_id}/notify"
        resp = self._session.post(
            notify_url,
            headers={**self._auth_headers(), "Content-Type": "application/json"},
            json={"message": text},
            timeout=15.0,
        )
        resp.raise_for_status()
        return

    if message.source == "peer_agent":
        target = message.source_id or ""
        if not target:
            raise ValueError(
                "peer_agent inbound message has no source_id — cannot route reply"
            )
        envelope = {
            "jsonrpc": "2.0",
            "id": str(uuid.uuid4()),
            "method": "message/send",
            "params": {
                "message": {
                    "role": "agent",
                    "messageId": str(uuid.uuid4()),
                    "parts": [{"kind": "text", "text": text}],
                }
            },
        }
        resp = self._session.post(
            f"{self.platform_url}/workspaces/{target}/a2a",
            headers={
                **self._auth_headers(),
                "X-Source-Workspace-Id": self.workspace_id,
                "X-Workspace-ID": self.workspace_id,
                "Content-Type": "application/json",
            },
            json=envelope,
            timeout=15.0,
        )
        resp.raise_for_status()
        return

    raise ValueError(
        f"cannot auto-route reply for source={message.source!r}; "
        "inspect message.raw and call /notify or /a2a directly"
    )
|
|
|
|
def run_agent_loop(
|
|
self,
|
|
handler: "MessageHandler",
|
|
delivery: "InboundDelivery | None" = None,
|
|
max_iterations: int | None = None,
|
|
task_supplier: "callable | None" = None,
|
|
) -> str:
|
|
"""Combined heartbeat + state-poll + inbound-delivery loop.
|
|
|
|
Generalization of :py:meth:`run_heartbeat_loop` that also drains
|
|
inbound messages on every tick. This is the recommended entry
|
|
point for an external agent author — registers, heartbeats,
|
|
state-polls, and dispatches inbound, all in one sync call.
|
|
|
|
Args:
|
|
handler: ``Callable[[InboundMessage, RemoteAgentClient],
|
|
str | None | Awaitable[str | None]]``. Invoked once per
|
|
inbound message. Returning a non-empty string sends an
|
|
automatic reply via :py:meth:`reply`. ``None`` skips the
|
|
reply (useful for fire-and-forget consumers).
|
|
delivery: An :class:`InboundDelivery` implementation. Defaults
|
|
to :class:`PollDelivery` (the right choice when the agent
|
|
can't expose an inbound URL — i.e. ``reported_url`` is
|
|
empty or starts with ``remote://``). Pass an explicit
|
|
:class:`PushDelivery` (constructed around an
|
|
:class:`A2AServer`) for push-mode agents.
|
|
max_iterations: Stop after N iterations. ``None`` = run until
|
|
the platform reports the workspace paused or deleted.
|
|
task_supplier: Optional zero-arg callable returning a dict
|
|
``{"current_task": str, "active_tasks": int}`` reported on
|
|
each heartbeat (same contract as :py:meth:`run_heartbeat_loop`).
|
|
|
|
Returns:
|
|
The terminal status: ``"paused"``, ``"removed"``, or
|
|
``"max_iterations"``.
|
|
|
|
Errors from the activity poll, heartbeat, or state poll are
|
|
logged and the loop continues — a transient platform hiccup
|
|
should not take a remote agent offline. Handler exceptions are
|
|
caught at the delivery layer (see :class:`PollDelivery`).
|
|
"""
|
|
from .inbound import PollDelivery
|
|
|
|
if delivery is None:
|
|
delivery = PollDelivery(self)
|
|
|
|
i = 0
|
|
try:
|
|
while True:
|
|
if max_iterations is not None and i >= max_iterations:
|
|
return "max_iterations"
|
|
i += 1
|
|
|
|
report: dict[str, Any] = {}
|
|
if task_supplier is not None:
|
|
try:
|
|
report = task_supplier() or {}
|
|
except Exception as exc:
|
|
logger.warning("task_supplier raised: %s", exc)
|
|
|
|
try:
|
|
self.heartbeat(
|
|
current_task=str(report.get("current_task", "")),
|
|
active_tasks=int(report.get("active_tasks", 0)),
|
|
)
|
|
except Exception as exc:
|
|
logger.warning("heartbeat failed: %s — continuing", exc)
|
|
|
|
try:
|
|
delivery.run_once(handler)
|
|
except Exception as exc:
|
|
logger.warning("inbound delivery.run_once raised: %s — continuing", exc)
|
|
|
|
try:
|
|
state = self.poll_state()
|
|
except Exception as exc:
|
|
logger.warning("state poll failed: %s — continuing", exc)
|
|
state = None
|
|
|
|
if state is not None and state.should_stop:
|
|
logger.info(
|
|
"platform reports workspace %s (paused=%s deleted=%s) — exiting",
|
|
state.status, state.paused, state.deleted,
|
|
)
|
|
return state.status
|
|
|
|
# Sleep cadence: take the smaller of heartbeat_interval and
|
|
# the delivery's poll interval (when present) so inbound
|
|
# latency is bounded by the delivery's setting, not by the
|
|
# heartbeat cadence.
|
|
interval = self.heartbeat_interval
|
|
poll_interval = getattr(delivery, "interval", None)
|
|
if isinstance(poll_interval, (int, float)) and poll_interval > 0:
|
|
interval = min(interval, float(poll_interval))
|
|
time.sleep(interval)
|
|
finally:
|
|
try:
|
|
delivery.stop()
|
|
except Exception as exc:
|
|
logger.warning("delivery.stop raised on loop exit: %s", exc)
|
|
|
|
# ------------------------------------------------------------------
|
|
# Delegation — KI-002 idempotency guard
|
|
# ------------------------------------------------------------------
|
|
|
|
def delegate(
|
|
self,
|
|
task: str,
|
|
target_id: str,
|
|
idempotency_key: str | None = None,
|
|
timeout: float = 300.0,
|
|
) -> dict[str, Any]:
|
|
"""Delegate a task to a peer workspace via the platform proxy.
|
|
|
|
KI-002: To prevent duplicate execution when a container restarts mid-
|
|
delegation, an idempotency key is computed from ``task + current
|
|
minute`` and sent as ``idempotency_key`` in the request body. The
|
|
platform deduplicates requests sharing the same key within the
|
|
minute window. Pass an explicit ``idempotency_key`` to override the
|
|
auto-computed value (useful for callers that manage their own key
|
|
scheme).
|
|
|
|
Args:
|
|
task: Human-readable task description sent to the target.
|
|
target_id: Workspace ID of the peer to delegate to.
|
|
idempotency_key: Optional override for the idempotency key. If
|
|
omitted, one is auto-generated from the task text + current
|
|
wall-clock minute.
|
|
timeout: Request timeout in seconds. Default 300 s.
|
|
|
|
Returns:
|
|
The platform's JSON response dict.
|
|
|
|
Raises:
|
|
``requests.HTTPError`` on non-2xx responses.
|
|
"""
|
|
key = idempotency_key if idempotency_key else make_idempotency_key(task)
|
|
resp = self._session.post(
|
|
f"{self.platform_url}/workspaces/{target_id}/delegate",
|
|
headers={
|
|
**self._auth_headers(),
|
|
"X-Workspace-ID": self.workspace_id,
|
|
"Content-Type": "application/json",
|
|
},
|
|
json={
|
|
"task": task,
|
|
"idempotency_key": key,
|
|
},
|
|
timeout=timeout,
|
|
)
|
|
resp.raise_for_status()
|
|
return resp.json()
|
|
|
|
# ------------------------------------------------------------------
|
|
# Plugin install (Phase 30.3)
|
|
# ------------------------------------------------------------------
|
|
|
|
@property
|
|
def plugins_dir(self) -> Path:
|
|
"""Where pulled plugins are unpacked. Lives under the same
|
|
per-workspace directory as the auth token (``~/.molecule/<id>/``)
|
|
so a single rm cleans the agent's local state."""
|
|
return self._token_dir / "plugins"
|
|
|
|
    def install_plugin(
        self,
        name: str,
        source: str | None = None,
        run_setup_sh: bool = True,
        report_to_platform: bool = True,
    ) -> Path:
        """Pull a plugin tarball from the platform, unpack it, optionally
        run its ``setup.sh``, and report success.

        Phase 30.3 contract:

        1. Stream ``GET /workspaces/:id/plugins/:name/download[?source=…]``
        2. Atomically extract into ``~/.molecule/<workspace>/plugins/<name>/``
           via a sibling-tempdir + rename so a partial extract never
           leaves the directory in a half-installed state.
        3. If ``setup.sh`` exists in the unpacked tree, run it (bash) so
           pip/npm deps land on the agent's machine. Failures from
           ``setup.sh`` are logged but don't prevent the install record
           — the agent author can re-run setup manually.
        4. POST ``/workspaces/:id/plugins`` with the source string so the
           platform's ``workspace_plugins`` table reflects the install.
        3b. If ``plugin.yaml`` declares a ``sha256`` field, verify the unpacked
            content matches before running ``setup.sh`` (blocks supply-chain
            tampering after the platform-level pinned-ref check from PR #1019).

        Returns the path to the unpacked plugin directory.
        Raises ``requests.HTTPError`` on download failure (401 / 404 / etc.).
        Raises ``ValueError`` if a declared ``sha256`` does not match.
        """
        # Final destination plus a uniquely-suffixed sibling staging dir;
        # the random suffix lets concurrent installs of the same plugin
        # stage independently before the rename.
        target = self.plugins_dir / name
        staging = self.plugins_dir / f".staging-{name}-{uuid.uuid4().hex[:8]}"
        self.plugins_dir.mkdir(parents=True, exist_ok=True)

        url = f"{self.platform_url}/workspaces/{self.workspace_id}/plugins/{name}/download"
        params: dict[str, str] = {}
        if source:
            params["source"] = source

        # We pull the whole tarball into memory before extracting. The
        # platform side (PLUGIN_INSTALL_MAX_DIR_BYTES) caps plugin size
        # at 100 MiB by default, which is comfortable to hold in process
        # memory and lets us use tarfile's seekable r:gz mode (the
        # streaming r|gz mode is sequential-only and breaks on tarballs
        # whose entries weren't sorted by name, which we don't enforce).
        # If the cap is ever raised above ~500 MiB, switch to a temp
        # file: tarfile.open(fileobj=open(temp, "rb"), mode="r:gz").
        with self._session.get(
            url, headers=self._auth_headers(), params=params,
            timeout=60.0,
        ) as resp:
            resp.raise_for_status()
            staging.mkdir(parents=True)
            try:
                import io as _io
                # _safe_extract_tar guards the extraction (helper defined
                # elsewhere in this module).
                with tarfile.open(fileobj=_io.BytesIO(resp.content), mode="r:gz") as tf:
                    _safe_extract_tar(tf, staging)
            except Exception:
                # Roll back the partial extract
                _rmtree_quiet(staging)
                raise

        # Atomic swap: remove old dir if present, rename staging into place.
        if target.exists():
            _rmtree_quiet(target)
        staging.rename(target)
        logger.info("plugin %s unpacked to %s", name, target)

        # 3b. Content integrity — verify sha256 declared in plugin.yaml if present.
        # This closes the SDK-side gap identified in SDK-Dev review of PR #1019:
        # the platform enforces source-pinned refs (SHA/tag) but cannot detect
        # content tampering after the tarball is served. The SDK verifies the
        # delivered content against a manifest hash in plugin.yaml.
        manifest_path = target / "plugin.yaml"
        if manifest_path.exists():
            try:
                import yaml as _yaml  # lazy — not a top-level dep
                manifest = _yaml.safe_load(manifest_path.read_text())
                expected_sha: str | None = manifest.get("sha256") if isinstance(manifest, dict) else None
                if expected_sha:
                    if not verify_plugin_sha256(target, expected_sha):
                        # Mismatch: wipe the unpacked tree so nothing can
                        # execute it, then fail loudly.
                        _rmtree_quiet(target)
                        raise ValueError(
                            f"plugin {name} sha256 mismatch — expected {expected_sha}. "
                            f"Plugin directory may have been tampered with after download. "
                            f"setup.sh was NOT run."
                        )
                    logger.info("plugin %s sha256 verified", name)
            except ValueError:
                raise  # re-raise our own ValueError (sha256 mismatch)
            except Exception as exc:
                # YAML read / parse failures are non-fatal — skip verification
                # and fall through to setup.sh. We log the error so operators
                # can see why verification was skipped.
                logger.warning("plugin %s sha256 verification skipped: %s", name, exc)

        # 3. setup.sh — best-effort. We never raise on its failure because
        # the plugin files are now correctly installed; setup is just for
        # heavy deps that the agent author can rerun manually.
        if run_setup_sh:
            setup = target / "setup.sh"
            if setup.is_file():
                try:
                    proc = subprocess.run(
                        ["bash", str(setup)],
                        cwd=str(target),
                        capture_output=True, text=True, timeout=120,
                    )
                    if proc.returncode == 0:
                        logger.info("plugin %s setup.sh ok", name)
                    else:
                        logger.warning(
                            "plugin %s setup.sh exit=%d stderr=%s",
                            name, proc.returncode, proc.stderr[:200],
                        )
                except subprocess.TimeoutExpired:
                    logger.warning("plugin %s setup.sh timed out (120s)", name)
                except FileNotFoundError:
                    logger.warning("plugin %s setup.sh present but bash not found", name)

        # 4. Report to platform — write a workspace_plugins row so List
        # reflects the install. Best-effort; the local files are correct
        # regardless of whether this POST succeeds.
        if report_to_platform:
            try:
                report_source = source or f"local://{name}"
                self._session.post(
                    f"{self.platform_url}/workspaces/{self.workspace_id}/plugins",
                    headers=self._auth_headers(),
                    json={"source": report_source},
                    timeout=10.0,
                )
            except Exception as exc:
                logger.warning("plugin %s install record POST failed: %s", name, exc)

        return target
|
|
|
|
# ------------------------------------------------------------------
|
|
# Run loop
|
|
# ------------------------------------------------------------------
|
|
|
|
def run_heartbeat_loop(
|
|
self,
|
|
max_iterations: int | None = None,
|
|
task_supplier: "callable | None" = None,
|
|
) -> str:
|
|
"""Drive heartbeat + state-poll on a timer. Returns the terminal
|
|
status when the loop exits (``"paused"``, ``"removed"``, or
|
|
``"max_iterations"``).
|
|
|
|
Args:
|
|
max_iterations: Stop after N loop iterations. None = run until
|
|
the workspace is paused / deleted. Useful for tests and
|
|
smoke scripts.
|
|
task_supplier: Optional zero-arg callable returning a dict
|
|
``{"current_task": str, "active_tasks": int}`` fetched
|
|
each iteration. Lets the agent report what it's doing.
|
|
|
|
The loop sends one heartbeat + one state poll per iteration; the
|
|
next iteration sleeps for ``heartbeat_interval`` seconds. Errors
|
|
from either call are logged and the loop continues — we deliberately
|
|
do NOT re-raise because a transient platform hiccup shouldn't take
|
|
a remote agent offline.
|
|
"""
|
|
i = 0
|
|
while True:
|
|
if max_iterations is not None and i >= max_iterations:
|
|
return "max_iterations"
|
|
i += 1
|
|
|
|
report: dict[str, Any] = {}
|
|
if task_supplier is not None:
|
|
try:
|
|
report = task_supplier() or {}
|
|
except Exception as exc:
|
|
logger.warning("task_supplier raised: %s", exc)
|
|
|
|
try:
|
|
self.heartbeat(
|
|
current_task=str(report.get("current_task", "")),
|
|
active_tasks=int(report.get("active_tasks", 0)),
|
|
)
|
|
except Exception as exc:
|
|
logger.warning("heartbeat failed: %s — continuing", exc)
|
|
|
|
try:
|
|
state = self.poll_state()
|
|
except Exception as exc:
|
|
logger.warning("state poll failed: %s — continuing", exc)
|
|
state = None
|
|
|
|
if state is not None and state.should_stop:
|
|
logger.info(
|
|
"platform reports workspace %s (paused=%s deleted=%s) — exiting",
|
|
state.status, state.paused, state.deleted,
|
|
)
|
|
return state.status
|
|
|
|
time.sleep(self.heartbeat_interval)
|
|
|
|
|
|
# Public names exported by ``import *`` — the client class, its state/peer
# value types, the tuning-interval constants, and the two standalone helpers.
__all__ = [
    "RemoteAgentClient",
    "WorkspaceState",
    "PeerInfo",
    "DEFAULT_HEARTBEAT_INTERVAL",
    "DEFAULT_STATE_POLL_INTERVAL",
    "DEFAULT_URL_CACHE_TTL",
    "verify_plugin_sha256",
    "make_idempotency_key",
]
|
|
|