Closes #220. #215 added auth_headers() to /registry/register but missed two other self-post paths from the same workspace container: 1. initial_prompt (_do_send_sync) — fires once on first boot after the A2A server is ready. Posts to /workspaces/:id/a2a via the platform proxy. Missing headers meant the initial prompt got silently dropped as 401 on any token-enrolled workspace. 2. idle loop (_post_sync) — fires every idle_interval_seconds while the workspace has no active task (#205 pattern). Same proxy path, same missing headers, same silent 401 in multi-tenant mode. Both now build headers as {"Content-Type": "application/json", **auth_headers()} auth_headers() returns {"Authorization": "Bearer <token>"} when /auth-token.txt exists, empty dict otherwise (first boot before register issues the token). The existing lazy-bootstrap fail-open on the platform side covers the empty-dict case. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
503 lines
22 KiB
Python
503 lines
22 KiB
Python
"""Workspace runtime entry point.
|
|
|
|
Loads config -> discovers adapter -> setup -> create executor -> wrap in A2A -> register -> heartbeat.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import socket
|
|
|
|
import httpx
|
|
import uvicorn
|
|
from a2a.server.apps import A2AStarletteApplication
|
|
from a2a.server.request_handlers import DefaultRequestHandler
|
|
from a2a.server.tasks import InMemoryTaskStore
|
|
from a2a.types import AgentCard, AgentCapabilities, AgentSkill
|
|
|
|
from adapters import get_adapter, AdapterConfig
|
|
from config import load_config
|
|
from heartbeat import HeartbeatLoop
|
|
from preflight import run_preflight, render_preflight_report
|
|
from builtin_tools.awareness_client import get_awareness_config
|
|
import uuid as _uuid
|
|
|
|
from builtin_tools.telemetry import setup_telemetry, make_trace_middleware
|
|
from policies.namespaces import resolve_awareness_namespace
|
|
|
|
|
|
from initial_prompt import (
|
|
mark_initial_prompt_attempted,
|
|
resolve_initial_prompt_marker,
|
|
)
|
|
from platform_auth import auth_headers
|
|
|
|
|
|
def get_machine_ip() -> str: # pragma: no cover
|
|
"""Get the machine's IP for A2A discovery."""
|
|
try:
|
|
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
|
s.connect(("8.8.8.8", 80))
|
|
ip = s.getsockname()[0]
|
|
s.close()
|
|
return ip
|
|
except Exception:
|
|
return "127.0.0.1"
|
|
|
|
|
|
async def main(): # pragma: no cover
|
|
workspace_id = os.environ.get("WORKSPACE_ID", "workspace-default")
|
|
config_path = os.environ.get("WORKSPACE_CONFIG_PATH", "/configs")
|
|
platform_url = os.environ.get("PLATFORM_URL", "http://platform:8080")
|
|
awareness_config = get_awareness_config()
|
|
|
|
# 0. Initialise OpenTelemetry (no-op if packages not installed)
|
|
setup_telemetry(service_name=workspace_id)
|
|
|
|
# 1. Load config
|
|
config = load_config(config_path)
|
|
port = config.a2a.port
|
|
preflight = run_preflight(config, config_path)
|
|
render_preflight_report(preflight)
|
|
if not preflight.ok:
|
|
raise SystemExit(1)
|
|
if awareness_config:
|
|
awareness_namespace = resolve_awareness_namespace(
|
|
workspace_id,
|
|
awareness_config.get("namespace", ""),
|
|
)
|
|
print(f"Awareness enabled for namespace: {awareness_namespace}")
|
|
|
|
# 1.5 Initialise governance adapter (no-op if disabled or package absent)
|
|
from builtin_tools.governance import initialize_governance
|
|
if config.governance.enabled:
|
|
await initialize_governance(config.governance)
|
|
print(f"Governance: Microsoft Agent Governance Toolkit enabled (mode={config.governance.policy_mode})")
|
|
else:
|
|
print("Governance: disabled (set governance.enabled: true in config.yaml to activate)")
|
|
|
|
# 2. Create heartbeat (passed to adapter for task tracking)
|
|
heartbeat = HeartbeatLoop(platform_url, workspace_id)
|
|
|
|
# 3. Get adapter for this runtime
|
|
runtime = config.runtime or "langgraph"
|
|
adapter_cls = get_adapter(runtime) # Raises KeyError if unknown — no silent fallback
|
|
|
|
adapter = adapter_cls()
|
|
print(f"Runtime: {runtime} ({adapter.display_name()})")
|
|
|
|
# 4. Build adapter config
|
|
adapter_config = AdapterConfig(
|
|
model=config.model,
|
|
system_prompt=None, # Adapter builds its own prompt
|
|
tools=config.skills, # Skill names from config.yaml
|
|
runtime_config=vars(config.runtime_config) if config.runtime_config else {},
|
|
config_path=config_path,
|
|
workspace_id=workspace_id,
|
|
prompt_files=config.prompt_files,
|
|
a2a_port=port,
|
|
heartbeat=heartbeat,
|
|
)
|
|
|
|
# 5. Setup adapter and create executor
|
|
# If setup fails, ensure heartbeat is stopped to prevent resource leak
|
|
try:
|
|
await adapter.setup(adapter_config)
|
|
executor = await adapter.create_executor(adapter_config)
|
|
except Exception:
|
|
# heartbeat hasn't started yet but may have async tasks pending
|
|
if hasattr(heartbeat, "stop"):
|
|
try:
|
|
await heartbeat.stop()
|
|
except Exception:
|
|
pass
|
|
raise
|
|
|
|
# 5.5. Initialise Temporal durable execution wrapper (optional)
|
|
# Connects to TEMPORAL_HOST (default: localhost:7233) and starts a
|
|
# co-located Temporal worker as a background asyncio task.
|
|
# No-op with a warning log if Temporal is unreachable or temporalio
|
|
# is not installed — all tasks fall back to direct execution transparently.
|
|
from builtin_tools.temporal_workflow import create_wrapper as _create_temporal_wrapper
|
|
temporal_wrapper = _create_temporal_wrapper()
|
|
await temporal_wrapper.start()
|
|
|
|
# Get loaded skills for agent card (adapter may have populated them)
|
|
loaded_skills = getattr(adapter, "loaded_skills", [])
|
|
|
|
# 6. Build Agent Card
|
|
machine_ip = os.environ.get("HOSTNAME", get_machine_ip())
|
|
workspace_url = f"http://{machine_ip}:{port}"
|
|
|
|
agent_card = AgentCard(
|
|
name=config.name,
|
|
description=config.description or config.name,
|
|
version=config.version,
|
|
url=workspace_url,
|
|
capabilities=AgentCapabilities(
|
|
streaming=config.a2a.streaming,
|
|
pushNotifications=config.a2a.push_notifications,
|
|
stateTransitionHistory=True,
|
|
),
|
|
skills=[
|
|
AgentSkill(
|
|
id=skill.metadata.id,
|
|
name=skill.metadata.name,
|
|
description=skill.metadata.description,
|
|
tags=skill.metadata.tags,
|
|
examples=skill.metadata.examples,
|
|
)
|
|
for skill in loaded_skills
|
|
],
|
|
defaultInputModes=["text/plain", "application/json"],
|
|
defaultOutputModes=["text/plain", "application/json"],
|
|
)
|
|
|
|
# 7. Wrap in A2A.
|
|
#
|
|
# Regression fix (#204): PR #198 tried to wire push_config_store +
|
|
# push_sender to satisfy #175 (push notification capability), but
|
|
# PushNotificationSender is an abstract base class in the a2a-sdk and
|
|
# can't be instantiated directly. Passing it crashed main.py on startup
|
|
# with `TypeError: Can't instantiate abstract class`. Dropped back to
|
|
# DefaultRequestHandler's own defaults — pushNotifications capability
|
|
# in the AgentCard below is still advertised via AgentCapabilities so
|
|
# clients know we COULD do pushes; actually implementing them requires
|
|
# a concrete sender subclass, tracked as a Phase-H follow-up to #175.
|
|
handler = DefaultRequestHandler(
|
|
agent_executor=executor,
|
|
task_store=InMemoryTaskStore(),
|
|
)
|
|
|
|
app = A2AStarletteApplication(
|
|
agent_card=agent_card,
|
|
http_handler=handler,
|
|
)
|
|
|
|
# 8. Register with platform
|
|
agent_card_dict = {
|
|
"name": config.name,
|
|
"description": config.description,
|
|
"version": config.version,
|
|
"url": workspace_url,
|
|
"skills": [
|
|
{
|
|
"id": s.metadata.id,
|
|
"name": s.metadata.name,
|
|
"description": s.metadata.description,
|
|
"tags": s.metadata.tags,
|
|
}
|
|
for s in loaded_skills
|
|
],
|
|
"capabilities": {
|
|
"streaming": config.a2a.streaming,
|
|
"pushNotifications": config.a2a.push_notifications,
|
|
},
|
|
}
|
|
|
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
try:
|
|
resp = await client.post(
|
|
f"{platform_url}/registry/register",
|
|
json={
|
|
"id": workspace_id,
|
|
"url": workspace_url,
|
|
"agent_card": agent_card_dict,
|
|
},
|
|
headers=auth_headers(),
|
|
)
|
|
print(f"Registered with platform: {resp.status_code}")
|
|
# Phase 30.1 — capture the auth token issued at first register.
|
|
# The platform only mints one on first register per workspace,
|
|
# so a subsequent restart gets an empty auth_token and we
|
|
# keep using the on-disk copy from the original issuance.
|
|
if resp.status_code == 200:
|
|
try:
|
|
body = resp.json()
|
|
tok = body.get("auth_token")
|
|
if tok:
|
|
from platform_auth import save_token
|
|
save_token(tok)
|
|
print(f"Saved workspace auth token (prefix={tok[:8]}…)")
|
|
except Exception as parse_exc:
|
|
print(f"Warning: couldn't parse register response for token: {parse_exc}")
|
|
except Exception as e:
|
|
print(f"Warning: failed to register with platform: {e}")
|
|
|
|
# 9. Start heartbeat
|
|
heartbeat.start()
|
|
|
|
# 9b. Start skills hot-reload watcher (background task)
|
|
# When a skill file changes the watcher reloads the skill module and calls
|
|
# back into the adapter so the next A2A request uses the updated tools.
|
|
if config.skills:
|
|
try:
|
|
from skill_loader.watcher import SkillsWatcher
|
|
|
|
def _on_skill_reload(updated_skill):
|
|
"""Rebuild the LangGraph agent when a skill changes in-place."""
|
|
if not hasattr(adapter, "loaded_skills"):
|
|
return
|
|
# Replace the matching skill in the adapter's skill list
|
|
adapter.loaded_skills = [
|
|
updated_skill if s.metadata.id == updated_skill.metadata.id else s
|
|
for s in adapter.loaded_skills
|
|
]
|
|
# Rebuild the agent's tool list from updated skills
|
|
if hasattr(adapter, "all_tools") and hasattr(adapter, "system_prompt"):
|
|
from builtin_tools.approval import request_approval
|
|
from builtin_tools.delegation import delegate_to_workspace
|
|
from builtin_tools.memory import commit_memory, search_memory
|
|
from builtin_tools.sandbox import run_code
|
|
base_tools = [delegate_to_workspace, request_approval,
|
|
commit_memory, search_memory, run_code]
|
|
skill_tools = []
|
|
for sk in adapter.loaded_skills:
|
|
skill_tools.extend(sk.tools)
|
|
adapter.all_tools = base_tools + skill_tools
|
|
# Rebuild compiled agent so next ainvoke picks up new tools
|
|
try:
|
|
from agent import create_agent
|
|
new_agent = create_agent(
|
|
config.model, adapter.all_tools, adapter.system_prompt
|
|
)
|
|
executor.agent = new_agent
|
|
print(f"Skills hot-reload: '{updated_skill.metadata.id}' reloaded — "
|
|
f"{len(updated_skill.tools)} tool(s)")
|
|
except Exception as rebuild_err:
|
|
print(f"Skills hot-reload: agent rebuild failed: {rebuild_err}")
|
|
|
|
skills_watcher = SkillsWatcher(
|
|
config_path=config_path,
|
|
skill_names=config.skills,
|
|
on_reload=_on_skill_reload,
|
|
)
|
|
asyncio.create_task(skills_watcher.start())
|
|
print(f"Skills hot-reload enabled for: {config.skills}")
|
|
except Exception as e:
|
|
print(f"Warning: skills watcher could not start: {e}")
|
|
|
|
# 10. Run A2A server
|
|
print(f"Workspace {workspace_id} starting on port {port}")
|
|
# Wrap the ASGI app with W3C TraceContext extraction middleware so incoming
|
|
# A2A HTTP requests propagate their trace context into _incoming_trace_context.
|
|
built_app = make_trace_middleware(app.build())
|
|
|
|
server_config = uvicorn.Config(
|
|
built_app,
|
|
host="0.0.0.0",
|
|
port=port,
|
|
log_level="info",
|
|
)
|
|
server = uvicorn.Server(server_config)
|
|
|
|
# 10b. Schedule initial_prompt self-message after server is ready.
|
|
# Only runs on first boot — creates a marker file to prevent re-execution on restart.
|
|
initial_prompt_task = None
|
|
initial_prompt_marker = resolve_initial_prompt_marker(config_path)
|
|
if config.initial_prompt and not os.path.exists(initial_prompt_marker):
|
|
# Write the marker UP FRONT (#71): if the prompt later crashes or
|
|
# times out, we do NOT replay on next boot — that created a
|
|
# ProcessError cascade where every message kept crashing. Operators
|
|
# can always re-send via chat. Log loudly if the marker write
|
|
# fails so the situation is visible.
|
|
if not mark_initial_prompt_attempted(initial_prompt_marker):
|
|
print(
|
|
f"Initial prompt: WARNING — could not write marker at "
|
|
f"{initial_prompt_marker}; this boot may replay if it crashes.",
|
|
flush=True,
|
|
)
|
|
async def _send_initial_prompt():
|
|
"""Wait for server to be ready, then send initial_prompt as self-message."""
|
|
# Wait for the A2A server to accept connections
|
|
ready = False
|
|
for attempt in range(30):
|
|
await asyncio.sleep(1)
|
|
try:
|
|
async with httpx.AsyncClient(timeout=5.0) as client:
|
|
resp = await client.get(f"http://127.0.0.1:{port}/.well-known/agent.json")
|
|
if resp.status_code == 200:
|
|
ready = True
|
|
break
|
|
except Exception:
|
|
continue
|
|
|
|
if not ready:
|
|
print("Initial prompt: server not ready after 30s, skipping", flush=True)
|
|
return
|
|
|
|
# Send initial prompt through the platform A2A proxy (not directly to self).
|
|
# The proxy logs an a2a_receive with source_id=NULL (canvas-style),
|
|
# broadcasts A2A_RESPONSE via WebSocket so the chat shows both the
|
|
# prompt (as user message) and the response (as agent message).
|
|
# Uses urllib in a thread to avoid asyncio/httpx streaming hangs.
|
|
import json as _json
|
|
import urllib.request
|
|
|
|
def _do_send_sync():
|
|
import time as _time
|
|
payload = _json.dumps({
|
|
"method": "message/send",
|
|
"params": {
|
|
"message": {
|
|
"role": "user",
|
|
"messageId": f"initial-{_uuid.uuid4().hex[:8]}",
|
|
"parts": [{"kind": "text", "text": config.initial_prompt}],
|
|
},
|
|
},
|
|
}).encode()
|
|
|
|
# #220: include platform bearer token so the request isn't
|
|
# silently rejected once any workspace has a live token on
|
|
# file. Without this, initial_prompt 401s in multi-tenant
|
|
# mode exactly like /registry/register did in #215.
|
|
headers = {"Content-Type": "application/json", **auth_headers()}
|
|
|
|
# Retry with backoff — the platform proxy may not be able to
|
|
# reach us yet (container networking takes a moment to settle).
|
|
max_retries = 5
|
|
for attempt in range(max_retries):
|
|
try:
|
|
req = urllib.request.Request(
|
|
f"{platform_url}/workspaces/{workspace_id}/a2a",
|
|
data=payload,
|
|
headers=headers,
|
|
)
|
|
with urllib.request.urlopen(req, timeout=600) as resp:
|
|
resp.read()
|
|
print(f"Initial prompt: completed (status={resp.status})", flush=True)
|
|
break
|
|
except Exception as e:
|
|
if attempt < max_retries - 1:
|
|
delay = 2 ** attempt # 1, 2, 4, 8, 16 seconds
|
|
print(f"Initial prompt: attempt {attempt + 1} failed ({e}), retrying in {delay}s...", flush=True)
|
|
_time.sleep(delay)
|
|
else:
|
|
print(f"Initial prompt: failed after {max_retries} attempts — {e}", flush=True)
|
|
return
|
|
|
|
# Marker was already written up front (#71). Nothing to do here.
|
|
|
|
print("Initial prompt: sending via platform proxy...", flush=True)
|
|
loop = asyncio.get_event_loop()
|
|
loop.run_in_executor(None, _do_send_sync)
|
|
|
|
initial_prompt_task = asyncio.create_task(_send_initial_prompt())
|
|
|
|
# 10c. Idle loop — reflection-on-completion / backlog-pull pattern.
|
|
# Fires config.idle_prompt every config.idle_interval_seconds while the
|
|
# workspace has no active task. This turns every role from "waits for cron"
|
|
# into "self-wakes when idle" — the Hermes/Letta shape from today's
|
|
# multi-framework survey (see docs/ecosystem-watch.md). Cost collapses to
|
|
# event-driven in practice: the idle check is local (no LLM call, just
|
|
# heartbeat.active_tasks==0), and the prompt only fires when there's
|
|
# actually nothing to do. Gated on idle_prompt being non-empty so existing
|
|
# workspaces upgrade opt-in — set idle_prompt in org.yaml defaults or
|
|
# per-workspace to enable.
|
|
idle_loop_task = None
|
|
if config.idle_prompt:
|
|
# Idle-fire HTTP timeout. Kept tight relative to the fire cadence so a
|
|
# hung platform doesn't accumulate dangling requests — a fire that
|
|
# takes longer than the idle interval itself is almost certainly stuck.
|
|
IDLE_FIRE_TIMEOUT_SECONDS = max(60, min(300, config.idle_interval_seconds))
|
|
# Initial settle delay — never longer than 60s so cold-start races
|
|
# don't stall the first fire, and never shorter than the configured
|
|
# interval (short intervals shouldn't fire instantly on boot either).
|
|
IDLE_INITIAL_SETTLE_SECONDS = min(config.idle_interval_seconds, 60)
|
|
|
|
async def _run_idle_loop():
|
|
"""Self-sends config.idle_prompt periodically when the workspace is idle."""
|
|
await asyncio.sleep(IDLE_INITIAL_SETTLE_SECONDS)
|
|
|
|
import json as _json
|
|
from urllib import request as _urlreq, error as _urlerr
|
|
|
|
while True:
|
|
try:
|
|
await asyncio.sleep(config.idle_interval_seconds)
|
|
except asyncio.CancelledError:
|
|
return
|
|
|
|
# Local idle check — no platform API call, no LLM call.
|
|
# heartbeat.active_tasks == 0 means no in-flight work.
|
|
if heartbeat.active_tasks > 0:
|
|
continue
|
|
|
|
# Self-post the idle prompt via the platform A2A proxy (same
|
|
# path as initial_prompt). The agent's own concurrency control
|
|
# rejects if the workspace becomes busy between this check and
|
|
# the post — that's the expected safety valve.
|
|
payload = _json.dumps({
|
|
"method": "message/send",
|
|
"params": {
|
|
"message": {
|
|
"role": "user",
|
|
"messageId": f"idle-{_uuid.uuid4().hex[:8]}",
|
|
"parts": [{"kind": "text", "text": config.idle_prompt}],
|
|
},
|
|
},
|
|
}).encode()
|
|
|
|
def _post_sync():
|
|
# Returns (status_code, error_type) so the caller logs the
|
|
# actual outcome instead of a bare "post failed" line.
|
|
# #220: include auth_headers() on every idle fire. Without
|
|
# this, the idle loop 401s in multi-tenant mode.
|
|
headers = {"Content-Type": "application/json", **auth_headers()}
|
|
try:
|
|
req = _urlreq.Request(
|
|
f"{platform_url}/workspaces/{workspace_id}/a2a",
|
|
data=payload,
|
|
headers=headers,
|
|
)
|
|
with _urlreq.urlopen(req, timeout=IDLE_FIRE_TIMEOUT_SECONDS) as resp:
|
|
resp.read()
|
|
return resp.status, None
|
|
except _urlerr.HTTPError as e:
|
|
return e.code, type(e).__name__
|
|
except _urlerr.URLError as e:
|
|
return None, f"URLError: {e.reason}"
|
|
except Exception as e: # pragma: no cover — catch-all safety net
|
|
return None, type(e).__name__
|
|
|
|
print(
|
|
f"Idle loop: firing (active_tasks=0, interval={config.idle_interval_seconds}s, "
|
|
f"timeout={IDLE_FIRE_TIMEOUT_SECONDS}s)",
|
|
flush=True,
|
|
)
|
|
loop_ref = asyncio.get_running_loop()
|
|
|
|
def _log_result(future):
|
|
try:
|
|
status, err = future.result()
|
|
if err:
|
|
print(
|
|
f"Idle loop: post failed — status={status} err={err}",
|
|
flush=True,
|
|
)
|
|
else:
|
|
print(f"Idle loop: post ok status={status}", flush=True)
|
|
except Exception as e: # pragma: no cover
|
|
print(f"Idle loop: executor callback crashed — {e}", flush=True)
|
|
|
|
fut = loop_ref.run_in_executor(None, _post_sync)
|
|
fut.add_done_callback(_log_result)
|
|
|
|
idle_loop_task = asyncio.create_task(_run_idle_loop())
|
|
|
|
try:
|
|
await server.serve()
|
|
finally:
|
|
# Cancel initial prompt if still running
|
|
if initial_prompt_task and not initial_prompt_task.done():
|
|
initial_prompt_task.cancel()
|
|
# Cancel idle loop if running
|
|
if idle_loop_task and not idle_loop_task.done():
|
|
idle_loop_task.cancel()
|
|
# Gracefully stop the Temporal worker background task on shutdown
|
|
await temporal_wrapper.stop()
|
|
|
|
|
|
if __name__ == "__main__": # pragma: no cover
|
|
asyncio.run(main())
|