"""Workspace runtime entry point. Loads config -> discovers adapter -> setup -> create executor -> wrap in A2A -> register -> heartbeat. """ import asyncio import json import os import socket import httpx import uvicorn from a2a.server.apps import A2AStarletteApplication from a2a.server.request_handlers import DefaultRequestHandler from a2a.server.tasks import InMemoryTaskStore from a2a.types import AgentCard, AgentCapabilities, AgentSkill from adapters import get_adapter, AdapterConfig from agents_md import generate_agents_md from config import load_config from heartbeat import HeartbeatLoop from preflight import run_preflight, render_preflight_report from builtin_tools.awareness_client import get_awareness_config import uuid as _uuid from builtin_tools.telemetry import setup_telemetry, make_trace_middleware from policies.namespaces import resolve_awareness_namespace from initial_prompt import ( mark_initial_prompt_attempted, resolve_initial_prompt_marker, ) from platform_auth import auth_headers def get_machine_ip() -> str: # pragma: no cover """Get the machine's IP for A2A discovery.""" try: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.connect(("8.8.8.8", 80)) ip = s.getsockname()[0] s.close() return ip except Exception: return "127.0.0.1" # Re-exported from transcript_auth for the inline /transcript handler. # Separate module keeps the security-critical gate import-light + unit-testable. from transcript_auth import transcript_authorized as _transcript_authorized async def main(): # pragma: no cover workspace_id = os.environ.get("WORKSPACE_ID", "workspace-default") config_path = os.environ.get("WORKSPACE_CONFIG_PATH", "/configs") platform_url = os.environ.get("PLATFORM_URL", "http://platform:8080") awareness_config = get_awareness_config() # 0. Initialise OpenTelemetry (no-op if packages not installed) setup_telemetry(service_name=workspace_id) # 1. Load config config = load_config(config_path) port = config.a2a.port preflight = run_preflight(config, config_path) render_preflight_report(preflight) # 1a. Generate AGENTS.md so peer agents and discovery tools can see this # workspace's identity, role, endpoint, and capabilities immediately. try: generate_agents_md(config_path, "/workspace/AGENTS.md") except Exception as _agents_md_err: # pragma: no cover print(f"Warning: AGENTS.md generation failed (non-fatal): {_agents_md_err}") if not preflight.ok: raise SystemExit(1) if awareness_config: awareness_namespace = resolve_awareness_namespace( workspace_id, awareness_config.get("namespace", ""), ) print(f"Awareness enabled for namespace: {awareness_namespace}") # 1.5 Initialise governance adapter (no-op if disabled or package absent) from builtin_tools.governance import initialize_governance if config.governance.enabled: await initialize_governance(config.governance) print(f"Governance: Microsoft Agent Governance Toolkit enabled (mode={config.governance.policy_mode})") else: print("Governance: disabled (set governance.enabled: true in config.yaml to activate)") # 2. Create heartbeat (passed to adapter for task tracking) heartbeat = HeartbeatLoop(platform_url, workspace_id) # 3. Get adapter for this runtime runtime = config.runtime or "langgraph" adapter_cls = get_adapter(runtime) # Raises KeyError if unknown — no silent fallback adapter = adapter_cls() print(f"Runtime: {runtime} ({adapter.display_name()})") # 4. Build adapter config adapter_config = AdapterConfig( model=config.model, system_prompt=None, # Adapter builds its own prompt tools=config.skills, # Skill names from config.yaml runtime_config=vars(config.runtime_config) if config.runtime_config else {}, config_path=config_path, workspace_id=workspace_id, prompt_files=config.prompt_files, a2a_port=port, heartbeat=heartbeat, ) # 5. Setup adapter and create executor # If setup fails, ensure heartbeat is stopped to prevent resource leak try: await adapter.setup(adapter_config) executor = await adapter.create_executor(adapter_config) except Exception: # heartbeat hasn't started yet but may have async tasks pending if hasattr(heartbeat, "stop"): try: await heartbeat.stop() except Exception: pass raise # 5.5. Initialise Temporal durable execution wrapper (optional) # Connects to TEMPORAL_HOST (default: localhost:7233) and starts a # co-located Temporal worker as a background asyncio task. # No-op with a warning log if Temporal is unreachable or temporalio # is not installed — all tasks fall back to direct execution transparently. from builtin_tools.temporal_workflow import create_wrapper as _create_temporal_wrapper temporal_wrapper = _create_temporal_wrapper() await temporal_wrapper.start() # Get loaded skills for agent card (adapter may have populated them) loaded_skills = getattr(adapter, "loaded_skills", []) # 6. Build Agent Card machine_ip = os.environ.get("HOSTNAME", get_machine_ip()) workspace_url = f"http://{machine_ip}:{port}" agent_card = AgentCard( name=config.name, description=config.description or config.name, version=config.version, url=workspace_url, capabilities=AgentCapabilities( streaming=config.a2a.streaming, pushNotifications=config.a2a.push_notifications, stateTransitionHistory=True, ), skills=[ AgentSkill( id=skill.metadata.id, name=skill.metadata.name, description=skill.metadata.description, tags=skill.metadata.tags, examples=skill.metadata.examples, ) for skill in loaded_skills ], defaultInputModes=["text/plain", "application/json"], defaultOutputModes=["text/plain", "application/json"], ) # 7. Wrap in A2A. # # Regression fix (#204): PR #198 tried to wire push_config_store + # push_sender to satisfy #175 (push notification capability), but # PushNotificationSender is an abstract base class in the a2a-sdk and # can't be instantiated directly. Passing it crashed main.py on startup # with `TypeError: Can't instantiate abstract class`. Dropped back to # DefaultRequestHandler's own defaults — pushNotifications capability # in the AgentCard below is still advertised via AgentCapabilities so # clients know we COULD do pushes; actually implementing them requires # a concrete sender subclass, tracked as a Phase-H follow-up to #175. handler = DefaultRequestHandler( agent_executor=executor, task_store=InMemoryTaskStore(), ) app = A2AStarletteApplication( agent_card=agent_card, http_handler=handler, ) # 8. Register with platform agent_card_dict = { "name": config.name, "description": config.description, "version": config.version, "url": workspace_url, "skills": [ { "id": s.metadata.id, "name": s.metadata.name, "description": s.metadata.description, "tags": s.metadata.tags, } for s in loaded_skills ], "capabilities": { "streaming": config.a2a.streaming, "pushNotifications": config.a2a.push_notifications, }, } async with httpx.AsyncClient(timeout=10.0) as client: try: resp = await client.post( f"{platform_url}/registry/register", json={ "id": workspace_id, "url": workspace_url, "agent_card": agent_card_dict, }, headers=auth_headers(), ) print(f"Registered with platform: {resp.status_code}") # Phase 30.1 — capture the auth token issued at first register. # The platform only mints one on first register per workspace, # so a subsequent restart gets an empty auth_token and we # keep using the on-disk copy from the original issuance. if resp.status_code == 200: try: body = resp.json() tok = body.get("auth_token") if tok: from platform_auth import save_token save_token(tok) print(f"Saved workspace auth token (prefix={tok[:8]}…)") except Exception as parse_exc: print(f"Warning: couldn't parse register response for token: {parse_exc}") except Exception as e: print(f"Warning: failed to register with platform: {e}") # 9. Start heartbeat heartbeat.start() # 9b. Start skills hot-reload watcher (background task) # When a skill file changes the watcher reloads the skill module and calls # back into the adapter so the next A2A request uses the updated tools. if config.skills: try: from skill_loader.watcher import SkillsWatcher def _on_skill_reload(updated_skill): """Rebuild the LangGraph agent when a skill changes in-place.""" if not hasattr(adapter, "loaded_skills"): return # Replace the matching skill in the adapter's skill list adapter.loaded_skills = [ updated_skill if s.metadata.id == updated_skill.metadata.id else s for s in adapter.loaded_skills ] # Rebuild the agent's tool list from updated skills if hasattr(adapter, "all_tools") and hasattr(adapter, "system_prompt"): from builtin_tools.approval import request_approval from builtin_tools.delegation import delegate_to_workspace from builtin_tools.memory import commit_memory, search_memory from builtin_tools.sandbox import run_code base_tools = [delegate_to_workspace, request_approval, commit_memory, search_memory, run_code] skill_tools = [] for sk in adapter.loaded_skills: skill_tools.extend(sk.tools) adapter.all_tools = base_tools + skill_tools # Rebuild compiled agent so next ainvoke picks up new tools try: from agent import create_agent new_agent = create_agent( config.model, adapter.all_tools, adapter.system_prompt ) executor.agent = new_agent print(f"Skills hot-reload: '{updated_skill.metadata.id}' reloaded — " f"{len(updated_skill.tools)} tool(s)") except Exception as rebuild_err: print(f"Skills hot-reload: agent rebuild failed: {rebuild_err}") skills_watcher = SkillsWatcher( config_path=config_path, skill_names=config.skills, on_reload=_on_skill_reload, ) asyncio.create_task(skills_watcher.start()) print(f"Skills hot-reload enabled for: {config.skills}") except Exception as e: print(f"Warning: skills watcher could not start: {e}") # 10. Run A2A server print(f"Workspace {workspace_id} starting on port {port}") # Wrap the ASGI app with W3C TraceContext extraction middleware so incoming # A2A HTTP requests propagate their trace context into _incoming_trace_context. starlette_app = app.build() # Add /transcript route — exposes the most-recent agent session log # (claude-code reads ~/.claude/projects//.jsonl). Other # runtimes return supported:false. from starlette.responses import JSONResponse from starlette.routing import Route async def _transcript_handler(request): # Require workspace bearer token — the same token issued at registration # and stored in /configs/.auth_token. Any container on molecule-monorepo-net # could otherwise read the full session log. Closes #287. # # #328: fail CLOSED when the token file is unavailable. get_token() # returns None during the bootstrap window (first register hasn't # completed), if /configs/.auth_token was deleted, or on OSError. # The old `if expected:` guard treated all three cases as "skip # auth" — an unauthenticated container on the same Docker network # could read the entire session log during that window. Deny # instead. The platform's TranscriptHandler acquires the token # during registration, so once the bootstrap completes it always # has a valid credential to present. from platform_auth import get_token if not _transcript_authorized(get_token(), request.headers.get("Authorization", "")): return JSONResponse({"error": "unauthorized"}, status_code=401) try: since = int(request.query_params.get("since", "0")) limit = int(request.query_params.get("limit", "100")) except (TypeError, ValueError): return JSONResponse({"error": "since and limit must be integers"}, status_code=400) result = await adapter.transcript_lines(since=since, limit=limit) return JSONResponse(result) starlette_app.add_route("/transcript", _transcript_handler, methods=["GET"]) built_app = make_trace_middleware(starlette_app) server_config = uvicorn.Config( built_app, host="0.0.0.0", port=port, log_level="info", ) server = uvicorn.Server(server_config) # 10b. Schedule initial_prompt self-message after server is ready. # Only runs on first boot — creates a marker file to prevent re-execution on restart. initial_prompt_task = None initial_prompt_marker = resolve_initial_prompt_marker(config_path) if config.initial_prompt and not os.path.exists(initial_prompt_marker): # Write the marker UP FRONT (#71): if the prompt later crashes or # times out, we do NOT replay on next boot — that created a # ProcessError cascade where every message kept crashing. Operators # can always re-send via chat. Log loudly if the marker write # fails so the situation is visible. if not mark_initial_prompt_attempted(initial_prompt_marker): print( f"Initial prompt: WARNING — could not write marker at " f"{initial_prompt_marker}; this boot may replay if it crashes.", flush=True, ) async def _send_initial_prompt(): """Wait for server to be ready, then send initial_prompt as self-message.""" # Wait for the A2A server to accept connections ready = False for attempt in range(30): await asyncio.sleep(1) try: async with httpx.AsyncClient(timeout=5.0) as client: resp = await client.get(f"http://127.0.0.1:{port}/.well-known/agent.json") if resp.status_code == 200: ready = True break except Exception: continue if not ready: print("Initial prompt: server not ready after 30s, skipping", flush=True) return # Send initial prompt through the platform A2A proxy (not directly to self). # The proxy logs an a2a_receive with source_id=NULL (canvas-style), # broadcasts A2A_RESPONSE via WebSocket so the chat shows both the # prompt (as user message) and the response (as agent message). # Uses urllib in a thread to avoid asyncio/httpx streaming hangs. import json as _json import urllib.request def _do_send_sync(): import time as _time payload = _json.dumps({ "method": "message/send", "params": { "message": { "role": "user", "messageId": f"initial-{_uuid.uuid4().hex[:8]}", "parts": [{"kind": "text", "text": config.initial_prompt}], }, }, }).encode() # #220: include platform bearer token so the request isn't # silently rejected once any workspace has a live token on # file. Without this, initial_prompt 401s in multi-tenant # mode exactly like /registry/register did in #215. headers = {"Content-Type": "application/json", **auth_headers()} # Retry with backoff — the platform proxy may not be able to # reach us yet (container networking takes a moment to settle). max_retries = 5 for attempt in range(max_retries): try: req = urllib.request.Request( f"{platform_url}/workspaces/{workspace_id}/a2a", data=payload, headers=headers, ) with urllib.request.urlopen(req, timeout=600) as resp: resp.read() print(f"Initial prompt: completed (status={resp.status})", flush=True) break except Exception as e: if attempt < max_retries - 1: delay = 2 ** attempt # 1, 2, 4, 8, 16 seconds print(f"Initial prompt: attempt {attempt + 1} failed ({e}), retrying in {delay}s...", flush=True) _time.sleep(delay) else: print(f"Initial prompt: failed after {max_retries} attempts — {e}", flush=True) return # Marker was already written up front (#71). Nothing to do here. print("Initial prompt: sending via platform proxy...", flush=True) loop = asyncio.get_event_loop() loop.run_in_executor(None, _do_send_sync) initial_prompt_task = asyncio.create_task(_send_initial_prompt()) # 10c. Idle loop — reflection-on-completion / backlog-pull pattern. # Fires config.idle_prompt every config.idle_interval_seconds while the # workspace has no active task. This turns every role from "waits for cron" # into "self-wakes when idle" — the Hermes/Letta shape from today's # multi-framework survey (see docs/ecosystem-watch.md). Cost collapses to # event-driven in practice: the idle check is local (no LLM call, just # heartbeat.active_tasks==0), and the prompt only fires when there's # actually nothing to do. Gated on idle_prompt being non-empty so existing # workspaces upgrade opt-in — set idle_prompt in org.yaml defaults or # per-workspace to enable. idle_loop_task = None if config.idle_prompt: # Idle-fire HTTP timeout. Kept tight relative to the fire cadence so a # hung platform doesn't accumulate dangling requests — a fire that # takes longer than the idle interval itself is almost certainly stuck. IDLE_FIRE_TIMEOUT_SECONDS = max(60, min(300, config.idle_interval_seconds)) # Initial settle delay — never longer than 60s so cold-start races # don't stall the first fire, and never shorter than the configured # interval (short intervals shouldn't fire instantly on boot either). IDLE_INITIAL_SETTLE_SECONDS = min(config.idle_interval_seconds, 60) async def _run_idle_loop(): """Self-sends config.idle_prompt periodically when the workspace is idle.""" await asyncio.sleep(IDLE_INITIAL_SETTLE_SECONDS) import json as _json from urllib import request as _urlreq, error as _urlerr while True: try: await asyncio.sleep(config.idle_interval_seconds) except asyncio.CancelledError: return # Local idle check — no platform API call, no LLM call. # heartbeat.active_tasks == 0 means no in-flight work. if heartbeat.active_tasks > 0: continue # Self-post the idle prompt via the platform A2A proxy (same # path as initial_prompt). The agent's own concurrency control # rejects if the workspace becomes busy between this check and # the post — that's the expected safety valve. payload = _json.dumps({ "method": "message/send", "params": { "message": { "role": "user", "messageId": f"idle-{_uuid.uuid4().hex[:8]}", "parts": [{"kind": "text", "text": config.idle_prompt}], }, }, }).encode() def _post_sync(): # Returns (status_code, error_type) so the caller logs the # actual outcome instead of a bare "post failed" line. # #220: include auth_headers() on every idle fire. Without # this, the idle loop 401s in multi-tenant mode. headers = {"Content-Type": "application/json", **auth_headers()} try: req = _urlreq.Request( f"{platform_url}/workspaces/{workspace_id}/a2a", data=payload, headers=headers, ) with _urlreq.urlopen(req, timeout=IDLE_FIRE_TIMEOUT_SECONDS) as resp: resp.read() return resp.status, None except _urlerr.HTTPError as e: return e.code, type(e).__name__ except _urlerr.URLError as e: return None, f"URLError: {e.reason}" except Exception as e: # pragma: no cover — catch-all safety net return None, type(e).__name__ print( f"Idle loop: firing (active_tasks=0, interval={config.idle_interval_seconds}s, " f"timeout={IDLE_FIRE_TIMEOUT_SECONDS}s)", flush=True, ) loop_ref = asyncio.get_running_loop() def _log_result(future): try: status, err = future.result() if err: print( f"Idle loop: post failed — status={status} err={err}", flush=True, ) else: print(f"Idle loop: post ok status={status}", flush=True) except Exception as e: # pragma: no cover print(f"Idle loop: executor callback crashed — {e}", flush=True) fut = loop_ref.run_in_executor(None, _post_sync) fut.add_done_callback(_log_result) idle_loop_task = asyncio.create_task(_run_idle_loop()) try: await server.serve() finally: # Cancel initial prompt if still running if initial_prompt_task and not initial_prompt_task.done(): initial_prompt_task.cancel() # Cancel idle loop if running if idle_loop_task and not idle_loop_task.done(): idle_loop_task.cancel() # Gracefully stop the Temporal worker background task on shutdown await temporal_wrapper.stop() if __name__ == "__main__": # pragma: no cover asyncio.run(main())