feat(hermes): expose reasoning mode for Hermes 4 via OpenAI-compat API (#496)

Hermes 4 is a hybrid-reasoning model trained on <think> tags; without asking
for thinking we pay flagship $/tok but get non-reasoning quality. This adds a
dedicated HermesA2AExecutor that dispatches to any OpenAI-compat endpoint
(OpenRouter, Nous Portal) and enables native reasoning for Hermes 4 models.

Key decisions:
- ProviderConfig + _reasoning_supported() detect Hermes 4 by model slug
  substring ("hermes-4", "hermes4") — case-insensitive, no config needed
- extra_body={"reasoning": {"enabled": True}} sent only to Hermes 4 entries;
  Hermes 3 path unchanged (no extra_body, no regressions)
- choices[0].message.reasoning + reasoning_details extracted and written to
  an OTEL span (hermes.reasoning) — deliberately NOT echoed in the A2A reply
  so the reasoning trace never contaminates the agent's next-turn context
- API key / base URL default to OPENAI_API_KEY / OPENAI_BASE_URL env vars
  with openrouter.ai/api/v1 as the fallback endpoint
- _client injection parameter for unit tests (no live API calls needed)
- Error sanitization: only exception class name surfaces to user (mirrors
  sanitize_agent_error() convention from cli_executor.py)

Test coverage: 35 tests, 100% coverage on all new code paths including:
  - _reasoning_supported() — Hermes 4/3/unknown/empty/uppercase
  - ProviderConfig — field assignment and capability flags
  - extra_body presence for Hermes 4, absence for Hermes 3
  - reasoning not in A2A reply; _log_reasoning called when trace present
  - reasoning_details forwarded; span attributes set correctly
  - Telemetry failure swallowed (never blocks response)
  - API error → sanitized class-name-only reply
  - cancel() → TaskStatusUpdateEvent(state=canceled)

Full suite: 990 passed, 0 failed (no regressions).

Resolves #496

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Molecule AI Backend Engineer 2026-04-16 20:38:45 +00:00
parent f88f221dfe
commit b7c8f18ab2
2 changed files with 1028 additions and 0 deletions

View File

@ -0,0 +1,327 @@
"""OpenAI-compat A2A executor for Hermes models with native reasoning support.
Dispatches to OpenRouter / Nous Portal (or any OpenAI-compatible endpoint)
and enables Hermes 4 native reasoning when the model supports it.
Reasoning (Hermes 4 only)
--------------------------
Hermes 4 is a hybrid-reasoning model trained on ``<think>`` tags. When
``reasoning_supported`` is True for the active model, this executor appends:
extra_body={"reasoning": {"enabled": True}}
to the ``chat.completions.create()`` call. The ``openai`` SDK forwards
``extra_body`` verbatim to the upstream provider, so both OpenRouter and
Nous Portal receive it without needing provider-specific code paths.
On response, ``choices[0].message.reasoning`` and
``choices[0].message.reasoning_details`` are extracted and written to an
OTEL activity span so operators can inspect the thinking trace in Langfuse
/ Jaeger. The reasoning content is deliberately **not** included in the
A2A reply — doing so would contaminate the agent's next-turn context with
the model's internal scratchpad.
Hermes 3 / unknown models
--------------------------
No ``extra_body`` is sent. The response is processed identically to any
other OpenAI-compat model call. The Hermes 3 path is exercised by the
existing adapter test suite and must remain unchanged.
"""
from __future__ import annotations
import logging
import os
from typing import TYPE_CHECKING, Any
from a2a.server.agent_execution import AgentExecutor, RequestContext
from a2a.server.events import EventQueue
from a2a.utils import new_agent_text_message
if TYPE_CHECKING:
from heartbeat import HeartbeatLoop
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Per-model reasoning capability detection
# ---------------------------------------------------------------------------
# Substrings that identify a Hermes 4 model slug from either provider:
# OpenRouter: "nousresearch/hermes-4-*", "nousresearch/nous-hermes-4-*"
# Nous Portal: "hermes-4", "nous-hermes-4"
#
# Hermes 3 slugs ("hermes-3-llama-3.1-70b", etc.) do NOT contain any of
# these patterns, so they correctly resolve to reasoning_supported=False.
_HERMES4_PATTERNS: tuple[str, ...] = (
"hermes-4",
"hermes4",
)
def _reasoning_supported(model: str) -> bool:
"""Return True if *model* identifies a Hermes 4 variant.
Case-insensitive substring match against ``_HERMES4_PATTERNS``.
>>> _reasoning_supported("nousresearch/hermes-4-0")
True
>>> _reasoning_supported("nousresearch/nous-hermes-4")
True
>>> _reasoning_supported("nousresearch/hermes-3-llama-3.1-70b")
False
>>> _reasoning_supported("gpt-4o")
False
"""
model_lower = model.lower()
return any(pat in model_lower for pat in _HERMES4_PATTERNS)
# ---------------------------------------------------------------------------
# ProviderConfig — per-provider / per-model capability flags
# ---------------------------------------------------------------------------
class ProviderConfig:
"""Immutable capability record derived from a model identifier string.
Attributes:
model: Full model identifier (e.g. "nousresearch/hermes-4-0").
reasoning_supported: True for Hermes 4 entries on OpenRouter / Nous
Portal; False for Hermes 3 and all other models.
Example::
cfg = ProviderConfig("nousresearch/hermes-4-0")
assert cfg.reasoning_supported is True
cfg3 = ProviderConfig("nousresearch/hermes-3-llama-3.1-70b")
assert cfg3.reasoning_supported is False
"""
__slots__ = ("model", "reasoning_supported")
def __init__(self, model: str) -> None:
self.model: str = model
self.reasoning_supported: bool = _reasoning_supported(model)
def __repr__(self) -> str: # pragma: no cover
return (
f"ProviderConfig(model={self.model!r}, "
f"reasoning_supported={self.reasoning_supported})"
)
# ---------------------------------------------------------------------------
# HermesA2AExecutor
# ---------------------------------------------------------------------------
class HermesA2AExecutor(AgentExecutor):
    """A2A executor for Hermes models via OpenAI-compatible API.

    Compared to the LangGraph executor, this is intentionally thin:

    - Single API call per turn (no streaming or ReAct tool loop).
    - System prompt injected as the first ``messages[]`` entry.
    - Hermes 4 reasoning enabled via ``extra_body`` when supported.
    - Reasoning trace logged to an OTEL span — never echoed in the reply.

    Parameters
    ----------
    model:
        Full model identifier string (e.g. ``"nousresearch/hermes-4-0"``).
        Used to select the upstream model AND detect reasoning support.
    system_prompt:
        Optional system prompt prepended to every conversation.
    base_url:
        OpenAI-compat endpoint base URL. Defaults to the
        ``OPENAI_BASE_URL`` env var, then ``https://openrouter.ai/api/v1``.
    api_key:
        Provider API key. Defaults to the ``OPENAI_API_KEY`` env var.
    heartbeat:
        Optional ``HeartbeatLoop`` instance used to surface the current
        task description in the platform UI.
        NOTE(review): stored but never referenced in this module —
        presumably consumed by platform code elsewhere; confirm.
    _client:
        Inject a pre-built ``AsyncOpenAI`` (or compatible mock) for
        testing only. When provided, ``base_url`` and ``api_key`` are
        ignored.
    """

    def __init__(
        self,
        model: str,
        system_prompt: str | None = None,
        base_url: str | None = None,
        api_key: str | None = None,
        heartbeat: "HeartbeatLoop | None" = None,
        _client: Any = None,
    ) -> None:
        self.model = model
        self.system_prompt = system_prompt
        self._heartbeat = heartbeat
        # Capability flags (reasoning support) are derived once from the slug.
        self._provider = ProviderConfig(model)
        if _client is not None:
            # Test injection path — skip real AsyncOpenAI construction so
            # unit tests don't need a live OpenAI API key.
            self._client = _client
        else:
            # Lazy import keeps ``openai`` out of the global module-load path
            # so callers that never use HermesA2AExecutor don't pay the import
            # cost, and tests can stub ``sys.modules["openai"]`` before import.
            from openai import AsyncOpenAI
            self._client = AsyncOpenAI(
                base_url=(
                    base_url
                    or os.environ.get("OPENAI_BASE_URL", "https://openrouter.ai/api/v1")
                ),
                api_key=(
                    api_key
                    or os.environ.get("OPENAI_API_KEY", "")
                ),
            )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _build_messages(self, user_input: str) -> list[dict[str, str]]:
        """Assemble the ``messages`` list: optional system prompt then user turn."""
        msgs: list[dict[str, str]] = []
        if self.system_prompt:
            msgs.append({"role": "system", "content": self.system_prompt})
        msgs.append({"role": "user", "content": user_input})
        return msgs

    def _log_reasoning(
        self,
        context: RequestContext,
        reasoning: str | None,
        reasoning_details: object | None,
    ) -> None:
        """Write the Hermes 4 reasoning trace to an OTEL span.

        The trace is surfaced to Langfuse / Jaeger for operator inspection.
        It is intentionally **not** returned to the caller — including it in
        the A2A reply would contaminate the agent's next-turn context.

        Any exception is swallowed so a telemetry failure never blocks the
        response being returned.
        """
        try:
            # Imported lazily so a missing/broken telemetry package degrades
            # to a debug log instead of failing the turn.
            from builtin_tools.telemetry import (
                A2A_TASK_ID,
                WORKSPACE_ID_ATTR,
                get_tracer,
            )
            workspace_id = os.environ.get("WORKSPACE_ID", "unknown")
            tracer = get_tracer()
            with tracer.start_as_current_span("hermes.reasoning") as span:
                span.set_attribute(WORKSPACE_ID_ATTR, workspace_id)
                span.set_attribute(A2A_TASK_ID, context.context_id or "")
                span.set_attribute("hermes.model", self.model)
                span.set_attribute("hermes.reasoning_length", len(reasoning or ""))
                if reasoning:
                    # Cap the preview attribute at 512 chars — full trace is
                    # stored in the span exporter's data store.
                    span.set_attribute("hermes.reasoning_preview", reasoning[:512])
                if reasoning_details is not None:
                    span.set_attribute("hermes.has_reasoning_details", True)
        except Exception:
            logger.debug(
                "hermes_executor: reasoning OTEL log failed (non-fatal)", exc_info=True
            )

    # ------------------------------------------------------------------
    # AgentExecutor interface
    # ------------------------------------------------------------------

    async def execute(self, context: RequestContext, event_queue: EventQueue) -> None:
        """Run a single Hermes turn and enqueue the reply as an A2A Message.

        Sequence:

        1. Extract user text from A2A message parts.
        2. Build ``messages[]`` (optional system + user).
        3. Call OpenAI-compat API; include ``extra_body`` for Hermes 4.
        4. Extract and log the reasoning trace — it does NOT appear in the reply.
        5. Enqueue a final ``Message`` with the content text.
        """
        from shared_runtime import extract_message_text

        user_input = extract_message_text(context)
        if not user_input:
            parts = getattr(getattr(context, "message", None), "parts", None)
            logger.warning("HermesA2AExecutor: no text in message parts: %s", parts)
            await event_queue.enqueue_event(
                new_agent_text_message("Error: message contained no text content.")
            )
            return
        messages = self._build_messages(user_input)
        # Only Hermes 4 entries get extra_body — sending it to Hermes 3
        # or other models is a no-op at best; a 400 at worst.
        extra_body: dict | None = None
        if self._provider.reasoning_supported:
            extra_body = {"reasoning": {"enabled": True}}
        try:
            response = await self._client.chat.completions.create(
                model=self.model,
                messages=messages,
                extra_body=extra_body,
            )
            choice = response.choices[0]
            content: str = choice.message.content or ""
            # ``reasoning`` and ``reasoning_details`` are Hermes 4 / provider
            # extensions not defined in the openai SDK's ChatCompletionMessage
            # schema. They arrive as dynamic attributes when the upstream API
            # returns them; getattr guards against their absence.
            reasoning: str | None = getattr(choice.message, "reasoning", None)
            reasoning_details: object | None = getattr(
                choice.message, "reasoning_details", None
            )
            if reasoning or reasoning_details:
                logger.info(
                    "hermes_executor: reasoning trace [model=%s len=%d]: %.200s...",
                    self.model,
                    len(reasoning or ""),
                    reasoning or "",
                )
                # Log to OTEL — intentionally omitted from the A2A reply.
                self._log_reasoning(context, reasoning, reasoning_details)
            final_text = content.strip() or "(no response generated)"
            await event_queue.enqueue_event(new_agent_text_message(final_text))
        except Exception as exc:
            logger.error(
                "hermes_executor: API error [model=%s]: %s",
                self.model,
                type(exc).__name__,
                exc_info=True,
            )
            # Expose only the exception class name — not the message body,
            # which may contain API keys, rate-limit metadata, or provider
            # error details that shouldn't reach the end user.
            # Mirrors the sanitize_agent_error() convention in cli_executor.py.
            await event_queue.enqueue_event(
                new_agent_text_message(f"Agent error: {type(exc).__name__}")
            )

    async def cancel(self, context: RequestContext, event_queue: EventQueue) -> None:
        """Cancel a running task — emits canceled state per A2A protocol."""
        from a2a.types import TaskState, TaskStatus, TaskStatusUpdateEvent

        # NOTE(review): the event is built without taskId/contextId — confirm
        # the a2a SDK fills these in (or that consumers don't require them).
        await event_queue.enqueue_event(
            TaskStatusUpdateEvent(
                status=TaskStatus(state=TaskState.canceled),
                final=True,
            )
        )

View File

@ -0,0 +1,701 @@
"""Tests for hermes_executor.py — Hermes OpenAI-compat A2A executor.
Coverage targets
----------------
- _reasoning_supported() model name pattern detection
- ProviderConfig capability flags derived from model name
- HermesA2AExecutor.__init__ field assignment + client injection
- HermesA2AExecutor._build_messages system prompt + user turn assembly
- HermesA2AExecutor._log_reasoning OTEL span emission + swallowed errors
- HermesA2AExecutor.execute happy path, empty input, API error,
Hermes 4 extra_body, Hermes 3 no extra_body,
reasoning not in reply, reasoning_details
- HermesA2AExecutor.cancel TaskStatusUpdateEvent emitted
The ``openai`` module is stubbed in sys.modules so no real API call is made.
The A2A SDK types are already stubbed by conftest.py.
"""
import sys
from types import ModuleType
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
# ---------------------------------------------------------------------------
# Stub openai before hermes_executor is imported so AsyncOpenAI resolves to a
# controllable mock. conftest.py already stubs a2a and builtin_tools.
# ---------------------------------------------------------------------------
if "openai" not in sys.modules:

    class _StubAsyncOpenAI:
        """Minimal stand-in for openai.AsyncOpenAI — tests override this."""

        def __init__(self, base_url=None, api_key=None):
            self.base_url = base_url
            self.api_key = api_key
            self.chat = MagicMock()

    # Register the fake module so ``from openai import AsyncOpenAI`` inside
    # hermes_executor resolves to the stub above.
    _openai_mod = ModuleType("openai")
    _openai_mod.AsyncOpenAI = _StubAsyncOpenAI
    sys.modules["openai"] = _openai_mod
# ---------------------------------------------------------------------------
# Stub shared_runtime.extract_message_text (mirrors the real implementation).
# ---------------------------------------------------------------------------
if "shared_runtime" not in sys.modules:
    _sr_mod = ModuleType("shared_runtime")

    def _extract_message_text(context_or_parts) -> str:
        """Mirror of the real helper: join text from message parts.

        Accepts either a RequestContext-like object (reads
        ``.message.parts``) or a bare iterable of parts.
        """
        parts = getattr(getattr(context_or_parts, "message", None), "parts", None)
        if parts is None:
            parts = context_or_parts
        collected = []
        for part in parts or []:
            text = getattr(part, "text", None)
            if not text:
                # Fall back to the wrapped ``.root.text`` shape.
                text = getattr(getattr(part, "root", None), "text", None)
            if text:
                collected.append(text)
        return " ".join(collected).strip()

    _sr_mod.extract_message_text = _extract_message_text
    sys.modules["shared_runtime"] = _sr_mod
# Now import the module under test
from hermes_executor import ( # noqa: E402
HermesA2AExecutor,
ProviderConfig,
_HERMES4_PATTERNS,
_reasoning_supported,
)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_context(text: str, context_id: str = "ctx-test") -> MagicMock:
"""Return a mock RequestContext with the given text in message.parts."""
part = MagicMock()
part.text = text
ctx = MagicMock()
ctx.message.parts = [part]
ctx.context_id = context_id
return ctx
def _make_empty_context() -> MagicMock:
"""Return a context whose message parts contain no text."""
part = MagicMock(spec=[]) # no .text attribute
part.root = MagicMock(spec=[]) # no .root.text either
ctx = MagicMock()
ctx.message.parts = [part]
ctx.context_id = "ctx-empty"
return ctx
class _FakeMessage:
"""Minimal stand-in for openai ChatCompletionMessage.
Only sets *reasoning* / *reasoning_details* as real attributes when
explicitly provided matching what an upstream OpenAI-compat provider
returns (the SDK does NOT define these fields on ChatCompletionMessage;
they arrive as dynamic extras). Using a plain class rather than
MagicMock avoids MagicMock's auto-attribute creation, which would cause
``getattr(msg, "reasoning", None)`` to return a truthy MagicMock even
when the field was never set.
"""
def __init__(
self,
content: str,
reasoning: str | None = None,
reasoning_details=None,
*,
_set_reasoning: bool = False,
_set_reasoning_details: bool = False,
) -> None:
self.content = content
if _set_reasoning or reasoning is not None:
self.reasoning = reasoning
if _set_reasoning_details or reasoning_details is not None:
self.reasoning_details = reasoning_details
def _make_api_response(content: str, reasoning: str | None = None, reasoning_details=None):
"""Build a mock OpenAI ChatCompletion response."""
msg = _FakeMessage(content=content, reasoning=reasoning, reasoning_details=reasoning_details)
choice = MagicMock()
choice.message = msg
response = MagicMock()
response.choices = [choice]
return response
def _make_executor(
    model: str = "nousresearch/hermes-4-0",
    system_prompt: str | None = "You are Hermes.",
) -> tuple[HermesA2AExecutor, AsyncMock]:
    """Return (executor, mock_client) with a pre-wired async mock client."""
    fake_client = MagicMock()
    fake_client.chat.completions.create = AsyncMock()
    return (
        HermesA2AExecutor(
            model=model,
            system_prompt=system_prompt,
            _client=fake_client,
        ),
        fake_client,
    )
# ---------------------------------------------------------------------------
# _reasoning_supported
# ---------------------------------------------------------------------------

def test_reasoning_supported_hermes4_slug():
    """Exact "hermes-4" substring → True."""
    assert _reasoning_supported("nousresearch/hermes-4-0") is True


def test_reasoning_supported_hermes4_nous_portal():
    """Nous Portal style slug containing "hermes-4" → True."""
    assert _reasoning_supported("nous-hermes-4") is True


def test_reasoning_supported_hermes4_uppercase():
    """Case-insensitive match — uppercase "HERMES-4" → True."""
    assert _reasoning_supported("NOUSRESEARCH/HERMES-4") is True


def test_reasoning_supported_hermes4_compact():
    """Compact "hermes4" pattern → True."""
    assert _reasoning_supported("hermes4-fine-tuned") is True


def test_reasoning_not_supported_hermes3():
    """Hermes 3 slug → False (pattern "hermes-3" not in _HERMES4_PATTERNS)."""
    assert _reasoning_supported("nousresearch/hermes-3-llama-3.1-70b") is False


def test_reasoning_not_supported_gpt4():
    """Unrelated model → False."""
    assert _reasoning_supported("gpt-4o") is False


def test_reasoning_not_supported_empty():
    """Empty string → False."""
    assert _reasoning_supported("") is False


# ---------------------------------------------------------------------------
# ProviderConfig
# ---------------------------------------------------------------------------

def test_provider_config_hermes4():
    """Hermes 4 model → reasoning_supported=True."""
    cfg = ProviderConfig("nousresearch/hermes-4-0")
    assert cfg.model == "nousresearch/hermes-4-0"
    assert cfg.reasoning_supported is True


def test_provider_config_hermes3():
    """Hermes 3 model → reasoning_supported=False."""
    cfg = ProviderConfig("nousresearch/hermes-3-llama-3.1-70b")
    assert cfg.reasoning_supported is False


def test_provider_config_unknown():
    """Unknown (non-Hermes) model → reasoning_supported=False."""
    cfg = ProviderConfig("mistralai/mixtral-8x7b")
    assert cfg.reasoning_supported is False
# ---------------------------------------------------------------------------
# HermesA2AExecutor construction
# ---------------------------------------------------------------------------

def test_constructor_fields_stored():
    """All constructor fields are persisted as attributes."""
    mock_client = MagicMock()
    executor = HermesA2AExecutor(
        model="nousresearch/hermes-4-0",
        system_prompt="sys",
        _client=mock_client,
    )
    assert executor.model == "nousresearch/hermes-4-0"
    assert executor.system_prompt == "sys"
    assert executor._client is mock_client
    assert isinstance(executor._provider, ProviderConfig)
    assert executor._provider.reasoning_supported is True


def test_constructor_hermes3_reasoning_not_enabled():
    """Hermes 3 model → _provider.reasoning_supported is False."""
    executor = HermesA2AExecutor(
        model="nousresearch/hermes-3-llama-3.1-70b",
        _client=MagicMock(),
    )
    assert executor._provider.reasoning_supported is False


def test_constructor_uses_injected_client():
    """When _client is supplied, AsyncOpenAI is never called."""
    stub = MagicMock()
    executor = HermesA2AExecutor(model="hermes-4", _client=stub)
    assert executor._client is stub


# ---------------------------------------------------------------------------
# _build_messages
# ---------------------------------------------------------------------------

def test_build_messages_with_system_prompt():
    """System prompt is prepended as role=system."""
    executor = HermesA2AExecutor(
        model="hermes-4", system_prompt="Be helpful.", _client=MagicMock()
    )
    msgs = executor._build_messages("Hello!")
    assert msgs[0] == {"role": "system", "content": "Be helpful."}
    assert msgs[1] == {"role": "user", "content": "Hello!"}


def test_build_messages_no_system_prompt():
    """Without system_prompt only the user turn is present."""
    executor = HermesA2AExecutor(
        model="hermes-4", system_prompt=None, _client=MagicMock()
    )
    msgs = executor._build_messages("Hello!")
    assert len(msgs) == 1
    assert msgs[0] == {"role": "user", "content": "Hello!"}
# ---------------------------------------------------------------------------
# execute — happy path
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_execute_returns_content():
    """Successful API call → content is enqueued as A2A message."""
    executor, mock_client = _make_executor()
    mock_client.chat.completions.create.return_value = _make_api_response("42")
    ctx = _make_context("What is 6×7?")
    eq = AsyncMock()
    await executor.execute(ctx, eq)
    # Comparing the enqueued event to the raw string assumes conftest's
    # new_agent_text_message stub returns its text argument unchanged.
    eq.enqueue_event.assert_called_once_with("42")


@pytest.mark.asyncio
async def test_execute_empty_content_returns_fallback():
    """Empty content string → fallback message '(no response generated)'."""
    executor, mock_client = _make_executor()
    mock_client.chat.completions.create.return_value = _make_api_response("")
    ctx = _make_context("ping")
    eq = AsyncMock()
    await executor.execute(ctx, eq)
    eq.enqueue_event.assert_called_once_with("(no response generated)")


@pytest.mark.asyncio
async def test_execute_strips_whitespace_content():
    """Content with only whitespace is treated as empty → fallback."""
    executor, mock_client = _make_executor()
    mock_client.chat.completions.create.return_value = _make_api_response(" \n ")
    ctx = _make_context("ping")
    eq = AsyncMock()
    await executor.execute(ctx, eq)
    eq.enqueue_event.assert_called_once_with("(no response generated)")


# ---------------------------------------------------------------------------
# execute — empty input
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_execute_empty_input_returns_error():
    """Message with no extractable text → error message, no API call."""
    executor, mock_client = _make_executor()
    ctx = _make_empty_context()
    eq = AsyncMock()
    await executor.execute(ctx, eq)
    eq.enqueue_event.assert_called_once_with(
        "Error: message contained no text content."
    )
    mock_client.chat.completions.create.assert_not_called()
# ---------------------------------------------------------------------------
# execute — Hermes 4 extra_body
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_execute_hermes4_sends_reasoning_extra_body():
    """Hermes 4 model → extra_body with reasoning enabled is sent."""
    executor, mock_client = _make_executor(model="nousresearch/hermes-4-0")
    mock_client.chat.completions.create.return_value = _make_api_response("ok")
    await executor.execute(_make_context("hello"), AsyncMock())
    # call_args[1] is the kwargs dict — execute() passes everything by keyword.
    call_kwargs = mock_client.chat.completions.create.call_args[1]
    assert call_kwargs["extra_body"] == {"reasoning": {"enabled": True}}


@pytest.mark.asyncio
async def test_execute_hermes3_no_extra_body():
    """Hermes 3 model → extra_body=None, no reasoning injection."""
    executor, mock_client = _make_executor(model="nousresearch/hermes-3-llama-3.1-70b")
    mock_client.chat.completions.create.return_value = _make_api_response("ok")
    await executor.execute(_make_context("hello"), AsyncMock())
    call_kwargs = mock_client.chat.completions.create.call_args[1]
    assert call_kwargs["extra_body"] is None


@pytest.mark.asyncio
async def test_execute_model_passed_to_api():
    """The model name is forwarded verbatim to the API call."""
    model = "nousresearch/hermes-4-0"
    executor, mock_client = _make_executor(model=model)
    mock_client.chat.completions.create.return_value = _make_api_response("ok")
    await executor.execute(_make_context("hi"), AsyncMock())
    call_kwargs = mock_client.chat.completions.create.call_args[1]
    assert call_kwargs["model"] == model
# ---------------------------------------------------------------------------
# execute — reasoning trace handling
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_execute_reasoning_not_in_reply():
    """Reasoning trace is present in response but NOT included in the A2A reply."""
    executor, mock_client = _make_executor(model="nousresearch/hermes-4-0")
    response = _make_api_response(
        content="The answer is 42.",
        reasoning="<think>First I compute 6×7...</think>",
    )
    mock_client.chat.completions.create.return_value = response
    eq = AsyncMock()
    await executor.execute(_make_context("6×7?"), eq)
    # Reply must contain ONLY the content, not the reasoning
    enqueued = eq.enqueue_event.call_args[0][0]
    assert enqueued == "The answer is 42."
    assert "<think>" not in enqueued
    assert "6×7" not in enqueued  # reasoning text excluded


@pytest.mark.asyncio
async def test_execute_reasoning_logged_via_otel(monkeypatch):
    """Reasoning trace → _log_reasoning is called."""
    executor, mock_client = _make_executor(model="nousresearch/hermes-4-0")
    response = _make_api_response(
        content="Answer.",
        reasoning="<think>reasoning here</think>",
    )
    mock_client.chat.completions.create.return_value = response
    log_calls: list = []
    # Wrap (not replace) _log_reasoning so the real OTEL path still runs.
    original_log = executor._log_reasoning

    def capturing_log(context, reasoning, reasoning_details):
        log_calls.append((reasoning, reasoning_details))
        return original_log(context, reasoning, reasoning_details)

    monkeypatch.setattr(executor, "_log_reasoning", capturing_log)
    await executor.execute(_make_context("test"), AsyncMock())
    assert len(log_calls) == 1
    assert log_calls[0][0] == "<think>reasoning here</think>"


@pytest.mark.asyncio
async def test_execute_reasoning_details_logged(monkeypatch):
    """reasoning_details field is passed through to _log_reasoning."""
    executor, mock_client = _make_executor(model="hermes-4")
    details = {"steps": ["step1", "step2"]}
    response = _make_api_response(
        content="ok",
        reasoning="some reasoning",
        reasoning_details=details,
    )
    mock_client.chat.completions.create.return_value = response
    log_calls: list = []

    def capturing_log(context, reasoning, reasoning_details):
        log_calls.append((reasoning, reasoning_details))

    monkeypatch.setattr(executor, "_log_reasoning", capturing_log)
    await executor.execute(_make_context("test"), AsyncMock())
    # Identity check: the exact object must be forwarded, not a copy.
    assert log_calls[0][1] is details


@pytest.mark.asyncio
async def test_execute_no_reasoning_field_no_log(monkeypatch):
    """Response with no reasoning attribute → _log_reasoning not called."""
    executor, mock_client = _make_executor(model="nousresearch/hermes-4-0")
    # _make_api_response with no reasoning arg → no .reasoning attribute set
    response = _make_api_response(content="ok")
    mock_client.chat.completions.create.return_value = response
    log_calls: list = []
    monkeypatch.setattr(executor, "_log_reasoning", lambda *a: log_calls.append(a))
    await executor.execute(_make_context("test"), AsyncMock())
    assert log_calls == []
# ---------------------------------------------------------------------------
# execute — API error handling
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_execute_api_error_returns_sanitized_message():
    """API exception → class name only in the A2A reply (no message body)."""
    executor, mock_client = _make_executor()

    class FakeAPIError(Exception):
        pass

    mock_client.chat.completions.create.side_effect = FakeAPIError(
        "api_key=sk-secret123 rate limit exceeded"
    )
    eq = AsyncMock()
    await executor.execute(_make_context("hello"), eq)
    enqueued = eq.enqueue_event.call_args[0][0]
    assert enqueued == "Agent error: FakeAPIError"
    # Secret must NOT leak
    assert "sk-secret" not in enqueued
    assert "rate limit" not in enqueued


@pytest.mark.asyncio
async def test_execute_api_error_is_logged(caplog):
    """API exception is logged at ERROR level."""
    import logging

    executor, mock_client = _make_executor()
    mock_client.chat.completions.create.side_effect = ValueError("bad request")
    with caplog.at_level(logging.ERROR, logger="hermes_executor"):
        await executor.execute(_make_context("hello"), AsyncMock())
    assert any("API error" in r.message for r in caplog.records)
# ---------------------------------------------------------------------------
# _log_reasoning — direct unit tests
# ---------------------------------------------------------------------------

def test_log_reasoning_otel_span_attributes():
    """_log_reasoning sets the expected OTEL span attributes."""
    executor, _ = _make_executor(model="nousresearch/hermes-4-0")
    mock_span = MagicMock()
    mock_tracer = MagicMock()
    # Hand-wire the context-manager protocol so ``with ... as span`` yields
    # mock_span.
    mock_tracer.start_as_current_span.return_value.__enter__ = MagicMock(
        return_value=mock_span
    )
    mock_tracer.start_as_current_span.return_value.__exit__ = MagicMock(
        return_value=False
    )
    ctx = MagicMock()
    ctx.context_id = "ctx-abc"
    with patch("hermes_executor.os.environ.get", return_value="ws-123"), \
            patch("hermes_executor.logger"):
        # Patch builtin_tools.telemetry inside the method
        import builtin_tools.telemetry as _tel
        original_get_tracer = _tel.get_tracer
        _tel.get_tracer = MagicMock(return_value=mock_tracer)
        try:
            executor._log_reasoning(ctx, "deep thinking here", None)
        finally:
            # Always restore the module attribute so later tests see the stub.
            _tel.get_tracer = original_get_tracer
    mock_span.set_attribute.assert_any_call("hermes.model", "nousresearch/hermes-4-0")
    mock_span.set_attribute.assert_any_call("hermes.reasoning_length", len("deep thinking here"))
    mock_span.set_attribute.assert_any_call("hermes.reasoning_preview", "deep thinking here")
def test_log_reasoning_swallows_telemetry_error(caplog):
    """_log_reasoning never raises even when OTEL throws.

    The executor catches the failure, logs it at DEBUG ("non-fatal"), and
    returns normally — a telemetry outage must never block the A2A reply.
    Previously ``caplog`` and ``import logging`` were unused; the test now
    also asserts the non-fatal debug record was emitted.
    """
    import logging

    executor, _ = _make_executor()
    ctx = MagicMock()
    ctx.context_id = "ctx-xyz"
    with caplog.at_level(logging.DEBUG, logger="hermes_executor"):
        with patch("builtin_tools.telemetry.get_tracer", side_effect=RuntimeError("boom")):
            # Must not raise
            executor._log_reasoning(ctx, "reasoning text", None)
    assert any("non-fatal" in r.message for r in caplog.records)
def test_log_reasoning_has_reasoning_details_attribute():
    """reasoning_details → has_reasoning_details span attribute set to True."""
    executor, _ = _make_executor(model="hermes-4")
    mock_span = MagicMock()
    mock_tracer = MagicMock()
    # Wire the context-manager protocol so ``with ... as span`` yields mock_span.
    mock_tracer.start_as_current_span.return_value.__enter__ = MagicMock(
        return_value=mock_span
    )
    mock_tracer.start_as_current_span.return_value.__exit__ = MagicMock(
        return_value=False
    )
    ctx = MagicMock()
    ctx.context_id = "ctx-rd"
    import builtin_tools.telemetry as _tel
    original = _tel.get_tracer
    _tel.get_tracer = MagicMock(return_value=mock_tracer)
    try:
        executor._log_reasoning(ctx, None, {"steps": []})
    finally:
        # Restore so later tests see the original stub attribute.
        _tel.get_tracer = original
    mock_span.set_attribute.assert_any_call("hermes.has_reasoning_details", True)


def test_log_reasoning_no_preview_when_reasoning_is_none():
    """When reasoning is None, hermes.reasoning_preview attribute is not set."""
    executor, _ = _make_executor(model="hermes-4")
    mock_span = MagicMock()
    mock_tracer = MagicMock()
    mock_tracer.start_as_current_span.return_value.__enter__ = MagicMock(
        return_value=mock_span
    )
    mock_tracer.start_as_current_span.return_value.__exit__ = MagicMock(
        return_value=False
    )
    ctx = MagicMock()
    ctx.context_id = "ctx-none"
    import builtin_tools.telemetry as _tel
    original = _tel.get_tracer
    _tel.get_tracer = MagicMock(return_value=mock_tracer)
    try:
        executor._log_reasoning(ctx, None, None)
    finally:
        _tel.get_tracer = original
    # hermes.reasoning_preview should NOT have been set
    preview_calls = [
        c for c in mock_span.set_attribute.call_args_list
        if c[0][0] == "hermes.reasoning_preview"
    ]
    assert preview_calls == []
# ---------------------------------------------------------------------------
# cancel
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_cancel_emits_canceled_event():
    """cancel() enqueues a TaskStatusUpdateEvent with state=canceled."""
    executor, _ = _make_executor()
    # Stub a2a.types if not already present with minimal TaskStatusUpdateEvent
    import a2a.types as a2a_types

    class _TaskState:
        canceled = "canceled"

    class _TaskStatus:
        def __init__(self, state):
            self.state = state

    class _TaskStatusUpdateEvent:
        def __init__(self, status, final):
            self.status = status
            self.final = final

    # NOTE(review): these module attributes are overwritten without restore —
    # fine for a session-scoped stub module, but confirm no other test
    # depends on the prior values.
    a2a_types.TaskState = _TaskState
    a2a_types.TaskStatus = _TaskStatus
    a2a_types.TaskStatusUpdateEvent = _TaskStatusUpdateEvent
    eq = AsyncMock()
    ctx = MagicMock()
    await executor.cancel(ctx, eq)
    eq.enqueue_event.assert_called_once()
    event = eq.enqueue_event.call_args[0][0]
    assert isinstance(event, _TaskStatusUpdateEvent)
    assert event.status.state == "canceled"
    assert event.final is True


# ---------------------------------------------------------------------------
# Integration: system prompt is sent with messages
# ---------------------------------------------------------------------------

@pytest.mark.asyncio
async def test_system_prompt_included_in_api_call():
    """System prompt appears as first message in the API call."""
    executor, mock_client = _make_executor(
        model="hermes-4", system_prompt="You are a math tutor."
    )
    mock_client.chat.completions.create.return_value = _make_api_response("6")
    await executor.execute(_make_context("3+3?"), AsyncMock())
    msgs = mock_client.chat.completions.create.call_args[1]["messages"]
    assert msgs[0] == {"role": "system", "content": "You are a math tutor."}
    assert msgs[1]["role"] == "user"
    assert "3+3?" in msgs[1]["content"]


@pytest.mark.asyncio
async def test_no_system_prompt_only_user_message():
    """Without system_prompt, only the user turn is in messages."""
    executor, mock_client = _make_executor(model="hermes-4", system_prompt=None)
    mock_client.chat.completions.create.return_value = _make_api_response("ok")
    await executor.execute(_make_context("hello"), AsyncMock())
    msgs = mock_client.chat.completions.create.call_args[1]["messages"]
    assert len(msgs) == 1
    assert msgs[0]["role"] == "user"