Merge pull request #42 from Molecule-AI/fix/stderr-capture-a2a-response

fix(runtime): capture stderr in A2A error response (closes #66)
2026-04-24 13:25:15 -07:00 · 2026-04-24 13:25:15 -07:00 · 7fc3537b2f
commit 7fc3537b2f
parent 1759e221e9 19fde6f466
3 changed files with 71 additions and 8 deletions
--- a/molecule_runtime/claude_sdk_executor.py
+++ b/molecule_runtime/claude_sdk_executor.py
@ -66,10 +66,11 @@ _NO_TEXT_MSG = "Error: message contained no text content."
 _NO_RESPONSE_MSG = "(no response generated)"
 _MAX_RETRIES = 3
 _BASE_RETRY_DELAY_S = 5
-# Cap for stderr captured from the CLI subprocess in the executor log. Keeps
-# log lines bounded while still surfacing enough context to diagnose crashes.
-# Fixes #66 (previously the executor logged nothing beyond the generic
-# "Check stderr output for details" message).
+# Cap for stderr captured from the CLI subprocess in the executor log and
+# surfaced in A2A error responses. 1 KB keeps log lines bounded while still
+# surfacing enough context to diagnose crashes. Fixes #66 (previously the
+# executor logged nothing beyond the generic "Check stderr output for
+# details" message).
 _PROCESS_ERROR_STDERR_MAX_CHARS = 4096

 # Substrings in error messages that indicate a transient failure worth retrying.
@ -454,10 +455,14 @@ class ClaudeSDKExecutor(AgentExecutor):
                    # Non-retryable or exhausted retries. Log exit_code +
                    # stderr explicitly (fixes #66) so operators don't have
                    # to reproduce the crash manually to find out why the
-                    # subprocess died.
+                    # subprocess died. Surface the first ~1 KB of stderr in
+                    # the A2A response so clients can show actionable context.
                    logger.error("SDK agent error [claude-code]: %s", formatted)
                    logger.exception("SDK agent error [claude-code] — full traceback follows")
-                    response_text = sanitize_agent_error(exc)
+                    exc_stderr = getattr(exc, "stderr", None)
+                    if exc_stderr:
+                        exc_stderr = exc_stderr[:1024]
+                    response_text = sanitize_agent_error(exc, stderr=exc_stderr)
                    break
        finally:
            await set_current_task(self.heartbeat, "")
--- a/molecule_runtime/executor_helpers.py
+++ b/molecule_runtime/executor_helpers.py
@ -409,6 +409,7 @@ def classify_subprocess_error(stderr_text: str, exit_code: int | None) -> str:
 def sanitize_agent_error(
    exc: BaseException | None = None,
    category: str | None = None,
+    stderr: str | None = None,
 ) -> str:
    """Render an agent-side failure into a user-safe error message.

@ -416,7 +417,12 @@ def sanitize_agent_error(
    category string (e.g. from `classify_subprocess_error`). If both are
    given, `category` wins. If neither, the tag defaults to "unknown".

-    The message body is deliberately dropped — exception messages and
+    `stderr` is the first ~1 KB of captured subprocess stderr — it is
+    included verbatim in the returned message so callers can surface it in
+    A2A error responses. The caller is responsible for the cap; this
+    function does not truncate.
+
+    The message body is deliberately simple — exception messages and
    subprocess stderr frequently leak stack traces, paths, tokens, and
    API keys. Full detail is available in the workspace logs via
    `logger.exception()` / `logger.error()`.
@ -427,4 +433,7 @@ def sanitize_agent_error(
        tag = type(exc).__name__
    else:
        tag = "unknown"
-    return f"Agent error ({tag}) — see workspace logs for details."
+    msg = f"Agent error ({tag})"
+    if stderr:
+        msg += f": {stderr}"
+    return msg
--- a/tests/test_executor_helpers.py
+++ b/tests/test_executor_helpers.py
@ -0,0 +1,49 @@
+"""Tests for sanitize_agent_error() — specifically the stderr surface in A2A responses."""
+
+from __future__ import annotations
+
+import pytest
+
+from molecule_runtime.executor_helpers import sanitize_agent_error
+
+
+class TestSanitizeAgentError:
+    """sanitize_agent_error() must surface stderr in A2A error messages."""
+
+    def test_plain_error_no_stderr(self):
+        """No stderr → simple message, no trailing colon."""
+        class DummyError(Exception):
+            pass
+
+        result = sanitize_agent_error(exc=DummyError("boom"))
+        assert result == "Agent error (DummyError)"
+
+    def test_stderr_included_in_message(self):
+        """stderr= kwarg is included verbatim so A2A callers can show it."""
+        result = sanitize_agent_error(category="subprocess_error", stderr="You hit your rate limit · resets Apr 17")
+        assert result == "Agent error (subprocess_error): You hit your rate limit · resets Apr 17"
+
+    def test_stderr_truncated_by_caller(self):
+        """Caller is responsible for capping; function does not truncate."""
+        long_stderr = "x" * 5000
+        result = sanitize_agent_error(category="subprocess_error", stderr=long_stderr)
+        assert result.endswith(long_stderr)
+
+    def test_category_wins_over_exc_type(self):
+        """When both category and exc are passed, category is the tag."""
+        result = sanitize_agent_error(exc=ValueError("oops"), category="rate_limited", stderr="rate limit")
+        assert "Agent error (rate_limited)" in result
+        assert "ValueError" not in result
+
+    def test_exc_name_used_when_no_category_or_stderr(self):
+        """Fallback to exc type name when category is absent."""
+        class ToolNotFoundError(Exception):
+            pass
+
+        result = sanitize_agent_error(exc=ToolNotFoundError("some tool not found"))
+        assert result == "Agent error (ToolNotFoundError)"
+
+    def test_unknown_tag_when_no_exc_no_category(self):
+        """Neither exc nor category → defaults to 'unknown'."""
+        result = sanitize_agent_error()
+        assert result == "Agent error (unknown)"