feat: save oversized tool results to file instead of destructive truncation (#5210)

Previously, tool results exceeding 100K characters were silently chopped with only a '[Truncated]' notice — the rest of the content was lost permanently, and the model had no way to access the truncated portion.

Now, oversized results are written to HERMES_HOME/cache/tool_responses/ and the model receives:
- A 1,500-char head preview for immediate context
- The file path, so it can use read_file/search_files on the full output

This preserves the context window protection (inline content stays small) while making the full data recoverable. Falls back to the old destructive truncation if the file write fails.

Inspired by Block/goose's large response handler pattern.
2026-04-05 10:29:57 -07:00 · 2026-04-05 10:29:57 -07:00 · 51ed7dc2f3
commit 51ed7dc2f3
parent d932980c1a
3 changed files with 240 additions and 27 deletions
--- a/run_agent.py
+++ b/run_agent.py
@ -405,6 +405,68 @@ def _strip_budget_warnings_from_history(messages: list) -> None:
            msg["content"] = cleaned


+# =========================================================================
+# Large tool result handler — save oversized output to a cache file
+# =========================================================================
+
+# Threshold, in characters as counted by len(), above which a tool result is
+# saved to a file instead of kept inline. A result of exactly this length is
+# still kept inline; only strictly longer results are redirected.
+# 100K chars ≈ 25K tokens (rough estimate) — generous for any reasonable
+# output but prevents catastrophic context explosions.
+_LARGE_RESULT_CHARS = 100_000
+
+# How many leading characters of the original result to include as an inline
+# preview, so the model has immediate context about what the tool returned
+# without having to open the saved file first.
+_LARGE_RESULT_PREVIEW_CHARS = 1_500
+
+
+def _save_oversized_tool_result(function_name: str, function_result: str) -> str:
+    """Replace an oversized tool result with a file reference plus a preview.
+
+    Results of at most ``_LARGE_RESULT_CHARS`` characters are returned
+    unchanged (the very same object). Longer results are written in full to a
+    uniquely named UTF-8 text file under
+    ``HERMES_HOME/cache/tool_responses/`` and the value returned for the
+    model is replaced with:
+      • a brief head preview (first ``_LARGE_RESULT_PREVIEW_CHARS`` chars)
+      • the file path so the model can use ``read_file`` / ``search_files``
+
+    Args:
+        function_name: Name of the tool that produced the result. Used, after
+            sanitising, as the prefix of the saved file's name.
+        function_result: The raw tool output.
+
+    Returns:
+        The original string when it is small enough, otherwise the
+        preview-plus-path message. If saving fails for any reason, the result
+        is destructively truncated to ``_LARGE_RESULT_CHARS`` characters and
+        a notice describing the failure is appended.
+    """
+    original_len = len(function_result)
+    if original_len <= _LARGE_RESULT_CHARS:
+        return function_result
+
+    # Imported here because it is only needed on the rare oversized path.
+    import tempfile
+
+    # Tracks the file we created so a failed write can be cleaned up below.
+    filepath = None
+    try:
+        response_dir = os.path.join(get_hermes_home(), "cache", "tool_responses")
+        os.makedirs(response_dir, exist_ok=True)
+
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        # Sanitize tool name for use in filename
+        safe_name = re.sub(r"[^\w\-]", "_", function_name)[:40]
+
+        # Create the file exclusively with a unique random suffix. A name
+        # built from the timestamp alone, opened with mode "w", would silently
+        # overwrite an earlier response if two results for the same tool were
+        # saved within the same microsecond — and an earlier message would
+        # then point the model at the wrong content.
+        with tempfile.NamedTemporaryFile(
+            mode="w",
+            encoding="utf-8",
+            prefix=f"{safe_name}_{timestamp}_",
+            suffix=".txt",
+            dir=response_dir,
+            delete=False,  # the file must outlive this call so it can be read later
+        ) as f:
+            filepath = f.name
+            f.write(function_result)
+
+        preview = function_result[:_LARGE_RESULT_PREVIEW_CHARS]
+        return (
+            f"{preview}\n\n"
+            f"[Large tool response: {original_len:,} characters total — "
+            f"only the first {_LARGE_RESULT_PREVIEW_CHARS:,} shown above. "
+            f"Full output saved to: {filepath}\n"
+            f"Use read_file or search_files on that path to access the rest.]"
+        )
+    except Exception as exc:
+        # Deliberately broad: saving is best-effort and must never make the
+        # tool call itself fail. Remove any half-written file so the cache
+        # does not accumulate partial outputs that nothing references.
+        if filepath is not None:
+            try:
+                os.remove(filepath)
+            except OSError:
+                pass  # cleanup is best-effort; the fallback below still applies
+
+        # Fall back to destructive truncation if file write fails
+        logger.warning("Failed to save large tool result to file: %s", exc)
+        return (
+            function_result[:_LARGE_RESULT_CHARS]
+            + f"\n\n[Truncated: tool response was {original_len:,} chars, "
+            f"exceeding the {_LARGE_RESULT_CHARS:,} char limit. "
+            f"File save failed: {exc}]"
+        )
+
+
 class AIAgent:
    """
    AI Agent with tool calling capabilities.
@ -6051,15 +6113,8 @@ class AIAgent:
                except Exception as cb_err:
                    logging.debug(f"Tool complete callback error: {cb_err}")

-            # Truncate oversized results
-            MAX_TOOL_RESULT_CHARS = 100_000
-            if len(function_result) > MAX_TOOL_RESULT_CHARS:
-                original_len = len(function_result)
-                function_result = (
-                    function_result[:MAX_TOOL_RESULT_CHARS]
-                    + f"\n\n[Truncated: tool response was {original_len:,} chars, "
-                    f"exceeding the {MAX_TOOL_RESULT_CHARS:,} char limit]"
-                )
+            # Save oversized results to file instead of destructive truncation
+            function_result = _save_oversized_tool_result(name, function_result)

            # Append tool result message in order
            tool_msg = {
@ -6332,18 +6387,8 @@ class AIAgent:
                except Exception as cb_err:
                    logging.debug(f"Tool complete callback error: {cb_err}")

-            # Guard against tools returning absurdly large content that would
-            # blow up the context window. 100K chars ≈ 25K tokens — generous
-            # enough for any reasonable tool output but prevents catastrophic
-            # context explosions (e.g. accidental base64 image dumps).
-            MAX_TOOL_RESULT_CHARS = 100_000
-            if len(function_result) > MAX_TOOL_RESULT_CHARS:
-                original_len = len(function_result)
-                function_result = (
-                    function_result[:MAX_TOOL_RESULT_CHARS]
-                    + f"\n\n[Truncated: tool response was {original_len:,} chars, "
-                    f"exceeding the {MAX_TOOL_RESULT_CHARS:,} char limit]"
-                )
+            # Save oversized results to file instead of destructive truncation
+            function_result = _save_oversized_tool_result(function_name, function_result)

            tool_msg = {
                "role": "tool",
--- a/tests/test_large_tool_result.py
+++ b/tests/test_large_tool_result.py
@ -0,0 +1,162 @@
+"""Tests for _save_oversized_tool_result() — the large tool response handler.
+
+When a tool returns more than _LARGE_RESULT_CHARS characters, the full content
+is saved to a file and the model receives a preview + file path instead.
+"""
+
+import os
+import re
+
+import pytest
+
+from run_agent import (
+    _save_oversized_tool_result,
+    _LARGE_RESULT_CHARS,
+    _LARGE_RESULT_PREVIEW_CHARS,
+)
+
+
+class TestSaveOversizedToolResult:
+    """Unit tests for the large tool result handler."""
+
+    @staticmethod
+    def _use_hermes_home(tmp_path, monkeypatch):
+        """Point HERMES_HOME at a fresh directory under *tmp_path*; return it."""
+        hermes_home = str(tmp_path / ".hermes")
+        monkeypatch.setenv("HERMES_HOME", hermes_home)
+        os.makedirs(hermes_home, exist_ok=True)
+        return hermes_home
+
+    @staticmethod
+    def _saved_path(result):
+        """Extract the saved file path from a replacement message.
+
+        Fails with a readable assertion (rather than an AttributeError on
+        ``None``) when the message contains no path.
+        """
+        match = re.search(r"Full output saved to: (.+?)\n", result)
+        assert match, f"No file path found in result: {result[:300]}"
+        return match.group(1)
+
+    def test_small_result_returned_unchanged(self):
+        """Results under the threshold pass through untouched."""
+        small = "x" * 1000
+        # Identity, not just equality: the handler must not copy small results.
+        assert _save_oversized_tool_result("terminal", small) is small
+
+    def test_exactly_at_threshold_returned_unchanged(self):
+        """Results exactly at the threshold pass through."""
+        exact = "y" * _LARGE_RESULT_CHARS
+        assert _save_oversized_tool_result("terminal", exact) is exact
+
+    def test_oversized_result_saved_to_file(self, tmp_path, monkeypatch):
+        """Results over the threshold are written to a file."""
+        self._use_hermes_home(tmp_path, monkeypatch)
+
+        big = "A" * (_LARGE_RESULT_CHARS + 500)
+        result = _save_oversized_tool_result("terminal", big)
+
+        # Should contain the preview
+        assert result.startswith("A" * _LARGE_RESULT_PREVIEW_CHARS)
+        # Should mention the file path
+        assert "Full output saved to:" in result
+        # Should mention original size
+        assert f"{len(big):,}" in result
+
+        # Extract the file path and verify the file exists with full content
+        filepath = self._saved_path(result)
+        assert os.path.isfile(filepath)
+        with open(filepath, "r", encoding="utf-8") as f:
+            saved = f.read()
+        assert saved == big
+        assert len(saved) == _LARGE_RESULT_CHARS + 500
+
+    def test_file_placed_in_cache_tool_responses(self, tmp_path, monkeypatch):
+        """Saved file lives under HERMES_HOME/cache/tool_responses/."""
+        hermes_home = self._use_hermes_home(tmp_path, monkeypatch)
+
+        big = "B" * (_LARGE_RESULT_CHARS + 1)
+        result = _save_oversized_tool_result("web_search", big)
+
+        filepath = self._saved_path(result)
+        expected_dir = os.path.join(hermes_home, "cache", "tool_responses")
+        assert filepath.startswith(expected_dir)
+
+    def test_filename_contains_tool_name(self, tmp_path, monkeypatch):
+        """The saved filename includes a sanitized version of the tool name."""
+        self._use_hermes_home(tmp_path, monkeypatch)
+
+        big = "C" * (_LARGE_RESULT_CHARS + 1)
+        result = _save_oversized_tool_result("browser_navigate", big)
+
+        filename = os.path.basename(self._saved_path(result))
+        assert filename.startswith("browser_navigate_")
+        assert filename.endswith(".txt")
+
+    def test_tool_name_sanitized(self, tmp_path, monkeypatch):
+        """Special characters in tool names are replaced in the filename."""
+        self._use_hermes_home(tmp_path, monkeypatch)
+
+        big = "D" * (_LARGE_RESULT_CHARS + 1)
+        result = _save_oversized_tool_result("mcp:some/weird tool", big)
+
+        filename = os.path.basename(self._saved_path(result))
+        # No slashes, colons or spaces survive in the filename
+        assert "/" not in filename
+        assert ":" not in filename
+        assert " " not in filename
+
+    def test_fallback_on_write_failure(self, tmp_path, monkeypatch):
+        """When file write fails, falls back to destructive truncation."""
+        # Point HERMES_HOME to a path that will fail (file, not directory)
+        bad_path = str(tmp_path / "not_a_dir.txt")
+        with open(bad_path, "w") as f:
+            f.write("I'm a file, not a directory")
+        monkeypatch.setenv("HERMES_HOME", bad_path)
+
+        big = "E" * (_LARGE_RESULT_CHARS + 50_000)
+        result = _save_oversized_tool_result("terminal", big)
+
+        # Should still contain data (fallback truncation)
+        assert len(result) > 0
+        assert result.startswith("E" * 1000)
+        # Should mention the failure
+        assert "File save failed" in result
+        # Should be truncated to approximately _LARGE_RESULT_CHARS + error msg
+        assert len(result) < len(big)
+
+    def test_preview_length_capped(self, tmp_path, monkeypatch):
+        """The inline preview is capped at _LARGE_RESULT_PREVIEW_CHARS."""
+        self._use_hermes_home(tmp_path, monkeypatch)
+
+        # A single repeated non-whitespace char, so the rstrip() below removes
+        # only the blank-line separator and never part of the preview itself.
+        big = "Z" * (_LARGE_RESULT_CHARS + 5000)
+        result = _save_oversized_tool_result("terminal", big)
+
+        # The preview section is the content before the "[Large tool response:" marker
+        marker_pos = result.index("[Large tool response:")
+        preview_section = result[:marker_pos].rstrip()
+        assert len(preview_section) == _LARGE_RESULT_PREVIEW_CHARS
+
+    def test_guidance_message_mentions_tools(self, tmp_path, monkeypatch):
+        """The replacement message tells the model how to access the file."""
+        self._use_hermes_home(tmp_path, monkeypatch)
+
+        big = "F" * (_LARGE_RESULT_CHARS + 1)
+        result = _save_oversized_tool_result("terminal", big)
+
+        assert "read_file" in result
+        assert "search_files" in result
+
+    def test_empty_result_passes_through(self):
+        """Empty strings are not oversized."""
+        assert _save_oversized_tool_result("terminal", "") == ""
+
+    def test_unicode_content_preserved(self, tmp_path, monkeypatch):
+        """Unicode content is fully preserved in the saved file."""
+        self._use_hermes_home(tmp_path, monkeypatch)
+
+        # Mix of ASCII and multi-byte unicode to exceed threshold
+        unit = "Hello 世界! 🎉 " * 100  # ~1,200 chars per unit
+        big = unit * ((_LARGE_RESULT_CHARS // len(unit)) + 1)
+        assert len(big) > _LARGE_RESULT_CHARS
+
+        result = _save_oversized_tool_result("terminal", big)
+        filepath = self._saved_path(result)
+
+        with open(filepath, "r", encoding="utf-8") as f:
+            saved = f.read()
+        assert saved == big
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@ -1002,16 +1002,19 @@ class TestExecuteToolCalls:
        assert messages[0]["role"] == "tool"
        assert messages[0]["tool_call_id"] == "c1"

-    def test_result_truncation_over_100k(self, agent):
+    def test_result_truncation_over_100k(self, agent, tmp_path, monkeypatch):
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
+        (tmp_path / ".hermes").mkdir()
        tc = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
        mock_msg = _mock_assistant_msg(content="", tool_calls=[tc])
        messages = []
        big_result = "x" * 150_000
        with patch("run_agent.handle_function_call", return_value=big_result):
            agent._execute_tool_calls(mock_msg, messages, "task-1")
-        # Content should be truncated
+        # Content should be replaced with preview + file path
        assert len(messages[0]["content"]) < 150_000
-        assert "Truncated" in messages[0]["content"]
+        assert "Large tool response" in messages[0]["content"]
+        assert "Full output saved to:" in messages[0]["content"]


 class TestConcurrentToolExecution:
@ -1230,8 +1233,10 @@ class TestConcurrentToolExecution:
        assert "cancelled" in messages[0]["content"].lower() or "skipped" in messages[0]["content"].lower()
        assert "cancelled" in messages[1]["content"].lower() or "skipped" in messages[1]["content"].lower()

-    def test_concurrent_truncates_large_results(self, agent):
-        """Concurrent path should truncate results over 100k chars."""
+    def test_concurrent_truncates_large_results(self, agent, tmp_path, monkeypatch):
+        """Concurrent path should save oversized results to file."""
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
+        (tmp_path / ".hermes").mkdir()
        tc1 = _mock_tool_call(name="web_search", arguments='{}', call_id="c1")
        tc2 = _mock_tool_call(name="web_search", arguments='{}', call_id="c2")
        mock_msg = _mock_assistant_msg(content="", tool_calls=[tc1, tc2])
@ -1244,7 +1249,8 @@ class TestConcurrentToolExecution:
        assert len(messages) == 2
        for m in messages:
            assert len(m["content"]) < 150_000
-            assert "Truncated" in m["content"]
+            assert "Large tool response" in m["content"]
+            assert "Full output saved to:" in m["content"]

    def test_invoke_tool_dispatches_to_handle_function_call(self, agent):
        """_invoke_tool should route regular tools through handle_function_call."""