fix: accept reasoning-only responses without retries — set content to "(empty)" (#5278)

* feat: coerce tool call arguments to match JSON Schema types

LLMs frequently return numbers as strings ("42" instead of 42) and booleans as strings ("true" instead of true). This causes silent failures with MCP tools and any tool with strictly-typed parameters.

Added coerce_tool_args() in model_tools.py that runs before every tool dispatch. For each argument, it checks the tool registry schema and attempts safe coercion:
- "42" → 42 when schema says "type": "integer"
- "3.14" → 3.14 when schema says "type": "number"
- "true"/"false" → True/False when schema says "type": "boolean"
- Union types tried in order
- Original values preserved when coercion fails or is not applicable

Inspired by Block/goose tool argument coercion system.

* fix: accept reasoning-only responses without retries — set content to "(empty)"

Previously, when a model returned reasoning/thinking but no visible content, we entered a 120-line retry/classify/compress/salvage cascade that wasted 3+ API calls trying to "fix" the response. The model was done thinking — retrying with the same input just burned money.

Now reasoning-only responses are accepted immediately:
- Reasoning stays in the `reasoning` field (semantically correct)
- Content set to "(empty)" — valid non-empty string every provider accepts
- No retries, no compression triggers, no salvage logic
- Session history contains "(empty)" not "" — prevents #2128 session poisoning where empty assistant content caused prefill rejections

Removes ~120 lines, adds ~15. Saves 2-3 API calls per reasoning-only response.

Fixes #2128.
2026-04-05 11:30:52 -07:00 · 2026-04-05 11:30:52 -07:00 · a0a1b86c2e
commit a0a1b86c2e
parent 534511bebb
2 changed files with 38 additions and 160 deletions
--- a/run_agent.py
+++ b/run_agent.py
@ -8620,140 +8620,24 @@ class AIAgent:
                            self._response_was_previewed = True
                            break

-                        # No fallback available — classify the empty response before
-                        # blindly spending retries. Some local/custom backends surface
-                        # implicit context pressure as reasoning-only output rather than
-                        # an explicit overflow error.
-                        if not hasattr(self, '_empty_content_retries'):
-                            self._empty_content_retries = 0
-                        self._empty_content_retries += 1
+                        # Reasoning-only response: the model produced thinking
+                        # but no visible content.  This is a valid response —
+                        # keep reasoning in its own field and set content to
+                        # "(empty)" so every provider accepts the message.
+                        # No retries needed.
+                        reasoning_text = self._extract_reasoning(assistant_message)
+                        assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
+                        assistant_msg["content"] = "(empty)"
+                        messages.append(assistant_msg)

-                        empty_response_info = self._classify_empty_content_response(
-                            assistant_message,
-                            finish_reason=finish_reason,
-                            approx_tokens=approx_tokens,
-                            api_messages=api_messages,
-                            conversation_history=conversation_history,
-                        )
-                        reasoning_text = empty_response_info["reasoning_text"]
-                        self._vprint(f"{self.log_prefix}⚠️  Response only contains think block with no content after it")
                        if reasoning_text:
                            reasoning_preview = reasoning_text[:500] + "..." if len(reasoning_text) > 500 else reasoning_text
-                            self._vprint(f"{self.log_prefix}   Reasoning: {reasoning_preview}")
+                            self._vprint(f"{self.log_prefix}ℹ️  Reasoning-only response (no visible content). Reasoning: {reasoning_preview}")
                        else:
-                            content_preview = final_response[:80] + "..." if len(final_response) > 80 else final_response
-                            self._vprint(f"{self.log_prefix}   Content: '{content_preview}'")
+                            self._vprint(f"{self.log_prefix}ℹ️  Empty response (no content or reasoning).")

-                        if empty_response_info["should_compress"]:
-                            compression_attempts += 1
-                            if compression_attempts > max_compression_attempts:
-                                self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True)
-                                self._vprint(f"{self.log_prefix}   💡 Local/custom backend returned reasoning-only output with no visible content. This often means the resumed/large session exceeds the runtime context window. Try /new or lower model.context_length to the actual runtime limit.", force=True)
-                            else:
-                                self._vprint(f"{self.log_prefix}🗜️  Reasoning-only response looks like implicit context pressure — attempting compression ({compression_attempts}/{max_compression_attempts})...", force=True)
-                                original_len = len(messages)
-                                messages, active_system_prompt = self._compress_context(
-                                    messages, system_message, approx_tokens=approx_tokens,
-                                    task_id=effective_task_id,
-                                )
-                                if len(messages) < original_len:
-                                    conversation_history = None
-                                    self._emit_status(f"🗜️ Compressed {original_len} → {len(messages)} messages after reasoning-only response, retrying...")
-                                    time.sleep(2)
-                                    api_call_count -= 1
-                                    self.iteration_budget.refund()
-                                    retry_count += 1
-                                    continue
-                                self._vprint(f"{self.log_prefix}   Compression could not shrink the session; falling back to retry/salvage logic.")
-
-                        if (
-                            reasoning_text
-                            and empty_response_info["repeated_signature"]
-                            and empty_response_info["has_structured_reasoning"]
-                        ):
-                            self._vprint(f"{self.log_prefix}ℹ️  Structured reasoning-only response repeated unchanged — using reasoning text directly.", force=True)
-                            self._empty_content_retries = 0
-                            final_response = reasoning_text
-                            empty_msg = {
-                                "role": "assistant",
-                                "content": final_response,
-                                "reasoning": reasoning_text,
-                                "finish_reason": finish_reason,
-                            }
-                            messages.append(empty_msg)
-                            break
-                        
-                        if self._empty_content_retries < 3:
-                            self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._empty_content_retries}/3)...")
-                            continue
-                        else:
-                            self._vprint(f"{self.log_prefix}❌ Max retries (3) for empty content exceeded.", force=True)
-                            self._empty_content_retries = 0
-                            
-                            # If a prior tool_calls turn had real content, salvage it:
-                            # rewrite that turn's content to a brief tool description,
-                            # and use the original content as the final response here.
-                            fallback = getattr(self, '_last_content_with_tools', None)
-                            if fallback:
-                                self._last_content_with_tools = None
-                                # Find the last assistant message with tool_calls and rewrite it
-                                for i in range(len(messages) - 1, -1, -1):
-                                    msg = messages[i]
-                                    if msg.get("role") == "assistant" and msg.get("tool_calls"):
-                                        tool_names = []
-                                        for tc in msg["tool_calls"]:
-                                            if not tc or not isinstance(tc, dict): continue
-                                            fn = tc.get("function", {})
-                                            tool_names.append(fn.get("name", "unknown"))
-                                        msg["content"] = f"Calling the {', '.join(tool_names)} tool{'s' if len(tool_names) > 1 else ''}..."
-                                        break
-                                # Strip <think> blocks from fallback content for user display
-                                final_response = self._strip_think_blocks(fallback).strip()
-                                self._response_was_previewed = True
-                                break
-                            
-                            # No fallback -- if reasoning_text exists, the model put its
-                            # entire response inside <think> tags; use that as the content.
-                            if reasoning_text:
-                                self._vprint(f"{self.log_prefix}Using reasoning as response content (model wrapped entire response in think tags).", force=True)
-                                final_response = reasoning_text
-                                empty_msg = {
-                                    "role": "assistant",
-                                    "content": final_response,
-                                    "reasoning": reasoning_text,
-                                    "finish_reason": finish_reason,
-                                }
-                                messages.append(empty_msg)
-                                break
-
-                            # Truly empty -- no reasoning and no content
-                            empty_msg = {
-                                "role": "assistant",
-                                "content": final_response,
-                                "reasoning": reasoning_text,
-                                "finish_reason": finish_reason,
-                            }
-                            messages.append(empty_msg)
-
-                            self._cleanup_task_resources(effective_task_id)
-                            self._persist_session(messages, conversation_history)
-
-                            error_message = "Model generated only think blocks with no actual response after 3 retries"
-                            if empty_response_info["is_local_custom"]:
-                                error_message = (
-                                    "Local/custom backend returned reasoning-only output with no visible response after 3 retries. "
-                                    "Likely causes: wrong /v1 endpoint, runtime context window smaller than Hermes expects, "
-                                    "or a resumed/large session exceeding the backend's actual context limit."
-                                )
-
-                            return {
-                                "final_response": final_response or None,
-                                "messages": messages,
-                                "api_calls": api_call_count,
-                                "completed": False,
-                                "partial": True,
-                                "error": error_message
-                            }
+                        final_response = "(empty)"
+                        break
                    
                    # Reset retry counter/signature on successful content
                    if hasattr(self, '_empty_content_retries'):
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@ -1488,19 +1488,14 @@ class TestRunConversation:
        assert result["completed"] is True
        assert result["api_calls"] == 2

-    def test_empty_content_retry_uses_inline_reasoning_as_response(self, agent):
-        """Reasoning-only payloads should recover the inline reasoning text."""
+    def test_inline_think_blocks_reasoning_only_accepted(self, agent):
+        """Inline <think> reasoning-only responses accepted with (empty) content, no retries."""
        self._setup_agent(agent)
        empty_resp = _mock_response(
            content="<think>internal reasoning</think>",
            finish_reason="stop",
        )
-        # Return empty 3 times to exhaust retries
-        agent.client.chat.completions.create.side_effect = [
-            empty_resp,
-            empty_resp,
-            empty_resp,
-        ]
+        agent.client.chat.completions.create.side_effect = [empty_resp]
        with (
            patch.object(agent, "_persist_session"),
            patch.object(agent, "_save_trajectory"),
@ -1508,10 +1503,14 @@ class TestRunConversation:
        ):
            result = agent.run_conversation("answer me")
        assert result["completed"] is True
-        assert result["final_response"] == "internal reasoning"
+        assert result["final_response"] == "(empty)"
+        assert result["api_calls"] == 1  # no retries
+        # Reasoning should be preserved in the assistant message
+        assistant_msgs = [m for m in result["messages"] if m.get("role") == "assistant"]
+        assert any(m.get("reasoning") for m in assistant_msgs)

-    def test_empty_content_local_resumed_session_triggers_compression(self, agent):
-        """Local resumed reasoning-only responses should compress before burning retries."""
+    def test_reasoning_only_local_resumed_no_compression_triggered(self, agent):
+        """Reasoning-only responses no longer trigger compression — accepted immediately."""
        self._setup_agent(agent)
        agent.base_url = "http://127.0.0.1:1234/v1"
        agent.compression_enabled = True
@ -1520,39 +1519,34 @@ class TestRunConversation:
            finish_reason="stop",
            reasoning_content="reasoning only",
        )
-        ok_resp = _mock_response(content="Recovered after compression", finish_reason="stop")
        prefill = [
            {"role": "user", "content": "old question"},
            {"role": "assistant", "content": "old answer"},
        ]

        with (
-            patch.object(agent, "_interruptible_api_call", side_effect=[empty_resp, ok_resp]),
+            patch.object(agent, "_interruptible_api_call", side_effect=[empty_resp]),
            patch.object(agent, "_compress_context") as mock_compress,
            patch.object(agent, "_persist_session"),
            patch.object(agent, "_save_trajectory"),
            patch.object(agent, "_cleanup_task_resources"),
        ):
-            mock_compress.return_value = (
-                [{"role": "user", "content": "compressed user message"}],
-                "compressed system prompt",
-            )
            result = agent.run_conversation("hello", conversation_history=prefill)

-        mock_compress.assert_called_once()
+        mock_compress.assert_not_called()  # no compression triggered
        assert result["completed"] is True
-        assert result["final_response"] == "Recovered after compression"
-        assert result["api_calls"] == 1  # compression retry is refunded, same as explicit overflow path
+        assert result["final_response"] == "(empty)"
+        assert result["api_calls"] == 1

-    def test_empty_content_repeated_structured_reasoning_salvages_early(self, agent):
-        """Repeated identical structured reasoning-only responses should stop retrying early."""
+    def test_reasoning_only_response_accepted_without_retry(self, agent):
+        """Reasoning-only response should be accepted with (empty) content, no retries."""
        self._setup_agent(agent)
        empty_resp = _mock_response(
            content=None,
            finish_reason="stop",
            reasoning_content="structured reasoning answer",
        )
-        agent.client.chat.completions.create.side_effect = [empty_resp, empty_resp]
+        agent.client.chat.completions.create.side_effect = [empty_resp]
        with (
            patch.object(agent, "_persist_session"),
            patch.object(agent, "_save_trajectory"),
@ -1560,24 +1554,24 @@ class TestRunConversation:
        ):
            result = agent.run_conversation("answer me")
        assert result["completed"] is True
-        assert result["final_response"] == "structured reasoning answer"
-        assert result["api_calls"] == 2
+        assert result["final_response"] == "(empty)"
+        assert result["api_calls"] == 1  # no retries

-    def test_empty_content_local_custom_error_is_actionable(self, agent):
-        """Local/custom retries should return a diagnostic tailored to context/endpoint mismatch."""
+    def test_truly_empty_response_accepted_without_retry(self, agent):
+        """Truly empty response (no content, no reasoning) should still complete with (empty)."""
        self._setup_agent(agent)
        agent.base_url = "http://127.0.0.1:1234/v1"
        empty_resp = _mock_response(content=None, finish_reason="stop")
-        agent.client.chat.completions.create.side_effect = [empty_resp, empty_resp, empty_resp]
+        agent.client.chat.completions.create.side_effect = [empty_resp]
        with (
            patch.object(agent, "_persist_session"),
            patch.object(agent, "_save_trajectory"),
            patch.object(agent, "_cleanup_task_resources"),
        ):
            result = agent.run_conversation("answer me")
-        assert result["completed"] is False
-        assert "Local/custom backend returned reasoning-only output" in result["error"]
-        assert "wrong /v1 endpoint" in result["error"]
+        assert result["completed"] is True
+        assert result["final_response"] == "(empty)"
+        assert result["api_calls"] == 1  # no retries

    def test_nous_401_refreshes_after_remint_and_retries(self, agent):
        self._setup_agent(agent)