fix: accept reasoning-only responses without retries — set content to "(empty)" (#5278)

* feat: coerce tool call arguments to match JSON Schema types

LLMs frequently return numbers as strings ("42" instead of 42) and
booleans as strings ("true" instead of true). This causes silent
failures with MCP tools and any tool with strictly-typed parameters.

Added coerce_tool_args() in model_tools.py that runs before every tool
dispatch. For each argument, it checks the tool registry schema and
attempts safe coercion:
  - "42" → 42 when schema says "type": "integer"
  - "3.14" → 3.14 when schema says "type": "number"
  - "true"/"false" → True/False when schema says "type": "boolean"
  - Union types tried in order
  - Original values preserved when coercion fails or is not applicable

Inspired by Block/goose tool argument coercion system.

* fix: accept reasoning-only responses without retries — set content to "(empty)"

Previously, when a model returned reasoning/thinking but no visible
content, we entered a 120-line retry/classify/compress/salvage cascade
that wasted 3+ API calls trying to "fix" the response. The model was
done thinking — retrying with the same input just burned money.

Now reasoning-only responses are accepted immediately:
- Reasoning stays in the `reasoning` field (semantically correct)
- Content set to "(empty)" — valid non-empty string every provider accepts
- No retries, no compression triggers, no salvage logic
- Session history contains "(empty)" not "" — prevents #2128 session
  poisoning where empty assistant content caused prefill rejections

Removes ~120 lines, adds ~15. Saves 2-3 API calls per reasoning-only
response. Fixes #2128.
This commit is contained in:
Teknium 2026-04-05 11:30:52 -07:00 committed by GitHub
parent 534511bebb
commit a0a1b86c2e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 38 additions and 160 deletions

View File

@ -8620,140 +8620,24 @@ class AIAgent:
self._response_was_previewed = True
break
# No fallback available — classify the empty response before
# blindly spending retries. Some local/custom backends surface
# implicit context pressure as reasoning-only output rather than
# an explicit overflow error.
if not hasattr(self, '_empty_content_retries'):
self._empty_content_retries = 0
self._empty_content_retries += 1
# Reasoning-only response: the model produced thinking
# but no visible content. This is a valid response —
# keep reasoning in its own field and set content to
# "(empty)" so every provider accepts the message.
# No retries needed.
reasoning_text = self._extract_reasoning(assistant_message)
assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
assistant_msg["content"] = "(empty)"
messages.append(assistant_msg)
empty_response_info = self._classify_empty_content_response(
assistant_message,
finish_reason=finish_reason,
approx_tokens=approx_tokens,
api_messages=api_messages,
conversation_history=conversation_history,
)
reasoning_text = empty_response_info["reasoning_text"]
self._vprint(f"{self.log_prefix}⚠️ Response only contains think block with no content after it")
if reasoning_text:
reasoning_preview = reasoning_text[:500] + "..." if len(reasoning_text) > 500 else reasoning_text
self._vprint(f"{self.log_prefix} Reasoning: {reasoning_preview}")
self._vprint(f"{self.log_prefix} Reasoning-only response (no visible content). Reasoning: {reasoning_preview}")
else:
content_preview = final_response[:80] + "..." if len(final_response) > 80 else final_response
self._vprint(f"{self.log_prefix} Content: '{content_preview}'")
self._vprint(f"{self.log_prefix} Empty response (no content or reasoning).")
if empty_response_info["should_compress"]:
compression_attempts += 1
if compression_attempts > max_compression_attempts:
self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True)
self._vprint(f"{self.log_prefix} 💡 Local/custom backend returned reasoning-only output with no visible content. This often means the resumed/large session exceeds the runtime context window. Try /new or lower model.context_length to the actual runtime limit.", force=True)
else:
self._vprint(f"{self.log_prefix}🗜️ Reasoning-only response looks like implicit context pressure — attempting compression ({compression_attempts}/{max_compression_attempts})...", force=True)
original_len = len(messages)
messages, active_system_prompt = self._compress_context(
messages, system_message, approx_tokens=approx_tokens,
task_id=effective_task_id,
)
if len(messages) < original_len:
conversation_history = None
self._emit_status(f"🗜️ Compressed {original_len}{len(messages)} messages after reasoning-only response, retrying...")
time.sleep(2)
api_call_count -= 1
self.iteration_budget.refund()
retry_count += 1
continue
self._vprint(f"{self.log_prefix} Compression could not shrink the session; falling back to retry/salvage logic.")
if (
reasoning_text
and empty_response_info["repeated_signature"]
and empty_response_info["has_structured_reasoning"]
):
self._vprint(f"{self.log_prefix} Structured reasoning-only response repeated unchanged — using reasoning text directly.", force=True)
self._empty_content_retries = 0
final_response = reasoning_text
empty_msg = {
"role": "assistant",
"content": final_response,
"reasoning": reasoning_text,
"finish_reason": finish_reason,
}
messages.append(empty_msg)
break
if self._empty_content_retries < 3:
self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._empty_content_retries}/3)...")
continue
else:
self._vprint(f"{self.log_prefix}❌ Max retries (3) for empty content exceeded.", force=True)
self._empty_content_retries = 0
# If a prior tool_calls turn had real content, salvage it:
# rewrite that turn's content to a brief tool description,
# and use the original content as the final response here.
fallback = getattr(self, '_last_content_with_tools', None)
if fallback:
self._last_content_with_tools = None
# Find the last assistant message with tool_calls and rewrite it
for i in range(len(messages) - 1, -1, -1):
msg = messages[i]
if msg.get("role") == "assistant" and msg.get("tool_calls"):
tool_names = []
for tc in msg["tool_calls"]:
if not tc or not isinstance(tc, dict): continue
fn = tc.get("function", {})
tool_names.append(fn.get("name", "unknown"))
msg["content"] = f"Calling the {', '.join(tool_names)} tool{'s' if len(tool_names) > 1 else ''}..."
break
# Strip <think> blocks from fallback content for user display
final_response = self._strip_think_blocks(fallback).strip()
self._response_was_previewed = True
break
# No fallback -- if reasoning_text exists, the model put its
# entire response inside <think> tags; use that as the content.
if reasoning_text:
self._vprint(f"{self.log_prefix}Using reasoning as response content (model wrapped entire response in think tags).", force=True)
final_response = reasoning_text
empty_msg = {
"role": "assistant",
"content": final_response,
"reasoning": reasoning_text,
"finish_reason": finish_reason,
}
messages.append(empty_msg)
break
# Truly empty -- no reasoning and no content
empty_msg = {
"role": "assistant",
"content": final_response,
"reasoning": reasoning_text,
"finish_reason": finish_reason,
}
messages.append(empty_msg)
self._cleanup_task_resources(effective_task_id)
self._persist_session(messages, conversation_history)
error_message = "Model generated only think blocks with no actual response after 3 retries"
if empty_response_info["is_local_custom"]:
error_message = (
"Local/custom backend returned reasoning-only output with no visible response after 3 retries. "
"Likely causes: wrong /v1 endpoint, runtime context window smaller than Hermes expects, "
"or a resumed/large session exceeding the backend's actual context limit."
)
return {
"final_response": final_response or None,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": error_message
}
final_response = "(empty)"
break
# Reset retry counter/signature on successful content
if hasattr(self, '_empty_content_retries'):

View File

@ -1488,19 +1488,14 @@ class TestRunConversation:
assert result["completed"] is True
assert result["api_calls"] == 2
def test_empty_content_retry_uses_inline_reasoning_as_response(self, agent):
"""Reasoning-only payloads should recover the inline reasoning text."""
def test_inline_think_blocks_reasoning_only_accepted(self, agent):
"""Inline <think> reasoning-only responses accepted with (empty) content, no retries."""
self._setup_agent(agent)
empty_resp = _mock_response(
content="<think>internal reasoning</think>",
finish_reason="stop",
)
# Return empty 3 times to exhaust retries
agent.client.chat.completions.create.side_effect = [
empty_resp,
empty_resp,
empty_resp,
]
agent.client.chat.completions.create.side_effect = [empty_resp]
with (
patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"),
@ -1508,10 +1503,14 @@ class TestRunConversation:
):
result = agent.run_conversation("answer me")
assert result["completed"] is True
assert result["final_response"] == "internal reasoning"
assert result["final_response"] == "(empty)"
assert result["api_calls"] == 1 # no retries
# Reasoning should be preserved in the assistant message
assistant_msgs = [m for m in result["messages"] if m.get("role") == "assistant"]
assert any(m.get("reasoning") for m in assistant_msgs)
def test_empty_content_local_resumed_session_triggers_compression(self, agent):
"""Local resumed reasoning-only responses should compress before burning retries."""
def test_reasoning_only_local_resumed_no_compression_triggered(self, agent):
"""Reasoning-only responses no longer trigger compression — accepted immediately."""
self._setup_agent(agent)
agent.base_url = "http://127.0.0.1:1234/v1"
agent.compression_enabled = True
@ -1520,39 +1519,34 @@ class TestRunConversation:
finish_reason="stop",
reasoning_content="reasoning only",
)
ok_resp = _mock_response(content="Recovered after compression", finish_reason="stop")
prefill = [
{"role": "user", "content": "old question"},
{"role": "assistant", "content": "old answer"},
]
with (
patch.object(agent, "_interruptible_api_call", side_effect=[empty_resp, ok_resp]),
patch.object(agent, "_interruptible_api_call", side_effect=[empty_resp]),
patch.object(agent, "_compress_context") as mock_compress,
patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"),
patch.object(agent, "_cleanup_task_resources"),
):
mock_compress.return_value = (
[{"role": "user", "content": "compressed user message"}],
"compressed system prompt",
)
result = agent.run_conversation("hello", conversation_history=prefill)
mock_compress.assert_called_once()
mock_compress.assert_not_called() # no compression triggered
assert result["completed"] is True
assert result["final_response"] == "Recovered after compression"
assert result["api_calls"] == 1 # compression retry is refunded, same as explicit overflow path
assert result["final_response"] == "(empty)"
assert result["api_calls"] == 1
def test_empty_content_repeated_structured_reasoning_salvages_early(self, agent):
"""Repeated identical structured reasoning-only responses should stop retrying early."""
def test_reasoning_only_response_accepted_without_retry(self, agent):
"""Reasoning-only response should be accepted with (empty) content, no retries."""
self._setup_agent(agent)
empty_resp = _mock_response(
content=None,
finish_reason="stop",
reasoning_content="structured reasoning answer",
)
agent.client.chat.completions.create.side_effect = [empty_resp, empty_resp]
agent.client.chat.completions.create.side_effect = [empty_resp]
with (
patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"),
@ -1560,24 +1554,24 @@ class TestRunConversation:
):
result = agent.run_conversation("answer me")
assert result["completed"] is True
assert result["final_response"] == "structured reasoning answer"
assert result["api_calls"] == 2
assert result["final_response"] == "(empty)"
assert result["api_calls"] == 1 # no retries
def test_empty_content_local_custom_error_is_actionable(self, agent):
"""Local/custom retries should return a diagnostic tailored to context/endpoint mismatch."""
def test_truly_empty_response_accepted_without_retry(self, agent):
"""Truly empty response (no content, no reasoning) should still complete with (empty)."""
self._setup_agent(agent)
agent.base_url = "http://127.0.0.1:1234/v1"
empty_resp = _mock_response(content=None, finish_reason="stop")
agent.client.chat.completions.create.side_effect = [empty_resp, empty_resp, empty_resp]
agent.client.chat.completions.create.side_effect = [empty_resp]
with (
patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"),
patch.object(agent, "_cleanup_task_resources"),
):
result = agent.run_conversation("answer me")
assert result["completed"] is False
assert "Local/custom backend returned reasoning-only output" in result["error"]
assert "wrong /v1 endpoint" in result["error"]
assert result["completed"] is True
assert result["final_response"] == "(empty)"
assert result["api_calls"] == 1 # no retries
def test_nous_401_refreshes_after_remint_and_retries(self, agent):
self._setup_agent(agent)