From f0dc919f92c5327cf8033e06c039126f1288e89c Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 30 Apr 2026 23:03:54 -0700 Subject: [PATCH] fix(compression): include system prompt + tool schemas in token estimates (#18265) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The user-visible /compress banner and the post-compression last_prompt_tokens writeback both underestimated real request size: the banner counted only the raw message transcript (chars/4), and the writeback counted the system prompt plus the transcript but omitted tool schemas. With a 15KB system prompt and 30 tool schemas (~26KB), a 4-message transcript that looks like ~45 tokens to the transcript-only estimator is really ~10.5K tokens of request pressure — a 234x gap. Two user-facing consequences: - Banner shows 'Compressing … (~45 tokens)…' while compression is actually firing on 10K+ tokens of real pressure, confusing users about why compression triggered (reported by @codecovenant on X; #6217). - Post-compression last_prompt_tokens writeback omits tool schemas, so the next should_compress() check compares real usage against a stale underestimate — compression triggers late, potentially past the model's context limit on small-context models (#14695). Swap estimate_messages_tokens_rough() for estimate_request_tokens_rough() at every user-visible banner and at the post-compression writeback. estimate_request_tokens_rough() already existed for exactly this purpose and includes system prompt + tool schemas. 
Touched call sites: - run_agent.py: post-compression last_prompt_tokens writeback, post-tool call should_compress() fallback when provider usage is missing - cli.py: /compress banner + summary - gateway/run.py: gateway /compress banner + summary - tui_gateway/server.py: TUI /compress status + summary - acp_adapter/server.py: ACP /compact before/after Left intentionally alone: - Session-hygiene fallback and the 'no agent' /status path in gateway/run.py — no agent instance is in scope to query for system prompt/tools, and the existing 30-50% overestimate wobble on hygiene is safety-accepted. - Verbose-mode 'Request size' logging — informational only, already counts system prompt via api_messages[0]. Also relabels the feedback line from 'Rough transcript estimate' to 'Approx request size' so the metric label matches what it actually measures. Credits: diagnoses from @devilardis (#14695) and @Jackten (#6217); user report @codecovenant on X (2026-04-30). Closes #14695 Closes #6217 --- acp_adapter/server.py | 18 +++++++++++--- agent/manual_compression_feedback.py | 10 ++++---- cli.py | 20 +++++++++++++--- gateway/run.py | 17 ++++++++++--- run_agent.py | 19 +++++++++++---- tests/acp/test_server.py | 3 ++- tests/cli/test_manual_compress.py | 20 +++++++++------- tests/gateway/test_compress_command.py | 28 ++++++++++++++-------- tui_gateway/server.py | 33 ++++++++++++++++++++++---- 9 files changed, 126 insertions(+), 42 deletions(-) diff --git a/acp_adapter/server.py b/acp_adapter/server.py index 39eff2f2..f8dade72 100644 --- a/acp_adapter/server.py +++ b/acp_adapter/server.py @@ -1068,10 +1068,16 @@ class HermesACPAgent(acp.Agent): if not hasattr(agent, "_compress_context"): return "Context compression not available for this agent." 
- from agent.model_metadata import estimate_messages_tokens_rough + from agent.model_metadata import estimate_request_tokens_rough original_count = len(state.history) - approx_tokens = estimate_messages_tokens_rough(state.history) + # Include system prompt + tool schemas so the figure reflects real + # request pressure, not a transcript-only underestimate (#6217). + _sys_prompt = getattr(agent, "_cached_system_prompt", "") or "" + _tools = getattr(agent, "tools", None) or None + approx_tokens = estimate_request_tokens_rough( + state.history, system_prompt=_sys_prompt, tools=_tools + ) original_session_db = getattr(agent, "_session_db", None) try: @@ -1091,7 +1097,13 @@ class HermesACPAgent(acp.Agent): self.session_manager.save_session(state.session_id) new_count = len(state.history) - new_tokens = estimate_messages_tokens_rough(state.history) + _sys_prompt_after = getattr(agent, "_cached_system_prompt", "") or _sys_prompt + _tools_after = getattr(agent, "tools", None) or _tools + new_tokens = estimate_request_tokens_rough( + state.history, + system_prompt=_sys_prompt_after, + tools=_tools_after, + ) return ( f"Context compressed: {original_count} -> {new_count} messages\n" f"~{approx_tokens:,} -> ~{new_tokens:,} tokens" diff --git a/agent/manual_compression_feedback.py b/agent/manual_compression_feedback.py index 8f2d5e5d..32b00f7c 100644 --- a/agent/manual_compression_feedback.py +++ b/agent/manual_compression_feedback.py @@ -20,25 +20,25 @@ def summarize_manual_compression( headline = f"No changes from compression: {before_count} messages" if after_tokens == before_tokens: token_line = ( - f"Rough transcript estimate: ~{before_tokens:,} tokens (unchanged)" + f"Approx request size: ~{before_tokens:,} tokens (unchanged)" ) else: token_line = ( - f"Rough transcript estimate: ~{before_tokens:,} → " + f"Approx request size: ~{before_tokens:,} → " f"~{after_tokens:,} tokens" ) else: headline = f"Compressed: {before_count} → {after_count} messages" token_line = ( - 
f"Rough transcript estimate: ~{before_tokens:,} → " + f"Approx request size: ~{before_tokens:,} → " f"~{after_tokens:,} tokens" ) note = None if not noop and after_count < before_count and after_tokens > before_tokens: note = ( - "Note: fewer messages can still raise this rough transcript estimate " - "when compression rewrites the transcript into denser summaries." + "Note: fewer messages can still raise this estimate when " + "compression rewrites the transcript into denser summaries." ) return { diff --git a/cli.py b/cli.py index bef1d87b..dbbf83f2 100644 --- a/cli.py +++ b/cli.py @@ -7343,10 +7343,20 @@ class HermesCLI: original_count = len(self.conversation_history) with self._busy_command("Compressing context..."): try: - from agent.model_metadata import estimate_messages_tokens_rough + from agent.model_metadata import estimate_request_tokens_rough from agent.manual_compression_feedback import summarize_manual_compression original_history = list(self.conversation_history) - approx_tokens = estimate_messages_tokens_rough(original_history) + # Include system prompt + tool schemas in the estimate — + # a transcript-only number understates real request pressure + # and can even appear to grow after compression because a + # dense handoff summary replaces many short turns (#6217). 
+ _sys_prompt = getattr(self.agent, "_cached_system_prompt", "") or "" + _tools = getattr(self.agent, "tools", None) or None + approx_tokens = estimate_request_tokens_rough( + original_history, + system_prompt=_sys_prompt, + tools=_tools, + ) if focus_topic: print(f"🗜️ Compressing {original_count} messages (~{approx_tokens:,} tokens), " f"focus: \"{focus_topic}\"...") @@ -7378,7 +7388,11 @@ class HermesCLI: ): self.session_id = self.agent.session_id self._pending_title = None - new_tokens = estimate_messages_tokens_rough(self.conversation_history) + new_tokens = estimate_request_tokens_rough( + self.conversation_history, + system_prompt=_sys_prompt, + tools=_tools, + ) summary = summarize_manual_compression( original_history, self.conversation_history, diff --git a/gateway/run.py b/gateway/run.py index 8c2c6478..90faf9a7 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -8512,7 +8512,7 @@ class GatewayRunner: try: from run_agent import AIAgent from agent.manual_compression_feedback import summarize_manual_compression - from agent.model_metadata import estimate_messages_tokens_rough + from agent.model_metadata import estimate_request_tokens_rough session_key = self._session_key_for_source(source) model, runtime_kwargs = self._resolve_session_agent_runtime( @@ -8527,7 +8527,6 @@ class GatewayRunner: for m in history if m.get("role") in ("user", "assistant") and m.get("content") ] - approx_tokens = estimate_messages_tokens_rough(msgs) tmp_agent = AIAgent( **runtime_kwargs, @@ -8541,6 +8540,16 @@ class GatewayRunner: try: tmp_agent._print_fn = lambda *a, **kw: None + # Estimate with system prompt + tool schemas included so the + # figure reflects real request pressure, not a transcript-only + # underestimate (#6217). Must be computed after tmp_agent is + # built so _cached_system_prompt/tools are populated. 
+ _sys_prompt = getattr(tmp_agent, "_cached_system_prompt", "") or "" + _tools = getattr(tmp_agent, "tools", None) or None + approx_tokens = estimate_request_tokens_rough( + msgs, system_prompt=_sys_prompt, tools=_tools + ) + compressor = tmp_agent.context_compressor if not compressor.has_content_to_compress(msgs): return "Nothing to compress yet (the transcript is still all protected context)." @@ -8565,7 +8574,9 @@ class GatewayRunner: self.session_store.update_session( session_entry.session_key, last_prompt_tokens=0 ) - new_tokens = estimate_messages_tokens_rough(compressed) + new_tokens = estimate_request_tokens_rough( + compressed, system_prompt=_sys_prompt, tools=_tools + ) summary = summarize_manual_compression( msgs, compressed, diff --git a/run_agent.py b/run_agent.py index 0fe6e4a8..4ea0fafe 100644 --- a/run_agent.py +++ b/run_agent.py @@ -9101,9 +9101,14 @@ class AIAgent: # Update token estimate after compaction so pressure calculations # use the post-compression count, not the stale pre-compression one. - _compressed_est = ( - estimate_tokens_rough(new_system_prompt) - + estimate_messages_tokens_rough(compressed) + # Use estimate_request_tokens_rough() so tool schemas are included — + # with 50+ tools enabled, schemas alone can add 20-30K tokens, and + # omitting them delays the next compression cycle far past the + # configured threshold (issue #14695). + _compressed_est = estimate_request_tokens_rough( + compressed, + system_prompt=new_system_prompt or "", + tools=self.tools or None, ) self.context_compressor.last_prompt_tokens = _compressed_est self.context_compressor.last_completion_tokens = 0 @@ -13223,7 +13228,13 @@ class AIAgent: # causing premature compression. 
(#12026) _real_tokens = _compressor.last_prompt_tokens else: - _real_tokens = estimate_messages_tokens_rough(messages) + # Include tool schemas — with 50+ tools enabled + # these add 20-30K tokens the messages-only + # estimate misses, which can skip compression + # past the configured threshold (#14695). + _real_tokens = estimate_request_tokens_rough( + messages, tools=self.tools or None + ) if self.compression_enabled and _compressor.should_compress(_real_tokens): self._safe_print(" ⟳ compacting context…") diff --git a/tests/acp/test_server.py b/tests/acp/test_server.py index 6628f0da..35aafc60 100644 --- a/tests/acp/test_server.py +++ b/tests/acp/test_server.py @@ -730,6 +730,7 @@ class TestSlashCommands: ] state.agent.compression_enabled = True state.agent._cached_system_prompt = "system" + state.agent.tools = None original_session_db = object() state.agent._session_db = original_session_db @@ -746,7 +747,7 @@ class TestSlashCommands: with ( patch.object(agent.session_manager, "save_session") as mock_save, patch( - "agent.model_metadata.estimate_messages_tokens_rough", + "agent.model_metadata.estimate_request_tokens_rough", side_effect=[40, 12], ), ): diff --git a/tests/cli/test_manual_compress.py b/tests/cli/test_manual_compress.py index 9144c94b..afbde073 100644 --- a/tests/cli/test_manual_compress.py +++ b/tests/cli/test_manual_compress.py @@ -21,20 +21,21 @@ def test_manual_compress_reports_noop_without_success_banner(capsys): shell.agent = MagicMock() shell.agent.compression_enabled = True shell.agent._cached_system_prompt = "" + shell.agent.tools = None shell.agent.session_id = shell.session_id # no-op compression: no split shell.agent._compress_context.return_value = (list(history), "") - def _estimate(messages): + def _estimate(messages, **_kwargs): assert messages == history return 100 - with patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate): + with patch("agent.model_metadata.estimate_request_tokens_rough", 
side_effect=_estimate): shell._manual_compress() output = capsys.readouterr().out assert "No changes from compression" in output assert "✅ Compressed" not in output - assert "Rough transcript estimate: ~100 tokens (unchanged)" in output + assert "Approx request size: ~100 tokens (unchanged)" in output def test_manual_compress_explains_when_token_estimate_rises(capsys): @@ -49,22 +50,23 @@ def test_manual_compress_explains_when_token_estimate_rises(capsys): shell.agent = MagicMock() shell.agent.compression_enabled = True shell.agent._cached_system_prompt = "" + shell.agent.tools = None shell.agent.session_id = shell.session_id # no-op: no split shell.agent._compress_context.return_value = (compressed, "") - def _estimate(messages): + def _estimate(messages, **_kwargs): if messages == history: return 100 if messages == compressed: return 120 raise AssertionError(f"unexpected transcript: {messages!r}") - with patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate): + with patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate): shell._manual_compress() output = capsys.readouterr().out assert "✅ Compressed: 4 → 3 messages" in output - assert "Rough transcript estimate: ~100 → ~120 tokens" in output + assert "Approx request size: ~100 → ~120 tokens" in output assert "denser summaries" in output @@ -89,6 +91,7 @@ def test_manual_compress_syncs_session_id_after_split(): shell.agent = MagicMock() shell.agent.compression_enabled = True shell.agent._cached_system_prompt = "" + shell.agent.tools = None # Simulate _compress_context mutating agent.session_id as a side effect. 
def _fake_compress(*args, **kwargs): shell.agent.session_id = new_child_id @@ -97,7 +100,7 @@ def test_manual_compress_syncs_session_id_after_split(): shell.agent.session_id = old_id # starts in sync shell._pending_title = "stale title" - with patch("agent.model_metadata.estimate_messages_tokens_rough", return_value=100): + with patch("agent.model_metadata.estimate_request_tokens_rough", return_value=100): shell._manual_compress() # CLI session_id must now point at the continuation child, not the parent. @@ -118,11 +121,12 @@ def test_manual_compress_no_sync_when_session_id_unchanged(): shell.agent = MagicMock() shell.agent.compression_enabled = True shell.agent._cached_system_prompt = "" + shell.agent.tools = None shell.agent.session_id = shell.session_id shell.agent._compress_context.return_value = (list(history), "") shell._pending_title = "keep me" - with patch("agent.model_metadata.estimate_messages_tokens_rough", return_value=100): + with patch("agent.model_metadata.estimate_request_tokens_rough", return_value=100): shell._manual_compress() # No split → pending title untouched. 
diff --git a/tests/gateway/test_compress_command.py b/tests/gateway/test_compress_command.py index 21ff777f..e09e40a0 100644 --- a/tests/gateway/test_compress_command.py +++ b/tests/gateway/test_compress_command.py @@ -64,11 +64,13 @@ async def test_compress_command_reports_noop_without_success_banner(): agent_instance = MagicMock() agent_instance.shutdown_memory_provider = MagicMock() agent_instance.close = MagicMock() + agent_instance._cached_system_prompt = "" + agent_instance.tools = None agent_instance.context_compressor.has_content_to_compress.return_value = True agent_instance.session_id = "sess-1" agent_instance._compress_context.return_value = (list(history), "") - def _estimate(messages): + def _estimate(messages, **_kwargs): assert messages == history return 100 @@ -76,13 +78,13 @@ async def test_compress_command_reports_noop_without_success_banner(): patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "test-key"}), patch("gateway.run._resolve_gateway_model", return_value="test-model"), patch("run_agent.AIAgent", return_value=agent_instance), - patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate), + patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate), ): result = await runner._handle_compress_command(_make_event()) assert "No changes from compression" in result assert "Compressed:" not in result - assert "Rough transcript estimate: ~100 tokens (unchanged)" in result + assert "Approx request size: ~100 tokens (unchanged)" in result agent_instance.shutdown_memory_provider.assert_called_once() agent_instance.close.assert_called_once() @@ -99,11 +101,13 @@ async def test_compress_command_explains_when_token_estimate_rises(): agent_instance = MagicMock() agent_instance.shutdown_memory_provider = MagicMock() agent_instance.close = MagicMock() + agent_instance._cached_system_prompt = "" + agent_instance.tools = None 
agent_instance.context_compressor.has_content_to_compress.return_value = True agent_instance.session_id = "sess-1" agent_instance._compress_context.return_value = (compressed, "") - def _estimate(messages): + def _estimate(messages, **_kwargs): if messages == history: return 100 if messages == compressed: @@ -114,12 +118,12 @@ async def test_compress_command_explains_when_token_estimate_rises(): patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "test-key"}), patch("gateway.run._resolve_gateway_model", return_value="test-model"), patch("run_agent.AIAgent", return_value=agent_instance), - patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate), + patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate), ): result = await runner._handle_compress_command(_make_event()) assert "Compressed: 4 → 3 messages" in result - assert "Rough transcript estimate: ~100 → ~120 tokens" in result + assert "Approx request size: ~100 → ~120 tokens" in result assert "denser summaries" in result agent_instance.shutdown_memory_provider.assert_called_once() agent_instance.close.assert_called_once() @@ -143,6 +147,8 @@ async def test_compress_command_appends_warning_when_summary_generation_fails(): agent_instance = MagicMock() agent_instance.shutdown_memory_provider = MagicMock() agent_instance.close = MagicMock() + agent_instance._cached_system_prompt = "" + agent_instance.tools = None agent_instance.context_compressor.has_content_to_compress.return_value = True # Simulate summary-generation failure: fallback flag set, dropped count # populated, error string captured. 
@@ -154,7 +160,7 @@ async def test_compress_command_appends_warning_when_summary_generation_fails(): agent_instance.session_id = "sess-1" agent_instance._compress_context.return_value = (compressed, "") - def _estimate(messages): + def _estimate(messages, **_kwargs): if messages == history: return 100 if messages == compressed: @@ -165,7 +171,7 @@ async def test_compress_command_appends_warning_when_summary_generation_fails(): patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "***"}), patch("gateway.run._resolve_gateway_model", return_value="test-model"), patch("run_agent.AIAgent", return_value=agent_instance), - patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate), + patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate), ): result = await runner._handle_compress_command(_make_event()) @@ -200,6 +206,8 @@ async def test_compress_command_surfaces_aux_model_failure_even_when_recovered() agent_instance = MagicMock() agent_instance.shutdown_memory_provider = MagicMock() agent_instance.close = MagicMock() + agent_instance._cached_system_prompt = "" + agent_instance.tools = None agent_instance.context_compressor.has_content_to_compress.return_value = True # Fallback placeholder was NOT used — recovery succeeded. 
agent_instance.context_compressor._last_summary_fallback_used = False @@ -215,7 +223,7 @@ async def test_compress_command_surfaces_aux_model_failure_even_when_recovered() agent_instance.session_id = "sess-1" agent_instance._compress_context.return_value = (compressed, "") - def _estimate(messages): + def _estimate(messages, **_kwargs): if messages == history: return 100 if messages == compressed: @@ -226,7 +234,7 @@ async def test_compress_command_surfaces_aux_model_failure_even_when_recovered() patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "***"}), patch("gateway.run._resolve_gateway_model", return_value="test-model"), patch("run_agent.AIAgent", return_value=agent_instance), - patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate), + patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate), ): result = await runner._handle_compress_command(_make_event()) diff --git a/tui_gateway/server.py b/tui_gateway/server.py index fb8aaa81..4a7f4785 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -1144,7 +1144,7 @@ def _compress_session_history( before_messages: list | None = None, history_version: int | None = None, ) -> tuple[int, dict]: - from agent.model_metadata import estimate_messages_tokens_rough + from agent.model_metadata import estimate_request_tokens_rough agent = session["agent"] # Snapshot history under the lock so the LLM-bound compression call @@ -1160,7 +1160,13 @@ def _compress_session_history( usage = _get_usage(agent) return 0, usage if approx_tokens is None: - approx_tokens = estimate_messages_tokens_rough(history) + # Include system prompt + tool schemas so the figure reflects real + # request pressure, not a transcript-only underestimate (#6217). 
+ _sys_prompt = getattr(agent, "_cached_system_prompt", "") or "" + _tools = getattr(agent, "tools", None) or None + approx_tokens = estimate_request_tokens_rough( + history, system_prompt=_sys_prompt, tools=_tools + ) # Pass system_message=None so AIAgent._compress_context rebuilds the # system prompt cleanly via _build_system_prompt(None). Passing the # cached prompt (which already contains the agent identity block) @@ -2328,14 +2334,21 @@ def _(rid, params: dict) -> dict: focus_topic = str(params.get("focus_topic", "") or "").strip() try: from agent.manual_compression_feedback import summarize_manual_compression - from agent.model_metadata import estimate_messages_tokens_rough + from agent.model_metadata import estimate_request_tokens_rough with session["history_lock"]: before_messages = list(session.get("history", [])) history_version = int(session.get("history_version", 0)) before_count = len(before_messages) + _agent = session["agent"] + _sys_prompt = getattr(_agent, "_cached_system_prompt", "") or "" + _tools = getattr(_agent, "tools", None) or None before_tokens = ( - estimate_messages_tokens_rough(before_messages) if before_count else 0 + estimate_request_tokens_rough( + before_messages, system_prompt=_sys_prompt, tools=_tools + ) + if before_count + else 0 ) if before_count >= 4: @@ -2358,8 +2371,18 @@ def _(rid, params: dict) -> dict: with session["history_lock"]: messages = list(session.get("history", [])) after_count = len(messages) + # Re-read system prompt + tools after compression — _compress_context + # may have rebuilt the system prompt (_cached_system_prompt=None). 
+ _sys_prompt_after = getattr(_agent, "_cached_system_prompt", "") or _sys_prompt + _tools_after = getattr(_agent, "tools", None) or _tools after_tokens = ( - estimate_messages_tokens_rough(messages) if after_count else 0 + estimate_request_tokens_rough( + messages, + system_prompt=_sys_prompt_after, + tools=_tools_after, + ) + if after_count + else 0 ) agent = session["agent"] _sync_session_key_after_compress(sid, session)