From f0dc919f92c5327cf8033e06c039126f1288e89c Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Thu, 30 Apr 2026 23:03:54 -0700
Subject: [PATCH] fix(compression): include system prompt + tool schemas in
 token estimates (#18265)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The user-visible /compress banner counted only the raw message transcript
(chars/4), and the post-compression last_prompt_tokens writeback counted the
system prompt and transcript but omitted tool schemas. With a 15KB system
prompt and 30 tool schemas (~26KB), a 4-message transcript that looks like
~45 tokens to the transcript-only estimator is really ~10.5K tokens of
request pressure — a 234x gap.

Two user-facing consequences:
- Banner shows 'Compressing … (~45 tokens)…' while compression is actually
  firing on 10K+ tokens of real pressure, confusing users about why
  compression triggered (reported by @codecovenant on X; #6217).
- Post-compression last_prompt_tokens writeback omits tool schemas, so the
  next should_compress() check compares real usage against a stale
  underestimate — compression triggers late, potentially past the model's
  context limit on small-context models (#14695).

Swap estimate_messages_tokens_rough() for estimate_request_tokens_rough()
at every user-visible banner, at the post-compression writeback, and at the
post-tool-call should_compress() fallback used when provider usage is
missing. estimate_request_tokens_rough() already existed for exactly this
purpose and includes system prompt + tool schemas.

Touched call sites:
- run_agent.py: post-compression last_prompt_tokens writeback, and the
  post-tool-call should_compress() fallback when provider usage is missing
- cli.py: /compress banner + summary
- gateway/run.py: gateway /compress banner + summary
- tui_gateway/server.py: TUI /compress status + summary
- acp_adapter/server.py: ACP /compact before/after

Left intentionally alone:
- Session-hygiene fallback and the 'no agent' /status path in gateway/run.py
  — no agent instance is in scope to query for system prompt/tools, and the
  existing 30-50% overestimate wobble on hygiene is accepted because it errs
  on the safe side.
- Verbose-mode 'Request size' logging — informational only, already counts
  system prompt via api_messages[0].

Also relabels the feedback line from 'Rough transcript estimate' to
'Approx request size' so the metric label matches what it actually measures.

Credits: diagnoses from @devilardis (#14695) and @Jackten (#6217);
user report @codecovenant on X (2026-04-30).

Closes #14695
Closes #6217
---
 acp_adapter/server.py                  | 18 +++++++++++---
 agent/manual_compression_feedback.py   | 10 ++++----
 cli.py                                 | 20 +++++++++++++---
 gateway/run.py                         | 17 ++++++++++---
 run_agent.py                           | 19 +++++++++++----
 tests/acp/test_server.py               |  3 ++-
 tests/cli/test_manual_compress.py      | 20 +++++++++-------
 tests/gateway/test_compress_command.py | 28 ++++++++++++++--------
 tui_gateway/server.py                  | 33 ++++++++++++++++++++++----
 9 files changed, 126 insertions(+), 42 deletions(-)

diff --git a/acp_adapter/server.py b/acp_adapter/server.py
index 39eff2f2..f8dade72 100644
--- a/acp_adapter/server.py
+++ b/acp_adapter/server.py
@@ -1068,10 +1068,16 @@ class HermesACPAgent(acp.Agent):
             if not hasattr(agent, "_compress_context"):
                 return "Context compression not available for this agent."
 
-            from agent.model_metadata import estimate_messages_tokens_rough
+            from agent.model_metadata import estimate_request_tokens_rough
 
             original_count = len(state.history)
-            approx_tokens = estimate_messages_tokens_rough(state.history)
+            # Include system prompt + tool schemas so the figure reflects real
+            # request pressure, not a transcript-only underestimate (#6217).
+            _sys_prompt = getattr(agent, "_cached_system_prompt", "") or ""
+            _tools = getattr(agent, "tools", None) or None
+            approx_tokens = estimate_request_tokens_rough(
+                state.history, system_prompt=_sys_prompt, tools=_tools
+            )
             original_session_db = getattr(agent, "_session_db", None)
 
             try:
@@ -1091,7 +1097,13 @@ class HermesACPAgent(acp.Agent):
             self.session_manager.save_session(state.session_id)
 
             new_count = len(state.history)
-            new_tokens = estimate_messages_tokens_rough(state.history)
+            _sys_prompt_after = getattr(agent, "_cached_system_prompt", "") or _sys_prompt
+            _tools_after = getattr(agent, "tools", None) or _tools
+            new_tokens = estimate_request_tokens_rough(
+                state.history,
+                system_prompt=_sys_prompt_after,
+                tools=_tools_after,
+            )
             return (
                 f"Context compressed: {original_count} -> {new_count} messages\n"
                 f"~{approx_tokens:,} -> ~{new_tokens:,} tokens"
diff --git a/agent/manual_compression_feedback.py b/agent/manual_compression_feedback.py
index 8f2d5e5d..32b00f7c 100644
--- a/agent/manual_compression_feedback.py
+++ b/agent/manual_compression_feedback.py
@@ -20,25 +20,25 @@ def summarize_manual_compression(
         headline = f"No changes from compression: {before_count} messages"
         if after_tokens == before_tokens:
             token_line = (
-                f"Rough transcript estimate: ~{before_tokens:,} tokens (unchanged)"
+                f"Approx request size: ~{before_tokens:,} tokens (unchanged)"
             )
         else:
             token_line = (
-                f"Rough transcript estimate: ~{before_tokens:,} → "
+                f"Approx request size: ~{before_tokens:,} → "
                 f"~{after_tokens:,} tokens"
             )
     else:
         headline = f"Compressed: {before_count} → {after_count} messages"
         token_line = (
-            f"Rough transcript estimate: ~{before_tokens:,} → "
+            f"Approx request size: ~{before_tokens:,} → "
             f"~{after_tokens:,} tokens"
         )
 
     note = None
     if not noop and after_count < before_count and after_tokens > before_tokens:
         note = (
-            "Note: fewer messages can still raise this rough transcript estimate "
-            "when compression rewrites the transcript into denser summaries."
+            "Note: fewer messages can still raise this estimate when "
+            "compression rewrites the transcript into denser summaries."
         )
 
     return {
diff --git a/cli.py b/cli.py
index bef1d87b..dbbf83f2 100644
--- a/cli.py
+++ b/cli.py
@@ -7343,10 +7343,20 @@ class HermesCLI:
         original_count = len(self.conversation_history)
         with self._busy_command("Compressing context..."):
             try:
-                from agent.model_metadata import estimate_messages_tokens_rough
+                from agent.model_metadata import estimate_request_tokens_rough
                 from agent.manual_compression_feedback import summarize_manual_compression
                 original_history = list(self.conversation_history)
-                approx_tokens = estimate_messages_tokens_rough(original_history)
+                # Include system prompt + tool schemas in the estimate —
+                # a transcript-only number understates real request pressure
+                # and can even appear to grow after compression because a
+                # dense handoff summary replaces many short turns (#6217).
+                _sys_prompt = getattr(self.agent, "_cached_system_prompt", "") or ""
+                _tools = getattr(self.agent, "tools", None) or None
+                approx_tokens = estimate_request_tokens_rough(
+                    original_history,
+                    system_prompt=_sys_prompt,
+                    tools=_tools,
+                )
                 if focus_topic:
                     print(f"🗜️  Compressing {original_count} messages (~{approx_tokens:,} tokens), "
                           f"focus: \"{focus_topic}\"...")
@@ -7378,7 +7388,11 @@ class HermesCLI:
                 ):
                     self.session_id = self.agent.session_id
                     self._pending_title = None
-                new_tokens = estimate_messages_tokens_rough(self.conversation_history)
+                new_tokens = estimate_request_tokens_rough(
+                    self.conversation_history,
+                    system_prompt=_sys_prompt,
+                    tools=_tools,
+                )
                 summary = summarize_manual_compression(
                     original_history,
                     self.conversation_history,
diff --git a/gateway/run.py b/gateway/run.py
index 8c2c6478..90faf9a7 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -8512,7 +8512,7 @@ class GatewayRunner:
         try:
             from run_agent import AIAgent
             from agent.manual_compression_feedback import summarize_manual_compression
-            from agent.model_metadata import estimate_messages_tokens_rough
+            from agent.model_metadata import estimate_request_tokens_rough
 
             session_key = self._session_key_for_source(source)
             model, runtime_kwargs = self._resolve_session_agent_runtime(
@@ -8527,7 +8527,6 @@ class GatewayRunner:
                 for m in history
                 if m.get("role") in ("user", "assistant") and m.get("content")
             ]
-            approx_tokens = estimate_messages_tokens_rough(msgs)
 
             tmp_agent = AIAgent(
                 **runtime_kwargs,
@@ -8541,6 +8540,16 @@ class GatewayRunner:
             try:
                 tmp_agent._print_fn = lambda *a, **kw: None
 
+                # Estimate with system prompt + tool schemas included so the
+                # figure reflects real request pressure, not a transcript-only
+                # underestimate (#6217). Must be computed after tmp_agent is
+                # built so _cached_system_prompt/tools are populated.
+                _sys_prompt = getattr(tmp_agent, "_cached_system_prompt", "") or ""
+                _tools = getattr(tmp_agent, "tools", None) or None
+                approx_tokens = estimate_request_tokens_rough(
+                    msgs, system_prompt=_sys_prompt, tools=_tools
+                )
+
                 compressor = tmp_agent.context_compressor
                 if not compressor.has_content_to_compress(msgs):
                     return "Nothing to compress yet (the transcript is still all protected context)."
@@ -8565,7 +8574,9 @@ class GatewayRunner:
                 self.session_store.update_session(
                     session_entry.session_key, last_prompt_tokens=0
                 )
-                new_tokens = estimate_messages_tokens_rough(compressed)
+                new_tokens = estimate_request_tokens_rough(
+                    compressed, system_prompt=_sys_prompt, tools=_tools
+                )
                 summary = summarize_manual_compression(
                     msgs,
                     compressed,
diff --git a/run_agent.py b/run_agent.py
index 0fe6e4a8..4ea0fafe 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -9101,9 +9101,14 @@ class AIAgent:
 
         # Update token estimate after compaction so pressure calculations
         # use the post-compression count, not the stale pre-compression one.
-        _compressed_est = (
-            estimate_tokens_rough(new_system_prompt)
-            + estimate_messages_tokens_rough(compressed)
+        # Use estimate_request_tokens_rough() so tool schemas are included —
+        # with 50+ tools enabled, schemas alone can add 20-30K tokens, and
+        # omitting them delays the next compression cycle far past the
+        # configured threshold (issue #14695).
+        _compressed_est = estimate_request_tokens_rough(
+            compressed,
+            system_prompt=new_system_prompt or "",
+            tools=self.tools or None,
         )
         self.context_compressor.last_prompt_tokens = _compressed_est
         self.context_compressor.last_completion_tokens = 0
@@ -13223,7 +13228,13 @@ class AIAgent:
                         # causing premature compression.  (#12026)
                         _real_tokens = _compressor.last_prompt_tokens
                     else:
-                        _real_tokens = estimate_messages_tokens_rough(messages)
+                        # Include tool schemas — with 50+ tools enabled
+                        # these add 20-30K tokens the messages-only
+                        # estimate misses, which can skip compression
+                        # past the configured threshold (#14695).
+                        _real_tokens = estimate_request_tokens_rough(
+                            messages, tools=self.tools or None
+                        )
 
                     if self.compression_enabled and _compressor.should_compress(_real_tokens):
                         self._safe_print("  ⟳ compacting context…")
diff --git a/tests/acp/test_server.py b/tests/acp/test_server.py
index 6628f0da..35aafc60 100644
--- a/tests/acp/test_server.py
+++ b/tests/acp/test_server.py
@@ -730,6 +730,7 @@ class TestSlashCommands:
         ]
         state.agent.compression_enabled = True
         state.agent._cached_system_prompt = "system"
+        state.agent.tools = None
         original_session_db = object()
         state.agent._session_db = original_session_db
 
@@ -746,7 +747,7 @@ class TestSlashCommands:
         with (
             patch.object(agent.session_manager, "save_session") as mock_save,
             patch(
-                "agent.model_metadata.estimate_messages_tokens_rough",
+                "agent.model_metadata.estimate_request_tokens_rough",
                 side_effect=[40, 12],
             ),
         ):
diff --git a/tests/cli/test_manual_compress.py b/tests/cli/test_manual_compress.py
index 9144c94b..afbde073 100644
--- a/tests/cli/test_manual_compress.py
+++ b/tests/cli/test_manual_compress.py
@@ -21,20 +21,21 @@ def test_manual_compress_reports_noop_without_success_banner(capsys):
     shell.agent = MagicMock()
     shell.agent.compression_enabled = True
     shell.agent._cached_system_prompt = ""
+    shell.agent.tools = None
     shell.agent.session_id = shell.session_id  # no-op compression: no split
     shell.agent._compress_context.return_value = (list(history), "")
 
-    def _estimate(messages):
+    def _estimate(messages, **_kwargs):
         assert messages == history
         return 100
 
-    with patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate):
+    with patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate):
         shell._manual_compress()
 
     output = capsys.readouterr().out
     assert "No changes from compression" in output
     assert "✅ Compressed" not in output
-    assert "Rough transcript estimate: ~100 tokens (unchanged)" in output
+    assert "Approx request size: ~100 tokens (unchanged)" in output
 
 
 def test_manual_compress_explains_when_token_estimate_rises(capsys):
@@ -49,22 +50,23 @@ def test_manual_compress_explains_when_token_estimate_rises(capsys):
     shell.agent = MagicMock()
     shell.agent.compression_enabled = True
     shell.agent._cached_system_prompt = ""
+    shell.agent.tools = None
     shell.agent.session_id = shell.session_id  # no-op: no split
     shell.agent._compress_context.return_value = (compressed, "")
 
-    def _estimate(messages):
+    def _estimate(messages, **_kwargs):
         if messages == history:
             return 100
         if messages == compressed:
             return 120
         raise AssertionError(f"unexpected transcript: {messages!r}")
 
-    with patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate):
+    with patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate):
         shell._manual_compress()
 
     output = capsys.readouterr().out
     assert "✅ Compressed: 4 → 3 messages" in output
-    assert "Rough transcript estimate: ~100 → ~120 tokens" in output
+    assert "Approx request size: ~100 → ~120 tokens" in output
     assert "denser summaries" in output
 
 
@@ -89,6 +91,7 @@ def test_manual_compress_syncs_session_id_after_split():
     shell.agent = MagicMock()
     shell.agent.compression_enabled = True
     shell.agent._cached_system_prompt = ""
+    shell.agent.tools = None
     # Simulate _compress_context mutating agent.session_id as a side effect.
     def _fake_compress(*args, **kwargs):
         shell.agent.session_id = new_child_id
@@ -97,7 +100,7 @@ def test_manual_compress_syncs_session_id_after_split():
     shell.agent.session_id = old_id  # starts in sync
     shell._pending_title = "stale title"
 
-    with patch("agent.model_metadata.estimate_messages_tokens_rough", return_value=100):
+    with patch("agent.model_metadata.estimate_request_tokens_rough", return_value=100):
         shell._manual_compress()
 
     # CLI session_id must now point at the continuation child, not the parent.
@@ -118,11 +121,12 @@ def test_manual_compress_no_sync_when_session_id_unchanged():
     shell.agent = MagicMock()
     shell.agent.compression_enabled = True
     shell.agent._cached_system_prompt = ""
+    shell.agent.tools = None
     shell.agent.session_id = shell.session_id
     shell.agent._compress_context.return_value = (list(history), "")
     shell._pending_title = "keep me"
 
-    with patch("agent.model_metadata.estimate_messages_tokens_rough", return_value=100):
+    with patch("agent.model_metadata.estimate_request_tokens_rough", return_value=100):
         shell._manual_compress()
 
     # No split → pending title untouched.
diff --git a/tests/gateway/test_compress_command.py b/tests/gateway/test_compress_command.py
index 21ff777f..e09e40a0 100644
--- a/tests/gateway/test_compress_command.py
+++ b/tests/gateway/test_compress_command.py
@@ -64,11 +64,13 @@ async def test_compress_command_reports_noop_without_success_banner():
     agent_instance = MagicMock()
     agent_instance.shutdown_memory_provider = MagicMock()
     agent_instance.close = MagicMock()
+    agent_instance._cached_system_prompt = ""
+    agent_instance.tools = None
     agent_instance.context_compressor.has_content_to_compress.return_value = True
     agent_instance.session_id = "sess-1"
     agent_instance._compress_context.return_value = (list(history), "")
 
-    def _estimate(messages):
+    def _estimate(messages, **_kwargs):
         assert messages == history
         return 100
 
@@ -76,13 +78,13 @@ async def test_compress_command_reports_noop_without_success_banner():
         patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "test-key"}),
         patch("gateway.run._resolve_gateway_model", return_value="test-model"),
         patch("run_agent.AIAgent", return_value=agent_instance),
-        patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate),
+        patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate),
     ):
         result = await runner._handle_compress_command(_make_event())
 
     assert "No changes from compression" in result
     assert "Compressed:" not in result
-    assert "Rough transcript estimate: ~100 tokens (unchanged)" in result
+    assert "Approx request size: ~100 tokens (unchanged)" in result
     agent_instance.shutdown_memory_provider.assert_called_once()
     agent_instance.close.assert_called_once()
 
@@ -99,11 +101,13 @@ async def test_compress_command_explains_when_token_estimate_rises():
     agent_instance = MagicMock()
     agent_instance.shutdown_memory_provider = MagicMock()
     agent_instance.close = MagicMock()
+    agent_instance._cached_system_prompt = ""
+    agent_instance.tools = None
     agent_instance.context_compressor.has_content_to_compress.return_value = True
     agent_instance.session_id = "sess-1"
     agent_instance._compress_context.return_value = (compressed, "")
 
-    def _estimate(messages):
+    def _estimate(messages, **_kwargs):
         if messages == history:
             return 100
         if messages == compressed:
@@ -114,12 +118,12 @@ async def test_compress_command_explains_when_token_estimate_rises():
         patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "test-key"}),
         patch("gateway.run._resolve_gateway_model", return_value="test-model"),
         patch("run_agent.AIAgent", return_value=agent_instance),
-        patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate),
+        patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate),
     ):
         result = await runner._handle_compress_command(_make_event())
 
     assert "Compressed: 4 → 3 messages" in result
-    assert "Rough transcript estimate: ~100 → ~120 tokens" in result
+    assert "Approx request size: ~100 → ~120 tokens" in result
     assert "denser summaries" in result
     agent_instance.shutdown_memory_provider.assert_called_once()
     agent_instance.close.assert_called_once()
@@ -143,6 +147,8 @@ async def test_compress_command_appends_warning_when_summary_generation_fails():
     agent_instance = MagicMock()
     agent_instance.shutdown_memory_provider = MagicMock()
     agent_instance.close = MagicMock()
+    agent_instance._cached_system_prompt = ""
+    agent_instance.tools = None
     agent_instance.context_compressor.has_content_to_compress.return_value = True
     # Simulate summary-generation failure: fallback flag set, dropped count
     # populated, error string captured.
@@ -154,7 +160,7 @@ async def test_compress_command_appends_warning_when_summary_generation_fails():
     agent_instance.session_id = "sess-1"
     agent_instance._compress_context.return_value = (compressed, "")
 
-    def _estimate(messages):
+    def _estimate(messages, **_kwargs):
         if messages == history:
             return 100
         if messages == compressed:
@@ -165,7 +171,7 @@ async def test_compress_command_appends_warning_when_summary_generation_fails():
         patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "***"}),
         patch("gateway.run._resolve_gateway_model", return_value="test-model"),
         patch("run_agent.AIAgent", return_value=agent_instance),
-        patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate),
+        patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate),
     ):
         result = await runner._handle_compress_command(_make_event())
 
@@ -200,6 +206,8 @@ async def test_compress_command_surfaces_aux_model_failure_even_when_recovered()
     agent_instance = MagicMock()
     agent_instance.shutdown_memory_provider = MagicMock()
     agent_instance.close = MagicMock()
+    agent_instance._cached_system_prompt = ""
+    agent_instance.tools = None
     agent_instance.context_compressor.has_content_to_compress.return_value = True
     # Fallback placeholder was NOT used — recovery succeeded.
     agent_instance.context_compressor._last_summary_fallback_used = False
@@ -215,7 +223,7 @@ async def test_compress_command_surfaces_aux_model_failure_even_when_recovered()
     agent_instance.session_id = "sess-1"
     agent_instance._compress_context.return_value = (compressed, "")
 
-    def _estimate(messages):
+    def _estimate(messages, **_kwargs):
         if messages == history:
             return 100
         if messages == compressed:
@@ -226,7 +234,7 @@ async def test_compress_command_surfaces_aux_model_failure_even_when_recovered()
         patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "***"}),
         patch("gateway.run._resolve_gateway_model", return_value="test-model"),
         patch("run_agent.AIAgent", return_value=agent_instance),
-        patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate),
+        patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate),
     ):
         result = await runner._handle_compress_command(_make_event())
 
diff --git a/tui_gateway/server.py b/tui_gateway/server.py
index fb8aaa81..4a7f4785 100644
--- a/tui_gateway/server.py
+++ b/tui_gateway/server.py
@@ -1144,7 +1144,7 @@ def _compress_session_history(
     before_messages: list | None = None,
     history_version: int | None = None,
 ) -> tuple[int, dict]:
-    from agent.model_metadata import estimate_messages_tokens_rough
+    from agent.model_metadata import estimate_request_tokens_rough
 
     agent = session["agent"]
     # Snapshot history under the lock so the LLM-bound compression call
@@ -1160,7 +1160,13 @@ def _compress_session_history(
         usage = _get_usage(agent)
         return 0, usage
     if approx_tokens is None:
-        approx_tokens = estimate_messages_tokens_rough(history)
+        # Include system prompt + tool schemas so the figure reflects real
+        # request pressure, not a transcript-only underestimate (#6217).
+        _sys_prompt = getattr(agent, "_cached_system_prompt", "") or ""
+        _tools = getattr(agent, "tools", None) or None
+        approx_tokens = estimate_request_tokens_rough(
+            history, system_prompt=_sys_prompt, tools=_tools
+        )
     # Pass system_message=None so AIAgent._compress_context rebuilds the
     # system prompt cleanly via _build_system_prompt(None). Passing the
     # cached prompt (which already contains the agent identity block)
@@ -2328,14 +2334,21 @@ def _(rid, params: dict) -> dict:
     focus_topic = str(params.get("focus_topic", "") or "").strip()
     try:
         from agent.manual_compression_feedback import summarize_manual_compression
-        from agent.model_metadata import estimate_messages_tokens_rough
+        from agent.model_metadata import estimate_request_tokens_rough
 
         with session["history_lock"]:
             before_messages = list(session.get("history", []))
             history_version = int(session.get("history_version", 0))
         before_count = len(before_messages)
+        _agent = session["agent"]
+        _sys_prompt = getattr(_agent, "_cached_system_prompt", "") or ""
+        _tools = getattr(_agent, "tools", None) or None
         before_tokens = (
-            estimate_messages_tokens_rough(before_messages) if before_count else 0
+            estimate_request_tokens_rough(
+                before_messages, system_prompt=_sys_prompt, tools=_tools
+            )
+            if before_count
+            else 0
         )
 
         if before_count >= 4:
@@ -2358,8 +2371,18 @@ def _(rid, params: dict) -> dict:
             with session["history_lock"]:
                 messages = list(session.get("history", []))
             after_count = len(messages)
+            # Re-read system prompt + tools after compression — _compress_context
+            # may have rebuilt the system prompt (_cached_system_prompt=None).
+            _sys_prompt_after = getattr(_agent, "_cached_system_prompt", "") or _sys_prompt
+            _tools_after = getattr(_agent, "tools", None) or _tools
             after_tokens = (
-                estimate_messages_tokens_rough(messages) if after_count else 0
+                estimate_request_tokens_rough(
+                    messages,
+                    system_prompt=_sys_prompt_after,
+                    tools=_tools_after,
+                )
+                if after_count
+                else 0
             )
             agent = session["agent"]
             _sync_session_key_after_compress(sid, session)