fix(compression): include system prompt + tool schemas in token estimates (#18265)
The user-visible /compress banner and the post-compression last_prompt_tokens writeback both counted only the raw message transcript (chars/4). With a 15KB system prompt and 30 tool schemas (~26KB), a 4-message transcript that looks like ~45 tokens to the transcript-only estimator is really ~10.5K tokens of request pressure — a 234x gap. Two user-facing consequences: - Banner shows 'Compressing … (~45 tokens)…' while compression is actually firing on 10K+ tokens of real pressure, confusing users about why compression triggered (reported by @codecovenant on X; #6217). - Post-compression last_prompt_tokens writeback omits tool schemas, so the next should_compress() check compares real usage against a stale underestimate — compression triggers late, potentially past the model's context limit on small-context models (#14695). Swap estimate_messages_tokens_rough() for estimate_request_tokens_rough() at every user-visible banner and at the post-compression writeback. estimate_request_tokens_rough() already existed for exactly this purpose and includes system prompt + tool schemas. Touched call sites: - run_agent.py: post-compression last_prompt_tokens writeback, post-tool call should_compress() fallback when provider usage is missing - cli.py: /compress banner + summary - gateway/run.py: gateway /compress banner + summary - tui_gateway/server.py: TUI /compress status + summary - acp_adapter/server.py: ACP /compact before/after Left intentionally alone: - Session-hygiene fallback and the 'no agent' /status path in gateway/run.py — no agent instance is in scope to query for system prompt/tools, and the existing 30-50% overestimate wobble on hygiene is safety-accepted. - Verbose-mode 'Request size' logging — informational only, already counts system prompt via api_messages[0]. Also relabels the feedback line from 'Rough transcript estimate' to 'Approx request size' so the metric label matches what it actually measures. 
Credits: diagnoses from @devilardis (#14695) and @Jackten (#6217); user report from @codecovenant on X (2026-04-30). Closes #14695 Closes #6217
This commit is contained in:
parent
41fa1f1b5c
commit
f0dc919f92
@ -1068,10 +1068,16 @@ class HermesACPAgent(acp.Agent):
|
|||||||
if not hasattr(agent, "_compress_context"):
|
if not hasattr(agent, "_compress_context"):
|
||||||
return "Context compression not available for this agent."
|
return "Context compression not available for this agent."
|
||||||
|
|
||||||
from agent.model_metadata import estimate_messages_tokens_rough
|
from agent.model_metadata import estimate_request_tokens_rough
|
||||||
|
|
||||||
original_count = len(state.history)
|
original_count = len(state.history)
|
||||||
approx_tokens = estimate_messages_tokens_rough(state.history)
|
# Include system prompt + tool schemas so the figure reflects real
|
||||||
|
# request pressure, not a transcript-only underestimate (#6217).
|
||||||
|
_sys_prompt = getattr(agent, "_cached_system_prompt", "") or ""
|
||||||
|
_tools = getattr(agent, "tools", None) or None
|
||||||
|
approx_tokens = estimate_request_tokens_rough(
|
||||||
|
state.history, system_prompt=_sys_prompt, tools=_tools
|
||||||
|
)
|
||||||
original_session_db = getattr(agent, "_session_db", None)
|
original_session_db = getattr(agent, "_session_db", None)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -1091,7 +1097,13 @@ class HermesACPAgent(acp.Agent):
|
|||||||
self.session_manager.save_session(state.session_id)
|
self.session_manager.save_session(state.session_id)
|
||||||
|
|
||||||
new_count = len(state.history)
|
new_count = len(state.history)
|
||||||
new_tokens = estimate_messages_tokens_rough(state.history)
|
_sys_prompt_after = getattr(agent, "_cached_system_prompt", "") or _sys_prompt
|
||||||
|
_tools_after = getattr(agent, "tools", None) or _tools
|
||||||
|
new_tokens = estimate_request_tokens_rough(
|
||||||
|
state.history,
|
||||||
|
system_prompt=_sys_prompt_after,
|
||||||
|
tools=_tools_after,
|
||||||
|
)
|
||||||
return (
|
return (
|
||||||
f"Context compressed: {original_count} -> {new_count} messages\n"
|
f"Context compressed: {original_count} -> {new_count} messages\n"
|
||||||
f"~{approx_tokens:,} -> ~{new_tokens:,} tokens"
|
f"~{approx_tokens:,} -> ~{new_tokens:,} tokens"
|
||||||
|
|||||||
@ -20,25 +20,25 @@ def summarize_manual_compression(
|
|||||||
headline = f"No changes from compression: {before_count} messages"
|
headline = f"No changes from compression: {before_count} messages"
|
||||||
if after_tokens == before_tokens:
|
if after_tokens == before_tokens:
|
||||||
token_line = (
|
token_line = (
|
||||||
f"Rough transcript estimate: ~{before_tokens:,} tokens (unchanged)"
|
f"Approx request size: ~{before_tokens:,} tokens (unchanged)"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
token_line = (
|
token_line = (
|
||||||
f"Rough transcript estimate: ~{before_tokens:,} → "
|
f"Approx request size: ~{before_tokens:,} → "
|
||||||
f"~{after_tokens:,} tokens"
|
f"~{after_tokens:,} tokens"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
headline = f"Compressed: {before_count} → {after_count} messages"
|
headline = f"Compressed: {before_count} → {after_count} messages"
|
||||||
token_line = (
|
token_line = (
|
||||||
f"Rough transcript estimate: ~{before_tokens:,} → "
|
f"Approx request size: ~{before_tokens:,} → "
|
||||||
f"~{after_tokens:,} tokens"
|
f"~{after_tokens:,} tokens"
|
||||||
)
|
)
|
||||||
|
|
||||||
note = None
|
note = None
|
||||||
if not noop and after_count < before_count and after_tokens > before_tokens:
|
if not noop and after_count < before_count and after_tokens > before_tokens:
|
||||||
note = (
|
note = (
|
||||||
"Note: fewer messages can still raise this rough transcript estimate "
|
"Note: fewer messages can still raise this estimate when "
|
||||||
"when compression rewrites the transcript into denser summaries."
|
"compression rewrites the transcript into denser summaries."
|
||||||
)
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
20
cli.py
20
cli.py
@ -7343,10 +7343,20 @@ class HermesCLI:
|
|||||||
original_count = len(self.conversation_history)
|
original_count = len(self.conversation_history)
|
||||||
with self._busy_command("Compressing context..."):
|
with self._busy_command("Compressing context..."):
|
||||||
try:
|
try:
|
||||||
from agent.model_metadata import estimate_messages_tokens_rough
|
from agent.model_metadata import estimate_request_tokens_rough
|
||||||
from agent.manual_compression_feedback import summarize_manual_compression
|
from agent.manual_compression_feedback import summarize_manual_compression
|
||||||
original_history = list(self.conversation_history)
|
original_history = list(self.conversation_history)
|
||||||
approx_tokens = estimate_messages_tokens_rough(original_history)
|
# Include system prompt + tool schemas in the estimate —
|
||||||
|
# a transcript-only number understates real request pressure
|
||||||
|
# and can even appear to grow after compression because a
|
||||||
|
# dense handoff summary replaces many short turns (#6217).
|
||||||
|
_sys_prompt = getattr(self.agent, "_cached_system_prompt", "") or ""
|
||||||
|
_tools = getattr(self.agent, "tools", None) or None
|
||||||
|
approx_tokens = estimate_request_tokens_rough(
|
||||||
|
original_history,
|
||||||
|
system_prompt=_sys_prompt,
|
||||||
|
tools=_tools,
|
||||||
|
)
|
||||||
if focus_topic:
|
if focus_topic:
|
||||||
print(f"🗜️ Compressing {original_count} messages (~{approx_tokens:,} tokens), "
|
print(f"🗜️ Compressing {original_count} messages (~{approx_tokens:,} tokens), "
|
||||||
f"focus: \"{focus_topic}\"...")
|
f"focus: \"{focus_topic}\"...")
|
||||||
@ -7378,7 +7388,11 @@ class HermesCLI:
|
|||||||
):
|
):
|
||||||
self.session_id = self.agent.session_id
|
self.session_id = self.agent.session_id
|
||||||
self._pending_title = None
|
self._pending_title = None
|
||||||
new_tokens = estimate_messages_tokens_rough(self.conversation_history)
|
new_tokens = estimate_request_tokens_rough(
|
||||||
|
self.conversation_history,
|
||||||
|
system_prompt=_sys_prompt,
|
||||||
|
tools=_tools,
|
||||||
|
)
|
||||||
summary = summarize_manual_compression(
|
summary = summarize_manual_compression(
|
||||||
original_history,
|
original_history,
|
||||||
self.conversation_history,
|
self.conversation_history,
|
||||||
|
|||||||
@ -8512,7 +8512,7 @@ class GatewayRunner:
|
|||||||
try:
|
try:
|
||||||
from run_agent import AIAgent
|
from run_agent import AIAgent
|
||||||
from agent.manual_compression_feedback import summarize_manual_compression
|
from agent.manual_compression_feedback import summarize_manual_compression
|
||||||
from agent.model_metadata import estimate_messages_tokens_rough
|
from agent.model_metadata import estimate_request_tokens_rough
|
||||||
|
|
||||||
session_key = self._session_key_for_source(source)
|
session_key = self._session_key_for_source(source)
|
||||||
model, runtime_kwargs = self._resolve_session_agent_runtime(
|
model, runtime_kwargs = self._resolve_session_agent_runtime(
|
||||||
@ -8527,7 +8527,6 @@ class GatewayRunner:
|
|||||||
for m in history
|
for m in history
|
||||||
if m.get("role") in ("user", "assistant") and m.get("content")
|
if m.get("role") in ("user", "assistant") and m.get("content")
|
||||||
]
|
]
|
||||||
approx_tokens = estimate_messages_tokens_rough(msgs)
|
|
||||||
|
|
||||||
tmp_agent = AIAgent(
|
tmp_agent = AIAgent(
|
||||||
**runtime_kwargs,
|
**runtime_kwargs,
|
||||||
@ -8541,6 +8540,16 @@ class GatewayRunner:
|
|||||||
try:
|
try:
|
||||||
tmp_agent._print_fn = lambda *a, **kw: None
|
tmp_agent._print_fn = lambda *a, **kw: None
|
||||||
|
|
||||||
|
# Estimate with system prompt + tool schemas included so the
|
||||||
|
# figure reflects real request pressure, not a transcript-only
|
||||||
|
# underestimate (#6217). Must be computed after tmp_agent is
|
||||||
|
# built so _cached_system_prompt/tools are populated.
|
||||||
|
_sys_prompt = getattr(tmp_agent, "_cached_system_prompt", "") or ""
|
||||||
|
_tools = getattr(tmp_agent, "tools", None) or None
|
||||||
|
approx_tokens = estimate_request_tokens_rough(
|
||||||
|
msgs, system_prompt=_sys_prompt, tools=_tools
|
||||||
|
)
|
||||||
|
|
||||||
compressor = tmp_agent.context_compressor
|
compressor = tmp_agent.context_compressor
|
||||||
if not compressor.has_content_to_compress(msgs):
|
if not compressor.has_content_to_compress(msgs):
|
||||||
return "Nothing to compress yet (the transcript is still all protected context)."
|
return "Nothing to compress yet (the transcript is still all protected context)."
|
||||||
@ -8565,7 +8574,9 @@ class GatewayRunner:
|
|||||||
self.session_store.update_session(
|
self.session_store.update_session(
|
||||||
session_entry.session_key, last_prompt_tokens=0
|
session_entry.session_key, last_prompt_tokens=0
|
||||||
)
|
)
|
||||||
new_tokens = estimate_messages_tokens_rough(compressed)
|
new_tokens = estimate_request_tokens_rough(
|
||||||
|
compressed, system_prompt=_sys_prompt, tools=_tools
|
||||||
|
)
|
||||||
summary = summarize_manual_compression(
|
summary = summarize_manual_compression(
|
||||||
msgs,
|
msgs,
|
||||||
compressed,
|
compressed,
|
||||||
|
|||||||
19
run_agent.py
19
run_agent.py
@ -9101,9 +9101,14 @@ class AIAgent:
|
|||||||
|
|
||||||
# Update token estimate after compaction so pressure calculations
|
# Update token estimate after compaction so pressure calculations
|
||||||
# use the post-compression count, not the stale pre-compression one.
|
# use the post-compression count, not the stale pre-compression one.
|
||||||
_compressed_est = (
|
# Use estimate_request_tokens_rough() so tool schemas are included —
|
||||||
estimate_tokens_rough(new_system_prompt)
|
# with 50+ tools enabled, schemas alone can add 20-30K tokens, and
|
||||||
+ estimate_messages_tokens_rough(compressed)
|
# omitting them delays the next compression cycle far past the
|
||||||
|
# configured threshold (issue #14695).
|
||||||
|
_compressed_est = estimate_request_tokens_rough(
|
||||||
|
compressed,
|
||||||
|
system_prompt=new_system_prompt or "",
|
||||||
|
tools=self.tools or None,
|
||||||
)
|
)
|
||||||
self.context_compressor.last_prompt_tokens = _compressed_est
|
self.context_compressor.last_prompt_tokens = _compressed_est
|
||||||
self.context_compressor.last_completion_tokens = 0
|
self.context_compressor.last_completion_tokens = 0
|
||||||
@ -13223,7 +13228,13 @@ class AIAgent:
|
|||||||
# causing premature compression. (#12026)
|
# causing premature compression. (#12026)
|
||||||
_real_tokens = _compressor.last_prompt_tokens
|
_real_tokens = _compressor.last_prompt_tokens
|
||||||
else:
|
else:
|
||||||
_real_tokens = estimate_messages_tokens_rough(messages)
|
# Include tool schemas — with 50+ tools enabled
|
||||||
|
# these add 20-30K tokens the messages-only
|
||||||
|
# estimate misses, which can skip compression
|
||||||
|
# past the configured threshold (#14695).
|
||||||
|
_real_tokens = estimate_request_tokens_rough(
|
||||||
|
messages, tools=self.tools or None
|
||||||
|
)
|
||||||
|
|
||||||
if self.compression_enabled and _compressor.should_compress(_real_tokens):
|
if self.compression_enabled and _compressor.should_compress(_real_tokens):
|
||||||
self._safe_print(" ⟳ compacting context…")
|
self._safe_print(" ⟳ compacting context…")
|
||||||
|
|||||||
@ -730,6 +730,7 @@ class TestSlashCommands:
|
|||||||
]
|
]
|
||||||
state.agent.compression_enabled = True
|
state.agent.compression_enabled = True
|
||||||
state.agent._cached_system_prompt = "system"
|
state.agent._cached_system_prompt = "system"
|
||||||
|
state.agent.tools = None
|
||||||
original_session_db = object()
|
original_session_db = object()
|
||||||
state.agent._session_db = original_session_db
|
state.agent._session_db = original_session_db
|
||||||
|
|
||||||
@ -746,7 +747,7 @@ class TestSlashCommands:
|
|||||||
with (
|
with (
|
||||||
patch.object(agent.session_manager, "save_session") as mock_save,
|
patch.object(agent.session_manager, "save_session") as mock_save,
|
||||||
patch(
|
patch(
|
||||||
"agent.model_metadata.estimate_messages_tokens_rough",
|
"agent.model_metadata.estimate_request_tokens_rough",
|
||||||
side_effect=[40, 12],
|
side_effect=[40, 12],
|
||||||
),
|
),
|
||||||
):
|
):
|
||||||
|
|||||||
@ -21,20 +21,21 @@ def test_manual_compress_reports_noop_without_success_banner(capsys):
|
|||||||
shell.agent = MagicMock()
|
shell.agent = MagicMock()
|
||||||
shell.agent.compression_enabled = True
|
shell.agent.compression_enabled = True
|
||||||
shell.agent._cached_system_prompt = ""
|
shell.agent._cached_system_prompt = ""
|
||||||
|
shell.agent.tools = None
|
||||||
shell.agent.session_id = shell.session_id # no-op compression: no split
|
shell.agent.session_id = shell.session_id # no-op compression: no split
|
||||||
shell.agent._compress_context.return_value = (list(history), "")
|
shell.agent._compress_context.return_value = (list(history), "")
|
||||||
|
|
||||||
def _estimate(messages):
|
def _estimate(messages, **_kwargs):
|
||||||
assert messages == history
|
assert messages == history
|
||||||
return 100
|
return 100
|
||||||
|
|
||||||
with patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate):
|
with patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate):
|
||||||
shell._manual_compress()
|
shell._manual_compress()
|
||||||
|
|
||||||
output = capsys.readouterr().out
|
output = capsys.readouterr().out
|
||||||
assert "No changes from compression" in output
|
assert "No changes from compression" in output
|
||||||
assert "✅ Compressed" not in output
|
assert "✅ Compressed" not in output
|
||||||
assert "Rough transcript estimate: ~100 tokens (unchanged)" in output
|
assert "Approx request size: ~100 tokens (unchanged)" in output
|
||||||
|
|
||||||
|
|
||||||
def test_manual_compress_explains_when_token_estimate_rises(capsys):
|
def test_manual_compress_explains_when_token_estimate_rises(capsys):
|
||||||
@ -49,22 +50,23 @@ def test_manual_compress_explains_when_token_estimate_rises(capsys):
|
|||||||
shell.agent = MagicMock()
|
shell.agent = MagicMock()
|
||||||
shell.agent.compression_enabled = True
|
shell.agent.compression_enabled = True
|
||||||
shell.agent._cached_system_prompt = ""
|
shell.agent._cached_system_prompt = ""
|
||||||
|
shell.agent.tools = None
|
||||||
shell.agent.session_id = shell.session_id # no-op: no split
|
shell.agent.session_id = shell.session_id # no-op: no split
|
||||||
shell.agent._compress_context.return_value = (compressed, "")
|
shell.agent._compress_context.return_value = (compressed, "")
|
||||||
|
|
||||||
def _estimate(messages):
|
def _estimate(messages, **_kwargs):
|
||||||
if messages == history:
|
if messages == history:
|
||||||
return 100
|
return 100
|
||||||
if messages == compressed:
|
if messages == compressed:
|
||||||
return 120
|
return 120
|
||||||
raise AssertionError(f"unexpected transcript: {messages!r}")
|
raise AssertionError(f"unexpected transcript: {messages!r}")
|
||||||
|
|
||||||
with patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate):
|
with patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate):
|
||||||
shell._manual_compress()
|
shell._manual_compress()
|
||||||
|
|
||||||
output = capsys.readouterr().out
|
output = capsys.readouterr().out
|
||||||
assert "✅ Compressed: 4 → 3 messages" in output
|
assert "✅ Compressed: 4 → 3 messages" in output
|
||||||
assert "Rough transcript estimate: ~100 → ~120 tokens" in output
|
assert "Approx request size: ~100 → ~120 tokens" in output
|
||||||
assert "denser summaries" in output
|
assert "denser summaries" in output
|
||||||
|
|
||||||
|
|
||||||
@ -89,6 +91,7 @@ def test_manual_compress_syncs_session_id_after_split():
|
|||||||
shell.agent = MagicMock()
|
shell.agent = MagicMock()
|
||||||
shell.agent.compression_enabled = True
|
shell.agent.compression_enabled = True
|
||||||
shell.agent._cached_system_prompt = ""
|
shell.agent._cached_system_prompt = ""
|
||||||
|
shell.agent.tools = None
|
||||||
# Simulate _compress_context mutating agent.session_id as a side effect.
|
# Simulate _compress_context mutating agent.session_id as a side effect.
|
||||||
def _fake_compress(*args, **kwargs):
|
def _fake_compress(*args, **kwargs):
|
||||||
shell.agent.session_id = new_child_id
|
shell.agent.session_id = new_child_id
|
||||||
@ -97,7 +100,7 @@ def test_manual_compress_syncs_session_id_after_split():
|
|||||||
shell.agent.session_id = old_id # starts in sync
|
shell.agent.session_id = old_id # starts in sync
|
||||||
shell._pending_title = "stale title"
|
shell._pending_title = "stale title"
|
||||||
|
|
||||||
with patch("agent.model_metadata.estimate_messages_tokens_rough", return_value=100):
|
with patch("agent.model_metadata.estimate_request_tokens_rough", return_value=100):
|
||||||
shell._manual_compress()
|
shell._manual_compress()
|
||||||
|
|
||||||
# CLI session_id must now point at the continuation child, not the parent.
|
# CLI session_id must now point at the continuation child, not the parent.
|
||||||
@ -118,11 +121,12 @@ def test_manual_compress_no_sync_when_session_id_unchanged():
|
|||||||
shell.agent = MagicMock()
|
shell.agent = MagicMock()
|
||||||
shell.agent.compression_enabled = True
|
shell.agent.compression_enabled = True
|
||||||
shell.agent._cached_system_prompt = ""
|
shell.agent._cached_system_prompt = ""
|
||||||
|
shell.agent.tools = None
|
||||||
shell.agent.session_id = shell.session_id
|
shell.agent.session_id = shell.session_id
|
||||||
shell.agent._compress_context.return_value = (list(history), "")
|
shell.agent._compress_context.return_value = (list(history), "")
|
||||||
shell._pending_title = "keep me"
|
shell._pending_title = "keep me"
|
||||||
|
|
||||||
with patch("agent.model_metadata.estimate_messages_tokens_rough", return_value=100):
|
with patch("agent.model_metadata.estimate_request_tokens_rough", return_value=100):
|
||||||
shell._manual_compress()
|
shell._manual_compress()
|
||||||
|
|
||||||
# No split → pending title untouched.
|
# No split → pending title untouched.
|
||||||
|
|||||||
@ -64,11 +64,13 @@ async def test_compress_command_reports_noop_without_success_banner():
|
|||||||
agent_instance = MagicMock()
|
agent_instance = MagicMock()
|
||||||
agent_instance.shutdown_memory_provider = MagicMock()
|
agent_instance.shutdown_memory_provider = MagicMock()
|
||||||
agent_instance.close = MagicMock()
|
agent_instance.close = MagicMock()
|
||||||
|
agent_instance._cached_system_prompt = ""
|
||||||
|
agent_instance.tools = None
|
||||||
agent_instance.context_compressor.has_content_to_compress.return_value = True
|
agent_instance.context_compressor.has_content_to_compress.return_value = True
|
||||||
agent_instance.session_id = "sess-1"
|
agent_instance.session_id = "sess-1"
|
||||||
agent_instance._compress_context.return_value = (list(history), "")
|
agent_instance._compress_context.return_value = (list(history), "")
|
||||||
|
|
||||||
def _estimate(messages):
|
def _estimate(messages, **_kwargs):
|
||||||
assert messages == history
|
assert messages == history
|
||||||
return 100
|
return 100
|
||||||
|
|
||||||
@ -76,13 +78,13 @@ async def test_compress_command_reports_noop_without_success_banner():
|
|||||||
patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "test-key"}),
|
patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "test-key"}),
|
||||||
patch("gateway.run._resolve_gateway_model", return_value="test-model"),
|
patch("gateway.run._resolve_gateway_model", return_value="test-model"),
|
||||||
patch("run_agent.AIAgent", return_value=agent_instance),
|
patch("run_agent.AIAgent", return_value=agent_instance),
|
||||||
patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate),
|
patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate),
|
||||||
):
|
):
|
||||||
result = await runner._handle_compress_command(_make_event())
|
result = await runner._handle_compress_command(_make_event())
|
||||||
|
|
||||||
assert "No changes from compression" in result
|
assert "No changes from compression" in result
|
||||||
assert "Compressed:" not in result
|
assert "Compressed:" not in result
|
||||||
assert "Rough transcript estimate: ~100 tokens (unchanged)" in result
|
assert "Approx request size: ~100 tokens (unchanged)" in result
|
||||||
agent_instance.shutdown_memory_provider.assert_called_once()
|
agent_instance.shutdown_memory_provider.assert_called_once()
|
||||||
agent_instance.close.assert_called_once()
|
agent_instance.close.assert_called_once()
|
||||||
|
|
||||||
@ -99,11 +101,13 @@ async def test_compress_command_explains_when_token_estimate_rises():
|
|||||||
agent_instance = MagicMock()
|
agent_instance = MagicMock()
|
||||||
agent_instance.shutdown_memory_provider = MagicMock()
|
agent_instance.shutdown_memory_provider = MagicMock()
|
||||||
agent_instance.close = MagicMock()
|
agent_instance.close = MagicMock()
|
||||||
|
agent_instance._cached_system_prompt = ""
|
||||||
|
agent_instance.tools = None
|
||||||
agent_instance.context_compressor.has_content_to_compress.return_value = True
|
agent_instance.context_compressor.has_content_to_compress.return_value = True
|
||||||
agent_instance.session_id = "sess-1"
|
agent_instance.session_id = "sess-1"
|
||||||
agent_instance._compress_context.return_value = (compressed, "")
|
agent_instance._compress_context.return_value = (compressed, "")
|
||||||
|
|
||||||
def _estimate(messages):
|
def _estimate(messages, **_kwargs):
|
||||||
if messages == history:
|
if messages == history:
|
||||||
return 100
|
return 100
|
||||||
if messages == compressed:
|
if messages == compressed:
|
||||||
@ -114,12 +118,12 @@ async def test_compress_command_explains_when_token_estimate_rises():
|
|||||||
patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "test-key"}),
|
patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "test-key"}),
|
||||||
patch("gateway.run._resolve_gateway_model", return_value="test-model"),
|
patch("gateway.run._resolve_gateway_model", return_value="test-model"),
|
||||||
patch("run_agent.AIAgent", return_value=agent_instance),
|
patch("run_agent.AIAgent", return_value=agent_instance),
|
||||||
patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate),
|
patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate),
|
||||||
):
|
):
|
||||||
result = await runner._handle_compress_command(_make_event())
|
result = await runner._handle_compress_command(_make_event())
|
||||||
|
|
||||||
assert "Compressed: 4 → 3 messages" in result
|
assert "Compressed: 4 → 3 messages" in result
|
||||||
assert "Rough transcript estimate: ~100 → ~120 tokens" in result
|
assert "Approx request size: ~100 → ~120 tokens" in result
|
||||||
assert "denser summaries" in result
|
assert "denser summaries" in result
|
||||||
agent_instance.shutdown_memory_provider.assert_called_once()
|
agent_instance.shutdown_memory_provider.assert_called_once()
|
||||||
agent_instance.close.assert_called_once()
|
agent_instance.close.assert_called_once()
|
||||||
@ -143,6 +147,8 @@ async def test_compress_command_appends_warning_when_summary_generation_fails():
|
|||||||
agent_instance = MagicMock()
|
agent_instance = MagicMock()
|
||||||
agent_instance.shutdown_memory_provider = MagicMock()
|
agent_instance.shutdown_memory_provider = MagicMock()
|
||||||
agent_instance.close = MagicMock()
|
agent_instance.close = MagicMock()
|
||||||
|
agent_instance._cached_system_prompt = ""
|
||||||
|
agent_instance.tools = None
|
||||||
agent_instance.context_compressor.has_content_to_compress.return_value = True
|
agent_instance.context_compressor.has_content_to_compress.return_value = True
|
||||||
# Simulate summary-generation failure: fallback flag set, dropped count
|
# Simulate summary-generation failure: fallback flag set, dropped count
|
||||||
# populated, error string captured.
|
# populated, error string captured.
|
||||||
@ -154,7 +160,7 @@ async def test_compress_command_appends_warning_when_summary_generation_fails():
|
|||||||
agent_instance.session_id = "sess-1"
|
agent_instance.session_id = "sess-1"
|
||||||
agent_instance._compress_context.return_value = (compressed, "")
|
agent_instance._compress_context.return_value = (compressed, "")
|
||||||
|
|
||||||
def _estimate(messages):
|
def _estimate(messages, **_kwargs):
|
||||||
if messages == history:
|
if messages == history:
|
||||||
return 100
|
return 100
|
||||||
if messages == compressed:
|
if messages == compressed:
|
||||||
@ -165,7 +171,7 @@ async def test_compress_command_appends_warning_when_summary_generation_fails():
|
|||||||
patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "***"}),
|
patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "***"}),
|
||||||
patch("gateway.run._resolve_gateway_model", return_value="test-model"),
|
patch("gateway.run._resolve_gateway_model", return_value="test-model"),
|
||||||
patch("run_agent.AIAgent", return_value=agent_instance),
|
patch("run_agent.AIAgent", return_value=agent_instance),
|
||||||
patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate),
|
patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate),
|
||||||
):
|
):
|
||||||
result = await runner._handle_compress_command(_make_event())
|
result = await runner._handle_compress_command(_make_event())
|
||||||
|
|
||||||
@ -200,6 +206,8 @@ async def test_compress_command_surfaces_aux_model_failure_even_when_recovered()
|
|||||||
agent_instance = MagicMock()
|
agent_instance = MagicMock()
|
||||||
agent_instance.shutdown_memory_provider = MagicMock()
|
agent_instance.shutdown_memory_provider = MagicMock()
|
||||||
agent_instance.close = MagicMock()
|
agent_instance.close = MagicMock()
|
||||||
|
agent_instance._cached_system_prompt = ""
|
||||||
|
agent_instance.tools = None
|
||||||
agent_instance.context_compressor.has_content_to_compress.return_value = True
|
agent_instance.context_compressor.has_content_to_compress.return_value = True
|
||||||
# Fallback placeholder was NOT used — recovery succeeded.
|
# Fallback placeholder was NOT used — recovery succeeded.
|
||||||
agent_instance.context_compressor._last_summary_fallback_used = False
|
agent_instance.context_compressor._last_summary_fallback_used = False
|
||||||
@ -215,7 +223,7 @@ async def test_compress_command_surfaces_aux_model_failure_even_when_recovered()
|
|||||||
agent_instance.session_id = "sess-1"
|
agent_instance.session_id = "sess-1"
|
||||||
agent_instance._compress_context.return_value = (compressed, "")
|
agent_instance._compress_context.return_value = (compressed, "")
|
||||||
|
|
||||||
def _estimate(messages):
|
def _estimate(messages, **_kwargs):
|
||||||
if messages == history:
|
if messages == history:
|
||||||
return 100
|
return 100
|
||||||
if messages == compressed:
|
if messages == compressed:
|
||||||
@ -226,7 +234,7 @@ async def test_compress_command_surfaces_aux_model_failure_even_when_recovered()
|
|||||||
patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "***"}),
|
patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "***"}),
|
||||||
patch("gateway.run._resolve_gateway_model", return_value="test-model"),
|
patch("gateway.run._resolve_gateway_model", return_value="test-model"),
|
||||||
patch("run_agent.AIAgent", return_value=agent_instance),
|
patch("run_agent.AIAgent", return_value=agent_instance),
|
||||||
patch("agent.model_metadata.estimate_messages_tokens_rough", side_effect=_estimate),
|
patch("agent.model_metadata.estimate_request_tokens_rough", side_effect=_estimate),
|
||||||
):
|
):
|
||||||
result = await runner._handle_compress_command(_make_event())
|
result = await runner._handle_compress_command(_make_event())
|
||||||
|
|
||||||
|
|||||||
@ -1144,7 +1144,7 @@ def _compress_session_history(
|
|||||||
before_messages: list | None = None,
|
before_messages: list | None = None,
|
||||||
history_version: int | None = None,
|
history_version: int | None = None,
|
||||||
) -> tuple[int, dict]:
|
) -> tuple[int, dict]:
|
||||||
from agent.model_metadata import estimate_messages_tokens_rough
|
from agent.model_metadata import estimate_request_tokens_rough
|
||||||
|
|
||||||
agent = session["agent"]
|
agent = session["agent"]
|
||||||
# Snapshot history under the lock so the LLM-bound compression call
|
# Snapshot history under the lock so the LLM-bound compression call
|
||||||
@ -1160,7 +1160,13 @@ def _compress_session_history(
|
|||||||
usage = _get_usage(agent)
|
usage = _get_usage(agent)
|
||||||
return 0, usage
|
return 0, usage
|
||||||
if approx_tokens is None:
|
if approx_tokens is None:
|
||||||
approx_tokens = estimate_messages_tokens_rough(history)
|
# Include system prompt + tool schemas so the figure reflects real
|
||||||
|
# request pressure, not a transcript-only underestimate (#6217).
|
||||||
|
_sys_prompt = getattr(agent, "_cached_system_prompt", "") or ""
|
||||||
|
_tools = getattr(agent, "tools", None) or None
|
||||||
|
approx_tokens = estimate_request_tokens_rough(
|
||||||
|
history, system_prompt=_sys_prompt, tools=_tools
|
||||||
|
)
|
||||||
# Pass system_message=None so AIAgent._compress_context rebuilds the
|
# Pass system_message=None so AIAgent._compress_context rebuilds the
|
||||||
# system prompt cleanly via _build_system_prompt(None). Passing the
|
# system prompt cleanly via _build_system_prompt(None). Passing the
|
||||||
# cached prompt (which already contains the agent identity block)
|
# cached prompt (which already contains the agent identity block)
|
||||||
@ -2328,14 +2334,21 @@ def _(rid, params: dict) -> dict:
|
|||||||
focus_topic = str(params.get("focus_topic", "") or "").strip()
|
focus_topic = str(params.get("focus_topic", "") or "").strip()
|
||||||
try:
|
try:
|
||||||
from agent.manual_compression_feedback import summarize_manual_compression
|
from agent.manual_compression_feedback import summarize_manual_compression
|
||||||
from agent.model_metadata import estimate_messages_tokens_rough
|
from agent.model_metadata import estimate_request_tokens_rough
|
||||||
|
|
||||||
with session["history_lock"]:
|
with session["history_lock"]:
|
||||||
before_messages = list(session.get("history", []))
|
before_messages = list(session.get("history", []))
|
||||||
history_version = int(session.get("history_version", 0))
|
history_version = int(session.get("history_version", 0))
|
||||||
before_count = len(before_messages)
|
before_count = len(before_messages)
|
||||||
|
_agent = session["agent"]
|
||||||
|
_sys_prompt = getattr(_agent, "_cached_system_prompt", "") or ""
|
||||||
|
_tools = getattr(_agent, "tools", None) or None
|
||||||
before_tokens = (
|
before_tokens = (
|
||||||
estimate_messages_tokens_rough(before_messages) if before_count else 0
|
estimate_request_tokens_rough(
|
||||||
|
before_messages, system_prompt=_sys_prompt, tools=_tools
|
||||||
|
)
|
||||||
|
if before_count
|
||||||
|
else 0
|
||||||
)
|
)
|
||||||
|
|
||||||
if before_count >= 4:
|
if before_count >= 4:
|
||||||
@ -2358,8 +2371,18 @@ def _(rid, params: dict) -> dict:
|
|||||||
with session["history_lock"]:
|
with session["history_lock"]:
|
||||||
messages = list(session.get("history", []))
|
messages = list(session.get("history", []))
|
||||||
after_count = len(messages)
|
after_count = len(messages)
|
||||||
|
# Re-read system prompt + tools after compression — _compress_context
|
||||||
|
# may have rebuilt the system prompt (_cached_system_prompt=None).
|
||||||
|
_sys_prompt_after = getattr(_agent, "_cached_system_prompt", "") or _sys_prompt
|
||||||
|
_tools_after = getattr(_agent, "tools", None) or _tools
|
||||||
after_tokens = (
|
after_tokens = (
|
||||||
estimate_messages_tokens_rough(messages) if after_count else 0
|
estimate_request_tokens_rough(
|
||||||
|
messages,
|
||||||
|
system_prompt=_sys_prompt_after,
|
||||||
|
tools=_tools_after,
|
||||||
|
)
|
||||||
|
if after_count
|
||||||
|
else 0
|
||||||
)
|
)
|
||||||
agent = session["agent"]
|
agent = session["agent"]
|
||||||
_sync_session_key_after_compress(sid, session)
|
_sync_session_key_after_compress(sid, session)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user