fix(anthropic): use model-native output limits instead of hardcoded 16K (#3426)

The Anthropic adapter defaulted to max_tokens=16384 when no explicit value
was configured.  This severely limits thinking-enabled models where thinking
tokens count toward max_tokens:

- Claude Opus 4.6 supports 128K output but was capped at 16K
- Claude Sonnet 4.6 supports 64K output but was capped at 16K

With extended thinking (adaptive or budget-based), the model could exhaust
the entire 16K on reasoning, leaving zero tokens for the actual response.
This caused two user-visible errors:
- 'Response truncated (finish_reason=length)' — thinking consumed most tokens
- 'Response only contains think block with no content' — thinking consumed all

Fix: add _ANTHROPIC_OUTPUT_LIMITS lookup table (sourced from Anthropic docs
and Cline's model catalog) and use the model's actual output limit as the
default.  Unknown future models default to 128K (the current maximum).

Also adds context_length clamping: if the user configured a smaller context
window (e.g. custom endpoint), max_tokens is clamped to context_length - 1
to avoid exceeding the window.

Closes #2706
This commit is contained in:
Teknium 2026-03-27 13:02:52 -07:00 committed by GitHub
parent fb46a90098
commit 6f11ff53ad
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 196 additions and 3 deletions

View File

@ -35,6 +35,54 @@ ADAPTIVE_EFFORT_MAP = {
"minimal": "low",
}
# ── Max output token limits per Anthropic model ───────────────────────
# Source: Anthropic docs + Cline model catalog. Anthropic's API requires
# max_tokens as a mandatory field. Previously we hardcoded 16384, which
# starves thinking-enabled models (thinking tokens count toward the limit).
_ANTHROPIC_OUTPUT_LIMITS = {
# Claude 4.6
"claude-opus-4-6": 128_000,
"claude-sonnet-4-6": 64_000,
# Claude 4.5
"claude-opus-4-5": 64_000,
"claude-sonnet-4-5": 64_000,
"claude-haiku-4-5": 64_000,
# Claude 4
"claude-opus-4": 32_000,
"claude-sonnet-4": 64_000,
# Claude 3.7
"claude-3-7-sonnet": 128_000,
# Claude 3.5
"claude-3-5-sonnet": 8_192,
"claude-3-5-haiku": 8_192,
# Claude 3
"claude-3-opus": 4_096,
"claude-3-sonnet": 4_096,
"claude-3-haiku": 4_096,
}
# For any model not in the table, assume the highest current limit.
# Future Anthropic models are unlikely to have *less* output capacity.
_ANTHROPIC_DEFAULT_OUTPUT_LIMIT = 128_000
def _get_anthropic_max_output(model: str) -> int:
"""Look up the max output token limit for an Anthropic model.
Uses substring matching against _ANTHROPIC_OUTPUT_LIMITS so date-stamped
model IDs (claude-sonnet-4-5-20250929) and variant suffixes (:1m, :fast)
resolve correctly. Longest-prefix match wins to avoid e.g. "claude-3-5"
matching before "claude-3-5-sonnet".
"""
m = model.lower()
best_key = ""
best_val = _ANTHROPIC_DEFAULT_OUTPUT_LIMIT
for key, val in _ANTHROPIC_OUTPUT_LIMITS.items():
if key in m and len(key) > len(best_key):
best_key = key
best_val = val
return best_val
def _supports_adaptive_thinking(model: str) -> bool:
"""Return True for Claude 4.6 models that support adaptive thinking."""
@ -818,9 +866,15 @@ def build_anthropic_kwargs(
tool_choice: Optional[str] = None,
is_oauth: bool = False,
preserve_dots: bool = False,
context_length: Optional[int] = None,
) -> Dict[str, Any]:
"""Build kwargs for anthropic.messages.create().
When *max_tokens* is None, the model's native output limit is used
(e.g. 128K for Opus 4.6, 64K for Sonnet 4.6). If *context_length*
is provided, the effective limit is clamped so it doesn't exceed
the context window.
When *is_oauth* is True, applies Claude Code compatibility transforms:
system prompt prefix, tool name prefixing, and prompt sanitization.
@ -831,7 +885,12 @@ def build_anthropic_kwargs(
anthropic_tools = convert_tools_to_anthropic(tools) if tools else []
model = normalize_model_name(model, preserve_dots=preserve_dots)
effective_max_tokens = max_tokens or 16384
effective_max_tokens = max_tokens or _get_anthropic_max_output(model)
# Clamp to context window if the user set a lower context_length
# (e.g. custom endpoint with limited capacity).
if context_length and effective_max_tokens > context_length:
effective_max_tokens = max(context_length - 1, 1)
# ── OAuth: Claude Code identity ──────────────────────────────────
if is_oauth:

View File

@ -4378,6 +4378,10 @@ class AIAgent:
if self.api_mode == "anthropic_messages":
from agent.anthropic_adapter import build_anthropic_kwargs
anthropic_messages = self._prepare_anthropic_messages_for_api(api_messages)
# Pass context_length so the adapter can clamp max_tokens if the
# user configured a smaller context window than the model's output limit.
ctx_len = getattr(self, "context_compressor", None)
ctx_len = ctx_len.context_length if ctx_len else None
return build_anthropic_kwargs(
model=self.model,
messages=anthropic_messages,
@ -4386,6 +4390,7 @@ class AIAgent:
reasoning_config=self.reasoning_config,
is_oauth=self._is_anthropic_oauth,
preserve_dots=self._anthropic_preserve_dots(),
context_length=ctx_len,
)
if self.api_mode == "codex_responses":

View File

@ -926,7 +926,8 @@ class TestBuildAnthropicKwargs:
)
assert "thinking" not in kwargs
def test_default_max_tokens(self):
def test_default_max_tokens_uses_model_output_limit(self):
"""When max_tokens is None, use the model's native output limit."""
kwargs = build_anthropic_kwargs(
model="claude-sonnet-4-20250514",
messages=[{"role": "user", "content": "Hi"}],
@ -934,7 +935,135 @@ class TestBuildAnthropicKwargs:
max_tokens=None,
reasoning_config=None,
)
assert kwargs["max_tokens"] == 16384
assert kwargs["max_tokens"] == 64_000 # Sonnet 4 output limit
def test_default_max_tokens_opus_4_6(self):
kwargs = build_anthropic_kwargs(
model="claude-opus-4-6",
messages=[{"role": "user", "content": "Hi"}],
tools=None,
max_tokens=None,
reasoning_config=None,
)
assert kwargs["max_tokens"] == 128_000
def test_default_max_tokens_sonnet_4_6(self):
kwargs = build_anthropic_kwargs(
model="claude-sonnet-4-6",
messages=[{"role": "user", "content": "Hi"}],
tools=None,
max_tokens=None,
reasoning_config=None,
)
assert kwargs["max_tokens"] == 64_000
def test_default_max_tokens_date_stamped_model(self):
"""Date-stamped model IDs should resolve via substring match."""
kwargs = build_anthropic_kwargs(
model="claude-sonnet-4-5-20250929",
messages=[{"role": "user", "content": "Hi"}],
tools=None,
max_tokens=None,
reasoning_config=None,
)
assert kwargs["max_tokens"] == 64_000
def test_default_max_tokens_older_model(self):
kwargs = build_anthropic_kwargs(
model="claude-3-5-sonnet-20241022",
messages=[{"role": "user", "content": "Hi"}],
tools=None,
max_tokens=None,
reasoning_config=None,
)
assert kwargs["max_tokens"] == 8_192
def test_default_max_tokens_unknown_model_uses_highest(self):
"""Unknown future models should get the highest known limit."""
kwargs = build_anthropic_kwargs(
model="claude-ultra-5-20260101",
messages=[{"role": "user", "content": "Hi"}],
tools=None,
max_tokens=None,
reasoning_config=None,
)
assert kwargs["max_tokens"] == 128_000
def test_explicit_max_tokens_overrides_default(self):
"""User-specified max_tokens should be respected."""
kwargs = build_anthropic_kwargs(
model="claude-opus-4-6",
messages=[{"role": "user", "content": "Hi"}],
tools=None,
max_tokens=4096,
reasoning_config=None,
)
assert kwargs["max_tokens"] == 4096
def test_context_length_clamp(self):
"""max_tokens should be clamped to context_length if it's smaller."""
kwargs = build_anthropic_kwargs(
model="claude-opus-4-6", # 128K output
messages=[{"role": "user", "content": "Hi"}],
tools=None,
max_tokens=None,
reasoning_config=None,
context_length=50000,
)
assert kwargs["max_tokens"] == 49999 # context_length - 1
def test_context_length_no_clamp_when_larger(self):
"""No clamping when context_length exceeds output limit."""
kwargs = build_anthropic_kwargs(
model="claude-sonnet-4-6", # 64K output
messages=[{"role": "user", "content": "Hi"}],
tools=None,
max_tokens=None,
reasoning_config=None,
context_length=200000,
)
assert kwargs["max_tokens"] == 64_000
# ---------------------------------------------------------------------------
# Model output limit lookup
# ---------------------------------------------------------------------------
class TestGetAnthropicMaxOutput:
def test_opus_4_6(self):
from agent.anthropic_adapter import _get_anthropic_max_output
assert _get_anthropic_max_output("claude-opus-4-6") == 128_000
def test_opus_4_6_variant(self):
from agent.anthropic_adapter import _get_anthropic_max_output
assert _get_anthropic_max_output("claude-opus-4-6:1m:fast") == 128_000
def test_sonnet_4_6(self):
from agent.anthropic_adapter import _get_anthropic_max_output
assert _get_anthropic_max_output("claude-sonnet-4-6") == 64_000
def test_sonnet_4_date_stamped(self):
from agent.anthropic_adapter import _get_anthropic_max_output
assert _get_anthropic_max_output("claude-sonnet-4-20250514") == 64_000
def test_claude_3_5_sonnet(self):
from agent.anthropic_adapter import _get_anthropic_max_output
assert _get_anthropic_max_output("claude-3-5-sonnet-20241022") == 8_192
def test_claude_3_opus(self):
from agent.anthropic_adapter import _get_anthropic_max_output
assert _get_anthropic_max_output("claude-3-opus-20240229") == 4_096
def test_unknown_future_model(self):
from agent.anthropic_adapter import _get_anthropic_max_output
assert _get_anthropic_max_output("claude-ultra-5-20260101") == 128_000
def test_longest_prefix_wins(self):
"""'claude-3-5-sonnet' should match before 'claude-3-5'."""
from agent.anthropic_adapter import _get_anthropic_max_output
# claude-3-5-sonnet (8192) should win over a hypothetical shorter match
assert _get_anthropic_max_output("claude-3-5-sonnet-20241022") == 8_192
# ---------------------------------------------------------------------------