fix(anthropic): use model-native output limits instead of hardcoded 16K (#3426)

The Anthropic adapter defaulted to max_tokens=16384 when no explicit value was configured. This severely limits thinking-enabled models where thinking tokens count toward max_tokens: - Claude Opus 4.6 supports 128K output but was capped at 16K - Claude Sonnet 4.6 supports 64K output but was capped at 16K With extended thinking (adaptive or budget-based), the model could exhaust the entire 16K on reasoning, leaving zero tokens for the actual response. This caused two user-visible errors: - 'Response truncated (finish_reason=length)' — thinking consumed most tokens - 'Response only contains think block with no content' — thinking consumed all Fix: add _ANTHROPIC_OUTPUT_LIMITS lookup table (sourced from Anthropic docs and Cline's model catalog) and use the model's actual output limit as the default. Unknown future models default to 128K (the current maximum). Also adds context_length clamping: if the user configured a smaller context window (e.g. custom endpoint), max_tokens is clamped to context_length - 1 to avoid exceeding the window. Closes #2706
2026-03-27 13:02:52 -07:00 · 2026-03-27 13:02:52 -07:00 · 6f11ff53ad
commit 6f11ff53ad
parent fb46a90098
3 changed files with 196 additions and 3 deletions
--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@ -35,6 +35,54 @@ ADAPTIVE_EFFORT_MAP = {
    "minimal": "low",
 }

+# ── Max output token limits per Anthropic model ───────────────────────
+# Source: Anthropic docs + Cline model catalog.  Anthropic's API requires
+# max_tokens as a mandatory field.  Previously we hardcoded 16384, which
+# starves thinking-enabled models (thinking tokens count toward the limit).
+_ANTHROPIC_OUTPUT_LIMITS = {
+    # Claude 4.6
+    "claude-opus-4-6":   128_000,
+    "claude-sonnet-4-6":  64_000,
+    # Claude 4.5
+    "claude-opus-4-5":    64_000,
+    "claude-sonnet-4-5":  64_000,
+    "claude-haiku-4-5":   64_000,
+    # Claude 4
+    "claude-opus-4":      32_000,
+    "claude-sonnet-4":    64_000,
+    # Claude 3.7
+    "claude-3-7-sonnet": 128_000,
+    # Claude 3.5
+    "claude-3-5-sonnet":   8_192,
+    "claude-3-5-haiku":    8_192,
+    # Claude 3
+    "claude-3-opus":       4_096,
+    "claude-3-sonnet":     4_096,
+    "claude-3-haiku":      4_096,
+}
+
+# For any model not in the table, assume the highest current limit.
+# Future Anthropic models are unlikely to have *less* output capacity.
+_ANTHROPIC_DEFAULT_OUTPUT_LIMIT = 128_000
+
+
+def _get_anthropic_max_output(model: str) -> int:
+    """Look up the max output token limit for an Anthropic model.
+
+    Uses substring matching against _ANTHROPIC_OUTPUT_LIMITS so date-stamped
+    model IDs (claude-sonnet-4-5-20250929) and variant suffixes (:1m, :fast)
+    resolve correctly.  Longest-prefix match wins to avoid e.g. "claude-3-5"
+    matching before "claude-3-5-sonnet".
+    """
+    m = model.lower()
+    best_key = ""
+    best_val = _ANTHROPIC_DEFAULT_OUTPUT_LIMIT
+    for key, val in _ANTHROPIC_OUTPUT_LIMITS.items():
+        if key in m and len(key) > len(best_key):
+            best_key = key
+            best_val = val
+    return best_val
+

 def _supports_adaptive_thinking(model: str) -> bool:
    """Return True for Claude 4.6 models that support adaptive thinking."""
@ -818,9 +866,15 @@ def build_anthropic_kwargs(
    tool_choice: Optional[str] = None,
    is_oauth: bool = False,
    preserve_dots: bool = False,
+    context_length: Optional[int] = None,
 ) -> Dict[str, Any]:
    """Build kwargs for anthropic.messages.create().

+    When *max_tokens* is None, the model's native output limit is used
+    (e.g. 128K for Opus 4.6, 64K for Sonnet 4.6).  If *context_length*
+    is provided, the effective limit is clamped so it doesn't exceed
+    the context window.
+
    When *is_oauth* is True, applies Claude Code compatibility transforms:
    system prompt prefix, tool name prefixing, and prompt sanitization.

@ -831,7 +885,12 @@ def build_anthropic_kwargs(
    anthropic_tools = convert_tools_to_anthropic(tools) if tools else []

    model = normalize_model_name(model, preserve_dots=preserve_dots)
-    effective_max_tokens = max_tokens or 16384
+    effective_max_tokens = max_tokens or _get_anthropic_max_output(model)
+
+    # Clamp to context window if the user set a lower context_length
+    # (e.g. custom endpoint with limited capacity).
+    if context_length and effective_max_tokens > context_length:
+        effective_max_tokens = max(context_length - 1, 1)

    # ── OAuth: Claude Code identity ──────────────────────────────────
    if is_oauth:
--- a/run_agent.py
+++ b/run_agent.py
@ -4378,6 +4378,10 @@ class AIAgent:
        if self.api_mode == "anthropic_messages":
            from agent.anthropic_adapter import build_anthropic_kwargs
            anthropic_messages = self._prepare_anthropic_messages_for_api(api_messages)
+            # Pass context_length so the adapter can clamp max_tokens if the
+            # user configured a smaller context window than the model's output limit.
+            ctx_len = getattr(self, "context_compressor", None)
+            ctx_len = ctx_len.context_length if ctx_len else None
            return build_anthropic_kwargs(
                model=self.model,
                messages=anthropic_messages,
@ -4386,6 +4390,7 @@ class AIAgent:
                reasoning_config=self.reasoning_config,
                is_oauth=self._is_anthropic_oauth,
                preserve_dots=self._anthropic_preserve_dots(),
+                context_length=ctx_len,
            )

        if self.api_mode == "codex_responses":
--- a/tests/test_anthropic_adapter.py
+++ b/tests/test_anthropic_adapter.py
@ -926,7 +926,8 @@ class TestBuildAnthropicKwargs:
        )
        assert "thinking" not in kwargs

-    def test_default_max_tokens(self):
+    def test_default_max_tokens_uses_model_output_limit(self):
+        """When max_tokens is None, use the model's native output limit."""
        kwargs = build_anthropic_kwargs(
            model="claude-sonnet-4-20250514",
            messages=[{"role": "user", "content": "Hi"}],
@ -934,7 +935,135 @@ class TestBuildAnthropicKwargs:
            max_tokens=None,
            reasoning_config=None,
        )
-        assert kwargs["max_tokens"] == 16384
+        assert kwargs["max_tokens"] == 64_000  # Sonnet 4 output limit
+
+    def test_default_max_tokens_opus_4_6(self):
+        kwargs = build_anthropic_kwargs(
+            model="claude-opus-4-6",
+            messages=[{"role": "user", "content": "Hi"}],
+            tools=None,
+            max_tokens=None,
+            reasoning_config=None,
+        )
+        assert kwargs["max_tokens"] == 128_000
+
+    def test_default_max_tokens_sonnet_4_6(self):
+        kwargs = build_anthropic_kwargs(
+            model="claude-sonnet-4-6",
+            messages=[{"role": "user", "content": "Hi"}],
+            tools=None,
+            max_tokens=None,
+            reasoning_config=None,
+        )
+        assert kwargs["max_tokens"] == 64_000
+
+    def test_default_max_tokens_date_stamped_model(self):
+        """Date-stamped model IDs should resolve via substring match."""
+        kwargs = build_anthropic_kwargs(
+            model="claude-sonnet-4-5-20250929",
+            messages=[{"role": "user", "content": "Hi"}],
+            tools=None,
+            max_tokens=None,
+            reasoning_config=None,
+        )
+        assert kwargs["max_tokens"] == 64_000
+
+    def test_default_max_tokens_older_model(self):
+        kwargs = build_anthropic_kwargs(
+            model="claude-3-5-sonnet-20241022",
+            messages=[{"role": "user", "content": "Hi"}],
+            tools=None,
+            max_tokens=None,
+            reasoning_config=None,
+        )
+        assert kwargs["max_tokens"] == 8_192
+
+    def test_default_max_tokens_unknown_model_uses_highest(self):
+        """Unknown future models should get the highest known limit."""
+        kwargs = build_anthropic_kwargs(
+            model="claude-ultra-5-20260101",
+            messages=[{"role": "user", "content": "Hi"}],
+            tools=None,
+            max_tokens=None,
+            reasoning_config=None,
+        )
+        assert kwargs["max_tokens"] == 128_000
+
+    def test_explicit_max_tokens_overrides_default(self):
+        """User-specified max_tokens should be respected."""
+        kwargs = build_anthropic_kwargs(
+            model="claude-opus-4-6",
+            messages=[{"role": "user", "content": "Hi"}],
+            tools=None,
+            max_tokens=4096,
+            reasoning_config=None,
+        )
+        assert kwargs["max_tokens"] == 4096
+
+    def test_context_length_clamp(self):
+        """max_tokens should be clamped to context_length if it's smaller."""
+        kwargs = build_anthropic_kwargs(
+            model="claude-opus-4-6",  # 128K output
+            messages=[{"role": "user", "content": "Hi"}],
+            tools=None,
+            max_tokens=None,
+            reasoning_config=None,
+            context_length=50000,
+        )
+        assert kwargs["max_tokens"] == 49999  # context_length - 1
+
+    def test_context_length_no_clamp_when_larger(self):
+        """No clamping when context_length exceeds output limit."""
+        kwargs = build_anthropic_kwargs(
+            model="claude-sonnet-4-6",  # 64K output
+            messages=[{"role": "user", "content": "Hi"}],
+            tools=None,
+            max_tokens=None,
+            reasoning_config=None,
+            context_length=200000,
+        )
+        assert kwargs["max_tokens"] == 64_000
+
+
+# ---------------------------------------------------------------------------
+# Model output limit lookup
+# ---------------------------------------------------------------------------
+
+
+class TestGetAnthropicMaxOutput:
+    def test_opus_4_6(self):
+        from agent.anthropic_adapter import _get_anthropic_max_output
+        assert _get_anthropic_max_output("claude-opus-4-6") == 128_000
+
+    def test_opus_4_6_variant(self):
+        from agent.anthropic_adapter import _get_anthropic_max_output
+        assert _get_anthropic_max_output("claude-opus-4-6:1m:fast") == 128_000
+
+    def test_sonnet_4_6(self):
+        from agent.anthropic_adapter import _get_anthropic_max_output
+        assert _get_anthropic_max_output("claude-sonnet-4-6") == 64_000
+
+    def test_sonnet_4_date_stamped(self):
+        from agent.anthropic_adapter import _get_anthropic_max_output
+        assert _get_anthropic_max_output("claude-sonnet-4-20250514") == 64_000
+
+    def test_claude_3_5_sonnet(self):
+        from agent.anthropic_adapter import _get_anthropic_max_output
+        assert _get_anthropic_max_output("claude-3-5-sonnet-20241022") == 8_192
+
+    def test_claude_3_opus(self):
+        from agent.anthropic_adapter import _get_anthropic_max_output
+        assert _get_anthropic_max_output("claude-3-opus-20240229") == 4_096
+
+    def test_unknown_future_model(self):
+        from agent.anthropic_adapter import _get_anthropic_max_output
+        assert _get_anthropic_max_output("claude-ultra-5-20260101") == 128_000
+
+    def test_longest_prefix_wins(self):
+        """'claude-3-5-sonnet' should match before 'claude-3-5'."""
+        from agent.anthropic_adapter import _get_anthropic_max_output
+        # claude-3-5-sonnet (8192) should win over a hypothetical shorter match
+        assert _get_anthropic_max_output("claude-3-5-sonnet-20241022") == 8_192


 # ---------------------------------------------------------------------------