fix: use output_text for assistant message content in Codex Responses API (#15690)

The Codex Responses API rejects input_text inside assistant messages —
only output_text and refusal are valid content types for the assistant role.

_chat_content_to_responses_parts() previously hardcoded all text content
to input_text regardless of the message role. When an assistant message
had list-format content (multimodal or structured), this produced invalid
input_text parts that the API rejected with:

  Invalid value: 'input_text'. Supported values are: 'output_text' and 'refusal'.

Fix: add a role parameter to _chat_content_to_responses_parts() that
selects output_text for assistant messages and input_text for user
messages. Thread this through _chat_messages_to_responses_input() and
_preflight_codex_input_items().

Fixes #15687
This commit is contained in:
kshitij 2026-04-25 10:13:29 -07:00 committed by GitHub
parent 7c17accb29
commit 648b89911f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 129 additions and 11 deletions

View File

@ -44,22 +44,31 @@ _TOOL_CALL_LEAK_PATTERN = re.compile(
# Multimodal content helpers
# ---------------------------------------------------------------------------
def _chat_content_to_responses_parts(content: Any) -> List[Dict[str, Any]]:
def _chat_content_to_responses_parts(content: Any, *, role: str = "user") -> List[Dict[str, Any]]:
"""Convert chat-style multimodal content to Responses API input parts.
Input: ``[{"type":"text"|"image_url", ...}]`` (native OpenAI Chat format)
Output: ``[{"type":"input_text"|"input_image", ...}]`` (Responses format)
Output: ``[{"type":"input_text"|"output_text"|"input_image", ...}]`` (Responses format)
The ``role`` parameter controls the text content type:
- ``"user"`` (default) → ``"input_text"``
- ``"assistant"`` → ``"output_text"``
The Responses API rejects ``input_text`` inside assistant messages and
``output_text`` inside user messages, so callers MUST pass the correct
role for the message being converted.
Returns an empty list when ``content`` is not a list or contains no
recognized parts; callers fall back to the string path.
"""
text_type = "output_text" if role == "assistant" else "input_text"
if not isinstance(content, list):
return []
converted: List[Dict[str, Any]] = []
for part in content:
if isinstance(part, str):
if part:
converted.append({"type": "input_text", "text": part})
converted.append({"type": text_type, "text": part})
continue
if not isinstance(part, dict):
continue
@ -67,7 +76,7 @@ def _chat_content_to_responses_parts(content: Any) -> List[Dict[str, Any]]:
if ptype in {"text", "input_text", "output_text"}:
text = part.get("text")
if isinstance(text, str) and text:
converted.append({"type": "input_text", "text": text})
converted.append({"type": text_type, "text": text})
continue
if ptype in {"image_url", "input_image"}:
image_ref = part.get("image_url")
@ -233,9 +242,10 @@ def _chat_messages_to_responses_input(messages: List[Dict[str, Any]]) -> List[Di
if role in {"user", "assistant"}:
content = msg.get("content", "")
if isinstance(content, list):
content_parts = _chat_content_to_responses_parts(content)
content_parts = _chat_content_to_responses_parts(content, role=role)
text_type = "output_text" if role == "assistant" else "input_text"
content_text = "".join(
p.get("text", "") for p in content_parts if p.get("type") == "input_text"
p.get("text", "") for p in content_parts if p.get("type") == text_type
)
else:
content_parts = []
@ -429,13 +439,16 @@ def _preflight_codex_input_items(raw_items: Any) -> List[Dict[str, Any]]:
content = ""
if isinstance(content, list):
# Multimodal content from ``_chat_messages_to_responses_input``
# is already in Responses format (``input_text`` / ``input_image``).
# Validate each part and pass through.
# is already in Responses format (``input_text`` / ``output_text``
# / ``input_image``). Validate each part and pass through.
# Use the correct text type for the role — ``output_text`` for
# assistant messages, ``input_text`` for user messages.
text_type = "output_text" if role == "assistant" else "input_text"
validated: List[Dict[str, Any]] = []
for part_idx, part in enumerate(content):
if isinstance(part, str):
if part:
validated.append({"type": "input_text", "text": part})
validated.append({"type": text_type, "text": part})
continue
if not isinstance(part, dict):
raise ValueError(
@ -446,7 +459,7 @@ def _preflight_codex_input_items(raw_items: Any) -> List[Dict[str, Any]]:
text = part.get("text", "")
if not isinstance(text, str):
text = str(text or "")
validated.append({"type": "input_text", "text": text})
validated.append({"type": text_type, "text": text})
elif ptype in {"input_image", "image_url"}:
image_ref = part.get("image_url", "")
detail = part.get("detail")

View File

@ -12,7 +12,7 @@ from types import SimpleNamespace
from unittest.mock import patch, MagicMock
import pytest
from agent.codex_responses_adapter import _chat_messages_to_responses_input, _normalize_codex_response, _preflight_codex_input_items
from agent.codex_responses_adapter import _chat_content_to_responses_parts, _chat_messages_to_responses_input, _normalize_codex_response, _preflight_codex_input_items
sys.modules.setdefault("fire", types.SimpleNamespace(Fire=lambda *a, **k: None))
sys.modules.setdefault("firecrawl", types.SimpleNamespace(Firecrawl=object))
@ -520,6 +520,111 @@ class TestChatMessagesToResponsesInput:
reasoning_items = [i for i in items if i.get("type") == "reasoning"]
assert len(reasoning_items) == 0
def test_user_multimodal_content_uses_input_text(self, monkeypatch):
    """A user message with list-form content converts to ``input_text`` parts."""
    # The returned agent is not read below.
    # NOTE(review): presumably the call is kept for _make_agent's setup
    # side effects — confirm before removing.
    _agent = _make_agent(
        monkeypatch,
        "openai-codex",
        api_mode="codex_responses",
        base_url="https://chatgpt.com/backend-api/codex",
    )
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": "find files"}],
    }

    items = _chat_messages_to_responses_input([user_message])

    # Exactly one Responses item, still attributed to the user.
    assert len(items) == 1
    item = items[0]
    assert item["role"] == "user"

    # Content stays a list and its text part carries the user-side type.
    parts = item["content"]
    assert isinstance(parts, list)
    first_part = parts[0]
    assert first_part["type"] == "input_text"
    assert first_part["text"] == "find files"
def test_assistant_multimodal_content_uses_output_text(self, monkeypatch):
    """An assistant message with list-form content converts to ``output_text`` parts.

    Regression test for #15687: the Responses API rejects ``input_text``
    inside assistant messages.
    """
    # The returned agent is not read below.
    # NOTE(review): presumably the call is kept for _make_agent's setup
    # side effects — confirm before removing.
    _agent = _make_agent(
        monkeypatch,
        "openai-codex",
        api_mode="codex_responses",
        base_url="https://chatgpt.com/backend-api/codex",
    )
    assistant_message = {
        "role": "assistant",
        "content": [{"type": "text", "text": "I found the files."}],
    }

    items = _chat_messages_to_responses_input([assistant_message])

    # Exactly one Responses item, still attributed to the assistant.
    assert len(items) == 1
    item = items[0]
    assert item["role"] == "assistant"

    # Content stays a list and its text part carries the assistant-side type.
    parts = item["content"]
    assert isinstance(parts, list)
    first_part = parts[0]
    assert first_part["type"] == "output_text"
    assert first_part["text"] == "I found the files."
def test_preflight_preserves_assistant_output_text(self, monkeypatch):
    """Preflight keeps ``input_text`` on user items and ``output_text`` on assistant items."""
    # The returned agent is not read below.
    # NOTE(review): presumably the call is kept for _make_agent's setup
    # side effects — confirm before removing.
    _agent = _make_agent(
        monkeypatch,
        "openai-codex",
        api_mode="codex_responses",
        base_url="https://chatgpt.com/backend-api/codex",
    )
    # Items already in Responses format, one per role.
    user_item = {
        "role": "user",
        "content": [{"type": "input_text", "text": "hi"}],
    }
    assistant_item = {
        "role": "assistant",
        "content": [{"type": "output_text", "text": "hello"}],
    }

    normalized = _preflight_codex_input_items([user_item, assistant_item])

    user_parts = normalized[0]["content"]
    assistant_parts = normalized[1]["content"]
    assert user_parts[0]["type"] == "input_text"
    assert assistant_parts[0]["type"] == "output_text"
def test_full_round_trip_with_list_content(self, monkeypatch):
    """Run a user/assistant/user exchange through conversion and then preflight."""
    # The returned agent is not read below.
    # NOTE(review): presumably the call is kept for _make_agent's setup
    # side effects — confirm before removing.
    _agent = _make_agent(
        monkeypatch,
        "openai-codex",
        api_mode="codex_responses",
        base_url="https://chatgpt.com/backend-api/codex",
    )

    def _chat_message(role, text):
        # Chat-format message whose content is a one-element list of text parts.
        return {"role": role, "content": [{"type": "text", "text": text}]}

    conversation = [
        _chat_message("user", "hello"),
        _chat_message("assistant", "hi there"),
        _chat_message("user", "continue"),
    ]

    converted = _chat_messages_to_responses_input(conversation)
    normalized = _preflight_codex_input_items(converted)

    def _first_part_type(index):
        # Type tag of the first content part of the item at ``index``.
        return normalized[index]["content"][0]["type"]

    # Both user turns carry input_text.
    assert _first_part_type(0) == "input_text"
    assert _first_part_type(2) == "input_text"
    # The assistant turn carries output_text.
    assert _first_part_type(1) == "output_text"
class TestChatContentToResponsesParts:
    """Exercises the ``role`` keyword of ``_chat_content_to_responses_parts`` (#15687)."""

    def test_default_role_emits_input_text(self):
        """Omitting ``role`` produces ``input_text`` text parts."""
        hello_part = {"type": "text", "text": "hello"}
        converted = _chat_content_to_responses_parts([hello_part])
        assert converted[0]["type"] == "input_text"

    def test_explicit_user_role_emits_input_text(self):
        """Passing ``role="user"`` produces ``input_text`` text parts."""
        hello_part = {"type": "text", "text": "hello"}
        converted = _chat_content_to_responses_parts([hello_part], role="user")
        assert converted[0]["type"] == "input_text"

    def test_assistant_role_emits_output_text(self):
        """Passing ``role="assistant"`` produces ``output_text`` text parts."""
        hello_part = {"type": "text", "text": "hello"}
        converted = _chat_content_to_responses_parts([hello_part], role="assistant")
        assert converted[0]["type"] == "output_text"

    def test_assistant_role_with_string_parts(self):
        """Bare string entries in assistant content also become ``output_text``."""
        converted = _chat_content_to_responses_parts(["hello"], role="assistant")
        only_part = converted[0]
        assert only_part["type"] == "output_text"
        assert only_part["text"] == "hello"

    def test_assistant_role_with_mixed_input_output_text_types(self):
        """Text parts pre-tagged ``input_text``/``output_text`` are retagged to the role's type."""
        tagged_texts = (
            ("input_text", "a"),
            ("output_text", "b"),
            ("text", "c"),
        )
        source_parts = [{"type": kind, "text": text} for kind, text in tagged_texts]

        converted = _chat_content_to_responses_parts(source_parts, role="assistant")

        # No part may keep a non-assistant text type, whatever it started as.
        assert not any(part["type"] != "output_text" for part in converted)
        # Text payloads survive in their original order.
        assert [part["text"] for part in converted] == ["a", "b", "c"]
# ── Response normalization tests ─────────────────────────────────────────────