From 648b89911f1c9f16feabed1f2551aa7ee4fddfd0 Mon Sep 17 00:00:00 2001 From: kshitij <82637225+kshitijk4poor@users.noreply.github.com> Date: Sat, 25 Apr 2026 10:13:29 -0700 Subject: [PATCH] fix: use output_text for assistant message content in Codex Responses API (#15690) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Codex Responses API rejects input_text inside assistant messages — only output_text and refusal are valid content types for assistant role. _chat_content_to_responses_parts() previously hardcoded all text content to input_text regardless of the message role. When an assistant message had list-format content (multimodal or structured), this produced invalid input_text parts that the API rejected with: Invalid value: 'input_text'. Supported values are: 'output_text' and 'refusal'. Fix: add a role parameter to _chat_content_to_responses_parts() that selects output_text for assistant messages and input_text for user messages. Thread this through _chat_messages_to_responses_input() and _preflight_codex_input_items(). Fixes #15687 --- agent/codex_responses_adapter.py | 33 +++++--- tests/run_agent/test_provider_parity.py | 107 +++++++++++++++++++++++- 2 files changed, 129 insertions(+), 11 deletions(-) diff --git a/agent/codex_responses_adapter.py b/agent/codex_responses_adapter.py index 3b007a76..798ea085 100644 --- a/agent/codex_responses_adapter.py +++ b/agent/codex_responses_adapter.py @@ -44,22 +44,31 @@ _TOOL_CALL_LEAK_PATTERN = re.compile( # Multimodal content helpers # --------------------------------------------------------------------------- -def _chat_content_to_responses_parts(content: Any) -> List[Dict[str, Any]]: +def _chat_content_to_responses_parts(content: Any, *, role: str = "user") -> List[Dict[str, Any]]: """Convert chat-style multimodal content to Responses API input parts. Input: ``[{"type":"text"|"image_url", ...}]`` (native OpenAI Chat format) - Output: ``[{"type":"input_text"|"input_image", ...}]`` (Responses format) + Output: ``[{"type":"input_text"|"output_text"|"input_image", ...}]`` (Responses format) + + The ``role`` parameter controls the text content type: + - ``"user"`` (default) → ``"input_text"`` + - ``"assistant"`` → ``"output_text"`` + + The Responses API rejects ``input_text`` inside assistant messages and + ``output_text`` inside user messages, so callers MUST pass the correct + role for the message being converted. Returns an empty list when ``content`` is not a list or contains no recognized parts — callers fall back to the string path. """ + text_type = "output_text" if role == "assistant" else "input_text" if not isinstance(content, list): return [] converted: List[Dict[str, Any]] = [] for part in content: if isinstance(part, str): if part: - converted.append({"type": "input_text", "text": part}) + converted.append({"type": text_type, "text": part}) continue if not isinstance(part, dict): continue @@ -67,7 +76,7 @@ def _chat_content_to_responses_parts(content: Any) -> List[Dict[str, Any]]: if ptype in {"text", "input_text", "output_text"}: text = part.get("text") if isinstance(text, str) and text: - converted.append({"type": "input_text", "text": text}) + converted.append({"type": text_type, "text": text}) continue if ptype in {"image_url", "input_image"}: image_ref = part.get("image_url") @@ -233,9 +242,10 @@ def _chat_messages_to_responses_input(messages: List[Dict[str, Any]]) -> List[Di if role in {"user", "assistant"}: content = msg.get("content", "") if isinstance(content, list): - content_parts = _chat_content_to_responses_parts(content) + content_parts = _chat_content_to_responses_parts(content, role=role) + text_type = "output_text" if role == "assistant" else "input_text" content_text = "".join( - p.get("text", "") for p in content_parts if p.get("type") == "input_text" + p.get("text", "") for p in content_parts if p.get("type") == text_type ) else: content_parts = [] @@ -429,13 +439,16 @@ def _preflight_codex_input_items(raw_items: Any) -> List[Dict[str, Any]]: content = "" if isinstance(content, list): # Multimodal content from ``_chat_messages_to_responses_input`` - # is already in Responses format (``input_text`` / ``input_image``). - # Validate each part and pass through. + # is already in Responses format (``input_text`` / ``output_text`` + # / ``input_image``). Validate each part and pass through. + # Use the correct text type for the role — ``output_text`` for + # assistant messages, ``input_text`` for user messages. + text_type = "output_text" if role == "assistant" else "input_text" validated: List[Dict[str, Any]] = [] for part_idx, part in enumerate(content): if isinstance(part, str): if part: - validated.append({"type": "input_text", "text": part}) + validated.append({"type": text_type, "text": part}) continue if not isinstance(part, dict): raise ValueError( @@ -446,7 +459,7 @@ def _preflight_codex_input_items(raw_items: Any) -> List[Dict[str, Any]]: text = part.get("text", "") if not isinstance(text, str): text = str(text or "") - validated.append({"type": "input_text", "text": text}) + validated.append({"type": text_type, "text": text}) elif ptype in {"input_image", "image_url"}: image_ref = part.get("image_url", "") detail = part.get("detail") diff --git a/tests/run_agent/test_provider_parity.py b/tests/run_agent/test_provider_parity.py index f96dbf42..3b7993c3 100644 --- a/tests/run_agent/test_provider_parity.py +++ b/tests/run_agent/test_provider_parity.py @@ -12,7 +12,7 @@ from types import SimpleNamespace from unittest.mock import patch, MagicMock import pytest -from agent.codex_responses_adapter import _chat_messages_to_responses_input, _normalize_codex_response, _preflight_codex_input_items +from agent.codex_responses_adapter import _chat_content_to_responses_parts, _chat_messages_to_responses_input, _normalize_codex_response, _preflight_codex_input_items sys.modules.setdefault("fire", types.SimpleNamespace(Fire=lambda *a, **k: None)) sys.modules.setdefault("firecrawl", types.SimpleNamespace(Firecrawl=object)) @@ -520,6 +520,111 @@ class TestChatMessagesToResponsesInput: reasoning_items = [i for i in items if i.get("type") == "reasoning"] assert len(reasoning_items) == 0 + def test_user_multimodal_content_uses_input_text(self, monkeypatch): + """User messages with list content must use input_text type.""" + agent = _make_agent(monkeypatch, "openai-codex", api_mode="codex_responses", + base_url="https://chatgpt.com/backend-api/codex") + messages = [{"role": "user", "content": [ + {"type": "text", "text": "find files"}, + ]}] + items = _chat_messages_to_responses_input(messages) + assert len(items) == 1 + assert items[0]["role"] == "user" + content = items[0]["content"] + assert isinstance(content, list) + assert content[0]["type"] == "input_text" + assert content[0]["text"] == "find files" + + def test_assistant_multimodal_content_uses_output_text(self, monkeypatch): + """Assistant messages with list content must use output_text type. + + This is the fix for #15687 — the Responses API rejects input_text + inside assistant messages. + """ + agent = _make_agent(monkeypatch, "openai-codex", api_mode="codex_responses", + base_url="https://chatgpt.com/backend-api/codex") + messages = [{"role": "assistant", "content": [ + {"type": "text", "text": "I found the files."}, + ]}] + items = _chat_messages_to_responses_input(messages) + assert len(items) == 1 + assert items[0]["role"] == "assistant" + content = items[0]["content"] + assert isinstance(content, list) + assert content[0]["type"] == "output_text" + assert content[0]["text"] == "I found the files." + + def test_preflight_preserves_assistant_output_text(self, monkeypatch): + """_preflight_codex_input_items must preserve output_text for assistant.""" + agent = _make_agent(monkeypatch, "openai-codex", api_mode="codex_responses", + base_url="https://chatgpt.com/backend-api/codex") + raw_input = [ + {"role": "user", "content": [{"type": "input_text", "text": "hi"}]}, + {"role": "assistant", "content": [{"type": "output_text", "text": "hello"}]}, + ] + normalized = _preflight_codex_input_items(raw_input) + user_content = normalized[0]["content"] + asst_content = normalized[1]["content"] + assert user_content[0]["type"] == "input_text" + assert asst_content[0]["type"] == "output_text" + + def test_full_round_trip_with_list_content(self, monkeypatch): + """End-to-end: user + assistant with list content through both stages.""" + agent = _make_agent(monkeypatch, "openai-codex", api_mode="codex_responses", + base_url="https://chatgpt.com/backend-api/codex") + messages = [ + {"role": "user", "content": [{"type": "text", "text": "hello"}]}, + {"role": "assistant", "content": [{"type": "text", "text": "hi there"}]}, + {"role": "user", "content": [{"type": "text", "text": "continue"}]}, + ] + items = _chat_messages_to_responses_input(messages) + normalized = _preflight_codex_input_items(items) + + # User items use input_text + assert normalized[0]["content"][0]["type"] == "input_text" + assert normalized[2]["content"][0]["type"] == "input_text" + # Assistant item uses output_text + assert normalized[1]["content"][0]["type"] == "output_text" + + +class TestChatContentToResponsesParts: + """Unit tests for _chat_content_to_responses_parts role parameter (#15687).""" + + def test_default_role_emits_input_text(self): + """Default (user) role emits input_text.""" + result = _chat_content_to_responses_parts([{"type": "text", "text": "hello"}]) + assert result[0]["type"] == "input_text" + + def test_explicit_user_role_emits_input_text(self): + result = _chat_content_to_responses_parts( + [{"type": "text", "text": "hello"}], role="user" + ) + assert result[0]["type"] == "input_text" + + def test_assistant_role_emits_output_text(self): + result = _chat_content_to_responses_parts( + [{"type": "text", "text": "hello"}], role="assistant" + ) + assert result[0]["type"] == "output_text" + + def test_assistant_role_with_string_parts(self): + """String parts in assistant content also get output_text.""" + result = _chat_content_to_responses_parts(["hello"], role="assistant") + assert result[0]["type"] == "output_text" + assert result[0]["text"] == "hello" + + def test_assistant_role_with_mixed_input_output_text_types(self): + """Parts already marked input_text or output_text get normalized to role's type.""" + parts = [ + {"type": "input_text", "text": "a"}, + {"type": "output_text", "text": "b"}, + {"type": "text", "text": "c"}, + ] + result = _chat_content_to_responses_parts(parts, role="assistant") + # All text parts should become output_text regardless of original type + assert all(p["type"] == "output_text" for p in result) + assert [p["text"] for p in result] == ["a", "b", "c"] + # ── Response normalization tests ─────────────────────────────────────────────