From f336ae3d7de8056352383105d6d23bd554ce8b10 Mon Sep 17 00:00:00 2001 From: Teknium Date: Sun, 19 Apr 2026 11:26:02 -0700 Subject: [PATCH] fix(environments): use incremental UTF-8 decoder in select-based drain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first draft of the fix called `chunk.decode("utf-8")` directly on each 4096-byte `os.read()` result, which corrupts output whenever a multi-byte UTF-8 character straddles a read boundary: * `UnicodeDecodeError` fires on the valid-but-truncated byte sequence. * The except handler clears ALL previously-decoded output and replaces the whole buffer with `[binary output detected ...]`. Empirically: 10000 '日' chars (30000 bytes of text plus a trailing newline, 30001 bytes total) through the wrapper loses all 10000 characters on the first draft; the baseline TextIOWrapper drain (which uses `encoding='utf-8', errors='replace'` on Popen) preserves them all. This regression affects any command emitting non-ASCII output larger than one chunk — CJK/Arabic/emoji in `npm install`, `pip install`, `docker logs`, `kubectl logs`, etc. Fix: swap to `codecs.getincrementaldecoder('utf-8')(errors='replace')`, which buffers partial multi-byte sequences across chunks and substitutes U+FFFD for genuinely invalid bytes. Flush on drain exit via `decoder.decode(b'', final=True)` to emit any trailing replacement character for a dangling partial sequence. Adds two regression tests: * test_utf8_multibyte_across_read_boundary — 10000 U+65E5 chars, verifies count round-trips and no fallback fires. * test_invalid_utf8_uses_replacement_not_fallback — deliberate \xff\xfe between valid ASCII, verifies surrounding text survives. 
--- .../tools/test_local_background_child_hang.py | 43 ++++++++++++ tools/environments/base.py | 67 ++++++++++++------- 2 files changed, 84 insertions(+), 26 deletions(-) diff --git a/tests/tools/test_local_background_child_hang.py b/tests/tools/test_local_background_child_hang.py index e8014e87..a8cc0ba1 100644 --- a/tests/tools/test_local_background_child_hang.py +++ b/tests/tools/test_local_background_child_hang.py @@ -109,3 +109,46 @@ class TestBackgroundChildDoesNotHang: assert "日本語" in result["output"] assert "café" in result["output"] assert "résumé" in result["output"] + + def test_utf8_multibyte_across_read_boundary(self, local_env): + """Multibyte UTF-8 characters straddling a 4096-byte ``os.read()`` boundary + must be decoded correctly via the incremental decoder — not lost to a + ``UnicodeDecodeError`` fallback. Regression for a bug in the first draft + of the fix where a strict ``bytes.decode('utf-8')`` on each raw chunk + wiped the entire buffer as soon as any chunk split a multi-byte char. + """ + # 10000 "日" chars = 30000 bytes — guaranteed to cross multiple 4096 + # read boundaries, and most boundaries will land in the middle of the + # 3-byte UTF-8 encoding of U+65E5. 
+ cmd = ( + 'python3 -c \'import sys; ' + 'sys.stdout.buffer.write(chr(0x65e5).encode("utf-8") * 10000); ' + 'sys.stdout.buffer.write(b"\\n")\'' + ) + result = local_env.execute(cmd, timeout=10) + assert result["returncode"] == 0 + # All 10000 characters must survive the round-trip + assert result["output"].count("\u65e5") == 10000, ( + f"lost multibyte chars across read boundaries: got " + f"{result['output'].count(chr(0x65e5))} / 10000" + ) + # And the "[binary output detected ...]" fallback must NOT fire + assert "binary output detected" not in result["output"] + + def test_invalid_utf8_uses_replacement_not_fallback(self, local_env): + """Truly invalid byte sequences must be substituted with U+FFFD (matching + the pre-fix ``errors='replace'`` behaviour of the old ``TextIOWrapper`` + drain), not clobber the entire buffer with a fallback placeholder. + """ + # Write a deliberate invalid UTF-8 lead byte sandwiched between valid ASCII + cmd = ( + 'python3 -c \'import sys; ' + 'sys.stdout.buffer.write(b"before "); ' + 'sys.stdout.buffer.write(b"\\xff\\xfe"); ' + 'sys.stdout.buffer.write(b" after\\n")\'' + ) + result = local_env.execute(cmd, timeout=5) + assert result["returncode"] == 0 + assert "before" in result["output"] + assert "after" in result["output"] + assert "binary output detected" not in result["output"] diff --git a/tools/environments/base.py b/tools/environments/base.py index a7f0a849..cde78e1d 100644 --- a/tools/environments/base.py +++ b/tools/environments/base.py @@ -6,6 +6,7 @@ re-sourced before each command. CWD persists via in-band stdout markers (remote) or a temp file (local). """ +import codecs import json import logging import os @@ -453,37 +454,51 @@ class BaseEnvironment(ABC): # shortly after ``bash`` exits even if the pipe hasn't EOF'd yet. # Any output the grandchild writes after that point goes to an # orphaned pipe (harmless — the kernel reaps it when our end closes). 
+ # + # Decoding: we ``os.read()`` raw bytes in fixed-size chunks (4096) + # so a single multibyte UTF-8 character can split across reads. An + # incremental decoder buffers partial sequences across chunks, and + # ``errors="replace"`` mirrors the baseline ``TextIOWrapper`` (which + # was constructed with ``encoding="utf-8", errors="replace"`` on + # ``Popen``) so binary or mis-encoded output is preserved with + # U+FFFD substitution rather than clobbering the whole buffer. + decoder = codecs.getincrementaldecoder("utf-8")(errors="replace") + def _drain(): fd = proc.stdout.fileno() idle_after_exit = 0 - while True: - try: - ready, _, _ = select.select([fd], [], [], 0.1) - except (ValueError, OSError): - break # fd already closed - if ready: + try: + while True: try: - chunk = os.read(fd, 4096) + ready, _, _ = select.select([fd], [], [], 0.1) except (ValueError, OSError): - break - if not chunk: - break # true EOF — all writers closed - try: - output_chunks.append(chunk.decode("utf-8")) - except UnicodeDecodeError: - output_chunks.clear() - output_chunks.append( - "[binary output detected — raw bytes not displayable]" - ) - break - idle_after_exit = 0 - elif proc.poll() is not None: - # bash is gone and the pipe was idle for ~100ms. Give - # it two more cycles to catch any buffered tail, then - # stop — otherwise we wait forever on a grandchild pipe. - idle_after_exit += 1 - if idle_after_exit >= 3: - break + break # fd already closed + if ready: + try: + chunk = os.read(fd, 4096) + except (ValueError, OSError): + break + if not chunk: + break # true EOF — all writers closed + output_chunks.append(decoder.decode(chunk)) + idle_after_exit = 0 + elif proc.poll() is not None: + # bash is gone and the pipe was idle for ~100ms. Give + # it two more cycles to catch any buffered tail, then + # stop — otherwise we wait forever on a grandchild pipe. + idle_after_exit += 1 + if idle_after_exit >= 3: + break + finally: + # Flush any bytes buffered mid-sequence. 
With ``errors="replace"`` + # this emits U+FFFD for any final incomplete sequence rather than + # raising. + try: + tail = decoder.decode(b"", final=True) + if tail: + output_chunks.append(tail) + except Exception: + pass drain_thread = threading.Thread(target=_drain, daemon=True) drain_thread.start()