From c5e3a6fb5bb33477d639219de14922caedda98ef Mon Sep 17 00:00:00 2001 From: CoreyNoDream Date: Sat, 2 May 2026 15:14:03 +0800 Subject: [PATCH] fix(cli): decode .env as UTF-8 to avoid GBK crash on Windows Path.read_text() uses the system locale by default. On Windows CN/JP/KR locales (GBK/CP932/CP949), reading a UTF-8 .env raises UnicodeDecodeError as soon as it contains any non-ASCII byte (e.g. an em dash). Pin encoding="utf-8" on every .env read in hermes_cli to match how the rest of the codebase (load_dotenv at doctor.py:26) already decodes it. Adds a regression test that monkeypatches Path.read_text to simulate a GBK locale and asserts 'hermes doctor' no longer raises. Refs #18637 --- hermes_cli/doctor.py | 7 +++-- hermes_cli/main.py | 2 +- hermes_cli/memory_setup.py | 2 +- tests/hermes_cli/test_doctor.py | 51 +++++++++++++++++++++++++++++++++ 4 files changed, 58 insertions(+), 4 deletions(-) diff --git a/hermes_cli/doctor.py b/hermes_cli/doctor.py index f0822bdc..122ed141 100644 --- a/hermes_cli/doctor.py +++ b/hermes_cli/doctor.py @@ -263,8 +263,11 @@ def run_doctor(args): if env_path.exists(): check_ok(f"{_DHH}/.env file exists") - # Check for common issues - content = env_path.read_text() + # Check for common issues. Pin encoding to UTF-8 because .env files are + # written as UTF-8 everywhere in the codebase, while Path.read_text() + # defaults to the system locale — which crashes on non-UTF-8 Windows + # locales (e.g. GBK) as soon as the file contains any non-ASCII byte. 
+ content = env_path.read_text(encoding="utf-8") if _has_provider_env_config(content): check_ok("API key or custom endpoint configured") else: diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 856d85c6..ed8c24c8 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -289,7 +289,7 @@ def _has_any_provider_configured() -> bool: env_file = get_env_path() if env_file.exists(): try: - for line in env_file.read_text().splitlines(): + for line in env_file.read_text(encoding="utf-8").splitlines(): line = line.strip() if line.startswith("#") or "=" not in line: continue diff --git a/hermes_cli/memory_setup.py b/hermes_cli/memory_setup.py index 88186b8e..158f80a7 100644 --- a/hermes_cli/memory_setup.py +++ b/hermes_cli/memory_setup.py @@ -361,7 +361,7 @@ def _write_env_vars(env_path: Path, env_writes: dict) -> None: existing_lines = [] if env_path.exists(): - existing_lines = env_path.read_text().splitlines() + existing_lines = env_path.read_text(encoding="utf-8").splitlines() updated_keys = set() new_lines = [] diff --git a/tests/hermes_cli/test_doctor.py b/tests/hermes_cli/test_doctor.py index 5fafcb81..4a5981c0 100644 --- a/tests/hermes_cli/test_doctor.py +++ b/tests/hermes_cli/test_doctor.py @@ -51,6 +51,57 @@ class TestProviderEnvDetection: assert not _has_provider_env_config(content) +class TestDoctorEnvFileEncoding: + """Regression for #18637 (bug 3): `hermes doctor` crashed on Windows + Chinese locale (GBK) because `.env` was read with Path.read_text() which + defaults to the system locale encoding, not UTF-8.""" + + def test_doctor_reads_env_as_utf8_even_when_locale_is_not_utf8( + self, monkeypatch, tmp_path + ): + import pathlib + + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + # Write a UTF-8 .env containing an em dash (U+2014 = e2 80 94). The + # 0x94 byte is exactly the one the issue reporter hit: GBK reads it as + # a lead byte with no valid trail byte after it, so locale-default + # reads raise UnicodeDecodeError on Chinese Windows. 
+ env_path = hermes_home / ".env" + env_path.write_text( + "OPENAI_API_KEY=sk-test # em-dash here — should not crash\n", + encoding="utf-8", + ) + + monkeypatch.setattr(doctor_mod, "HERMES_HOME", hermes_home) + + orig_read_text = pathlib.Path.read_text + + def gbk_like_read_text(self, encoding=None, errors=None, **kwargs): + # Simulate a GBK locale: refuse to decode this specific UTF-8 + # .env unless the caller pins encoding="utf-8". + if self == env_path and encoding != "utf-8": + raise UnicodeDecodeError( + "gbk", b"\x94", 0, 1, "illegal multibyte sequence" + ) + return orig_read_text(self, encoding=encoding, errors=errors, **kwargs) + + monkeypatch.setattr(pathlib.Path, "read_text", gbk_like_read_text) + + # Short-circuit the expensive tool-availability probe — we only + # need doctor to reach the .env read without crashing. + fake_model_tools = types.SimpleNamespace( + check_tool_availability=lambda *a, **kw: (_ for _ in ()).throw(SystemExit(0)), + TOOLSET_REQUIREMENTS={}, + ) + monkeypatch.setitem(sys.modules, "model_tools", fake_model_tools) + + # Run doctor. If the .env read still uses locale encoding, this + # raises UnicodeDecodeError and the test fails. + with pytest.raises(SystemExit): + doctor_mod.run_doctor(Namespace(fix=False)) + + class TestDoctorToolAvailabilityOverrides: def test_marks_honcho_available_when_configured(self, monkeypatch): monkeypatch.setattr(doctor, "_honcho_is_configured_for_doctor", lambda: True)