fix(cli): decode .env as UTF-8 to avoid GBK crash on Windows

Path.read_text() uses the system locale by default. On Windows CN/JP/KR locales (GBK/CP932/CP949), reading a UTF-8 .env raises UnicodeDecodeError as soon as it contains any non-ASCII byte (e.g. an em dash). Pin encoding="utf-8" on every .env read in hermes_cli to match how the rest of the codebase (load_dotenv at doctor.py:26) already decodes it. Adds a regression test that monkeypatches Path.read_text to simulate a GBK locale and asserts 'hermes doctor' no longer raises. Refs #18637
2026-05-02 15:14:03 +08:00 · 2026-05-02 15:14:03 +08:00 · c5e3a6fb5b
commit c5e3a6fb5b
parent e2cea6eeba
4 changed files with 58 additions and 4 deletions
--- a/hermes_cli/doctor.py
+++ b/hermes_cli/doctor.py
@ -263,8 +263,11 @@ def run_doctor(args):
    if env_path.exists():
        check_ok(f"{_DHH}/.env file exists")
-        # Check for common issues
+        # Check for common issues. Pin encoding to UTF-8 because .env files are
-        content = env_path.read_text()
+        # written as UTF-8 everywhere in the codebase, while Path.read_text()
        # defaults to the system locale — which crashes on non-UTF-8 Windows
        # locales (e.g. GBK) as soon as the file contains any non-ASCII byte.
        content = env_path.read_text(encoding="utf-8")
        if _has_provider_env_config(content):
            check_ok("API key or custom endpoint configured")
        else:
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@ -289,7 +289,7 @@ def _has_any_provider_configured() -> bool:
    env_file = get_env_path()
    if env_file.exists():
        try:
-            for line in env_file.read_text().splitlines():
+            for line in env_file.read_text(encoding="utf-8").splitlines():
                line = line.strip()
                if line.startswith("#") or "=" not in line:
                    continue
--- a/hermes_cli/memory_setup.py
+++ b/hermes_cli/memory_setup.py
@ -361,7 +361,7 @@ def _write_env_vars(env_path: Path, env_writes: dict) -> None:
    existing_lines = []
    if env_path.exists():
-        existing_lines = env_path.read_text().splitlines()
+        existing_lines = env_path.read_text(encoding="utf-8").splitlines()
    updated_keys = set()
    new_lines = []
--- a/tests/hermes_cli/test_doctor.py
+++ b/tests/hermes_cli/test_doctor.py
@ -51,6 +51,57 @@ class TestProviderEnvDetection:
        assert not _has_provider_env_config(content)
 class TestDoctorEnvFileEncoding:
    """Regression for #18637 (bug 3): `hermes doctor` crashed on Windows
    Chinese locale (GBK) because `.env` was read with Path.read_text() which
    defaults to the system locale encoding, not UTF-8."""
    def test_doctor_reads_env_as_utf8_even_when_locale_is_not_utf8(
        self, monkeypatch, tmp_path
    ):
        import pathlib
        hermes_home = tmp_path / ".hermes"
        hermes_home.mkdir()
        # Write a UTF-8 .env containing an em dash (U+2014 = e2 80 94). The
        # 0x94 byte is exactly the one the issue reporter hit: it's invalid
        # as a GBK trailing byte in this position, so locale-default reads
        # raise UnicodeDecodeError on Chinese Windows.
        env_path = hermes_home / ".env"
        env_path.write_text(
            "OPENAI_API_KEY=sk-test  # em-dash here — should not crash\n",
            encoding="utf-8",
        )
        monkeypatch.setattr(doctor_mod, "HERMES_HOME", hermes_home)
        orig_read_text = pathlib.Path.read_text
        def gbk_like_read_text(self, encoding=None, errors=None, **kwargs):
            # Simulate a GBK locale: refuse to decode this specific UTF-8
            # .env unless the caller pins encoding="utf-8".
            if self == env_path and encoding != "utf-8":
                raise UnicodeDecodeError(
                    "gbk", b"\x94", 0, 1, "illegal multibyte sequence"
                )
            return orig_read_text(self, encoding=encoding, errors=errors, **kwargs)
        monkeypatch.setattr(pathlib.Path, "read_text", gbk_like_read_text)
        # Short-circuit the expensive tool-availability probe — we only
        # need doctor to reach the .env read without crashing.
        fake_model_tools = types.SimpleNamespace(
            check_tool_availability=lambda *a, **kw: (_ for _ in ()).throw(SystemExit(0)),
            TOOLSET_REQUIREMENTS={},
        )
        monkeypatch.setitem(sys.modules, "model_tools", fake_model_tools)
        # Run doctor. If the .env read still uses locale encoding, this
        # raises UnicodeDecodeError and the test fails.
        with pytest.raises(SystemExit):
            doctor_mod.run_doctor(Namespace(fix=False))
 class TestDoctorToolAvailabilityOverrides:
    def test_marks_honcho_available_when_configured(self, monkeypatch):
        monkeypatch.setattr(doctor, "_honcho_is_configured_for_doctor", lambda: True)