fix(cli): decode .env as UTF-8 to avoid GBK crash on Windows
Path.read_text() uses the system locale encoding by default. On Windows CN/JP/KR locales (GBK/CP932/CP949), reading a UTF-8 .env can raise UnicodeDecodeError (or silently mis-decode the text) as soon as it contains non-ASCII bytes (e.g. an em dash). Pin encoding="utf-8" on every .env read in hermes_cli to match how the rest of the codebase (load_dotenv at doctor.py:26) already decodes it. Adds a regression test that monkeypatches Path.read_text to simulate a GBK locale and asserts 'hermes doctor' no longer raises. Refs #18637
This commit is contained in:
parent
e2cea6eeba
commit
c5e3a6fb5b
@ -263,8 +263,11 @@ def run_doctor(args):
|
|||||||
if env_path.exists():
|
if env_path.exists():
|
||||||
check_ok(f"{_DHH}/.env file exists")
|
check_ok(f"{_DHH}/.env file exists")
|
||||||
|
|
||||||
# Check for common issues
|
# Check for common issues. Pin encoding to UTF-8 because .env files are
|
||||||
content = env_path.read_text()
|
# written as UTF-8 everywhere in the codebase, while Path.read_text()
|
||||||
|
# defaults to the system locale — which crashes on non-UTF-8 Windows
|
||||||
|
# locales (e.g. GBK) as soon as the file contains any non-ASCII byte.
|
||||||
|
content = env_path.read_text(encoding="utf-8")
|
||||||
if _has_provider_env_config(content):
|
if _has_provider_env_config(content):
|
||||||
check_ok("API key or custom endpoint configured")
|
check_ok("API key or custom endpoint configured")
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -289,7 +289,7 @@ def _has_any_provider_configured() -> bool:
|
|||||||
env_file = get_env_path()
|
env_file = get_env_path()
|
||||||
if env_file.exists():
|
if env_file.exists():
|
||||||
try:
|
try:
|
||||||
for line in env_file.read_text().splitlines():
|
for line in env_file.read_text(encoding="utf-8").splitlines():
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if line.startswith("#") or "=" not in line:
|
if line.startswith("#") or "=" not in line:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@ -361,7 +361,7 @@ def _write_env_vars(env_path: Path, env_writes: dict) -> None:
|
|||||||
|
|
||||||
existing_lines = []
|
existing_lines = []
|
||||||
if env_path.exists():
|
if env_path.exists():
|
||||||
existing_lines = env_path.read_text().splitlines()
|
existing_lines = env_path.read_text(encoding="utf-8").splitlines()
|
||||||
|
|
||||||
updated_keys = set()
|
updated_keys = set()
|
||||||
new_lines = []
|
new_lines = []
|
||||||
|
|||||||
@ -51,6 +51,57 @@ class TestProviderEnvDetection:
|
|||||||
assert not _has_provider_env_config(content)
|
assert not _has_provider_env_config(content)
|
||||||
|
|
||||||
|
|
||||||
|
class TestDoctorEnvFileEncoding:
|
||||||
|
"""Regression for #18637 (bug 3): `hermes doctor` crashed on Windows
|
||||||
|
Chinese locale (GBK) because `.env` was read with Path.read_text() which
|
||||||
|
defaults to the system locale encoding, not UTF-8."""
|
||||||
|
|
||||||
|
def test_doctor_reads_env_as_utf8_even_when_locale_is_not_utf8(
|
||||||
|
self, monkeypatch, tmp_path
|
||||||
|
):
|
||||||
|
import pathlib
|
||||||
|
|
||||||
|
hermes_home = tmp_path / ".hermes"
|
||||||
|
hermes_home.mkdir()
|
||||||
|
# Write a UTF-8 .env containing an em dash (U+2014 = e2 80 94). The
|
||||||
|
# 0x94 byte is exactly the one the issue reporter hit: GBK reads it as a
|
||||||
|
# lead byte with no valid trail byte after it, so locale-default reads
|
||||||
|
# raise UnicodeDecodeError on Chinese Windows.
|
||||||
|
env_path = hermes_home / ".env"
|
||||||
|
env_path.write_text(
|
||||||
|
"OPENAI_API_KEY=sk-test # em-dash here — should not crash\n",
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
monkeypatch.setattr(doctor_mod, "HERMES_HOME", hermes_home)
|
||||||
|
|
||||||
|
orig_read_text = pathlib.Path.read_text
|
||||||
|
|
||||||
|
def gbk_like_read_text(self, encoding=None, errors=None, **kwargs):
|
||||||
|
# Simulate a GBK locale: refuse to decode this specific UTF-8
|
||||||
|
# .env unless the caller pins encoding="utf-8".
|
||||||
|
if self == env_path and encoding != "utf-8":
|
||||||
|
raise UnicodeDecodeError(
|
||||||
|
"gbk", b"\x94", 0, 1, "illegal multibyte sequence"
|
||||||
|
)
|
||||||
|
return orig_read_text(self, encoding=encoding, errors=errors, **kwargs)
|
||||||
|
|
||||||
|
monkeypatch.setattr(pathlib.Path, "read_text", gbk_like_read_text)
|
||||||
|
|
||||||
|
# Short-circuit the expensive tool-availability probe — we only
|
||||||
|
# need doctor to reach the .env read without crashing.
|
||||||
|
fake_model_tools = types.SimpleNamespace(
|
||||||
|
check_tool_availability=lambda *a, **kw: (_ for _ in ()).throw(SystemExit(0)),
|
||||||
|
TOOLSET_REQUIREMENTS={},
|
||||||
|
)
|
||||||
|
monkeypatch.setitem(sys.modules, "model_tools", fake_model_tools)
|
||||||
|
|
||||||
|
# Run doctor. If the .env read still uses locale encoding, this
|
||||||
|
# raises UnicodeDecodeError and the test fails.
|
||||||
|
with pytest.raises(SystemExit):
|
||||||
|
doctor_mod.run_doctor(Namespace(fix=False))
|
||||||
|
|
||||||
|
|
||||||
class TestDoctorToolAvailabilityOverrides:
|
class TestDoctorToolAvailabilityOverrides:
|
||||||
def test_marks_honcho_available_when_configured(self, monkeypatch):
|
def test_marks_honcho_available_when_configured(self, monkeypatch):
|
||||||
monkeypatch.setattr(doctor, "_honcho_is_configured_for_doctor", lambda: True)
|
monkeypatch.setattr(doctor, "_honcho_is_configured_for_doctor", lambda: True)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user