fix(ci): stabilize current main test regressions

This commit is contained in:
Stephen Schoettler 2026-04-30 06:36:50 -07:00
parent 9a14540603
commit 407dfbb021
3 changed files with 55 additions and 27 deletions

View File

@ -216,7 +216,7 @@ class TestSyncBackConflict:
class TestSyncBackRetries:
"""Retry behaviour with exponential backoff."""
@patch("tools.environments.file_sync.time.sleep")
@patch("tools.environments.file_sync._sleep")
def test_sync_back_retries_on_failure(self, mock_sleep, tmp_path):
call_count = 0
@ -237,7 +237,7 @@ class TestSyncBackRetries:
mock_sleep.assert_any_call(_SYNC_BACK_BACKOFF[0])
mock_sleep.assert_any_call(_SYNC_BACK_BACKOFF[1])
@patch("tools.environments.file_sync.time.sleep")
@patch("tools.environments.file_sync._sleep")
def test_sync_back_all_retries_exhausted(self, mock_sleep, tmp_path, caplog):
def always_fail(dest: Path):
raise RuntimeError("persistent failure")

View File

@ -29,6 +29,12 @@ from tools.environments.base import _file_mtime_key
logger = logging.getLogger(__name__)
# Keep retry sleeps patchable without mutating the shared stdlib ``time``
# module. Patching ``tools.environments.file_sync.time.sleep`` replaces
# ``time.sleep`` globally because ``time`` is the module object; under xdist
# that lets unrelated background threads inflate retry-test call counts.
_sleep = time.sleep
_SYNC_INTERVAL_SECONDS = 5.0
_FORCE_SYNC_ENV = "HERMES_FORCE_FILE_SYNC"
@ -243,7 +249,7 @@ class FileSyncManager:
"sync_back: attempt %d failed (%s), retrying in %ds",
attempt + 1, exc, delay,
)
time.sleep(delay)
_sleep(delay)
logger.warning("sync_back: all %d attempts failed: %s", _SYNC_BACK_MAX_RETRIES, last_exc)

View File

@ -383,6 +383,36 @@ class LocalEnvironment(BaseEnvironment):
def _kill_process(self, proc):
"""Kill the entire process group (all children)."""
def _group_alive(pgid: int) -> bool:
try:
# POSIX-only: _IS_WINDOWS is handled before this helper is used.
os.killpg(pgid, 0)
return True
except ProcessLookupError:
return False
except PermissionError:
# The group exists, even if this process cannot signal it.
return True
def _wait_for_group_exit(pgid: int, timeout: float) -> bool:
deadline = time.monotonic() + timeout
while time.monotonic() < deadline:
# Reap the wrapper promptly. A dead but unreaped group leader
# still makes killpg(pgid, 0) report the group as alive.
try:
proc.poll()
except Exception:
pass
if not _group_alive(pgid):
return True
time.sleep(0.05)
try:
proc.poll()
except Exception:
pass
return not _group_alive(pgid)
try:
if _IS_WINDOWS:
proc.terminate()
@ -393,37 +423,29 @@ class LocalEnvironment(BaseEnvironment):
pgid = getattr(proc, "_hermes_pgid", None)
if pgid is None:
raise
os.killpg(pgid, signal.SIGTERM)
deadline = time.monotonic() + 1.0
while time.monotonic() < deadline:
if proc.poll() is not None:
try:
os.killpg(pgid, 0)
except ProcessLookupError:
return
time.sleep(0.05)
# The shell can exit quickly while a child in the same process
# group is still shutting down. Escalate based on the process
# group, not just the shell wrapper, so interrupted commands do
# not leave orphaned grandchildren under load.
try:
# _IS_WINDOWS is guarded by the enclosing else branch.
os.killpg(pgid, signal.SIGTERM)
except ProcessLookupError:
return
# Wait on the process group, not just the shell wrapper. Under
# load the wrapper can exit before grandchildren do; returning
# at that point leaves orphaned process-group members behind.
if _wait_for_group_exit(pgid, 1.0):
return
try:
# POSIX-only: _IS_WINDOWS is handled by the outer branch.
os.killpg(pgid, signal.SIGKILL)
except ProcessLookupError:
return
_wait_for_group_exit(pgid, 2.0)
try:
proc.wait(timeout=1.0)
except subprocess.TimeoutExpired:
proc.wait(timeout=0.2)
except (subprocess.TimeoutExpired, OSError):
pass
deadline = time.monotonic() + 1.0
while time.monotonic() < deadline:
try:
os.killpg(pgid, 0)
except ProcessLookupError:
return
time.sleep(0.05)
except (ProcessLookupError, PermissionError):
except (ProcessLookupError, PermissionError, OSError):
try:
proc.kill()
except Exception: