From 4f57d7116d9f497b9ebf5b9e378f0bbec136567f Mon Sep 17 00:00:00 2001 From: teknium1 Date: Thu, 19 Feb 2026 09:24:04 -0800 Subject: [PATCH] Improved stdout handling in the terminal tool to prevent deadlocks by implementing a background thread to continuously drain output, ensuring smooth command execution without blocking. --- TODO.md | 11 +++++++++++ tools/terminal_tool.py | 37 ++++++++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/TODO.md b/TODO.md index 8cee076e..0146734c 100644 --- a/TODO.md +++ b/TODO.md @@ -377,6 +377,17 @@ Both default to `false` in batch_runner and RL environments (checked programmati The bounded memory is the curated layer. For unbounded "long-term memory" -- searching past session transcripts -- see SQLite State Store (#5). A separate `session_search` tool provides ripgrep-style search over the full session history. This is never injected into the system prompt; the agent searches it on demand. +### Known bug: duplicate entries are irrecoverable + +If the same content gets added twice (e.g., agent retries after a failed response generation and re-saves a memory it already saved), both `replace` and `remove` fail with "Multiple entries matched" because content-based matching can't disambiguate identical entries. The user/agent has no way to fix this without manually editing the file. + +**Possible fixes (pick one or combine):** +- **Prevent on add**: reject or silently deduplicate when `content` exactly matches an existing entry. Cheapest fix, prevents the problem entirely. +- **Remove-all flag**: `remove(target, old_text, remove_all=True)` deletes every match. Lets the agent nuke all copies, then re-add one if needed. +- **Index-based fallback**: when multiple matches are found, return them with indices and accept `index` parameter on retry. More complex, but most precise. + +Simplest path: deduplicate on add (check before appending). One-line fix in `memory_tool.py`. + ### Later (optional) - Periodic consolidation via cronjob/subagent: reviews recent session summaries, suggests memory updates. Needs subagents (#1). - Memory import/export: `hermes memory export` / `hermes memory import` for backup/migration. diff --git a/tools/terminal_tool.py b/tools/terminal_tool.py index da2d483d..ae379228 100644 --- a/tools/terminal_tool.py +++ b/tools/terminal_tool.py @@ -642,6 +642,11 @@ class _LocalEnvironment: Uses Popen + polling so the global interrupt event can kill the process early when the user sends a new message, instead of blocking for the full timeout. + + A background reader thread drains stdout continuously to prevent + pipe buffer deadlocks. Without this, commands producing >64KB of + output would block (Linux pipe buffer = 64KB) while the poll loop + waits for the process to finish — a classic deadlock. """ work_dir = cwd or self.cwd or os.getcwd() effective_timeout = timeout or self.timeout @@ -665,6 +670,25 @@ class _LocalEnvironment: preexec_fn=os.setsid, ) + # Drain stdout in a background thread to prevent pipe buffer + # deadlocks. The OS pipe buffer is 64KB on Linux; if the child + # writes more than that before anyone reads, it blocks forever. + _output_chunks: list[str] = [] + def _drain_stdout(): + try: + for line in proc.stdout: + _output_chunks.append(line) + except ValueError: + pass # stdout closed during interrupt/timeout + finally: + try: + proc.stdout.close() + except Exception: + pass + + reader = threading.Thread(target=_drain_stdout, daemon=True) + reader.start() + deadline = time.monotonic() + effective_timeout # Poll every 200ms so we notice interrupts quickly @@ -676,9 +700,8 @@ class _LocalEnvironment: os.killpg(os.getpgid(proc.pid), signal.SIGTERM) except (ProcessLookupError, PermissionError): proc.kill() - # Grab any partial output - partial, _ = proc.communicate(timeout=2) - output = partial or "" + reader.join(timeout=2) + output = "".join(_output_chunks) return { "output": output + "\n[Command interrupted — user sent a new message]", "returncode": 130 # Standard interrupted exit code @@ -690,15 +713,15 @@ class _LocalEnvironment: os.killpg(os.getpgid(proc.pid), signal.SIGTERM) except (ProcessLookupError, PermissionError): proc.kill() - proc.communicate(timeout=2) + reader.join(timeout=2) return {"output": f"Command timed out after {effective_timeout}s", "returncode": 124} # Short sleep to avoid busy-waiting time.sleep(0.2) - # Process finished normally — read all output - stdout, _ = proc.communicate() - return {"output": stdout or "", "returncode": proc.returncode} + # Process finished — wait for reader to drain remaining output + reader.join(timeout=5) + return {"output": "".join(_output_chunks), "returncode": proc.returncode} except Exception as e: return {"output": f"Execution error: {str(e)}", "returncode": 1}