feat(checkpoints): auto-prune orphan and stale shadow repos at startup (#16303)

Every working dir hermes ever touches gets its own shadow git repo under
~/.hermes/checkpoints/{sha256(abs_dir)[:16]}/.  The per-repo _prune is a
no-op (comment in CheckpointManager._prune says so), so abandoned repos
from deleted/moved projects or one-off tmp dirs pile up forever.  Field
reports put the typical offender at 1000+ repos / ~12 GB on active
contributor machines.

Adds an opt-in startup sweep that mirrors the sessions.auto_prune
pattern from #13861 / #16286:

- tools/checkpoint_manager.py: new prune_checkpoints() and
  maybe_auto_prune_checkpoints() helpers.  Deletes shadow repos that
  are orphan (HERMES_WORKDIR marker points to a path that no longer
  exists) or stale (newest in-repo mtime older than retention_days).
  Idempotent via a CHECKPOINT_BASE/.last_prune marker file so it only
  runs once per min_interval_hours regardless of how many hermes
  processes start up.
- hermes_cli/config.py: new checkpoints.auto_prune /
  retention_days / delete_orphans / min_interval_hours knobs.
  Default auto_prune: false so users who rely on /rollback against
  long-ago sessions never lose data silently.
- cli.py / gateway/run.py: startup hooks gated on checkpoints.auto_prune,
  called right next to the existing state.db maintenance block.
- Docs updated with the new config knobs.
- 11 regression tests: orphan/stale deletion, precedence, byte-freed
  tracking, non-shadow dir skip, interval gating, corrupt marker
  recovery.

Refs #3015 (session-file disk growth was fixed in #16286; this covers
the checkpoint side noted out-of-scope there).
This commit is contained in:
Teknium 2026-04-26 19:05:52 -07:00 committed by GitHub
parent ced8f44cd2
commit 478444c262
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 458 additions and 0 deletions

28
cli.py
View File

@ -988,6 +988,29 @@ def _run_state_db_auto_maintenance(session_db) -> None:
logger.debug("state.db auto-maintenance skipped: %s", exc)
def _run_checkpoint_auto_maintenance() -> None:
    """Run the opt-in checkpoint shadow-repo sweep when config enables it.

    Loads the ``checkpoints:`` section of config.yaml via
    :func:`hermes_cli.config.load_config` and, when ``auto_prune`` is set,
    forwards ``retention_days`` / ``delete_orphans`` / ``min_interval_hours``
    to :func:`tools.checkpoint_manager.maybe_auto_prune_checkpoints`.
    Never raises — maintenance must never block interactive startup.
    """
    try:
        from hermes_cli.config import load_config as _load_full_config

        section = _load_full_config().get("checkpoints") or {}
        if section.get("auto_prune", False):
            # Import lazily so disabled installs pay no import cost.
            from tools.checkpoint_manager import maybe_auto_prune_checkpoints

            maybe_auto_prune_checkpoints(
                retention_days=int(section.get("retention_days", 7)),
                min_interval_hours=int(section.get("min_interval_hours", 24)),
                delete_orphans=bool(section.get("delete_orphans", True)),
            )
    except Exception as exc:
        logger.debug("checkpoint auto-maintenance skipped: %s", exc)
def _prune_stale_worktrees(repo_root: str, max_age_hours: int = 24) -> None:
"""Remove stale worktrees and orphaned branches on startup.
@ -2054,6 +2077,11 @@ class HermesCLI:
# Never blocks startup on failure.
_run_state_db_auto_maintenance(self._session_db)
# Opportunistic shadow-repo cleanup — deletes orphan/stale
# checkpoint repos under ~/.hermes/checkpoints/. Opt-in via
# checkpoints.auto_prune, idempotent via .last_prune marker.
_run_checkpoint_auto_maintenance()
# Deferred title: stored in memory until the session is created in the DB
self._pending_title: Optional[str] = None

View File

@ -768,6 +768,22 @@ class GatewayRunner:
except Exception as exc:
logger.debug("state.db auto-maintenance skipped: %s", exc)
# Opportunistic shadow-repo cleanup — deletes orphan/stale
# checkpoint repos under ~/.hermes/checkpoints/. Opt-in via
# checkpoints.auto_prune, idempotent via .last_prune marker.
try:
from hermes_cli.config import load_config as _load_full_config
_ckpt_cfg = (_load_full_config().get("checkpoints") or {})
if _ckpt_cfg.get("auto_prune", False):
from tools.checkpoint_manager import maybe_auto_prune_checkpoints
maybe_auto_prune_checkpoints(
retention_days=int(_ckpt_cfg.get("retention_days", 7)),
min_interval_hours=int(_ckpt_cfg.get("min_interval_hours", 24)),
delete_orphans=bool(_ckpt_cfg.get("delete_orphans", True)),
)
except Exception as exc:
logger.debug("checkpoint auto-maintenance skipped: %s", exc)
# DM pairing store for code-based user authorization
from gateway.pairing import PairingStore
self.pairing_store = PairingStore()

View File

@ -487,6 +487,19 @@ DEFAULT_CONFIG = {
"checkpoints": {
"enabled": True,
"max_snapshots": 50, # Max checkpoints to keep per directory
# Auto-maintenance: shadow repos accumulate forever under
# ~/.hermes/checkpoints/ (one per cd'd working directory). Field
# reports put the typical offender at 1000+ repos / ~12 GB. When
# auto_prune is on, hermes sweeps at startup (at most once per
# min_interval_hours) and deletes:
# * orphan repos: HERMES_WORKDIR no longer exists on disk
# * stale repos: newest mtime older than retention_days
# Opt-in so users who rely on /rollback against long-ago sessions
# never lose data silently.
"auto_prune": False,
"retention_days": 7,
"delete_orphans": True,
"min_interval_hours": 24,
},
# Maximum characters returned by a single read_file call. Reads that

View File

@ -717,3 +717,193 @@ class TestGpgAndGlobalConfigIsolation:
mgr = CheckpointManager(enabled=True)
assert mgr.ensure_checkpoint(str(work_dir), reason="prefix-shadow") is True
assert len(mgr.list_checkpoints(str(work_dir))) == 1
# =========================================================================
# Auto-maintenance: prune_checkpoints + maybe_auto_prune_checkpoints
# =========================================================================
class TestPruneCheckpoints:
    """Sweep orphan/stale shadow repos under CHECKPOINT_BASE (issue #3015 follow-up)."""

    def _seed_shadow_repo(
        self, base: Path, dir_hash: str, workdir: Path, mtime=None
    ) -> Path:
        """Create a minimal shadow repo on disk without invoking real git.

        ``mtime`` (epoch seconds, or None) backdates every entry so the
        repo reads as stale to mtime-based sweeps.
        """
        # Fix: imports were duplicated inside the utime loop and an unused
        # ``import time as _time`` shadowed intent; hoisted once here.
        import os

        shadow = base / dir_hash
        shadow.mkdir(parents=True)
        (shadow / "HEAD").write_text("ref: refs/heads/main\n")
        (shadow / "HERMES_WORKDIR").write_text(str(workdir) + "\n")
        (shadow / "info").mkdir()
        (shadow / "info" / "exclude").write_text("node_modules/\n")
        if mtime is not None:
            # Backdate children first, then the repo dir itself.
            for p in shadow.rglob("*"):
                os.utime(p, (mtime, mtime))
            os.utime(shadow, (mtime, mtime))
        return shadow

    def test_deletes_orphan_when_workdir_missing(self, tmp_path):
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        alive_work = tmp_path / "alive"
        alive_work.mkdir()
        alive_repo = self._seed_shadow_repo(base, "aaaa" * 4, alive_work)
        orphan_repo = self._seed_shadow_repo(
            base, "bbbb" * 4, tmp_path / "was-deleted"
        )
        result = prune_checkpoints(retention_days=0, checkpoint_base=base)
        assert result["scanned"] == 2
        assert result["deleted_orphan"] == 1
        assert result["deleted_stale"] == 0
        assert alive_repo.exists()
        assert not orphan_repo.exists()

    def test_deletes_stale_by_mtime_when_workdir_alive(self, tmp_path):
        from tools.checkpoint_manager import prune_checkpoints
        import time as _time

        base = tmp_path / "checkpoints"
        work = tmp_path / "work"
        work.mkdir()
        fresh_repo = self._seed_shadow_repo(base, "cccc" * 4, work)
        stale_work = tmp_path / "stale_work"
        stale_work.mkdir()
        old = _time.time() - 60 * 86400  # 60 days ago
        stale_repo = self._seed_shadow_repo(base, "dddd" * 4, stale_work, mtime=old)
        result = prune_checkpoints(
            retention_days=30, delete_orphans=False, checkpoint_base=base
        )
        assert result["deleted_orphan"] == 0
        assert result["deleted_stale"] == 1
        assert fresh_repo.exists()
        assert not stale_repo.exists()

    def test_orphan_takes_priority_over_stale(self, tmp_path):
        """Orphan detection counts first — reason="orphan" even if also stale."""
        from tools.checkpoint_manager import prune_checkpoints
        import time as _time

        base = tmp_path / "checkpoints"
        old = _time.time() - 60 * 86400
        self._seed_shadow_repo(base, "eeee" * 4, tmp_path / "gone", mtime=old)
        result = prune_checkpoints(retention_days=30, checkpoint_base=base)
        assert result["deleted_orphan"] == 1
        assert result["deleted_stale"] == 0

    def test_delete_orphans_disabled_keeps_orphans(self, tmp_path):
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        orphan = self._seed_shadow_repo(base, "ffff" * 4, tmp_path / "gone")
        result = prune_checkpoints(
            retention_days=0, delete_orphans=False, checkpoint_base=base
        )
        assert result["deleted_orphan"] == 0
        assert orphan.exists()

    def test_skips_non_shadow_dirs(self, tmp_path):
        """Dirs without HEAD (non-initialised) are left alone."""
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        base.mkdir()
        (base / "garbage-dir").mkdir()
        (base / "garbage-dir" / "random.txt").write_text("hi")
        result = prune_checkpoints(retention_days=0, checkpoint_base=base)
        assert result["scanned"] == 0
        assert (base / "garbage-dir").exists()

    def test_tracks_bytes_freed(self, tmp_path):
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        orphan = self._seed_shadow_repo(base, "1234" * 4, tmp_path / "gone")
        (orphan / "objects").mkdir()
        (orphan / "objects" / "pack.bin").write_bytes(b"x" * 5000)
        result = prune_checkpoints(retention_days=0, checkpoint_base=base)
        assert result["deleted_orphan"] == 1
        assert result["bytes_freed"] >= 5000

    def test_base_missing_returns_empty_counts(self, tmp_path):
        from tools.checkpoint_manager import prune_checkpoints

        result = prune_checkpoints(checkpoint_base=tmp_path / "does-not-exist")
        assert result == {
            "scanned": 0, "deleted_orphan": 0, "deleted_stale": 0,
            "errors": 0, "bytes_freed": 0,
        }
class TestMaybeAutoPruneCheckpoints:
    """Interval-gated wrapper behaviour: marker file short-circuits repeat sweeps."""

    def _seed(self, base, dir_hash, workdir):
        """Drop a minimal shadow repo (HEAD + workdir marker) under *base*."""
        base.mkdir(parents=True, exist_ok=True)
        repo = base / dir_hash
        repo.mkdir()
        (repo / "HEAD").write_text("ref: refs/heads/main\n")
        (repo / "HERMES_WORKDIR").write_text(str(workdir) + "\n")
        return repo

    def test_first_call_prunes_and_writes_marker(self, tmp_path):
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        base = tmp_path / "checkpoints"
        self._seed(base, "0000" * 4, tmp_path / "gone")
        outcome = maybe_auto_prune_checkpoints(checkpoint_base=base)
        assert outcome["skipped"] is False
        assert outcome["result"]["deleted_orphan"] == 1
        assert (base / ".last_prune").exists()

    def test_second_call_within_interval_skips(self, tmp_path):
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        base = tmp_path / "checkpoints"
        self._seed(base, "1111" * 4, tmp_path / "gone")
        initial = maybe_auto_prune_checkpoints(
            checkpoint_base=base, min_interval_hours=24
        )
        assert initial["skipped"] is False
        self._seed(base, "2222" * 4, tmp_path / "also-gone")
        repeat = maybe_auto_prune_checkpoints(
            checkpoint_base=base, min_interval_hours=24
        )
        assert repeat["skipped"] is True
        # The second orphan must still exist — skip was honoured.
        assert (base / ("2222" * 4)).exists()

    def test_corrupt_marker_treated_as_no_prior_run(self, tmp_path):
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        base = tmp_path / "checkpoints"
        base.mkdir()
        (base / ".last_prune").write_text("not-a-timestamp")
        self._seed(base, "3333" * 4, tmp_path / "gone")
        outcome = maybe_auto_prune_checkpoints(checkpoint_base=base)
        assert outcome["skipped"] is False
        assert outcome["result"]["deleted_orphan"] == 1

    def test_missing_base_no_raise(self, tmp_path):
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        outcome = maybe_auto_prune_checkpoints(
            checkpoint_base=tmp_path / "does-not-exist"
        )
        assert outcome["skipped"] is False
        assert outcome["result"]["scanned"] == 0

View File

@ -651,3 +651,204 @@ def format_checkpoint_list(checkpoints: List[Dict], directory: str) -> str:
lines.append(" /rollback diff <N> preview changes since checkpoint N")
lines.append(" /rollback <N> <file> restore a single file from checkpoint N")
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Auto-maintenance (issue #3015 follow-up)
# ---------------------------------------------------------------------------
#
# Every working directory the agent has ever touched gets its own shadow
# repo under CHECKPOINT_BASE. Per-repo ``_prune`` is a no-op (see comment
# in CheckpointManager._prune), so abandoned repos (deleted projects,
# one-off tmp dirs, long-stale work trees) accumulate forever. Field
# reports put the typical offender at 1000+ repos / ~12 GB on active
# contributor machines.
#
# ``prune_checkpoints`` sweeps CHECKPOINT_BASE at startup, deleting shadow
# repos that match either criterion:
# * orphan: the ``HERMES_WORKDIR`` path no longer exists on disk
# * stale: the repo's newest mtime is older than ``retention_days``
#
# ``maybe_auto_prune_checkpoints`` wraps it with an idempotency marker
# (``CHECKPOINT_BASE/.last_prune``) so calling it on every CLI/gateway
# startup is free after the first run of the day. Opt-in via
# ``checkpoints.auto_prune`` in config.yaml — default off so users who
# rely on ``/rollback`` against long-ago sessions never lose data
# silently.
# Timestamp marker file written under the checkpoint base dir; gates how
# often maybe_auto_prune_checkpoints actually sweeps.
_PRUNE_MARKER_NAME = ".last_prune"
def _read_workdir_marker(shadow_repo: Path) -> Optional[str]:
"""Read ``HERMES_WORKDIR`` from a shadow repo, or None if missing/unreadable."""
try:
return (shadow_repo / "HERMES_WORKDIR").read_text(encoding="utf-8").strip()
except (OSError, UnicodeDecodeError):
return None
def _shadow_repo_newest_mtime(shadow_repo: Path) -> float:
"""Return newest mtime across the shadow repo (walks objects/refs/HEAD).
We walk instead of trusting the directory mtime because git's pack
operations can leave the top-level dir untouched while refs/objects
inside get updated. Best-effort returns 0.0 on any error.
"""
newest = 0.0
try:
for p in shadow_repo.rglob("*"):
try:
m = p.stat().st_mtime
if m > newest:
newest = m
except OSError:
continue
except OSError:
pass
return newest
def prune_checkpoints(
    retention_days: int = 7,
    delete_orphans: bool = True,
    checkpoint_base: Optional[Path] = None,
) -> Dict[str, int]:
    """Sweep ``checkpoint_base`` and delete orphan/stale shadow repos.

    A shadow repo is removed when ``delete_orphans`` is on and its
    ``HERMES_WORKDIR`` path no longer exists on disk (the original
    project was deleted or moved), or when its newest in-repo mtime is
    older than ``retention_days`` days. Orphan detection wins when both
    apply.

    Returns counts ``{"scanned", "deleted_orphan", "deleted_stale",
    "errors", "bytes_freed"}``. Never raises — maintenance must never
    block interactive startup.
    """
    base = checkpoint_base or CHECKPOINT_BASE
    stats = {
        "scanned": 0,
        "deleted_orphan": 0,
        "deleted_stale": 0,
        "errors": 0,
        "bytes_freed": 0,
    }
    if not base.exists():
        return stats

    stale_cutoff = 0.0
    if retention_days > 0:
        import time as _time

        stale_cutoff = _time.time() - retention_days * 86400

    for repo in base.iterdir():
        # Candidates are directories carrying a HEAD file; the .last_prune
        # marker is a plain file and uninitialised dirs are left alone.
        if not repo.is_dir() or not (repo / "HEAD").exists():
            continue
        stats["scanned"] += 1

        reason = None
        if delete_orphans:
            workdir = _read_workdir_marker(repo)
            if workdir is None or not Path(workdir).exists():
                reason = "orphan"
        if reason is None and retention_days > 0:
            newest = _shadow_repo_newest_mtime(repo)
            # newest == 0.0 means "could not read any mtime" — keep it.
            if 0 < newest < stale_cutoff:
                reason = "stale"
        if reason is None:
            continue

        # Measure the size before deleting (best-effort).
        try:
            freed = sum(f.stat().st_size for f in repo.rglob("*") if f.is_file())
        except OSError:
            freed = 0
        try:
            shutil.rmtree(repo)
        except OSError as exc:
            stats["errors"] += 1
            logger.warning("Failed to prune checkpoint repo %s: %s", repo.name, exc)
        else:
            stats["bytes_freed"] += freed
            stats["deleted_orphan" if reason == "orphan" else "deleted_stale"] += 1
            logger.debug("Pruned %s checkpoint repo: %s (%d bytes)", reason, repo.name, freed)
    return stats
def maybe_auto_prune_checkpoints(
    retention_days: int = 7,
    min_interval_hours: int = 24,
    delete_orphans: bool = True,
    checkpoint_base: Optional[Path] = None,
) -> Dict[str, object]:
    """Rate-limited ``prune_checkpoints`` wrapper for startup hooks.

    A ``.last_prune`` timestamp file under the base directory makes the
    sweep run at most once per ``min_interval_hours``, regardless of how
    many CLI/gateway processes start. A corrupt marker is treated as "no
    prior run". Returns ``{"skipped": bool,
    "result": <prune_checkpoints dict>, "error": optional str}``.
    """
    import time as _time

    base = checkpoint_base or CHECKPOINT_BASE
    outcome: Dict[str, object] = {"skipped": False}
    try:
        if not base.exists():
            # Nothing to sweep — report zeroed counts without touching disk.
            outcome["result"] = {
                "scanned": 0, "deleted_orphan": 0, "deleted_stale": 0,
                "errors": 0, "bytes_freed": 0,
            }
            return outcome

        marker = base / _PRUNE_MARKER_NAME
        now = _time.time()
        if marker.exists():
            try:
                previous = float(marker.read_text(encoding="utf-8").strip())
            except (OSError, ValueError):
                previous = None  # corrupt marker — behave as if never run
            if previous is not None and now - previous < min_interval_hours * 3600:
                outcome["skipped"] = True
                return outcome

        stats = prune_checkpoints(
            retention_days=retention_days,
            delete_orphans=delete_orphans,
            checkpoint_base=base,
        )
        outcome["result"] = stats
        try:
            marker.write_text(str(now), encoding="utf-8")
        except OSError as exc:
            logger.debug("Could not write checkpoint prune marker: %s", exc)
        removed = stats["deleted_orphan"] + stats["deleted_stale"]
        if removed > 0:
            logger.info(
                "checkpoint auto-maintenance: pruned %d repo(s) "
                "(%d orphan, %d stale), reclaimed %.1f MB",
                removed,
                stats["deleted_orphan"],
                stats["deleted_stale"],
                stats["bytes_freed"] / (1024 * 1024),
            )
    except Exception as exc:
        logger.warning("checkpoint auto-maintenance failed: %s", exc)
        outcome["error"] = str(exc)
    return outcome

View File

@ -64,6 +64,16 @@ Checkpoints are enabled by default. Configure in `~/.hermes/config.yaml`:
checkpoints:
enabled: true # master switch (default: true)
max_snapshots: 50 # max checkpoints per directory
# Auto-maintenance (opt-in): sweep ~/.hermes/checkpoints/ at startup
# and delete shadow repos whose working directory no longer exists
# (orphans) or whose newest commit is older than retention_days.
# Runs at most once per min_interval_hours, tracked via a
# .last_prune marker inside ~/.hermes/checkpoints/.
auto_prune: false # default off — enable to reclaim disk
retention_days: 7
delete_orphans: true # delete repos whose workdir is gone
min_interval_hours: 24
```
To disable: