feat(checkpoints): auto-prune orphan and stale shadow repos at startup (#16303)

Every working dir hermes ever touches gets its own shadow git repo under
~/.hermes/checkpoints/{sha256(abs_dir)[:16]}/.  The per-repo _prune is a
no-op (comment in CheckpointManager._prune says so), so abandoned repos
from deleted/moved projects or one-off tmp dirs pile up forever.  Field
reports put the typical offender at 1000+ repos / ~12 GB on active
contributor machines.

Adds an opt-in startup sweep that mirrors the sessions.auto_prune
pattern from #13861 / #16286:

- tools/checkpoint_manager.py: new prune_checkpoints() and
  maybe_auto_prune_checkpoints() helpers.  Deletes shadow repos that
  are orphan (HERMES_WORKDIR marker points to a path that no longer
  exists) or stale (newest in-repo mtime older than retention_days).
  Idempotent via a CHECKPOINT_BASE/.last_prune marker file so it only
  runs once per min_interval_hours regardless of how many hermes
  processes start up.
- hermes_cli/config.py: new checkpoints.auto_prune /
  retention_days / delete_orphans / min_interval_hours knobs.
  Default auto_prune: false so users who rely on /rollback against
  long-ago sessions never lose data silently.
- cli.py / gateway/run.py: startup hooks gated on checkpoints.auto_prune,
  called right next to the existing state.db maintenance block.
- Docs updated with the new config knobs.
- 11 regression tests: orphan/stale deletion, precedence, byte-freed
  tracking, non-shadow dir skip, interval gating, corrupt marker
  recovery.

Refs #3015 (session-file disk growth was fixed in #16286; this covers
the checkpoint side noted out-of-scope there).
This commit is contained in:
Teknium 2026-04-26 19:05:52 -07:00 committed by GitHub
parent ced8f44cd2
commit 478444c262
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 458 additions and 0 deletions

28
cli.py
View File

@ -988,6 +988,29 @@ def _run_state_db_auto_maintenance(session_db) -> None:
logger.debug("state.db auto-maintenance skipped: %s", exc)
def _run_checkpoint_auto_maintenance() -> None:
    """Run the opt-in checkpoint shadow-repo sweep when config enables it.

    Loads the ``checkpoints:`` section of config.yaml via
    :func:`hermes_cli.config.load_config` and, when ``auto_prune`` is set,
    forwards ``retention_days`` / ``delete_orphans`` / ``min_interval_hours``
    to :func:`tools.checkpoint_manager.maybe_auto_prune_checkpoints`.
    Never raises — maintenance must never block interactive startup.
    """
    try:
        from hermes_cli.config import load_config as _load_full_config

        section = _load_full_config().get("checkpoints") or {}
        if section.get("auto_prune", False):
            # Import lazily so disabled installs pay no import cost.
            from tools.checkpoint_manager import maybe_auto_prune_checkpoints

            maybe_auto_prune_checkpoints(
                retention_days=int(section.get("retention_days", 7)),
                min_interval_hours=int(section.get("min_interval_hours", 24)),
                delete_orphans=bool(section.get("delete_orphans", True)),
            )
    except Exception as exc:
        logger.debug("checkpoint auto-maintenance skipped: %s", exc)
def _prune_stale_worktrees(repo_root: str, max_age_hours: int = 24) -> None:
"""Remove stale worktrees and orphaned branches on startup.
@ -2054,6 +2077,11 @@ class HermesCLI:
# Never blocks startup on failure.
_run_state_db_auto_maintenance(self._session_db)
# Opportunistic shadow-repo cleanup — deletes orphan/stale
# checkpoint repos under ~/.hermes/checkpoints/. Opt-in via
# checkpoints.auto_prune, idempotent via .last_prune marker.
_run_checkpoint_auto_maintenance()
# Deferred title: stored in memory until the session is created in the DB
self._pending_title: Optional[str] = None

View File

@ -768,6 +768,22 @@ class GatewayRunner:
except Exception as exc:
logger.debug("state.db auto-maintenance skipped: %s", exc)
# Opportunistic shadow-repo cleanup — deletes orphan/stale
# checkpoint repos under ~/.hermes/checkpoints/. Opt-in via
# checkpoints.auto_prune, idempotent via .last_prune marker.
try:
from hermes_cli.config import load_config as _load_full_config
_ckpt_cfg = (_load_full_config().get("checkpoints") or {})
if _ckpt_cfg.get("auto_prune", False):
from tools.checkpoint_manager import maybe_auto_prune_checkpoints
maybe_auto_prune_checkpoints(
retention_days=int(_ckpt_cfg.get("retention_days", 7)),
min_interval_hours=int(_ckpt_cfg.get("min_interval_hours", 24)),
delete_orphans=bool(_ckpt_cfg.get("delete_orphans", True)),
)
except Exception as exc:
logger.debug("checkpoint auto-maintenance skipped: %s", exc)
# DM pairing store for code-based user authorization
from gateway.pairing import PairingStore
self.pairing_store = PairingStore()

View File

@ -487,6 +487,19 @@ DEFAULT_CONFIG = {
"checkpoints": {
"enabled": True,
"max_snapshots": 50, # Max checkpoints to keep per directory
# Auto-maintenance: shadow repos accumulate forever under
# ~/.hermes/checkpoints/ (one per cd'd working directory). Field
# reports put the typical offender at 1000+ repos / ~12 GB. When
# auto_prune is on, hermes sweeps at startup (at most once per
# min_interval_hours) and deletes:
# * orphan repos: HERMES_WORKDIR no longer exists on disk
# * stale repos: newest mtime older than retention_days
# Opt-in so users who rely on /rollback against long-ago sessions
# never lose data silently.
"auto_prune": False,
"retention_days": 7,
"delete_orphans": True,
"min_interval_hours": 24,
},
# Maximum characters returned by a single read_file call. Reads that

View File

@ -717,3 +717,193 @@ class TestGpgAndGlobalConfigIsolation:
mgr = CheckpointManager(enabled=True)
assert mgr.ensure_checkpoint(str(work_dir), reason="prefix-shadow") is True
assert len(mgr.list_checkpoints(str(work_dir))) == 1
# =========================================================================
# Auto-maintenance: prune_checkpoints + maybe_auto_prune_checkpoints
# =========================================================================
class TestPruneCheckpoints:
    """Sweep orphan/stale shadow repos under CHECKPOINT_BASE (issue #3015 follow-up)."""

    def _seed_shadow_repo(
        self, base: Path, dir_hash: str, workdir: Path, mtime=None
    ) -> Path:
        """Create a minimal shadow repo on disk without invoking real git.

        ``mtime`` (epoch seconds, or None) backdates every entry so the
        repo reads as stale to mtime-based sweeps.
        """
        # Fix: imports were duplicated inside the utime loop and an unused
        # ``import time as _time`` shadowed intent; hoisted once here.
        import os

        shadow = base / dir_hash
        shadow.mkdir(parents=True)
        (shadow / "HEAD").write_text("ref: refs/heads/main\n")
        (shadow / "HERMES_WORKDIR").write_text(str(workdir) + "\n")
        (shadow / "info").mkdir()
        (shadow / "info" / "exclude").write_text("node_modules/\n")
        if mtime is not None:
            # Backdate children first, then the repo dir itself.
            for p in shadow.rglob("*"):
                os.utime(p, (mtime, mtime))
            os.utime(shadow, (mtime, mtime))
        return shadow

    def test_deletes_orphan_when_workdir_missing(self, tmp_path):
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        alive_work = tmp_path / "alive"
        alive_work.mkdir()
        alive_repo = self._seed_shadow_repo(base, "aaaa" * 4, alive_work)
        orphan_repo = self._seed_shadow_repo(
            base, "bbbb" * 4, tmp_path / "was-deleted"
        )
        result = prune_checkpoints(retention_days=0, checkpoint_base=base)
        assert result["scanned"] == 2
        assert result["deleted_orphan"] == 1
        assert result["deleted_stale"] == 0
        assert alive_repo.exists()
        assert not orphan_repo.exists()

    def test_deletes_stale_by_mtime_when_workdir_alive(self, tmp_path):
        from tools.checkpoint_manager import prune_checkpoints
        import time as _time

        base = tmp_path / "checkpoints"
        work = tmp_path / "work"
        work.mkdir()
        fresh_repo = self._seed_shadow_repo(base, "cccc" * 4, work)
        stale_work = tmp_path / "stale_work"
        stale_work.mkdir()
        old = _time.time() - 60 * 86400  # 60 days ago
        stale_repo = self._seed_shadow_repo(base, "dddd" * 4, stale_work, mtime=old)
        result = prune_checkpoints(
            retention_days=30, delete_orphans=False, checkpoint_base=base
        )
        assert result["deleted_orphan"] == 0
        assert result["deleted_stale"] == 1
        assert fresh_repo.exists()
        assert not stale_repo.exists()

    def test_orphan_takes_priority_over_stale(self, tmp_path):
        """Orphan detection counts first — reason="orphan" even if also stale."""
        from tools.checkpoint_manager import prune_checkpoints
        import time as _time

        base = tmp_path / "checkpoints"
        old = _time.time() - 60 * 86400
        self._seed_shadow_repo(base, "eeee" * 4, tmp_path / "gone", mtime=old)
        result = prune_checkpoints(retention_days=30, checkpoint_base=base)
        assert result["deleted_orphan"] == 1
        assert result["deleted_stale"] == 0

    def test_delete_orphans_disabled_keeps_orphans(self, tmp_path):
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        orphan = self._seed_shadow_repo(base, "ffff" * 4, tmp_path / "gone")
        result = prune_checkpoints(
            retention_days=0, delete_orphans=False, checkpoint_base=base
        )
        assert result["deleted_orphan"] == 0
        assert orphan.exists()

    def test_skips_non_shadow_dirs(self, tmp_path):
        """Dirs without HEAD (non-initialised) are left alone."""
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        base.mkdir()
        (base / "garbage-dir").mkdir()
        (base / "garbage-dir" / "random.txt").write_text("hi")
        result = prune_checkpoints(retention_days=0, checkpoint_base=base)
        assert result["scanned"] == 0
        assert (base / "garbage-dir").exists()

    def test_tracks_bytes_freed(self, tmp_path):
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        orphan = self._seed_shadow_repo(base, "1234" * 4, tmp_path / "gone")
        (orphan / "objects").mkdir()
        (orphan / "objects" / "pack.bin").write_bytes(b"x" * 5000)
        result = prune_checkpoints(retention_days=0, checkpoint_base=base)
        assert result["deleted_orphan"] == 1
        assert result["bytes_freed"] >= 5000

    def test_base_missing_returns_empty_counts(self, tmp_path):
        from tools.checkpoint_manager import prune_checkpoints

        result = prune_checkpoints(checkpoint_base=tmp_path / "does-not-exist")
        assert result == {
            "scanned": 0, "deleted_orphan": 0, "deleted_stale": 0,
            "errors": 0, "bytes_freed": 0,
        }
class TestMaybeAutoPruneCheckpoints:
    """Interval-gated wrapper behaviour: marker file short-circuits repeat sweeps."""

    def _seed(self, base, dir_hash, workdir):
        """Drop a minimal shadow repo (HEAD + workdir marker) under *base*."""
        base.mkdir(parents=True, exist_ok=True)
        repo = base / dir_hash
        repo.mkdir()
        (repo / "HEAD").write_text("ref: refs/heads/main\n")
        (repo / "HERMES_WORKDIR").write_text(str(workdir) + "\n")
        return repo

    def test_first_call_prunes_and_writes_marker(self, tmp_path):
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        base = tmp_path / "checkpoints"
        self._seed(base, "0000" * 4, tmp_path / "gone")
        outcome = maybe_auto_prune_checkpoints(checkpoint_base=base)
        assert outcome["skipped"] is False
        assert outcome["result"]["deleted_orphan"] == 1
        assert (base / ".last_prune").exists()

    def test_second_call_within_interval_skips(self, tmp_path):
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        base = tmp_path / "checkpoints"
        self._seed(base, "1111" * 4, tmp_path / "gone")
        initial = maybe_auto_prune_checkpoints(
            checkpoint_base=base, min_interval_hours=24
        )
        assert initial["skipped"] is False
        self._seed(base, "2222" * 4, tmp_path / "also-gone")
        repeat = maybe_auto_prune_checkpoints(
            checkpoint_base=base, min_interval_hours=24
        )
        assert repeat["skipped"] is True
        # The second orphan must still exist — skip was honoured.
        assert (base / ("2222" * 4)).exists()

    def test_corrupt_marker_treated_as_no_prior_run(self, tmp_path):
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        base = tmp_path / "checkpoints"
        base.mkdir()
        (base / ".last_prune").write_text("not-a-timestamp")
        self._seed(base, "3333" * 4, tmp_path / "gone")
        outcome = maybe_auto_prune_checkpoints(checkpoint_base=base)
        assert outcome["skipped"] is False
        assert outcome["result"]["deleted_orphan"] == 1

    def test_missing_base_no_raise(self, tmp_path):
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        outcome = maybe_auto_prune_checkpoints(
            checkpoint_base=tmp_path / "does-not-exist"
        )
        assert outcome["skipped"] is False
        assert outcome["result"]["scanned"] == 0

View File

@ -651,3 +651,204 @@ def format_checkpoint_list(checkpoints: List[Dict], directory: str) -> str:
lines.append(" /rollback diff <N> preview changes since checkpoint N")
lines.append(" /rollback <N> <file> restore a single file from checkpoint N")
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Auto-maintenance (issue #3015 follow-up)
# ---------------------------------------------------------------------------
#
# Every working directory the agent has ever touched gets its own shadow
# repo under CHECKPOINT_BASE. Per-repo ``_prune`` is a no-op (see comment
# in CheckpointManager._prune), so abandoned repos (deleted projects,
# one-off tmp dirs, long-stale work trees) accumulate forever. Field
# reports put the typical offender at 1000+ repos / ~12 GB on active
# contributor machines.
#
# ``prune_checkpoints`` sweeps CHECKPOINT_BASE at startup, deleting shadow
# repos that match either criterion:
# * orphan: the ``HERMES_WORKDIR`` path no longer exists on disk
# * stale: the repo's newest mtime is older than ``retention_days``
#
# ``maybe_auto_prune_checkpoints`` wraps it with an idempotency marker
# (``CHECKPOINT_BASE/.last_prune``) so calling it on every CLI/gateway
# startup is free after the first run of the day. Opt-in via
# ``checkpoints.auto_prune`` in config.yaml — default off so users who
# rely on ``/rollback`` against long-ago sessions never lose data
# silently.
# Timestamp marker file written under the checkpoint base dir; gates how
# often maybe_auto_prune_checkpoints actually sweeps.
_PRUNE_MARKER_NAME = ".last_prune"
def _read_workdir_marker(shadow_repo: Path) -> Optional[str]:
"""Read ``HERMES_WORKDIR`` from a shadow repo, or None if missing/unreadable."""
try:
return (shadow_repo / "HERMES_WORKDIR").read_text(encoding="utf-8").strip()
except (OSError, UnicodeDecodeError):
return None
def _shadow_repo_newest_mtime(shadow_repo: Path) -> float:
"""Return newest mtime across the shadow repo (walks objects/refs/HEAD).
We walk instead of trusting the directory mtime because git's pack
operations can leave the top-level dir untouched while refs/objects
inside get updated. Best-effort returns 0.0 on any error.
"""
newest = 0.0
try:
for p in shadow_repo.rglob("*"):
try:
m = p.stat().st_mtime
if m > newest:
newest = m
except OSError:
continue
except OSError:
pass
return newest
def prune_checkpoints(
    retention_days: int = 7,
    delete_orphans: bool = True,
    checkpoint_base: Optional[Path] = None,
) -> Dict[str, int]:
    """Sweep ``checkpoint_base`` and delete orphan/stale shadow repos.

    A shadow repo is removed when ``delete_orphans`` is on and its
    ``HERMES_WORKDIR`` path no longer exists on disk (the original
    project was deleted or moved), or when its newest in-repo mtime is
    older than ``retention_days`` days. Orphan detection wins when both
    apply.

    Returns counts ``{"scanned", "deleted_orphan", "deleted_stale",
    "errors", "bytes_freed"}``. Never raises — maintenance must never
    block interactive startup.
    """
    base = checkpoint_base or CHECKPOINT_BASE
    stats = {
        "scanned": 0,
        "deleted_orphan": 0,
        "deleted_stale": 0,
        "errors": 0,
        "bytes_freed": 0,
    }
    if not base.exists():
        return stats

    stale_cutoff = 0.0
    if retention_days > 0:
        import time as _time

        stale_cutoff = _time.time() - retention_days * 86400

    for repo in base.iterdir():
        # Candidates are directories carrying a HEAD file; the .last_prune
        # marker is a plain file and uninitialised dirs are left alone.
        if not repo.is_dir() or not (repo / "HEAD").exists():
            continue
        stats["scanned"] += 1

        reason = None
        if delete_orphans:
            workdir = _read_workdir_marker(repo)
            if workdir is None or not Path(workdir).exists():
                reason = "orphan"
        if reason is None and retention_days > 0:
            newest = _shadow_repo_newest_mtime(repo)
            # newest == 0.0 means "could not read any mtime" — keep it.
            if 0 < newest < stale_cutoff:
                reason = "stale"
        if reason is None:
            continue

        # Measure the size before deleting (best-effort).
        try:
            freed = sum(f.stat().st_size for f in repo.rglob("*") if f.is_file())
        except OSError:
            freed = 0
        try:
            shutil.rmtree(repo)
        except OSError as exc:
            stats["errors"] += 1
            logger.warning("Failed to prune checkpoint repo %s: %s", repo.name, exc)
        else:
            stats["bytes_freed"] += freed
            stats["deleted_orphan" if reason == "orphan" else "deleted_stale"] += 1
            logger.debug("Pruned %s checkpoint repo: %s (%d bytes)", reason, repo.name, freed)
    return stats
def maybe_auto_prune_checkpoints(
    retention_days: int = 7,
    min_interval_hours: int = 24,
    delete_orphans: bool = True,
    checkpoint_base: Optional[Path] = None,
) -> Dict[str, object]:
    """Rate-limited ``prune_checkpoints`` wrapper for startup hooks.

    A ``.last_prune`` timestamp file under the base directory makes the
    sweep run at most once per ``min_interval_hours``, regardless of how
    many CLI/gateway processes start. A corrupt marker is treated as "no
    prior run". Returns ``{"skipped": bool,
    "result": <prune_checkpoints dict>, "error": optional str}``.
    """
    import time as _time

    base = checkpoint_base or CHECKPOINT_BASE
    outcome: Dict[str, object] = {"skipped": False}
    try:
        if not base.exists():
            # Nothing to sweep — report zeroed counts without touching disk.
            outcome["result"] = {
                "scanned": 0, "deleted_orphan": 0, "deleted_stale": 0,
                "errors": 0, "bytes_freed": 0,
            }
            return outcome

        marker = base / _PRUNE_MARKER_NAME
        now = _time.time()
        if marker.exists():
            try:
                previous = float(marker.read_text(encoding="utf-8").strip())
            except (OSError, ValueError):
                previous = None  # corrupt marker — behave as if never run
            if previous is not None and now - previous < min_interval_hours * 3600:
                outcome["skipped"] = True
                return outcome

        stats = prune_checkpoints(
            retention_days=retention_days,
            delete_orphans=delete_orphans,
            checkpoint_base=base,
        )
        outcome["result"] = stats
        try:
            marker.write_text(str(now), encoding="utf-8")
        except OSError as exc:
            logger.debug("Could not write checkpoint prune marker: %s", exc)
        removed = stats["deleted_orphan"] + stats["deleted_stale"]
        if removed > 0:
            logger.info(
                "checkpoint auto-maintenance: pruned %d repo(s) "
                "(%d orphan, %d stale), reclaimed %.1f MB",
                removed,
                stats["deleted_orphan"],
                stats["deleted_stale"],
                stats["bytes_freed"] / (1024 * 1024),
            )
    except Exception as exc:
        logger.warning("checkpoint auto-maintenance failed: %s", exc)
        outcome["error"] = str(exc)
    return outcome

View File

@ -64,6 +64,16 @@ Checkpoints are enabled by default. Configure in `~/.hermes/config.yaml`:
checkpoints:
enabled: true # master switch (default: true)
max_snapshots: 50 # max checkpoints per directory
# Auto-maintenance (opt-in): sweep ~/.hermes/checkpoints/ at startup
# and delete shadow repos whose working directory no longer exists
# (orphans) or whose newest commit is older than retention_days.
# Runs at most once per min_interval_hours, tracked via a
# .last_prune marker inside ~/.hermes/checkpoints/.
auto_prune: false # default off — enable to reclaim disk
retention_days: 7
delete_orphans: true # delete repos whose workdir is gone
min_interval_hours: 24
```
To disable: