From 4df2fca2f03eb7561268a7ae415d9bc295d7d0e6 Mon Sep 17 00:00:00 2001 From: nibzard Date: Sun, 5 Apr 2026 12:41:45 +0000 Subject: [PATCH] fix(gateway): cap memory flush retries at 3 to prevent infinite loop The _session_expiry_watcher retried failed memory flushes forever because exceptions were caught at debug level without setting memory_flushed=True. Expired sessions with transient failures (rate limits, network errors) would retry every 5 minutes indefinitely, burning API quota and blocking gateway message processing via 429 rate limit cascades. Observed case: a March 19 session retried 28+ times over ~17 days, causing repeated 429 errors that made Telegram unresponsive. Add a per-session failure counter (_flush_failures) that gives up after 3 consecutive attempts and marks the session as flushed to break the loop. --- gateway/run.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/gateway/run.py b/gateway/run.py index 52bc9f7a..2b7ebe4e 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -1266,6 +1266,8 @@ class GatewayRunner: next message, so there's no blocking delay. """ await asyncio.sleep(60) # initial delay — let the gateway fully start + _flush_failures: dict[str, int] = {} # session_id -> consecutive failure count + _MAX_FLUSH_RETRIES = 3 while self._running: try: self.session_store._ensure_loaded() @@ -1298,8 +1300,25 @@ class GatewayRunner: "Pre-reset memory flush completed for session %s", entry.session_id, ) + _flush_failures.pop(entry.session_id, None) except Exception as e: - logger.debug("Proactive memory flush failed for %s: %s", entry.session_id, e) + failures = _flush_failures.get(entry.session_id, 0) + 1 + _flush_failures[entry.session_id] = failures + if failures >= _MAX_FLUSH_RETRIES: + logger.warning( + "Proactive memory flush gave up after %d attempts for %s: %s. " + "Marking as flushed to prevent infinite retry loop.", + failures, entry.session_id, e, + ) + with self.session_store._lock: + entry.memory_flushed = True + self.session_store._save() + _flush_failures.pop(entry.session_id, None) + else: + logger.debug( + "Proactive memory flush failed (%d/%d) for %s: %s", + failures, _MAX_FLUSH_RETRIES, entry.session_id, e, + ) except Exception as e: logger.debug("Session expiry watcher error: %s", e) # Sleep in small increments so we can stop quickly