fix(gateway): cap memory flush retries at 3 to prevent infinite loop
The _session_expiry_watcher retried failed memory flushes forever because exceptions were caught at debug level without setting memory_flushed=True. Expired sessions with transient failures (rate limits, network errors) would retry every 5 minutes indefinitely, burning API quota and blocking gateway message processing via 429 rate limit cascades. Observed case: a March 19 session retried 28+ times over ~17 days, causing repeated 429 errors that made Telegram unresponsive. Add a per-session failure counter (_flush_failures) that gives up after 3 consecutive attempts and marks the session as flushed to break the loop.
This commit is contained in:
parent
507b63f86b
commit
4df2fca2f0
@ -1266,6 +1266,8 @@ class GatewayRunner:
|
||||
next message, so there's no blocking delay.
|
||||
"""
|
||||
await asyncio.sleep(60) # initial delay — let the gateway fully start
|
||||
_flush_failures: dict[str, int] = {} # session_id -> consecutive failure count
|
||||
_MAX_FLUSH_RETRIES = 3
|
||||
while self._running:
|
||||
try:
|
||||
self.session_store._ensure_loaded()
|
||||
@ -1298,8 +1300,25 @@ class GatewayRunner:
|
||||
"Pre-reset memory flush completed for session %s",
|
||||
entry.session_id,
|
||||
)
|
||||
_flush_failures.pop(entry.session_id, None)
|
||||
except Exception as e:
|
||||
logger.debug("Proactive memory flush failed for %s: %s", entry.session_id, e)
|
||||
failures = _flush_failures.get(entry.session_id, 0) + 1
|
||||
_flush_failures[entry.session_id] = failures
|
||||
if failures >= _MAX_FLUSH_RETRIES:
|
||||
logger.warning(
|
||||
"Proactive memory flush gave up after %d attempts for %s: %s. "
|
||||
"Marking as flushed to prevent infinite retry loop.",
|
||||
failures, entry.session_id, e,
|
||||
)
|
||||
with self.session_store._lock:
|
||||
entry.memory_flushed = True
|
||||
self.session_store._save()
|
||||
_flush_failures.pop(entry.session_id, None)
|
||||
else:
|
||||
logger.debug(
|
||||
"Proactive memory flush failed (%d/%d) for %s: %s",
|
||||
failures, _MAX_FLUSH_RETRIES, entry.session_id, e,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug("Session expiry watcher error: %s", e)
|
||||
# Sleep in small increments so we can stop quickly
|
||||
|
||||
Loading…
Reference in New Issue
Block a user