From b00f478b6e487aed90d8b36753a29550fb7bb63a Mon Sep 17 00:00:00 2001 From: rabbitblood Date: Wed, 15 Apr 2026 03:13:41 -0700 Subject: [PATCH] fix(scheduler): independent heartbeat pulse so liveness doesn't false-stale during long fires (#140) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The #95 scheduler heartbeat scheme relied on: 1. Top of tick() (once per poll interval) 2. Per-fire goroutine entry + exit That leaves a gap: tick() ends with wg.Wait(), so if a single fire takes longer than pollInterval (UIUX audits routinely take 60-120s; max fireTimeout is 5min), the next tick doesn't run and no top-of-tick heartbeat fires. Per-fire heartbeats only bracket the fire — between entry and the HTTP response returning, nothing heartbeats either. Observed today: /admin/liveness reports seconds_ago=251 while docker logs show the scheduler actively firing 'Hourly ecosystem watch'. Scheduler is fine; liveness is lying. Adds an independent 10s heartbeat pulse goroutine inside Start(), decoupled from tick completion. The existing heartbeats at tick top + per-fire are kept as redundant signals but this pulse is the one that guarantees liveness freshness regardless of what tick is doing. Ships the exact fix proposed in #140 body. Closes #140. --- platform/internal/scheduler/scheduler.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/platform/internal/scheduler/scheduler.go b/platform/internal/scheduler/scheduler.go index 83b538ea..4ca8e5f7 100644 --- a/platform/internal/scheduler/scheduler.go +++ b/platform/internal/scheduler/scheduler.go @@ -113,6 +113,25 @@ func (s *Scheduler) Start(ctx context.Context) { s.lastTickAt = time.Now() s.mu.Unlock() + // Independent heartbeat pulse (#140). Decoupled from tick completion so + // a single long fire (UIUX audits routinely take 60-120s; max fireTimeout + // is 5min) can't make /admin/liveness look stale for the whole fire window. 
+ // tick() also calls Heartbeat at its top + each fire goroutine calls it on + // entry/exit — those are kept as redundant signals but this pulse is the + // one that guarantees liveness freshness regardless of tick state. + go func() { + pulseTicker := time.NewTicker(10 * time.Second) + defer pulseTicker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-pulseTicker.C: + supervised.Heartbeat("scheduler") + } + } + }() + for { select { case <-ctx.Done():