package pendinguploads_test import ( "context" "database/sql" "errors" "sync/atomic" "testing" "time" "github.com/google/uuid" "github.com/Molecule-AI/molecule-monorepo/platform/internal/metrics" "github.com/Molecule-AI/molecule-monorepo/platform/internal/pendinguploads" ) // fakeSweepStorage is a minimal Storage that records every Sweep call // and lets each test inject the per-cycle return values. The other // methods are no-ops — the sweeper goroutine never calls them. type fakeSweepStorage struct { calls atomic.Int64 results []pendinguploads.SweepResult errs []error cycleDone chan struct{} // closed after each Sweep call (test sync) gotRetention atomic.Int64 // last ackRetention seen, in seconds } func newFakeSweepStorage(results []pendinguploads.SweepResult, errs []error) *fakeSweepStorage { return &fakeSweepStorage{ results: results, errs: errs, cycleDone: make(chan struct{}, 16), } } func (f *fakeSweepStorage) Put(_ context.Context, _ uuid.UUID, _ []byte, _, _ string) (uuid.UUID, error) { return uuid.Nil, errors.New("not used") } func (f *fakeSweepStorage) Get(_ context.Context, _ uuid.UUID) (pendinguploads.Record, error) { return pendinguploads.Record{}, errors.New("not used") } func (f *fakeSweepStorage) MarkFetched(_ context.Context, _ uuid.UUID) error { return errors.New("not used") } func (f *fakeSweepStorage) Ack(_ context.Context, _ uuid.UUID) error { return errors.New("not used") } func (f *fakeSweepStorage) PutBatch(_ context.Context, _ uuid.UUID, _ []pendinguploads.PutItem) ([]uuid.UUID, error) { return nil, errors.New("not used") } func (f *fakeSweepStorage) PutBatchTx(_ context.Context, _ *sql.Tx, _ uuid.UUID, _ []pendinguploads.PutItem) ([]uuid.UUID, error) { return nil, errors.New("not used") } func (f *fakeSweepStorage) Sweep(_ context.Context, ackRetention time.Duration) (pendinguploads.SweepResult, error) { idx := int(f.calls.Load()) f.calls.Add(1) f.gotRetention.Store(int64(ackRetention.Seconds())) defer func() { select { case f.cycleDone <- struct{}{}: default: } }() if idx < len(f.errs) && f.errs[idx] != nil { return pendinguploads.SweepResult{}, f.errs[idx] } if idx < len(f.results) { return f.results[idx], nil } return pendinguploads.SweepResult{}, nil } // waitForCycle blocks until at least one Sweep completes, with a deadline. // Tests use this instead of time.Sleep to avoid flakes on slow CI hosts. // // CAVEAT: cycleDone fires from inside fakeSweepStorage.Sweep's defer, // which runs as Sweep returns its result — BEFORE the StartSweeper // loop has processed the (result, error) tuple and called the // metric recorders. Tests that assert on metric counters must NOT // rely on this wait alone; use waitForMetricDelta instead so the // metric increment race (Sweep returns → cycleDone fires → test // reads counter → only then does StartSweeper's loop call // metrics.PendingUploadsSweepError) doesn't produce a flake. func (f *fakeSweepStorage) waitForCycle(t *testing.T, n int, timeout time.Duration) { t.Helper() deadline := time.NewTimer(timeout) defer deadline.Stop() for got := 0; got < n; got++ { select { case <-f.cycleDone: case <-deadline.C: t.Fatalf("waited %s for %d sweep cycles, got %d", timeout, n, f.calls.Load()) } } } // waitForMetricDelta polls the supplied delta function until it returns // `want` or the timeout elapses. Use after waitForCycle when the test // asserts on a metric counter — closes the race between cycleDone // (signalled inside fakeSweepStorage.Sweep's defer, BEFORE Sweep // returns to StartSweeper) and the metric recording (which happens in // StartSweeper's loop AFTER Sweep returns). On a slow CI host the test // goroutine wins the read before StartSweeper's goroutine writes the // counter; the polling assert preserves the determinism of "the metric // MUST be N" without timing-based flakes. // // Per memory feedback_question_test_when_unexpected.md: the failure // mode "delta=0, want=1" looked like a real bug at first glance — // "metric never incremented" — but instrumented analysis showed the // metric DID increment, just AFTER the test's read. The fix is the // test's wait shape, not the production code. func waitForMetricDelta(t *testing.T, delta func() int64, want int64, timeout time.Duration) { t.Helper() deadline := time.Now().Add(timeout) for time.Now().Before(deadline) { if delta() == want { return } time.Sleep(5 * time.Millisecond) } t.Fatalf("waited %s for metric delta=%d, last seen %d", timeout, want, delta()) } func TestStartSweeper_NilStorageDoesNotPanic(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() // Should return immediately without panicking; no goroutine to wait on. pendinguploads.StartSweeper(ctx, nil, time.Second) } func TestStartSweeper_RunsImmediatelyAndOnTick(t *testing.T) { store := newFakeSweepStorage( []pendinguploads.SweepResult{{Acked: 5}, {Acked: 1, Expired: 2}}, nil, ) ctx, cancel := context.WithCancel(context.Background()) defer cancel() done := pendinguploads.StartSweeperForTest(ctx, store, time.Hour) store.waitForCycle(t, 1, 2*time.Second) if got := store.calls.Load(); got < 1 { t.Errorf("expected at least one immediate sweep, got %d", got) } // Retention propagated. if store.gotRetention.Load() != 3600 { t.Errorf("retention seconds = %d, want 3600", store.gotRetention.Load()) } // #86 fix: ensure goroutine has exited before the next test's // metricDelta() baseline capture. cancel() <-done } func TestStartSweeper_ZeroAckRetentionUsesDefault(t *testing.T) { store := newFakeSweepStorage([]pendinguploads.SweepResult{{}}, nil) ctx, cancel := context.WithCancel(context.Background()) defer cancel() done := pendinguploads.StartSweeperForTest(ctx, store, 0) store.waitForCycle(t, 1, 2*time.Second) want := int64(pendinguploads.DefaultAckRetention.Seconds()) if store.gotRetention.Load() != want { t.Errorf("retention = %d, want default %d", store.gotRetention.Load(), want) } // #86 fix. cancel() <-done } func TestStartSweeper_ContextCancelStopsLoop(t *testing.T) { store := newFakeSweepStorage([]pendinguploads.SweepResult{{}}, nil) ctx, cancel := context.WithCancel(context.Background()) done := pendinguploads.StartSweeperForTest(ctx, store, time.Second) store.waitForCycle(t, 1, 2*time.Second) cancel() select { case <-done: case <-time.After(2 * time.Second): t.Fatal("StartSweeper did not return after ctx cancel") } } func TestStartSweeperWithInterval_TickerFiresAdditionalCycles(t *testing.T) { store := newFakeSweepStorage( []pendinguploads.SweepResult{{Acked: 1}, {Expired: 1}, {}, {}, {}}, nil, ) ctx, cancel := context.WithCancel(context.Background()) defer cancel() // Use a short ticker interval (100ms) so the test runs fast without // burning real wall-clock time. StartSweeperWithIntervalForTest is the // test-friendly variant that accepts a caller-specified interval; the // production SweepInterval of 5m is too coarse for a 2s deadline on // a loaded CI runner (the ticker may not fire at all under CPU // contention — the root cause of the pre-existing CI flake). done := make(chan struct{}) go pendinguploads.StartSweeperWithIntervalForTest(ctx, store, time.Hour, 100*time.Millisecond, done) // Immediate cycle + at least one tick-driven cycle. store.waitForCycle(t, 2, 2*time.Second) if got := store.calls.Load(); got < 2 { t.Errorf("expected ≥2 cycles (immediate + 1 tick), got %d", got) } // #86 fix: drain the done channel so the goroutine is fully gone // before the next test's metricDelta() baseline capture. cancel() <-done } func TestStartSweeper_TransientErrorDoesNotCrashLoop(t *testing.T) { // First call errors; second call succeeds. The loop must keep running // across the error so a one-off DB hiccup doesn't disable the GC. store := newFakeSweepStorage( []pendinguploads.SweepResult{{}, {Acked: 1}}, []error{errors.New("transient db error"), nil}, ) ctx, cancel := context.WithCancel(context.Background()) defer cancel() // Capture metric baseline so we can wait for the error counter to // settle before returning — otherwise this test's leaked metric // write races with the next test's metricDelta() baseline read and // causes a non-deterministic +1 leak (manifests as // TestStartSweeper_RecordsMetricsOnSuccess: "error counter delta=1, // want 0"). cycleDone fires inside the fake's Sweep defer, BEFORE // sweepOnce records the error metric — so cancel() right after // waitForCycle is too early. _, _, deltaError := metricDelta(t) done := pendinguploads.StartSweeperForTest(ctx, store, time.Hour) // Wait for the first (errored) cycle. store.waitForCycle(t, 1, 2*time.Second) // Wait for the goroutine to record the error metric. After this // returns, sweepOnce has fully completed and a subsequent cancel() // stops the loop on the next select pass with no in-flight metric // writes outstanding. waitForMetricDelta(t, deltaError, 1, 2*time.Second) // Cancel and wait for the goroutine to fully exit (#86 fix). // Without the done-channel wait, the goroutine races with the next // test's metricDelta() baseline capture — the next test may see // error=1 from this test still "in flight", throwing off its // deltaError assertion. cancel() <-done } // metricDelta returns a function that, when called, returns how much // the (acked, expired, errored) counters have advanced since metricDelta // was originally called. metrics is a process-singleton across the test // suite; deltas isolate this test from order-of-execution dependencies. func metricDelta(t *testing.T) (deltaAcked, deltaExpired, deltaError func() int64) { t.Helper() a0, e0, err0 := metrics.PendingUploadsSweepCounts() deltaAcked = func() int64 { a, _, _ := metrics.PendingUploadsSweepCounts() return a - a0 } deltaExpired = func() int64 { _, e, _ := metrics.PendingUploadsSweepCounts() return e - e0 } deltaError = func() int64 { _, _, x := metrics.PendingUploadsSweepCounts() return x - err0 } return } func TestStartSweeper_RecordsMetricsOnSuccess(t *testing.T) { deltaAcked, deltaExpired, deltaError := metricDelta(t) store := newFakeSweepStorage( []pendinguploads.SweepResult{{Acked: 3, Expired: 5}}, nil, ) ctx, cancel := context.WithCancel(context.Background()) defer cancel() done := pendinguploads.StartSweeperForTest(ctx, store, time.Hour) store.waitForCycle(t, 1, 2*time.Second) // Poll for the success counters to settle — closes the cycleDone- // vs-metric-record race (see waitForMetricDelta comment). waitForMetricDelta(t, deltaAcked, 3, 2*time.Second) waitForMetricDelta(t, deltaExpired, 5, 2*time.Second) // Also poll error counter to 0 — with the race detector, a // concurrent sweeper goroutine from a prior test can still be // running and incrementing pendingUploadsSweepErrors after // metricDelta() captures its baseline. Polling to 0 (matching // TestStartSweeper_RecordsMetricsOnError's pattern for the error // path) ensures the error counter has settled before we assert. // Without this, on a slow race-detector host the error counter // from the previous test's sweeper is still in flight when we // read it, producing deltaError=1 instead of 0. waitForMetricDelta(t, deltaError, 0, 2*time.Second) if got := deltaError(); got != 0 { t.Errorf("error counter delta = %d, want 0", got) } // #86 fix: drain the done channel so the goroutine is fully gone // before the next test's metricDelta() baseline capture. Without this // the previous test's goroutine could still be mid-Sweep (blocked on // the fake's results channel) and its eventual return would mutate // the shared error/acked counters after the next test has already // snapshot its baseline. cancel() <-done } func TestStartSweeper_RecordsMetricsOnError(t *testing.T) { _, _, deltaError := metricDelta(t) store := newFakeSweepStorage( []pendinguploads.SweepResult{{}}, []error{errors.New("db down")}, ) ctx, cancel := context.WithCancel(context.Background()) defer cancel() done := pendinguploads.StartSweeperForTest(ctx, store, time.Hour) store.waitForCycle(t, 1, 2*time.Second) // Poll for the error counter to settle — cycleDone fires inside // the fake's Sweep defer, BEFORE StartSweeper's loop receives the // returned error and calls metrics.PendingUploadsSweepError. On // slow CI hosts a direct deltaError() read here returns 0 even // though the metric WILL be 1 a few ms later. See // waitForMetricDelta comment. waitForMetricDelta(t, deltaError, 1, 2*time.Second) // #86 fix: ensure the goroutine has fully exited before the next // test's metricDelta() baseline capture. cancel() <-done }