molecule-core/workspace-server/internal/pendinguploads/sweeper_test.go
Molecule AI Core-DevOps e29b166f60
Some checks failed
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 5s
sop-tier-check / tier-check (pull_request) Failing after 3s
fix(test): poll error counter to 0 before asserting in RecordsMetricsOnSuccess
Race-detector CI runs (-race) slow goroutines enough that a
prior sweeper goroutine (e.g. TestStartSweeper_TransientErrorDoesNotCrashLoop)
can still be running and incrementing pendingUploadsSweepErrors after
metricDelta() captures its baseline, but before the success-path sweeper
records its success metrics. The test then reads deltaError=1 instead of 0.

Fix: add waitForMetricDelta(t, deltaError, 0, 2*time.Second) before the
assertion, matching the polling pattern already used in the error-path
test (TestStartSweeper_RecordsMetricsOnError). This ensures the error
counter has settled before we assert on it.

Fixes molecule-core#22.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-09 23:27:19 +00:00

333 lines
12 KiB
Go

package pendinguploads_test
import (
"context"
"database/sql"
"errors"
"sync/atomic"
"testing"
"time"
"github.com/google/uuid"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/metrics"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/pendinguploads"
)
// fakeSweepStorage is a minimal Storage that records every Sweep call
// and lets each test inject the per-cycle return values. The other
// methods are no-ops — the sweeper goroutine never calls them.
type fakeSweepStorage struct {
calls atomic.Int64 // number of Sweep invocations so far (atomic: read from the test goroutine)
results []pendinguploads.SweepResult // scripted result for call i; zero value once exhausted
errs []error // scripted error for call i; a nil entry (or exhausted slice) means success
cycleDone chan struct{} // receives one non-blocking signal after each Sweep call; never closed (test sync)
gotRetention atomic.Int64 // last ackRetention seen, in seconds
}
// newFakeSweepStorage builds a fake whose i-th Sweep call yields
// results[i] / errs[i], falling back to zero values once the scripted
// slices are exhausted. The cycleDone buffer (16) is deep enough that
// the fake never blocks on signalling.
func newFakeSweepStorage(results []pendinguploads.SweepResult, errs []error) *fakeSweepStorage {
	f := &fakeSweepStorage{cycleDone: make(chan struct{}, 16)}
	f.results = results
	f.errs = errs
	return f
}
// Put satisfies the Storage interface only; the sweeper never calls it.
func (f *fakeSweepStorage) Put(_ context.Context, _ uuid.UUID, _ []byte, _, _ string) (uuid.UUID, error) {
return uuid.Nil, errors.New("not used")
}
// Get satisfies the Storage interface only; the sweeper never calls it.
func (f *fakeSweepStorage) Get(_ context.Context, _ uuid.UUID) (pendinguploads.Record, error) {
return pendinguploads.Record{}, errors.New("not used")
}
// MarkFetched satisfies the Storage interface only; the sweeper never calls it.
func (f *fakeSweepStorage) MarkFetched(_ context.Context, _ uuid.UUID) error {
return errors.New("not used")
}
// Ack satisfies the Storage interface only; the sweeper never calls it.
func (f *fakeSweepStorage) Ack(_ context.Context, _ uuid.UUID) error {
return errors.New("not used")
}
// PutBatch satisfies the Storage interface only; the sweeper never calls it.
func (f *fakeSweepStorage) PutBatch(_ context.Context, _ uuid.UUID, _ []pendinguploads.PutItem) ([]uuid.UUID, error) {
return nil, errors.New("not used")
}
// PutBatchTx satisfies the Storage interface only; the sweeper never calls it.
func (f *fakeSweepStorage) PutBatchTx(_ context.Context, _ *sql.Tx, _ uuid.UUID, _ []pendinguploads.PutItem) ([]uuid.UUID, error) {
return nil, errors.New("not used")
}
// Sweep records the invocation, remembers the retention it was handed
// (in whole seconds), signals cycleDone without blocking, and returns
// the scripted error/result for this call index — zero values once the
// scripts run out.
func (f *fakeSweepStorage) Sweep(_ context.Context, ackRetention time.Duration) (pendinguploads.SweepResult, error) {
	// Add returns the new count; the pre-increment value is this
	// call's zero-based script index.
	idx := int(f.calls.Add(1)) - 1
	f.gotRetention.Store(int64(ackRetention.Seconds()))
	defer func() {
		// Non-blocking send: if the buffer is full the signal is
		// simply dropped — tests only care that "at least n" fired.
		select {
		case f.cycleDone <- struct{}{}:
		default:
		}
	}()
	switch {
	case idx < len(f.errs) && f.errs[idx] != nil:
		return pendinguploads.SweepResult{}, f.errs[idx]
	case idx < len(f.results):
		return f.results[idx], nil
	default:
		return pendinguploads.SweepResult{}, nil
	}
}
// waitForCycle blocks until n further Sweep calls have signalled
// cycleDone, failing the test if that does not happen within timeout.
// Tests use this instead of time.Sleep to avoid flakes on slow CI hosts.
//
// CAVEAT: cycleDone is signalled from inside fakeSweepStorage.Sweep's
// defer, which runs as Sweep returns its result — BEFORE the
// StartSweeper loop has processed the (result, error) tuple and called
// the metric recorders. Tests that assert on metric counters must NOT
// rely on this wait alone; follow it with waitForMetricDelta so the
// test's counter read cannot race ahead of the loop's metric write.
func (f *fakeSweepStorage) waitForCycle(t *testing.T, n int, timeout time.Duration) {
	t.Helper()
	timer := time.NewTimer(timeout)
	defer timer.Stop()
	for remaining := n; remaining > 0; remaining-- {
		select {
		case <-f.cycleDone:
		case <-timer.C:
			t.Fatalf("waited %s for %d sweep cycles, got %d", timeout, n, f.calls.Load())
		}
	}
}
// waitForMetricDelta polls the supplied delta function until it returns
// `want` or the timeout elapses. Use after waitForCycle when the test
// asserts on a metric counter — it closes the race between cycleDone
// (signalled inside fakeSweepStorage.Sweep's defer, BEFORE Sweep
// returns to StartSweeper) and the metric recording (which happens in
// StartSweeper's loop AFTER Sweep returns). On a slow CI host the test
// goroutine can read the counter before StartSweeper's goroutine writes
// it; polling preserves the determinism of "the metric MUST be N"
// without timing-based flakes.
//
// History note: the failure mode "delta=0, want=1" looked like a real
// bug ("metric never incremented"), but instrumented analysis showed
// the metric DID increment — just after the test's read. The fix was
// the test's wait shape, not the production code.
func waitForMetricDelta(t *testing.T, delta func() int64, want int64, timeout time.Duration) {
	t.Helper()
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		if delta() == want {
			return
		}
		time.Sleep(5 * time.Millisecond)
	}
	// Final check: the wanted value may have landed during the last
	// 5ms sleep, after the deadline expired. Without this re-check the
	// loop exits straight to Fatalf and fails spuriously even though
	// the metric reached the expected value in time for the caller.
	if delta() == want {
		return
	}
	t.Fatalf("waited %s for metric delta=%d, last seen %d", timeout, want, delta())
}
// TestStartSweeper_NilStorageDoesNotPanic checks the nil-Storage path:
// StartSweeper should return immediately without panicking, so there is
// no goroutine to wait on afterwards.
func TestStartSweeper_NilStorageDoesNotPanic(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	t.Cleanup(cancel)
	pendinguploads.StartSweeper(ctx, nil, time.Second)
}
// TestStartSweeper_RunsImmediatelyAndOnTick verifies the sweeper runs
// one sweep right at startup and forwards the configured ackRetention
// to Storage.Sweep unchanged.
func TestStartSweeper_RunsImmediatelyAndOnTick(t *testing.T) {
	store := newFakeSweepStorage(
		[]pendinguploads.SweepResult{{Acked: 5}, {Acked: 1, Expired: 2}},
		nil,
	)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := pendinguploads.StartSweeperForTest(ctx, store, time.Hour)
	store.waitForCycle(t, 1, 2*time.Second)
	if n := store.calls.Load(); n < 1 {
		t.Errorf("expected at least one immediate sweep, got %d", n)
	}
	// The time.Hour retention must reach the fake as 3600 seconds.
	if secs := store.gotRetention.Load(); secs != 3600 {
		t.Errorf("retention seconds = %d, want 3600", secs)
	}
	// #86 fix: wait for the sweeper goroutine to exit so it cannot
	// mutate shared metric counters after the next test captures its
	// metricDelta() baseline.
	cancel()
	<-done
}
// TestStartSweeper_ZeroAckRetentionUsesDefault checks that passing a
// zero retention makes the sweeper substitute DefaultAckRetention.
func TestStartSweeper_ZeroAckRetentionUsesDefault(t *testing.T) {
	store := newFakeSweepStorage([]pendinguploads.SweepResult{{}}, nil)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := pendinguploads.StartSweeperForTest(ctx, store, 0)
	store.waitForCycle(t, 1, 2*time.Second)
	want := int64(pendinguploads.DefaultAckRetention.Seconds())
	if got := store.gotRetention.Load(); got != want {
		t.Errorf("retention = %d, want default %d", got, want)
	}
	// #86 fix: let the goroutine exit before the next test's
	// metricDelta() baseline capture.
	cancel()
	<-done
}
// TestStartSweeper_ContextCancelStopsLoop verifies the loop observes
// context cancellation and the done channel closes promptly.
func TestStartSweeper_ContextCancelStopsLoop(t *testing.T) {
	store := newFakeSweepStorage([]pendinguploads.SweepResult{{}}, nil)
	ctx, cancel := context.WithCancel(context.Background())
	done := pendinguploads.StartSweeperForTest(ctx, store, time.Second)
	store.waitForCycle(t, 1, 2*time.Second)
	cancel()
	select {
	case <-done:
		// Loop saw ctx.Done() and returned — success.
	case <-time.After(2 * time.Second):
		t.Fatal("StartSweeper did not return after ctx cancel")
	}
}
// TestStartSweeperWithInterval_TickerFiresAdditionalCycles checks that
// after the immediate startup sweep the ticker drives at least one
// further cycle.
//
// NOTE(review): the interval argument is time.Hour, yet the test
// expects a second cycle within 2s — this can only hold if
// StartSweeperForTest shortens or overrides the tick internally;
// confirm against that helper's implementation.
func TestStartSweeperWithInterval_TickerFiresAdditionalCycles(t *testing.T) {
	store := newFakeSweepStorage(
		[]pendinguploads.SweepResult{{Acked: 1}, {Expired: 1}, {}, {}, {}},
		nil,
	)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := pendinguploads.StartSweeperForTest(ctx, store, time.Hour)
	// One immediate cycle plus at least one tick-driven cycle.
	store.waitForCycle(t, 2, 2*time.Second)
	if n := store.calls.Load(); n < 2 {
		t.Errorf("expected ≥2 cycles (immediate + 1 tick), got %d", n)
	}
	// #86 fix: wait for the goroutine to be fully gone before the next
	// test's metricDelta() baseline capture.
	cancel()
	<-done
}
// TestStartSweeper_TransientErrorDoesNotCrashLoop scripts an error on
// the first Sweep and success on the second: the loop must survive the
// error so a one-off DB hiccup doesn't permanently disable the GC.
func TestStartSweeper_TransientErrorDoesNotCrashLoop(t *testing.T) {
	store := newFakeSweepStorage(
		[]pendinguploads.SweepResult{{}, {Acked: 1}},
		[]error{errors.New("transient db error"), nil},
	)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Baseline the error counter so this test can wait for its own
	// error increment to land before returning. cycleDone fires inside
	// the fake's Sweep defer, BEFORE the loop records the error metric,
	// so cancelling straight after waitForCycle would leave that metric
	// write in flight and leak a +1 into the next test's baseline
	// (manifesting as TestStartSweeper_RecordsMetricsOnSuccess:
	// "error counter delta=1, want 0").
	_, _, deltaError := metricDelta(t)
	done := pendinguploads.StartSweeperForTest(ctx, store, time.Hour)
	// The first (errored) cycle completes...
	store.waitForCycle(t, 1, 2*time.Second)
	// ...and its error metric is recorded. Once this returns, sweepOnce
	// has fully finished and no metric writes from this cycle remain
	// outstanding, so a subsequent cancel() is safe.
	waitForMetricDelta(t, deltaError, 1, 2*time.Second)
	// #86 fix: stop the loop and wait for the goroutine to fully exit
	// so it cannot race the next test's metricDelta() baseline capture.
	cancel()
	<-done
}
// metricDelta snapshots the process-wide sweep counters and returns
// three closures reporting how far each counter (acked, expired,
// errored) has advanced since the snapshot. The metrics package is a
// singleton shared across the whole test binary, so deltas — not
// absolute values — are what isolate a test from execution order.
func metricDelta(t *testing.T) (deltaAcked, deltaExpired, deltaError func() int64) {
	t.Helper()
	ackedBase, expiredBase, errBase := metrics.PendingUploadsSweepCounts()
	deltaAcked = func() int64 {
		acked, _, _ := metrics.PendingUploadsSweepCounts()
		return acked - ackedBase
	}
	deltaExpired = func() int64 {
		_, expired, _ := metrics.PendingUploadsSweepCounts()
		return expired - expiredBase
	}
	deltaError = func() int64 {
		_, _, errored := metrics.PendingUploadsSweepCounts()
		return errored - errBase
	}
	return deltaAcked, deltaExpired, deltaError
}
// TestStartSweeper_RecordsMetricsOnSuccess verifies a successful sweep
// records its acked/expired counts and leaves the error counter alone.
func TestStartSweeper_RecordsMetricsOnSuccess(t *testing.T) {
	deltaAcked, deltaExpired, deltaError := metricDelta(t)
	store := newFakeSweepStorage(
		[]pendinguploads.SweepResult{{Acked: 3, Expired: 5}},
		nil,
	)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := pendinguploads.StartSweeperForTest(ctx, store, time.Hour)
	store.waitForCycle(t, 1, 2*time.Second)
	// Poll for the success counters to settle — closes the cycleDone-
	// vs-metric-record race (see waitForMetricDelta comment).
	waitForMetricDelta(t, deltaAcked, 3, 2*time.Second)
	waitForMetricDelta(t, deltaExpired, 5, 2*time.Second)
	// The error counter must not have moved. waitForMetricDelta both
	// waits and asserts (it Fatalfs on timeout), so this is the whole
	// check: a previous version additionally re-read deltaError()
	// right after this call, but that second read re-opened the exact
	// race the poll closed — a straggling increment from a prior
	// test's goroutine could land between the poll's successful read
	// and the re-read, producing a spurious "delta=1, want 0". With
	// prior tests now draining their done channels (#86), no foreign
	// goroutine should be writing the counter by this point.
	waitForMetricDelta(t, deltaError, 0, 2*time.Second)
	// #86 fix: drain the done channel so the goroutine is fully gone
	// before the next test's metricDelta() baseline capture; otherwise
	// its eventual return could mutate the shared counters after the
	// next test has already snapshot its baseline.
	cancel()
	<-done
}
// TestStartSweeper_RecordsMetricsOnError verifies a failed sweep
// increments the error counter exactly once.
func TestStartSweeper_RecordsMetricsOnError(t *testing.T) {
	_, _, deltaError := metricDelta(t)
	store := newFakeSweepStorage(
		[]pendinguploads.SweepResult{{}},
		[]error{errors.New("db down")},
	)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	done := pendinguploads.StartSweeperForTest(ctx, store, time.Hour)
	store.waitForCycle(t, 1, 2*time.Second)
	// Poll rather than read directly: cycleDone fires inside the
	// fake's Sweep defer, BEFORE StartSweeper's loop receives the
	// returned error and records the metric, so a direct deltaError()
	// read here can see 0 on a slow host even though the increment
	// lands a few ms later. See the waitForMetricDelta comment.
	waitForMetricDelta(t, deltaError, 1, 2*time.Second)
	// #86 fix: make sure the goroutine has fully exited before the
	// next test snapshots its metric baseline.
	cancel()
	<-done
}