Three bugs caused enabled schedules to silently disappear from the fire query
(which requires next_run_at IS NOT NULL AND next_run_at <= now()):
Bug 1 - fireSchedule() and recordSkipped(): when ComputeNextRun returned an
error, nextRunPtr stayed nil and UPDATE SET next_run_at = $2 wrote NULL.
Fix: change to COALESCE($2, next_run_at) so the existing DB value is preserved
when $2 is NULL, and log the error explicitly.
Bug 2 - org importer (handlers/org.go): nextRun, _ := ComputeNextRun(...)
silently discarded the error. A bad cron expression would pass time.Time{}
(zero value) to the INSERT. Fix: surface the error, log it, and skip the
schedule INSERT via continue.
Bug 3 - no startup repair: schedules already NULL'd by the pre-fix binary
would never recover. Fix: Start() now calls repairNullNextRunAt() once on
boot, recomputing next_run_at for every enabled schedule with a NULL value.
Tests: TestFireSchedule_ComputeNextRunError, TestRecordSkipped_ComputeNextRunError,
TestRepairNullNextRunAt_RepairsRows, TestRepairNullNextRunAt_DBError_NoPanic,
TestOrgImport_ScheduleComputeError (all pass).
Fixes #722
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
408 lines
15 KiB
Go
408 lines
15 KiB
Go
package scheduler
|
||
|
||
import (
|
||
"context"
|
||
"database/sql"
|
||
"testing"
|
||
"time"
|
||
|
||
sqlmock "github.com/DATA-DOG/go-sqlmock"
|
||
|
||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||
)
|
||
|
||
// errDBDown is a sentinel error used by tests to simulate a DB connection failure.
|
||
var errDBDown = sql.ErrConnDone
|
||
|
||
// setupTestDB replaces the global db.DB with a sqlmock and returns the mock
|
||
// handle. The real DB is restored (by closing the mock conn) via t.Cleanup.
|
||
func setupTestDB(t *testing.T) sqlmock.Sqlmock {
|
||
t.Helper()
|
||
mockDB, mock, err := sqlmock.New()
|
||
if err != nil {
|
||
t.Fatalf("failed to create sqlmock: %v", err)
|
||
}
|
||
db.DB = mockDB
|
||
t.Cleanup(func() { mockDB.Close() })
|
||
return mock
|
||
}
|
||
|
||
// panicProxy is a test double whose ProxyA2ARequest always panics.
|
||
// Used by TestPanicRecovery to verify the scheduler's goroutine-level
|
||
// panic recovery in fireSchedule.
|
||
type panicProxy struct{}
|
||
|
||
func (p *panicProxy) ProxyA2ARequest(
|
||
_ context.Context, _ string, _ []byte, _ string, _ bool,
|
||
) (int, []byte, error) {
|
||
panic("simulated A2A proxy panic")
|
||
}
|
||
|
||
// ── TestLastTickAt_zero ───────────────────────────────────────────────────────
|
||
|
||
// TestLastTickAt_zero confirms that LastTickAt returns a zero time.Time on a
|
||
// freshly-created scheduler that has never been started or ticked.
|
||
func TestLastTickAt_zero(t *testing.T) {
|
||
s := New(nil, nil)
|
||
if got := s.LastTickAt(); !got.IsZero() {
|
||
t.Errorf("LastTickAt() before any tick = %v, want zero time.Time", got)
|
||
}
|
||
}
|
||
|
||
// ── TestHealthy_beforeStart ───────────────────────────────────────────────────
|
||
|
||
// TestHealthy_beforeStart confirms that Healthy returns false when lastTickAt
|
||
// is zero (scheduler created but never ticked).
|
||
func TestHealthy_beforeStart(t *testing.T) {
|
||
s := New(nil, nil)
|
||
if s.Healthy() {
|
||
t.Error("Healthy() = true on a scheduler that has never ticked, want false")
|
||
}
|
||
}
|
||
|
||
// ── TestHealthy_freshTick ─────────────────────────────────────────────────────
|
||
|
||
// TestHealthy_freshTick sets lastTickAt to the current time — mirroring what
|
||
// tick() does after a completed poll cycle — and confirms Healthy returns true.
|
||
func TestHealthy_freshTick(t *testing.T) {
|
||
s := New(nil, nil)
|
||
|
||
// Simulate what tick() writes after wg.Wait() returns.
|
||
s.mu.Lock()
|
||
s.lastTickAt = time.Now()
|
||
s.mu.Unlock()
|
||
|
||
if !s.Healthy() {
|
||
t.Error("Healthy() = false immediately after a fresh tick timestamp, want true")
|
||
}
|
||
}
|
||
|
||
// ── TestHealthy_stale ─────────────────────────────────────────────────────────
|
||
|
||
// TestHealthy_stale backdates lastTickAt by 3×pollInterval (well beyond the
|
||
// 2×pollInterval liveness window) and confirms Healthy returns false.
|
||
func TestHealthy_stale(t *testing.T) {
|
||
s := New(nil, nil)
|
||
|
||
s.mu.Lock()
|
||
s.lastTickAt = time.Now().Add(-3 * pollInterval) // 90 s ago; threshold is 60 s
|
||
s.mu.Unlock()
|
||
|
||
if s.Healthy() {
|
||
t.Errorf("Healthy() = true when lastTickAt is 3×pollInterval (%s) ago, want false",
|
||
3*pollInterval)
|
||
}
|
||
}
|
||
|
||
// ── TestComputeNextRun_valid ──────────────────────────────────────────────────
|
||
|
||
// TestComputeNextRun_valid checks that "0 * * * *" (top-of-hour) returns a
|
||
// future time whose Minute() == 0 when the reference is mid-hour.
|
||
func TestComputeNextRun_valid(t *testing.T) {
|
||
// 2025-01-01 12:30 UTC — clearly mid-hour so "next" top-of-hour is 13:00.
|
||
ref := time.Date(2025, 1, 1, 12, 30, 0, 0, time.UTC)
|
||
|
||
next, err := ComputeNextRun("0 * * * *", "UTC", ref)
|
||
if err != nil {
|
||
t.Fatalf("ComputeNextRun(valid expr) returned unexpected error: %v", err)
|
||
}
|
||
if !next.After(ref) {
|
||
t.Errorf("ComputeNextRun() = %v, want a time strictly after ref %v", next, ref)
|
||
}
|
||
if next.Minute() != 0 {
|
||
t.Errorf("ComputeNextRun() minute = %d, want 0 (top of hour)", next.Minute())
|
||
}
|
||
}
|
||
|
||
// ── TestComputeNextRun_invalid ────────────────────────────────────────────────
|
||
|
||
// TestComputeNextRun_invalid confirms that an unparseable cron expression
|
||
// returns a non-nil error.
|
||
func TestComputeNextRun_invalid(t *testing.T) {
|
||
_, err := ComputeNextRun("not-a-cron", "UTC", time.Now())
|
||
if err == nil {
|
||
t.Error("ComputeNextRun(invalid cron expr) = nil, want non-nil error")
|
||
}
|
||
}
|
||
|
||
// ── TestComputeNextRun_invalidTimezone ────────────────────────────────────────
|
||
|
||
// TestComputeNextRun_invalidTimezone confirms that an unrecognised IANA
|
||
// timezone name returns a non-nil error (rather than silently falling back
|
||
// to UTC, which could mask misconfigured schedules).
|
||
func TestComputeNextRun_invalidTimezone(t *testing.T) {
|
||
_, err := ComputeNextRun("0 * * * *", "Not/AZone", time.Now())
|
||
if err == nil {
|
||
t.Error("ComputeNextRun(invalid tz) = nil, want non-nil error")
|
||
}
|
||
}
|
||
|
||
// ── TestPanicRecovery ─────────────────────────────────────────────────────────
|
||
|
||
// TestPanicRecovery verifies that a panic inside a fireSchedule goroutine does
|
||
// NOT crash the scheduler.
|
||
//
|
||
// The test calls tick() directly with a sqlmock that surfaces one due schedule.
|
||
// panicProxy causes ProxyA2ARequest to panic; the deferred recover() in
|
||
// fireSchedule catches it. After tick() returns, lastTickAt must be set and
|
||
// Healthy() must return true — proving the scheduler survived.
|
||
//
|
||
// Without panic recovery an unrecovered goroutine panic terminates the entire
|
||
// test binary, so the test completing is itself evidence that recovery worked.
|
||
func TestPanicRecovery(t *testing.T) {
|
||
mock := setupTestDB(t)
|
||
|
||
// WorkspaceID must be ≥12 chars: fireSchedule slices it with [:12] for logging.
|
||
schedRows := sqlmock.NewRows(
|
||
[]string{"id", "workspace_id", "name", "cron_expr", "timezone", "prompt"},
|
||
).AddRow(
|
||
"sched-panic-01", // id
|
||
"ws-panic-workspace-1", // workspace_id (21 chars > 12)
|
||
"panic-job", // name
|
||
"* * * * *", // cron_expr
|
||
"UTC", // timezone
|
||
"fire and panic", // prompt
|
||
)
|
||
mock.ExpectQuery(`SELECT id, workspace_id`).WillReturnRows(schedRows)
|
||
|
||
s := New(&panicProxy{}, nil)
|
||
|
||
// tick() launches fireSchedule in a goroutine that will panic.
|
||
// If there is no recovery, the goroutine crash terminates the process here.
|
||
s.tick(context.Background())
|
||
|
||
// tick() returned normally → the panic was caught, wg.Wait() completed,
|
||
// and lastTickAt was updated.
|
||
if !s.Healthy() {
|
||
t.Error("Healthy() = false after panic-recovery tick, want true " +
|
||
"— scheduler must survive a panicking A2A proxy")
|
||
}
|
||
|
||
if err := mock.ExpectationsWereMet(); err != nil {
|
||
t.Errorf("unmet DB expectations: %v", err)
|
||
}
|
||
}
|
||
|
||
// ── TestShort_helper ──────────────────────────────────────────────────────────
|
||
// Regression guard for the short() helper that replaced unsafe [:N] slices
|
||
// after code review. Panicked when IDs were shorter than the slice bound.
|
||
|
||
func TestShort_helper(t *testing.T) {
|
||
cases := []struct {
|
||
in string
|
||
n int
|
||
want string
|
||
}{
|
||
{"abcdef1234567890", 8, "abcdef12"},
|
||
{"abc", 8, "abc"}, // shorter than n — no panic, no truncation
|
||
{"", 8, ""},
|
||
{"12345678", 8, "12345678"}, // exactly n
|
||
}
|
||
for _, tc := range cases {
|
||
if got := short(tc.in, tc.n); got != tc.want {
|
||
t.Errorf("short(%q, %d) = %q, want %q", tc.in, tc.n, got, tc.want)
|
||
}
|
||
}
|
||
}
|
||
|
||
// ── TestRecordSkipped_writesSkippedStatus ────────────────────────────────────
|
||
// #115 coverage gap: the recordSkipped path wasn't tested at all when it
|
||
// first landed. Exercises the UPDATE workspace_schedules + INSERT into
|
||
// activity_logs via sqlmock. Broadcaster is nil so we don't need to stub
|
||
// RecordAndBroadcast (the nil-check in recordSkipped handles that).
|
||
|
||
func TestRecordSkipped_writesSkippedStatus(t *testing.T) {
|
||
mock := setupTestDB(t)
|
||
s := New(nil, nil)
|
||
|
||
sched := scheduleRow{
|
||
ID: "11111111-1111-1111-1111-111111111111",
|
||
WorkspaceID: "22222222-2222-2222-2222-222222222222",
|
||
Name: "Hourly security audit",
|
||
CronExpr: "17 * * * *",
|
||
Timezone: "UTC",
|
||
Prompt: "audit",
|
||
}
|
||
|
||
// Expect the schedule-row UPDATE with last_status='skipped' and the
|
||
// cron_run activity_logs INSERT with status='skipped' + error_detail
|
||
// carrying the active_tasks reason.
|
||
mock.ExpectExec(`UPDATE workspace_schedules`).
|
||
WithArgs(sched.ID, sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||
mock.ExpectExec(`INSERT INTO activity_logs`).
|
||
WithArgs(sched.WorkspaceID, sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||
|
||
s.recordSkipped(context.Background(), sched, 3)
|
||
|
||
if err := mock.ExpectationsWereMet(); err != nil {
|
||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||
}
|
||
}
|
||
|
||
// ── successProxy ─────────────────────────────────────────────────────────────
|
||
|
||
// successProxy is a test double whose ProxyA2ARequest always returns HTTP 200
|
||
// with no error, simulating a healthy A2A round-trip.
|
||
type successProxy struct{}
|
||
|
||
func (p *successProxy) ProxyA2ARequest(
|
||
_ context.Context, _ string, _ []byte, _ string, _ bool,
|
||
) (int, []byte, error) {
|
||
return 200, []byte(`{"ok":true}`), nil
|
||
}
|
||
|
||
// ── TestFireSchedule_ComputeNextRunError (#722 Bug 1) ─────────────────────────
|
||
//
|
||
// When ComputeNextRun fails (bad cron expression), fireSchedule must NOT write
|
||
// NULL to next_run_at — it must use COALESCE so the existing DB value is kept.
|
||
// Proof: the UPDATE ExecContext must still be called (schedule not abandoned)
|
||
// and sqlmock satisfies all expectations (no unexpected SQL).
|
||
|
||
func TestFireSchedule_ComputeNextRunError(t *testing.T) {
|
||
mock := setupTestDB(t)
|
||
|
||
sched := scheduleRow{
|
||
ID: "11111111-dead-beef-0000-000000000001",
|
||
WorkspaceID: "22222222-dead-beef-0000-000000000002",
|
||
Name: "bad-cron-job",
|
||
CronExpr: "not-a-valid-cron", // guaranteed to fail ComputeNextRun
|
||
Timezone: "UTC",
|
||
Prompt: "do something",
|
||
}
|
||
|
||
// active_tasks check → 0 (workspace is idle; proceed to fire)
|
||
mock.ExpectQuery(`SELECT COALESCE`).
|
||
WillReturnRows(sqlmock.NewRows([]string{"coalesce"}).AddRow(0))
|
||
|
||
// UPDATE must fire — COALESCE($2, next_run_at) keeps existing value when $2 is nil.
|
||
// AnyArg for $2 because it will be nil (ComputeNextRun failed).
|
||
mock.ExpectExec(`UPDATE workspace_schedules`).
|
||
WithArgs(sched.ID, sqlmock.AnyArg(), "ok", "").
|
||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||
|
||
// activity_logs INSERT always fires
|
||
mock.ExpectExec(`INSERT INTO activity_logs`).
|
||
WithArgs(sched.WorkspaceID, sqlmock.AnyArg(), sqlmock.AnyArg(), "ok", "").
|
||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||
|
||
s := New(&successProxy{}, nil)
|
||
s.fireSchedule(context.Background(), sched)
|
||
|
||
if err := mock.ExpectationsWereMet(); err != nil {
|
||
t.Errorf("unmet DB expectations — schedule update was skipped or next_run_at not preserved: %v", err)
|
||
}
|
||
}
|
||
|
||
// ── TestRecordSkipped_ComputeNextRunError (#722 Bug 1 — skipped path) ─────────
|
||
//
|
||
// Same invariant as TestFireSchedule_ComputeNextRunError but for the
|
||
// recordSkipped path: a bad cron expression must not NULL out next_run_at.
|
||
|
||
func TestRecordSkipped_ComputeNextRunError(t *testing.T) {
|
||
mock := setupTestDB(t)
|
||
|
||
sched := scheduleRow{
|
||
ID: "33333333-dead-beef-0000-000000000003",
|
||
WorkspaceID: "44444444-dead-beef-0000-000000000004",
|
||
Name: "bad-cron-skip",
|
||
CronExpr: "not-a-valid-cron",
|
||
Timezone: "UTC",
|
||
Prompt: "skipped task",
|
||
}
|
||
|
||
mock.ExpectExec(`UPDATE workspace_schedules`).
|
||
WithArgs(sched.ID, sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||
mock.ExpectExec(`INSERT INTO activity_logs`).
|
||
WithArgs(sched.WorkspaceID, sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||
|
||
s := New(nil, nil)
|
||
s.recordSkipped(context.Background(), sched, 2)
|
||
|
||
if err := mock.ExpectationsWereMet(); err != nil {
|
||
t.Errorf("unmet DB expectations: %v", err)
|
||
}
|
||
}
|
||
|
||
// ── TestRepairNullNextRunAt_RepairsRows (#722 Bug 3) ──────────────────────────
|
||
//
|
||
// repairNullNextRunAt must SELECT enabled schedules with NULL next_run_at,
|
||
// compute the next fire time, and UPDATE each row.
|
||
|
||
func TestRepairNullNextRunAt_RepairsRows(t *testing.T) {
|
||
mock := setupTestDB(t)
|
||
|
||
// Two schedules whose next_run_at is NULL and whose cron exprs are valid.
|
||
mock.ExpectQuery(`SELECT id, cron_expr, timezone`).
|
||
WillReturnRows(sqlmock.NewRows([]string{"id", "cron_expr", "timezone"}).
|
||
AddRow("sched-repair-01", "0 * * * *", "UTC").
|
||
AddRow("sched-repair-02", "30 9 * * 1", "America/New_York"))
|
||
|
||
// Expect one UPDATE per repaired row.
|
||
mock.ExpectExec(`UPDATE workspace_schedules`).
|
||
WithArgs("sched-repair-01", sqlmock.AnyArg()).
|
||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||
mock.ExpectExec(`UPDATE workspace_schedules`).
|
||
WithArgs("sched-repair-02", sqlmock.AnyArg()).
|
||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||
|
||
s := New(nil, nil)
|
||
s.repairNullNextRunAt(context.Background())
|
||
|
||
if err := mock.ExpectationsWereMet(); err != nil {
|
||
t.Errorf("unmet DB expectations: %v", err)
|
||
}
|
||
}
|
||
|
||
// ── TestRepairNullNextRunAt_DBError_NoPanic (#722 Bug 3) ──────────────────────
|
||
//
|
||
// A DB error from the SELECT must be logged but must not panic — the scheduler
|
||
// startup should proceed normally.
|
||
|
||
func TestRepairNullNextRunAt_DBError_NoPanic(t *testing.T) {
|
||
mock := setupTestDB(t)
|
||
|
||
mock.ExpectQuery(`SELECT id, cron_expr, timezone`).
|
||
WillReturnError(errDBDown)
|
||
|
||
s := New(nil, nil)
|
||
// Must not panic:
|
||
s.repairNullNextRunAt(context.Background())
|
||
|
||
if err := mock.ExpectationsWereMet(); err != nil {
|
||
t.Errorf("unmet DB expectations: %v", err)
|
||
}
|
||
}
|
||
|
||
// ── TestRecordSkipped_shortWorkspaceIDNoPanic ─────────────────────────────────
|
||
// Guards against the short() regression: recordSkipped must not panic if
|
||
// WorkspaceID is unexpectedly shorter than the 12-char prefix used in logs.
|
||
|
||
func TestRecordSkipped_shortWorkspaceIDNoPanic(t *testing.T) {
|
||
mock := setupTestDB(t)
|
||
s := New(nil, nil)
|
||
|
||
// 4-char workspace id — shorter than any substring bound in the code.
|
||
sched := scheduleRow{
|
||
ID: "11111111-1111-1111-1111-111111111111",
|
||
WorkspaceID: "ws-x",
|
||
Name: "test",
|
||
CronExpr: "0 * * * *",
|
||
Timezone: "UTC",
|
||
}
|
||
mock.ExpectExec(`UPDATE workspace_schedules`).
|
||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||
mock.ExpectExec(`INSERT INTO activity_logs`).
|
||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||
|
||
defer func() {
|
||
if r := recover(); r != nil {
|
||
t.Errorf("recordSkipped panicked on short WorkspaceID: %v", r)
|
||
}
|
||
}()
|
||
s.recordSkipped(context.Background(), sched, 1)
|
||
}
|