diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go index 2a2005cd..8b534f70 100644 --- a/workspace-server/internal/handlers/workspace.go +++ b/workspace-server/internal/handlers/workspace.go @@ -807,9 +807,11 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) { pq.Array(allIDs)); err != nil { log.Printf("Delete token revocation error for %s: %v", id, err) } - // Disable schedules for removed workspaces (#1027) +// #1027: cascade-disable all schedules for the deleted workspaces so + // the scheduler never fires a cron into a removed container. if _, err := db.DB.ExecContext(ctx, - `UPDATE workspace_schedules SET enabled = false WHERE workspace_id = ANY($1::uuid[])`, + `UPDATE workspace_schedules SET enabled = false, updated_at = now() + WHERE workspace_id = ANY($1::uuid[]) AND enabled = true`, pq.Array(allIDs)); err != nil { log.Printf("Delete schedule disable error for %s: %v", id, err) } diff --git a/workspace-server/internal/scheduler/scheduler.go b/workspace-server/internal/scheduler/scheduler.go index 718a55da..d74baa39 100644 --- a/workspace-server/internal/scheduler/scheduler.go +++ b/workspace-server/internal/scheduler/scheduler.go @@ -242,7 +242,7 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) { defer func() { if r := recover(); r != nil { log.Printf("Scheduler: panic recovered in fireSchedule for '%s' (%s): %v", - sched.Name, sched.ID, r) + sched.Name, sched.ID, r) // Always advance next_run_at even on panic so the schedule doesn't get // stuck re-firing the same panicking schedule indefinitely (#1029). if nextTime, err := ComputeNextRun(sched.CronExpr, sched.Timezone, time.Now()); err == nil { diff --git a/workspace-server/internal/scheduler/scheduler_test.go b/workspace-server/internal/scheduler/scheduler_test.go index 1a2bf293..67c2fcce 100644 --- a/workspace-server/internal/scheduler/scheduler_test.go +++ b/workspace-server/internal/scheduler/scheduler_test.go @@ -276,6 +276,12 @@ func TestFireSchedule_ComputeNextRunError(t *testing.T) { mock.ExpectQuery(`SELECT COALESCE`). WillReturnRows(sqlmock.NewRows([]string{"coalesce"}).AddRow(0)) + // #795 consecutive_empty_runs reset — successProxy returns {"ok":true} + // which is non-empty, so the counter is reset to 0. + mock.ExpectExec(`UPDATE workspace_schedules`). + WithArgs(sched.ID). + WillReturnResult(sqlmock.NewResult(0, 1)) + // UPDATE must fire — COALESCE($2, next_run_at) keeps existing value when $2 is nil. // AnyArg for $2 because it will be nil (ComputeNextRun failed). mock.ExpectExec(`UPDATE workspace_schedules`). @@ -453,14 +459,19 @@ func TestRecordSkipped_shortWorkspaceIDNoPanic(t *testing.T) { // ── Panic-recovery next_run_at advancement (#1029) ────────────────────────── // -// Issue #1029: when fireSchedule panics, the deferred recover must advance -// next_run_at to the next cron window. Without this fix the schedule's -// next_run_at stays in the past and fires on every 30-second tick. +// Issue #1029: when fireSchedule panics (e.g. a nil-pointer in the A2A proxy +// or a bad JSON marshal), the deferred recover must advance next_run_at to the +// next cron window. Without this fix the schedule's next_run_at stays in the +// past and fires on every 30-second tick — a tight retry loop that amplifies +// the original failure. -const testCronPanic = "0 * * * *" - -// TestPanicRecovery_AdvancesNextRunAt verifies that recover issues an UPDATE -// to advance next_run_at when the proxy panics. panic -> recover -> advance. +// TestPanicRecovery_AdvancesNextRunAt verifies that the recover block in +// fireSchedule issues an UPDATE to advance next_run_at when the proxy panics. +// +// This is the core invariant of the #1029 fix: panic → recover → advance. +// The test calls fireSchedule directly (not via tick) so the sqlmock +// expectations are precise — we know exactly which queries fire and in what +// order. func TestPanicRecovery_AdvancesNextRunAt(t *testing.T) { mock := setupTestDB(t) @@ -468,33 +479,42 @@ func TestPanicRecovery_AdvancesNextRunAt(t *testing.T) { ID: "aaa11111-1111-1111-1111-111111111111", WorkspaceID: "bbb22222-2222-2222-2222-222222222222", Name: "panic-advance-test", - CronExpr: testCronPanic, + CronExpr: "0 * * * *", // every hour — valid expr so ComputeNextRun succeeds Timezone: "UTC", Prompt: "trigger panic", } - // fireSchedule checks active_tasks first. + // 1. fireSchedule first checks active_tasks on the workspace. + // Return 0 so the fire proceeds (not skipped). mock.ExpectQuery(`SELECT COALESCE`). WillReturnRows(sqlmock.NewRows([]string{"coalesce"}).AddRow(0)) - // panicProxy causes ProxyA2ARequest to panic. - // Deferred recover calls ComputeNextRun, then ExecContext for next_run_at. + // 2. ProxyA2ARequest panics (panicProxy). + // The deferred recover catches it and calls: + // ComputeNextRun(cronExpr, tz, time.Now()) + // db.DB.ExecContext(ctx, `UPDATE workspace_schedules SET next_run_at = $1 ... WHERE id = $2`, nextTime, sched.ID) + // + // We expect this UPDATE with the schedule ID as arg 2. mock.ExpectExec(`UPDATE workspace_schedules SET next_run_at`). WithArgs(sqlmock.AnyArg(), sched.ID). WillReturnResult(sqlmock.NewResult(0, 1)) s := New(&panicProxy{}, nil) + // fireSchedule must not propagate the panic — the recover catches it. s.fireSchedule(context.Background(), sched) if err := mock.ExpectationsWereMet(); err != nil { - t.Errorf("unmet DB expectations: %v -"+ - "Panic-recovery defer must advance next_run_at via UPDATE (#1029)", err) + t.Errorf("unmet DB expectations: %v\n"+ + "The panic-recovery defer must advance next_run_at via UPDATE (#1029)", err) } } -// TestFireSchedule_NormalSuccess_AdvancesNextRunAt regression guard: normal -// fireSchedule completion must still advance next_run_at via the post-fire UPDATE. +// TestFireSchedule_NormalSuccess_AdvancesNextRunAt is a regression guard for +// the happy path: when fireSchedule completes without error, next_run_at must +// be advanced as part of the normal UPDATE (not via the panic path). +// +// This ensures the #1029 panic-recovery change didn't accidentally break the +// normal flow where both the proxy call and the post-fire UPDATE succeed. func TestFireSchedule_NormalSuccess_AdvancesNextRunAt(t *testing.T) { mock := setupTestDB(t) @@ -502,26 +522,28 @@ func TestFireSchedule_NormalSuccess_AdvancesNextRunAt(t *testing.T) { ID: "ccc33333-3333-3333-3333-333333333333", WorkspaceID: "ddd44444-4444-4444-4444-444444444444", Name: "normal-advance-test", - CronExpr: "30 * * * *", + CronExpr: "30 * * * *", // every hour at :30 Timezone: "UTC", Prompt: "do work", } - // active_tasks check -> workspace idle + // 1. active_tasks check → workspace idle mock.ExpectQuery(`SELECT COALESCE`). WillReturnRows(sqlmock.NewRows([]string{"coalesce"}).AddRow(0)) - // #795 consecutive_empty_runs reset + // 2. #795 consecutive_empty_runs reset — successProxy returns {"ok":true} + // which is non-empty, so the counter is reset to 0. mock.ExpectExec(`UPDATE workspace_schedules`). WithArgs(sched.ID). WillReturnResult(sqlmock.NewResult(0, 1)) - // Normal post-fire UPDATE + // 3. Normal UPDATE after successful proxy call. + // Args: $1=sched.ID, $2=nextRunPtr (computed time), $3=lastStatus, $4=lastError mock.ExpectExec(`UPDATE workspace_schedules`). WithArgs(sched.ID, sqlmock.AnyArg(), "ok", ""). WillReturnResult(sqlmock.NewResult(0, 1)) - // activity_logs INSERT + // 4. activity_logs INSERT mock.ExpectExec(`INSERT INTO activity_logs`). WithArgs(sched.WorkspaceID, sqlmock.AnyArg(), sqlmock.AnyArg(), "ok", ""). WillReturnResult(sqlmock.NewResult(0, 1)) @@ -530,14 +552,21 @@ func TestFireSchedule_NormalSuccess_AdvancesNextRunAt(t *testing.T) { s.fireSchedule(context.Background(), sched) if err := mock.ExpectationsWereMet(); err != nil { - t.Errorf("unmet DB expectations: %v -"+ + t.Errorf("unmet DB expectations: %v\n"+ "Normal fire must still advance next_run_at via the post-fire UPDATE", err) } } -// TestRecordSkipped_AdvancesNextRunAt verifies recordSkipped advances -// next_run_at so the schedule doesn't re-fire on the very next tick. +// TestRecordSkipped_AdvancesNextRunAt verifies that when a workspace is busy +// and the cron fire is skipped, recordSkipped advances next_run_at so the +// schedule doesn't re-fire on the very next tick. +// +// This is the third leg of the #1029 invariant: fire, panic, AND skip must +// all advance next_run_at. +// +// We call recordSkipped directly rather than going through fireSchedule +// because #969 added a deferral loop (poll every 10s for up to 2 min) that +// makes end-to-end testing via fireSchedule impractical with sqlmock. func TestRecordSkipped_AdvancesNextRunAt(t *testing.T) { mock := setupTestDB(t) @@ -545,27 +574,28 @@ func TestRecordSkipped_AdvancesNextRunAt(t *testing.T) { ID: "eee55555-5555-5555-5555-555555555555", WorkspaceID: "fff66666-6666-6666-6666-666666666666", Name: "skipped-advance-test", - CronExpr: "15 * * * *", + CronExpr: "15 * * * *", // every hour at :15 Timezone: "UTC", Prompt: "skipped work", } - // recordSkipped UPDATE + // 1. recordSkipped UPDATE — must include next_run_at ($2) and reason ($3). mock.ExpectExec(`UPDATE workspace_schedules`). WithArgs(sched.ID, sqlmock.AnyArg(), sqlmock.AnyArg()). WillReturnResult(sqlmock.NewResult(0, 1)) - // activity_logs INSERT + // 2. activity_logs INSERT for the skip event mock.ExpectExec(`INSERT INTO activity_logs`). WithArgs(sched.WorkspaceID, sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()). WillReturnResult(sqlmock.NewResult(0, 1)) s := New(&successProxy{}, nil) + // Call recordSkipped directly — simulates the skip path when workspace is busy. s.recordSkipped(context.Background(), sched, 2) if err := mock.ExpectationsWereMet(); err != nil { - t.Errorf("unmet DB expectations: %v -"+ + t.Errorf("unmet DB expectations: %v\n"+ "recordSkipped must advance next_run_at when workspace is busy (#1029)", err) } } +// trigger CI