fix(scheduler): detect phantom-producing crons via consecutive-empty tracking (#795)

Post-mortem fix: UIUX Designer ran 22 cron fires over 23 hours with
every single response being empty or '(no response generated)'. The
scheduler reported status=ok because the HTTP call succeeded — nobody
caught it until the CEO asked.

Changes:
- Migration 032: adds consecutive_empty_runs INT to workspace_schedules
- scheduler.go: captures response body from ProxyA2ARequest (was _),
  checks for empty/sentinel markers via isEmptyResponse(), increments
  consecutive_empty_runs on empty ok responses, resets on non-empty.
  When consecutive_empty_runs >= 3, sets last_status='stale' with a
  descriptive error message.

The 'stale' status is surfaced via:
- GET /admin/schedules/health (merged in #671)
- PM's silence detector (companion fix in org-template PR)
- Maintenance loop response-body sampling (operator-side fix)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
rabbitblood 2026-04-17 11:11:05 -07:00
parent a41a2ba663
commit 3249d3ffdb
3 changed files with 59 additions and 1 deletions

View File

@ -5,6 +5,7 @@ import (
"encoding/json"
"fmt"
"log"
"strings"
"sync"
"time"
@ -264,7 +265,7 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
// Empty callerID = canvas-style request (bypasses access control, source_id=NULL in activity log).
// "system:scheduler" was invalid — source_id column is UUID and rejects non-UUID strings.
statusCode, _, proxyErr := s.proxy.ProxyA2ARequest(fireCtx, sched.WorkspaceID, a2aBody, "", true)
statusCode, respBody, proxyErr := s.proxy.ProxyA2ARequest(fireCtx, sched.WorkspaceID, a2aBody, "", true)
lastStatus := "ok"
lastError := ""
@ -280,6 +281,34 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
log.Printf("Scheduler: '%s' completed (HTTP %d)", sched.Name, statusCode)
}
// #795: detect phantom-producing schedules — cron fires successfully
// but the agent returns empty or "(no response generated)". Track
// consecutive empties and escalate to 'stale' after 3 in a row.
isEmpty := isEmptyResponse(respBody)
if lastStatus == "ok" && isEmpty {
db.DB.ExecContext(ctx, `
UPDATE workspace_schedules
SET consecutive_empty_runs = consecutive_empty_runs + 1,
updated_at = now()
WHERE id = $1`, sched.ID)
// Check if we've crossed the stale threshold
var consecEmpty int
db.DB.QueryRowContext(ctx, `SELECT consecutive_empty_runs FROM workspace_schedules WHERE id = $1`, sched.ID).Scan(&consecEmpty)
if consecEmpty >= 3 {
lastStatus = "stale"
lastError = fmt.Sprintf("empty response %d consecutive times — agent may be phantom-producing (#795)", consecEmpty)
log.Printf("Scheduler: '%s' STALE — %d consecutive empty responses (workspace %s)",
sched.Name, consecEmpty, short(sched.WorkspaceID, 12))
}
} else if lastStatus == "ok" {
// Non-empty success — reset the counter
db.DB.ExecContext(ctx, `
UPDATE workspace_schedules
SET consecutive_empty_runs = 0,
updated_at = now()
WHERE id = $1`, sched.ID)
}
nextRun, nextErr := ComputeNextRun(sched.CronExpr, sched.Timezone, time.Now())
var nextRunPtr *time.Time
if nextErr == nil {
@ -442,6 +471,30 @@ func (s *Scheduler) repairNullNextRunAt(ctx context.Context) {
}
}
// isEmptyResponse reports whether an A2A response body indicates the agent
// produced no meaningful output. Used by the consecutive-empty tracker (#795)
// to detect phantom-producing crons.
//
// A body counts as empty when, after trimming surrounding whitespace, it is
// zero-length or the JSON literal null, or when it contains either the
// "(no response generated)" sentinel emitted by the workspace runtime or an
// empty "text" field (with or without a space after the colon).
//
// Matching is substring-based on the raw body, not a structural JSON parse:
// a reply that merely quotes the sentinel, or that has one empty text part
// alongside non-empty ones, is also reported as empty.
func isEmptyResponse(body []byte) bool {
	// Trim first so a whitespace-only body ("\n", " ") is not mistaken for
	// real output and does not reset the consecutive-empty counter.
	s := strings.TrimSpace(string(body))
	if s == "" || s == "null" {
		return true
	}
	// The A2A response wraps the agent text in {"result":{"parts":[{"text":"..."}]}}.
	// The bare sentinel also matches the `"text": "(no response generated)"`
	// form, so that form needs no marker of its own.
	for _, marker := range []string{
		`(no response generated)`,
		`"text":""`,
		`"text": ""`,
	} {
		if strings.Contains(s, marker) {
			return true
		}
	}
	return false
}
func truncate(s string, maxLen int) string {
if len(s) <= maxLen {
return s

View File

@ -0,0 +1 @@
-- #795 (revert): remove the consecutive_empty_runs counter from workspace_schedules.
-- IF EXISTS makes this a no-op when the column is already absent.
ALTER TABLE workspace_schedules DROP COLUMN IF EXISTS consecutive_empty_runs;

View File

@ -0,0 +1,4 @@
-- #795: Track consecutive empty cron responses to detect phantom-producing schedules.
-- The scheduler increments consecutive_empty_runs on each empty 'ok' response and resets
-- it to 0 on a non-empty one. When consecutive_empty_runs >= 3, the scheduler sets
-- last_status='stale' instead of 'ok', making it visible in /admin/schedules/health and
-- the PM silence-detector.
-- NOT NULL DEFAULT 0 gives every schedule a zero starting count; IF NOT EXISTS makes the
-- statement safe to re-run.
ALTER TABLE workspace_schedules ADD COLUMN IF NOT EXISTS consecutive_empty_runs INTEGER NOT NULL DEFAULT 0;