fix(scheduler): detect phantom-producing crons via consecutive-empty tracking (#795)
Post-mortem fix: UIUX Designer ran 22 cron fires over 23 hours with every single response being empty or '(no response generated)'. The scheduler reported status=ok because the HTTP call succeeded — nobody caught it until the CEO asked.

Changes:
- Migration 032: adds consecutive_empty_runs INT to workspace_schedules
- scheduler.go: captures response body from ProxyA2ARequest (was _), checks for empty/sentinel markers via isEmptyResponse(), increments consecutive_empty_runs on empty ok responses, resets on non-empty. When consecutive_empty_runs >= 3, sets last_status='stale' with a descriptive error message.

The 'stale' status is surfaced via:
- GET /admin/schedules/health (merged in #671)
- PM's silence detector (companion fix in org-template PR)
- Maintenance loop response-body sampling (operator-side fix)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a41a2ba663
commit
3249d3ffdb
@ -5,6 +5,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@ -264,7 +265,7 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
|
||||
|
||||
// Empty callerID = canvas-style request (bypasses access control, source_id=NULL in activity log).
|
||||
// "system:scheduler" was invalid — source_id column is UUID and rejects non-UUID strings.
|
||||
statusCode, _, proxyErr := s.proxy.ProxyA2ARequest(fireCtx, sched.WorkspaceID, a2aBody, "", true)
|
||||
statusCode, respBody, proxyErr := s.proxy.ProxyA2ARequest(fireCtx, sched.WorkspaceID, a2aBody, "", true)
|
||||
|
||||
lastStatus := "ok"
|
||||
lastError := ""
|
||||
@ -280,6 +281,34 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
|
||||
log.Printf("Scheduler: '%s' completed (HTTP %d)", sched.Name, statusCode)
|
||||
}
|
||||
|
||||
// #795: detect phantom-producing schedules — cron fires successfully
|
||||
// but the agent returns empty or "(no response generated)". Track
|
||||
// consecutive empties and escalate to 'stale' after 3 in a row.
|
||||
isEmpty := isEmptyResponse(respBody)
|
||||
if lastStatus == "ok" && isEmpty {
|
||||
db.DB.ExecContext(ctx, `
|
||||
UPDATE workspace_schedules
|
||||
SET consecutive_empty_runs = consecutive_empty_runs + 1,
|
||||
updated_at = now()
|
||||
WHERE id = $1`, sched.ID)
|
||||
// Check if we've crossed the stale threshold
|
||||
var consecEmpty int
|
||||
db.DB.QueryRowContext(ctx, `SELECT consecutive_empty_runs FROM workspace_schedules WHERE id = $1`, sched.ID).Scan(&consecEmpty)
|
||||
if consecEmpty >= 3 {
|
||||
lastStatus = "stale"
|
||||
lastError = fmt.Sprintf("empty response %d consecutive times — agent may be phantom-producing (#795)", consecEmpty)
|
||||
log.Printf("Scheduler: '%s' STALE — %d consecutive empty responses (workspace %s)",
|
||||
sched.Name, consecEmpty, short(sched.WorkspaceID, 12))
|
||||
}
|
||||
} else if lastStatus == "ok" {
|
||||
// Non-empty success — reset the counter
|
||||
db.DB.ExecContext(ctx, `
|
||||
UPDATE workspace_schedules
|
||||
SET consecutive_empty_runs = 0,
|
||||
updated_at = now()
|
||||
WHERE id = $1`, sched.ID)
|
||||
}
|
||||
|
||||
nextRun, nextErr := ComputeNextRun(sched.CronExpr, sched.Timezone, time.Now())
|
||||
var nextRunPtr *time.Time
|
||||
if nextErr == nil {
|
||||
@ -442,6 +471,30 @@ func (s *Scheduler) repairNullNextRunAt(ctx context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// isEmptyResponse reports whether an A2A response body indicates the agent
// produced no meaningful output. It is used by the consecutive-empty tracker
// (#795) to detect phantom-producing crons.
//
// A body counts as empty when it is:
//   - zero-length or whitespace only,
//   - a bare JSON null, or
//   - a payload containing the "(no response generated)" sentinel emitted by
//     the workspace runtime, or an empty "text" field.
//
// Detection is a substring match on the raw body, not a JSON parse, so a
// response that merely quotes the sentinel, or that carries an empty text
// part next to non-empty ones, is also reported as empty.
func isEmptyResponse(body []byte) bool {
	s := strings.TrimSpace(string(body))
	// Nothing at all, only whitespace, or a bare JSON null.
	if s == "" || s == "null" {
		return true
	}
	// The A2A response wraps the agent text in {"result":{"parts":[{"text":"..."}]}}.
	// The bare sentinel already matches every quoted/spaced variant of itself,
	// so only the empty-text encodings need separate entries.
	for _, marker := range []string{
		`(no response generated)`,
		`"text":""`,
		`"text": ""`,
	} {
		if strings.Contains(s, marker) {
			return true
		}
	}
	return false
}
|
||||
|
||||
func truncate(s string, maxLen int) string {
|
||||
if len(s) <= maxLen {
|
||||
return s
|
||||
|
||||
@ -0,0 +1 @@
|
||||
-- #795 (rollback): remove the consecutive-empty-run counter from workspace_schedules.
-- IF EXISTS keeps the statement safe to run when the column is already absent.
ALTER TABLE workspace_schedules DROP COLUMN IF EXISTS consecutive_empty_runs;
|
||||
@ -0,0 +1,4 @@
|
||||
-- #795: Track consecutive empty cron responses to detect phantom-producing schedules.
|
||||
-- When consecutive_empty_runs >= 3, the scheduler sets last_status='stale' instead of 'ok',
|
||||
-- making it visible in /admin/schedules/health and the PM silence-detector.
|
||||
-- NOT NULL DEFAULT 0 gives every existing schedule a zero count, so the
-- scheduler's "consecutive_empty_runs + 1" update never operates on NULL.
-- IF NOT EXISTS makes the statement safe to re-run.
ALTER TABLE workspace_schedules ADD COLUMN IF NOT EXISTS consecutive_empty_runs INTEGER NOT NULL DEFAULT 0;
|
||||
Loading…
Reference in New Issue
Block a user