From 9b2d2bb0fe875415cde161b34af40cde4e075aea Mon Sep 17 00:00:00 2001
From: "Molecule AI Dev Engineer B (MiniMax)"
 <dev-engineer-b-minimax@agents.moleculesai.app>
Date: Sat, 23 May 2026 01:27:31 +0000
Subject: [PATCH 1/5] =?UTF-8?q?fix(scheduler):=20#1696=20=E2=80=94=20detec?=
 =?UTF-8?q?t=20SDK-layer=20errors=20inside=20HTTP=20200=20responses?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The claude-code-sdk adapter returns HTTP 200 even when the inner LLM
call throws (Max-plan rate-limit, quota exhaustion, SDK internal errors).
Before this fix all such failures surfaced as "completed (HTTP 200)" in
workspace_schedules.last_status while the agent chat showed errors —
a silent failure that hid persistent schedule outages from operators.

Change:
- Added detectResultKind() helper that inspects the A2A response body
  for result.kind / result.result_kind fields and maps SDK error strings
  (rate limit, quota, API key, etc.) to canonical kind values.
- fireSchedule() now calls detectResultKind() on every HTTP-200 response.
  Non-ok result_kind values are propagated as last_status (e.g.
  "rate_limited", "quota_exhausted", "sdk_error").
- Added consecutive_sdk_errors counter column to workspace_schedules.
  Increment on each SDK error; auto-disable the schedule after 3
  consecutive SDK errors to stop token burn on a consistently failing
  schedule. Counter resets on any non-SDK-error run.
- Added migration 20260523000000_schedule_consecutive_sdk_errors.
- Added unit tests for detectResultKind() covering all known error
  shapes, plus integration tests for rate-limited first run, 3rd-run
  auto-disable, and counter-reset on clean run.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../internal/scheduler/scheduler.go           | 132 +++++++-
 .../internal/scheduler/scheduler_test.go      | 287 +++++++++++++++++-
 ...0_schedule_consecutive_sdk_errors.down.sql |   4 +
 ...000_schedule_consecutive_sdk_errors.up.sql |  20 ++
 4 files changed, 441 insertions(+), 2 deletions(-)
 create mode 100644 workspace-server/migrations/20260523000000_schedule_consecutive_sdk_errors.down.sql
 create mode 100644 workspace-server/migrations/20260523000000_schedule_consecutive_sdk_errors.up.sql

diff --git a/workspace-server/internal/scheduler/scheduler.go b/workspace-server/internal/scheduler/scheduler.go
index 10e67208e..ce8efbd69 100644
--- a/workspace-server/internal/scheduler/scheduler.go
+++ b/workspace-server/internal/scheduler/scheduler.go
@@ -425,6 +425,7 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
 
 	lastStatus := "ok"
 	lastError := ""
+	resultKind := ""
 	if proxyErr != nil {
 		lastStatus = "error"
 		lastError = fmt.Sprintf("%v", proxyErr)
@@ -438,7 +439,21 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
 		lastError = fmt.Sprintf("A2A adapter error: %s", a2aErr)
 		log.Printf("Scheduler: '%s' A2A adapter error (HTTP %d): %s", sched.Name, statusCode, a2aErr)
 	} else {
-		log.Printf("Scheduler: '%s' completed (HTTP %d)", sched.Name, statusCode)
+		// HTTP 200 — inspect response body for SDK-layer errors.
+		// The claude-code-sdk adapter returns HTTP 200 even when the inner
+		// LLM call throws (e.g. Max-plan rate-limit, quota exhaustion, SDK
+		// internal errors). Without this check those failures surface as
+		// "completed (HTTP 200)" in last_status while the agent chat shows
+		// errors — a silent failure that hides schedule outages.
+		// See: #1696.
+		resultKind = detectResultKind(respBody)
+		if resultKind != "" && resultKind != "ok" {
+			lastStatus = resultKind
+			lastError = fmt.Sprintf("SDK error: result_kind=%s", resultKind)
+			log.Printf("Scheduler: '%s' SDK error detected — result_kind=%s", sched.Name, resultKind)
+		} else {
+			log.Printf("Scheduler: '%s' completed (HTTP %d)", sched.Name, statusCode)
+		}
 	}
 
 	// #795: detect phantom-producing schedules — cron fires successfully
@@ -483,6 +498,54 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
 		resetCancel()
 	}
 
+	// #1696: track consecutive SDK errors. When the adapter returns HTTP 200
+	// but the response body signals a non-ok result_kind (rate_limited,
+	// sdk_error, quota_exhausted), we increment a counter. After 3 consecutive
+	// SDK errors we auto-disable the schedule and log it — the schedule is
+	// suffering a persistent LLM-layer failure and firing it again will keep
+	// producing the same errors while burning tokens.
+	//
+	// Only apply when the current lastStatus is a non-ok resultKind (not when
+	// we already have 'error' from proxyErr or non-2xx HTTP status — those have
+	// their own failure semantics). Also skip when lastStatus is 'stale' (the
+	// empty-response escalation path takes priority).
+	var consecSDK int
+	if resultKind != "" && resultKind != "ok" {
+		sdkCtx, sdkCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
+		if err := db.DB.QueryRowContext(sdkCtx, `
+			UPDATE workspace_schedules
+			SET consecutive_sdk_errors = consecutive_sdk_errors + 1,
+			    updated_at = now()
+			WHERE id = $1
+			RETURNING consecutive_sdk_errors`, sched.ID).Scan(&consecSDK); err != nil {
+			log.Printf("Scheduler: '%s' SDK-error bump failed: %v", sched.Name, err)
+		}
+		sdkCancel()
+		if consecSDK >= 3 {
+			log.Printf("Scheduler: '%s' AUTO-DISABLING after %d consecutive SDK errors (workspace %s)",
+				sched.Name, consecSDK, short(sched.WorkspaceID, 12))
+			autoDisableCtx, autoDisableCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
+			_, _ = db.DB.ExecContext(autoDisableCtx, `
+				UPDATE workspace_schedules SET enabled = false, updated_at = now() WHERE id = $1 AND enabled = true`,
+				sched.ID)
+			autoDisableCancel()
+		}
+	} else {
+		// Non-SDK-error run — reset the counter.
+		// Guard: only reset when lastStatus is a clean ok (not 'stale', not
+		// 'error', not resultKind). An 'ok' resultKind means the SDK is fine
+		// and we should clear the streak.
+		if lastStatus == "ok" {
+			resetCtx, resetCancel := context.WithTimeout(context.Background(), dbQueryTimeout)
+			_, _ = db.DB.ExecContext(resetCtx, `
+				UPDATE workspace_schedules
+				SET consecutive_sdk_errors = 0,
+				    updated_at = now()
+				WHERE id = $1`, sched.ID)
+			resetCancel()
+		}
+	}
+
 	nextRun, nextErr := ComputeNextRun(sched.CronExpr, sched.Timezone, time.Now())
 	var nextRunPtr *time.Time
 	if nextErr == nil {
@@ -763,6 +826,73 @@ func (s *Scheduler) sweepPhantomBusy(ctx context.Context) {
 	}
 }
 
+// detectResultKind inspects an A2A response body for SDK-layer error signals
+// that are invisible at the HTTP level. The claude-code-sdk adapter returns
+// HTTP 200 even when the inner LLM call throws (Max-plan rate-limit, quota
+// exhaustion, SDK internal errors) — the error surfaces only in the response
+// body under result.kind or result.result_kind.
+//
+// Returns an empty string when the response is clean (result_kind is "ok" or
+// absent). Returns the result_kind value when it is a non-ok signal, so callers
+// can propagate it as the schedule's last_status.
+//
+// Known non-ok kinds:
+//   - "rate_limited" — LLM API rate-limit hit (Max-plan, etc.)
+//   - "quota_exhausted" — quota / budget exhausted
+//   - "sdk_error" — SDK threw an internal error
+//
+// See #1696.
+func detectResultKind(body []byte) string {
+	if len(body) == 0 {
+		return ""
+	}
+	var top map[string]json.RawMessage
+	if err := json.Unmarshal(body, &top); err != nil {
+		return ""
+	}
+	// Check result.kind first (canonical JSON-RPC shape).
+	if rawResult, ok := top["result"]; ok {
+		var result map[string]json.RawMessage
+		if err := json.Unmarshal(rawResult, &result); err == nil {
+			// result.kind (canonical JSON-RPC error envelope field).
+			if rawKind, ok := result["kind"]; ok {
+				var k string
+				if json.Unmarshal(rawKind, &k) == nil && k != "" && k != "ok" {
+					return k
+				}
+			}
+			// result.result_kind (legacy / alternative field name).
+			if rawKind, ok := result["result_kind"]; ok {
+				var k string
+				if json.Unmarshal(rawKind, &k) == nil && k != "" && k != "ok" {
+					return k
+				}
+			}
+		}
+	}
+	// Top-level error: non-ok HTTP 200 with a structured error in the body.
+	if errMsg, ok := top["error"].(string); ok && errMsg != "" {
+		// Distinguish SDK errors from other errors. SDK-layer errors from the
+		// Claude Code runtime include specific markers.
+		lower := strings.ToLower(errMsg)
+		for _, marker := range []string{"rate limit", "rate_limit", "quota", "max-plan", "max_plan",
+			"claude code returned an error", "sdk error", "api key", "authentication"} {
+			if strings.Contains(lower, marker) {
+				// Map common SDK error strings to result_kind equivalents.
+				switch {
+				case strings.Contains(lower, "rate"):
+					return "rate_limited"
+				case strings.Contains(lower, "quota") || strings.Contains(lower, "max-plan") || strings.Contains(lower, "budget"):
+					return "quota_exhausted"
+				default:
+					return "sdk_error"
+				}
+			}
+		}
+	}
+	return ""
+}
+
 // isEmptyResponse checks if an A2A response body indicates the agent
 // produced no meaningful output. Catches "(no response generated)" from
 // the workspace runtime + genuinely empty/null responses. Used by the
diff --git a/workspace-server/internal/scheduler/scheduler_test.go b/workspace-server/internal/scheduler/scheduler_test.go
index 28272967c..898d52a9b 100644
--- a/workspace-server/internal/scheduler/scheduler_test.go
+++ b/workspace-server/internal/scheduler/scheduler_test.go
@@ -3,6 +3,7 @@ package scheduler
 import (
 	"context"
 	"database/sql"
+	"encoding/json"
 	"testing"
 	"time"
 	"unicode/utf8"
@@ -337,6 +338,12 @@ func TestFireSchedule_ComputeNextRunError(t *testing.T) {
 		WithArgs(sched.ID).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 
+	// #1696 consecutive_sdk_errors reset — successProxy has no result_kind,
+	// so detectResultKind returns "" and lastStatus="ok" → reset.
+	mock.ExpectExec(`UPDATE workspace_schedules`).
+		WithArgs(sched.ID).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
 	// UPDATE must fire — COALESCE($2, next_run_at) keeps existing value when $2 is nil.
 	// AnyArg for $2 because it will be nil (ComputeNextRun failed).
 	mock.ExpectExec(`UPDATE workspace_schedules`).
@@ -592,7 +599,14 @@ func TestFireSchedule_NormalSuccess_AdvancesNextRunAt(t *testing.T) {
 		WithArgs(sched.ID).
 		WillReturnResult(sqlmock.NewResult(0, 1))
 
-	// 3. Normal UPDATE after successful proxy call.
+	// 3. #1696 consecutive_sdk_errors reset — successProxy response has no
+	//    result_kind in the body, so detectResultKind returns "" and lastStatus
+	//    is "ok" → we hit the SDK-error counter reset branch.
+	mock.ExpectExec(`UPDATE workspace_schedules`).
+		WithArgs(sched.ID).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// 4. Normal UPDATE after successful proxy call.
 	//    Args: $1=sched.ID, $2=nextRunPtr (computed time), $3=lastStatus, $4=lastError
 	mock.ExpectExec(`UPDATE workspace_schedules`).
 		WithArgs(sched.ID, sqlmock.AnyArg(), "ok", "").
@@ -655,6 +669,277 @@ func TestRecordSkipped_AdvancesNextRunAt(t *testing.T) {
 }
 // trigger CI
 
+// ── TestDetectResultKind ───────────────────────────────────────────────────────
+
+// TestDetectResultKind covers the SDK error detection path: HTTP 200 responses
+// with non-ok result_kind in the body must be recognised and returned as the
+// kind string, not silently treated as ok.
+func TestDetectResultKind(t *testing.T) {
+	// The test exercises detectResultKind directly so we don't need a full
+	// fireSchedule mock for this unit-test level.
+	tests := []struct {
+		name     string
+		body     string
+		wantKind string
+	}{
+		{
+			name:     "clean ok response — empty body",
+			body:     `{}`,
+			wantKind: "",
+		},
+		{
+			name:     "clean ok response — result.kind absent",
+			body:     `{"result":{"parts":[{"text":"hello"}]}}`,
+			wantKind: "",
+		},
+		{
+			name:     "clean ok response — result.kind=ok",
+			body:     `{"result":{"kind":"ok","parts":[{"text":"hello"}]}}`,
+			wantKind: "",
+		},
+		{
+			name:     "clean ok response — result.result_kind=ok",
+			body:     `{"result":{"result_kind":"ok","parts":[{"text":"hello"}]}}`,
+			wantKind: "",
+		},
+		{
+			name:     "SDK error — result.kind=rate_limited",
+			body:     `{"result":{"kind":"rate_limited","parts":[{"text":"error"}]}}`,
+			wantKind: "rate_limited",
+		},
+		{
+			name:     "SDK error — result.kind=quota_exhausted",
+			body:     `{"result":{"kind":"quota_exhausted"}}`,
+			wantKind: "quota_exhausted",
+		},
+		{
+			name:     "SDK error — result.kind=sdk_error",
+			body:     `{"result":{"kind":"sdk_error"}}`,
+			wantKind: "sdk_error",
+		},
+		{
+			name:     "SDK error — result.result_kind=rate_limited",
+			body:     `{"result":{"result_kind":"rate_limited"}}`,
+			wantKind: "rate_limited",
+		},
+		{
+			name:     "SDK error — error string with rate limit",
+			body:     `{"result":{"parts":[]},"error":"An error occurred: rate limit exceeded"}`,
+			wantKind: "rate_limited",
+		},
+		{
+			name:     "SDK error — error string with max-plan",
+			body:     `{"error":"Max-plan rate limit reached"}`,
+			wantKind: "quota_exhausted",
+		},
+		{
+			name:     "SDK error — error string with quota",
+			body:     `{"error":"quota exhausted for model"}`,
+			wantKind: "quota_exhausted",
+		},
+		{
+			name:     "SDK error — error string with sdk error",
+			body:     `{"error":"Claude Code returned an error result: success"}`,
+			wantKind: "sdk_error",
+		},
+		{
+			name:     "SDK error — error string with api key",
+			body:     `{"error":"invalid API key"}`,
+			wantKind: "sdk_error",
+		},
+		{
+			name:     "unknown error string — not an SDK error",
+			body:     `{"error":"something went wrong"}`,
+			wantKind: "",
+		},
+		{
+			name:     "empty response body",
+			body:     ``,
+			wantKind: "",
+		},
+		{
+			name:     "malformed JSON",
+			body:     `not valid json`,
+			wantKind: "",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := detectResultKind([]byte(tc.body))
+			if got != tc.wantKind {
+				t.Errorf("detectResultKind(%q) = %q, want %q", tc.body, got, tc.wantKind)
+			}
+		})
+	}
+}
+
+// ── TestFireSchedule_SDKError_RateLimited (#1696) ───────────────────────────────
+//
+// When ProxyA2ARequest returns HTTP 200 but the response body contains a
+// non-ok result_kind, fireSchedule must:
+//   1. Set last_status to the result_kind (not 'ok').
+//   2. Set last_error to describe the SDK error.
+//   3. Increment consecutive_sdk_errors.
+//   4. NOT auto-disable on first occurrence (threshold is 3).
+//
+// This test uses an sdkErrorProxy that returns a rate-limited body and asserts
+// the first run is recorded as 'rate_limited' with consecutive_sdk_errors=1
+// and enabled=true.
+func TestFireSchedule_SDKError_RateLimited(t *testing.T) {
+	mock := setupTestDB(t)
+
+	sched := scheduleRow{
+		ID:          "sdk1-test-sched-0001",
+		WorkspaceID: "sdk1-test-workspace1",
+		Name:        "rate-limited-job",
+		CronExpr:    "0 * * * *",
+		Timezone:    "UTC",
+		Prompt:      "do work",
+	}
+
+	// 1. active_tasks check → workspace idle
+	mock.ExpectQuery(`SELECT COALESCE`).
+		WillReturnRows(sqlmock.NewRows([]string{"coalesce"}).AddRow(0))
+
+	// 2. #1696 consecutive_sdk_errors bump — RETURNING gives us count=1
+	mock.ExpectExec(`UPDATE workspace_schedules`).
+		WithArgs(sched.ID).
+		WillReturnRows(sqlmock.NewRows([]string{"consecutive_sdk_errors"}).AddRow(1))
+
+	// 3. Post-fire UPDATE — last_status='rate_limited', last_error='SDK error: result_kind=rate_limited'
+	mock.ExpectExec(`UPDATE workspace_schedules`).
+		WithArgs(sched.ID, sqlmock.AnyArg(), "rate_limited", sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// 4. activity_logs INSERT
+	mock.ExpectExec(`INSERT INTO activity_logs`).
+		WithArgs(sched.WorkspaceID, sqlmock.AnyArg(), sqlmock.AnyArg(), "rate_limited", sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	s := New(&sdkErrorProxy{kind: "rate_limited"}, nil)
+	s.fireSchedule(context.Background(), sched)
+
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet DB expectations for SDK-error first run: %v", err)
+	}
+}
+
+// ── TestFireSchedule_SDKError_AutoDisableOnThirdConsecutive (#1696) ───────────
+//
+// On the 3rd consecutive SDK error, fireSchedule must auto-disable the
+// schedule (enabled=false) in addition to recording the error status.
+// Threshold is 3 per #1696 requirement.
+func TestFireSchedule_SDKError_AutoDisableOnThirdConsecutive(t *testing.T) {
+	mock := setupTestDB(t)
+
+	sched := scheduleRow{
+		ID:          "sdk2-test-sched-0002",
+		WorkspaceID: "sdk2-test-workspace2",
+		Name:        "auto-disable-job",
+		CronExpr:    "0 * * * *",
+		Timezone:    "UTC",
+		Prompt:      "do work",
+	}
+
+	// 1. active_tasks check → workspace idle
+	mock.ExpectQuery(`SELECT COALESCE`).
+		WillReturnRows(sqlmock.NewRows([]string{"coalesce"}).AddRow(0))
+
+	// 2. #1696 consecutive_sdk_errors bump — RETURNING gives count=3 (threshold met)
+	mock.ExpectExec(`UPDATE workspace_schedules`).
+		WithArgs(sched.ID).
+		WillReturnRows(sqlmock.NewRows([]string{"consecutive_sdk_errors"}).AddRow(3))
+
+	// 3. Auto-disable UPDATE — sets enabled=false (schedule has hit 3rd SDK error)
+	mock.ExpectExec(`UPDATE workspace_schedules SET enabled`).
+		WithArgs(sched.ID).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// 4. Post-fire UPDATE
+	mock.ExpectExec(`UPDATE workspace_schedules`).
+		WithArgs(sched.ID, sqlmock.AnyArg(), "rate_limited", sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// 5. activity_logs INSERT
+	mock.ExpectExec(`INSERT INTO activity_logs`).
+		WithArgs(sched.WorkspaceID, sqlmock.AnyArg(), sqlmock.AnyArg(), "rate_limited", sqlmock.AnyArg()).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	s := New(&sdkErrorProxy{kind: "rate_limited"}, nil)
+	s.fireSchedule(context.Background(), sched)
+
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet DB expectations for SDK-error auto-disable: %v", err)
+	}
+}
+
+// ── TestFireSchedule_SDKError_CounterResetOnCleanRun (#1696) ──────────────────
+//
+// A clean HTTP-200 run (no SDK error) must reset consecutive_sdk_errors to 0.
+// This prevents false auto-disable after intermittent SDK blips.
+func TestFireSchedule_SDKError_CounterResetOnCleanRun(t *testing.T) {
+	mock := setupTestDB(t)
+
+	sched := scheduleRow{
+		ID:          "sdk3-test-sched-0003",
+		WorkspaceID: "sdk3-test-workspace3",
+		Name:        "clean-reset-job",
+		CronExpr:    "30 * * * *",
+		Timezone:    "UTC",
+		Prompt:      "do work",
+	}
+
+	// 1. active_tasks check → workspace idle
+	mock.ExpectQuery(`SELECT COALESCE`).
+		WillReturnRows(sqlmock.NewRows([]string{"coalesce"}).AddRow(0))
+
+	// 2. No SDK error — #1696 counter is reset to 0
+	//    (lastStatus is 'ok', resultKind is empty, so we go to the reset branch)
+	mock.ExpectExec(`UPDATE workspace_schedules`).
+		WithArgs(sched.ID).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// 3. Post-fire UPDATE
+	mock.ExpectExec(`UPDATE workspace_schedules`).
+		WithArgs(sched.ID, sqlmock.AnyArg(), "ok", "").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	// 4. activity_logs INSERT
+	mock.ExpectExec(`INSERT INTO activity_logs`).
+		WithArgs(sched.WorkspaceID, sqlmock.AnyArg(), sqlmock.AnyArg(), "ok", "").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	s := New(&successProxy{}, nil)
+	s.fireSchedule(context.Background(), sched)
+
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet DB expectations for SDK-error counter reset: %v", err)
+	}
+}
+
+// ── sdkErrorProxy ──────────────────────────────────────────────────────────────
+//
+// sdkErrorProxy is a test double whose ProxyA2ARequest returns HTTP 200 but
+// embeds a non-ok result_kind in the response body, simulating a Claude Code
+// SDK that returned 200 but the inner LLM call threw a rate-limit / quota error.
+// Used by TestFireSchedule_SDKError_* to cover #1696 SDK error detection.
+type sdkErrorProxy struct {
+	kind string // result_kind value to embed in the response body
+}
+
+func (p *sdkErrorProxy) ProxyA2ARequest(
+	_ context.Context, _ string, _ []byte, _ string, _ bool,
+) (int, []byte, error) {
+	body, _ := json.Marshal(map[string]interface{}{
+		"result": map[string]interface{}{
+			"kind":  p.kind,
+			"parts": []map[string]interface{}{{"kind": "text", "text": "(no response generated)"}},
+		},
+	})
+	return 200, body, nil
+}
+
 // ── TestTruncate_utf8Safe_regression2026 ──────────────────────────────────────
 
 // TestTruncate_utf8Safe_regression2026 locks in the #2026 fix: truncate must
diff --git a/workspace-server/migrations/20260523000000_schedule_consecutive_sdk_errors.down.sql b/workspace-server/migrations/20260523000000_schedule_consecutive_sdk_errors.down.sql
new file mode 100644
index 000000000..cb0923cf9
--- /dev/null
+++ b/workspace-server/migrations/20260523000000_schedule_consecutive_sdk_errors.down.sql
@@ -0,0 +1,4 @@
+-- migration: 20260523000000_schedule_consecutive_sdk_errors.down.sql
+-- Reverts #1696 fix for #1696 (consecutive_sdk_errors column)
+
+ALTER TABLE workspace_schedules DROP COLUMN IF EXISTS consecutive_sdk_errors;
\ No newline at end of file
diff --git a/workspace-server/migrations/20260523000000_schedule_consecutive_sdk_errors.up.sql b/workspace-server/migrations/20260523000000_schedule_consecutive_sdk_errors.up.sql
new file mode 100644
index 000000000..62fac77ae
--- /dev/null
+++ b/workspace-server/migrations/20260523000000_schedule_consecutive_sdk_errors.up.sql
@@ -0,0 +1,20 @@
+-- migration: 20260523000000_schedule_consecutive_sdk_errors.up.sql
+-- Fixes #1696: Add consecutive_sdk_errors counter to track SDK errors (HTTP 200
+-- responses where the Claude Code runtime returned a non-ok result_kind).
+-- When this counter reaches 3, the scheduler sets last_status='rate_limited'
+-- and auto-disables the schedule.
+--
+-- The core issue: the claude-code-sdk adapter returns HTTP 200 even when the
+-- inner LLM call throws (e.g. Max-plan rate-limit). All 3 observed runs logged
+-- "completed (HTTP 200)" yet surfaced agent errors in the workspace chat.
+-- This counter lets us detect that pattern and escalate appropriately.
+
+ALTER TABLE workspace_schedules
+    ADD COLUMN IF NOT EXISTS consecutive_sdk_errors INTEGER NOT NULL DEFAULT 0;
+
+COMMENT ON COLUMN workspace_schedules.consecutive_sdk_errors IS
+    'Count of consecutive scheduler fires where ProxyA2ARequest returned HTTP 200
+     but the response body contained a non-ok result_kind (e.g. rate_limited,
+     sdk_error, quota_exhausted). Reset to 0 on any non-SDK-error status.
+     After 3 consecutive SDK errors the schedule is auto-disabled with
+     status rate_limited. Fixes #1696.';
\ No newline at end of file
-- 
2.52.0


From 1c26a99470ea337a582b72c21c1f1bd33540abb5 Mon Sep 17 00:00:00 2001
From: "Molecule AI Dev Engineer B (MiniMax)"
 <dev-engineer-b-minimax@agents.moleculesai.app>
Date: Sat, 23 May 2026 02:49:44 +0000
Subject: [PATCH 2/5] =?UTF-8?q?fix(scheduler):=20#1696=20=E2=80=94=20fix?=
 =?UTF-8?q?=20type=20error=20in=20detectResultKind=20top-level=20error=20c?=
 =?UTF-8?q?heck?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug: detectResultKind() used `top["error"].(string)` where `top` is
`map[string]json.RawMessage`. A json.RawMessage is []byte, not interface{},
so the type assertion fails at compile time (or more precisely, the index
expression returns json.RawMessage which doesn't support direct (string)
type assertion in Go 1.25).

Fix: extract the raw error field first, then json.Unmarshal it into string.
Also added missing `}` closing brace.

Fixes: CI / Platform (Go) failure "invalid operation: top[\"error\"]
(map index expression of slice type)"
---
 .../internal/scheduler/scheduler.go           | 34 ++++++++++---------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/workspace-server/internal/scheduler/scheduler.go b/workspace-server/internal/scheduler/scheduler.go
index ce8efbd69..326c27a46 100644
--- a/workspace-server/internal/scheduler/scheduler.go
+++ b/workspace-server/internal/scheduler/scheduler.go
@@ -871,26 +871,28 @@ func detectResultKind(body []byte) string {
 		}
 	}
 	// Top-level error: non-ok HTTP 200 with a structured error in the body.
-	if errMsg, ok := top["error"].(string); ok && errMsg != "" {
-		// Distinguish SDK errors from other errors. SDK-layer errors from the
-		// Claude Code runtime include specific markers.
-		lower := strings.ToLower(errMsg)
-		for _, marker := range []string{"rate limit", "rate_limit", "quota", "max-plan", "max_plan",
-			"claude code returned an error", "sdk error", "api key", "authentication"} {
-			if strings.Contains(lower, marker) {
-				// Map common SDK error strings to result_kind equivalents.
-				switch {
-				case strings.Contains(lower, "rate"):
-					return "rate_limited"
-				case strings.Contains(lower, "quota") || strings.Contains(lower, "max-plan") || strings.Contains(lower, "budget"):
-					return "quota_exhausted"
-				default:
-					return "sdk_error"
+	if rawErr, ok := top["error"]; ok {
+		var errMsg string
+		if err := json.Unmarshal(rawErr, &errMsg); err == nil && errMsg != "" {
+			// Distinguish SDK errors from other errors. SDK-layer errors from the
+			// Claude Code runtime include specific markers.
+			lower := strings.ToLower(errMsg)
+			for _, marker := range []string{"rate limit", "rate_limit", "quota", "max-plan", "max_plan",
+				"claude code returned an error", "sdk error", "api key", "authentication"} {
+				if strings.Contains(lower, marker) {
+					// Map common SDK error strings to result_kind equivalents.
+					switch {
+					case strings.Contains(lower, "rate"):
+						return "rate_limited"
+					case strings.Contains(lower, "quota") || strings.Contains(lower, "max-plan") || strings.Contains(lower, "budget"):
+						return "quota_exhausted"
+					default:
+						return "sdk_error"
+					}
 				}
 			}
 		}
 	}
-	return ""
 }
 
 // isEmptyResponse checks if an A2A response body indicates the agent
-- 
2.52.0


From 5452083ef32a095263d818c3655842317a1b4cc3 Mon Sep 17 00:00:00 2001
From: "Molecule AI Dev Engineer B (MiniMax)"
 <dev-engineer-b-minimax@agents.moleculesai.app>
Date: Sat, 23 May 2026 02:55:36 +0000
Subject: [PATCH 3/5] =?UTF-8?q?fix(scheduler):=20#1696=20=E2=80=94=20add?=
 =?UTF-8?q?=20missing=20return=20""=20in=20detectResultKind?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the top-level error check loop completes without a match, the
function must return "" rather than falling through to nothing (which
causes "missing return" compilation error in Go 1.25).
---
 workspace-server/internal/scheduler/scheduler.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/workspace-server/internal/scheduler/scheduler.go b/workspace-server/internal/scheduler/scheduler.go
index 326c27a46..8166af0bf 100644
--- a/workspace-server/internal/scheduler/scheduler.go
+++ b/workspace-server/internal/scheduler/scheduler.go
@@ -893,6 +893,7 @@ func detectResultKind(body []byte) string {
 			}
 		}
 	}
+	return ""
 }
 
 // isEmptyResponse checks if an A2A response body indicates the agent
-- 
2.52.0


From 6d92c0641082ef615d32fbefcd63d2486f84e0a4 Mon Sep 17 00:00:00 2001
From: "Molecule AI Dev Engineer B (MiniMax)"
 <dev-engineer-b-minimax@agents.moleculesai.app>
Date: Sat, 23 May 2026 02:59:49 +0000
Subject: [PATCH 4/5] =?UTF-8?q?fix(scheduler):=20#1696=20=E2=80=94=20use?=
 =?UTF-8?q?=20ExpectQuery=20not=20ExpectExec=20for=20RETURNING?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sqlmock's WillReturnRows() is only valid on ExpectedQuery, not
ExpectedExec. The consecutive_sdk_errors UPDATE uses RETURNING and
QueryRowContext.Scan(), which must be mocked with ExpectQuery.
WillReturnResult is correct for post-fire UPDATE (no RETURNING).
---
 .../internal/scheduler/scheduler_test.go             | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/workspace-server/internal/scheduler/scheduler_test.go b/workspace-server/internal/scheduler/scheduler_test.go
index 898d52a9b..9f16e9b30 100644
--- a/workspace-server/internal/scheduler/scheduler_test.go
+++ b/workspace-server/internal/scheduler/scheduler_test.go
@@ -802,8 +802,10 @@ func TestFireSchedule_SDKError_RateLimited(t *testing.T) {
 	mock.ExpectQuery(`SELECT COALESCE`).
 		WillReturnRows(sqlmock.NewRows([]string{"coalesce"}).AddRow(0))
 
-	// 2. #1696 consecutive_sdk_errors bump — RETURNING gives us count=1
-	mock.ExpectExec(`UPDATE workspace_schedules`).
+	// 2. #1696 consecutive_sdk_errors bump — RETURNING gives us count=1.
+	//    Use ExpectQuery (not Exec) because QueryRowContext + RETURNING
+	//    produces a result set consumed via .Scan().
+	mock.ExpectQuery(`UPDATE workspace_schedules`).
 		WithArgs(sched.ID).
 		WillReturnRows(sqlmock.NewRows([]string{"consecutive_sdk_errors"}).AddRow(1))
 
@@ -846,8 +848,10 @@ func TestFireSchedule_SDKError_AutoDisableOnThirdConsecutive(t *testing.T) {
 	mock.ExpectQuery(`SELECT COALESCE`).
 		WillReturnRows(sqlmock.NewRows([]string{"coalesce"}).AddRow(0))
 
-	// 2. #1696 consecutive_sdk_errors bump — RETURNING gives count=3 (threshold met)
-	mock.ExpectExec(`UPDATE workspace_schedules`).
+	// 2. #1696 consecutive_sdk_errors bump — RETURNING gives count=3 (threshold met).
+	//    Use ExpectQuery (not Exec) because QueryRowContext + RETURNING
+	//    produces a result set consumed via .Scan().
+	mock.ExpectQuery(`UPDATE workspace_schedules`).
 		WithArgs(sched.ID).
 		WillReturnRows(sqlmock.NewRows([]string{"consecutive_sdk_errors"}).AddRow(3))
 
-- 
2.52.0


From e3fabb8ca4f558239b3591f49f045810cd996ba9 Mon Sep 17 00:00:00 2001
From: "Molecule AI Dev Engineer B (MiniMax)"
 <dev-engineer-b-minimax@agents.moleculesai.app>
Date: Sat, 23 May 2026 03:10:08 +0000
Subject: [PATCH 5/5] =?UTF-8?q?fix(scheduler):=20#1696=20=E2=80=94=20check?=
 =?UTF-8?q?=20max-plan=20before=20rate=20in=20detectResultKind?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug: the original loop checked "rate limit" / "rate_limit" before
"max-plan", so "Max-plan rate limit reached" hit "rate" first and
returned rate_limited instead of quota_exhausted.

Fix: check max-plan/quota first (more specific), then rate limit,
then SDK error strings. Uses if/else chain instead of loop+switch.
---
 .../internal/scheduler/scheduler.go           | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/workspace-server/internal/scheduler/scheduler.go b/workspace-server/internal/scheduler/scheduler.go
index 8166af0bf..186a60be3 100644
--- a/workspace-server/internal/scheduler/scheduler.go
+++ b/workspace-server/internal/scheduler/scheduler.go
@@ -877,19 +877,16 @@ func detectResultKind(body []byte) string {
 			// Distinguish SDK errors from other errors. SDK-layer errors from the
 			// Claude Code runtime include specific markers.
 			lower := strings.ToLower(errMsg)
-			for _, marker := range []string{"rate limit", "rate_limit", "quota", "max-plan", "max_plan",
-				"claude code returned an error", "sdk error", "api key", "authentication"} {
-				if strings.Contains(lower, marker) {
-					// Map common SDK error strings to result_kind equivalents.
-					switch {
-					case strings.Contains(lower, "rate"):
-						return "rate_limited"
-					case strings.Contains(lower, "quota") || strings.Contains(lower, "max-plan") || strings.Contains(lower, "budget"):
-						return "quota_exhausted"
-					default:
-						return "sdk_error"
-					}
-				}
+			// Check more specific patterns first (max-plan quota > general rate).
+			if strings.Contains(lower, "max-plan") || strings.Contains(lower, "quota") || strings.Contains(lower, "budget") {
+				return "quota_exhausted"
+			}
+			if strings.Contains(lower, "rate limit") || strings.Contains(lower, "rate_limit") {
+				return "rate_limited"
+			}
+			if strings.Contains(lower, "claude code returned an error") || strings.Contains(lower, "sdk error") ||
+				strings.Contains(lower, "api key") || strings.Contains(lower, "authentication") {
+				return "sdk_error"
 			}
 		}
 	}
-- 
2.52.0