forked from molecule-ai/molecule-core
Heartbeat lies. The asyncio task that POSTs /registry/heartbeat lives in its own process slot, so a workspace whose claude_agent_sdk has wedged on `Control request timeout: initialize` keeps reporting "online" — every chat send hangs the full 5-min platform deadline even though the runtime is dead in the water. This commit teaches the workspace to admit it's wedged and the platform to honor that admission by flipping status → degraded. Five layers, all in one commit because they share a contract: 1. Migration 043 — convert workspaces.status from free-form TEXT to a real `workspace_status` Postgres ENUM with the 6 values production code actually writes (provisioning, online, offline, degraded, failed, removed). Locks the value set; future typo writes error at the DB instead of silently storing rogue strings. Down migration reverts to TEXT and drops the type. 2. workspace-server/internal/models — `HeartbeatPayload` gains a `runtime_state string` field. Empty = healthy. Currently the only non-empty value the handler honors is "wedged"; future symptoms can extend without another migration. 3. workspace-server/internal/handlers/registry.go — `evaluateStatus` gains a wedge branch BEFORE the existing error_rate >= 0.5 path: if `RuntimeState=="wedged"` and currently online, flip to degraded and broadcast WORKSPACE_DEGRADED with the wedge sample error. Recovery (`degraded → online`) now requires BOTH error_rate < 0.1 AND runtime_state cleared, so a workspace still reporting wedged stays degraded even when its error count happens to be 0 (the wedge captures a runtime state, not an error count). 4. workspace/claude_sdk_executor.py — module-level `_sdk_wedged_reason` flag set when execute()'s catch block sees an error matching `_WEDGE_ERROR_PATTERNS` (currently just "control request timeout"). Sticky for the process lifetime; the SDK's internal client-process state is corrupted on this error and only a workspace restart (= new Python process = fresh module state) clears it. Helpers `is_wedged()` / `wedge_reason()` / `_reset_sdk_wedge_for_test()` exposed. 5. workspace/heartbeat.py — heartbeat body now layers on `_runtime_state_payload()` for both the happy path and the 401-retry path. Lazy-imports claude_sdk_executor so non-Claude runtimes (where the module may not even be importable) keep working unchanged. Canvas required no changes — `STATUS_CONFIG.degraded` was already defined in design-tokens.ts (amber dot, "Degraded" label) and WorkspaceNode.tsx already renders `lastSampleError` underneath the status pill when status === "degraded". The existing wiring just never fired because nothing was writing degraded in this code path. Tests: - 3 Go handler tests for the new transitions (online → degraded on wedged, degraded stays put while still wedged, degraded → online after wedge clears) - 5 Python wedge-detector tests (default clean, mark sets flag, sticky-first-wins, execute() flips on Control request timeout, execute() does NOT flip on unrelated errors) - Migration smoke-tested against the local dev DB (3 existing rows, all enum-compatible; migration applied cleanly, post-state has the column as workspace_status type and the index preserved) Verified: 79 Python tests pass; full Go test suite passes; migration applies clean on a real DB; reverse migration restores the column to TEXT. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
115 lines
5.4 KiB
Go
115 lines
5.4 KiB
Go
package models
|
|
|
|
import (
|
|
"database/sql"
|
|
"encoding/json"
|
|
"time"
|
|
)
|
|
|
|
type Workspace struct {
|
|
ID string `json:"id" db:"id"`
|
|
Name string `json:"name" db:"name"`
|
|
Role sql.NullString `json:"role" db:"role"`
|
|
Tier int `json:"tier" db:"tier"`
|
|
AwarenessNamespace sql.NullString `json:"awareness_namespace" db:"awareness_namespace"`
|
|
Status string `json:"status" db:"status"`
|
|
SourceBundleID sql.NullString `json:"source_bundle_id" db:"source_bundle_id"`
|
|
AgentCard json.RawMessage `json:"agent_card" db:"agent_card"`
|
|
URL sql.NullString `json:"url" db:"url"`
|
|
ParentID *string `json:"parent_id" db:"parent_id"`
|
|
ForwardedTo *string `json:"forwarded_to" db:"forwarded_to"`
|
|
LastHeartbeatAt *time.Time `json:"last_heartbeat_at" db:"last_heartbeat_at"`
|
|
LastErrorRate float64 `json:"last_error_rate" db:"last_error_rate"`
|
|
LastSampleError sql.NullString `json:"last_sample_error" db:"last_sample_error"`
|
|
ActiveTasks int `json:"active_tasks" db:"active_tasks"`
|
|
MaxConcurrentTasks int `json:"max_concurrent_tasks" db:"max_concurrent_tasks"`
|
|
UptimeSeconds int `json:"uptime_seconds" db:"uptime_seconds"`
|
|
CreatedAt time.Time `json:"created_at" db:"created_at"`
|
|
UpdatedAt time.Time `json:"updated_at" db:"updated_at"`
|
|
// Canvas layout fields (from JOIN)
|
|
X float64 `json:"x"`
|
|
Y float64 `json:"y"`
|
|
Collapsed bool `json:"collapsed"`
|
|
}
|
|
|
|
type RegisterPayload struct {
|
|
ID string `json:"id" binding:"required"`
|
|
URL string `json:"url" binding:"required"`
|
|
AgentCard json.RawMessage `json:"agent_card" binding:"required"`
|
|
}
|
|
|
|
type HeartbeatPayload struct {
|
|
WorkspaceID string `json:"workspace_id" binding:"required"`
|
|
ErrorRate float64 `json:"error_rate"`
|
|
SampleError string `json:"sample_error"`
|
|
ActiveTasks int `json:"active_tasks"`
|
|
UptimeSeconds int `json:"uptime_seconds"`
|
|
CurrentTask string `json:"current_task"`
|
|
// MonthlySpend is cumulative USD spend for the current calendar month,
|
|
// denominated in cents (e.g. 1500 = $15.00). Zero means "no update" —
|
|
// the heartbeat handler never writes zero to avoid accidentally clearing
|
|
// a previously-reported spend value. Any non-zero value is clamped to
|
|
// [0, maxMonthlySpend] before the DB write. (#615)
|
|
MonthlySpend int64 `json:"monthly_spend"`
|
|
// RuntimeState is a self-reported runtime health flag separate from
|
|
// "is the heartbeat task firing at all". The heartbeat task lives in
|
|
// its own asyncio task and keeps pinging even when the agent runtime
|
|
// is wedged (e.g. claude_agent_sdk's `Control request timeout:
|
|
// initialize` leaves the SDK in a permanent error state for the
|
|
// process lifetime). RuntimeState is how the workspace tells the
|
|
// platform "I'm alive but my Claude runtime is broken — flip me to
|
|
// degraded so the canvas can show a Restart hint."
|
|
//
|
|
// Empty string = healthy / no signal. The only currently-recognised
|
|
// non-empty value is "wedged"; future values can extend this without
|
|
// migration.
|
|
RuntimeState string `json:"runtime_state"`
|
|
}
|
|
|
|
type UpdateCardPayload struct {
|
|
WorkspaceID string `json:"workspace_id" binding:"required"`
|
|
AgentCard json.RawMessage `json:"agent_card" binding:"required"`
|
|
}
|
|
|
|
// MemorySeed represents an initial memory to seed into a workspace at creation time.
|
|
// Used by both the POST /workspaces API and org template import to pre-populate
|
|
// agent memories from config (issue #1050).
|
|
type MemorySeed struct {
|
|
Content string `json:"content" yaml:"content"`
|
|
Scope string `json:"scope" yaml:"scope"` // LOCAL, TEAM, GLOBAL
|
|
}
|
|
|
|
type CreateWorkspacePayload struct {
|
|
Name string `json:"name" binding:"required"`
|
|
Role string `json:"role"`
|
|
Template string `json:"template"` // workspace-configs-templates folder name
|
|
Tier int `json:"tier"`
|
|
Model string `json:"model"`
|
|
Runtime string `json:"runtime"` // "langgraph" (default), "claude-code", etc.
|
|
External bool `json:"external"` // true = no Docker container, just a registered URL
|
|
URL string `json:"url"` // for external workspaces: the A2A endpoint URL
|
|
WorkspaceDir string `json:"workspace_dir"` // host path to mount as /workspace (empty = isolated volume)
|
|
WorkspaceAccess string `json:"workspace_access"` // "none" (default), "read_only", or "read_write" — see #65
|
|
ParentID *string `json:"parent_id"`
|
|
// BudgetLimit is the optional monthly spend ceiling in USD cents.
|
|
// NULL (omitted) means no limit. budget_limit=500 means $5.00/month.
|
|
BudgetLimit *int64 `json:"budget_limit"`
|
|
// Secrets is an optional map of key→plaintext-value pairs to persist as
|
|
// workspace secrets at creation time. Stored encrypted (same path as
|
|
// POST /workspaces/:id/secrets). Nil/empty map is a no-op.
|
|
Secrets map[string]string `json:"secrets"`
|
|
Canvas struct {
|
|
X float64 `json:"x"`
|
|
Y float64 `json:"y"`
|
|
} `json:"canvas"`
|
|
// InitialMemories is an optional list of memories to seed into the
|
|
// workspace immediately after creation. Each entry is inserted into
|
|
// agent_memories with the workspace's awareness namespace. Issue #1050.
|
|
InitialMemories []MemorySeed `json:"initial_memories"`
|
|
}
|
|
|
|
type CheckAccessPayload struct {
|
|
CallerID string `json:"caller_id" binding:"required"`
|
|
TargetID string `json:"target_id" binding:"required"`
|
|
}
|