forked from molecule-ai/molecule-core
Problem observed 2026-04-16: Research Lead, Dev Lead, Security Auditor, and UIUX Designer were being auto-restarted by the liveness monitor every ~30 minutes, even though their containers were healthy and processing real work. A2A callers (PM, child agents) saw regular EOFs: A2A request to <leader-id> failed: Post http://ws-*:8000: EOF Followed in platform logs by: Liveness: workspace <id> TTL expired Auto-restart: restarting <name> (was: offline) Provisioner: stopped and removed container ws-* Root cause: the liveness key `ws:{id}` in Redis has a 60s TTL (platform/internal/db/redis.go). The workspace heartbeat loop (workspace-template/heartbeat.py) refreshes it every 30s. That leaves room for exactly ONE missed heartbeat before expiry. A busy Claude Code Opus synthesis can starve the container's asyncio scheduler for 60-120s (the SDK spawns the claude CLI subprocess and blocks until the message-reader yields; the heartbeat coroutine doesn't run during that window). Leaders running 5-minute orchestrator pulses or processing deep delegations routinely hit this. The platform then mistakes a busy-but-healthy container for a dead one, marks it offline, tears it down, and re-provisions — interrupting whatever work was mid-synthesis and generating a cascade of EOF errors on pending A2A calls. Fix: hoist the TTL into a named `LivenessTTL` constant and raise it to 180s. With a 30s heartbeat interval this now tolerates up to ~5 missed beats before expiry — comfortably longer than any realistic Opus stall, while still detecting genuinely-dead containers within 3 minutes. Safety: real crashes are still caught immediately by a2a_proxy's reactive IsRunning() check (maybeMarkContainerDead in a2a_proxy.go:439). That path doesn't depend on TTL; it fires on the first failed forward. So this PR only relaxes the "slow but alive" false-positive — dead-container detection is unchanged. 
Observed impact before fix (2026-04-16 ~06:40–06:49 UTC, 10-minute window, 4 containers affected): | Container | EOF errors | Forced restart | |-------------------|-----------:|:--------------:| | Dev Lead | 5 | yes (06:48) | | Research Lead | 5 | yes (06:47) | | Security Auditor | 5 | yes (06:49) | | UIUX Designer | 4 | no (not yet) | Expected impact after merge + redeploy: drop to ~0 forced restarts on healthy-busy leaders. If genuinely-stuck agents stop responding, the IsRunning check still catches them on the next A2A forward. Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
95 lines
3.1 KiB
Go
95 lines
3.1 KiB
Go
package db
|
||
|
||
import (
|
||
"context"
|
||
"fmt"
|
||
"log"
|
||
"time"
|
||
|
||
"github.com/redis/go-redis/v9"
|
||
)
|
||
|
||
// RDB is the shared Redis client for this package, assigned by InitRedis.
// NOTE(review): package-level mutable state — every helper below
// dereferences it, so InitRedis must succeed before any of them is
// called (a nil client will panic on first use).
var RDB *redis.Client
|
||
|
||
func InitRedis(redisURL string) error {
|
||
opts, err := redis.ParseURL(redisURL)
|
||
if err != nil {
|
||
return fmt.Errorf("parse redis url: %w", err)
|
||
}
|
||
RDB = redis.NewClient(opts)
|
||
|
||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||
defer cancel()
|
||
|
||
if err := RDB.Ping(ctx).Err(); err != nil {
|
||
return fmt.Errorf("ping redis: %w", err)
|
||
}
|
||
log.Println("Connected to Redis")
|
||
return nil
|
||
}
|
||
|
||
// LivenessTTL is the TTL for the workspace liveness key in Redis.
|
||
// Must be > heartbeat interval × (max acceptable missed heartbeats).
|
||
// Workspace heartbeat loop fires every 30s; a busy Claude Code / Opus
|
||
// synthesis can starve the asyncio scheduler for 60-120s, so a 60s TTL
|
||
// triggered false-positive "unreachable — restart" cycles on busy
|
||
// leaders every ~30 minutes (see README in this package + the commit
|
||
// message). 180s allows up to ~5 missed heartbeats before we conclude
|
||
// the container is actually dead, which still cleanly detects real
|
||
// crashes (the a2a_proxy reactive IsRunning() check catches those on
|
||
// the first failed forward, independent of TTL).
|
||
const LivenessTTL = 180 * time.Second
|
||
|
||
// SetOnline sets the workspace liveness key with the LivenessTTL.
|
||
func SetOnline(ctx context.Context, workspaceID string) error {
|
||
key := fmt.Sprintf("ws:%s", workspaceID)
|
||
return RDB.Set(ctx, key, "online", LivenessTTL).Err()
|
||
}
|
||
|
||
// RefreshTTL refreshes the liveness TTL for a workspace.
|
||
func RefreshTTL(ctx context.Context, workspaceID string) error {
|
||
key := fmt.Sprintf("ws:%s", workspaceID)
|
||
return RDB.Expire(ctx, key, LivenessTTL).Err()
|
||
}
|
||
|
||
// CacheURL caches a workspace URL for fast resolution.
|
||
func CacheURL(ctx context.Context, workspaceID, url string) error {
|
||
key := fmt.Sprintf("ws:%s:url", workspaceID)
|
||
return RDB.Set(ctx, key, url, 5*time.Minute).Err()
|
||
}
|
||
|
||
// GetCachedURL gets a cached workspace URL.
|
||
func GetCachedURL(ctx context.Context, workspaceID string) (string, error) {
|
||
key := fmt.Sprintf("ws:%s:url", workspaceID)
|
||
return RDB.Get(ctx, key).Result()
|
||
}
|
||
|
||
// CacheInternalURL caches the Docker-internal URL for workspace-to-workspace discovery.
|
||
func CacheInternalURL(ctx context.Context, workspaceID, url string) error {
|
||
key := fmt.Sprintf("ws:%s:internal_url", workspaceID)
|
||
return RDB.Set(ctx, key, url, 5*time.Minute).Err()
|
||
}
|
||
|
||
// GetCachedInternalURL gets the Docker-internal URL for a workspace.
|
||
func GetCachedInternalURL(ctx context.Context, workspaceID string) (string, error) {
|
||
key := fmt.Sprintf("ws:%s:internal_url", workspaceID)
|
||
return RDB.Get(ctx, key).Result()
|
||
}
|
||
|
||
// ClearWorkspaceKeys removes all Redis keys for a workspace (liveness, URL cache, internal URL cache).
|
||
func ClearWorkspaceKeys(ctx context.Context, workspaceID string) {
|
||
for _, suffix := range []string{"", ":url", ":internal_url"} {
|
||
RDB.Del(ctx, fmt.Sprintf("ws:%s%s", workspaceID, suffix))
|
||
}
|
||
}
|
||
|
||
// IsOnline checks if a workspace is online.
|
||
func IsOnline(ctx context.Context, workspaceID string) (bool, error) {
|
||
key := fmt.Sprintf("ws:%s", workspaceID)
|
||
val, err := RDB.Exists(ctx, key).Result()
|
||
if err != nil {
|
||
return false, err
|
||
}
|
||
return val > 0, nil
|
||
}
|