Compare commits

...

1 Commits

Author SHA1 Message Date
Molecule AI Dev Engineer A (Kimi) c768101cb6 fix(workspace-server): http client timeouts, panic recovery, and error checks (re-created from staging #2045)
ci-arm64-advisory / fast-checks (pull_request) Waiting to run
Lint shellcheck (arm64 pilot) / shellcheck-arm64 (pilot) (pull_request) Successful in 2s
CI / Detect changes (pull_request) Successful in 5s
CI / Python Lint & Test (pull_request) Successful in 3s
E2E API Smoke Test / detect-changes (pull_request) Successful in 8s
E2E Chat / detect-changes (pull_request) Successful in 7s
E2E Staging Canvas (Playwright) / detect-changes (pull_request) Successful in 7s
E2E Peer Visibility (literal MCP list_peers) / E2E Peer Visibility (local) (pull_request) Successful in 31s
E2E Peer Visibility (literal MCP list_peers) / E2E Peer Visibility (pull_request) Successful in 3s
Harness Replays / detect-changes (pull_request) Successful in 3s
Lint forbidden tenant-env keys / Scan workspace_secrets writers for forbidden env keys (pull_request) Successful in 3s
Handlers Postgres Integration / detect-changes (pull_request) Successful in 5s
Lint forbidden tenant-env keys / Scan for repo-host token write into tenant workspace surface (pull_request) Successful in 2s
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 3s
Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 1m6s
qa-review / approved (pull_request_target) Failing after 4s
CI / Canvas (Next.js) (pull_request) Successful in 1s
CI / Shellcheck (E2E scripts) (pull_request) Successful in 1s
security-review / approved (pull_request_target) Failing after 39s
E2E Chat / E2E Chat (pull_request) Successful in 18s
lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 59s
Harness Replays / Harness Replays (pull_request) Successful in 1s
E2E API Smoke Test / E2E API Smoke Test (pull_request) Successful in 55s
E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Successful in 1m11s
CI / Canvas Deploy Reminder (pull_request) Has been skipped
CI / Platform (Go) (pull_request) Successful in 3m55s
E2E Staging SaaS (full lifecycle) / pr-validate (pull_request) Waiting to run
E2E Staging SaaS (full lifecycle) / E2E Staging SaaS (pull_request) Waiting to run
E2E Staging SaaS (full lifecycle) / E2E Staging Platform Boot (pull_request) Waiting to run
E2E Staging External Runtime / E2E Staging External Runtime (pull_request) Successful in 5m15s
CI / all-required (pull_request) Successful in 50s
Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Successful in 6m7s
gate-check-v3 / gate-check (pull_request_target) Successful in 4s
sop-checklist / review-refire (pull_request_target) Has been skipped
sop-checklist / all-items-acked (pull_request) [info tier:low] acked: 0/7 — missing: comprehensive-testing, local-postgres-e2e, staging-smoke, +4
sop-checklist / na-declarations (pull_request) N/A: (none)
sop-checklist / all-items-acked (pull_request_target) Successful in 3s
sop-tier-check / tier-check (pull_request_target) Failing after 5s
- cp_config.go: replace http.DefaultClient with 10s timeout client.
- bundle/importer.go: add panic recovery in provision goroutine.
- a2a_proxy.go: add panic recovery in SSE idle watcher goroutine.
- discovery.go: pass context to queryPeerMaps and use QueryRowContext.
- terminal.go: add panic recovery in stdout/PTY/stdin goroutines.
- workspace.go: add deferred tx.Rollback in Create handler.
- middleware/mcp_ratelimit.go, ratelimit.go, session_auth.go: add panic
  recovery in background cleanup goroutines.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-04 06:22:34 +00:00
9 changed files with 53 additions and 9 deletions
+2 -1
View File
@@ -61,7 +61,8 @@ func refreshEnvFromCP() error {
req.Header.Set("Authorization", "Bearer "+adminToken)
req.Header.Set("X-Molecule-Org-Id", orgID)
resp, err := http.DefaultClient.Do(req)
client := &http.Client{Timeout: 10 * time.Second}
resp, err := client.Do(req)
if err != nil {
return fmt.Errorf("do request: %w", err)
}
@@ -89,6 +89,11 @@ func Import(
// PluginsPath set by caller if available
}
go func() {
defer func() {
if r := recover(); r != nil {
log.Printf("bundle/importer: PANIC during provision start for %s: %v", wsID, r)
}
}()
provCtx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
defer cancel()
url, err := prov.Start(provCtx, cfg)
@@ -1002,7 +1002,12 @@ func applyIdleTimeout(parent context.Context, b *events.Broadcaster, workspaceID
// completed when t.Cleanup fires. Does NOT read db.DB; idle-timer
// management only.
go func() {
defer unsub()
defer func() {
if r := recover(); r != nil {
log.Printf("a2a_proxy: PANIC in SSE idle watcher for %s: %v", workspaceID, r)
}
unsub()
}()
timer := time.NewTimer(idle)
defer timer.Stop()
for {
@@ -249,7 +249,7 @@ func (h *DiscoveryHandler) Peers(c *gin.Context) {
// parent_id-bound branch enumerates siblings, and that is already scoped to
// one parent (one tenant).
if parentID.Valid {
siblings, _ := queryPeerMaps(`
siblings, _ := queryPeerMaps(ctx, `
SELECT w.id, w.name, COALESCE(w.role, ''), w.tier, w.status,
COALESCE(w.agent_card, 'null'::jsonb), COALESCE(w.url, ''),
w.parent_id, w.active_tasks
@@ -268,7 +268,7 @@ func (h *DiscoveryHandler) Peers(c *gin.Context) {
// self-delegation 400 in a tight loop (#383). The `w.id != $2`
// clause makes self-delegation-via-peer-list impossible regardless
// of DB state.
children, _ := queryPeerMaps(`
children, _ := queryPeerMaps(ctx, `
SELECT w.id, w.name, COALESCE(w.role, ''), w.tier, w.status,
COALESCE(w.agent_card, 'null'::jsonb), COALESCE(w.url, ''),
w.parent_id, w.active_tasks
@@ -281,7 +281,7 @@ func (h *DiscoveryHandler) Peers(c *gin.Context) {
// propagate that corruption back to the agent as a "peer who is also
// you" entry.
if parentID.Valid {
parent, _ := queryPeerMaps(`
parent, _ := queryPeerMaps(ctx, `
SELECT w.id, w.name, COALESCE(w.role, ''), w.tier, w.status,
COALESCE(w.agent_card, 'null'::jsonb), COALESCE(w.url, ''),
w.parent_id, w.active_tasks
@@ -350,8 +350,8 @@ func filterPeersByQuery(peers []map[string]interface{}, q string) []map[string]i
}
// queryPeerMaps returns clean JSON-serializable maps instead of Workspace structs.
func queryPeerMaps(query string, args ...interface{}) ([]map[string]interface{}, error) {
rows, err := db.DB.Query(query, args...)
func queryPeerMaps(ctx context.Context, query string, args ...interface{}) ([]map[string]interface{}, error) {
rows, err := db.DB.QueryContext(ctx, query, args...)
if err != nil {
log.Printf("queryPeerMaps error: %v", err)
return nil, err
+17 -2
View File
@@ -217,7 +217,12 @@ func (h *TerminalHandler) handleLocalConnect(c *gin.Context, workspaceID string)
// synchronously. No db.DB access on this path.
done := make(chan struct{})
go func() {
defer close(done)
defer func() {
if r := recover(); r != nil {
log.Printf("Terminal: PANIC in stdout bridge: %v", r)
}
close(done)
}()
buf := make([]byte, 4096)
for {
n, err := resp.Reader.Read(buf)
@@ -440,7 +445,12 @@ func (h *TerminalHandler) handleRemoteConnect(c *gin.Context, workspaceID, insta
// goAsync-exempt (RFC internal#524 Layer 2.2): WebSocket-lifetime
// I/O bridge; handler blocks on `done` below. No db.DB access.
go func() {
defer close(done)
defer func() {
if r := recover(); r != nil {
log.Printf("Terminal: PANIC in PTY bridge: %v", r)
}
close(done)
}()
buf := make([]byte, 4096)
for {
n, err := ptmx.Read(buf)
@@ -463,6 +473,11 @@ func (h *TerminalHandler) handleRemoteConnect(c *gin.Context, workspaceID, insta
// WebSocket → PTY (stdin)
// goAsync-exempt (RFC internal#524 Layer 2.2): see above.
go func() {
defer func() {
if r := recover(); r != nil {
log.Printf("Terminal: PANIC in stdin loop: %v", r)
}
}()
for {
_, msg, rErr := conn.ReadMessage()
if rErr != nil {
@@ -556,6 +556,7 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to create workspace"})
return
}
defer func() { _ = tx.Rollback() }()
maxConcurrent := payload.MaxConcurrentTasks
if maxConcurrent <= 0 {
@@ -4,6 +4,7 @@ import (
"context"
"crypto/sha256"
"fmt"
"log"
"net/http"
"strconv"
"strings"
@@ -41,6 +42,11 @@ func NewMCPRateLimiter(rate int, interval time.Duration, ctx context.Context) *M
interval: interval,
}
go func() {
defer func() {
if r := recover(); r != nil {
log.Printf("mcp_ratelimit: PANIC in bucket cleanup: %v", r)
}
}()
ticker := time.NewTicker(5 * time.Minute)
defer ticker.Stop()
for {
@@ -3,6 +3,7 @@ package middleware
import (
"context"
"log"
"net/http"
"strconv"
"strings"
@@ -35,6 +36,11 @@ func NewRateLimiter(rate int, interval time.Duration, ctx context.Context) *Rate
interval: interval,
}
go func() {
defer func() {
if r := recover(); r != nil {
log.Printf("ratelimit: PANIC in bucket cleanup: %v", r)
}
}()
ticker := time.NewTicker(5 * time.Minute)
defer ticker.Stop()
for {
@@ -116,6 +116,11 @@ func sessionCachePut(key string, ok bool) {
func init() {
go func() {
defer func() {
if r := recover(); r != nil {
log.Printf("session_auth: PANIC in cache sweeper: %v", r)
}
}()
// Jitter startup so restarts don't align sweeps.
time.Sleep(time.Duration(rand.Int64N(int64(sessionCacheSweepEvery))))
t := time.NewTicker(sessionCacheSweepEvery)