fix(provisioner): IsRunning conservative on daemon errors to stop restart cascade

Root cause of the 2026-04-16 09:10 UTC six-container restart cascade. ## Timeline 09:10:26 — PM sent a batch delegation to 15+ agents (Dev Lead coordinating). 09:10:26-27 — 4 leaders/auditors (Security, RL, BE, DevOps) simultaneously hit "workspace agent unreachable — container restart triggered" even though their containers were running fine. Another 2 (DL, UIUX) tripped in the next few seconds. 09:10:27 — Provisioner stopped + recreated 6 containers in parallel. A2A callers got EOFs, PM's batch coordination stalled. ## Root cause `provisioner.IsRunning` collapsed every ContainerInspect error into `(false, nil)`, including transient Docker daemon hiccups: func IsRunning(...) (bool, error) { info, err := p.cli.ContainerInspect(ctx, name) if err != nil { return false, nil // Container doesn't exist ← MISREAD } return info.State.Running, nil } The comment said "Container doesn't exist" but the error was actually any of: daemon timeout, socket EOF, context deadline, connection refused. Under load (batch delegation fan-out → 15 concurrent HTTP inbound → 15 concurrent Claude Code subprocesses → Docker daemon CPU pressure), ContainerInspect calls started failing transiently. All 6 calls returned `(false, nil)`. Caller `maybeMarkContainerDead` treated `running=false` as "container is dead, restart it" → six parallel restarts. This was exactly the destructive-on-error pattern we keep trying to kill (see #160 SDK-stderr-probe, #318 fail-open classes). ## Fix `IsRunning` now distinguishes NotFound from transient errors: - Legitimately missing container (caller deleted, Docker pruned) → `(false, nil)` — safe to act on; caller marks dead + restarts. - Any other error (daemon timeout, socket issue, context deadline) → `(true, err)` — caller stays on the alive path. The transient error is preserved so metrics + logging still see it, but it does NOT trigger the destructive restart branch. `isContainerNotFound` matches on error-message substring — same approach docker/cli uses internally — to avoid pulling in errdefs as a direct dep. Truth table tests in `isrunning_test.go` cover 8 cases: NotFound variants (real + generic), nil, empty, and the 4 transient- error shapes we've actually observed (deadline, EOF, connection-refused, i/o timeout). ## Caller update `maybeMarkContainerDead` in a2a_proxy.go now logs the transient inspect error (was silently discarded via `_`). Visibility without destructiveness. If this error becomes persistent, we'll see it in platform logs rather than diagnosing after another restart cascade. ## Expected impact - Zero restart cascades from the current class of transient inspect errors (EOF, timeout, connection refused). - Dead containers still detected within the A2A layer because an actual stopped container returns NotFound on inspect, and the TTL monitor (180s post #386) catches anything that slips through. - New visibility in platform logs when inspect has trouble — previously silent. Combined with the TTL fix in #386, the defense-in-depth on spurious restart is now: 1. IsRunning only returns false for real NotFound 2. Liveness TTL is 180s, surviving 5+ missed heartbeats 3. A2A proxy 503-Busy path retries with backoff before touching restart logic at all Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 02:21:25 -07:00 · 2026-04-16 02:21:25 -07:00 · 8bf27ae1d0
commit 8bf27ae1d0
parent 37b288c79b
3 changed files with 106 additions and 2 deletions
--- a/platform/internal/handlers/a2a_proxy.go
+++ b/platform/internal/handlers/a2a_proxy.go
@ -442,7 +442,15 @@ func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspace
 	if h.provisioner == nil || wsRuntime == "external" {
 		return false
 	}
-	if running, _ := h.provisioner.IsRunning(ctx, workspaceID); running {
+	running, inspectErr := h.provisioner.IsRunning(ctx, workspaceID)
+	if inspectErr != nil {
+		// Transient Docker-daemon error (timeout, socket EOF, etc.). Post-
+		// #386, IsRunning returns (true, err) in this case — caller stays
+		// on the alive path and does not trigger a restart cascade. Log
+		// so the defect is visible without being destructive.
+		log.Printf("ProxyA2A: IsRunning for %s returned transient error (assuming alive): %v", workspaceID, inspectErr)
+	}
+	if running {
 		return false
 	}
 	log.Printf("ProxyA2A: container for %s is dead — marking offline and triggering restart", workspaceID)
--- a/platform/internal/provisioner/isrunning_test.go
+++ b/platform/internal/provisioner/isrunning_test.go
@ -0,0 +1,51 @@
+package provisioner
+
+import (
+	"errors"
+	"testing"
+)
+
+// isContainerNotFound is the chokepoint that decides whether IsRunning
+// tears down a workspace. Getting this wrong (false positive) causes
+// the restart cascade observed 2026-04-16 09:10 UTC when 6 containers
+// got simultaneous A2A forward failures, their reactive IsRunning
+// checks all hit a busy Docker daemon, timed out, and got flipped to
+// "dead" in parallel. These tests pin the truth table.
+
+func TestIsContainerNotFound(t *testing.T) {
+	cases := []struct {
+		name string
+		err  error
+		want bool
+	}{
+		{"nil", nil, false},
+		{"docker not-found message",
+			errors.New(`Error response from daemon: No such container: ws-abc123`),
+			true},
+		{"generic not found",
+			errors.New("container not found"),
+			true},
+		{"context deadline",
+			errors.New("context deadline exceeded"),
+			false},
+		{"socket EOF",
+			errors.New(`Get "http://%2Fvar%2Frun%2Fdocker.sock/...": EOF`),
+			false},
+		{"daemon connection refused",
+			errors.New("dial unix /var/run/docker.sock: connect: connection refused"),
+			false},
+		{"i/o timeout",
+			errors.New("read unix @->/var/run/docker.sock: i/o timeout"),
+			false},
+		{"empty string",
+			errors.New(""),
+			false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := isContainerNotFound(tc.err); got != tc.want {
+				t.Errorf("isContainerNotFound(%q) = %v, want %v", tc.err, got, tc.want)
+			}
+		})
+	}
+}
--- a/platform/internal/provisioner/provisioner.go
+++ b/platform/internal/provisioner/provisioner.go
@ -749,15 +749,60 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
 }

 // IsRunning checks if a workspace container is currently running.
+//
+// Conservative on transient Docker errors: returns (true, err) for any
+// inspect failure OTHER than NotFound. Rationale: the only caller that
+// acts destructively on `running=false` is a2a_proxy.maybeMarkContainerDead,
+// which tears down + re-provisions the workspace. A Docker daemon hiccup
+// (timeout, EOF on the daemon socket, context deadline) is NOT evidence
+// that the container died — it's evidence the daemon is momentarily busy.
+// The old behaviour collapsed all errors into "container doesn't exist",
+// which triggered a restart cascade on 2026-04-16 when 6 containers
+// received simultaneous A2A forward failures during a batch delegation;
+// the followup reactive IsRunning calls all hit the daemon under load,
+// timed out, and flipped every container to "dead" in parallel.
+//
+// NotFound (container legitimately deleted) is the only case where
+// running=false is safe to act on — every other error path stays alive
+// so a real crash still surfaces via exec heartbeat or TTL, both of which
+// have narrower false-positive windows than daemon-inspect RPC.
 func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
 	name := ContainerName(workspaceID)
 	info, err := p.cli.ContainerInspect(ctx, name)
 	if err != nil {
-		return false, nil // Container doesn't exist
+		if isContainerNotFound(err) {
+			return false, nil
+		}
+		// Transient daemon error: caller treats !running as dead + restarts.
+		// Returning true + the underlying error preserves the error for
+		// metrics/logging without triggering the destructive path.
+		return true, err
 	}
 	return info.State.Running, nil
 }

+// isContainerNotFound returns true when the Docker client indicates the
+// named container genuinely does not exist, versus a transient daemon
+// error (timeout, socket EOF, context cancellation).
+//
+// docker/docker v28 uses multiple distinct NotFound shapes depending on
+// transport:
+//   - the typed errdefs.ErrNotFound
+//   - a wrapped error whose message contains "No such container"
+//
+// Rather than import errdefs (which would add a transitive dep), we
+// match on the error string. String-matching is the exact approach the
+// Docker CLI itself uses internally — see the "No such container" check
+// in docker/cli — and is stable across daemon versions.
+func isContainerNotFound(err error) bool {
+	if err == nil {
+		return false
+	}
+	s := err.Error()
+	return strings.Contains(s, "No such container") ||
+		strings.Contains(s, "not found")
+}
+
 // DockerClient returns the underlying Docker client for sharing with other handlers.
 func (p *Provisioner) DockerClient() *client.Client {
 	return p.cli