From 8bf27ae1d0f7f7a73fa27bf92bfc524ad94dfc11 Mon Sep 17 00:00:00 2001
From: rabbitblood
Date: Thu, 16 Apr 2026 02:21:25 -0700
Subject: [PATCH] fix(provisioner): IsRunning conservative on daemon errors to
 stop restart cascade
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root cause of the 2026-04-16 09:10 UTC six-container restart cascade.

## Timeline

09:10:26    — PM sent a batch delegation to 15+ agents (Dev Lead coordinating).
09:10:26-27 — 4 leaders/auditors (Security, RL, BE, DevOps) simultaneously hit
              "workspace agent unreachable — container restart triggered" even
              though their containers were running fine. Another 2 (DL, UIUX)
              tripped in the next few seconds.
09:10:27    — Provisioner stopped + recreated 6 containers in parallel. A2A
              callers got EOFs and PM's batch coordination stalled.

## Root cause

`provisioner.IsRunning` collapsed every ContainerInspect error into
`(false, nil)`, including transient Docker daemon hiccups:

    func IsRunning(...) (bool, error) {
        info, err := p.cli.ContainerInspect(ctx, name)
        if err != nil {
            return false, nil // Container doesn't exist  ← MISREAD
        }
        return info.State.Running, nil
    }

The comment said "Container doesn't exist", but the error could just as easily
be a daemon timeout, socket EOF, context deadline, or connection refused.

Under load (batch delegation fan-out → 15 concurrent inbound HTTP requests →
15 concurrent Claude Code subprocesses → Docker daemon CPU pressure),
ContainerInspect calls started failing transiently. All 6 calls returned
`(false, nil)`. The caller `maybeMarkContainerDead` treated `running=false` as
"container is dead, restart it" → six parallel restarts.

This was exactly the destructive-on-error pattern we keep trying to kill
(see #160 SDK-stderr-probe, #318 fail-open classes).

## Fix

`IsRunning` now distinguishes NotFound from transient errors:

- Legitimately missing container (caller deleted it, Docker pruned it) →
  `(false, nil)` — safe to act on; the caller marks it dead + restarts.
- Any other error (daemon timeout, socket issue, context deadline) →
  `(true, err)` — the caller stays on the alive path. The transient error is
  preserved so metrics + logging still see it, but it does NOT trigger the
  destructive restart branch.

`isContainerNotFound` matches on an error-message substring — the same
approach docker/cli uses internally — to avoid pulling in errdefs as a direct
dep (the typed alternative is sketched at the end of this message).

Truth-table tests in `isrunning_test.go` cover 8 cases: NotFound variants
(real + generic), nil, empty, and the 4 transient-error shapes we have
actually observed (deadline, EOF, connection-refused, i/o timeout).

## Caller update

`maybeMarkContainerDead` in a2a_proxy.go now logs the transient inspect error
(it was previously discarded via `_`). Visibility without destructiveness: if
the error becomes persistent, we will see it in platform logs rather than
diagnosing it after another restart cascade.

## Expected impact

- Zero restart cascades from the current class of transient inspect errors
  (EOF, timeout, connection refused).
- Dead containers are still detected within the A2A layer: a container that
  has genuinely stopped still inspects successfully with State.Running=false,
  a removed one returns NotFound, and the TTL monitor (180s post #386) catches
  anything that slips through.
- New visibility in platform logs when inspect has trouble — previously silent.

Combined with the TTL fix in #386, the defense-in-depth against spurious
restarts is now:

1. IsRunning only returns false for a real NotFound.
2. The liveness TTL is 180s, surviving 5+ missed heartbeats.
3. The A2A proxy 503-Busy path retries with backoff before touching restart
   logic at all.
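
For reference, here is a minimal sketch of the typed-error alternative
mentioned under "Fix". It is not part of this patch: the helper name is
hypothetical, and it assumes a direct import of
github.com/docker/docker/errdefs, which is exactly the dependency this change
avoids.

    package provisioner

    import "github.com/docker/docker/errdefs"

    // isContainerNotFoundTyped is the rejected alternative: errdefs.IsNotFound
    // unwraps the error chain and matches the daemon's typed NotFound error,
    // independent of message wording, at the cost of a direct errdefs import.
    func isContainerNotFoundTyped(err error) bool {
        return errdefs.IsNotFound(err)
    }

Note the typed check would not match the plain "container not found" message
shape that isrunning_test.go also pins; the string match covers both.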

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 platform/internal/handlers/a2a_proxy.go      | 10 +++-
 .../internal/provisioner/isrunning_test.go   | 51 +++++++++++++++++++
 platform/internal/provisioner/provisioner.go | 47 ++++++++++++++++-
 3 files changed, 106 insertions(+), 2 deletions(-)
 create mode 100644 platform/internal/provisioner/isrunning_test.go

diff --git a/platform/internal/handlers/a2a_proxy.go b/platform/internal/handlers/a2a_proxy.go
index 70da8928..f6ec9ce4 100644
--- a/platform/internal/handlers/a2a_proxy.go
+++ b/platform/internal/handlers/a2a_proxy.go
@@ -442,7 +442,15 @@ func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspace
     if h.provisioner == nil || wsRuntime == "external" {
         return false
     }
-    if running, _ := h.provisioner.IsRunning(ctx, workspaceID); running {
+    running, inspectErr := h.provisioner.IsRunning(ctx, workspaceID)
+    if inspectErr != nil {
+        // Transient Docker-daemon error (timeout, socket EOF, etc.).
+        // IsRunning now returns (true, err) in this case — the caller
+        // stays on the alive path and does not trigger a restart cascade.
+        // Log it so the defect is visible without being destructive.
+        log.Printf("ProxyA2A: IsRunning for %s returned transient error (assuming alive): %v", workspaceID, inspectErr)
+    }
+    if running {
         return false
     }
     log.Printf("ProxyA2A: container for %s is dead — marking offline and triggering restart", workspaceID)
diff --git a/platform/internal/provisioner/isrunning_test.go b/platform/internal/provisioner/isrunning_test.go
new file mode 100644
index 00000000..0f5c587c
--- /dev/null
+++ b/platform/internal/provisioner/isrunning_test.go
@@ -0,0 +1,51 @@
+package provisioner
+
+import (
+    "errors"
+    "testing"
+)
+
+// isContainerNotFound is the chokepoint that decides whether IsRunning
+// tears down a workspace. Getting this wrong (false positive) causes
+// the restart cascade observed 2026-04-16 09:10 UTC, when 6 containers
+// got simultaneous A2A forward failures, their reactive IsRunning
+// checks all hit a busy Docker daemon, timed out, and got flipped to
+// "dead" in parallel. These tests pin the truth table.
+
+func TestIsContainerNotFound(t *testing.T) {
+    cases := []struct {
+        name string
+        err  error
+        want bool
+    }{
+        {"nil", nil, false},
+        {"docker not-found message",
+            errors.New(`Error response from daemon: No such container: ws-abc123`),
+            true},
+        {"generic not found",
+            errors.New("container not found"),
+            true},
+        {"context deadline",
+            errors.New("context deadline exceeded"),
+            false},
+        {"socket EOF",
+            errors.New(`Get "http://%2Fvar%2Frun%2Fdocker.sock/...": EOF`),
+            false},
+        {"daemon connection refused",
+            errors.New("dial unix /var/run/docker.sock: connect: connection refused"),
+            false},
+        {"i/o timeout",
+            errors.New("read unix @->/var/run/docker.sock: i/o timeout"),
+            false},
+        {"empty string",
+            errors.New(""),
+            false},
+    }
+    for _, tc := range cases {
+        t.Run(tc.name, func(t *testing.T) {
+            if got := isContainerNotFound(tc.err); got != tc.want {
+                t.Errorf("isContainerNotFound(%q) = %v, want %v", tc.err, got, tc.want)
+            }
+        })
+    }
+}
diff --git a/platform/internal/provisioner/provisioner.go b/platform/internal/provisioner/provisioner.go
index e8a4657c..f2905b2e 100644
--- a/platform/internal/provisioner/provisioner.go
+++ b/platform/internal/provisioner/provisioner.go
@@ -749,15 +749,60 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
 }
 
 // IsRunning checks if a workspace container is currently running.
+//
+// Conservative on transient Docker errors: returns (true, err) for any
+// inspect failure OTHER than NotFound. Rationale: the only caller that
+// acts destructively on `running=false` is a2a_proxy.maybeMarkContainerDead,
+// which tears down + re-provisions the workspace. A Docker daemon hiccup
+// (timeout, EOF on the daemon socket, context deadline) is NOT evidence
+// that the container died — it's evidence the daemon is momentarily busy.
+// The old behaviour collapsed all errors into "container doesn't exist",
+// which triggered a restart cascade on 2026-04-16 when 6 containers
+// received simultaneous A2A forward failures during a batch delegation;
+// the follow-up reactive IsRunning calls all hit the daemon under load,
+// timed out, and flipped every container to "dead" in parallel.
+//
+// NotFound (container legitimately deleted) is the only case where
+// running=false is safe to act on — every other error path stays alive,
+// so a real crash still surfaces via exec heartbeat or TTL, both of which
+// have narrower false-positive windows than the daemon-inspect RPC.
 func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
     name := ContainerName(workspaceID)
     info, err := p.cli.ContainerInspect(ctx, name)
     if err != nil {
-        return false, nil // Container doesn't exist
+        if isContainerNotFound(err) {
+            return false, nil
+        }
+        // Transient daemon error: the caller treats !running as dead + restarts.
+        // Returning true plus the underlying error preserves the error for
+        // metrics/logging without triggering the destructive path.
+        return true, err
     }
     return info.State.Running, nil
 }
 
+// isContainerNotFound returns true when the Docker client indicates the
+// named container genuinely does not exist, versus a transient daemon
+// error (timeout, socket EOF, context cancellation).
+//
+// docker/docker v28 uses multiple distinct NotFound shapes depending on
+// transport:
+//   - the typed errdefs.ErrNotFound
+//   - a wrapped error whose message contains "No such container"
+//
+// Rather than import errdefs (which would become a direct dep), we
+// match on the error string. String-matching is the exact approach the
+// Docker CLI itself uses internally — see the "No such container" check
+// in docker/cli — and is stable across daemon versions.
+func isContainerNotFound(err error) bool {
+    if err == nil {
+        return false
+    }
+    s := err.Error()
+    return strings.Contains(s, "No such container") ||
+        strings.Contains(s, "not found")
+}
+
 // DockerClient returns the underlying Docker client for sharing with other handlers.
 func (p *Provisioner) DockerClient() *client.Client {
     return p.cli