diff --git a/platform/internal/handlers/a2a_proxy.go b/platform/internal/handlers/a2a_proxy.go
index 70da8928..f6ec9ce4 100644
--- a/platform/internal/handlers/a2a_proxy.go
+++ b/platform/internal/handlers/a2a_proxy.go
@@ -442,7 +442,15 @@ func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspace
 	if h.provisioner == nil || wsRuntime == "external" {
 		return false
 	}
-	if running, _ := h.provisioner.IsRunning(ctx, workspaceID); running {
+	running, inspectErr := h.provisioner.IsRunning(ctx, workspaceID)
+	if inspectErr != nil {
+		// Transient Docker-daemon error (timeout, socket EOF, etc.). Post-
+		// #386, IsRunning returns (true, err) in this case — caller stays
+		// on the alive path and does not trigger a restart cascade. Log
+		// so the defect is visible without being destructive.
+		log.Printf("ProxyA2A: IsRunning for %s returned transient error (assuming alive): %v", workspaceID, inspectErr)
+	}
+	if running {
 		return false
 	}
 	log.Printf("ProxyA2A: container for %s is dead — marking offline and triggering restart", workspaceID)
diff --git a/platform/internal/provisioner/isrunning_test.go b/platform/internal/provisioner/isrunning_test.go
new file mode 100644
index 00000000..0f5c587c
--- /dev/null
+++ b/platform/internal/provisioner/isrunning_test.go
@@ -0,0 +1,51 @@
+package provisioner
+
+import (
+	"errors"
+	"testing"
+)
+
+// isContainerNotFound is the chokepoint that decides whether IsRunning
+// tears down a workspace. Getting this wrong (false positive) causes
+// the restart cascade observed 2026-04-16 09:10 UTC when 6 containers
+// got simultaneous A2A forward failures, their reactive IsRunning
+// checks all hit a busy Docker daemon, timed out, and got flipped to
+// "dead" in parallel. These tests pin the truth table.
+
+func TestIsContainerNotFound(t *testing.T) {
+	cases := []struct {
+		name string
+		err  error
+		want bool
+	}{
+		{"nil", nil, false},
+		{"docker not-found message",
+			errors.New(`Error response from daemon: No such container: ws-abc123`),
+			true},
+		{"generic not found",
+			errors.New("container not found"),
+			true},
+		{"context deadline",
+			errors.New("context deadline exceeded"),
+			false},
+		{"socket EOF",
+			errors.New(`Get "http://%2Fvar%2Frun%2Fdocker.sock/...": EOF`),
+			false},
+		{"daemon connection refused",
+			errors.New("dial unix /var/run/docker.sock: connect: connection refused"),
+			false},
+		{"i/o timeout",
+			errors.New("read unix @->/var/run/docker.sock: i/o timeout"),
+			false},
+		{"empty string",
+			errors.New(""),
+			false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := isContainerNotFound(tc.err); got != tc.want {
+				t.Errorf("isContainerNotFound(%q) = %v, want %v", tc.err, got, tc.want)
+			}
+		})
+	}
+}
diff --git a/platform/internal/provisioner/provisioner.go b/platform/internal/provisioner/provisioner.go
index e8a4657c..f2905b2e 100644
--- a/platform/internal/provisioner/provisioner.go
+++ b/platform/internal/provisioner/provisioner.go
@@ -749,15 +749,60 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
 }
 
 // IsRunning checks if a workspace container is currently running.
+//
+// Conservative on transient Docker errors: returns (true, err) for any
+// inspect failure OTHER than NotFound. Rationale: the only caller that
+// acts destructively on `running=false` is a2a_proxy.maybeMarkContainerDead,
+// which tears down + re-provisions the workspace. A Docker daemon hiccup
+// (timeout, EOF on the daemon socket, context deadline) is NOT evidence
+// that the container died — it's evidence the daemon is momentarily busy.
+// The old behaviour collapsed all errors into "container doesn't exist",
+// which triggered a restart cascade on 2026-04-16 when 6 containers
+// received simultaneous A2A forward failures during a batch delegation;
+// the followup reactive IsRunning calls all hit the daemon under load,
+// timed out, and flipped every container to "dead" in parallel.
+//
+// NotFound (container legitimately deleted) is the only case where
+// running=false is safe to act on — every other error path stays alive
+// so a real crash still surfaces via exec heartbeat or TTL, both of which
+// have narrower false-positive windows than daemon-inspect RPC.
 func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
 	name := ContainerName(workspaceID)
 	info, err := p.cli.ContainerInspect(ctx, name)
 	if err != nil {
-		return false, nil // Container doesn't exist
+		if isContainerNotFound(err) {
+			return false, nil
+		}
+		// Transient daemon error: caller treats !running as dead + restarts.
+		// Returning true + the underlying error preserves the error for
+		// metrics/logging without triggering the destructive path.
+		return true, err
 	}
 	return info.State.Running, nil
 }
 
+// isContainerNotFound returns true when the Docker client indicates the
+// named container genuinely does not exist, versus a transient daemon
+// error (timeout, socket EOF, context cancellation).
+//
+// docker/docker v28 uses multiple distinct NotFound shapes depending on
+// transport:
+//   - the typed errdefs.ErrNotFound
+//   - a wrapped error whose message contains "No such container"
+//
+// Rather than import errdefs (which would add a transitive dep), we
+// match on the error string. String-matching is the exact approach the
+// Docker CLI itself uses internally — see the "No such container" check
+// in docker/cli — and is stable across daemon versions.
+func isContainerNotFound(err error) bool {
+	if err == nil {
+		return false
+	}
+	s := err.Error()
+	return strings.Contains(s, "No such container") ||
+		strings.Contains(s, "not found")
+}
+
 // DockerClient returns the underlying Docker client for sharing with other handlers.
 func (p *Provisioner) DockerClient() *client.Client {
 	return p.cli