Merge pull request #413 from Molecule-AI/fix/isrunning-distinguish-notfound
fix(provisioner): IsRunning conservative on daemon errors to stop restart cascade
This commit is contained in:
commit
441a2a5938
@ -442,7 +442,15 @@ func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspace
|
||||
if h.provisioner == nil || wsRuntime == "external" {
|
||||
return false
|
||||
}
|
||||
if running, _ := h.provisioner.IsRunning(ctx, workspaceID); running {
|
||||
running, inspectErr := h.provisioner.IsRunning(ctx, workspaceID)
|
||||
if inspectErr != nil {
|
||||
// Transient Docker-daemon error (timeout, socket EOF, etc.). Post-
|
||||
// #386, IsRunning returns (true, err) in this case — caller stays
|
||||
// on the alive path and does not trigger a restart cascade. Log
|
||||
// so the defect is visible without being destructive.
|
||||
log.Printf("ProxyA2A: IsRunning for %s returned transient error (assuming alive): %v", workspaceID, inspectErr)
|
||||
}
|
||||
if running {
|
||||
return false
|
||||
}
|
||||
log.Printf("ProxyA2A: container for %s is dead — marking offline and triggering restart", workspaceID)
|
||||
|
||||
51
platform/internal/provisioner/isrunning_test.go
Normal file
51
platform/internal/provisioner/isrunning_test.go
Normal file
@ -0,0 +1,51 @@
|
||||
package provisioner
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// isContainerNotFound is the chokepoint that decides whether IsRunning
|
||||
// tears down a workspace. Getting this wrong (false positive) causes
|
||||
// the restart cascade observed 2026-04-16 09:10 UTC when 6 containers
|
||||
// got simultaneous A2A forward failures, their reactive IsRunning
|
||||
// checks all hit a busy Docker daemon, timed out, and got flipped to
|
||||
// "dead" in parallel. These tests pin the truth table.
|
||||
|
||||
func TestIsContainerNotFound(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
err error
|
||||
want bool
|
||||
}{
|
||||
{"nil", nil, false},
|
||||
{"docker not-found message",
|
||||
errors.New(`Error response from daemon: No such container: ws-abc123`),
|
||||
true},
|
||||
{"generic not found",
|
||||
errors.New("container not found"),
|
||||
true},
|
||||
{"context deadline",
|
||||
errors.New("context deadline exceeded"),
|
||||
false},
|
||||
{"socket EOF",
|
||||
errors.New(`Get "http://%2Fvar%2Frun%2Fdocker.sock/...": EOF`),
|
||||
false},
|
||||
{"daemon connection refused",
|
||||
errors.New("dial unix /var/run/docker.sock: connect: connection refused"),
|
||||
false},
|
||||
{"i/o timeout",
|
||||
errors.New("read unix @->/var/run/docker.sock: i/o timeout"),
|
||||
false},
|
||||
{"empty string",
|
||||
errors.New(""),
|
||||
false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
if got := isContainerNotFound(tc.err); got != tc.want {
|
||||
t.Errorf("isContainerNotFound(%q) = %v, want %v", tc.err, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -749,15 +749,60 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
|
||||
}
|
||||
|
||||
// IsRunning checks if a workspace container is currently running.
|
||||
//
|
||||
// Conservative on transient Docker errors: returns (true, err) for any
|
||||
// inspect failure OTHER than NotFound. Rationale: the only caller that
|
||||
// acts destructively on `running=false` is a2a_proxy.maybeMarkContainerDead,
|
||||
// which tears down + re-provisions the workspace. A Docker daemon hiccup
|
||||
// (timeout, EOF on the daemon socket, context deadline) is NOT evidence
|
||||
// that the container died — it's evidence the daemon is momentarily busy.
|
||||
// The old behaviour collapsed all errors into "container doesn't exist",
|
||||
// which triggered a restart cascade on 2026-04-16 when 6 containers
|
||||
// received simultaneous A2A forward failures during a batch delegation;
|
||||
// the followup reactive IsRunning calls all hit the daemon under load,
|
||||
// timed out, and flipped every container to "dead" in parallel.
|
||||
//
|
||||
// NotFound (container legitimately deleted) is the only case where
|
||||
// running=false is safe to act on — every other error path stays alive
|
||||
// so a real crash still surfaces via exec heartbeat or TTL, both of which
|
||||
// have narrower false-positive windows than daemon-inspect RPC.
|
||||
func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
|
||||
name := ContainerName(workspaceID)
|
||||
info, err := p.cli.ContainerInspect(ctx, name)
|
||||
if err != nil {
|
||||
return false, nil // Container doesn't exist
|
||||
if isContainerNotFound(err) {
|
||||
return false, nil
|
||||
}
|
||||
// Transient daemon error: caller treats !running as dead + restarts.
|
||||
// Returning true + the underlying error preserves the error for
|
||||
// metrics/logging without triggering the destructive path.
|
||||
return true, err
|
||||
}
|
||||
return info.State.Running, nil
|
||||
}
|
||||
|
||||
// isContainerNotFound returns true when the Docker client indicates the
|
||||
// named container genuinely does not exist, versus a transient daemon
|
||||
// error (timeout, socket EOF, context cancellation).
|
||||
//
|
||||
// docker/docker v28 uses multiple distinct NotFound shapes depending on
|
||||
// transport:
|
||||
// - the typed errdefs.ErrNotFound
|
||||
// - a wrapped error whose message contains "No such container"
|
||||
//
|
||||
// Rather than import errdefs (which would add a transitive dep), we
|
||||
// match on the error string. String-matching is the exact approach the
|
||||
// Docker CLI itself uses internally — see the "No such container" check
|
||||
// in docker/cli — and is stable across daemon versions.
|
||||
func isContainerNotFound(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
s := err.Error()
|
||||
return strings.Contains(s, "No such container") ||
|
||||
strings.Contains(s, "not found")
|
||||
}
|
||||
|
||||
// DockerClient returns the underlying Docker client for sharing with other handlers.
|
||||
func (p *Provisioner) DockerClient() *client.Client {
|
||||
return p.cli
|
||||
|
||||
Loading…
Reference in New Issue
Block a user