forked from molecule-ai/molecule-core
fix(provisioner): nil guards on Stop/IsRunning, unblock contract tests (closes #1813)
Both backends panicked when called on a zero-valued or nil receiver:
Provisioner.{Stop,IsRunning} dereferenced p.cli; CPProvisioner.{Stop,
IsRunning} dereferenced p.httpClient. The orphan sweeper and shutdown
paths can call these speculatively where the receiver isn't fully
wired — the panic crashed the goroutine instead of the caller seeing
a clean error.
Three changes:
1. Add ErrNoBackend (typed sentinel) and nil-guard the four methods.
- Provisioner.{Stop,IsRunning}: guard p == nil || p.cli == nil at
the top.
- CPProvisioner.Stop: guard p == nil up top, then httpClient nil
AFTER resolveInstanceID + empty-instance check (the empty
instance_id path doesn't need HTTP and stays a no-op success
even on zero-valued receivers — preserved historical contract
from TestIsRunning_EmptyInstanceIDReturnsFalse).
- CPProvisioner.IsRunning: same shape — empty instance_id stays
(false, nil); httpClient-nil with non-empty instance_id returns
ErrNoBackend.
2. Flip the t.Skip on TestDockerBackend_Contract +
TestCPProvisionerBackend_Contract — both contract tests run now
that the panics are gone. Skipped scenarios were the regression
guard for this fix.
3. Add TestZeroValuedBackends_NoPanic — explicit assertion that
zero-valued and nil receivers return cleanly (no panic). Docker
backend always returns ErrNoBackend on zero-valued; CPProvisioner
may return (false, nil) when the DB-lookup layer absorbs the case
(no instance to query → no HTTP needed). Both are acceptable per
the issue's contract — the gate is no-panic.
Tests:
- 6 sub-cases across the new TestZeroValuedBackends_NoPanic
- TestDockerBackend_Contract + TestCPProvisionerBackend_Contract
now run their 2 scenarios (4 sub-cases each)
- All existing provisioner tests still green
- go build ./... + go vet ./... + go test ./... clean
Closes drift-risk #6 in docs/architecture/backends.md.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
38fead35b4
commit
48b494def3
@ -38,6 +38,7 @@ package provisioner
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@ -138,23 +139,90 @@ func runBackendContract(t *testing.T, backend Backend) {
|
||||
}
|
||||
|
||||
// TestDockerBackend_Contract feeds the Docker backend through the
|
||||
// shared runner. Skipped pending hardening: the scaffold exposed a
|
||||
// real bug — neither backend's Stop/IsRunning handles a nil underlying
|
||||
// client gracefully (both panic). Filing that as a separate issue;
|
||||
// once both backends return (*, error) instead of panicking, flip this
|
||||
// to t.Run and the contract scenarios exercise the fix.
|
||||
// shared runner. Was Skip'd pending hardening of Provisioner.{Stop,
|
||||
// IsRunning} against nil Docker client — that hardening landed in
|
||||
// fix/provisioner-nil-guards-1813, so the scenarios run now. A
|
||||
// zero-valued *Provisioner returns ErrNoBackend from each method
|
||||
// instead of panicking, satisfying the no-panic contract.
|
||||
func TestDockerBackend_Contract(t *testing.T) {
|
||||
t.Skip("scaffolding only — unblock by hardening Provisioner.{Stop,IsRunning} against nil Docker client; see docs/architecture/backends.md drift risk #6")
|
||||
var p Provisioner
|
||||
runBackendContract(t, &p)
|
||||
}
|
||||
|
||||
// TestCPProvisionerBackend_Contract — same story as the Docker variant.
|
||||
// CPProvisioner panics on nil httpClient today; once that's hardened,
|
||||
// remove the Skip and this runner exercises both backends through a
|
||||
// single contract.
|
||||
// Hardening landed in fix/provisioner-nil-guards-1813; zero-valued
|
||||
// *CPProvisioner returns ErrNoBackend from Stop/IsRunning instead of
|
||||
// panicking on nil httpClient.
|
||||
func TestCPProvisionerBackend_Contract(t *testing.T) {
|
||||
t.Skip("scaffolding only — unblock by hardening CPProvisioner.{Stop,IsRunning} against nil httpClient; see docs/architecture/backends.md drift risk #6")
|
||||
var p CPProvisioner
|
||||
runBackendContract(t, &p)
|
||||
}
|
||||
|
||||
// TestZeroValuedBackends_NoPanic pins the contract that motivated #1813:
|
||||
// a Backend with no underlying client wired up must return cleanly,
|
||||
// never panic. The orphan sweeper and shutdown paths both call these
|
||||
// methods speculatively where the receiver could be unset.
|
||||
//
|
||||
// The exact error shape varies between backends:
|
||||
// - Docker Provisioner has no DB-lookup layer; zero-valued always
|
||||
// returns ErrNoBackend.
|
||||
// - CPProvisioner threads through a package-level resolveInstanceID
|
||||
// lookup; when the DB has no row for the workspace (or db.DB
|
||||
// itself is nil), instance_id comes back empty and the method
|
||||
// short-circuits to (false, nil). Only when there's a real
|
||||
// instance_id to query does the missing-httpClient case surface
|
||||
// as ErrNoBackend.
|
||||
//
|
||||
// Both shapes are acceptable — the contract is "no panic, error is
|
||||
// nil or ErrNoBackend." Anything else is a bug.
|
||||
func TestZeroValuedBackends_NoPanic(t *testing.T) {
|
||||
acceptableErr := func(err error) bool {
|
||||
return err == nil || errors.Is(err, ErrNoBackend)
|
||||
}
|
||||
t.Run("Provisioner.Stop", func(t *testing.T) {
|
||||
var p Provisioner
|
||||
if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) {
|
||||
t.Errorf("zero-valued Provisioner.Stop: got %v, want ErrNoBackend", err)
|
||||
}
|
||||
})
|
||||
t.Run("Provisioner.IsRunning", func(t *testing.T) {
|
||||
var p Provisioner
|
||||
running, err := p.IsRunning(context.Background(), "any")
|
||||
if !errors.Is(err, ErrNoBackend) {
|
||||
t.Errorf("zero-valued Provisioner.IsRunning: got err=%v, want ErrNoBackend", err)
|
||||
}
|
||||
if running {
|
||||
t.Errorf("zero-valued Provisioner.IsRunning: got running=true, want false")
|
||||
}
|
||||
})
|
||||
t.Run("CPProvisioner.Stop", func(t *testing.T) {
|
||||
var p CPProvisioner
|
||||
if err := p.Stop(context.Background(), "any"); !acceptableErr(err) {
|
||||
t.Errorf("zero-valued CPProvisioner.Stop: got %v, want nil or ErrNoBackend", err)
|
||||
}
|
||||
})
|
||||
t.Run("CPProvisioner.IsRunning", func(t *testing.T) {
|
||||
var p CPProvisioner
|
||||
_, err := p.IsRunning(context.Background(), "any")
|
||||
if !acceptableErr(err) {
|
||||
t.Errorf("zero-valued CPProvisioner.IsRunning: got err=%v, want nil or ErrNoBackend", err)
|
||||
}
|
||||
})
|
||||
// Nil receivers must also be safe. The orphan sweeper code path can
|
||||
// hit Stop on a *Provisioner that's nil (not just zero-valued) when
|
||||
// the wiring path hasn't initialized at all. For nil receivers we
|
||||
// always expect ErrNoBackend (the nil-check at the top fires before
|
||||
// any DB lookup could absorb the case).
|
||||
t.Run("nil_Provisioner.Stop", func(t *testing.T) {
|
||||
var p *Provisioner
|
||||
if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) {
|
||||
t.Errorf("nil *Provisioner.Stop: got %v, want ErrNoBackend", err)
|
||||
}
|
||||
})
|
||||
t.Run("nil_CPProvisioner.Stop", func(t *testing.T) {
|
||||
var p *CPProvisioner
|
||||
if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) {
|
||||
t.Errorf("nil *CPProvisioner.Stop: got %v, want ErrNoBackend", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@ -194,6 +194,9 @@ func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string,
|
||||
// blocking the next provision with InvalidGroup.Duplicate — a full
|
||||
// "Save & Restart" crash on SaaS.
|
||||
func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error {
|
||||
if p == nil {
|
||||
return ErrNoBackend
|
||||
}
|
||||
instanceID, err := resolveInstanceID(ctx, workspaceID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cp provisioner: stop: resolve instance_id: %w", err)
|
||||
@ -201,9 +204,17 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error {
|
||||
if instanceID == "" {
|
||||
// No instance was ever provisioned (or already deprovisioned and
|
||||
// the column was cleared). Nothing to terminate — idempotent.
|
||||
// Reached even when httpClient is nil since the empty-instance
|
||||
// path doesn't need HTTP — symmetric with IsRunning.
|
||||
log.Printf("CP provisioner: Stop for %s — no instance_id on file, nothing to do", workspaceID)
|
||||
return nil
|
||||
}
|
||||
if p.httpClient == nil {
|
||||
// HTTP wiring missing but we have an instance_id to terminate —
|
||||
// can't make the DELETE call. Report ErrNoBackend so the
|
||||
// orphan sweeper / shutdown path can branch.
|
||||
return ErrNoBackend
|
||||
}
|
||||
url := fmt.Sprintf("%s/cp/workspaces/%s?instance_id=%s", p.baseURL, workspaceID, instanceID)
|
||||
req, _ := http.NewRequestWithContext(ctx, "DELETE", url, nil)
|
||||
p.provisionAuthHeaders(req)
|
||||
@ -269,6 +280,9 @@ var resolveInstanceID = func(ctx context.Context, workspaceID string) (string, e
|
||||
// Both callers are happy with (true, err); callers that need the
|
||||
// previous (false, err) shape must inspect err themselves.
|
||||
func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
|
||||
if p == nil {
|
||||
return false, ErrNoBackend
|
||||
}
|
||||
instanceID, err := resolveInstanceID(ctx, workspaceID)
|
||||
if err != nil {
|
||||
// Treat DB errors the same as transport errors — (true, err) keeps
|
||||
@ -277,9 +291,20 @@ func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool
|
||||
}
|
||||
if instanceID == "" {
|
||||
// No instance recorded. Report "not running" cleanly (no error)
|
||||
// so restart cascades can trigger a fresh provision.
|
||||
// so restart cascades can trigger a fresh provision. This path
|
||||
// is reached even on a zero-valued provisioner (no httpClient
|
||||
// wired) — that's intentional; the resolveInstanceID lookup
|
||||
// goes through the package-level db var, not p.httpClient, so
|
||||
// a no-instance workspace gets a clean answer regardless of
|
||||
// HTTP wiring state.
|
||||
return false, nil
|
||||
}
|
||||
if p.httpClient == nil {
|
||||
// HTTP wiring missing but we have an instance_id to query —
|
||||
// can't proceed without a client. Report ErrNoBackend so the
|
||||
// caller can branch (a2a_proxy keeps alive, healthsweep skips).
|
||||
return false, ErrNoBackend
|
||||
}
|
||||
url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, instanceID)
|
||||
req, _ := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
p.provisionAuthHeaders(req)
|
||||
|
||||
@ -5,6 +5,7 @@ import (
|
||||
"archive/tar"
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
@ -24,6 +25,15 @@ import (
|
||||
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
|
||||
)
|
||||
|
||||
// ErrNoBackend is returned by lifecycle methods (Stop, IsRunning) when
|
||||
// the receiver is zero-valued — i.e. no Docker daemon connection has
|
||||
// been wired up. The orphan sweeper and the contract-test scaffolding
|
||||
// both speculatively call these methods on a possibly-nil receiver;
|
||||
// returning a typed error rather than panicking lets callers reason
|
||||
// about the case explicitly. See docs/architecture/backends.md
|
||||
// drift-risk #6.
|
||||
var ErrNoBackend = errors.New("provisioner: no backend configured (zero-valued receiver)")
|
||||
|
||||
// RuntimeImages maps runtime names to their Docker image refs on GHCR.
|
||||
// Each standalone template repo publishes its image via the reusable
|
||||
// publish-template-image workflow in molecule-ci on every main merge.
|
||||
@ -823,6 +833,9 @@ func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) erro
|
||||
// respawn the container before ContainerRemove runs, leaving a zombie
|
||||
// that re-registers via heartbeat after deletion.
|
||||
func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
|
||||
if p == nil || p.cli == nil {
|
||||
return ErrNoBackend
|
||||
}
|
||||
name := ContainerName(workspaceID)
|
||||
|
||||
// Force-remove kills and removes in one atomic operation, bypassing
|
||||
@ -856,6 +869,9 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
|
||||
// so a real crash still surfaces via exec heartbeat or TTL, both of which
|
||||
// have narrower false-positive windows than daemon-inspect RPC.
|
||||
func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
|
||||
if p == nil || p.cli == nil {
|
||||
return false, ErrNoBackend
|
||||
}
|
||||
name := ContainerName(workspaceID)
|
||||
info, err := p.cli.ContainerInspect(ctx, name)
|
||||
if err != nil {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user