fix(provisioner): nil guards on Stop/IsRunning, unblock contract tests (closes #1813)

Both backends panicked when called on a zero-valued or nil receiver:
Provisioner.{Stop,IsRunning} dereferenced p.cli; CPProvisioner.{Stop,
IsRunning} dereferenced p.httpClient. The orphan sweeper and shutdown
paths can call these speculatively where the receiver isn't fully
wired — the panic crashed the goroutine instead of the caller seeing
a clean error.

Three changes:

1. Add ErrNoBackend (typed sentinel) and nil-guard the four methods.
   - Provisioner.{Stop,IsRunning}: guard p == nil || p.cli == nil at
     the top.
   - CPProvisioner.Stop: guard p == nil up top, then httpClient nil
     AFTER resolveInstanceID + empty-instance check (the empty
     instance_id path doesn't need HTTP and stays a no-op success
     even on zero-valued receivers — preserved historical contract
     from TestIsRunning_EmptyInstanceIDReturnsFalse).
   - CPProvisioner.IsRunning: same shape — empty instance_id stays
     (false, nil); httpClient-nil with non-empty instance_id returns
     ErrNoBackend.

2. Flip the t.Skip on TestDockerBackend_Contract +
   TestCPProvisionerBackend_Contract — both contract tests run now
   that the panics are gone. Skipped scenarios were the regression
   guard for this fix.

3. Add TestZeroValuedBackends_NoPanic — explicit assertion that
   zero-valued and nil receivers return cleanly (no panic). Docker
   backend always returns ErrNoBackend on zero-valued; CPProvisioner
   may return (false, nil) when the DB-lookup layer absorbs the case
   (no instance to query → no HTTP needed). Both are acceptable per
   the issue's contract — the gate is no-panic.

Tests:
  - 6 sub-cases across the new TestZeroValuedBackends_NoPanic
  - TestDockerBackend_Contract + TestCPProvisionerBackend_Contract
    now run their 2 scenarios (4 sub-cases each)
  - All existing provisioner tests still green
  - go build ./... + go vet ./... + go test ./... clean

Closes drift-risk #6 in docs/architecture/backends.md.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-26 02:17:51 -07:00
parent 38fead35b4
commit 48b494def3
3 changed files with 120 additions and 11 deletions

View File

@ -38,6 +38,7 @@ package provisioner
import (
"context"
"errors"
"testing"
)
@ -138,23 +139,90 @@ func runBackendContract(t *testing.T, backend Backend) {
}
// TestDockerBackend_Contract feeds the Docker backend through the
// shared runner. Skipped pending hardening: the scaffold exposed a
// real bug — neither backend's Stop/IsRunning handles a nil underlying
// client gracefully (both panic). Filing that as a separate issue;
// once both backends return (*, error) instead of panicking, flip this
// to t.Run and the contract scenarios exercise the fix.
// shared runner. Was Skip'd pending hardening of Provisioner.{Stop,
// IsRunning} against nil Docker client — that hardening landed in
// fix/provisioner-nil-guards-1813, so the scenarios run now. A
// zero-valued *Provisioner returns ErrNoBackend from each method
// instead of panicking, satisfying the no-panic contract.
func TestDockerBackend_Contract(t *testing.T) {
t.Skip("scaffolding only — unblock by hardening Provisioner.{Stop,IsRunning} against nil Docker client; see docs/architecture/backends.md drift risk #6")
var p Provisioner
runBackendContract(t, &p)
}
// TestCPProvisionerBackend_Contract — same story as the Docker variant.
// CPProvisioner panics on nil httpClient today; once that's hardened,
// remove the Skip and this runner exercises both backends through a
// single contract.
// Hardening landed in fix/provisioner-nil-guards-1813; zero-valued
// *CPProvisioner returns ErrNoBackend from Stop/IsRunning instead of
// panicking on nil httpClient.
func TestCPProvisionerBackend_Contract(t *testing.T) {
t.Skip("scaffolding only — unblock by hardening CPProvisioner.{Stop,IsRunning} against nil httpClient; see docs/architecture/backends.md drift risk #6")
var p CPProvisioner
runBackendContract(t, &p)
}
// TestZeroValuedBackends_NoPanic pins the contract that motivated #1813:
// a Backend with no underlying client wired up must return cleanly,
// never panic. The orphan sweeper and shutdown paths both call these
// methods speculatively where the receiver could be unset.
//
// The exact error shape varies between backends:
// - Docker Provisioner has no DB-lookup layer; zero-valued always
// returns ErrNoBackend.
// - CPProvisioner threads through a package-level resolveInstanceID
// lookup; when the DB has no row for the workspace (or db.DB
// itself is nil), instance_id comes back empty and the method
// short-circuits to (false, nil). Only when there's a real
// instance_id to query does the missing-httpClient case surface
// as ErrNoBackend.
//
// Both shapes are acceptable — the contract is "no panic, error is
// nil or ErrNoBackend." Anything else is a bug.
func TestZeroValuedBackends_NoPanic(t *testing.T) {
acceptableErr := func(err error) bool {
return err == nil || errors.Is(err, ErrNoBackend)
}
t.Run("Provisioner.Stop", func(t *testing.T) {
var p Provisioner
if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) {
t.Errorf("zero-valued Provisioner.Stop: got %v, want ErrNoBackend", err)
}
})
t.Run("Provisioner.IsRunning", func(t *testing.T) {
var p Provisioner
running, err := p.IsRunning(context.Background(), "any")
if !errors.Is(err, ErrNoBackend) {
t.Errorf("zero-valued Provisioner.IsRunning: got err=%v, want ErrNoBackend", err)
}
if running {
t.Errorf("zero-valued Provisioner.IsRunning: got running=true, want false")
}
})
t.Run("CPProvisioner.Stop", func(t *testing.T) {
var p CPProvisioner
if err := p.Stop(context.Background(), "any"); !acceptableErr(err) {
t.Errorf("zero-valued CPProvisioner.Stop: got %v, want nil or ErrNoBackend", err)
}
})
t.Run("CPProvisioner.IsRunning", func(t *testing.T) {
var p CPProvisioner
_, err := p.IsRunning(context.Background(), "any")
if !acceptableErr(err) {
t.Errorf("zero-valued CPProvisioner.IsRunning: got err=%v, want nil or ErrNoBackend", err)
}
})
// Nil receivers must also be safe. The orphan sweeper code path can
// hit Stop on a *Provisioner that's nil (not just zero-valued) when
// the wiring path hasn't initialized at all. For nil receivers we
// always expect ErrNoBackend (the nil-check at the top fires before
// any DB lookup could absorb the case).
t.Run("nil_Provisioner.Stop", func(t *testing.T) {
var p *Provisioner
if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) {
t.Errorf("nil *Provisioner.Stop: got %v, want ErrNoBackend", err)
}
})
t.Run("nil_CPProvisioner.Stop", func(t *testing.T) {
var p *CPProvisioner
if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) {
t.Errorf("nil *CPProvisioner.Stop: got %v, want ErrNoBackend", err)
}
})
}

View File

@ -194,6 +194,9 @@ func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string,
// blocking the next provision with InvalidGroup.Duplicate — a full
// "Save & Restart" crash on SaaS.
func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error {
if p == nil {
return ErrNoBackend
}
instanceID, err := resolveInstanceID(ctx, workspaceID)
if err != nil {
return fmt.Errorf("cp provisioner: stop: resolve instance_id: %w", err)
@ -201,9 +204,17 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error {
if instanceID == "" {
// No instance was ever provisioned (or already deprovisioned and
// the column was cleared). Nothing to terminate — idempotent.
// Reached even when httpClient is nil since the empty-instance
// path doesn't need HTTP — symmetric with IsRunning.
log.Printf("CP provisioner: Stop for %s — no instance_id on file, nothing to do", workspaceID)
return nil
}
if p.httpClient == nil {
// HTTP wiring missing but we have an instance_id to terminate —
// can't make the DELETE call. Report ErrNoBackend so the
// orphan sweeper / shutdown path can branch.
return ErrNoBackend
}
url := fmt.Sprintf("%s/cp/workspaces/%s?instance_id=%s", p.baseURL, workspaceID, instanceID)
req, _ := http.NewRequestWithContext(ctx, "DELETE", url, nil)
p.provisionAuthHeaders(req)
@ -269,6 +280,9 @@ var resolveInstanceID = func(ctx context.Context, workspaceID string) (string, e
// Both callers are happy with (true, err); callers that need the
// previous (false, err) shape must inspect err themselves.
func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
if p == nil {
return false, ErrNoBackend
}
instanceID, err := resolveInstanceID(ctx, workspaceID)
if err != nil {
// Treat DB errors the same as transport errors — (true, err) keeps
@ -277,9 +291,20 @@ func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool
}
if instanceID == "" {
// No instance recorded. Report "not running" cleanly (no error)
// so restart cascades can trigger a fresh provision.
// so restart cascades can trigger a fresh provision. This path
// is reached even on a zero-valued provisioner (no httpClient
// wired) — that's intentional; the resolveInstanceID lookup
// goes through the package-level db var, not p.httpClient, so
// a no-instance workspace gets a clean answer regardless of
// HTTP wiring state.
return false, nil
}
if p.httpClient == nil {
// HTTP wiring missing but we have an instance_id to query —
// can't proceed without a client. Report ErrNoBackend so the
// caller can branch (a2a_proxy keeps alive, healthsweep skips).
return false, ErrNoBackend
}
url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, instanceID)
req, _ := http.NewRequestWithContext(ctx, "GET", url, nil)
p.provisionAuthHeaders(req)

View File

@ -5,6 +5,7 @@ import (
"archive/tar"
"bytes"
"context"
"errors"
"fmt"
"io"
"log"
@ -24,6 +25,15 @@ import (
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// ErrNoBackend is returned by lifecycle methods (Stop, IsRunning) when
// the receiver is zero-valued — i.e. no Docker daemon connection has
// been wired up. The orphan sweeper and the contract-test scaffolding
// both speculatively call these methods on a possibly-nil receiver;
// returning a typed error rather than panicking lets callers reason
// about the case explicitly. See docs/architecture/backends.md
// drift-risk #6.
var ErrNoBackend = errors.New("provisioner: no backend configured (zero-valued receiver)")
// RuntimeImages maps runtime names to their Docker image refs on GHCR.
// Each standalone template repo publishes its image via the reusable
// publish-template-image workflow in molecule-ci on every main merge.
@ -823,6 +833,9 @@ func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) erro
// respawn the container before ContainerRemove runs, leaving a zombie
// that re-registers via heartbeat after deletion.
func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
if p == nil || p.cli == nil {
return ErrNoBackend
}
name := ContainerName(workspaceID)
// Force-remove kills and removes in one atomic operation, bypassing
@ -856,6 +869,9 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
// so a real crash still surfaces via exec heartbeat or TTL, both of which
// have narrower false-positive windows than daemon-inspect RPC.
func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
if p == nil || p.cli == nil {
return false, ErrNoBackend
}
name := ContainerName(workspaceID)
info, err := p.cli.ContainerInspect(ctx, name)
if err != nil {