From 48b494def32f418d7feff7499988c6bbf1d0c4cc Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Sun, 26 Apr 2026 02:17:51 -0700 Subject: [PATCH] fix(provisioner): nil guards on Stop/IsRunning, unblock contract tests (closes #1813) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both backends panicked when called on a zero-valued or nil receiver: Provisioner.{Stop,IsRunning} dereferenced p.cli; CPProvisioner.{Stop, IsRunning} dereferenced p.httpClient. The orphan sweeper and shutdown paths can call these speculatively where the receiver isn't fully wired — the panic crashed the goroutine instead of the caller seeing a clean error. Three changes: 1. Add ErrNoBackend (typed sentinel) and nil-guard the four methods. - Provisioner.{Stop,IsRunning}: guard p == nil || p.cli == nil at the top. - CPProvisioner.Stop: guard p == nil up top, then httpClient nil AFTER resolveInstanceID + empty-instance check (the empty instance_id path doesn't need HTTP and stays a no-op success even on zero-valued receivers — preserved historical contract from TestIsRunning_EmptyInstanceIDReturnsFalse). - CPProvisioner.IsRunning: same shape — empty instance_id stays (false, nil); httpClient-nil with non-empty instance_id returns ErrNoBackend. 2. Flip the t.Skip on TestDockerBackend_Contract + TestCPProvisionerBackend_Contract — both contract tests run now that the panics are gone. Skipped scenarios were the regression guard for this fix. 3. Add TestZeroValuedBackends_NoPanic — explicit assertion that zero-valued and nil receivers return cleanly (no panic). Docker backend always returns ErrNoBackend on zero-valued; CPProvisioner may return (false, nil) when the DB-lookup layer absorbs the case (no instance to query → no HTTP needed). Both are acceptable per the issue's contract — the gate is no-panic. Tests: - 6 sub-cases across the new TestZeroValuedBackends_NoPanic - TestDockerBackend_Contract + TestCPProvisionerBackend_Contract now run their 2 scenarios (4 sub-cases each) - All existing provisioner tests still green - go build ./... + go vet ./... + go test ./... clean Closes drift-risk #6 in docs/architecture/backends.md. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../provisioner/backend_contract_test.go | 88 ++++++++++++++++--- .../internal/provisioner/cp_provisioner.go | 27 +++++- .../internal/provisioner/provisioner.go | 16 ++++ 3 files changed, 120 insertions(+), 11 deletions(-) diff --git a/workspace-server/internal/provisioner/backend_contract_test.go b/workspace-server/internal/provisioner/backend_contract_test.go index 0c31daf0..a5948f54 100644 --- a/workspace-server/internal/provisioner/backend_contract_test.go +++ b/workspace-server/internal/provisioner/backend_contract_test.go @@ -38,6 +38,7 @@ package provisioner import ( "context" + "errors" "testing" ) @@ -138,23 +139,90 @@ func runBackendContract(t *testing.T, backend Backend) { } // TestDockerBackend_Contract feeds the Docker backend through the -// shared runner. Skipped pending hardening: the scaffold exposed a -// real bug — neither backend's Stop/IsRunning handles a nil underlying -// client gracefully (both panic). Filing that as a separate issue; -// once both backends return (*, error) instead of panicking, flip this -// to t.Run and the contract scenarios exercise the fix. +// shared runner. Was Skip'd pending hardening of Provisioner.{Stop, +// IsRunning} against nil Docker client — that hardening landed in +// fix/provisioner-nil-guards-1813, so the scenarios run now. A +// zero-valued *Provisioner returns ErrNoBackend from each method +// instead of panicking, satisfying the no-panic contract. func TestDockerBackend_Contract(t *testing.T) { - t.Skip("scaffolding only — unblock by hardening Provisioner.{Stop,IsRunning} against nil Docker client; see docs/architecture/backends.md drift risk #6") var p Provisioner runBackendContract(t, &p) } // TestCPProvisionerBackend_Contract — same story as the Docker variant. -// CPProvisioner panics on nil httpClient today; once that's hardened, -// remove the Skip and this runner exercises both backends through a -// single contract. +// Hardening landed in fix/provisioner-nil-guards-1813; zero-valued +// *CPProvisioner returns ErrNoBackend from Stop/IsRunning instead of +// panicking on nil httpClient. func TestCPProvisionerBackend_Contract(t *testing.T) { - t.Skip("scaffolding only — unblock by hardening CPProvisioner.{Stop,IsRunning} against nil httpClient; see docs/architecture/backends.md drift risk #6") var p CPProvisioner runBackendContract(t, &p) } + +// TestZeroValuedBackends_NoPanic pins the contract that motivated #1813: +// a Backend with no underlying client wired up must return cleanly, +// never panic. The orphan sweeper and shutdown paths both call these +// methods speculatively where the receiver could be unset. +// +// The exact error shape varies between backends: +// - Docker Provisioner has no DB-lookup layer; zero-valued always +// returns ErrNoBackend. +// - CPProvisioner threads through a package-level resolveInstanceID +// lookup; when the DB has no row for the workspace (or db.DB +// itself is nil), instance_id comes back empty and the method +// short-circuits to (false, nil). Only when there's a real +// instance_id to query does the missing-httpClient case surface +// as ErrNoBackend. +// +// Both shapes are acceptable — the contract is "no panic, error is +// nil or ErrNoBackend." Anything else is a bug. +func TestZeroValuedBackends_NoPanic(t *testing.T) { + acceptableErr := func(err error) bool { + return err == nil || errors.Is(err, ErrNoBackend) + } + t.Run("Provisioner.Stop", func(t *testing.T) { + var p Provisioner + if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) { + t.Errorf("zero-valued Provisioner.Stop: got %v, want ErrNoBackend", err) + } + }) + t.Run("Provisioner.IsRunning", func(t *testing.T) { + var p Provisioner + running, err := p.IsRunning(context.Background(), "any") + if !errors.Is(err, ErrNoBackend) { + t.Errorf("zero-valued Provisioner.IsRunning: got err=%v, want ErrNoBackend", err) + } + if running { + t.Errorf("zero-valued Provisioner.IsRunning: got running=true, want false") + } + }) + t.Run("CPProvisioner.Stop", func(t *testing.T) { + var p CPProvisioner + if err := p.Stop(context.Background(), "any"); !acceptableErr(err) { + t.Errorf("zero-valued CPProvisioner.Stop: got %v, want nil or ErrNoBackend", err) + } + }) + t.Run("CPProvisioner.IsRunning", func(t *testing.T) { + var p CPProvisioner + _, err := p.IsRunning(context.Background(), "any") + if !acceptableErr(err) { + t.Errorf("zero-valued CPProvisioner.IsRunning: got err=%v, want nil or ErrNoBackend", err) + } + }) + // Nil receivers must also be safe. The orphan sweeper code path can + // hit Stop on a *Provisioner that's nil (not just zero-valued) when + // the wiring path hasn't initialized at all. For nil receivers we + // always expect ErrNoBackend (the nil-check at the top fires before + // any DB lookup could absorb the case). + t.Run("nil_Provisioner.Stop", func(t *testing.T) { + var p *Provisioner + if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) { + t.Errorf("nil *Provisioner.Stop: got %v, want ErrNoBackend", err) + } + }) + t.Run("nil_CPProvisioner.Stop", func(t *testing.T) { + var p *CPProvisioner + if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) { + t.Errorf("nil *CPProvisioner.Stop: got %v, want ErrNoBackend", err) + } + }) +} diff --git a/workspace-server/internal/provisioner/cp_provisioner.go b/workspace-server/internal/provisioner/cp_provisioner.go index 6f6ae58d..e9cfc8c9 100644 --- a/workspace-server/internal/provisioner/cp_provisioner.go +++ b/workspace-server/internal/provisioner/cp_provisioner.go @@ -194,6 +194,9 @@ func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, // blocking the next provision with InvalidGroup.Duplicate — a full // "Save & Restart" crash on SaaS. func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error { + if p == nil { + return ErrNoBackend + } instanceID, err := resolveInstanceID(ctx, workspaceID) if err != nil { return fmt.Errorf("cp provisioner: stop: resolve instance_id: %w", err) @@ -201,9 +204,17 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error { if instanceID == "" { // No instance was ever provisioned (or already deprovisioned and // the column was cleared). Nothing to terminate — idempotent. + // Reached even when httpClient is nil since the empty-instance + // path doesn't need HTTP — symmetric with IsRunning. log.Printf("CP provisioner: Stop for %s — no instance_id on file, nothing to do", workspaceID) return nil } + if p.httpClient == nil { + // HTTP wiring missing but we have an instance_id to terminate — + // can't make the DELETE call. Report ErrNoBackend so the + // orphan sweeper / shutdown path can branch. + return ErrNoBackend + } url := fmt.Sprintf("%s/cp/workspaces/%s?instance_id=%s", p.baseURL, workspaceID, instanceID) req, _ := http.NewRequestWithContext(ctx, "DELETE", url, nil) p.provisionAuthHeaders(req) @@ -269,6 +280,9 @@ var resolveInstanceID = func(ctx context.Context, workspaceID string) (string, e // Both callers are happy with (true, err); callers that need the // previous (false, err) shape must inspect err themselves. func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) { + if p == nil { + return false, ErrNoBackend + } instanceID, err := resolveInstanceID(ctx, workspaceID) if err != nil { // Treat DB errors the same as transport errors — (true, err) keeps @@ -277,9 +291,20 @@ func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool } if instanceID == "" { // No instance recorded. Report "not running" cleanly (no error) - // so restart cascades can trigger a fresh provision. + // so restart cascades can trigger a fresh provision. This path + // is reached even on a zero-valued provisioner (no httpClient + // wired) — that's intentional; the resolveInstanceID lookup + // goes through the package-level db var, not p.httpClient, so + // a no-instance workspace gets a clean answer regardless of + // HTTP wiring state. return false, nil } + if p.httpClient == nil { + // HTTP wiring missing but we have an instance_id to query — + // can't proceed without a client. Report ErrNoBackend so the + // caller can branch (a2a_proxy keeps alive, healthsweep skips). + return false, ErrNoBackend + } url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, instanceID) req, _ := http.NewRequestWithContext(ctx, "GET", url, nil) p.provisionAuthHeaders(req) diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go index ac04b15f..a57a4cb6 100644 --- a/workspace-server/internal/provisioner/provisioner.go +++ b/workspace-server/internal/provisioner/provisioner.go @@ -5,6 +5,7 @@ import ( "archive/tar" "bytes" "context" + "errors" "fmt" "io" "log" @@ -24,6 +25,15 @@ import ( ocispec "github.com/opencontainers/image-spec/specs-go/v1" ) +// ErrNoBackend is returned by lifecycle methods (Stop, IsRunning) when +// the receiver is zero-valued — i.e. no Docker daemon connection has +// been wired up. The orphan sweeper and the contract-test scaffolding +// both speculatively call these methods on a possibly-nil receiver; +// returning a typed error rather than panicking lets callers reason +// about the case explicitly. See docs/architecture/backends.md +// drift-risk #6. +var ErrNoBackend = errors.New("provisioner: no backend configured (zero-valued receiver)") + // RuntimeImages maps runtime names to their Docker image refs on GHCR. // Each standalone template repo publishes its image via the reusable // publish-template-image workflow in molecule-ci on every main merge. @@ -823,6 +833,9 @@ func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) erro // respawn the container before ContainerRemove runs, leaving a zombie // that re-registers via heartbeat after deletion. func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error { + if p == nil || p.cli == nil { + return ErrNoBackend + } name := ContainerName(workspaceID) // Force-remove kills and removes in one atomic operation, bypassing @@ -856,6 +869,9 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error { // so a real crash still surfaces via exec heartbeat or TTL, both of which // have narrower false-positive windows than daemon-inspect RPC. func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) { + if p == nil || p.cli == nil { + return false, ErrNoBackend + } name := ContainerName(workspaceID) info, err := p.cli.ContainerInspect(ctx, name) if err != nil {