From 48b494def32f418d7feff7499988c6bbf1d0c4cc Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwang.rabbit@users.noreply.github.com>
Date: Sun, 26 Apr 2026 02:17:51 -0700
Subject: [PATCH] fix(provisioner): nil guards on Stop/IsRunning, unblock
 contract tests (closes #1813)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both backends panicked when called on a zero-valued or nil receiver:
Provisioner.{Stop,IsRunning} dereferenced p.cli; CPProvisioner.{Stop,
IsRunning} dereferenced p.httpClient. The orphan sweeper and shutdown
paths can call these speculatively where the receiver isn't fully
wired — the panic crashed the goroutine instead of the caller seeing
a clean error.

Three changes:

1. Add ErrNoBackend (typed sentinel) and nil-guard the four methods.
   - Provisioner.{Stop,IsRunning}: guard p == nil || p.cli == nil at
     the top.
   - CPProvisioner.Stop: guard p == nil up top, then httpClient nil
     AFTER resolveInstanceID + empty-instance check (the empty
     instance_id path doesn't need HTTP and stays a no-op success
     even on zero-valued receivers — preserved historical contract
     from TestIsRunning_EmptyInstanceIDReturnsFalse).
   - CPProvisioner.IsRunning: same shape — empty instance_id stays
     (false, nil); httpClient-nil with non-empty instance_id returns
     ErrNoBackend.

2. Flip the t.Skip on TestDockerBackend_Contract +
   TestCPProvisionerBackend_Contract — both contract tests run now
   that the panics are gone. Skipped scenarios were the regression
   guard for this fix.

3. Add TestZeroValuedBackends_NoPanic — explicit assertion that
   zero-valued and nil receivers return cleanly (no panic). Docker
   backend always returns ErrNoBackend on zero-valued; CPProvisioner
   may return (false, nil) when the DB-lookup layer absorbs the case
   (no instance to query → no HTTP needed). Both are acceptable per
   the issue's contract — the gate is no-panic.

Tests:
  - 6 sub-cases across the new TestZeroValuedBackends_NoPanic
  - TestDockerBackend_Contract + TestCPProvisionerBackend_Contract
    now run their 2 scenarios (4 sub-cases each)
  - All existing provisioner tests still green
  - go build ./... + go vet ./... + go test ./... clean

Closes drift-risk #6 in docs/architecture/backends.md.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../provisioner/backend_contract_test.go      | 88 ++++++++++++++++---
 .../internal/provisioner/cp_provisioner.go    | 27 +++++-
 .../internal/provisioner/provisioner.go       | 16 ++++
 3 files changed, 120 insertions(+), 11 deletions(-)

diff --git a/workspace-server/internal/provisioner/backend_contract_test.go b/workspace-server/internal/provisioner/backend_contract_test.go
index 0c31daf0..a5948f54 100644
--- a/workspace-server/internal/provisioner/backend_contract_test.go
+++ b/workspace-server/internal/provisioner/backend_contract_test.go
@@ -38,6 +38,7 @@ package provisioner
 
 import (
 	"context"
+	"errors"
 	"testing"
 )
 
@@ -138,23 +139,90 @@ func runBackendContract(t *testing.T, backend Backend) {
 }
 
 // TestDockerBackend_Contract feeds the Docker backend through the
-// shared runner. Skipped pending hardening: the scaffold exposed a
-// real bug — neither backend's Stop/IsRunning handles a nil underlying
-// client gracefully (both panic). Filing that as a separate issue;
-// once both backends return (*, error) instead of panicking, flip this
-// to t.Run and the contract scenarios exercise the fix.
+// shared runner. Was Skip'd pending hardening of Provisioner.{Stop,
+// IsRunning} against nil Docker client — that hardening landed in
+// fix/provisioner-nil-guards-1813, so the scenarios run now. A
+// zero-valued *Provisioner returns ErrNoBackend from each method
+// instead of panicking, satisfying the no-panic contract.
 func TestDockerBackend_Contract(t *testing.T) {
-	t.Skip("scaffolding only — unblock by hardening Provisioner.{Stop,IsRunning} against nil Docker client; see docs/architecture/backends.md drift risk #6")
 	var p Provisioner
 	runBackendContract(t, &p)
 }
 
 // TestCPProvisionerBackend_Contract — same story as the Docker variant.
-// CPProvisioner panics on nil httpClient today; once that's hardened,
-// remove the Skip and this runner exercises both backends through a
-// single contract.
+// Hardening landed in fix/provisioner-nil-guards-1813; zero-valued
+// *CPProvisioner returns ErrNoBackend from Stop/IsRunning instead of
+// panicking on nil httpClient.
 func TestCPProvisionerBackend_Contract(t *testing.T) {
-	t.Skip("scaffolding only — unblock by hardening CPProvisioner.{Stop,IsRunning} against nil httpClient; see docs/architecture/backends.md drift risk #6")
 	var p CPProvisioner
 	runBackendContract(t, &p)
 }
+
+// TestZeroValuedBackends_NoPanic pins the contract that motivated #1813:
+// a Backend with no underlying client wired up must return cleanly,
+// never panic. The orphan sweeper and shutdown paths both call these
+// methods speculatively where the receiver could be unset.
+//
+// The exact error shape varies between backends:
+//   - Docker Provisioner has no DB-lookup layer; zero-valued always
+//     returns ErrNoBackend.
+//   - CPProvisioner threads through a package-level resolveInstanceID
+//     lookup; when the DB has no row for the workspace (or db.DB
+//     itself is nil), instance_id comes back empty and the method
+//     short-circuits to (false, nil). Only when there's a real
+//     instance_id to query does the missing-httpClient case surface
+//     as ErrNoBackend.
+//
+// Both shapes are acceptable — the contract is "no panic, error is
+// nil or ErrNoBackend." Anything else is a bug.
+func TestZeroValuedBackends_NoPanic(t *testing.T) {
+	acceptableErr := func(err error) bool {
+		return err == nil || errors.Is(err, ErrNoBackend)
+	}
+	t.Run("Provisioner.Stop", func(t *testing.T) {
+		var p Provisioner
+		if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) {
+			t.Errorf("zero-valued Provisioner.Stop: got %v, want ErrNoBackend", err)
+		}
+	})
+	t.Run("Provisioner.IsRunning", func(t *testing.T) {
+		var p Provisioner
+		running, err := p.IsRunning(context.Background(), "any")
+		if !errors.Is(err, ErrNoBackend) {
+			t.Errorf("zero-valued Provisioner.IsRunning: got err=%v, want ErrNoBackend", err)
+		}
+		if running {
+			t.Errorf("zero-valued Provisioner.IsRunning: got running=true, want false")
+		}
+	})
+	t.Run("CPProvisioner.Stop", func(t *testing.T) {
+		var p CPProvisioner
+		if err := p.Stop(context.Background(), "any"); !acceptableErr(err) {
+			t.Errorf("zero-valued CPProvisioner.Stop: got %v, want nil or ErrNoBackend", err)
+		}
+	})
+	t.Run("CPProvisioner.IsRunning", func(t *testing.T) {
+		var p CPProvisioner
+		_, err := p.IsRunning(context.Background(), "any")
+		if !acceptableErr(err) {
+			t.Errorf("zero-valued CPProvisioner.IsRunning: got err=%v, want nil or ErrNoBackend", err)
+		}
+	})
+	// Nil receivers must also be safe. The orphan sweeper code path can
+	// hit Stop on a *Provisioner that's nil (not just zero-valued) when
+	// the wiring path hasn't initialized at all. For nil receivers we
+	// always expect ErrNoBackend (the nil-check at the top fires before
+	// any DB lookup could absorb the case).
+	t.Run("nil_Provisioner.Stop", func(t *testing.T) {
+		var p *Provisioner
+		if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) {
+			t.Errorf("nil *Provisioner.Stop: got %v, want ErrNoBackend", err)
+		}
+	})
+	t.Run("nil_CPProvisioner.Stop", func(t *testing.T) {
+		var p *CPProvisioner
+		if err := p.Stop(context.Background(), "any"); !errors.Is(err, ErrNoBackend) {
+			t.Errorf("nil *CPProvisioner.Stop: got %v, want ErrNoBackend", err)
+		}
+	})
+}
diff --git a/workspace-server/internal/provisioner/cp_provisioner.go b/workspace-server/internal/provisioner/cp_provisioner.go
index 6f6ae58d..e9cfc8c9 100644
--- a/workspace-server/internal/provisioner/cp_provisioner.go
+++ b/workspace-server/internal/provisioner/cp_provisioner.go
@@ -194,6 +194,9 @@ func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string,
 // blocking the next provision with InvalidGroup.Duplicate — a full
 // "Save & Restart" crash on SaaS.
 func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error {
+	if p == nil {
+		return ErrNoBackend
+	}
 	instanceID, err := resolveInstanceID(ctx, workspaceID)
 	if err != nil {
 		return fmt.Errorf("cp provisioner: stop: resolve instance_id: %w", err)
@@ -201,9 +204,17 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error {
 	if instanceID == "" {
 		// No instance was ever provisioned (or already deprovisioned and
 		// the column was cleared). Nothing to terminate — idempotent.
+		// Reached even when httpClient is nil since the empty-instance
+		// path doesn't need HTTP — symmetric with IsRunning.
 		log.Printf("CP provisioner: Stop for %s — no instance_id on file, nothing to do", workspaceID)
 		return nil
 	}
+	if p.httpClient == nil {
+		// HTTP wiring missing but we have an instance_id to terminate —
+		// can't make the DELETE call. Report ErrNoBackend so the
+		// orphan sweeper / shutdown path can branch.
+		return ErrNoBackend
+	}
 	url := fmt.Sprintf("%s/cp/workspaces/%s?instance_id=%s", p.baseURL, workspaceID, instanceID)
 	req, _ := http.NewRequestWithContext(ctx, "DELETE", url, nil)
 	p.provisionAuthHeaders(req)
@@ -269,6 +280,9 @@ var resolveInstanceID = func(ctx context.Context, workspaceID string) (string, e
 // Both callers are happy with (true, err); callers that need the
 // previous (false, err) shape must inspect err themselves.
 func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
+	if p == nil {
+		return false, ErrNoBackend
+	}
 	instanceID, err := resolveInstanceID(ctx, workspaceID)
 	if err != nil {
 		// Treat DB errors the same as transport errors — (true, err) keeps
@@ -277,9 +291,20 @@ func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool
 	}
 	if instanceID == "" {
 		// No instance recorded. Report "not running" cleanly (no error)
-		// so restart cascades can trigger a fresh provision.
+		// so restart cascades can trigger a fresh provision. This path
+		// is reached even on a zero-valued provisioner (no httpClient
+		// wired) — that's intentional; the resolveInstanceID lookup
+		// goes through the package-level db var, not p.httpClient, so
+		// a no-instance workspace gets a clean answer regardless of
+		// HTTP wiring state.
 		return false, nil
 	}
+	if p.httpClient == nil {
+		// HTTP wiring missing but we have an instance_id to query —
+		// can't proceed without a client. Report ErrNoBackend so the
+		// caller can branch (a2a_proxy keeps alive, healthsweep skips).
+		return false, ErrNoBackend
+	}
 	url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, instanceID)
 	req, _ := http.NewRequestWithContext(ctx, "GET", url, nil)
 	p.provisionAuthHeaders(req)
diff --git a/workspace-server/internal/provisioner/provisioner.go b/workspace-server/internal/provisioner/provisioner.go
index ac04b15f..a57a4cb6 100644
--- a/workspace-server/internal/provisioner/provisioner.go
+++ b/workspace-server/internal/provisioner/provisioner.go
@@ -5,6 +5,7 @@ import (
 	"archive/tar"
 	"bytes"
 	"context"
+	"errors"
 	"fmt"
 	"io"
 	"log"
@@ -24,6 +25,15 @@ import (
 	ocispec "github.com/opencontainers/image-spec/specs-go/v1"
 )
 
+// ErrNoBackend is returned by lifecycle methods (Stop, IsRunning) when
+// the receiver is zero-valued — i.e. no Docker daemon connection has
+// been wired up. The orphan sweeper and the contract-test scaffolding
+// both speculatively call these methods on a possibly-nil receiver;
+// returning a typed error rather than panicking lets callers reason
+// about the case explicitly. See docs/architecture/backends.md
+// drift-risk #6.
+var ErrNoBackend = errors.New("provisioner: no backend configured (zero-valued receiver)")
+
 // RuntimeImages maps runtime names to their Docker image refs on GHCR.
 // Each standalone template repo publishes its image via the reusable
 // publish-template-image workflow in molecule-ci on every main merge.
@@ -823,6 +833,9 @@ func (p *Provisioner) RemoveVolume(ctx context.Context, workspaceID string) erro
 // respawn the container before ContainerRemove runs, leaving a zombie
 // that re-registers via heartbeat after deletion.
 func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
+	if p == nil || p.cli == nil {
+		return ErrNoBackend
+	}
 	name := ContainerName(workspaceID)
 
 	// Force-remove kills and removes in one atomic operation, bypassing
@@ -856,6 +869,9 @@ func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
 // so a real crash still surfaces via exec heartbeat or TTL, both of which
 // have narrower false-positive windows than daemon-inspect RPC.
 func (p *Provisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
+	if p == nil || p.cli == nil {
+		return false, ErrNoBackend
+	}
 	name := ContainerName(workspaceID)
 	info, err := p.cli.ContainerInspect(ctx, name)
 	if err != nil {