fix(restart): retry cpProv.Stop with backoff + flag exhaustion as LEAK-SUSPECT
Both restart paths (interactive Restart handler + auto-restart's stopForRestart) used to log-and-continue on cpProv.Stop failure. After PR #2500 made CPProvisioner.Stop surface CP non-2xx as an error, those paths became the actual leak generator: every transient CP/AWS hiccup = one orphan EC2 alongside the freshly provisioned one. The 13 zombie workspace EC2s on demo-prep staging traced to this exact path.

Adds a cpStopWithRetry helper with bounded exponential backoff (3 attempts, 1s/2s between them). Different policy from workspace_crud.go's Delete handler: Delete returns 500 to the client on Stop failure (loud-fail-and-block — user asked to destroy, silent leak unacceptable), whereas Restart's contract is "make the workspace alive again" — refusing to reprovision strands the user with a dead workspace. So this helper retries to absorb transient failures, then on exhaustion emits a structured `LEAK-SUSPECT` log line for the (forthcoming) CP-side workspace orphan reconciler to correlate. The caller proceeds to reprovision regardless.

ctx-cancel exits the retry early without sleeping the backoff (matters during shutdown drain); the cancel path emits a distinct log line and deliberately does NOT emit LEAK-SUSPECT — operator-cancel and retry-exhaustion are different signals, and conflating them would noise up the orphan-reconciler queue with workspaces we never had a chance to retry.

Tests: 5 behavior tests covering every branch (no-op, first-try success, eventual success, exhaustion, ctx-cancel) + 1 AST gate that pins the helper-only invariant (any future inline `h.cpProv.Stop(...)` in workspace_restart.go fires the gate, mutation-tested).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
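For triage reference, the two terminal log shapes the new helper can emit (values here are illustrative; the formats come from the Printf calls in the diff below):

    Restart: cpProv.Stop(ws-1) abandoned mid-retry: ctx cancelled (last_err=cp 502: bad gateway)
    LEAK-SUSPECT cpProv.Stop workspace_id=ws-1 source=Restart attempts=3 last_err="cp 502: bad gateway"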
commit f18ee8598a
parent 23ee9b5e53
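The LEAK-SUSPECT line is built to be machine-parseable (stable prefix, key=value pairs). Below is a minimal sketch of the correlation step the forthcoming reconciler would need, assuming nothing beyond the format string in the diff; parseLeakSuspect and its naive unquoting are illustrative, not project API:

// Illustrative only — not part of this commit.
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseLeakSuspect returns the key=value fields of one LEAK-SUSPECT
// cpProv.Stop log line, or nil if the line is not such a record.
func parseLeakSuspect(line string) map[string]string {
	const marker = "LEAK-SUSPECT cpProv.Stop "
	i := strings.Index(line, marker)
	if i < 0 {
		return nil
	}
	fields := map[string]string{}
	// 4 fields total; SplitN keeps the (possibly space-containing,
	// %q-quoted) last_err value intact in the final chunk.
	for _, kv := range strings.SplitN(line[i+len(marker):], " ", 4) {
		k, v, ok := strings.Cut(kv, "=")
		if !ok {
			continue
		}
		if u, err := strconv.Unquote(v); err == nil {
			v = u // last_err is written with %q
		}
		fields[k] = v
	}
	return fields
}

func main() {
	line := `2026/05/02 10:01:02 LEAK-SUSPECT cpProv.Stop workspace_id=ws-doomed source=Auto-restart attempts=3 last_err="cp 502: bad gateway"`
	f := parseLeakSuspect(line)
	fmt.Println(f["workspace_id"], f["source"], f["attempts"], f["last_err"])
	// ws-doomed Auto-restart 3 cp 502: bad gateway
}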
workspace_restart.go
@@ -219,9 +219,7 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
 	if h.provisioner != nil {
 		h.provisioner.Stop(bgCtx, id)
 	} else if h.cpProv != nil {
-		if err := h.cpProv.Stop(bgCtx, id); err != nil {
-			log.Printf("Restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", id, err)
-		}
+		h.cpStopWithRetry(bgCtx, id, "Restart")
 	}
 	if h.cpProv != nil {
 		h.provisionWorkspaceCP(id, templatePath, configFiles, payload)
workspace_restart.go
@@ -440,12 +438,72 @@ func (h *WorkspaceHandler) stopForRestart(ctx context.Context, workspaceID strin
 		return
 	}
 	if h.cpProv != nil {
-		if err := h.cpProv.Stop(ctx, workspaceID); err != nil {
-			log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err)
-		}
+		h.cpStopWithRetry(ctx, workspaceID, "Auto-restart")
 	}
 }
+
+// cpStopRetryAttempts caps total Stop attempts (initial + retries). 3 catches
+// the transient CP/AWS hiccups that produce most leaks (one EC2 metadata
+// service stall, one IAM rate-limit blip) without slowing recovery noticeably
+// — worst-case wait is ~3s (1s + 2s backoff) and we run in a detached
+// goroutine, so user UX is unaffected. Package-level so tests can shrink it.
+var cpStopRetryAttempts = 3
+
+// cpStopRetryBaseDelay is the first-retry backoff. Doubles each attempt:
+// 1s then 2s for default attempts=3 (no sleep after the final attempt).
+var cpStopRetryBaseDelay = 1 * time.Second
+
+// cpStopWithRetry wraps cpProv.Stop with bounded exponential backoff for
+// the restart paths. Different policy from workspace_crud.go's Delete:
+// Delete returns 500 to the client on Stop failure (loud-fail-and-block,
+// since the user asked to destroy and silent leak is unacceptable),
+// whereas Restart's contract is "make the workspace alive again" — if we
+// refuse to reprovision when Stop fails, we strand the user with a dead
+// workspace. So this helper retries to absorb transient failures, then on
+// final exhaustion emits a structured `LEAK-SUSPECT` log and returns —
+// the caller proceeds with reprovision regardless. The leak signal is
+// the bridge to the (forthcoming) CP-side workspace orphan reconciler;
+// grep `LEAK-SUSPECT cpProv.Stop` to find affected workspace IDs.
+//
+// source tags the originating path ("Restart" / "Auto-restart") so the
+// log line attributes leaks to the path that produced them.
+//
+// Returns nothing — caller's contract is unchanged.
+func (h *WorkspaceHandler) cpStopWithRetry(ctx context.Context, workspaceID, source string) {
+	if h.cpProv == nil {
+		return
+	}
+	var lastErr error
+	delay := cpStopRetryBaseDelay
+	for attempt := 1; attempt <= cpStopRetryAttempts; attempt++ {
+		err := h.cpProv.Stop(ctx, workspaceID)
+		if err == nil {
+			if attempt > 1 {
+				log.Printf("%s: cpProv.Stop(%s) succeeded on attempt %d", source, workspaceID, attempt)
+			}
+			return
+		}
+		lastErr = err
+		if attempt == cpStopRetryAttempts {
+			break
+		}
+		// Sleep with ctx awareness so a cancelled ctx exits early instead
+		// of stalling the goroutine through the remaining backoff.
+		select {
+		case <-ctx.Done():
+			log.Printf("%s: cpProv.Stop(%s) abandoned mid-retry: ctx cancelled (last_err=%v)",
+				source, workspaceID, lastErr)
+			return
+		case <-time.After(delay):
+		}
+		delay *= 2
+	}
+	// Exhausted. Loud-flag: stable prefix `LEAK-SUSPECT` + key=value pairs
+	// so logs are greppable / parseable for the CP-side orphan reconciler.
+	log.Printf("LEAK-SUSPECT cpProv.Stop workspace_id=%s source=%s attempts=%d last_err=%q",
+		workspaceID, source, cpStopRetryAttempts, lastErr.Error())
+}
+
 // runRestartCycle does the actual stop+provision work for one restart
 // iteration. Synchronous (waits for provisionWorkspace to complete) so the
 // outer pending-flag loop in RestartByID can correctly coalesce — if this
workspace_restart_stop_retry_test.go
@@ -0,0 +1,287 @@
+package handlers
+
+// workspace_restart_stop_retry_test.go — pins the contract of
+// cpStopWithRetry, the helper introduced 2026-05-02 in
+// fix/restart-stop-retry-then-flag.
+//
+// Why this helper exists, in brief: workspace_restart.go's two cpProv.Stop
+// callers (the interactive Restart handler + the auto-restart cycle's
+// stopForRestart) both used to log-and-continue on Stop failure. After
+// PR #2500 made CPProvisioner.Stop surface CP non-2xx as an error, those
+// log-and-continue paths became the actual leak generator: every transient
+// CP/AWS hiccup = one orphan EC2 alongside the freshly provisioned one.
+// 13 zombie workspace EC2s on demo-prep staging traced to this exact path.
+//
+// Helper contract:
+// - bounded retry (default 3 attempts, 1s/2s backoff)
+// - early-exit on ctx cancel (don't stall the goroutine)
+// - on retry exhaustion: loud structured log `LEAK-SUSPECT cpProv.Stop ...`
+// - always returns (no error) — caller proceeds to reprovision regardless,
+//   because Restart's contract is "make the workspace alive again" and
+//   stranding the user with a dead workspace is worse than one leaked EC2
+//   that the CP-side orphan reconciler will catch.
+//
+// Tests below cover every branch:
+// - no-op when cpProv is nil
+// - succeeds on first try (no retry log noise)
+// - succeeds after transient failures (retry log on success)
+// - exhausts retries, emits LEAK-SUSPECT
+// - ctx cancel mid-retry exits early without sleeping the backoff
+//
+// Plus an AST gate that pins the helper-only invariant: any future inline
+// `h.cpProv.Stop(...)` in workspace_restart.go must go through cpStopWithRetry.
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"go/ast"
+	"go/parser"
+	"go/token"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
+)
+
+// scriptedCPStop is a scripted CP stub: Stop returns errs[i] on call i, then
+// nil for any further calls. Lets each test express its retry expectation
+// declaratively without an ad-hoc counter inside the stub.
+type scriptedCPStop struct {
+	errs      []error
+	calls     int
+	stopDelay time.Duration // optional per-call sleep to prove ctx.Done wins
+}
+
+// satisfies provisioner.CPProvisionerAPI for the methods we touch in this test.
+// The other methods are unused; we don't bother stubbing them with state.
+func (s *scriptedCPStop) Stop(ctx context.Context, _ string) error {
+	if s.stopDelay > 0 {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(s.stopDelay):
+		}
+	}
+	i := s.calls
+	s.calls++
+	if i < len(s.errs) {
+		return s.errs[i]
+	}
+	return nil
+}
+func (s *scriptedCPStop) Start(_ context.Context, _ provisioner.WorkspaceConfig) (string, error) {
+	return "", nil
+}
+func (s *scriptedCPStop) IsRunning(_ context.Context, _ string) (bool, error) {
+	return false, nil
+}
+func (s *scriptedCPStop) GetConsoleOutput(_ context.Context, _ string) (string, error) {
+	return "", nil
+}
+
+// captureLog is provided by workspace_provision_panic_test.go in this
+// package — returns a buffer that accumulates log output for the test's
+// lifetime. We don't redeclare it here.
+
+// shrinkRetryBackoff swaps cpStopRetryBaseDelay to a tiny value so retry
+// tests don't burn ~3s of wall time each. Restored on test cleanup.
+func shrinkRetryBackoff(t *testing.T) {
+	t.Helper()
+	prev := cpStopRetryBaseDelay
+	cpStopRetryBaseDelay = 1 * time.Millisecond
+	t.Cleanup(func() { cpStopRetryBaseDelay = prev })
+}
+
+// --- behavior tests ---
+
+func TestCPStopWithRetry_NoOpWhenCPProvNil(t *testing.T) {
+	buf := captureLog(t)
+	h := &WorkspaceHandler{} // cpProv left nil
+	h.cpStopWithRetry(context.Background(), "ws-x", "Restart")
+	if buf.Len() != 0 {
+		t.Errorf("expected silent no-op when cpProv is nil; got log: %q", buf.String())
+	}
+}
+
+func TestCPStopWithRetry_SucceedsOnFirstTry(t *testing.T) {
+	buf := captureLog(t)
+	stub := &scriptedCPStop{}
+	h := &WorkspaceHandler{cpProv: stub}
+	h.cpStopWithRetry(context.Background(), "ws-1", "Restart")
+	if stub.calls != 1 {
+		t.Errorf("expected exactly 1 Stop call on success; got %d", stub.calls)
+	}
+	out := buf.String()
+	if strings.Contains(out, "succeeded on attempt") {
+		t.Errorf("first-try success should not log a retry-success line; got %q", out)
+	}
+	if strings.Contains(out, "LEAK-SUSPECT") {
+		t.Errorf("first-try success must not emit LEAK-SUSPECT; got %q", out)
+	}
+}
+
+func TestCPStopWithRetry_RetriesTransientThenSucceeds(t *testing.T) {
+	shrinkRetryBackoff(t)
+	buf := captureLog(t)
+	stub := &scriptedCPStop{errs: []error{
+		errors.New("transient hiccup"),
+		errors.New("still flaky"),
+	}}
+	h := &WorkspaceHandler{cpProv: stub}
+	h.cpStopWithRetry(context.Background(), "ws-flaky", "Auto-restart")
+	if stub.calls != 3 {
+		t.Errorf("expected 3 Stop calls (2 fails + 1 success); got %d", stub.calls)
+	}
+	out := buf.String()
+	if !strings.Contains(out, "Auto-restart: cpProv.Stop(ws-flaky) succeeded on attempt 3") {
+		t.Errorf("expected eventual-success log; got %q", out)
+	}
+	if strings.Contains(out, "LEAK-SUSPECT") {
+		t.Errorf("eventual success must not emit LEAK-SUSPECT; got %q", out)
+	}
+}
+
+func TestCPStopWithRetry_AllRetriesExhaustEmitsLeakSuspect(t *testing.T) {
+	shrinkRetryBackoff(t)
+	buf := captureLog(t)
+	stub := &scriptedCPStop{errs: []error{
+		errors.New("cp 502 attempt 1"),
+		errors.New("cp 502 attempt 2"),
+		errors.New("cp 502 attempt 3 — final"),
+	}}
+	h := &WorkspaceHandler{cpProv: stub}
+	h.cpStopWithRetry(context.Background(), "ws-doomed", "Auto-restart")
+	if stub.calls != cpStopRetryAttempts {
+		t.Errorf("expected %d Stop calls when all fail; got %d", cpStopRetryAttempts, stub.calls)
+	}
+	out := buf.String()
+	// The LEAK-SUSPECT line is the bridge to the CP-side orphan reconciler.
+	// Assert every key field is present so a future stringer change can't
+	// silently break ops grep / parser.
+	for _, want := range []string{
+		"LEAK-SUSPECT cpProv.Stop",
+		"workspace_id=ws-doomed",
+		"source=Auto-restart",
+		fmt.Sprintf("attempts=%d", cpStopRetryAttempts),
+		"cp 502 attempt 3 — final", // the LAST error, not an earlier one
+	} {
+		if !strings.Contains(out, want) {
+			t.Errorf("LEAK-SUSPECT log missing %q; got %q", want, out)
+		}
+	}
+}
+
+func TestCPStopWithRetry_RespectsContextCancellation(t *testing.T) {
+	// Use the real (long) backoff so the test fails noisily if ctx-cancel
+	// isn't honored: a non-cancelling implementation would block ~1 second
+	// before the second attempt and the elapsed assertion below would fail.
+	buf := captureLog(t)
+	stub := &scriptedCPStop{errs: []error{
+		errors.New("first fail"),
+		errors.New("second fail"),
+		errors.New("third fail"),
+	}}
+	h := &WorkspaceHandler{cpProv: stub}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	// Cancel before the goroutine starts retrying so the very first
+	// post-attempt-1 select hits the ctx.Done branch.
+	cancel()
+
+	start := time.Now()
+	h.cpStopWithRetry(ctx, "ws-cancel", "Restart")
+	elapsed := time.Since(start)
+
+	// One attempt before bailing on cancel — never a second.
+	if stub.calls != 1 {
+		t.Errorf("expected 1 Stop call before ctx-cancel exit; got %d", stub.calls)
+	}
+	// Backoff is 1s minimum; if we slept it the test would take >=1s.
+	if elapsed >= 500*time.Millisecond {
+		t.Errorf("ctx-cancel should exit well under 500ms; took %v (likely slept the backoff)", elapsed)
+	}
+	out := buf.String()
+	if !strings.Contains(out, "abandoned mid-retry: ctx cancelled") {
+		t.Errorf("expected ctx-cancel log line; got %q", out)
+	}
+	if strings.Contains(out, "LEAK-SUSPECT") {
+		// Ctx-cancel is operator-initiated (e.g. shutdown drain). It's
+		// a different signal than "we tried hard and failed" — emitting
+		// LEAK-SUSPECT here would noise up the orphan-reconciler queue
+		// with workspaces we never had a chance to retry. Keep them
+		// distinct in the log so triage doesn't conflate them.
+		t.Errorf("ctx-cancel should NOT emit LEAK-SUSPECT (different signal than retry exhaustion); got %q", out)
+	}
+}
+
+// --- AST gate ---
+//
+// Pins the invariant: in workspace_restart.go, the ONLY direct
+// `h.cpProv.Stop(...)` call lives inside cpStopWithRetry. Any other call
+// is a regression — re-introducing the pre-fix log-and-continue shape that
+// silently leaks an EC2 on every transient CP failure.
+//
+// Same family as TestRestart_StopRunsInsideGoroutine in
+// workspace_restart_async_test.go (behavior-based AST gates beat
+// name-list gates).
+
+func TestRestart_CPStopOnlyInsideRetryHelper(t *testing.T) {
+	t.Parallel()
+
+	fset := token.NewFileSet()
+	f, err := parser.ParseFile(fset, filepath.Join(".", "workspace_restart.go"), nil, 0)
+	if err != nil {
+		t.Fatalf("parse workspace_restart.go: %v", err)
+	}
+
+	type violation struct {
+		fn   string
+		line int
+	}
+	var bad []violation
+
+	for _, decl := range f.Decls {
+		fn, ok := decl.(*ast.FuncDecl)
+		if !ok || fn.Body == nil || fn.Recv == nil {
+			continue
+		}
+		// cpStopWithRetry is the ONE allowed home for h.cpProv.Stop.
+		if fn.Name.Name == "cpStopWithRetry" {
+			continue
+		}
+		ast.Inspect(fn.Body, func(n ast.Node) bool {
+			call, ok := n.(*ast.CallExpr)
+			if !ok {
+				return true
+			}
+			sel, ok := call.Fun.(*ast.SelectorExpr)
+			if !ok || sel.Sel.Name != "Stop" {
+				return true
+			}
+			inner, ok := sel.X.(*ast.SelectorExpr)
+			if !ok || inner.Sel.Name != "cpProv" {
+				return true
+			}
+			bad = append(bad, violation{
+				fn:   fn.Name.Name,
+				line: fset.Position(call.Pos()).Line,
+			})
+			return true
+		})
+	}
+
+	for _, v := range bad {
+		t.Errorf(
+			"workspace_restart.go:%d %s calls h.cpProv.Stop directly. "+
+				"Use h.cpStopWithRetry(ctx, workspaceID, %q) instead — direct calls re-introduce "+
+				"the silent-leak shape that produced the 2026-05-01 demo-prep zombie EC2 incident "+
+				"(13 orphans on a 0-customer staging tenant). cpStopWithRetry adds bounded retry + "+
+				"a LEAK-SUSPECT structured log on exhaustion so the orphan reconciler can correlate. "+
+				"See fix/restart-stop-retry-then-flag (2026-05-02).",
+			v.line, v.fn, v.fn,
+		)
+	}
+}
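To exercise just the new branches locally, an invocation like the following works (illustrative: the diff does not show which directory holds package handlers, so adjust the path):

    go test -v -run 'TestCPStopWithRetry|TestRestart_CPStopOnlyInsideRetryHelper' ./platform/internal/handlers/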