diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go
index e5cdd5c5..eef2fc64 100644
--- a/workspace-server/internal/handlers/workspace_restart.go
+++ b/workspace-server/internal/handlers/workspace_restart.go
@@ -219,9 +219,7 @@ func (h *WorkspaceHandler) Restart(c *gin.Context) {
 	if h.provisioner != nil {
 		h.provisioner.Stop(bgCtx, id)
 	} else if h.cpProv != nil {
-		if err := h.cpProv.Stop(bgCtx, id); err != nil {
-			log.Printf("Restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", id, err)
-		}
+		h.cpStopWithRetry(bgCtx, id, "Restart")
 	}
 	if h.cpProv != nil {
 		h.provisionWorkspaceCP(id, templatePath, configFiles, payload)
@@ -440,12 +438,72 @@ func (h *WorkspaceHandler) stopForRestart(ctx context.Context, workspaceID strin
 		return
 	}
 	if h.cpProv != nil {
-		if err := h.cpProv.Stop(ctx, workspaceID); err != nil {
-			log.Printf("Auto-restart: cpProv.Stop(%s) failed: %v (continuing to reprovision)", workspaceID, err)
-		}
+		h.cpStopWithRetry(ctx, workspaceID, "Auto-restart")
 	}
 }
 
+// cpStopRetryAttempts caps total Stop attempts (initial + retries). 3 catches
+// the transient CP/AWS hiccups that produce most leaks (one EC2 metadata
+// service stall, one IAM rate-limit blip) without slowing recovery noticeably
+// — worst-case wait is ~3s (1s + 2s backoff) and we run in a detached
+// goroutine, so user UX is unaffected. Package-level so tests can shrink it.
+var cpStopRetryAttempts = 3
+
+// cpStopRetryBaseDelay is the first-retry backoff. Doubles after each failed
+// attempt: 1s then 2s for the default attempts=3 (no sleep after the last try).
+var cpStopRetryBaseDelay = 1 * time.Second
+
+// cpStopWithRetry wraps cpProv.Stop with bounded exponential backoff for
+// the restart paths. Different policy from workspace_crud.go's Delete:
+// Delete returns 500 to the client on Stop failure (loud-fail-and-block,
+// since the user asked to destroy and a silent leak is unacceptable),
+// whereas Restart's contract is "make the workspace alive again" — if we
+// refuse to reprovision when Stop fails, we strand the user with a dead
+// workspace. So this helper retries to absorb transient failures, then on
+// final exhaustion emits a structured `LEAK-SUSPECT` log and returns —
+// the caller proceeds with reprovision regardless. The leak signal is
+// the bridge to the (forthcoming) CP-side workspace orphan reconciler;
+// grep `LEAK-SUSPECT cpProv.Stop` to find affected workspace IDs.
+//
+// source tags the originating path ("Restart" / "Auto-restart") so the
+// log line attributes leaks to the path that produced them.
+//
+// Returns nothing — the caller's contract is unchanged.
+func (h *WorkspaceHandler) cpStopWithRetry(ctx context.Context, workspaceID, source string) {
+	if h.cpProv == nil {
+		return
+	}
+	var lastErr error
+	delay := cpStopRetryBaseDelay
+	for attempt := 1; attempt <= cpStopRetryAttempts; attempt++ {
+		err := h.cpProv.Stop(ctx, workspaceID)
+		if err == nil {
+			if attempt > 1 {
+				log.Printf("%s: cpProv.Stop(%s) succeeded on attempt %d", source, workspaceID, attempt)
+			}
+			return
+		}
+		lastErr = err
+		if attempt == cpStopRetryAttempts {
+			break
+		}
+		// Sleep with ctx awareness so a cancelled ctx exits early instead
+		// of stalling the goroutine through the remaining backoff.
+		select {
+		case <-ctx.Done():
+			log.Printf("%s: cpProv.Stop(%s) abandoned mid-retry: ctx cancelled (last_err=%v)",
+				source, workspaceID, lastErr)
+			return
+		case <-time.After(delay):
+		}
+		delay *= 2
+	}
+	// Exhausted. Loud-flag: stable prefix `LEAK-SUSPECT` + key=value pairs
+	// so logs are greppable / parseable for the CP-side orphan reconciler.
+	log.Printf("LEAK-SUSPECT cpProv.Stop workspace_id=%s source=%s attempts=%d last_err=%q",
+		workspaceID, source, cpStopRetryAttempts, lastErr.Error())
+}
+
 // runRestartCycle does the actual stop+provision work for one restart
 // iteration. Synchronous (waits for provisionWorkspace to complete) so the
 // outer pending-flag loop in RestartByID can correctly coalesce — if this
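Reviewer note (not part of the diff): the LEAK-SUSPECT line above is the contract
the forthcoming CP-side orphan reconciler will consume. Below is a minimal sketch
of that consumer; parseLeakSuspect and the reconciler package are hypothetical,
only the log format is defined by this PR:

    package reconciler

    import "regexp"

    // Matches the exact shape emitted by cpStopWithRetry:
    //   LEAK-SUSPECT cpProv.Stop workspace_id=<id> source=<path> attempts=<n> last_err="<msg>"
    // last_err is rendered with %q, so embedded quotes arrive backslash-escaped.
    var leakSuspectRe = regexp.MustCompile(
    	`LEAK-SUSPECT cpProv\.Stop workspace_id=(\S+) source=(\S+) attempts=(\d+) last_err="((?:[^"\\]|\\.)*)"`,
    )

    // parseLeakSuspect extracts the workspace ID from one log line, or
    // reports false if the line is not a LEAK-SUSPECT record.
    func parseLeakSuspect(line string) (workspaceID string, ok bool) {
    	m := leakSuspectRe.FindStringSubmatch(line)
    	if m == nil {
    		return "", false
    	}
    	return m[1], true
    }

Until the reconciler lands, `grep 'LEAK-SUSPECT cpProv.Stop'` over the
workspace-server logs yields the same workspace IDs by hand.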
diff --git a/workspace-server/internal/handlers/workspace_restart_stop_retry_test.go b/workspace-server/internal/handlers/workspace_restart_stop_retry_test.go
new file mode 100644
index 00000000..1104feec
--- /dev/null
+++ b/workspace-server/internal/handlers/workspace_restart_stop_retry_test.go
@@ -0,0 +1,287 @@
+package handlers
+
+// workspace_restart_stop_retry_test.go — pins the contract of
+// cpStopWithRetry, the helper introduced 2026-05-02 in
+// fix/restart-stop-retry-then-flag.
+//
+// Why this helper exists, in brief: workspace_restart.go's two cpProv.Stop
+// callers (the interactive Restart handler + the auto-restart cycle's
+// stopForRestart) both used to log-and-continue on Stop failure. After
+// PR #2500 made CPProvisioner.Stop surface CP non-2xx as an error, those
+// log-and-continue paths became the actual leak generator: every transient
+// CP/AWS hiccup = one orphan EC2 alongside the freshly provisioned one.
+// 13 zombie workspace EC2s on demo-prep staging traced to this exact path.
+//
+// Helper contract:
+//   - bounded retry (default 3 attempts, 1s/2s backoff)
+//   - early exit on ctx cancel (don't stall the goroutine)
+//   - on retry exhaustion: loud structured log `LEAK-SUSPECT cpProv.Stop ...`
+//   - always returns (no error) — caller proceeds to reprovision regardless,
+//     because Restart's contract is "make the workspace alive again" and
+//     stranding the user with a dead workspace is worse than one leaked EC2
+//     that the CP-side orphan reconciler will catch.
+//
+// Tests below cover every branch:
+//   - no-op when cpProv is nil
+//   - succeeds on first try (no retry log noise)
+//   - succeeds after transient failures (retry log on success)
+//   - exhausts retries, emits LEAK-SUSPECT
+//   - ctx cancel mid-retry exits early without sleeping the backoff
+//
+// Plus an AST gate that pins the helper-only invariant: any future inline
+// `h.cpProv.Stop(...)` in workspace_restart.go must go through cpStopWithRetry.
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"go/ast"
+	"go/parser"
+	"go/token"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
+)
+
+// scriptedCPStop is a scripted CP-provisioner stub: Stop returns errs[i] on
+// call i, then nil for any further calls. Lets each test express its retry
+// expectation declaratively without an ad-hoc counter inside the stub.
+type scriptedCPStop struct {
+	errs      []error
+	calls     int
+	stopDelay time.Duration // optional per-call sleep to prove ctx.Done wins
+}
+
+// Satisfies provisioner.CPProvisionerAPI for the methods this test touches.
+// The other methods are unused; we don't bother stubbing them with state.
+func (s *scriptedCPStop) Stop(ctx context.Context, _ string) error {
+	if s.stopDelay > 0 {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(s.stopDelay):
+		}
+	}
+	i := s.calls
+	s.calls++
+	if i < len(s.errs) {
+		return s.errs[i]
+	}
+	return nil
+}
+func (s *scriptedCPStop) Start(_ context.Context, _ provisioner.WorkspaceConfig) (string, error) {
+	return "", nil
+}
+func (s *scriptedCPStop) IsRunning(_ context.Context, _ string) (bool, error) {
+	return false, nil
+}
+func (s *scriptedCPStop) GetConsoleOutput(_ context.Context, _ string) (string, error) {
+	return "", nil
+}
+
+// captureLog is provided by workspace_provision_panic_test.go in this
+// package — returns a buffer that accumulates log output for the test's
+// lifetime. We don't redeclare it here.
+
+// shrinkRetryBackoff swaps cpStopRetryBaseDelay to a tiny value so retry
+// tests don't burn ~3s of wall time. Restored on test cleanup.
+func shrinkRetryBackoff(t *testing.T) {
+	t.Helper()
+	prev := cpStopRetryBaseDelay
+	cpStopRetryBaseDelay = 1 * time.Millisecond
+	t.Cleanup(func() { cpStopRetryBaseDelay = prev })
+}
+
+// --- behavior tests ---
+
+func TestCPStopWithRetry_NoOpWhenCPProvNil(t *testing.T) {
+	buf := captureLog(t)
+	h := &WorkspaceHandler{} // cpProv left nil
+	h.cpStopWithRetry(context.Background(), "ws-x", "Restart")
+	if buf.Len() != 0 {
+		t.Errorf("expected silent no-op when cpProv is nil; got log: %q", buf.String())
+	}
+}
+
+func TestCPStopWithRetry_SucceedsOnFirstTry(t *testing.T) {
+	buf := captureLog(t)
+	stub := &scriptedCPStop{}
+	h := &WorkspaceHandler{cpProv: stub}
+	h.cpStopWithRetry(context.Background(), "ws-1", "Restart")
+	if stub.calls != 1 {
+		t.Errorf("expected exactly 1 Stop call on success; got %d", stub.calls)
+	}
+	out := buf.String()
+	if strings.Contains(out, "succeeded on attempt") {
+		t.Errorf("first-try success should not log a retry-success line; got %q", out)
+	}
+	if strings.Contains(out, "LEAK-SUSPECT") {
+		t.Errorf("first-try success must not emit LEAK-SUSPECT; got %q", out)
+	}
+}
+
+func TestCPStopWithRetry_RetriesTransientThenSucceeds(t *testing.T) {
+	shrinkRetryBackoff(t)
+	buf := captureLog(t)
+	stub := &scriptedCPStop{errs: []error{
+		errors.New("transient hiccup"),
+		errors.New("still flaky"),
+	}}
+	h := &WorkspaceHandler{cpProv: stub}
+	h.cpStopWithRetry(context.Background(), "ws-flaky", "Auto-restart")
+	if stub.calls != 3 {
+		t.Errorf("expected 3 Stop calls (2 fails + 1 success); got %d", stub.calls)
+	}
+	out := buf.String()
+	if !strings.Contains(out, "Auto-restart: cpProv.Stop(ws-flaky) succeeded on attempt 3") {
+		t.Errorf("expected eventual-success log; got %q", out)
+	}
+	if strings.Contains(out, "LEAK-SUSPECT") {
+		t.Errorf("eventual success must not emit LEAK-SUSPECT; got %q", out)
+	}
+}
+
+func TestCPStopWithRetry_AllRetriesExhaustEmitsLeakSuspect(t *testing.T) {
+	shrinkRetryBackoff(t)
+	buf := captureLog(t)
+	stub := &scriptedCPStop{errs: []error{
+		errors.New("cp 502 attempt 1"),
+		errors.New("cp 502 attempt 2"),
+		errors.New("cp 502 attempt 3 — final"),
+	}}
+	h := &WorkspaceHandler{cpProv: stub}
+	h.cpStopWithRetry(context.Background(), "ws-doomed", "Auto-restart")
+	if stub.calls != cpStopRetryAttempts {
+		t.Errorf("expected %d Stop calls when all fail; got %d", cpStopRetryAttempts, stub.calls)
+	}
+	out := buf.String()
+	// The LEAK-SUSPECT line is the bridge to the CP-side orphan reconciler.
+	// Assert every key field is present so a future stringer change can't
+	// silently break ops grep / parser.
+	for _, want := range []string{
+		"LEAK-SUSPECT cpProv.Stop",
+		"workspace_id=ws-doomed",
+		"source=Auto-restart",
+		fmt.Sprintf("attempts=%d", cpStopRetryAttempts),
+		"cp 502 attempt 3 — final", // the LAST error, not an earlier one
+	} {
+		if !strings.Contains(out, want) {
+			t.Errorf("LEAK-SUSPECT log missing %q; got %q", want, out)
+		}
+	}
+}
+
+func TestCPStopWithRetry_RespectsContextCancellation(t *testing.T) {
+	// Use the real (long) backoff so the test fails noisily if ctx-cancel
+	// isn't honored: a non-cancelling implementation would block ~1 second
+	// before the second attempt and the elapsed assertion below would fail.
+	buf := captureLog(t)
+	stub := &scriptedCPStop{errs: []error{
+		errors.New("first fail"),
+		errors.New("second fail"),
+		errors.New("third fail"),
+	}}
+	h := &WorkspaceHandler{cpProv: stub}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	// Cancel before the helper starts retrying so the very first
+	// post-attempt-1 select hits the ctx.Done branch.
+	cancel()
+
+	start := time.Now()
+	h.cpStopWithRetry(ctx, "ws-cancel", "Restart")
+	elapsed := time.Since(start)
+
+	// One attempt before bailing on cancel — never a second.
+	if stub.calls != 1 {
+		t.Errorf("expected 1 Stop call before ctx-cancel exit; got %d", stub.calls)
+	}
+	// Backoff is 1s minimum; if we slept it, the test would take >=1s.
+	if elapsed >= 500*time.Millisecond {
+		t.Errorf("ctx-cancel should exit well under 500ms; took %v (likely slept the backoff)", elapsed)
+	}
+	out := buf.String()
+	if !strings.Contains(out, "abandoned mid-retry: ctx cancelled") {
+		t.Errorf("expected ctx-cancel log line; got %q", out)
+	}
+	if strings.Contains(out, "LEAK-SUSPECT") {
+		// Ctx-cancel is operator-initiated (e.g. shutdown drain). It's
+		// a different signal than "we tried hard and failed" — emitting
+		// LEAK-SUSPECT here would noise up the orphan-reconciler queue
+		// with workspaces we never had a chance to retry. Keep them
+		// distinct in the log so triage doesn't conflate them.
+		t.Errorf("ctx-cancel should NOT emit LEAK-SUSPECT (different signal than retry exhaustion); got %q", out)
+	}
+}
+
+// --- AST gate ---
+//
+// Pins the invariant: in workspace_restart.go, the ONLY direct
+// `h.cpProv.Stop(...)` call lives inside cpStopWithRetry. Any other call
+// is a regression — re-introducing the pre-fix log-and-continue shape that
+// silently leaks an EC2 on every transient CP failure.
+//
+// Same family as TestRestart_StopRunsInsideGoroutine in
+// workspace_restart_async_test.go (per earlier review feedback:
+// behavior-based AST gates beat name-list gates).
+
+func TestRestart_CPStopOnlyInsideRetryHelper(t *testing.T) {
+	t.Parallel()
+
+	fset := token.NewFileSet()
+	f, err := parser.ParseFile(fset, filepath.Join(".", "workspace_restart.go"), nil, 0)
+	if err != nil {
+		t.Fatalf("parse workspace_restart.go: %v", err)
+	}
+
+	type violation struct {
+		fn   string
+		line int
+	}
+	var bad []violation
+
+	for _, decl := range f.Decls {
+		fn, ok := decl.(*ast.FuncDecl)
+		if !ok || fn.Body == nil || fn.Recv == nil {
+			continue
+		}
+		// cpStopWithRetry is the ONE allowed home for h.cpProv.Stop.
+		if fn.Name.Name == "cpStopWithRetry" {
+			continue
+		}
+		ast.Inspect(fn.Body, func(n ast.Node) bool {
+			call, ok := n.(*ast.CallExpr)
+			if !ok {
+				return true
+			}
+			sel, ok := call.Fun.(*ast.SelectorExpr)
+			if !ok || sel.Sel.Name != "Stop" {
+				return true
+			}
+			inner, ok := sel.X.(*ast.SelectorExpr)
+			if !ok || inner.Sel.Name != "cpProv" {
+				return true
+			}
+			bad = append(bad, violation{
+				fn:   fn.Name.Name,
+				line: fset.Position(call.Pos()).Line,
+			})
+			return true
+		})
+	}
+
+	for _, v := range bad {
+		t.Errorf(
+			"workspace_restart.go:%d %s calls h.cpProv.Stop directly. "+
+				"Use h.cpStopWithRetry(ctx, workspaceID, %q) instead — direct calls re-introduce "+
+				"the silent-leak shape that produced the 2026-05-01 demo-prep zombie EC2 incident "+
+				"(13 orphans on a 0-customer staging tenant). cpStopWithRetry adds bounded retry + "+
+				"a LEAK-SUSPECT structured log on exhaustion so the orphan reconciler can correlate. "+
+				"See fix/restart-stop-retry-then-flag (2026-05-02).",
+			v.line, v.fn, v.fn,
+		)
+	}
+}
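Reviewer note (not part of the diff): captureLog is referenced above but defined
in workspace_provision_panic_test.go, which this PR does not touch. For context,
a helper of that name typically has the shape below; this is a hypothetical
sketch, assuming the handlers package logs through the standard library logger
(as the log.Printf calls in the diff suggest), and the real implementation may
differ:

    package handlers

    import (
    	"bytes"
    	"log"
    	"testing"
    )

    // captureLog (sketch) redirects the standard logger into a buffer for the
    // test's lifetime and restores the previous writer and flags on cleanup.
    func captureLog(t *testing.T) *bytes.Buffer {
    	t.Helper()
    	var buf bytes.Buffer
    	prevWriter := log.Writer()
    	prevFlags := log.Flags()
    	log.SetOutput(&buf)
    	log.SetFlags(0) // drop timestamps so substring assertions see raw text
    	t.Cleanup(func() {
    		log.SetOutput(prevWriter)
    		log.SetFlags(prevFlags)
    	})
    	return &buf
    }

To run just the new behavior tests and the AST gate from the repo root:

    go test -run 'TestCPStopWithRetry|TestRestart_CPStopOnlyInsideRetryHelper' \
    	./workspace-server/internal/handlers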