From 54e86549ee7ad383c358800eb9b4ac72a987e05b Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Sun, 26 Apr 2026 01:28:50 -0700
Subject: [PATCH] fix(workspace-crud): propagate Stop errors on delete (closes
 #1843)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`Delete`'s call to `h.provisioner.Stop()` was silently swallowing
errors — and on the SaaS/EC2 backend, Stop() is the call that
terminates the EC2 via the control plane. When Stop returned an error
(CP transient 5xx, network blip), the workspace was marked 'removed'
in the DB but the EC2 stayed running with no row to track it. The
"14 orphan workspace EC2s on a 0-customer account" incident in #1843
(40 vCPU on a 64 vCPU AWS limit) traced to this silent-leak path.

This change aggregates Stop errors across both descendant and
self-stop calls and surfaces them as 500 to the client, matching the
loud-fail pattern from CP #262 (DeprovisionInstance) and the DNS
cleanup propagation (#269).

Idempotency:
- The DB row is already 'removed' before Stop runs (intentional, per
  #73 — guards against register/heartbeat resurrection).
- `resolveInstanceID` reads instance_id without a status filter, so a
  retry can replay Stop with the same instance_id.
- CP's TerminateInstance is idempotent on already-terminated EC2s.
- So a retry-after-500 either re-attempts the terminate (succeeds) or
  finds the instance already gone (also succeeds).

Behaviour change at the API layer:
- Before: 200 `{"status":"removed","cascade_deleted":N}` regardless of
  Stop outcome.
- After: 500 `{"error":"...","removed_count":N,"stop_failures":K}` on
  Stop failure; 200 on success.

RemoveVolume errors stay log-and-continue — those are local /var/data
cleanup, not infra-leak class.

Test debt acknowledged: the WorkspaceHandler's `provisioner` field is
the concrete `*provisioner.Provisioner` type, not an interface.
Adding a regression test for the new error-propagation path requires
either a refactor (introduce a Provisioner interface) or a
docker-backed integration test. Filing the refactor as a follow-up;
the change here is small and mirrors a proven pattern (CP #262 + #269
both ship without exhaustive new test coverage for the same reason).

Verified:
- go build ./... clean
- go vet ./... clean
- go test ./... green across the whole module (existing TestDelete
  cases unchanged behaviour for happy path)

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../internal/handlers/workspace_crud.go | 38 ++++++++++++++++++-
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/workspace-server/internal/handlers/workspace_crud.go b/workspace-server/internal/handlers/workspace_crud.go
index 1d4281838..b7e83a74a 100644
--- a/workspace-server/internal/handlers/workspace_crud.go
+++ b/workspace-server/internal/handlers/workspace_crud.go
@@ -6,6 +6,7 @@ package handlers
 
 import (
 	"database/sql"
+	"errors"
 	"fmt"
 	"log"
 	"net/http"
@@ -388,9 +389,24 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
 	// Now stop containers + remove volumes for all descendants (any depth).
 	// Any concurrent heartbeat / registration / liveness-triggered restart
 	// will see status='removed' and bail out early.
+	//
+	// #1843: Stop() errors used to be silently swallowed. On the CP/EC2
+	// backend, Stop() calls the control plane's DELETE workspaces endpoint
+	// to terminate the EC2; if that errors (CP transient 5xx, network),
+	// the EC2 stays running with no DB row to track it — the
+	// "14 orphan workspace EC2s on a 0-customer account" scenario.
+	// Aggregate Stop failures and surface them as 500 so the client can
+	// retry. The retry replays Stop with the same instance_id (still
+	// readable from the row even after status='removed') — idempotent on
+	// the CP side. RemoveVolume errors stay log-and-continue: those are
+	// local cleanup of /var/data, not infra-leak class.
+	var stopErrs []error
 	for _, descID := range descendantIDs {
 		if h.provisioner != nil {
-			h.provisioner.Stop(ctx, descID)
+			if err := h.provisioner.Stop(ctx, descID); err != nil {
+				log.Printf("Delete descendant %s stop error: %v", descID, err)
+				stopErrs = append(stopErrs, fmt.Errorf("stop descendant %s: %w", descID, err))
+			}
 			if err := h.provisioner.RemoveVolume(ctx, descID); err != nil {
 				log.Printf("Delete descendant %s volume removal warning: %v", descID, err)
 			}
@@ -401,17 +417,35 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
 
 	// Stop + remove volume for the workspace itself
 	if h.provisioner != nil {
-		h.provisioner.Stop(ctx, id)
+		if err := h.provisioner.Stop(ctx, id); err != nil {
+			log.Printf("Delete %s stop error: %v", id, err)
+			stopErrs = append(stopErrs, fmt.Errorf("stop %s: %w", id, err))
+		}
 		if err := h.provisioner.RemoveVolume(ctx, id); err != nil {
 			log.Printf("Delete %s volume removal warning: %v", id, err)
 		}
 	}
 
+	// If any Stop call failed, surface 500 so the client retries — checked
+	// BEFORE the success response is written: gin commits the status line on
+	// the first c.JSON, so a later 500 would be dropped and the client would
+	// still see 200. The DB row is already 'removed' (idempotent) and the
+	// retry replays the terminate; a 500 beats a silently orphaned EC2.
+	if len(stopErrs) > 0 {
+		c.JSON(http.StatusInternalServerError, gin.H{
+			"error": fmt.Sprintf("workspace marked removed, but %d stop call(s) failed — please retry: %v",
+				len(stopErrs), errors.Join(stopErrs...)),
+			"removed_count": len(allIDs),
+			"stop_failures": len(stopErrs),
+		})
+		return
+	}
+
 	c.JSON(http.StatusOK, gin.H{
 		"status":          "removed",
 		"cascade_deleted": len(descendantIDs),
 	})
 
 	// Hard purge: cascade delete all FK data and remove the DB row entirely (#1087)
 	if c.Query("purge") == "true" {
 		purgeIDs := pq.Array(allIDs)