fix(workspace-crud): propagate Stop errors on delete (closes #1843)

`Delete`'s call to `h.provisioner.Stop()` was silently swallowing
errors, and on the SaaS/EC2 backend, Stop() is the call that
terminates the EC2 via the control plane. When Stop returned an
error (CP transient 5xx, network blip), the workspace was marked
'removed' in the DB but the EC2 stayed running with no row left to
track it. The "14 orphan workspace EC2s on a 0-customer account"
incident in #1843 (40 vCPU against a 64 vCPU AWS limit) traced
back to this silent-leak path.

This change aggregates Stop errors across both the descendant and
self-stop calls and surfaces them to the client as a 500, matching
the loud-fail pattern from CP #262 (DeprovisionInstance) and the
DNS cleanup propagation (#269).
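
The aggregation leans on the standard library's errors.Join
(Go 1.20+). A standalone sketch of the behaviour the 500 body
relies on; the workspace IDs here are illustrative:

    // Not part of the change: demonstrates errors.Join as used above.
    package main

    import (
        "errors"
        "fmt"
    )

    func main() {
        var stopErrs []error
        // No failures: Join returns nil (the handler also guards with
        // len(stopErrs) > 0, so an empty slice never builds a 500).
        fmt.Println(errors.Join(stopErrs...)) // <nil>

        stopErrs = append(stopErrs,
            fmt.Errorf("stop descendant ws-child: CP returned 502"),
            fmt.Errorf("stop ws-parent: context deadline exceeded"))
        // The joined message lists one failure per line; this is what
        // lands in the 500 response's "error" field.
        fmt.Println(errors.Join(stopErrs...))
    }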

Idempotency:
- The DB row is already 'removed' before Stop runs (intentional,
  per #73 — guards against register/heartbeat resurrection).
- `resolveInstanceID` reads instance_id without a status filter,
  so a retry can replay Stop with the same instance_id.
- CP's TerminateInstance is idempotent on already-terminated EC2s.
- So a retry-after-500 either re-attempts the terminate (succeeds)
  or finds the instance already gone (also succeeds); see the
  sketch below.
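
To make the retry story concrete, a compile-only sketch of the Stop
shape those bullets describe. Assumptions are labelled: cpClient and
TerminateInstance stand in for the real control-plane client, and a
map stands in for the workspaces table's instance_id column:

    // Sketch only; the shapes here are assumed stand-ins, not the real code.
    package sketch

    import (
        "context"
        "fmt"
    )

    // cpClient is a hypothetical stand-in for the control-plane client.
    type cpClient interface {
        // Idempotent on the CP side: terminating an already-terminated
        // EC2 reports success rather than an error.
        TerminateInstance(ctx context.Context, instanceID string) error
    }

    type Provisioner struct {
        cp          cpClient
        instanceIDs map[string]string // stand-in for the instance_id column
    }

    // resolveInstanceID reads instance_id with no status filter, so it
    // still resolves after the row is marked 'removed'.
    func (p *Provisioner) resolveInstanceID(workspaceID string) (string, error) {
        id, ok := p.instanceIDs[workspaceID]
        if !ok {
            return "", fmt.Errorf("no instance recorded for workspace %s", workspaceID)
        }
        return id, nil
    }

    func (p *Provisioner) Stop(ctx context.Context, workspaceID string) error {
        instanceID, err := p.resolveInstanceID(workspaceID)
        if err != nil {
            return fmt.Errorf("resolve instance for %s: %w", workspaceID, err)
        }
        // A retry after a 500 replays this with the same instance_id:
        // the terminate is re-attempted, or the CP reports the instance
        // already gone. Both count as success.
        return p.cp.TerminateInstance(ctx, instanceID)
    }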

Behaviour change at the API layer:
- Before: 200 `{"status":"removed","cascade_deleted":N}` regardless
  of Stop outcome.
- After: 500 `{"error":"...","removed_count":N,"stop_failures":K}`
  on Stop failure; 200 on success (caller sketch below).
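
For callers the practical consequence is that DELETE is no longer
fire-and-forget. A hypothetical client-side retry loop (function
name, attempt count, and backoff are all illustrative, not part of
this change):

    // Retries the DELETE on 500, leaning on the handler's idempotency:
    // the row is already 'removed' and the terminate is replayable.
    package client

    import (
        "context"
        "fmt"
        "net/http"
        "time"
    )

    func deleteWorkspace(ctx context.Context, hc *http.Client, url string) error {
        for attempt := 0; attempt < 3; attempt++ {
            req, err := http.NewRequestWithContext(ctx, http.MethodDelete, url, nil)
            if err != nil {
                return err
            }
            resp, err := hc.Do(req)
            if err != nil {
                return err
            }
            resp.Body.Close()
            if resp.StatusCode == http.StatusOK {
                return nil // removed; all Stop calls succeeded
            }
            if resp.StatusCode != http.StatusInternalServerError {
                return fmt.Errorf("delete failed: %s", resp.Status)
            }
            // 500 means "marked removed, but a Stop call failed": safe
            // to replay, since the CP-side terminate is idempotent.
            time.Sleep(time.Second << attempt)
        }
        return fmt.Errorf("delete still failing after retries")
    }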

RemoveVolume errors stay log-and-continue — those are local
/var/data cleanup, not infra-leak class.

Test debt acknowledged: the WorkspaceHandler's `provisioner` field
is the concrete `*provisioner.Provisioner` type, not an interface.
Adding a regression test for the new error-propagation path
requires either a refactor (introduce a Provisioner interface) or
a docker-backed integration test. Filing the refactor as a
follow-up, sketched below; the change here is small and mirrors a
proven pattern (CP #262 and #269 both shipped without exhaustive
new test coverage for the same reason).
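
For the record, a sketch of that follow-up refactor (hypothetical
names; only the handler field's type would change in production code):

    // Narrow the handler's dependency to the two calls Delete makes;
    // *provisioner.Provisioner already satisfies this interface.
    package handlers

    import "context"

    type workspaceProvisioner interface {
        Stop(ctx context.Context, id string) error
        RemoveVolume(ctx context.Context, id string) error
    }

    // In the test file: force Stop to fail, then assert Delete answers
    // 500 with stop_failures > 0 while the row stays 'removed'.
    type stubProvisioner struct{ stopErr error }

    func (s *stubProvisioner) Stop(ctx context.Context, id string) error {
        return s.stopErr
    }

    func (s *stubProvisioner) RemoveVolume(ctx context.Context, id string) error {
        return nil
    }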

Verified:
- go build ./... clean
- go vet ./... clean
- go test ./... green across the whole module (existing TestDelete
  cases keep their happy-path behaviour unchanged)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

@@ -6,6 +6,7 @@ package handlers
 import (
 	"database/sql"
 	"errors"
+	"fmt"
 	"log"
 	"net/http"
@@ -388,9 +389,24 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
 	// Now stop containers + remove volumes for all descendants (any depth).
 	// Any concurrent heartbeat / registration / liveness-triggered restart
 	// will see status='removed' and bail out early.
+	//
+	// #1843: Stop() errors used to be silently swallowed. On the CP/EC2
+	// backend, Stop() calls the control plane's DELETE workspaces endpoint
+	// to terminate the EC2; if that errors (CP transient 5xx, network),
+	// the EC2 stays running with no DB row to track it: the
+	// "14 orphan workspace EC2s on a 0-customer account" scenario.
+	// Aggregate Stop failures and surface them as 500 so the client can
+	// retry. The retry replays Stop with the same instance_id (still
+	// readable from the row even after status='removed'), which is
+	// idempotent on the CP side. RemoveVolume errors stay log-and-continue:
+	// those are local cleanup of /var/data, not infra-leak class.
+	var stopErrs []error
 	for _, descID := range descendantIDs {
 		if h.provisioner != nil {
-			h.provisioner.Stop(ctx, descID)
+			if err := h.provisioner.Stop(ctx, descID); err != nil {
+				log.Printf("Delete descendant %s stop error: %v", descID, err)
+				stopErrs = append(stopErrs, fmt.Errorf("stop descendant %s: %w", descID, err))
+			}
 			if err := h.provisioner.RemoveVolume(ctx, descID); err != nil {
 				log.Printf("Delete descendant %s volume removal warning: %v", descID, err)
 			}
@@ -401,7 +417,10 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
 
 	// Stop + remove volume for the workspace itself
 	if h.provisioner != nil {
-		h.provisioner.Stop(ctx, id)
+		if err := h.provisioner.Stop(ctx, id); err != nil {
+			log.Printf("Delete %s stop error: %v", id, err)
+			stopErrs = append(stopErrs, fmt.Errorf("stop %s: %w", id, err))
+		}
		if err := h.provisioner.RemoveVolume(ctx, id); err != nil {
 			log.Printf("Delete %s volume removal warning: %v", id, err)
 		}
@@ -410,8 +429,23 @@ func (h *WorkspaceHandler) Delete(c *gin.Context) {
+	// If any Stop call failed, surface 500 so the client retries. The DB
+	// row is already 'removed' (idempotent), and Stop's instance_id
+	// lookup tolerates that: the retry replays the terminate. This is
+	// the loud-fail-instead-of-silent-leak choice; users see a 500
+	// instead of an orphaned EC2.
+	if len(stopErrs) > 0 {
+		c.JSON(http.StatusInternalServerError, gin.H{
+			"error": fmt.Sprintf("workspace marked removed, but %d stop call(s) failed; please retry: %v",
+				len(stopErrs), errors.Join(stopErrs...)),
+			"removed_count": len(allIDs),
+			"stop_failures": len(stopErrs),
+		})
+		return
+	}
+
 	c.JSON(http.StatusOK, gin.H{
 		"status":          "removed",
 		"cascade_deleted": len(descendantIDs),
 	})
 
 	// Hard purge: cascade delete all FK data and remove the DB row entirely (#1087)
 	if c.Query("purge") == "true" {
 		purgeIDs := pq.Array(allIDs)