fix(cp-provisioner): surface CP non-2xx on Stop to plug EC2 leak

http.Client.Do only errors on transport failure — a CP 5xx (AWS
hiccup, missing IAM, transient outage) was silently treated as
success. Workspace row then flipped to status='removed' and the EC2
stayed alive forever with no DB pointer (the "orphan EC2 on a
0-customer account" scenario flagged in workspace_crud.go #1843).
Found while triaging 13 zombie workspace EC2s on demo-prep staging.

Adds a status-code check that returns an error tagged with the
workspace ID + status + bounded body excerpt, so the existing
loud-fail path in workspace_crud.go's Delete handler can populate
stop_failures and surface a 500. Body read is io.LimitReader-capped
at 512 bytes to keep error logs sane during a CP outage.

Tests: 4 new (5xx surfaces, 4xx surfaces, 2xx variants 200/202/204
all succeed, long body is truncated). Test-first verified — the
first three fail on the buggy code and all four pass on the fix.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-05-01 22:59:01 -07:00
parent 81ee0cbd55
commit 5167e482d0
2 changed files with 138 additions and 1 deletions

View File

@ -10,6 +10,7 @@ import (
"log"
"net/http"
"os"
"strings"
"time"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
@ -254,7 +255,24 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error {
if err != nil {
return fmt.Errorf("cp provisioner: stop: %w", err)
}
_ = resp.Body.Close()
defer func() { _ = resp.Body.Close() }()
// http.Client.Do only returns err on transport failure; a 4xx/5xx
// response is NOT an error. Without this status check, a CP that
// returns 5xx (AWS hiccup, missing IAM, transient outage) is read
// as success, the workspace row is then marked status='removed' by
// the caller, and the EC2 stays alive forever — there's no DB row
// left to point at the orphan. This is the leak source documented
// in workspace_crud.go's #1843 comment ("orphan EC2 on a
// 0-customer account scenario"); the loud-fail path already exists
// upstream, this just plumbs it through.
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
// Read a bounded slice of the body so the error message gives ops
// enough to triage without risking a multi-MB log line on a
// pathological response. 512 bytes covers any sane error envelope.
body, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
return fmt.Errorf("cp provisioner: stop %s: unexpected %d: %s",
workspaceID, resp.StatusCode, strings.TrimSpace(string(body)))
}
return nil
}

View File

@ -348,6 +348,125 @@ func TestStop_TransportErrorSurfaces(t *testing.T) {
}
}
// TestStop_5xxResponseSurfacesError — pre-fix bug: the CP returning 500
// (AWS hiccup, missing IAM, transient outage) was silently treated as
// success because http.Client.Do only errors on transport failures.
// Result was a leaked workspace EC2 the orphan-report couldn't see.
// Workspace-server's Delete handler depends on this error to populate
// stopErrs and surface a 500 to its client (the loud-fail-instead-of-
// silent-leak choice documented in workspace_crud.go #1843).
func TestStop_5xxResponseSurfacesError(t *testing.T) {
primeInstanceIDLookup(t, map[string]string{"ws-1": "i-leaksrc"})
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
http.Error(w, "ec2 terminate hit IAM rate limit", http.StatusInternalServerError)
}))
defer srv.Close()
p := &CPProvisioner{
baseURL: srv.URL, orgID: "org-1",
sharedSecret: "s", adminToken: "t",
httpClient: srv.Client(),
}
err := p.Stop(context.Background(), "ws-1")
if err == nil {
t.Fatal("Stop swallowed CP 500 — leak source still open")
}
if !strings.Contains(err.Error(), "500") {
t.Errorf("error should contain status code, got %q", err.Error())
}
if !strings.Contains(err.Error(), "ws-1") {
t.Errorf("error should name the workspace for triage, got %q", err.Error())
}
if !strings.Contains(err.Error(), "IAM rate limit") {
t.Errorf("error should include CP body excerpt for triage, got %q", err.Error())
}
}
// TestStop_4xxResponseSurfacesError — same shape as 5xx for the 4xx
// case (e.g. CP returns 404 because the instance_id was already gone,
// or 401 because PROVISION_SHARED_SECRET drifted). 404 in particular
// is a real signal — caller may want to short-circuit, but it must
// NOT be swallowed as success because it leaves the workspace row
// flipped to status='removed' on a different mismatch path.
func TestStop_4xxResponseSurfacesError(t *testing.T) {
primeInstanceIDLookup(t, map[string]string{"ws-1": "i-gone"})
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusNotFound)
_, _ = w.Write([]byte(`{"error":"instance not found"}`))
}))
defer srv.Close()
p := &CPProvisioner{
baseURL: srv.URL, orgID: "org-1",
sharedSecret: "s", adminToken: "t",
httpClient: srv.Client(),
}
err := p.Stop(context.Background(), "ws-1")
if err == nil {
t.Fatal("Stop swallowed CP 404")
}
if !strings.Contains(err.Error(), "404") {
t.Errorf("error should contain status, got %q", err.Error())
}
}
// TestStop_2xxVariantsAllSucceed — ensures the new check accepts the
// full 2xx range. CP currently returns 200, but we have historically
// flipped between 200 and 204 No Content for DELETE; lock both in.
func TestStop_2xxVariantsAllSucceed(t *testing.T) {
primeInstanceIDLookup(t, map[string]string{"ws-1": "i-ok"})
for _, code := range []int{
http.StatusOK, // 200
http.StatusAccepted, // 202
http.StatusNoContent, // 204
} {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(code)
}))
p := &CPProvisioner{
baseURL: srv.URL, orgID: "org-1",
sharedSecret: "s", adminToken: "t",
httpClient: srv.Client(),
}
err := p.Stop(context.Background(), "ws-1")
srv.Close()
if err != nil {
t.Errorf("Stop with CP HTTP %d returned err = %v, want nil", code, err)
}
}
}
// TestStop_LongBodyTruncatedInError — defensive: a pathological CP
// returning a multi-MB body shouldn't blow up the error message. The
// fix uses io.LimitReader(512) so logs stay sane during an outage.
func TestStop_LongBodyTruncatedInError(t *testing.T) {
primeInstanceIDLookup(t, map[string]string{"ws-1": "i-bigbody"})
bigBody := strings.Repeat("x", 10000)
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusBadGateway)
_, _ = io.WriteString(w, bigBody)
}))
defer srv.Close()
p := &CPProvisioner{
baseURL: srv.URL, orgID: "org-1",
sharedSecret: "s", adminToken: "t",
httpClient: srv.Client(),
}
err := p.Stop(context.Background(), "ws-1")
if err == nil {
t.Fatal("Stop swallowed 502")
}
// Error should contain SOME body bytes (proves we read something) but
// not all of it (proves the LimitReader actually capped).
if !strings.Contains(err.Error(), "x") {
t.Errorf("error should include body excerpt, got %q", err.Error())
}
if len(err.Error()) > 1024 {
t.Errorf("error length = %d, want capped well below body length 10000", len(err.Error()))
}
}
// TestIsRunning_ParsesStateField — CP returns the EC2 state, we expose
// a bool ("running"/"pending"/"terminated" → true only for "running").
func TestIsRunning_ParsesStateField(t *testing.T) {