fix(workspace-server): IsRunning surfaces non-2xx + JSON errors

Pre-existing silent-failure path: IsRunning decoded CP responses
regardless of HTTP status, so a CP 500 → empty body → State="" →
returned (false, nil). The sweeper couldn't distinguish "workspace
stopped" from "CP broken" and would leave a dead row in place.

## Fix

  - Non-2xx → wrapped error, does NOT echo body (CP 5xx bodies may
    contain echoed headers; leaking into logs would expose bearer)
  - JSON decode error → wrapped error
  - Transport error → now wrapped with "cp provisioner: status:"
    prefix for easier log grepping

## Tests

+7 cases (5-status table + malformed JSON + existing transport).
IsRunning coverage 100%; overall cp_provisioner at 98%.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-20 08:47:55 -07:00
parent b145a55b06
commit e502003c74
2 changed files with 79 additions and 2 deletions

View File

@ -168,17 +168,31 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error {
}
// IsRunning checks workspace EC2 instance state via the control plane.
//
// Contract:
// - transport error → (false, error)
// - non-2xx HTTP response → (false, error). Previously swallowed;
// a CP 500 would return (false, nil) and the sweeper couldn't
// distinguish "workspace stopped" from "CP broken".
// - 2xx with state!="running" → (false, nil)
// - 2xx with state=="running" → (true, nil)
func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, workspaceID)
req, _ := http.NewRequestWithContext(ctx, "GET", url, nil)
p.authHeaders(req)
resp, err := p.httpClient.Do(req)
if err != nil {
return false, err
return false, fmt.Errorf("cp provisioner: status: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
// Don't leak the body — upstream errors may echo headers.
return false, fmt.Errorf("cp provisioner: status: unexpected %d", resp.StatusCode)
}
var result struct{ State string `json:"state"` }
json.NewDecoder(resp.Body).Decode(&result)
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
return false, fmt.Errorf("cp provisioner: status decode: %w", err)
}
return result.State == "running", nil
}

View File

@ -338,6 +338,69 @@ func TestIsRunning_TransportErrorReturnsFalse(t *testing.T) {
}
}
// TestIsRunning_Non2xxSurfacesError — a CP 500/502/etc. must NOT
// be silently treated as "workspace stopped". Previously the handler
// would decode an empty body → State="" → return (false, nil) and
// the sweeper would see the workspace as not-running. Now every
// non-2xx is an error the caller can log + retry.
func TestIsRunning_Non2xxSurfacesError(t *testing.T) {
cases := []struct {
name string
status int
}{
{"500 internal", 500},
{"502 bad gateway", 502},
{"503 unavailable", 503},
{"401 unauthorized", 401},
{"404 not found", 404},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(tc.status)
_, _ = io.WriteString(w, `{"state":"running"}`) // liar body — must not be trusted
}))
defer srv.Close()
p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()}
got, err := p.IsRunning(context.Background(), "ws-1")
if err == nil {
t.Errorf("status %d: expected error, got nil", tc.status)
}
if got {
t.Errorf("status %d: must not report running=true on non-2xx", tc.status)
}
// Error must NOT echo the upstream body — CP 5xx bodies
// can contain echoed headers and we don't want logs to
// leak bearer tokens.
if err != nil && strings.Contains(err.Error(), "running") {
t.Errorf("status %d: error leaked upstream body: %q", tc.status, err.Error())
}
})
}
}
// TestIsRunning_MalformedJSONBodyReturnsError — 200 but invalid JSON
// must surface an error rather than silently returning false. Prevents
// a middleware glitch (HTML error page with 200) from looking like
// "workspace stopped".
func TestIsRunning_MalformedJSONBodyReturnsError(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
_, _ = io.WriteString(w, "<html>maintenance mode</html>")
}))
defer srv.Close()
p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()}
got, err := p.IsRunning(context.Background(), "ws-1")
if err == nil {
t.Errorf("malformed body: expected error, got nil (got=%v)", got)
}
if got {
t.Errorf("malformed body must not report running=true")
}
}
// TestClose_Noop — explicit contract: Close has no side effects and
// no error. Exists for the Provisioner interface; compliance guard.
func TestClose_Noop(t *testing.T) {