fix(workspace-server): IsRunning surfaces non-2xx + JSON errors
Pre-existing silent-failure path: IsRunning decoded CP responses
regardless of HTTP status, so a CP 500 → empty body → State="" →
returned (false, nil). The sweeper couldn't distinguish "workspace
stopped" from "CP broken" and would leave a dead row in place.
## Fix
- Non-2xx → wrapped error, does NOT echo body (CP 5xx bodies may
contain echoed headers; leaking into logs would expose bearer)
- JSON decode error → wrapped error
- Transport error → now wrapped with "cp provisioner: status:"
prefix for easier log grepping
## Tests
+7 cases (5-status table + malformed JSON + existing transport).
IsRunning coverage 100%; overall cp_provisioner at 98%.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b145a55b06
commit
e502003c74
@ -168,17 +168,31 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error {
|
||||
}
|
||||
|
||||
// IsRunning checks workspace EC2 instance state via the control plane.
|
||||
//
|
||||
// Contract:
|
||||
// - transport error → (false, error)
|
||||
// - non-2xx HTTP response → (false, error). Previously swallowed;
|
||||
// a CP 500 would return (false, nil) and the sweeper couldn't
|
||||
// distinguish "workspace stopped" from "CP broken".
|
||||
// - 2xx with state!="running" → (false, nil)
|
||||
// - 2xx with state=="running" → (true, nil)
|
||||
func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) {
|
||||
url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, workspaceID)
|
||||
req, _ := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||
p.authHeaders(req)
|
||||
resp, err := p.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return false, err
|
||||
return false, fmt.Errorf("cp provisioner: status: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
// Don't leak the body — upstream errors may echo headers.
|
||||
return false, fmt.Errorf("cp provisioner: status: unexpected %d", resp.StatusCode)
|
||||
}
|
||||
var result struct{ State string `json:"state"` }
|
||||
json.NewDecoder(resp.Body).Decode(&result)
|
||||
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
||||
return false, fmt.Errorf("cp provisioner: status decode: %w", err)
|
||||
}
|
||||
return result.State == "running", nil
|
||||
}
|
||||
|
||||
|
||||
@ -338,6 +338,69 @@ func TestIsRunning_TransportErrorReturnsFalse(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestIsRunning_Non2xxSurfacesError — a CP 500/502/etc. must NOT
|
||||
// be silently treated as "workspace stopped". Previously the handler
|
||||
// would decode an empty body → State="" → return (false, nil) and
|
||||
// the sweeper would see the workspace as not-running. Now every
|
||||
// non-2xx is an error the caller can log + retry.
|
||||
func TestIsRunning_Non2xxSurfacesError(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
status int
|
||||
}{
|
||||
{"500 internal", 500},
|
||||
{"502 bad gateway", 502},
|
||||
{"503 unavailable", 503},
|
||||
{"401 unauthorized", 401},
|
||||
{"404 not found", 404},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(tc.status)
|
||||
_, _ = io.WriteString(w, `{"state":"running"}`) // liar body — must not be trusted
|
||||
}))
|
||||
defer srv.Close()
|
||||
p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()}
|
||||
|
||||
got, err := p.IsRunning(context.Background(), "ws-1")
|
||||
if err == nil {
|
||||
t.Errorf("status %d: expected error, got nil", tc.status)
|
||||
}
|
||||
if got {
|
||||
t.Errorf("status %d: must not report running=true on non-2xx", tc.status)
|
||||
}
|
||||
// Error must NOT echo the upstream body — CP 5xx bodies
|
||||
// can contain echoed headers and we don't want logs to
|
||||
// leak bearer tokens.
|
||||
if err != nil && strings.Contains(err.Error(), "running") {
|
||||
t.Errorf("status %d: error leaked upstream body: %q", tc.status, err.Error())
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestIsRunning_MalformedJSONBodyReturnsError — 200 but invalid JSON
|
||||
// must surface an error rather than silently returning false. Prevents
|
||||
// a middleware glitch (HTML error page with 200) from looking like
|
||||
// "workspace stopped".
|
||||
func TestIsRunning_MalformedJSONBodyReturnsError(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(200)
|
||||
_, _ = io.WriteString(w, "<html>maintenance mode</html>")
|
||||
}))
|
||||
defer srv.Close()
|
||||
p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()}
|
||||
|
||||
got, err := p.IsRunning(context.Background(), "ws-1")
|
||||
if err == nil {
|
||||
t.Errorf("malformed body: expected error, got nil (got=%v)", got)
|
||||
}
|
||||
if got {
|
||||
t.Errorf("malformed body must not report running=true")
|
||||
}
|
||||
}
|
||||
|
||||
// TestClose_Noop — explicit contract: Close has no side effects and
|
||||
// no error. Exists for the Provisioner interface; compliance guard.
|
||||
func TestClose_Noop(t *testing.T) {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user