diff --git a/workspace-server/internal/provisioner/cp_provisioner.go b/workspace-server/internal/provisioner/cp_provisioner.go index 6167098b..efc5e6ab 100644 --- a/workspace-server/internal/provisioner/cp_provisioner.go +++ b/workspace-server/internal/provisioner/cp_provisioner.go @@ -168,17 +168,31 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error { } // IsRunning checks workspace EC2 instance state via the control plane. +// +// Contract: +// - transport error → (false, error) +// - non-2xx HTTP response → (false, error). Previously swallowed; +// a CP 500 would return (false, nil) and the sweeper couldn't +// distinguish "workspace stopped" from "CP broken". +// - 2xx with state!="running" → (false, nil) +// - 2xx with state=="running" → (true, nil) func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) { url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, workspaceID) req, _ := http.NewRequestWithContext(ctx, "GET", url, nil) p.authHeaders(req) resp, err := p.httpClient.Do(req) if err != nil { - return false, err + return false, fmt.Errorf("cp provisioner: status: %w", err) } defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + // Don't leak the body — upstream errors may echo headers. + return false, fmt.Errorf("cp provisioner: status: unexpected %d", resp.StatusCode) + } var result struct{ State string `json:"state"` } - json.NewDecoder(resp.Body).Decode(&result) + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return false, fmt.Errorf("cp provisioner: status decode: %w", err) + } return result.State == "running", nil } diff --git a/workspace-server/internal/provisioner/cp_provisioner_test.go b/workspace-server/internal/provisioner/cp_provisioner_test.go index 1fddcbb2..78b5e9d6 100644 --- a/workspace-server/internal/provisioner/cp_provisioner_test.go +++ b/workspace-server/internal/provisioner/cp_provisioner_test.go @@ -338,6 +338,69 @@ func TestIsRunning_TransportErrorReturnsFalse(t *testing.T) { } } +// TestIsRunning_Non2xxSurfacesError — a CP 500/502/etc. must NOT +// be silently treated as "workspace stopped". Previously the handler +// would decode an empty body → State="" → return (false, nil) and +// the sweeper would see the workspace as not-running. Now every +// non-2xx is an error the caller can log + retry. +func TestIsRunning_Non2xxSurfacesError(t *testing.T) { + cases := []struct { + name string + status int + }{ + {"500 internal", 500}, + {"502 bad gateway", 502}, + {"503 unavailable", 503}, + {"401 unauthorized", 401}, + {"404 not found", 404}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(tc.status) + _, _ = io.WriteString(w, `{"state":"running"}`) // liar body — must not be trusted + })) + defer srv.Close() + p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()} + + got, err := p.IsRunning(context.Background(), "ws-1") + if err == nil { + t.Errorf("status %d: expected error, got nil", tc.status) + } + if got { + t.Errorf("status %d: must not report running=true on non-2xx", tc.status) + } + // Error must NOT echo the upstream body — CP 5xx bodies + // can contain echoed headers and we don't want logs to + // leak bearer tokens. + if err != nil && strings.Contains(err.Error(), "running") { + t.Errorf("status %d: error leaked upstream body: %q", tc.status, err.Error()) + } + }) + } +} + +// TestIsRunning_MalformedJSONBodyReturnsError — 200 but invalid JSON +// must surface an error rather than silently returning false. Prevents +// a middleware glitch (HTML error page with 200) from looking like +// "workspace stopped". +func TestIsRunning_MalformedJSONBodyReturnsError(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(200) + _, _ = io.WriteString(w, "maintenance mode") + })) + defer srv.Close() + p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()} + + got, err := p.IsRunning(context.Background(), "ws-1") + if err == nil { + t.Errorf("malformed body: expected error, got nil (got=%v)", got) + } + if got { + t.Errorf("malformed body must not report running=true") + } +} + // TestClose_Noop — explicit contract: Close has no side effects and // no error. Exists for the Provisioner interface; compliance guard. func TestClose_Noop(t *testing.T) {