From 925a71887d5a4e1e98ce7b4675eb267be613793f Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 23 Apr 2026 11:14:55 -0700 Subject: [PATCH 1/2] fix(workspace): credential helper security hardening (#1797) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four findings from security audit (internal/security/credential-token-backlog.md): 1. STDERR LEAK — molecule-git-token-helper.sh:146,153 logged ${response} on platform errors. The response body MAY contain the token in some failure modes (alternate JSON key shape on partial success). Now: - capture curl's stderr to a tmp file (not $response) so we can log the curl error message without ever interpolating the response body (caveat: curl runs with -s and no -S/--show-error, so the captured stderr is typically empty on failure — the logged curl rc is the actionable signal; follow-up: add -S to curl so the error text is actually emitted) - on empty-token branch, log only response size (bytes) for debug 2. CHMOD 600 — already in place at lines 116, 124, 223 (verified, no change) 3. RESPAWN SUPERVISION — entrypoint.sh wrapped daemon launch in a while-true bash loop with 30s back-off. Without this, a daemon crash silently leaves the workspace stuck on an expired token until the container restarts. Logs to /home/agent/.gh-token-refresh.log (agent-writable; /var/log is root-owned). 4. JITTER — molecule-gh-token-refresh.sh: added 0..120s random offset to each sleep so 39 containers don't synchronize their refresh requests against the platform endpoint. Also: - Daemon now sends helper output to /dev/null instead of merging stderr, belt-and-suspenders against any future helper change that might write the token to stdout. - Daemon log lines include rc=$? on failure for actionable triage. Inherent risks (org-wide token blast, prompt-injection theft, bearer in volume, no audit log) tracked in internal/security/credential-token-backlog.md as separate roadmap items. 
Co-authored-by: Claude Opus 4.6 (1M context) Co-authored-by: molecule-ai[bot] <276602405+molecule-ai[bot]@users.noreply.github.com> --- workspace/entrypoint.sh | 15 ++++-- .../scripts/molecule-gh-token-refresh.sh | 51 ++++++++++--------- .../scripts/molecule-git-token-helper.sh | 19 +++++-- 3 files changed, 55 insertions(+), 30 deletions(-) diff --git a/workspace/entrypoint.sh b/workspace/entrypoint.sh index da36fc4e..8b83ddc1 100644 --- a/workspace/entrypoint.sh +++ b/workspace/entrypoint.sh @@ -71,11 +71,20 @@ fi # Now running as agent (uid 1000) -# --- Start background token refresh daemon --- +# --- Start background token refresh daemon (with respawn supervision) --- # Keeps gh CLI and git credentials fresh across the 60-min token TTL. -# Runs in the background; entrypoint continues to exec molecule-runtime. +# Wrapped in a respawn loop so a daemon crash doesn't silently leave the +# workspace stuck on an expired token. Runs in the background; entrypoint +# continues to exec molecule-runtime. if [ -x /app/scripts/molecule-gh-token-refresh.sh ]; then - nohup /app/scripts/molecule-gh-token-refresh.sh > /dev/null 2>&1 & + nohup bash -c ' + while true; do + /app/scripts/molecule-gh-token-refresh.sh + rc=$? + echo "[molecule-gh-token-refresh] daemon exited rc=$rc — respawning in 30s" >&2 + sleep 30 + done + ' > /home/agent/.gh-token-refresh.log 2>&1 & fi # --- Initial gh auth setup --- diff --git a/workspace/scripts/molecule-gh-token-refresh.sh b/workspace/scripts/molecule-gh-token-refresh.sh index 87c4d8a1..e7f4587e 100755 --- a/workspace/scripts/molecule-gh-token-refresh.sh +++ b/workspace/scripts/molecule-gh-token-refresh.sh @@ -2,53 +2,56 @@ # molecule-gh-token-refresh.sh — background daemon that keeps GitHub # credentials fresh inside Molecule AI workspace containers. # -# Runs as a background process started by entrypoint.sh. Every -# REFRESH_INTERVAL_SEC (default 45 min = 2700s) it calls the credential -# helper's _refresh_gh action which: -# 1. 
Fetches a fresh installation token from the platform API -# 2. Updates the local cache (used by git credential helper) -# 3. Runs `gh auth login --with-token` so `gh` CLI stays authenticated -# 4. Writes ~/.gh_token for any scripts that read it +# Started by entrypoint.sh under a respawn wrapper. Every +# REFRESH_INTERVAL_SEC + jitter (default 45 min plus 0..2 min) it calls the +# credential helper's _refresh_gh action. -# The daemon logs to stderr (captured by Docker) and is designed to be -# fire-and-forget — if a single refresh fails, it logs the error and -# retries on the next interval. The credential helper itself has a -# fallback chain (cache > API > env var) so a missed refresh is not -# immediately fatal. +# # Jitter +# A 0..120s random offset (strictly additive, never negative) prevents 39 +# containers from synchronizing +# their refresh requests against /workspaces/:id/github-installation-token. -# -# Usage (from entrypoint.sh): -# nohup /app/scripts/molecule-gh-token-refresh.sh & +# # Security +# - This daemon NEVER prints token values. Failures log the helper's +# exit code only, not its stderr, so token bytes can't leak via the +# docker log pipeline. +# - The helper script is responsible for chmod 600 on cache files. # set -uo pipefail -HELPER_SCRIPT="/app/scripts/molecule-git-token-helper.sh" +HELPER_SCRIPT="${TOKEN_HELPER_SCRIPT:-/app/scripts/molecule-git-token-helper.sh}" REFRESH_INTERVAL_SEC="${TOKEN_REFRESH_INTERVAL_SEC:-2700}" # 45 min +JITTER_MAX_SEC="${TOKEN_REFRESH_JITTER_SEC:-120}" +INITIAL_DELAY_SEC="${TOKEN_REFRESH_INITIAL_DELAY_SEC:-60}" log() { echo "[molecule-gh-token-refresh] $(date -u '+%Y-%m-%dT%H:%M:%SZ') $*" >&2 } -# Wait a short time before the first refresh to let the container finish -# booting and .auth_token to be written by the runtime's register call. 
-INITIAL_DELAY_SEC="${TOKEN_REFRESH_INITIAL_DELAY_SEC:-60}" -log "starting (interval=${REFRESH_INTERVAL_SEC}s, initial_delay=${INITIAL_DELAY_SEC}s)" +jittered_sleep() { + local base="$1" + local jitter=$((RANDOM % (JITTER_MAX_SEC + 1))) + sleep $((base + jitter)) +} + +log "starting (interval=${REFRESH_INTERVAL_SEC}s ± ${JITTER_MAX_SEC}s, initial_delay=${INITIAL_DELAY_SEC}s)" sleep "${INITIAL_DELAY_SEC}" # Initial refresh — prime the cache + gh auth immediately after boot. +# Discard helper output to /dev/null so token can't leak via docker logs. log "initial token refresh" -if bash "${HELPER_SCRIPT}" _refresh_gh 2>&1; then +if bash "${HELPER_SCRIPT}" _refresh_gh >/dev/null 2>&1; then log "initial refresh succeeded" else - log "initial refresh failed (will retry in ${REFRESH_INTERVAL_SEC}s)" + log "initial refresh failed (rc=$?) — will retry in ~${REFRESH_INTERVAL_SEC}s" fi # Steady-state loop. while true; do - sleep "${REFRESH_INTERVAL_SEC}" + jittered_sleep "${REFRESH_INTERVAL_SEC}" log "periodic token refresh" - if bash "${HELPER_SCRIPT}" _refresh_gh 2>&1; then + if bash "${HELPER_SCRIPT}" _refresh_gh >/dev/null 2>&1; then log "refresh succeeded" else - log "refresh failed (will retry in ${REFRESH_INTERVAL_SEC}s)" + log "refresh failed (rc=$?) — will retry in ~${REFRESH_INTERVAL_SEC}s" fi done diff --git a/workspace/scripts/molecule-git-token-helper.sh b/workspace/scripts/molecule-git-token-helper.sh index e79bc14a..0faab0fc 100755 --- a/workspace/scripts/molecule-git-token-helper.sh +++ b/workspace/scripts/molecule-git-token-helper.sh @@ -138,19 +138,32 @@ _fetch_token_from_api() { return 1 fi + # NOTE: capture stderr to a tmp file (NOT $response) so the response + # body — which contains the token on success — never lands in error + # log lines via $response interpolation. 
+ local _err_file + _err_file=$(mktemp) response=$(curl -sf \ -H "Authorization: Bearer ${bearer}" \ -H "Accept: application/json" \ --max-time 10 \ - "${ENDPOINT}" 2>&1) || { - echo "[molecule-git-token-helper] platform request failed: ${response}" >&2 + "${ENDPOINT}" 2>"${_err_file}") || { + local _curl_rc=$? + local _err_msg + _err_msg=$(cat "${_err_file}") + rm -f "${_err_file}" + echo "[molecule-git-token-helper] platform request failed (curl rc=${_curl_rc}): ${_err_msg}" >&2 return 1 } + rm -f "${_err_file}" # Parse {"token":"ghs_...","expires_at":"..."} with sed (no jq dependency). token=$(echo "${response}" | sed -n 's/.*"token":"\([^"]*\)".*/\1/p') if [ -z "${token}" ]; then - echo "[molecule-git-token-helper] empty token in platform response: ${response}" >&2 + # SECURITY: the response body MAY contain a token under a different + # JSON key name. Never include $response in this error message — + # log only the size as a coarse debugging signal. + echo "[molecule-git-token-helper] empty token in platform response (body=${#response} bytes)" >&2 return 1 fi From c23ff848aaac7f8a4f12ebe85b466d80a81a7f97 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 23 Apr 2026 11:25:29 -0700 Subject: [PATCH 2/2] fix(cp-provisioner): look up real EC2 instance_id for Stop + IsRunning (#1738) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves a "Save & Restart cascade" failure on SaaS tenants. Observed 2026-04-22 on hongmingwang workspace a8af9d79 after a Config-tab save: 03:13:20 workspace deprovision: TerminateInstances InvalidInstanceID.Malformed: a8af9d79-... is malformed 03:13:21 workspace provision: CreateSecurityGroup InvalidGroup.Duplicate: workspace-a8af9d79-394 already exists for VPC vpc-09f85513b85d7acee Root cause: CPProvisioner.Stop and IsRunning passed the workspace UUID as the `instance_id` query param to CP. CP forwarded it to EC2 TerminateInstances, which rejected it (EC2 ids are i-…, not UUIDs). 
The failed terminate left the workspace's SG attached → the immediate re-provision hit InvalidGroup.Duplicate → user saw `provisioning failed`. Fix: both methods now call a new `resolveInstanceID` that reads `workspaces.instance_id` from the tenant DB and passes the real EC2 id downstream. When no row / no instance_id exists, Stop is a no-op and IsRunning returns (false, nil) so restart cascades can freshly re-provision. resolveInstanceID is exposed as a `var` package-level func so tests can swap it for a pairs-map stub without standing up sqlmock — the per-table DB scaffolding was a heavier price than the surface warranted given these tests are about the CP HTTP flow downstream of the lookup, not the lookup SQL itself. Adds regression tests: - TestStop_EmptyInstanceIDIsNoop: no DB row → no CP call - TestIsRunning_UsesDBInstanceID: DB id round-trips to CP - TestIsRunning_EmptyInstanceIDReturnsFalse: no instance → false/nil Updates existing tests to assert the resolved instance_id (i-abc123 variants) instead of the previous buggy workspaceID. After this lands, user's existing workspaces with stale instance_id bindings still need a manual cleanup of the orphaned EC2 + SG (done for a8af9d79 today). Future restarts use the correct id. 
Co-authored-by: Hongming Wang Co-authored-by: Claude Opus 4.7 (1M context) --- .../internal/provisioner/cp_provisioner.go | 65 +++++++++++- .../provisioner/cp_provisioner_test.go | 99 ++++++++++++++++++- 2 files changed, 160 insertions(+), 4 deletions(-) diff --git a/workspace-server/internal/provisioner/cp_provisioner.go b/workspace-server/internal/provisioner/cp_provisioner.go index 0c0e6c9c..6f6ae58d 100644 --- a/workspace-server/internal/provisioner/cp_provisioner.go +++ b/workspace-server/internal/provisioner/cp_provisioner.go @@ -3,6 +3,7 @@ package provisioner import ( "bytes" "context" + "database/sql" "encoding/json" "fmt" "io" @@ -10,6 +11,8 @@ import ( "net/http" "os" "time" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" ) // CPProvisioner provisions workspace agents by calling the control plane's @@ -182,8 +185,26 @@ func (p *CPProvisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, } // Stop terminates the workspace's EC2 instance via the control plane. +// +// Looks up the actual EC2 instance_id from the workspaces table before +// calling CP — earlier versions passed workspaceID (a UUID) as the +// instance_id query param, which CP forwarded to EC2 TerminateInstances, +// which rejected with InvalidInstanceID.Malformed (EC2 IDs are i-… not +// UUIDs). The terminate failure then left the workspace's SG attached, +// blocking the next provision with InvalidGroup.Duplicate — a full +// "Save & Restart" crash on SaaS. func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error { - url := fmt.Sprintf("%s/cp/workspaces/%s?instance_id=%s", p.baseURL, workspaceID, workspaceID) + instanceID, err := resolveInstanceID(ctx, workspaceID) + if err != nil { + return fmt.Errorf("cp provisioner: stop: resolve instance_id: %w", err) + } + if instanceID == "" { + // No instance was ever provisioned (or already deprovisioned and + // the column was cleared). Nothing to terminate — idempotent. 
+ log.Printf("CP provisioner: Stop for %s — no instance_id on file, nothing to do", workspaceID) + return nil + } + url := fmt.Sprintf("%s/cp/workspaces/%s?instance_id=%s", p.baseURL, workspaceID, instanceID) req, _ := http.NewRequestWithContext(ctx, "DELETE", url, nil) p.provisionAuthHeaders(req) resp, err := p.httpClient.Do(req) @@ -194,6 +215,35 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error { return nil } +// resolveInstanceID reads workspaces.instance_id for the given workspace. +// Returns ("", nil) when the row exists but has no instance_id recorded +// (edge case for external workspaces or stale rows). Returns an error +// only on real DB failures, not on missing rows — callers (Stop, +// IsRunning) treat the empty string as "nothing to act on." +// +// Exposed as a package var so tests can substitute a stub without +// standing up a sqlmock just to unblock the Stop/IsRunning code path. +// Production code never reassigns it. +var resolveInstanceID = func(ctx context.Context, workspaceID string) (string, error) { + if db.DB == nil { + // Defensive: NewCPProvisioner never runs without db.DB being + // set in main(). If somehow nil, treat as "no instance" rather + // than panicking in the Stop/IsRunning path. + return "", nil + } + var instanceID sql.NullString + err := db.DB.QueryRowContext(ctx, + `SELECT instance_id FROM workspaces WHERE id = $1`, workspaceID, + ).Scan(&instanceID) + if err != nil && err != sql.ErrNoRows { + return "", err + } + if !instanceID.Valid { + return "", nil + } + return instanceID.String, nil +} + // IsRunning checks workspace EC2 instance state via the control plane. // // Contract (matches the Docker Provisioner.IsRunning contract — @@ -219,7 +269,18 @@ func (p *CPProvisioner) Stop(ctx context.Context, workspaceID string) error { // Both callers are happy with (true, err); callers that need the // previous (false, err) shape must inspect err themselves. 
func (p *CPProvisioner) IsRunning(ctx context.Context, workspaceID string) (bool, error) { - url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, workspaceID) + instanceID, err := resolveInstanceID(ctx, workspaceID) + if err != nil { + // Treat DB errors the same as transport errors — (true, err) keeps + // a2a_proxy on the alive path and logs the signal. + return true, fmt.Errorf("cp provisioner: status: resolve instance_id: %w", err) + } + if instanceID == "" { + // No instance recorded. Report "not running" cleanly (no error) + // so restart cascades can trigger a fresh provision. + return false, nil + } + url := fmt.Sprintf("%s/cp/workspaces/%s/status?instance_id=%s", p.baseURL, workspaceID, instanceID) req, _ := http.NewRequestWithContext(ctx, "GET", url, nil) p.provisionAuthHeaders(req) resp, err := p.httpClient.Do(req) diff --git a/workspace-server/internal/provisioner/cp_provisioner_test.go b/workspace-server/internal/provisioner/cp_provisioner_test.go index 247863e3..c8553adf 100644 --- a/workspace-server/internal/provisioner/cp_provisioner_test.go +++ b/workspace-server/internal/provisioner/cp_provisioner_test.go @@ -11,6 +11,23 @@ import ( "time" ) +// primeInstanceIDLookup swaps resolveInstanceID for a stub that returns +// the mapped instance_id for the given workspace_id, or "" for anything +// not in the map. Cheaper than standing up a sqlmock since Stop/IsRunning +// tests mostly don't care about the SQL path — they're testing the CP +// HTTP interaction downstream of the lookup. 
+func primeInstanceIDLookup(t *testing.T, pairs map[string]string) { + t.Helper() + prev := resolveInstanceID + resolveInstanceID = func(_ context.Context, wsID string) (string, error) { + if id, ok := pairs[wsID]; ok { + return id, nil + } + return "", nil + } + t.Cleanup(func() { resolveInstanceID = prev }) +} + // TestNewCPProvisioner_RequiresOrgID — self-hosted deployments don't // have a MOLECULE_ORG_ID, and the provisioner must refuse to construct // rather than silently phone home to the prod CP with an empty tenant. @@ -267,6 +284,12 @@ func TestStart_TransportFailureSurfaces(t *testing.T) { // platform-wide shared secret AND the per-tenant admin token, or the // CP will 401. func TestStop_SendsBothAuthHeaders(t *testing.T) { + // resolveInstanceID looks up the real EC2 id from the workspaces + // table; previously this test passed when the tenant buggily + // reused workspaceID AS instance_id. Now we assert the correct + // EC2 id round-trips. + primeInstanceIDLookup(t, map[string]string{"ws-1": "i-abc123"}) + var sawBearer, sawAdminToken, sawMethod, sawPath string var sawInstance string srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -295,8 +318,8 @@ func TestStop_SendsBothAuthHeaders(t *testing.T) { if sawPath != "/cp/workspaces/ws-1" { t.Errorf("path = %q, want /cp/workspaces/ws-1", sawPath) } - if sawInstance != "ws-1" { - t.Errorf("instance_id query = %q, want ws-1", sawInstance) + if sawInstance != "i-abc123" { + t.Errorf("instance_id query = %q, want i-abc123 (from DB lookup, NOT the workspace UUID)", sawInstance) } if sawBearer != "Bearer s3cret" { t.Errorf("bearer = %q, want Bearer s3cret", sawBearer) @@ -310,6 +333,7 @@ func TestStop_SendsBothAuthHeaders(t *testing.T) { // teardown call hits a dead CP, the error must surface so the caller // knows the workspace might still be running and needs retry. 
func TestStop_TransportErrorSurfaces(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{"ws-1": "i-abc123"}) p := &CPProvisioner{ baseURL: "http://127.0.0.1:1", orgID: "org-1", @@ -327,6 +351,7 @@ func TestStop_TransportErrorSurfaces(t *testing.T) { // TestIsRunning_ParsesStateField — CP returns the EC2 state, we expose // a bool ("running"/"pending"/"terminated" → true only for "running"). func TestIsRunning_ParsesStateField(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{"ws-1": "i-parsesstatefield"}) cases := map[string]bool{ "running": true, "pending": false, @@ -364,6 +389,7 @@ func TestIsRunning_ParsesStateField(t *testing.T) { // require the same per-tenant auth because they leak public_ip + // private_ip to the caller. func TestIsRunning_SendsBothAuthHeaders(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{"ws-1": "i-auth-headers"}) var sawBearer, sawAdminToken string srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { sawBearer = r.Header.Get("Authorization") @@ -398,6 +424,7 @@ func TestIsRunning_SendsBothAuthHeaders(t *testing.T) { // The sweeper (healthsweep.go) inspects err independently and skips // on any error, so (true, err) is equally safe for that caller. func TestIsRunning_TransportErrorReturnsTrue(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{"ws-1": "i-transport"}) p := &CPProvisioner{ baseURL: "http://127.0.0.1:1", orgID: "org-1", @@ -418,6 +445,7 @@ func TestIsRunning_TransportErrorReturnsTrue(t *testing.T) { // the sweeper would see the workspace as not-running. Now every // non-2xx is an error the caller can log + retry. func TestIsRunning_Non2xxSurfacesError(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{"ws-1": "i-non2xx"}) cases := []struct { name string status int @@ -459,6 +487,7 @@ func TestIsRunning_Non2xxSurfacesError(t *testing.T) { // a middleware glitch (HTML error page with 200) from looking like // "workspace stopped". 
func TestIsRunning_MalformedJSONBodyReturnsError(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{"ws-1": "i-malformed"}) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) _, _ = io.WriteString(w, "maintenance mode") @@ -486,6 +515,7 @@ func TestIsRunning_MalformedJSONBodyReturnsError(t *testing.T) { // the documented contract values rather than simulating the whole // a2a_proxy flow. func TestIsRunning_ContractCompat_A2AProxy(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{"ws-1": "i-contract"}) // Simulate every error path and assert running==true for each. t.Run("transport error", func(t *testing.T) { p := &CPProvisioner{ @@ -559,6 +589,7 @@ func TestIsRunning_ContractCompat_A2AProxy(t *testing.T) { // only needs the prefix to produce a value, so the decode succeeds — // and the LimitReader enforces the cap regardless. func TestIsRunning_BoundedBodyRead(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{"ws-1": "i-bounded"}) srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(200) // Valid JSON prefix, then pad well past the 64 KiB cap. @@ -628,3 +659,67 @@ func TestGetConsoleOutput_UsesAdminBearer(t *testing.T) { t.Errorf("bearer = %q, want Bearer admin-api-key (NOT the provision secret)", sawBearer) } } + +// TestStop_EmptyInstanceIDIsNoop — when workspaces.instance_id is NULL +// (e.g. a row that was never provisioned, or deprovisioned and cleared), +// Stop should be a no-op instead of sending a malformed CP request. +// Regression guard: previously Stop sent workspaceID as instance_id +// even when no EC2 had been booked, causing CP → EC2 to 400. +func TestStop_EmptyInstanceIDIsNoop(t *testing.T) { + // Empty map → lookup returns ("", nil) for any workspace. 
+ primeInstanceIDLookup(t, map[string]string{}) + + hit := false + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + hit = true + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()} + if err := p.Stop(context.Background(), "ws-ghost"); err != nil { + t.Fatalf("Stop with empty instance_id should no-op, got err %v", err) + } + if hit { + t.Errorf("Stop contacted CP even though instance_id was empty") + } +} + +// TestIsRunning_UsesDBInstanceID — IsRunning must also look up +// instance_id from the workspaces table, same as Stop. Mirror of +// TestStop_SendsBothAuthHeaders. +func TestIsRunning_UsesDBInstanceID(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{"ws-1": "i-xyz789"}) + + var sawInstance string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + sawInstance = r.URL.Query().Get("instance_id") + w.WriteHeader(http.StatusOK) + _, _ = io.WriteString(w, `{"state":"running"}`) + })) + defer srv.Close() + + p := &CPProvisioner{baseURL: srv.URL, orgID: "org-1", httpClient: srv.Client()} + if _, err := p.IsRunning(context.Background(), "ws-1"); err != nil { + t.Fatalf("IsRunning: %v", err) + } + if sawInstance != "i-xyz789" { + t.Errorf("instance_id query = %q, want i-xyz789 (from DB lookup)", sawInstance) + } +} + +// TestIsRunning_EmptyInstanceIDReturnsFalse — IsRunning on a +// workspace with no recorded EC2 instance must report (false, nil) so +// restart cascades re-provision fresh instead of looping on a stale +// row with no backing instance. 
+func TestIsRunning_EmptyInstanceIDReturnsFalse(t *testing.T) { + primeInstanceIDLookup(t, map[string]string{}) + p := &CPProvisioner{baseURL: "http://unused", orgID: "org-1"} + running, err := p.IsRunning(context.Background(), "ws-ghost") + if err != nil { + t.Errorf("IsRunning with empty instance_id should return (false, nil), got err %v", err) + } + if running { + t.Errorf("IsRunning with empty instance_id should return running=false, got true") + } +}