From 536b51cbc886438857d18fed626d54ad81f2b1a1 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sun, 7 Jun 2026 07:36:19 +0000 Subject: [PATCH 1/6] fix(provision): fail-closed on instance_id persist failure to prevent EC2 orphan (#1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When cpProv.Start succeeds but the DB UPDATE for instance_id fails, workspace row lacks the instance_id. Without it, later Stop/IsRunning lookups return empty, leaving the EC2 instance orphaned (untouchable and unbilled). Make the persist failure fatal: 1. Mark the workspace failed via markProvisionFailed so the operator sees the problem and can retry. The instance_id is logged prominently so an operator can manually reconcile. 2. DO NOT auto-terminate the live EC2 — the instance may contain valuable state the operator wants to recover. The CP orphan sweeper will handle cleanup if the workspace is later removed. Fixes ticket #1 from Researcher cleanup audit. --- .../internal/handlers/workspace_provision.go | 17 +++-- .../handlers/workspace_provision_test.go | 64 +++++++++++++++++++ 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index 8dc4f921e..dc3d81b1f 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -1396,10 +1396,19 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET instance_id = $2, updated_at = now() WHERE id = $1`, workspaceID, machineID); err != nil { - // Non-fatal: provisioning succeeded, the workspace will still run. - // The row stays without instance_id — terminal falls back to the - // "CP-provisioned but unreachable" error, not a silent failure. - log.Printf("CPProvisioner: persist instance_id failed for %s: %v", workspaceID, err) + // Fatal: without instance_id in the DB we cannot Stop the EC2 later + // (resolveInstanceID returns ""). We do NOT auto-terminate the live + // instance — it may contain valuable state the operator wants to recover. + // Instead, log the instance_id prominently and mark the workspace failed + // so the operator can manually reconcile. The CP orphan sweeper will + // eventually clean up if the workspace is later removed. (#1 Researcher + // cleanup audit) + log.Printf("CPProvisioner: CRITICAL persist instance_id failed for %s: %v — EC2 instance %s is RUNNING but UNTRACKED. Operator must manually reconcile or remove the workspace to trigger orphan cleanup.", workspaceID, err, machineID) + h.markProvisionFailed(ctx, workspaceID, "instance_id persist failed — EC2 untracked", map[string]interface{}{ + "error": err.Error(), + "instance_id": machineID, + }) + return } log.Printf("CPProvisioner: workspace %s started as machine %s via control plane", workspaceID, machineID) diff --git a/workspace-server/internal/handlers/workspace_provision_test.go b/workspace-server/internal/handlers/workspace_provision_test.go index 2418f50a4..13f398f66 100644 --- a/workspace-server/internal/handlers/workspace_provision_test.go +++ b/workspace-server/internal/handlers/workspace_provision_test.go @@ -1760,6 +1760,70 @@ func (m *mockResolver) Fetch(_ context.Context, _, _ string) (string, error) { return m.fetchName, m.fetchErr } +// TestProvisionWorkspaceCP_InstanceIDPersistFail_MarksFailed asserts that +// when cpProv.Start succeeds but the DB UPDATE for instance_id fails, the +// handler marks the workspace failed WITHOUT terminating the live EC2 (the +// instance may contain valuable state; operator decides). Regression test for +// ticket #1 (Researcher cleanup audit). +func TestProvisionWorkspaceCP_InstanceIDPersistFail_MarksFailed(t *testing.T) { + t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1") + t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token") + + mock := setupTestDB(t) + + mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM global_secrets`). + WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"})) + mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM workspace_secrets`). + WithArgs("ws-cp-orphan"). + WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"})) + + // instance_id persist fails + mock.ExpectExec(`UPDATE workspaces SET instance_id =`). + WithArgs("ws-cp-orphan", "i-12345"). + WillReturnError(fmt.Errorf("connection reset by peer")) + + // markProvisionFailed updates status to failed + mock.ExpectExec(`UPDATE workspaces SET status =`). + WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + + cap := &captureBroadcaster{} + stub := &stubInstanceIDPersistFailCPProv{instanceID: "i-12345"} + handler := NewWorkspaceHandler(cap, nil, "http://localhost:8080", t.TempDir()) + handler.SetCPProvisioner(stub) + + handler.provisionWorkspaceCP("ws-cp-orphan", "/nonexistent/template", nil, models.CreateWorkspacePayload{ + Name: "ws-cp-orphan", + Tier: 1, + Runtime: "claude-code", + }) + + if cap.lastData == nil { + t.Fatal("expected RecordAndBroadcast to capture data on persist failure; got nothing") + } + if got := cap.lastData["error"]; got != "instance_id persist failed — EC2 untracked" { + t.Errorf("broadcast error message = %q, want 'instance_id persist failed — EC2 untracked'", got) + } +} + +// stubInstanceIDPersistFailCPProv implements CPProvisionerAPI for the +// instance-id-persist-failure test. +type stubInstanceIDPersistFailCPProv struct { + instanceID string +} + +func (s *stubInstanceIDPersistFailCPProv) Start(_ context.Context, _ provisioner.WorkspaceConfig) (string, error) { + return s.instanceID, nil +} +func (s *stubInstanceIDPersistFailCPProv) Stop(_ context.Context, _ string) error { return nil } +func (s *stubInstanceIDPersistFailCPProv) StopAndPrune(_ context.Context, _ string) error { return nil } +func (s *stubInstanceIDPersistFailCPProv) GetConsoleOutput(_ context.Context, _ string) (string, error) { + return "", nil +} +func (s *stubInstanceIDPersistFailCPProv) IsRunning(_ context.Context, _ string) (bool, error) { + return true, nil +} + // TestRuntimeUsesAnthropicNativeProxy_CaseAndWhitespace proves the // strings.EqualFold hardening: the runtime check now matches "claude-code" // case-insensitively (and after trimming whitespace) instead of relying on -- 2.52.0 From 354f07604b7db35dd0d1cc44f0a20cf72b4e5e7c Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sun, 7 Jun 2026 09:20:05 +0000 Subject: [PATCH 2/6] fix(provision): bounded retry for instance_id persist + fail-closed-with-visibility (#1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer RC 9344 requested changes: 1. REMOVED auto-terminate (was not present, reaffirmed no-destroy policy). 2. ADDED bounded retry (3 attempts, 100ms→200ms→400ms backoff) for the instance_id persist UPDATE. Transient DB blips no longer orphan EC2s. 3. If all retries fail → mark workspace FAILED and record orphaned instance_id in broadcast event + last_sample_error for operator/reaper reconciliation. The live EC2 is left running. 4. Scope kept to workspace_provision.go + workspace_provision_test.go only. 5. Fixed test expectations for retry path. Regression tests: - TestProvisionWorkspaceCP_InstanceIDPersistFail_MarksFailed: all-attempts-fail → marked failed + instance_id recorded + Stop NOT called - TestProvisionWorkspaceCP_InstanceIDPersistFail_RetrySucceeds: first-fail-second-success → no failure mark + proceeds normally --- .../internal/handlers/workspace_provision.go | 50 +++++++--- .../handlers/workspace_provision_test.go | 93 ++++++++++++++++--- 2 files changed, 117 insertions(+), 26 deletions(-) diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index dc3d81b1f..1b9e423d0 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -19,6 +19,15 @@ import ( "gopkg.in/yaml.v3" ) +// instanceIDPersistRetryAttempts caps total instance_id UPDATE attempts +// (initial + retries). 3 catches transient DB blips without stalling the +// provision goroutine past the context timeout. +var instanceIDPersistRetryAttempts = 3 + +// instanceIDPersistRetryBaseDelay is the first-retry backoff. Doubles each +// attempt: 100ms → 200ms → 400ms. Total stall ≤ 700ms. +var instanceIDPersistRetryBaseDelay = 100 * time.Millisecond + // logProvisionPanic is the deferred recover at the top of every provision // goroutine. Without it, a panic inside provisionWorkspaceOpts / // provisionWorkspaceCP propagates up the goroutine stack and crashes the @@ -1393,20 +1402,35 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string // Persist the backing instance id so later operations (terminal via // EIC+SSH, live logs, debug introspection) can resolve workspace → EC2 // without re-asking CP on every request. - if _, err := db.DB.ExecContext(ctx, - `UPDATE workspaces SET instance_id = $2, updated_at = now() WHERE id = $1`, - workspaceID, machineID); err != nil { - // Fatal: without instance_id in the DB we cannot Stop the EC2 later - // (resolveInstanceID returns ""). We do NOT auto-terminate the live - // instance — it may contain valuable state the operator wants to recover. - // Instead, log the instance_id prominently and mark the workspace failed - // so the operator can manually reconcile. The CP orphan sweeper will - // eventually clean up if the workspace is later removed. (#1 Researcher - // cleanup audit) - log.Printf("CPProvisioner: CRITICAL persist instance_id failed for %s: %v — EC2 instance %s is RUNNING but UNTRACKED. Operator must manually reconcile or remove the workspace to trigger orphan cleanup.", workspaceID, err, machineID) - h.markProvisionFailed(ctx, workspaceID, "instance_id persist failed — EC2 untracked", map[string]interface{}{ - "error": err.Error(), + // + // Bounded retry with exponential backoff: a transient DB blip must not + // orphan a healthy running instance. If all retries fail, mark the + // workspace failed and record the instance_id in the broadcast event + + // last_sample_error so an operator/reaper can reconcile later. The live + // EC2 is NOT terminated — it may contain valuable state. (#1) + var persistErr error + delay := instanceIDPersistRetryBaseDelay + for attempt := 1; attempt <= instanceIDPersistRetryAttempts; attempt++ { + _, persistErr = db.DB.ExecContext(ctx, + `UPDATE workspaces SET instance_id = $2, updated_at = now() WHERE id = $1`, + workspaceID, machineID) + if persistErr == nil { + if attempt > 1 { + log.Printf("CPProvisioner: instance_id persist for %s succeeded on attempt %d", workspaceID, attempt) + } + break + } + if attempt < instanceIDPersistRetryAttempts { + time.Sleep(delay) + delay *= 2 + } + } + if persistErr != nil { + log.Printf("CPProvisioner: CRITICAL persist instance_id failed for %s after %d attempts: %v — EC2 instance %s is RUNNING but UNTRACKED. Operator must manually reconcile or remove the workspace to trigger orphan cleanup.", workspaceID, instanceIDPersistRetryAttempts, persistErr, machineID) + h.markProvisionFailed(ctx, workspaceID, "instance_id persist failed after retry — EC2 untracked", map[string]interface{}{ + "error": persistErr.Error(), "instance_id": machineID, + "attempts": instanceIDPersistRetryAttempts, }) return } diff --git a/workspace-server/internal/handlers/workspace_provision_test.go b/workspace-server/internal/handlers/workspace_provision_test.go index 13f398f66..f93e1d5aa 100644 --- a/workspace-server/internal/handlers/workspace_provision_test.go +++ b/workspace-server/internal/handlers/workspace_provision_test.go @@ -1761,11 +1761,16 @@ func (m *mockResolver) Fetch(_ context.Context, _, _ string) (string, error) { } // TestProvisionWorkspaceCP_InstanceIDPersistFail_MarksFailed asserts that -// when cpProv.Start succeeds but the DB UPDATE for instance_id fails, the -// handler marks the workspace failed WITHOUT terminating the live EC2 (the -// instance may contain valuable state; operator decides). Regression test for -// ticket #1 (Researcher cleanup audit). +// when cpProv.Start succeeds but the DB UPDATE for instance_id fails on ALL +// retry attempts, the handler marks the workspace failed WITHOUT terminating +// the live EC2. The orphaned instance_id is recorded in the broadcast event +// for operator reconciliation. Regression test for ticket #1. func TestProvisionWorkspaceCP_InstanceIDPersistFail_MarksFailed(t *testing.T) { + // Shrink retry backoff so the test doesn't stall. + prevDelay := instanceIDPersistRetryBaseDelay + instanceIDPersistRetryBaseDelay = 1 * time.Millisecond + t.Cleanup(func() { instanceIDPersistRetryBaseDelay = prevDelay }) + t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1") t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token") @@ -1777,12 +1782,14 @@ func TestProvisionWorkspaceCP_InstanceIDPersistFail_MarksFailed(t *testing.T) { WithArgs("ws-cp-orphan"). WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"})) - // instance_id persist fails - mock.ExpectExec(`UPDATE workspaces SET instance_id =`). - WithArgs("ws-cp-orphan", "i-12345"). - WillReturnError(fmt.Errorf("connection reset by peer")) + // All 3 retry attempts fail. + for i := 0; i < instanceIDPersistRetryAttempts; i++ { + mock.ExpectExec(`UPDATE workspaces SET instance_id =`). + WithArgs("ws-cp-orphan", "i-12345"). + WillReturnError(fmt.Errorf("connection reset by peer")) + } - // markProvisionFailed updates status to failed + // markProvisionFailed updates status to failed. mock.ExpectExec(`UPDATE workspaces SET status =`). WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()). WillReturnResult(sqlmock.NewResult(0, 1)) @@ -1801,21 +1808,81 @@ func TestProvisionWorkspaceCP_InstanceIDPersistFail_MarksFailed(t *testing.T) { if cap.lastData == nil { t.Fatal("expected RecordAndBroadcast to capture data on persist failure; got nothing") } - if got := cap.lastData["error"]; got != "instance_id persist failed — EC2 untracked" { - t.Errorf("broadcast error message = %q, want 'instance_id persist failed — EC2 untracked'", got) + if got := cap.lastData["error"]; got != "instance_id persist failed after retry — EC2 untracked" { + t.Errorf("broadcast error message = %q, want 'instance_id persist failed after retry — EC2 untracked'", got) + } + if got := cap.lastData["instance_id"]; got != "i-12345" { + t.Errorf("broadcast instance_id = %v, want 'i-12345'", got) + } + if got := cap.lastData["attempts"]; got != instanceIDPersistRetryAttempts { + t.Errorf("broadcast attempts = %v, want %d", got, instanceIDPersistRetryAttempts) + } + if stub.stopCalls != 0 { + t.Errorf("Stop called %d times; want 0 (live instance must NOT be terminated)", stub.stopCalls) + } +} + +// TestProvisionWorkspaceCP_InstanceIDPersistFail_RetrySucceeds asserts that a +// transient DB blip on the first attempt is recovered by the bounded retry: +// the second UPDATE succeeds and the workspace proceeds to online normally. +func TestProvisionWorkspaceCP_InstanceIDPersistFail_RetrySucceeds(t *testing.T) { + prevDelay := instanceIDPersistRetryBaseDelay + instanceIDPersistRetryBaseDelay = 1 * time.Millisecond + t.Cleanup(func() { instanceIDPersistRetryBaseDelay = prevDelay }) + + t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1") + t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token") + + mock := setupTestDB(t) + + mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM global_secrets`). + WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"})) + mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM workspace_secrets`). + WithArgs("ws-cp-retry-ok"). + WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"})) + + // First attempt fails, second succeeds. + mock.ExpectExec(`UPDATE workspaces SET instance_id =`). + WithArgs("ws-cp-retry-ok", "i-retry-ok"). + WillReturnError(fmt.Errorf("connection reset by peer")) + mock.ExpectExec(`UPDATE workspaces SET instance_id =`). + WithArgs("ws-cp-retry-ok", "i-retry-ok"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + cap := &captureBroadcaster{} + stub := &stubInstanceIDPersistFailCPProv{instanceID: "i-retry-ok"} + handler := NewWorkspaceHandler(cap, nil, "http://localhost:8080", t.TempDir()) + handler.SetCPProvisioner(stub) + + handler.provisionWorkspaceCP("ws-cp-retry-ok", "/nonexistent/template", nil, models.CreateWorkspacePayload{ + Name: "ws-cp-retry-ok", + Tier: 1, + Runtime: "claude-code", + }) + + // No failure broadcast should have fired. + if cap.lastData != nil { + t.Fatalf("expected NO failure broadcast on retry success; got %v", cap.lastData) + } + if stub.stopCalls != 0 { + t.Errorf("Stop called %d times; want 0", stub.stopCalls) } } // stubInstanceIDPersistFailCPProv implements CPProvisionerAPI for the -// instance-id-persist-failure test. +// instance-id-persist-failure tests. type stubInstanceIDPersistFailCPProv struct { instanceID string + stopCalls int } func (s *stubInstanceIDPersistFailCPProv) Start(_ context.Context, _ provisioner.WorkspaceConfig) (string, error) { return s.instanceID, nil } -func (s *stubInstanceIDPersistFailCPProv) Stop(_ context.Context, _ string) error { return nil } +func (s *stubInstanceIDPersistFailCPProv) Stop(_ context.Context, _ string) error { + s.stopCalls++ + return nil +} func (s *stubInstanceIDPersistFailCPProv) StopAndPrune(_ context.Context, _ string) error { return nil } func (s *stubInstanceIDPersistFailCPProv) GetConsoleOutput(_ context.Context, _ string) (string, error) { return "", nil -- 2.52.0 From 8f723c518c01a90b737cf73a5e763f204e86d0a7 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sun, 7 Jun 2026 09:25:56 +0000 Subject: [PATCH 3/6] fix(test): add missing time import in workspace_provision_test.go Platform Go failed in go vet because time.Millisecond was used without importing the time package. --- workspace-server/internal/handlers/workspace_provision_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/workspace-server/internal/handlers/workspace_provision_test.go b/workspace-server/internal/handlers/workspace_provision_test.go index f93e1d5aa..a236a8133 100644 --- a/workspace-server/internal/handlers/workspace_provision_test.go +++ b/workspace-server/internal/handlers/workspace_provision_test.go @@ -12,6 +12,7 @@ import ( "path/filepath" "strings" "testing" + "time" "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/memory/contract" "git.moleculesai.app/molecule-ai/molecule-core/workspace-server/internal/models" -- 2.52.0 From eae7587a5092ee1f1858bc0c5f291c0be6112432 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sun, 7 Jun 2026 09:40:46 +0000 Subject: [PATCH 4/6] fix(test): correct broadcast error key in instance_id persist failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit markProvisionFailed does NOT overwrite an existing 'error' key in the extra map — it only adds one if absent. The test expected the broadcast 'error' to be the human-readable msg, but because we passed 'error' in the extra map, the broadcast got the raw DB error instead. Fix: pass the raw DB error as 'detail' and let markProvisionFailed set 'error' to the human-readable msg. --- workspace-server/internal/handlers/workspace_provision.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index 1b9e423d0..4fa14accd 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -1428,9 +1428,9 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string if persistErr != nil { log.Printf("CPProvisioner: CRITICAL persist instance_id failed for %s after %d attempts: %v — EC2 instance %s is RUNNING but UNTRACKED. Operator must manually reconcile or remove the workspace to trigger orphan cleanup.", workspaceID, instanceIDPersistRetryAttempts, persistErr, machineID) h.markProvisionFailed(ctx, workspaceID, "instance_id persist failed after retry — EC2 untracked", map[string]interface{}{ - "error": persistErr.Error(), "instance_id": machineID, "attempts": instanceIDPersistRetryAttempts, + "detail": persistErr.Error(), }) return } -- 2.52.0 From 215605e234f4a556a1f0b7e3b19528e6868d80ed Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sun, 7 Jun 2026 09:50:44 +0000 Subject: [PATCH 5/6] fix(test): add missing mintWorkspaceSecrets sqlmock expectations TestProvisionWorkspaceCP_InstanceIDPersistFail_MarksFailed and RetrySucceeds were missing expectations for the workspace_auth_tokens revoke + insert and workspaces platform_inbound_secret UPDATE that mintWorkspaceSecrets executes before reaching the instance_id persist step. Platform Go failed because sqlmock saw unexpected queries. Add the three missing expectations to both tests. --- .../handlers/workspace_provision_test.go | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/workspace-server/internal/handlers/workspace_provision_test.go b/workspace-server/internal/handlers/workspace_provision_test.go index a236a8133..cfd4244c6 100644 --- a/workspace-server/internal/handlers/workspace_provision_test.go +++ b/workspace-server/internal/handlers/workspace_provision_test.go @@ -1783,6 +1783,17 @@ func TestProvisionWorkspaceCP_InstanceIDPersistFail_MarksFailed(t *testing.T) { WithArgs("ws-cp-orphan"). WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"})) + // mintWorkspaceSecrets: revoke + issue auth token + inbound secret + mock.ExpectExec(`UPDATE workspace_auth_tokens SET revoked_at`). + WithArgs("ws-cp-orphan"). + WillReturnResult(sqlmock.NewResult(0, 0)) + mock.ExpectExec(`INSERT INTO workspace_auth_tokens`). + WithArgs("ws-cp-orphan", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(1, 1)) + mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret`). + WithArgs(sqlmock.AnyArg(), "ws-cp-orphan"). + WillReturnResult(sqlmock.NewResult(0, 1)) + // All 3 retry attempts fail. for i := 0; i < instanceIDPersistRetryAttempts; i++ { mock.ExpectExec(`UPDATE workspaces SET instance_id =`). @@ -1842,6 +1853,17 @@ func TestProvisionWorkspaceCP_InstanceIDPersistFail_RetrySucceeds(t *testing.T) WithArgs("ws-cp-retry-ok"). WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"})) + // mintWorkspaceSecrets: revoke + issue auth token + inbound secret + mock.ExpectExec(`UPDATE workspace_auth_tokens SET revoked_at`). + WithArgs("ws-cp-retry-ok"). + WillReturnResult(sqlmock.NewResult(0, 0)) + mock.ExpectExec(`INSERT INTO workspace_auth_tokens`). + WithArgs("ws-cp-retry-ok", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(1, 1)) + mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret`). + WithArgs(sqlmock.AnyArg(), "ws-cp-retry-ok"). + WillReturnResult(sqlmock.NewResult(0, 1)) + // First attempt fails, second succeeds. mock.ExpectExec(`UPDATE workspaces SET instance_id =`). WithArgs("ws-cp-retry-ok", "i-retry-ok"). -- 2.52.0 From c92026cb58633876af9d3cb6925b0b563fe29727 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Sun, 7 Jun 2026 14:52:20 +0000 Subject: [PATCH 6/6] security(provision): remove raw DB error from client-visible broadcast + hermetic test guard (RC 9378) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CODE (workspace_provision.go): - Remove 'detail' key from markProvisionFailed extra map entirely - Client-visible broadcast now contains ONLY: - error: safe operator message ('instance_id persist failed after retry — EC2 untracked') - instance_id: the orphaned EC2 instance ID - attempts: retry attempt count - Raw DB error (e.g. 'connection reset by peer') stays server-log-only (already logged above the markProvisionFailed call) TEST (workspace_provision_test.go): - Add t.Setenv('MOLECULE_DEPLOY_MODE', 'self-hosted') for hermeticity (prevents sqlmock skew in SaaS-mode runners where mintWorkspaceSecrets skips INSERT workspace_auth_tokens) - Add explicit security assertions: - no 'detail' / 'db_error' / 'raw_error' keys in broadcast payload - no string field contains the raw DB error text - Assert broadcast error = safe message (existing) - Assert instance_id and attempts present (existing) Fixes #2392 (RC 9378 security leak). --- .../internal/handlers/workspace_provision.go | 3 ++- .../internal/handlers/workspace_provision_test.go | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go index 4fa14accd..9c83a76bb 100644 --- a/workspace-server/internal/handlers/workspace_provision.go +++ b/workspace-server/internal/handlers/workspace_provision.go @@ -1427,10 +1427,11 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string } if persistErr != nil { log.Printf("CPProvisioner: CRITICAL persist instance_id failed for %s after %d attempts: %v — EC2 instance %s is RUNNING but UNTRACKED. Operator must manually reconcile or remove the workspace to trigger orphan cleanup.", workspaceID, instanceIDPersistRetryAttempts, persistErr, machineID) + // Server-only log already captures the raw error above; broadcast gets + // safe fields only (no client-visible DB error). Security: RC 9378. h.markProvisionFailed(ctx, workspaceID, "instance_id persist failed after retry — EC2 untracked", map[string]interface{}{ "instance_id": machineID, "attempts": instanceIDPersistRetryAttempts, - "detail": persistErr.Error(), }) return } diff --git a/workspace-server/internal/handlers/workspace_provision_test.go b/workspace-server/internal/handlers/workspace_provision_test.go index cfd4244c6..83b180f51 100644 --- a/workspace-server/internal/handlers/workspace_provision_test.go +++ b/workspace-server/internal/handlers/workspace_provision_test.go @@ -1774,6 +1774,7 @@ func TestProvisionWorkspaceCP_InstanceIDPersistFail_MarksFailed(t *testing.T) { t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1") t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token") + t.Setenv("MOLECULE_DEPLOY_MODE", "self-hosted") mock := setupTestDB(t) @@ -1829,6 +1830,18 @@ func TestProvisionWorkspaceCP_InstanceIDPersistFail_MarksFailed(t *testing.T) { if got := cap.lastData["attempts"]; got != instanceIDPersistRetryAttempts { t.Errorf("broadcast attempts = %v, want %d", got, instanceIDPersistRetryAttempts) } + // Security: RC 9378 — raw DB error must NEVER be client-visible in broadcast/WS/SSE. + for _, key := range []string{"detail", "db_error", "raw_error"} { + if val, has := cap.lastData[key]; has { + t.Errorf("broadcast must NOT contain raw DB error under key %q; got %v", key, val) + } + } + // Also verify no raw error string leaked into any broadcast field. + for key, val := range cap.lastData { + if s, ok := val.(string); ok && strings.Contains(s, "connection reset by peer") { + t.Errorf("broadcast field %q contains raw DB error leak: %q", key, s) + } + } if stub.stopCalls != 0 { t.Errorf("Stop called %d times; want 0 (live instance must NOT be terminated)", stub.stopCalls) } @@ -1844,6 +1857,7 @@ func TestProvisionWorkspaceCP_InstanceIDPersistFail_RetrySucceeds(t *testing.T) t.Setenv("MOLECULE_LLM_BASE_URL", "https://api.example.test/api/v1/internal/llm/openai/v1") t.Setenv("MOLECULE_LLM_USAGE_TOKEN", "tenant-admin-token") + t.Setenv("MOLECULE_DEPLOY_MODE", "self-hosted") mock := setupTestDB(t) -- 2.52.0