From 977502befdb2716c7fd4b5fd4c3b9c961f831e9d Mon Sep 17 00:00:00 2001 From: core-devops Date: Thu, 11 Jun 2026 12:43:53 -0700 Subject: [PATCH] fix(secrets): never auto-restart the org's platform root on secret changes (core#2573) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The #2573 self-write skip only covers callers presenting a workspace token / X-Workspace-ID. The org concierge's management MCP authenticates with the tenant ADMIN token, so callerID resolves to "" and the skip never fired: a secret write/delete targeting the concierge terminated the org root's box mid-turn — twice on 2026-06-11, once costing a 14h outage (silent provision failure, cp#691). - autoRestartAllowed(): self-write skip + NEW kind='platform' skip; the kind lookup FAILS CLOSED (unknown kind -> no restart) since a wrong restart on the org root is the exact outage this prevents, while a skipped restart only delays env propagation until the next explicit restart. - restartAllAffectedByGlobalKey(): same exclusion in the global-secret fan-out query (COALESCE(kind,'workspace') <> 'platform') — a global rotation must not tear down the org root either. - Tests: platform-root skip for Set/Delete, fan-out SQL exclusion pinned by regex, kind-lookup expectations added to the existing restart-fire tests (#2584 spoof tests included). Co-Authored-By: Claude Fable 5 --- .../handlers/handlers_additional_test.go | 8 + workspace-server/internal/handlers/secrets.go | 56 ++++++- .../internal/handlers/secrets_test.go | 151 ++++++++++++++++++ 3 files changed, 210 insertions(+), 5 deletions(-) diff --git a/workspace-server/internal/handlers/handlers_additional_test.go b/workspace-server/internal/handlers/handlers_additional_test.go index a47540a5..f06bbc15 100644 --- a/workspace-server/internal/handlers/handlers_additional_test.go +++ b/workspace-server/internal/handlers/handlers_additional_test.go @@ -843,6 +843,10 @@ func TestSecretsSet_TriggersAutoRestart(t *testing.T) { mock.ExpectExec("INSERT INTO workspace_secrets"). WithArgs("77777777-7777-7777-7777-777777777777", "NEW_KEY", sqlmock.AnyArg(), sqlmock.AnyArg()). WillReturnResult(sqlmock.NewResult(0, 1)) + // autoRestartAllowed (core#2573) checks the target's kind before firing. + mock.ExpectQuery(`SELECT COALESCE\(kind`). + WithArgs("77777777-7777-7777-7777-777777777777"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace")) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -910,6 +914,10 @@ func TestSecretsDelete_TriggersAutoRestart(t *testing.T) { mock.ExpectExec("DELETE FROM workspace_secrets WHERE workspace_id"). WithArgs("cccccccc-cccc-cccc-cccc-cccccccccccc", "REMOVE_KEY"). WillReturnResult(sqlmock.NewResult(0, 1)) + // autoRestartAllowed (core#2573) checks the target's kind before firing. + mock.ExpectQuery(`SELECT COALESCE\(kind`). + WithArgs("cccccccc-cccc-cccc-cccc-cccccccccccc"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace")) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) diff --git a/workspace-server/internal/handlers/secrets.go b/workspace-server/internal/handlers/secrets.go index c136d62c..21c108c8 100644 --- a/workspace-server/internal/handlers/secrets.go +++ b/workspace-server/internal/handlers/secrets.go @@ -118,6 +118,46 @@ func NewSecretsHandler(restartFunc func(string)) *SecretsHandler { return &SecretsHandler{restartFunc: restartFunc} } +// autoRestartAllowed (core#2573) decides whether a secret change on +// workspaceID may fire the auto-restart. Two skips: +// +// 1. Self-write: the caller IS the target workspace -- restarting would +// kill the writing agent mid-turn (original #2573 fix). Only covers +// callers that present a workspace token / X-Workspace-ID, which the +// concierge's management MCP does NOT (it authenticates with the +// tenant ADMIN token, so callerID is "" and skip 1 never fired -- +// that gap terminated the org root's box twice on 2026-06-11, once +// costing a 14h outage). +// 2. Platform root: the target is the org's kind='platform' concierge. +// An auto-restart here tears down the ORG ROOT's EC2 (terminate + +// re-provision, minutes of downtime, and the provision leg has +// failed silently before -- cp#691). The concierge picks changes up +// on its next explicit restart; the canvas Restart button covers +// operators who need it now. +func autoRestartAllowed(ctx context.Context, callerID, workspaceID string) bool { + if callerID == workspaceID { + log.Printf("secrets: skipping auto-restart of %s (self-write, core#2573)", workspaceID) + return false + } + var kind string + if err := db.DB.QueryRowContext(ctx, + `SELECT COALESCE(kind, 'workspace') FROM workspaces WHERE id = $1`, workspaceID). + Scan(&kind); err != nil { + // Fail closed: if we cannot prove the target is a regular + // workspace, do NOT restart it -- a wrongly-fired restart on the + // org root is exactly the outage this guard exists to prevent, + // while a skipped restart just delays env propagation until the + // next explicit restart. + log.Printf("secrets: skipping auto-restart of %s (kind lookup failed, fail-closed: %v)", workspaceID, err) + return false + } + if kind == "platform" { + log.Printf("secrets: skipping auto-restart of %s (platform root, core#2573 -- restart explicitly to apply)", workspaceID) + return false + } + return true +} + // List handles GET /workspaces/:id/secrets // Returns a merged view: workspace-level overrides + inherited global secrets. // Each entry includes a "scope" field ("workspace" or "global") so the frontend @@ -398,10 +438,10 @@ func (h *SecretsHandler) Set(c *gin.Context) { // drain the detached restart goroutine before db.DB is swapped — see // drainTestAsync in handlers_test.go and the canonical 69d9b4e3 fix. // - // #2573: skip self-restart when the secret-write caller is the target - // workspace, to avoid killing the writing agent mid-turn. + // #2573: skip the auto-restart for self-writes AND for the platform + // root (see autoRestartAllowed for the full rationale). callerID := callerWorkspaceID(c) - if h.restartFunc != nil && callerID != workspaceID { + if h.restartFunc != nil && autoRestartAllowed(c.Request.Context(), callerID, workspaceID) { wsID := workspaceID globalGoAsync(func() { h.restartFunc(wsID) }) } @@ -449,9 +489,10 @@ func (h *SecretsHandler) Delete(c *gin.Context) { // Auto-restart workspace to pick up removed secret. // RFC internal#524 Layer 1: see Set() above for the drain rationale. - // #2573: skip self-restart when the secret-write caller is the target. + // #2573: skip the auto-restart for self-writes AND for the platform + // root (see autoRestartAllowed). callerID := callerWorkspaceID(c) - if h.restartFunc != nil && callerID != workspaceID { + if h.restartFunc != nil && autoRestartAllowed(c.Request.Context(), callerID, workspaceID) { wsID := workspaceID globalGoAsync(func() { h.restartFunc(wsID) }) } @@ -579,10 +620,15 @@ func (h *SecretsHandler) restartAllAffectedByGlobalKey(key, excludeWorkspaceID s return } ctx := context.Background() + // core#2573: the org's kind='platform' root is excluded for the same + // reason autoRestartAllowed skips it -- an auto-restart terminates the + // org root's box. It picks up rotated globals on its next explicit + // restart. rows, err := db.DB.QueryContext(ctx, ` SELECT id FROM workspaces WHERE status NOT IN ('removed', 'paused') AND COALESCE(runtime, '') <> 'external' + AND COALESCE(kind, 'workspace') <> 'platform' AND id NOT IN ( SELECT workspace_id FROM workspace_secrets WHERE key = $1 ) diff --git a/workspace-server/internal/handlers/secrets_test.go b/workspace-server/internal/handlers/secrets_test.go index 3c6c4819..cb2a873a 100644 --- a/workspace-server/internal/handlers/secrets_test.go +++ b/workspace-server/internal/handlers/secrets_test.go @@ -257,6 +257,10 @@ func TestSecretsSet_AutoRestart(t *testing.T) { mock.ExpectExec("INSERT INTO workspace_secrets"). WithArgs("550e8400-e29b-41d4-a716-446655440000", "DB_PASS", sqlmock.AnyArg(), sqlmock.AnyArg()). WillReturnResult(sqlmock.NewResult(0, 1)) + // autoRestartAllowed (core#2573) checks the target's kind before firing. + mock.ExpectQuery(`SELECT COALESCE\(kind`). + WithArgs("550e8400-e29b-41d4-a716-446655440000"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace")) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -444,6 +448,10 @@ func TestSecretsDelete_AutoRestart(t *testing.T) { mock.ExpectExec("DELETE FROM workspace_secrets WHERE workspace_id"). WithArgs("550e8400-e29b-41d4-a716-446655440000", "OLD_KEY"). WillReturnResult(sqlmock.NewResult(0, 1)) + // autoRestartAllowed (core#2573) checks the target's kind before firing. + mock.ExpectQuery(`SELECT COALESCE\(kind`). + WithArgs("550e8400-e29b-41d4-a716-446655440000"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace")) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -1330,6 +1338,95 @@ func TestSecretsDelete_SkipSelfRestart_WhenCallerIsTarget(t *testing.T) { } } +// TestSecretsSet_SkipAutoRestart_PlatformRoot (#2573): a secret write +// targeting the org's kind='platform' concierge must NOT auto-restart it, +// regardless of who the caller is. The management MCP authenticates with the +// tenant ADMIN token (callerID == ""), so the self-write skip never fires for +// the concierge — this kind check is what actually protects the org root. +func TestSecretsSet_SkipAutoRestart_PlatformRoot(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + restarted := make(chan string, 1) + restartFunc := func(wsID string) { restarted <- wsID } + handler := NewSecretsHandler(restartFunc) + + mock.ExpectExec("INSERT INTO workspace_secrets"). + WithArgs("550e8400-e29b-41d4-a716-446655440000", "TEST_SECRET", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectQuery(`SELECT COALESCE\(kind`). + WithArgs("550e8400-e29b-41d4-a716-446655440000"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "550e8400-e29b-41d4-a716-446655440000"}} + body := `{"key":"TEST_SECRET","value":"v"}` + c.Request = httptest.NewRequest("POST", "/workspaces/550e8400-e29b-41d4-a716-446655440000/secrets", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + // No workspace bearer / X-Workspace-ID: admin-token caller, callerID == "". + + handler.Set(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + + select { + case <-restarted: + t.Fatal("restart must NOT fire for a kind='platform' target") + case <-time.After(200 * time.Millisecond): + // Expected — no restart. + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestSecretsDelete_SkipAutoRestart_PlatformRoot (#2573): symmetric skip for +// the DELETE path — cleaning up secrets on the concierge must not kill it. +func TestSecretsDelete_SkipAutoRestart_PlatformRoot(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + restarted := make(chan string, 1) + restartFunc := func(wsID string) { restarted <- wsID } + handler := NewSecretsHandler(restartFunc) + + mock.ExpectExec("DELETE FROM workspace_secrets WHERE workspace_id"). + WithArgs("550e8400-e29b-41d4-a716-446655440000", "TEST_SECRET"). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectQuery(`SELECT COALESCE\(kind`). + WithArgs("550e8400-e29b-41d4-a716-446655440000"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform")) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{ + {Key: "id", Value: "550e8400-e29b-41d4-a716-446655440000"}, + {Key: "key", Value: "TEST_SECRET"}, + } + c.Request = httptest.NewRequest("DELETE", "/workspaces/550e8400-e29b-41d4-a716-446655440000/secrets/TEST_SECRET", nil) + + handler.Delete(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + + select { + case <-restarted: + t.Fatal("restart must NOT fire for a kind='platform' target") + case <-time.After(200 * time.Millisecond): + // Expected — no restart. + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + // TestSetGlobal_SkipSelfRestart_WhenCallerIsAffected (#2573): when the org // platform agent (caller) sets a global secret, its own workspace must be // excluded from the fan-out restart so it isn't killed mid-turn. @@ -1390,6 +1487,52 @@ func TestSetGlobal_SkipSelfRestart_WhenCallerIsAffected(t *testing.T) { } } +// TestSetGlobal_FanOutQuery_ExcludesPlatformRoot (#2573): the fan-out query +// itself must exclude kind='platform' workspaces — the org root must never be +// auto-restarted by a global-secret rotation. The regex pins the SQL filter; +// rows returned already reflect it, so the assertion lives in the query match. +func TestSetGlobal_FanOutQuery_ExcludesPlatformRoot(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + restarted := make(chan string, 2) + restartFunc := func(wsID string) { restarted <- wsID } + handler := NewSecretsHandler(restartFunc) + + mock.ExpectExec("INSERT INTO global_secrets"). + WithArgs("ROTATED_KEY", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + + mock.ExpectQuery(`(?s)SELECT id FROM workspaces.*COALESCE\(kind, 'workspace'\) <> 'platform'`). + WithArgs("ROTATED_KEY"). + WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-a")) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + body := `{"key":"ROTATED_KEY","value":"v2"}` + c.Request = httptest.NewRequest("POST", "/admin/secrets", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.SetGlobal(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + + select { + case id := <-restarted: + if id != "ws-a" { + t.Errorf("expected ws-a restarted, got %q", id) + } + case <-time.After(2 * time.Second): + t.Fatal("restart not fired for the affected regular workspace") + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + // TestSecretsSet_SpoofedHeader_DoesNotSuppressRestart (core#2584 regression): // a request presenting a valid workspace bearer token MUST have its identity // derived from the token, NOT from a spoofed X-Workspace-ID header. If the @@ -1413,6 +1556,10 @@ func TestSecretsSet_SpoofedHeader_DoesNotSuppressRestart(t *testing.T) { // Token lookup: the bearer resolves to ws-caller, NOT the target. mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`). WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-caller")) + // autoRestartAllowed (core#2573) checks the target's kind before firing. + mock.ExpectQuery(`SELECT COALESCE\(kind`). + WithArgs("550e8400-e29b-41d4-a716-446655440000"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace")) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) @@ -1463,6 +1610,10 @@ func TestSecretsDelete_SpoofedHeader_DoesNotSuppressRestart(t *testing.T) { // Token lookup: the bearer resolves to ws-caller, NOT the target. mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`). WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-caller")) + // autoRestartAllowed (core#2573) checks the target's kind before firing. + mock.ExpectQuery(`SELECT COALESCE\(kind`). + WithArgs("550e8400-e29b-41d4-a716-446655440000"). + WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace")) w := httptest.NewRecorder() c, _ := gin.CreateTestContext(w) -- 2.52.0