fix(secrets): never auto-restart the org's platform root on secret changes (core#2573) #2603

Merged
devops-engineer merged 1 commits from fix/2573-no-autorestart-platform-root into main 2026-06-11 20:01:24 +00:00
3 changed files with 210 additions and 5 deletions
@@ -843,6 +843,10 @@ func TestSecretsSet_TriggersAutoRestart(t *testing.T) {
mock.ExpectExec("INSERT INTO workspace_secrets").
WithArgs("77777777-7777-7777-7777-777777777777", "NEW_KEY", sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
// autoRestartAllowed (core#2573) checks the target's kind before firing.
mock.ExpectQuery(`SELECT COALESCE\(kind`).
WithArgs("77777777-7777-7777-7777-777777777777").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -910,6 +914,10 @@ func TestSecretsDelete_TriggersAutoRestart(t *testing.T) {
mock.ExpectExec("DELETE FROM workspace_secrets WHERE workspace_id").
WithArgs("cccccccc-cccc-cccc-cccc-cccccccccccc", "REMOVE_KEY").
WillReturnResult(sqlmock.NewResult(0, 1))
// autoRestartAllowed (core#2573) checks the target's kind before firing.
mock.ExpectQuery(`SELECT COALESCE\(kind`).
WithArgs("cccccccc-cccc-cccc-cccc-cccccccccccc").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
+51 -5
View File
@@ -118,6 +118,46 @@ func NewSecretsHandler(restartFunc func(string)) *SecretsHandler {
return &SecretsHandler{restartFunc: restartFunc}
}
// autoRestartAllowed (core#2573) decides whether a secret change on
// workspaceID may fire the auto-restart. Two skips:
//
// 1. Self-write: the caller IS the target workspace -- restarting would
// kill the writing agent mid-turn (original #2573 fix). Only covers
// callers that present a workspace token / X-Workspace-ID, which the
// concierge's management MCP does NOT (it authenticates with the
// tenant ADMIN token, so callerID is "" and skip 1 never fired --
// that gap terminated the org root's box twice on 2026-06-11, once
// costing a 14h outage).
// 2. Platform root: the target is the org's kind='platform' concierge.
// An auto-restart here tears down the ORG ROOT's EC2 (terminate +
// re-provision, minutes of downtime, and the provision leg has
// failed silently before -- cp#691). The concierge picks changes up
// on its next explicit restart; the canvas Restart button covers
// operators who need it now.
func autoRestartAllowed(ctx context.Context, callerID, workspaceID string) bool {
if callerID == workspaceID {
log.Printf("secrets: skipping auto-restart of %s (self-write, core#2573)", workspaceID)
return false
}
var kind string
if err := db.DB.QueryRowContext(ctx,
`SELECT COALESCE(kind, 'workspace') FROM workspaces WHERE id = $1`, workspaceID).
Scan(&kind); err != nil {
// Fail closed: if we cannot prove the target is a regular
// workspace, do NOT restart it -- a wrongly-fired restart on the
// org root is exactly the outage this guard exists to prevent,
// while a skipped restart just delays env propagation until the
// next explicit restart.
log.Printf("secrets: skipping auto-restart of %s (kind lookup failed, fail-closed: %v)", workspaceID, err)
return false
}
if kind == "platform" {
log.Printf("secrets: skipping auto-restart of %s (platform root, core#2573 -- restart explicitly to apply)", workspaceID)
return false
}
return true
}
// List handles GET /workspaces/:id/secrets
// Returns a merged view: workspace-level overrides + inherited global secrets.
// Each entry includes a "scope" field ("workspace" or "global") so the frontend
@@ -398,10 +438,10 @@ func (h *SecretsHandler) Set(c *gin.Context) {
// drain the detached restart goroutine before db.DB is swapped — see
// drainTestAsync in handlers_test.go and the canonical 69d9b4e3 fix.
//
// #2573: skip self-restart when the secret-write caller is the target
// workspace, to avoid killing the writing agent mid-turn.
// #2573: skip the auto-restart for self-writes AND for the platform
// root (see autoRestartAllowed for the full rationale).
callerID := callerWorkspaceID(c)
if h.restartFunc != nil && callerID != workspaceID {
if h.restartFunc != nil && autoRestartAllowed(c.Request.Context(), callerID, workspaceID) {
wsID := workspaceID
globalGoAsync(func() { h.restartFunc(wsID) })
}
@@ -449,9 +489,10 @@ func (h *SecretsHandler) Delete(c *gin.Context) {
// Auto-restart workspace to pick up removed secret.
// RFC internal#524 Layer 1: see Set() above for the drain rationale.
// #2573: skip self-restart when the secret-write caller is the target.
// #2573: skip the auto-restart for self-writes AND for the platform
// root (see autoRestartAllowed).
callerID := callerWorkspaceID(c)
if h.restartFunc != nil && callerID != workspaceID {
if h.restartFunc != nil && autoRestartAllowed(c.Request.Context(), callerID, workspaceID) {
wsID := workspaceID
globalGoAsync(func() { h.restartFunc(wsID) })
}
@@ -579,10 +620,15 @@ func (h *SecretsHandler) restartAllAffectedByGlobalKey(key, excludeWorkspaceID s
return
}
ctx := context.Background()
// core#2573: the org's kind='platform' root is excluded for the same
// reason autoRestartAllowed skips it -- an auto-restart terminates the
// org root's box. It picks up rotated globals on its next explicit
// restart.
rows, err := db.DB.QueryContext(ctx, `
SELECT id FROM workspaces
WHERE status NOT IN ('removed', 'paused')
AND COALESCE(runtime, '') <> 'external'
AND COALESCE(kind, 'workspace') <> 'platform'
AND id NOT IN (
SELECT workspace_id FROM workspace_secrets WHERE key = $1
)
@@ -257,6 +257,10 @@ func TestSecretsSet_AutoRestart(t *testing.T) {
mock.ExpectExec("INSERT INTO workspace_secrets").
WithArgs("550e8400-e29b-41d4-a716-446655440000", "DB_PASS", sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
// autoRestartAllowed (core#2573) checks the target's kind before firing.
mock.ExpectQuery(`SELECT COALESCE\(kind`).
WithArgs("550e8400-e29b-41d4-a716-446655440000").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -444,6 +448,10 @@ func TestSecretsDelete_AutoRestart(t *testing.T) {
mock.ExpectExec("DELETE FROM workspace_secrets WHERE workspace_id").
WithArgs("550e8400-e29b-41d4-a716-446655440000", "OLD_KEY").
WillReturnResult(sqlmock.NewResult(0, 1))
// autoRestartAllowed (core#2573) checks the target's kind before firing.
mock.ExpectQuery(`SELECT COALESCE\(kind`).
WithArgs("550e8400-e29b-41d4-a716-446655440000").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -1330,6 +1338,95 @@ func TestSecretsDelete_SkipSelfRestart_WhenCallerIsTarget(t *testing.T) {
}
}
// TestSecretsSet_SkipAutoRestart_PlatformRoot (#2573): a secret write
// targeting the org's kind='platform' concierge must NOT auto-restart it,
// regardless of who the caller is. The management MCP authenticates with the
// tenant ADMIN token (callerID == ""), so the self-write skip never fires for
// the concierge — this kind check is what actually protects the org root.
func TestSecretsSet_SkipAutoRestart_PlatformRoot(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
restarted := make(chan string, 1)
restartFunc := func(wsID string) { restarted <- wsID }
handler := NewSecretsHandler(restartFunc)
mock.ExpectExec("INSERT INTO workspace_secrets").
WithArgs("550e8400-e29b-41d4-a716-446655440000", "TEST_SECRET", sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery(`SELECT COALESCE\(kind`).
WithArgs("550e8400-e29b-41d4-a716-446655440000").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "550e8400-e29b-41d4-a716-446655440000"}}
body := `{"key":"TEST_SECRET","value":"v"}`
c.Request = httptest.NewRequest("POST", "/workspaces/550e8400-e29b-41d4-a716-446655440000/secrets", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
// No workspace bearer / X-Workspace-ID: admin-token caller, callerID == "".
handler.Set(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
}
select {
case <-restarted:
t.Fatal("restart must NOT fire for a kind='platform' target")
case <-time.After(200 * time.Millisecond):
// Expected — no restart.
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestSecretsDelete_SkipAutoRestart_PlatformRoot (#2573): symmetric skip for
// the DELETE path — cleaning up secrets on the concierge must not kill it.
func TestSecretsDelete_SkipAutoRestart_PlatformRoot(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
restarted := make(chan string, 1)
restartFunc := func(wsID string) { restarted <- wsID }
handler := NewSecretsHandler(restartFunc)
mock.ExpectExec("DELETE FROM workspace_secrets WHERE workspace_id").
WithArgs("550e8400-e29b-41d4-a716-446655440000", "TEST_SECRET").
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery(`SELECT COALESCE\(kind`).
WithArgs("550e8400-e29b-41d4-a716-446655440000").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{
{Key: "id", Value: "550e8400-e29b-41d4-a716-446655440000"},
{Key: "key", Value: "TEST_SECRET"},
}
c.Request = httptest.NewRequest("DELETE", "/workspaces/550e8400-e29b-41d4-a716-446655440000/secrets/TEST_SECRET", nil)
handler.Delete(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
}
select {
case <-restarted:
t.Fatal("restart must NOT fire for a kind='platform' target")
case <-time.After(200 * time.Millisecond):
// Expected — no restart.
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestSetGlobal_SkipSelfRestart_WhenCallerIsAffected (#2573): when the org
// platform agent (caller) sets a global secret, its own workspace must be
// excluded from the fan-out restart so it isn't killed mid-turn.
@@ -1390,6 +1487,52 @@ func TestSetGlobal_SkipSelfRestart_WhenCallerIsAffected(t *testing.T) {
}
}
// TestSetGlobal_FanOutQuery_ExcludesPlatformRoot (#2573): the fan-out query
// itself must exclude kind='platform' workspaces — the org root must never be
// auto-restarted by a global-secret rotation. The regex pins the SQL filter;
// rows returned already reflect it, so the assertion lives in the query match.
func TestSetGlobal_FanOutQuery_ExcludesPlatformRoot(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
restarted := make(chan string, 2)
restartFunc := func(wsID string) { restarted <- wsID }
handler := NewSecretsHandler(restartFunc)
mock.ExpectExec("INSERT INTO global_secrets").
WithArgs("ROTATED_KEY", sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
mock.ExpectQuery(`(?s)SELECT id FROM workspaces.*COALESCE\(kind, 'workspace'\) <> 'platform'`).
WithArgs("ROTATED_KEY").
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-a"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
body := `{"key":"ROTATED_KEY","value":"v2"}`
c.Request = httptest.NewRequest("POST", "/admin/secrets", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
handler.SetGlobal(c)
if w.Code != http.StatusOK {
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
}
select {
case id := <-restarted:
if id != "ws-a" {
t.Errorf("expected ws-a restarted, got %q", id)
}
case <-time.After(2 * time.Second):
t.Fatal("restart not fired for the affected regular workspace")
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestSecretsSet_SpoofedHeader_DoesNotSuppressRestart (core#2584 regression):
// a request presenting a valid workspace bearer token MUST have its identity
// derived from the token, NOT from a spoofed X-Workspace-ID header. If the
@@ -1413,6 +1556,10 @@ func TestSecretsSet_SpoofedHeader_DoesNotSuppressRestart(t *testing.T) {
// Token lookup: the bearer resolves to ws-caller, NOT the target.
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-caller"))
// autoRestartAllowed (core#2573) checks the target's kind before firing.
mock.ExpectQuery(`SELECT COALESCE\(kind`).
WithArgs("550e8400-e29b-41d4-a716-446655440000").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
@@ -1463,6 +1610,10 @@ func TestSecretsDelete_SpoofedHeader_DoesNotSuppressRestart(t *testing.T) {
// Token lookup: the bearer resolves to ws-caller, NOT the target.
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-caller"))
// autoRestartAllowed (core#2573) checks the target's kind before firing.
mock.ExpectQuery(`SELECT COALESCE\(kind`).
WithArgs("550e8400-e29b-41d4-a716-446655440000").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)