fix(secrets): never auto-restart the org's platform root on secret changes (core#2573) #2603
@@ -843,6 +843,10 @@ func TestSecretsSet_TriggersAutoRestart(t *testing.T) {
|
||||
mock.ExpectExec("INSERT INTO workspace_secrets").
|
||||
WithArgs("77777777-7777-7777-7777-777777777777", "NEW_KEY", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
// autoRestartAllowed (core#2573) checks the target's kind before firing.
|
||||
mock.ExpectQuery(`SELECT COALESCE\(kind`).
|
||||
WithArgs("77777777-7777-7777-7777-777777777777").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -910,6 +914,10 @@ func TestSecretsDelete_TriggersAutoRestart(t *testing.T) {
|
||||
mock.ExpectExec("DELETE FROM workspace_secrets WHERE workspace_id").
|
||||
WithArgs("cccccccc-cccc-cccc-cccc-cccccccccccc", "REMOVE_KEY").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
// autoRestartAllowed (core#2573) checks the target's kind before firing.
|
||||
mock.ExpectQuery(`SELECT COALESCE\(kind`).
|
||||
WithArgs("cccccccc-cccc-cccc-cccc-cccccccccccc").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
@@ -118,6 +118,46 @@ func NewSecretsHandler(restartFunc func(string)) *SecretsHandler {
|
||||
return &SecretsHandler{restartFunc: restartFunc}
|
||||
}
|
||||
|
||||
// autoRestartAllowed (core#2573) decides whether a secret change on
|
||||
// workspaceID may fire the auto-restart. Two skips:
|
||||
//
|
||||
// 1. Self-write: the caller IS the target workspace -- restarting would
|
||||
// kill the writing agent mid-turn (original #2573 fix). Only covers
|
||||
// callers that present a workspace token / X-Workspace-ID, which the
|
||||
// concierge's management MCP does NOT (it authenticates with the
|
||||
// tenant ADMIN token, so callerID is "" and skip 1 never fired --
|
||||
// that gap terminated the org root's box twice on 2026-06-11, once
|
||||
// costing a 14h outage).
|
||||
// 2. Platform root: the target is the org's kind='platform' concierge.
|
||||
// An auto-restart here tears down the ORG ROOT's EC2 (terminate +
|
||||
// re-provision, minutes of downtime, and the provision leg has
|
||||
// failed silently before -- cp#691). The concierge picks changes up
|
||||
// on its next explicit restart; the canvas Restart button covers
|
||||
// operators who need it now.
|
||||
func autoRestartAllowed(ctx context.Context, callerID, workspaceID string) bool {
|
||||
if callerID == workspaceID {
|
||||
log.Printf("secrets: skipping auto-restart of %s (self-write, core#2573)", workspaceID)
|
||||
return false
|
||||
}
|
||||
var kind string
|
||||
if err := db.DB.QueryRowContext(ctx,
|
||||
`SELECT COALESCE(kind, 'workspace') FROM workspaces WHERE id = $1`, workspaceID).
|
||||
Scan(&kind); err != nil {
|
||||
// Fail closed: if we cannot prove the target is a regular
|
||||
// workspace, do NOT restart it -- a wrongly-fired restart on the
|
||||
// org root is exactly the outage this guard exists to prevent,
|
||||
// while a skipped restart just delays env propagation until the
|
||||
// next explicit restart.
|
||||
log.Printf("secrets: skipping auto-restart of %s (kind lookup failed, fail-closed: %v)", workspaceID, err)
|
||||
return false
|
||||
}
|
||||
if kind == "platform" {
|
||||
log.Printf("secrets: skipping auto-restart of %s (platform root, core#2573 -- restart explicitly to apply)", workspaceID)
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// List handles GET /workspaces/:id/secrets
|
||||
// Returns a merged view: workspace-level overrides + inherited global secrets.
|
||||
// Each entry includes a "scope" field ("workspace" or "global") so the frontend
|
||||
@@ -398,10 +438,10 @@ func (h *SecretsHandler) Set(c *gin.Context) {
|
||||
// drain the detached restart goroutine before db.DB is swapped — see
|
||||
// drainTestAsync in handlers_test.go and the canonical 69d9b4e3 fix.
|
||||
//
|
||||
// #2573: skip self-restart when the secret-write caller is the target
|
||||
// workspace, to avoid killing the writing agent mid-turn.
|
||||
// #2573: skip the auto-restart for self-writes AND for the platform
|
||||
// root (see autoRestartAllowed for the full rationale).
|
||||
callerID := callerWorkspaceID(c)
|
||||
if h.restartFunc != nil && callerID != workspaceID {
|
||||
if h.restartFunc != nil && autoRestartAllowed(c.Request.Context(), callerID, workspaceID) {
|
||||
wsID := workspaceID
|
||||
globalGoAsync(func() { h.restartFunc(wsID) })
|
||||
}
|
||||
@@ -449,9 +489,10 @@ func (h *SecretsHandler) Delete(c *gin.Context) {
|
||||
|
||||
// Auto-restart workspace to pick up removed secret.
|
||||
// RFC internal#524 Layer 1: see Set() above for the drain rationale.
|
||||
// #2573: skip self-restart when the secret-write caller is the target.
|
||||
// #2573: skip the auto-restart for self-writes AND for the platform
|
||||
// root (see autoRestartAllowed).
|
||||
callerID := callerWorkspaceID(c)
|
||||
if h.restartFunc != nil && callerID != workspaceID {
|
||||
if h.restartFunc != nil && autoRestartAllowed(c.Request.Context(), callerID, workspaceID) {
|
||||
wsID := workspaceID
|
||||
globalGoAsync(func() { h.restartFunc(wsID) })
|
||||
}
|
||||
@@ -579,10 +620,15 @@ func (h *SecretsHandler) restartAllAffectedByGlobalKey(key, excludeWorkspaceID s
|
||||
return
|
||||
}
|
||||
ctx := context.Background()
|
||||
// core#2573: the org's kind='platform' root is excluded for the same
|
||||
// reason autoRestartAllowed skips it -- an auto-restart terminates the
|
||||
// org root's box. It picks up rotated globals on its next explicit
|
||||
// restart.
|
||||
rows, err := db.DB.QueryContext(ctx, `
|
||||
SELECT id FROM workspaces
|
||||
WHERE status NOT IN ('removed', 'paused')
|
||||
AND COALESCE(runtime, '') <> 'external'
|
||||
AND COALESCE(kind, 'workspace') <> 'platform'
|
||||
AND id NOT IN (
|
||||
SELECT workspace_id FROM workspace_secrets WHERE key = $1
|
||||
)
|
||||
|
||||
@@ -257,6 +257,10 @@ func TestSecretsSet_AutoRestart(t *testing.T) {
|
||||
mock.ExpectExec("INSERT INTO workspace_secrets").
|
||||
WithArgs("550e8400-e29b-41d4-a716-446655440000", "DB_PASS", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
// autoRestartAllowed (core#2573) checks the target's kind before firing.
|
||||
mock.ExpectQuery(`SELECT COALESCE\(kind`).
|
||||
WithArgs("550e8400-e29b-41d4-a716-446655440000").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -444,6 +448,10 @@ func TestSecretsDelete_AutoRestart(t *testing.T) {
|
||||
mock.ExpectExec("DELETE FROM workspace_secrets WHERE workspace_id").
|
||||
WithArgs("550e8400-e29b-41d4-a716-446655440000", "OLD_KEY").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
// autoRestartAllowed (core#2573) checks the target's kind before firing.
|
||||
mock.ExpectQuery(`SELECT COALESCE\(kind`).
|
||||
WithArgs("550e8400-e29b-41d4-a716-446655440000").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -1330,6 +1338,95 @@ func TestSecretsDelete_SkipSelfRestart_WhenCallerIsTarget(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestSecretsSet_SkipAutoRestart_PlatformRoot (#2573): a secret write
|
||||
// targeting the org's kind='platform' concierge must NOT auto-restart it,
|
||||
// regardless of who the caller is. The management MCP authenticates with the
|
||||
// tenant ADMIN token (callerID == ""), so the self-write skip never fires for
|
||||
// the concierge — this kind check is what actually protects the org root.
|
||||
func TestSecretsSet_SkipAutoRestart_PlatformRoot(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
restarted := make(chan string, 1)
|
||||
restartFunc := func(wsID string) { restarted <- wsID }
|
||||
handler := NewSecretsHandler(restartFunc)
|
||||
|
||||
mock.ExpectExec("INSERT INTO workspace_secrets").
|
||||
WithArgs("550e8400-e29b-41d4-a716-446655440000", "TEST_SECRET", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectQuery(`SELECT COALESCE\(kind`).
|
||||
WithArgs("550e8400-e29b-41d4-a716-446655440000").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{{Key: "id", Value: "550e8400-e29b-41d4-a716-446655440000"}}
|
||||
body := `{"key":"TEST_SECRET","value":"v"}`
|
||||
c.Request = httptest.NewRequest("POST", "/workspaces/550e8400-e29b-41d4-a716-446655440000/secrets", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
// No workspace bearer / X-Workspace-ID: admin-token caller, callerID == "".
|
||||
|
||||
handler.Set(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
select {
|
||||
case <-restarted:
|
||||
t.Fatal("restart must NOT fire for a kind='platform' target")
|
||||
case <-time.After(200 * time.Millisecond):
|
||||
// Expected — no restart.
|
||||
}
|
||||
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSecretsDelete_SkipAutoRestart_PlatformRoot (#2573): symmetric skip for
|
||||
// the DELETE path — cleaning up secrets on the concierge must not kill it.
|
||||
func TestSecretsDelete_SkipAutoRestart_PlatformRoot(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
restarted := make(chan string, 1)
|
||||
restartFunc := func(wsID string) { restarted <- wsID }
|
||||
handler := NewSecretsHandler(restartFunc)
|
||||
|
||||
mock.ExpectExec("DELETE FROM workspace_secrets WHERE workspace_id").
|
||||
WithArgs("550e8400-e29b-41d4-a716-446655440000", "TEST_SECRET").
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
mock.ExpectQuery(`SELECT COALESCE\(kind`).
|
||||
WithArgs("550e8400-e29b-41d4-a716-446655440000").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
c.Params = gin.Params{
|
||||
{Key: "id", Value: "550e8400-e29b-41d4-a716-446655440000"},
|
||||
{Key: "key", Value: "TEST_SECRET"},
|
||||
}
|
||||
c.Request = httptest.NewRequest("DELETE", "/workspaces/550e8400-e29b-41d4-a716-446655440000/secrets/TEST_SECRET", nil)
|
||||
|
||||
handler.Delete(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
select {
|
||||
case <-restarted:
|
||||
t.Fatal("restart must NOT fire for a kind='platform' target")
|
||||
case <-time.After(200 * time.Millisecond):
|
||||
// Expected — no restart.
|
||||
}
|
||||
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSetGlobal_SkipSelfRestart_WhenCallerIsAffected (#2573): when the org
|
||||
// platform agent (caller) sets a global secret, its own workspace must be
|
||||
// excluded from the fan-out restart so it isn't killed mid-turn.
|
||||
@@ -1390,6 +1487,52 @@ func TestSetGlobal_SkipSelfRestart_WhenCallerIsAffected(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestSetGlobal_FanOutQuery_ExcludesPlatformRoot (#2573): the fan-out query
|
||||
// itself must exclude kind='platform' workspaces — the org root must never be
|
||||
// auto-restarted by a global-secret rotation. The regex pins the SQL filter;
|
||||
// rows returned already reflect it, so the assertion lives in the query match.
|
||||
func TestSetGlobal_FanOutQuery_ExcludesPlatformRoot(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
setupTestRedis(t)
|
||||
|
||||
restarted := make(chan string, 2)
|
||||
restartFunc := func(wsID string) { restarted <- wsID }
|
||||
handler := NewSecretsHandler(restartFunc)
|
||||
|
||||
mock.ExpectExec("INSERT INTO global_secrets").
|
||||
WithArgs("ROTATED_KEY", sqlmock.AnyArg(), sqlmock.AnyArg()).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
mock.ExpectQuery(`(?s)SELECT id FROM workspaces.*COALESCE\(kind, 'workspace'\) <> 'platform'`).
|
||||
WithArgs("ROTATED_KEY").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-a"))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
body := `{"key":"ROTATED_KEY","value":"v2"}`
|
||||
c.Request = httptest.NewRequest("POST", "/admin/secrets", bytes.NewBufferString(body))
|
||||
c.Request.Header.Set("Content-Type", "application/json")
|
||||
|
||||
handler.SetGlobal(c)
|
||||
|
||||
if w.Code != http.StatusOK {
|
||||
t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
|
||||
}
|
||||
|
||||
select {
|
||||
case id := <-restarted:
|
||||
if id != "ws-a" {
|
||||
t.Errorf("expected ws-a restarted, got %q", id)
|
||||
}
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("restart not fired for the affected regular workspace")
|
||||
}
|
||||
|
||||
if err := mock.ExpectationsWereMet(); err != nil {
|
||||
t.Errorf("unmet sqlmock expectations: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSecretsSet_SpoofedHeader_DoesNotSuppressRestart (core#2584 regression):
|
||||
// a request presenting a valid workspace bearer token MUST have its identity
|
||||
// derived from the token, NOT from a spoofed X-Workspace-ID header. If the
|
||||
@@ -1413,6 +1556,10 @@ func TestSecretsSet_SpoofedHeader_DoesNotSuppressRestart(t *testing.T) {
|
||||
// Token lookup: the bearer resolves to ws-caller, NOT the target.
|
||||
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-caller"))
|
||||
// autoRestartAllowed (core#2573) checks the target's kind before firing.
|
||||
mock.ExpectQuery(`SELECT COALESCE\(kind`).
|
||||
WithArgs("550e8400-e29b-41d4-a716-446655440000").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
@@ -1463,6 +1610,10 @@ func TestSecretsDelete_SpoofedHeader_DoesNotSuppressRestart(t *testing.T) {
|
||||
// Token lookup: the bearer resolves to ws-caller, NOT the target.
|
||||
mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
|
||||
WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-caller"))
|
||||
// autoRestartAllowed (core#2573) checks the target's kind before firing.
|
||||
mock.ExpectQuery(`SELECT COALESCE\(kind`).
|
||||
WithArgs("550e8400-e29b-41d4-a716-446655440000").
|
||||
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
|
||||
|
||||
w := httptest.NewRecorder()
|
||||
c, _ := gin.CreateTestContext(w)
|
||||
|
||||
Reference in New Issue
Block a user