fix(core#2566): refuse concierge/agent set_workspace_secret self-writes on org root #2756

Merged
devops-engineer merged 1 commits from fix/core2566-concierge-self-secret-write into main 2026-06-13 19:06:31 +00:00
2 changed files with 306 additions and 0 deletions
@@ -144,6 +144,57 @@ func NewSecretsHandler(restartFunc func(string)) *SecretsHandler {
return &SecretsHandler{restartFunc: restartFunc}
}
// conciergeSelfSecretWriteBlocked (core#2566) reports whether a Set call
// from the concierge/agent surface (AdminAuth ADMIN_TOKEN) targeting the
// org's kind='platform' concierge workspace should be refused outright
// (self-DoS guard). Returns a non-empty reason when blocked.
//
// Threat model (core#2566, 2026-06-11 live incident):
// - The concierge's management MCP authenticates with the tenant
// ADMIN token. On a Set call against the concierge's own workspace
// (kind='platform'), #2573's auto-restart skip *does* fire (skip 2
// covers kind='platform') — but that guard is best-effort and there
// is a second, earlier failure path: the secret write itself
// triggers env-var reload side effects inside the live container
// mid-turn, and any path that later invokes a restart (operator
// click, restart-on-failure watchdog, the next Set/Delete) tears
// the concierge down. The org root going offline has already cost
// a multi-hour outage once.
// - Approval gating (#2574) is necessary but NOT sufficient: an
// approval can be issued by a sleepy operator, and the concierge
// consuming it still self-DoSes. The only safe posture is to
// refuse the self-targeted write and force a human to apply the
// change through a non-agent path (canvas Secrets tab, operator
// session with explicit operator-action audit).
//
// Scope: AdminAuth admin-token callers ONLY. Session-cookie
// (cp_session_actor) and ordinary workspace-token callers are NOT
// blocked — they are human operators / non-concierge agents and may
// legitimately need to write the concierge's own secrets.
//
// Fail-closed: if the kind lookup errors, we refuse the write (and
// log) — a wrongly-fired write on the org root is exactly the outage
// this guard exists to prevent, while a refused write is just a retry
// after the DB hiccup clears.
func conciergeSelfSecretWriteBlocked(c *gin.Context, targetWorkspaceID string) (bool, string) {
if !callerIsAdminToken(c) {
return false, ""
}
ctx := c.Request.Context()
var kind string
if err := db.DB.QueryRowContext(ctx,
`SELECT COALESCE(kind, 'workspace') FROM workspaces WHERE id = $1`, targetWorkspaceID).
Scan(&kind); err != nil {
// Fail closed — see doc comment.
log.Printf("secrets: blocking admin-token set_workspace_secret on %s (kind lookup failed, fail-closed: %v)", targetWorkspaceID, err)
return true, "self-DoS guard: workspace kind lookup failed; refusing to write to a workspace whose kind cannot be verified"
}
if kind == "platform" {
return true, "concierge cannot set_workspace_secret on its own platform-root workspace (self-DoS guard, core#2566); apply this secret through the canvas Secrets tab as a human operator"
}
return false, ""
}
// autoRestartAllowed (core#2573) decides whether a secret change on
// workspaceID may fire the auto-restart. Two skips:
//
@@ -412,6 +463,28 @@ func (h *SecretsHandler) Set(c *gin.Context) {
return
}
// core#2566: refuse concierge/agent self-targeted secret writes on the
// org-root (kind='platform') workspace. Fires BEFORE the approval gate so
// a refused write does not even create a pending approval — the operator
// is told to apply the change through the canvas Secrets tab instead, and
// the audit trail records the refused attempt.
if blocked, reason := conciergeSelfSecretWriteBlocked(c, workspaceID); blocked {
log.Printf("secrets: refusing admin-token set_workspace_secret on %s key=%s (core#2566 self-DoS guard): %s", workspaceID, body.Key, reason)
c.JSON(http.StatusForbidden, gin.H{
"error": reason,
"workspace_id": workspaceID,
"key": body.Key,
"code": "CONCIERGE_SELF_WRITE_BLOCKED",
})
audit.Emit(c.Request.Context(), "secret.set.refused", map[string]any{
"workspace_id": workspaceID,
"key": body.Key,
"reason": "concierge_self_write_blocked",
"issue": "core#2566",
})
return
}
// RFC platform-agent Phase 4b: gate org-token (platform-agent) secret writes
// behind human approval. The context includes the key so an approval for one
// secret cannot authorise writing another. No-op for ordinary callers and
@@ -1297,6 +1297,15 @@ func TestSecretsSet_AdminToken_GatedByApproval(t *testing.T) {
setupTestRedis(t)
handler := NewSecretsHandler(nil)
// core#2566 self-DoS guard: the guard runs BEFORE the approval gate
// for admin-token callers. The target here is a regular workspace
// (kind='workspace'), so the guard passes and the approval gate
// takes over. Mock the kind lookup so the guard does not fail-closed
// on an unmocked query.
mock.ExpectQuery(`SELECT COALESCE\(kind`).
WithArgs("550e8400-e29b-41d4-a716-446655440000").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
// requireApproval sequence for an admin-token caller (gated action,
// no pre-existing approval). The gate's requireApproval runs THREE
// queries: UPDATE (consume), INSERT (new pending), SELECT (parent_id).
@@ -1941,3 +1950,227 @@ func TestPlatformManagedLLMModeForWorkspace_GatesOnModelNotResolvedMode(t *testi
}
})
}
// =============================================================================
// core#2566: concierge self-DoS guard for set_workspace_secret
// =============================================================================
//
// The org-root concierge authenticates its management MCP with the tenant
// ADMIN_TOKEN (AdminAuth, caller_is_admin_token=true). When the concierge
// calls set_workspace_secret against its own workspace (kind='platform'),
// any of (a) a future restart on the container, (b) env-var reload side
// effects, or (c) the next concierge turn starting on a half-applied env
// can terminate the org root mid-task — the live 2026-06-11 incident took
// the concierge offline and required operator hand-repair.
//
// The guard: for admin-token callers, look up the target's kind and refuse
// the write when it is 'platform'. Fail-closed on DB error. Session-cookie
// (cp_session_actor) and ordinary workspace-token callers are NOT blocked —
// they are human operators / non-concierge agents and may legitimately
// need to write the concierge's own secrets.
// TestSecretsSet_AdminTokenSelfWrite_Refused (#2566): an admin-token caller
// writing a secret to the org-root (kind='platform') workspace MUST be
// refused with 403. No secret row is written, no approval is created, and
// the response carries a structured code so the caller can surface a
// self-DoS-safe error to the user.
func TestSecretsSet_AdminTokenSelfWrite_Refused(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewSecretsHandler(nil)
// Guard's kind lookup. If the guard were bypassed (the bug), the
// handler would proceed to the approval gate and sqlmock would NOT see
// this query, failing the test. Mock it now to drive the guard.
mock.ExpectQuery(`SELECT COALESCE\(kind`).
WithArgs("550e8400-e29b-41d4-a716-446655440000").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("platform"))
// NOTE: deliberately NO approval-gate queries, NO INSERT mock setup.
// A refused write must short-circuit BEFORE the gate — so neither is
// reached.
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "550e8400-e29b-41d4-a716-446655440000"}}
body := `{"key":"SELF_DOS_TEST","value":"x"}`
c.Request = httptest.NewRequest("POST", "/workspaces/550e8400-e29b-41d4-a716-446655440000/secrets", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
// core#2566: the concierge's MCP authenticates with the tenant
// ADMIN_TOKEN. AdminAuth sets these context keys.
c.Set("caller_is_admin_token", true)
c.Set("caller_credential_class", "admin-token")
handler.Set(c)
if w.Code != http.StatusForbidden {
t.Fatalf("admin-token self-write to kind='platform' MUST be refused with 403, got %d: %s",
w.Code, w.Body.String())
}
var resp map[string]interface{}
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
t.Fatalf("failed to parse response: %v", err)
}
if resp["code"] != "CONCIERGE_SELF_WRITE_BLOCKED" {
t.Errorf("code = %v, want CONCIERGE_SELF_WRITE_BLOCKED", resp["code"])
}
if resp["workspace_id"] != "550e8400-e29b-41d4-a716-446655440000" {
t.Errorf("workspace_id = %v, want 550e8400-e29b-41d4-a716-446655440000", resp["workspace_id"])
}
if resp["key"] != "SELF_DOS_TEST" {
t.Errorf("key = %v, want SELF_DOS_TEST", resp["key"])
}
if errMsg, _ := resp["error"].(string); !strings.Contains(errMsg, "self-DoS") {
t.Errorf("error %q should mention self-DoS", errMsg)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestSecretsSet_AdminTokenOnOtherWorkspace_NotBlocked (#2566): the guard
// is scoped to the concierge's OWN workspace. An admin-token write to a
// regular workspace must NOT be blocked by this guard (it is still gated
// by the Phase-4 approval gate in #2574, which returns 202).
func TestSecretsSet_AdminTokenOnOtherWorkspace_NotBlocked(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewSecretsHandler(nil)
// Guard's kind lookup: target is a regular workspace, so the guard
// returns false and the approval gate takes over.
mock.ExpectQuery(`SELECT COALESCE\(kind`).
WithArgs("550e8400-e29b-41d4-a716-446655440000").
WillReturnRows(sqlmock.NewRows([]string{"kind"}).AddRow("workspace"))
// Approval gate (admin-token caller → always gated, regardless of
// rollout flag — see TestSecretsSet_AdminToken_GatedByApproval).
mock.ExpectQuery(`UPDATE approval_requests SET consumed_at`).
WillReturnError(sql.ErrNoRows)
mock.ExpectQuery(`WITH existing AS`).
WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("appr-core2566-allow"))
mock.ExpectQuery(`SELECT parent_id FROM workspaces WHERE id`).
WillReturnRows(sqlmock.NewRows([]string{"parent_id"}).AddRow(nil))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "550e8400-e29b-41d4-a716-446655440000"}}
body := `{"key":"NOT_SELF","value":"v"}`
c.Request = httptest.NewRequest("POST", "/workspaces/550e8400-e29b-41d4-a716-446655440000/secrets", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
c.Set("caller_is_admin_token", true)
c.Set("caller_credential_class", "admin-token")
handler.Set(c)
// Guard did NOT block → approval gate fires → 202 (admin-token
// always gated). This proves the guard only fires on the platform
// root and lets ordinary writes fall through to the gate.
if w.Code != http.StatusAccepted {
t.Fatalf("admin-token write to a regular workspace must fall through to the approval gate (202), got %d: %s",
w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestSecretsSet_AdminTokenKindLookupError_FailsClosed (#2566): when the
// DB hiccup prevents verifying the target's kind, the guard refuses
// the write (fail-closed). This is the safe default: a wrongly-fired
// self-DoS is exactly the outage this guard exists to prevent, while
// a refused write is just a retry after the DB hiccup clears.
func TestSecretsSet_AdminTokenKindLookupError_FailsClosed(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewSecretsHandler(nil)
mock.ExpectQuery(`SELECT COALESCE\(kind`).
WithArgs("550e8400-e29b-41d4-a716-446655440000").
WillReturnError(sql.ErrConnDone) // arbitrary transient DB error
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "550e8400-e29b-41d4-a716-446655440000"}}
body := `{"key":"FAIL_CLOSED","value":"v"}`
c.Request = httptest.NewRequest("POST", "/workspaces/550e8400-e29b-41d4-a716-446655440000/secrets", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
c.Set("caller_is_admin_token", true)
c.Set("caller_credential_class", "admin-token")
handler.Set(c)
// Fail-closed: 403 with CONCIERGE_SELF_WRITE_BLOCKED. We chose 403
// (not 500) because the guard is making a policy decision, not a
// transient error report — the operator sees the structured code
// and knows exactly why the write was refused. A 500 would invite
// a retry loop on a path that is fundamentally unsafe to retry.
if w.Code != http.StatusForbidden {
t.Fatalf("admin-token write with kind-lookup failure must fail closed with 403, got %d: %s",
w.Code, w.Body.String())
}
var resp map[string]interface{}
if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
t.Fatalf("failed to parse response: %v", err)
}
if resp["code"] != "CONCIERGE_SELF_WRITE_BLOCKED" {
t.Errorf("code = %v, want CONCIERGE_SELF_WRITE_BLOCKED", resp["code"])
}
if errMsg, _ := resp["error"].(string); !strings.Contains(errMsg, "kind lookup failed") {
t.Errorf("error %q should mention the kind-lookup failure", errMsg)
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}
// TestSecretsSet_SessionCookieCaller_NotBlocked (#2566): a human operator
// using the canvas session cookie (cp_session_actor, NOT admin-token) MUST
// NOT be blocked by this guard. The guard is scoped to AdminAuth
// admin-token callers only — the concierge's surface. Operators must
// retain the ability to write the concierge's own secrets through the
// canvas Secrets tab, which uses the session tier.
func TestSecretsSet_SessionCookieCaller_NotBlocked(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
handler := NewSecretsHandler(nil)
// Guard's kind lookup: should NOT happen, because the caller is
// session-cookie, not admin-token. If the test fails with an unmet
// expectation here, the guard is over-firing.
//
// (No `SELECT COALESCE(kind` mock.)
//
// The handler proceeds past the (skipped) guard to the approval
// gate. Session cookies are not in the gated set, so the gate
// passes (returns true → handler INSERTs and 200s).
mock.ExpectExec("INSERT INTO workspace_secrets").
WithArgs("550e8400-e29b-41d4-a716-446655440000", "OPERATOR_SECRET", sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
w := httptest.NewRecorder()
c, _ := gin.CreateTestContext(w)
c.Params = gin.Params{{Key: "id", Value: "550e8400-e29b-41d4-a716-446655440000"}}
body := `{"key":"OPERATOR_SECRET","value":"v"}`
c.Request = httptest.NewRequest("POST", "/workspaces/550e8400-e29b-41d4-a716-446655440000/secrets", bytes.NewBufferString(body))
c.Request.Header.Set("Content-Type", "application/json")
// Session-cookie caller: cp_session_actor set, caller_is_admin_token
// is NOT set (AdminAuth's session tier does not flip that flag).
c.Set("cp_session_actor", "user-op-1234")
handler.Set(c)
if w.Code != http.StatusOK {
t.Fatalf("session-cookie caller must NOT be blocked by the self-DoS guard, got %d: %s",
w.Code, w.Body.String())
}
if err := mock.ExpectationsWereMet(); err != nil {
t.Errorf("unmet sqlmock expectations: %v", err)
}
}