From 1b432f6ffd4b9c02511ad175f6816cbad0afaf36 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 14 Apr 2026 12:39:00 -0700 Subject: [PATCH] fix(secrets): auto-restart workspaces on global secret change (#15) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Global secrets (e.g. CLAUDE_CODE_OAUTH_TOKEN) are injected as container env vars at Start() time. Until now, rotating one only propagated to a workspace on the next full restart-from-zero, which manual ops had to drive via a `POST /workspaces/:id/restart` loop. Tier-3 Claude Code agents hit the stale-token path first and surfaced as 401s inside the SDK. Restart-time re-read of global_secrets + workspace_secrets was already correct in `provisionWorkspaceOpts` — the missing piece was the trigger. SetGlobal / DeleteGlobal now enqueue RestartByID for every non-paused, non-removed, non-external workspace that does NOT shadow the key with a workspace-level override. Matches the existing behaviour of workspace-scoped `Set` / `Delete`. Adds two sqlmock-backed tests exercising both branches. Co-Authored-By: Claude Opus 4.6 (1M context) --- platform/internal/handlers/secrets.go | 52 +++++++++++ platform/internal/handlers/secrets_test.go | 100 +++++++++++++++++++++ 2 files changed, 152 insertions(+) diff --git a/platform/internal/handlers/secrets.go b/platform/internal/handlers/secrets.go index c7eb4bf6..256d102c 100644 --- a/platform/internal/handlers/secrets.go +++ b/platform/internal/handlers/secrets.go @@ -1,6 +1,7 @@ package handlers import ( + "context" "database/sql" "log" "net/http" @@ -355,9 +356,56 @@ func (h *SecretsHandler) SetGlobal(c *gin.Context) { return } + // Issue #15: global secrets are injected into containers as env vars at + // Start() time, so a rotating token (e.g. CLAUDE_CODE_OAUTH_TOKEN) doesn't + // reach existing workspaces until the container is recreated. Auto-restart + // every workspace whose env is affected — i.e. those WITHOUT a + // workspace-level override of the same key. + go h.restartAllAffectedByGlobalKey(body.Key) + c.JSON(http.StatusOK, gin.H{"status": "saved", "key": body.Key, "scope": "global"}) } +// restartAllAffectedByGlobalKey restarts every non-paused, non-removed +// workspace that would inherit the given global-secret key (i.e. does NOT +// have a workspace-level override). Used on SetGlobal / DeleteGlobal so +// rotated credentials (OAuth tokens, API keys) propagate without a manual +// restart loop. See issue #15. +func (h *SecretsHandler) restartAllAffectedByGlobalKey(key string) { + if h.restartFunc == nil { + return + } + ctx := context.Background() + rows, err := db.DB.QueryContext(ctx, ` + SELECT id FROM workspaces + WHERE status NOT IN ('removed', 'paused') + AND COALESCE(runtime, '') <> 'external' + AND id NOT IN ( + SELECT workspace_id FROM workspace_secrets WHERE key = $1 + ) + `, key) + if err != nil { + log.Printf("Global secret %s: failed to list affected workspaces for auto-restart: %v", key, err) + return + } + defer rows.Close() + + var ids []string + for rows.Next() { + var id string + if err := rows.Scan(&id); err == nil { + ids = append(ids, id) + } + } + if len(ids) == 0 { + return + } + log.Printf("Global secret %s changed: auto-restarting %d workspace(s) to refresh env", key, len(ids)) + for _, id := range ids { + go h.restartFunc(id) + } +} + // DeleteGlobal handles DELETE /admin/secrets/:key func (h *SecretsHandler) DeleteGlobal(c *gin.Context) { key := c.Param("key") @@ -376,6 +424,10 @@ func (h *SecretsHandler) DeleteGlobal(c *gin.Context) { return } + // Issue #15: propagate deletion to running containers — otherwise they + // keep the stale env var until manual restart. + go h.restartAllAffectedByGlobalKey(key) + c.JSON(http.StatusOK, gin.H{"status": "deleted", "key": key, "scope": "global"}) } diff --git a/platform/internal/handlers/secrets_test.go b/platform/internal/handlers/secrets_test.go index 1f8ac8c4..f95fe0f6 100644 --- a/platform/internal/handlers/secrets_test.go +++ b/platform/internal/handlers/secrets_test.go @@ -684,3 +684,103 @@ func TestSecretsValues_InvalidWorkspaceID(t *testing.T) { t.Errorf("expected 400, got %d", w.Code) } } + +// ==================== Global secret auto-restart (issue #15) ==================== + +// TestSetGlobal_AutoRestartsAffectedWorkspaces documents the fix for #15: +// rotating a global secret (e.g. CLAUDE_CODE_OAUTH_TOKEN) must propagate to +// every running workspace without a manual restart loop. The handler should +// fire RestartByID for each non-paused/non-removed workspace that does NOT +// have a workspace-level override of the same key. +func TestSetGlobal_AutoRestartsAffectedWorkspaces(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + restarted := make(chan string, 4) + restartFunc := func(wsID string) { restarted <- wsID } + handler := NewSecretsHandler(restartFunc) + + // INSERT ... ON CONFLICT for the global secret itself. + mock.ExpectExec("INSERT INTO global_secrets"). + WithArgs("CLAUDE_CODE_OAUTH_TOKEN", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // Query for affected workspaces — ws-A inherits, ws-B overrides (excluded). + mock.ExpectQuery("SELECT id FROM workspaces"). + WithArgs("CLAUDE_CODE_OAUTH_TOKEN"). + WillReturnRows(sqlmock.NewRows([]string{"id"}). + AddRow("ws-a"). + AddRow("ws-c")) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + body := `{"key":"CLAUDE_CODE_OAUTH_TOKEN","value":"sk-ant-oat01-new"}` + c.Request = httptest.NewRequest("POST", "/admin/secrets", bytes.NewBufferString(body)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.SetGlobal(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + + // Collect both expected restarts (order not guaranteed). + seen := map[string]bool{} + deadline := time.After(2 * time.Second) + for len(seen) < 2 { + select { + case id := <-restarted: + seen[id] = true + case <-deadline: + t.Fatalf("auto-restart not fired for all affected workspaces; got %v", seen) + } + } + if !seen["ws-a"] || !seen["ws-c"] { + t.Errorf("expected ws-a and ws-c restarted, got %v", seen) + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +// TestDeleteGlobal_AutoRestartsAffectedWorkspaces covers the delete branch of #15. +func TestDeleteGlobal_AutoRestartsAffectedWorkspaces(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + restarted := make(chan string, 2) + handler := NewSecretsHandler(func(id string) { restarted <- id }) + + mock.ExpectExec("DELETE FROM global_secrets"). + WithArgs("OLD_KEY"). + WillReturnResult(sqlmock.NewResult(0, 1)) + + mock.ExpectQuery("SELECT id FROM workspaces"). + WithArgs("OLD_KEY"). + WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-x")) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "key", Value: "OLD_KEY"}} + c.Request = httptest.NewRequest("DELETE", "/admin/secrets/OLD_KEY", nil) + + handler.DeleteGlobal(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + + select { + case id := <-restarted: + if id != "ws-x" { + t.Errorf("expected ws-x, got %q", id) + } + case <-time.After(2 * time.Second): + t.Fatal("auto-restart not fired") + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +}