From 53130ee020ac43868a3376a2905eb7baaeb78666 Mon Sep 17 00:00:00 2001 From: "Molecule AI Dev Engineer A (Kimi)" Date: Fri, 5 Jun 2026 09:03:52 +0000 Subject: [PATCH] fix(reconciler): export RestartDebounceWindow and assert >= reconcile interval (#2284) The restart debounce window (60s) was set exactly equal to the CP instance reconciler interval (60s) with zero margin. If someone drops the reconciler interval below the debounce window, a workspace flipped offline by one tick can be reprovisioned again by the next tick before the debounce drops it, reopening the double-reprovision thrash class (internal#544). Changes: - Export RestartDebounceWindow from handlers package (was unexported). - Add explicit COUPLING comment documenting the >= relationship. - Add runtime assertion in main.go that fatals if the interval ever exceeds the debounce window, making the coupling fail-closed. - Update test that manipulates the window to use the exported name. Closes #2284 (MED item) --- workspace-server/cmd/server/main.go | 11 +++++++++- .../internal/handlers/workspace_restart.go | 22 ++++++++++++------- .../workspace_restart_self_fire_test.go | 6 ++--- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/workspace-server/cmd/server/main.go b/workspace-server/cmd/server/main.go index 56c2e4a15..4e371e885 100644 --- a/workspace-server/cmd/server/main.go +++ b/workspace-server/cmd/server/main.go @@ -351,8 +351,17 @@ func main() { // (true, err) on any transient error, so a CP blip never flips a healthy // workspace. if cpProv != nil { + // Guard against double-reprovision thrash (internal#544): the restart + // debounce window must cover the reconciler interval so a workspace + // flipped offline by one reconcile tick isn't immediately reprovisioned + // again by the next tick before the debounce drops it. If the interval + // ever shrinks below the debounce window, the coupling silently breaks. + reconcileInterval := 60 * time.Second + if handlers.RestartDebounceWindow < reconcileInterval { + log.Fatalf("RestartDebounceWindow (%s) must be >= CP instance reconciler interval (%s) to prevent double-reprovision thrash (internal#544)", handlers.RestartDebounceWindow, reconcileInterval) + } go supervised.RunWithRecover(ctx, "cp-instance-reconciler", func(c context.Context) { - registry.StartCPInstanceReconciler(c, cpProv, onWorkspaceOffline, 60*time.Second) + registry.StartCPInstanceReconciler(c, cpProv, onWorkspaceOffline, reconcileInterval) }) } diff --git a/workspace-server/internal/handlers/workspace_restart.go b/workspace-server/internal/handlers/workspace_restart.go index 3129386f2..e27b44a5e 100644 --- a/workspace-server/internal/handlers/workspace_restart.go +++ b/workspace-server/internal/handlers/workspace_restart.go @@ -45,7 +45,7 @@ type restartState struct { // flipped running=true. Used by the self-fire debounce (internal#544, // the ws-server self-fire restart feedback loop seen in prod-Reviewer/ // Researcher 2026-05-19 ~00:05Z 4x reprov thrash): any RestartByID - // arriving within restartDebounceWindow of this timestamp is silently + // arriving within RestartDebounceWindow of this timestamp is silently // dropped so a probe firing during the EC2-pending window can't // re-trigger a fresh full cycle on the just-launched instance. restartStartedAt time.Time @@ -55,13 +55,19 @@ type restartState struct { // its own entry so unrelated workspaces don't serialize on each other. var restartStates sync.Map // map[workspaceID]*restartState -// restartDebounceWindow is the silent-drop window for successive RestartByID +// RestartDebounceWindow is the silent-drop window for successive RestartByID // calls. Sized to cover the typical EC2 pending → online interval (20-30s) // with a margin so a probe firing during the just-after-online but still- // flaky heartbeat window also gets dropped. Bigger than that would block // legitimate "Restart failed, retry" recoveries; smaller would let the // 4x thrash class through. Package-level so tests can shrink it. -var restartDebounceWindow = 60 * time.Second +// +// COUPLING: this window MUST be >= the CP instance reconciler interval +// (cmd/server/main.go:355, currently 60s). If the interval ever shrinks +// below this window, a workspace flipped offline by one reconcile tick can +// be reprovisioned again by the next tick before the debounce drops it, +// reopening the double-reprovision thrash class (internal#544). +var RestartDebounceWindow = 60 * time.Second // restartByIDDropCounter is incremented every time RestartByID drops a call // inside the debounce window. Exposed as a package-level atomic counter so @@ -536,7 +542,7 @@ func (h *WorkspaceHandler) RestartByID(workspaceID string) { return } // Self-fire debounce: drop (not coalesce) successive RestartByID calls - // within restartDebounceWindow of the most recent cycle's start. This + // within RestartDebounceWindow of the most recent cycle's start. This // is the load-bearing protection against the 4x reprov thrash class — // coalesceRestart's pending-flag would otherwise drain by running // ANOTHER full cycle of stop+provision on the just-launched EC2 (still @@ -550,14 +556,14 @@ func (h *WorkspaceHandler) RestartByID(workspaceID string) { if shouldDebounceRestart(workspaceID) { restartByIDDropCounter.Add(1) log.Printf("RestartByID: %s — dropped (within %s self-fire debounce window; total dropped=%d)", - workspaceID, restartDebounceWindow, restartByIDDropCounter.Load()) + workspaceID, RestartDebounceWindow, restartByIDDropCounter.Load()) return } coalesceRestart(workspaceID, func() { h.runRestartCycle(workspaceID) }) } // shouldDebounceRestart reports whether the most recent cycle for this -// workspace started within restartDebounceWindow. Read-only on +// workspace started within RestartDebounceWindow. Read-only on // restartState; the actual restartStartedAt stamp is written in // coalesceRestart when running flips false→true. func shouldDebounceRestart(workspaceID string) bool { @@ -571,7 +577,7 @@ func shouldDebounceRestart(workspaceID string) bool { if state.restartStartedAt.IsZero() { return false } - return time.Since(state.restartStartedAt) < restartDebounceWindow + return time.Since(state.restartStartedAt) < RestartDebounceWindow } // coalesceRestart implements the pending-flag gate around an arbitrary cycle @@ -594,7 +600,7 @@ func coalesceRestart(workspaceID string, cycle func()) { } state.running = true // Stamp the start time so the RestartByID debounce can drop any - // self-fire probe that hits within restartDebounceWindow. Only the + // self-fire probe that hits within RestartDebounceWindow. Only the // false→true edge stamps; the drain-loop's inner cycles re-use the // same start (they're effectively one "restart event" from the // debounce's POV). diff --git a/workspace-server/internal/handlers/workspace_restart_self_fire_test.go b/workspace-server/internal/handlers/workspace_restart_self_fire_test.go index 832fdb2ef..3ec153020 100644 --- a/workspace-server/internal/handlers/workspace_restart_self_fire_test.go +++ b/workspace-server/internal/handlers/workspace_restart_self_fire_test.go @@ -203,9 +203,9 @@ func TestRestartByID_DebounceExpiresAfterWindow(t *testing.T) { const wsID = "self-fire-ws-debounce-release" resetSelfFireState(wsID) - orig := restartDebounceWindow - restartDebounceWindow = 5 * time.Millisecond - defer func() { restartDebounceWindow = orig }() + orig := RestartDebounceWindow + RestartDebounceWindow = 5 * time.Millisecond + defer func() { RestartDebounceWindow = orig }() // Stamp inside the window. sv, _ := restartStates.LoadOrStore(wsID, &restartState{}) -- 2.52.0