fix(reconciler): export RestartDebounceWindow and assert >= reconcile interval (#2284) #2297

Merged
devops-engineer merged 4 commits from fix/reconciler-debounce-coupling-2284 into main 2026-06-06 21:17:30 +00:00
3 changed files with 27 additions and 12 deletions
+10 -1
View File
@@ -351,8 +351,17 @@ func main() {
// (true, err) on any transient error, so a CP blip never flips a healthy
// workspace.
if cpProv != nil {
// Guard against double-reprovision thrash (internal#544): the restart
// debounce window must cover the reconciler interval so a workspace
// flipped offline by one reconcile tick isn't immediately reprovisioned
// again by the next tick before the debounce drops it. If the interval
// ever shrinks below the debounce window, the coupling silently breaks.
reconcileInterval := 60 * time.Second
if handlers.RestartDebounceWindow < reconcileInterval {
log.Fatalf("RestartDebounceWindow (%s) must be >= CP instance reconciler interval (%s) to prevent double-reprovision thrash (internal#544)", handlers.RestartDebounceWindow, reconcileInterval)
}
go supervised.RunWithRecover(ctx, "cp-instance-reconciler", func(c context.Context) {
registry.StartCPInstanceReconciler(c, cpProv, onWorkspaceOffline, 60*time.Second)
registry.StartCPInstanceReconciler(c, cpProv, onWorkspaceOffline, reconcileInterval)
})
}
@@ -45,7 +45,7 @@ type restartState struct {
// flipped running=true. Used by the self-fire debounce (internal#544,
// the ws-server self-fire restart feedback loop seen in prod-Reviewer/
// Researcher 2026-05-19 ~00:05Z 4x reprov thrash): any RestartByID
// arriving within restartDebounceWindow of this timestamp is silently
// arriving within RestartDebounceWindow of this timestamp is silently
// dropped so a probe firing during the EC2-pending window can't
// re-trigger a fresh full cycle on the just-launched instance.
restartStartedAt time.Time
@@ -55,13 +55,19 @@ type restartState struct {
// its own entry so unrelated workspaces don't serialize on each other.
var restartStates sync.Map // map[workspaceID]*restartState
// restartDebounceWindow is the silent-drop window for successive RestartByID
// RestartDebounceWindow is the silent-drop window for successive RestartByID
// calls. Sized to cover the typical EC2 pending → online interval (20-30s)
// with a margin so a probe firing during the just-after-online but still-
// flaky heartbeat window also gets dropped. Bigger than that would block
// legitimate "Restart failed, retry" recoveries; smaller would let the
// 4x thrash class through. Package-level so tests can shrink it.
var restartDebounceWindow = 60 * time.Second
//
// COUPLING: this window MUST be >= the CP instance reconciler interval
// (cmd/server/main.go:355, currently 60s). If the interval ever shrinks
// below this window, a workspace flipped offline by one reconcile tick can
// be reprovisioned again by the next tick before the debounce drops it,
// reopening the double-reprovision thrash class (internal#544).
var RestartDebounceWindow = 60 * time.Second
// restartByIDDropCounter is incremented every time RestartByID drops a call
// inside the debounce window. Exposed as a package-level atomic counter so
@@ -536,7 +542,7 @@ func (h *WorkspaceHandler) RestartByID(workspaceID string) {
return
}
// Self-fire debounce: drop (not coalesce) successive RestartByID calls
// within restartDebounceWindow of the most recent cycle's start. This
// within RestartDebounceWindow of the most recent cycle's start. This
// is the load-bearing protection against the 4x reprov thrash class —
// coalesceRestart's pending-flag would otherwise drain by running
// ANOTHER full cycle of stop+provision on the just-launched EC2 (still
@@ -550,14 +556,14 @@ func (h *WorkspaceHandler) RestartByID(workspaceID string) {
if shouldDebounceRestart(workspaceID) {
restartByIDDropCounter.Add(1)
log.Printf("RestartByID: %s — dropped (within %s self-fire debounce window; total dropped=%d)",
workspaceID, restartDebounceWindow, restartByIDDropCounter.Load())
workspaceID, RestartDebounceWindow, restartByIDDropCounter.Load())
return
}
coalesceRestart(workspaceID, func() { h.runRestartCycle(workspaceID) })
}
// shouldDebounceRestart reports whether the most recent cycle for this
// workspace started within restartDebounceWindow. Read-only on
// workspace started within RestartDebounceWindow. Read-only on
// restartState; the actual restartStartedAt stamp is written in
// coalesceRestart when running flips false→true.
func shouldDebounceRestart(workspaceID string) bool {
@@ -571,7 +577,7 @@ func shouldDebounceRestart(workspaceID string) bool {
if state.restartStartedAt.IsZero() {
return false
}
return time.Since(state.restartStartedAt) < restartDebounceWindow
return time.Since(state.restartStartedAt) < RestartDebounceWindow
}
// coalesceRestart implements the pending-flag gate around an arbitrary cycle
@@ -594,7 +600,7 @@ func coalesceRestart(workspaceID string, cycle func()) {
}
state.running = true
// Stamp the start time so the RestartByID debounce can drop any
// self-fire probe that hits within restartDebounceWindow. Only the
// self-fire probe that hits within RestartDebounceWindow. Only the
// false→true edge stamps; the drain-loop's inner cycles re-use the
// same start (they're effectively one "restart event" from the
// debounce's POV).
@@ -203,9 +203,9 @@ func TestRestartByID_DebounceExpiresAfterWindow(t *testing.T) {
const wsID = "self-fire-ws-debounce-release"
resetSelfFireState(wsID)
orig := restartDebounceWindow
restartDebounceWindow = 5 * time.Millisecond
defer func() { restartDebounceWindow = orig }()
orig := RestartDebounceWindow
RestartDebounceWindow = 5 * time.Millisecond
defer func() { RestartDebounceWindow = orig }()
// Stamp inside the window.
sv, _ := restartStates.LoadOrStore(wsID, &restartState{})