fix(workspace-server): debounce file-write → RestartByID tight loop (#624 )

Canvas Save fires N PUT /files in a 30-60s burst (10-17 files observed on claude-code SEO Agent 3fe84b89-eb65-42fc-ad1f-5c93582ca3e7). Each successful write at templates.go:575/591/607/662/682/697 and template_import.go:239/275/297 previously called `goAsync(func() { wh.RestartByID(wsID) })`. The existing 60s self-fire debounce in RestartByID catches calls within the window, but writes at T+65s+ pass the debounce, set pending=true on the running coalesceRestart cycle, and drain immediately into cycle 2 — which DELETEs+recreates EC2 mid-burst → user PUTs 500 with EC2InstanceStateInvalidException. Fix in two layers (both shipped here): Path A (call-site debounce): new `maybeRestartAfterFileWrite` helper with a 15s per-workspace coalescing window, backed by a sync.Map of *atomic.Int64 last-fire timestamps. Replaces the 9 file-write goAsync(RestartByID) sites in templates.go (6) and template_import.go (3). CAS-loop on the timestamp so concurrent writers can't both fire. Path B (drain-loop re-stamp): coalesceRestart now re-stamps restartStartedAt at the top of each drained iteration past the first. The original design (stamp only on false→true edge) treated all drained pending as "one event" for debounce purposes, which lets a write at T+65s pipe through cycle 1 into cycle 2 without re-debounce. Re-stamping means any RestartByID arriving during cycle N hits a fresh 60s window and is dropped by shouldDebounceRestart instead of chaining into N+1. The secrets-batch coalescing semantic (one cycle picks up everyone who arrived during it) is preserved. Tests (workspace_restart_file_write_debounce_test.go, 7 cases): - FirstWriteRestarts (baseline — drop counter stays 0) - SecondWriteWithin15sSkipped (core fix — drop=1, stamp unchanged) - ManyWritesInBurstCoalesceToOne (10-PUT burst → 1 fire + 9 drops) - AfterWindowExpiresFiresAgain (5ms shrink, second fires post-window) - DifferentWorkspacesIndependent (per-workspace isolation) - ConcurrentCallsSafelyDebounced (50-goroutine herd → exactly 1 fires) - TestCoalesceRestart_DrainRespectsRestartedAtBetweenIterations (Path B regression: drained cycle's restartStartedAt is observably newer) All existing TestCoalesceRestart_* + TestRestartByID_* + self-fire + isRestarting tests still pass. Closes #624 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 15:32:42 -07:00
4 changed files with 497 additions and 27 deletions
@@ -234,9 +234,13 @@ func (h *TemplatesHandler) ReplaceFiles(c *gin.Context) {
 			"source":    "ec2-ssh",
 		})
 		if h.wh != nil {
-			// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
-			wsID := workspaceID
-			h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
+			// internal#624: 15s per-workspace debounce around the file-write
+			// → RestartByID trigger. Canvas Save / ReplaceFiles fires N PUTs
+			// in a burst; without this each PUT chains into the
+			// coalesceRestart drain loop. The helper still uses goAsync
+			// internally (drains via h.wh.waitAsyncForTest), preserving
+			// RFC internal#524 Layer 1.
+			h.wh.maybeRestartAfterFileWrite(workspaceID)
 		}
 		return
 	}
@@ -270,9 +274,13 @@ func (h *TemplatesHandler) ReplaceFiles(c *gin.Context) {
 			"source":    "container",
 		})
 		if h.wh != nil {
-			// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
-			wsID := workspaceID
-			h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
+			// internal#624: 15s per-workspace debounce around the file-write
+			// → RestartByID trigger. Canvas Save / ReplaceFiles fires N PUTs
+			// in a burst; without this each PUT chains into the
+			// coalesceRestart drain loop. The helper still uses goAsync
+			// internally (drains via h.wh.waitAsyncForTest), preserving
+			// RFC internal#524 Layer 1.
+			h.wh.maybeRestartAfterFileWrite(workspaceID)
 		}
 		return
 	}
@@ -292,8 +300,12 @@ func (h *TemplatesHandler) ReplaceFiles(c *gin.Context) {

 	c.JSON(http.StatusOK, gin.H{"status": "replaced", "workspace": workspaceID, "files": len(body.Files), "source": "volume"})
 	if h.wh != nil {
-		// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
-		wsID := workspaceID
-		h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
+		// internal#624: 15s per-workspace debounce around the file-write
+		// → RestartByID trigger. Canvas Save / ReplaceFiles fires N PUTs
+		// in a burst; without this each PUT chains into the
+		// coalesceRestart drain loop. The helper still uses goAsync
+		// internally (drains via h.wh.waitAsyncForTest), preserving
+		// RFC internal#524 Layer 1.
+		h.wh.maybeRestartAfterFileWrite(workspaceID)
 	}
 }
@@ -570,9 +570,13 @@ func (h *TemplatesHandler) WriteFile(c *gin.Context) {
 		}
 		c.JSON(http.StatusOK, gin.H{"status": "saved", "path": filePath})
 		if h.wh != nil {
-			// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
-			wsID := workspaceID
-			h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
+			// internal#624: 15s per-workspace debounce around the file-write
+			// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
+			// without this each PUT chains into the coalesceRestart drain
+			// loop and produces back-to-back EC2 recreate cycles. The
+			// helper still uses goAsync internally (drains via
+			// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
+			h.wh.maybeRestartAfterFileWrite(workspaceID)
 		}
 		return
 	}
@@ -586,9 +590,13 @@ func (h *TemplatesHandler) WriteFile(c *gin.Context) {
 		}
 		c.JSON(http.StatusOK, gin.H{"status": "saved", "path": filePath})
 		if h.wh != nil {
-			// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
-			wsID := workspaceID
-			h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
+			// internal#624: 15s per-workspace debounce around the file-write
+			// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
+			// without this each PUT chains into the coalesceRestart drain
+			// loop and produces back-to-back EC2 recreate cycles. The
+			// helper still uses goAsync internally (drains via
+			// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
+			h.wh.maybeRestartAfterFileWrite(workspaceID)
 		}
 		return
 	}
@@ -602,9 +610,13 @@ func (h *TemplatesHandler) WriteFile(c *gin.Context) {
 	}
 	c.JSON(http.StatusOK, gin.H{"status": "saved", "path": filePath})
 	if h.wh != nil {
-		// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
-		wsID := workspaceID
-		h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
+		// internal#624: 15s per-workspace debounce around the file-write
+		// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
+		// without this each PUT chains into the coalesceRestart drain
+		// loop and produces back-to-back EC2 recreate cycles. The
+		// helper still uses goAsync internally (drains via
+		// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
+		h.wh.maybeRestartAfterFileWrite(workspaceID)
 	}
 }

@@ -657,9 +669,13 @@ func (h *TemplatesHandler) DeleteFile(c *gin.Context) {
 		}
 		c.JSON(http.StatusOK, gin.H{"status": "deleted", "path": filePath})
 		if h.wh != nil {
-			// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
-			wsID := workspaceID
-			h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
+			// internal#624: 15s per-workspace debounce around the file-write
+			// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
+			// without this each PUT chains into the coalesceRestart drain
+			// loop and produces back-to-back EC2 recreate cycles. The
+			// helper still uses goAsync internally (drains via
+			// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
+			h.wh.maybeRestartAfterFileWrite(workspaceID)
 		}
 		return
 	}
@@ -677,9 +693,13 @@ func (h *TemplatesHandler) DeleteFile(c *gin.Context) {
 		}
 		c.JSON(http.StatusOK, gin.H{"status": "deleted", "path": filePath})
 		if h.wh != nil {
-			// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
-			wsID := workspaceID
-			h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
+			// internal#624: 15s per-workspace debounce around the file-write
+			// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
+			// without this each PUT chains into the coalesceRestart drain
+			// loop and produces back-to-back EC2 recreate cycles. The
+			// helper still uses goAsync internally (drains via
+			// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
+			h.wh.maybeRestartAfterFileWrite(workspaceID)
 		}
 		return
 	}
@@ -692,8 +712,12 @@ func (h *TemplatesHandler) DeleteFile(c *gin.Context) {
 	}
 	c.JSON(http.StatusOK, gin.H{"status": "deleted", "path": filePath})
 	if h.wh != nil {
-		// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
-		wsID := workspaceID
-		h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
+		// internal#624: 15s per-workspace debounce around the file-write
+		// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
+		// without this each PUT chains into the coalesceRestart drain
+		// loop and produces back-to-back EC2 recreate cycles. The
+		// helper still uses goAsync internally (drains via
+		// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
+		h.wh.maybeRestartAfterFileWrite(workspaceID)
 	}
 }
@@ -70,6 +70,97 @@ var restartDebounceWindow = 60 * time.Second
 // workspace-server yet — that's a separate RFC.
 var restartByIDDropCounter atomic.Uint64

+// fileWriteRestartDebounceWindow is the per-workspace coalescing window for
+// the file-write → RestartByID trigger fired by templates.go's WriteFile,
+// DeleteFile, and ReplaceFiles handlers (and template_import.go's variants).
+//
+// Background (internal#624 2026-05-20): canvas Save fires N PUT /files
+// requests in a 30-60s burst (claude-code SEO agent observed 10-17 files in
+// 60s). Each successful write previously fired `goAsync(RestartByID)`. The
+// 60s self-fire debounce in RestartByID itself catches calls 1-60s, but
+// writes at T+65s+ pass the debounce, set pending=true on a still-running
+// coalesceRestart cycle, and drain immediately into cycle 2 — which DELETEs
+// + recreates EC2 mid-burst, returning 500 EC2InstanceStateInvalidException
+// on the in-flight user PUTs.
+//
+// 15s is sized to absorb a canvas Save burst (writes typically land within
+// a 5-10s window) while still letting a deliberate "edit, wait, edit again"
+// pattern restart twice. Bigger than that would silently swallow legitimate
+// rapid-iteration edits; smaller would let burst tails leak through.
+var fileWriteRestartDebounceWindow = 15 * time.Second
+
+// fileWriteRestartLastFireAt records the last time `maybeRestartAfterFileWrite`
+// actually fired a restart for each workspace. sync.Map (not RWMutex+map)
+// because writes happen on every successful file-write handler, reads on
+// every subsequent file-write handler call — both per-workspace — and the
+// keys are sparse + long-lived. Stored as int64 unix-nano so the load/store
+// path can stay lock-free (atomic.Int64 inside sync.Map.Value is fine, but
+// time.Time itself isn't atomically loadable).
+var fileWriteRestartLastFireAt sync.Map // map[workspaceID]*atomic.Int64
+
+// fileWriteRestartDropCounter counts how many file-write restart triggers
+// were silently coalesced. Same observability rationale as
+// restartByIDDropCounter — package-level atomic so tests can assert the
+// drop fired and ops can correlate with "user clicked Save 10 times,
+// only saw 1 restart cycle".
+var fileWriteRestartDropCounter atomic.Uint64
+
+// maybeRestartAfterFileWrite is the call-site debounce wrapper for the 9
+// file-write trigger sites in templates.go + template_import.go. Replaces
+// the direct `goAsync(func() { wh.RestartByID(wsID) })` pattern with a
+// 15s per-workspace coalescing window:
+//
+//   - First call (no prior fire OR last fire >15s ago): records the
+//     current timestamp and fires goAsync(RestartByID).
+//   - Subsequent calls within 15s of the last fire: silently dropped,
+//     drop counter incremented.
+//
+// This is the call-site-layer protection (internal#624 Path A). The drain-
+// loop layer in coalesceRestart (Path B, re-stamping restartStartedAt per
+// iteration) is the platform-layer defense in depth — together they close
+// the file-write tight-loop class regardless of which entry point fires.
+//
+// Stateless on the handler so any handler with access to a WorkspaceHandler
+// can use it; the per-workspace state lives in the package-level sync.Map.
+func (h *WorkspaceHandler) maybeRestartAfterFileWrite(workspaceID string) {
+	now := time.Now().UnixNano()
+
+	// LoadOrStore the per-workspace last-fire stamp. First write for a
+	// brand-new workspace falls through the CompareAndSwap below because
+	// the zero-init value (0) is far enough in the past to satisfy the
+	// "last fire >15s ago" predicate.
+	sv, _ := fileWriteRestartLastFireAt.LoadOrStore(workspaceID, new(atomic.Int64))
+	stamp := sv.(*atomic.Int64)
+
+	// CAS loop: read last, decide, swap. We use CAS instead of Lock/Unlock
+	// because the typical case is "thousands of writes, one restart per
+	// 15s" — uncontended atomic is ~5ns vs ~30ns mutex. Bounded retry
+	// because in the rare contended case (two writes finishing nanoseconds
+	// apart) one will win the swap and the other will see the new stamp,
+	// drop, and bail.
+	for retry := 0; retry < 4; retry++ {
+		last := stamp.Load()
+		elapsed := time.Duration(now - last)
+		if last != 0 && elapsed < fileWriteRestartDebounceWindow {
+			// Within debounce window — drop silently.
+			fileWriteRestartDropCounter.Add(1)
+			log.Printf("maybeRestartAfterFileWrite: %s — coalesced "+
+				"(last fire %s ago < %s window; total dropped=%d)",
+				workspaceID, elapsed.Round(time.Millisecond),
+				fileWriteRestartDebounceWindow,
+				fileWriteRestartDropCounter.Load())
+			return
+		}
+		if stamp.CompareAndSwap(last, now) {
+			break
+		}
+		// Another writer beat us to the stamp update. Re-read and retry;
+		// the retry will almost certainly see the new value and drop.
+	}
+
+	h.goAsync(func() { h.RestartByID(workspaceID) })
+}
+
 // isRestarting reports whether a restart cycle is currently in flight for
 // the workspace. Callers that have their own "container looks dead" probe
 // MUST consult this before triggering a restart, because during the
@@ -513,6 +604,27 @@ func coalesceRestart(workspaceID string, cycle func()) {
 	// inside provisionWorkspace, so any writes that committed since the
 	// last cycle are picked up. Continues until no pending request was
 	// observed at the top of an iteration.
+	//
+	// internal#624 Path B (defense in depth for the file-write tight-loop
+	// class): re-stamp restartStartedAt at the top of every drain iteration
+	// past the first. The original design (stamp only on false→true edge)
+	// treated all drained pending as "one event from the debounce's POV",
+	// which is correct for the secrets-batch use case but lets a file-write
+	// burst at T+65s of a 60s drain pipe straight into another full cycle.
+	// Re-stamping closes that hole — each drained cycle gets its own fresh
+	// debounce window, so any RestartByID arriving during cycle N is
+	// dropped by shouldDebounceRestart instead of accumulating into
+	// pending=true for cycle N+1.
+	//
+	// The original "one cycle picks up everyone who arrived during it"
+	// semantic still holds for the secrets-write path: callers that hit
+	// coalesceRestart during cycle 1 still set pending=true and still get
+	// their effects landed in cycle 2. What changes is that callers
+	// arriving during cycle 2 (via RestartByID) now hit the re-stamped
+	// debounce and are dropped instead of being chained into cycle 3,
+	// which is exactly the chain that produced the 22:08-22:10 thrash on
+	// 3fe84b89.
+	iteration := 0
 	for {
 		state.mu.Lock()
 		if !state.pending {
@@ -520,7 +632,13 @@ func coalesceRestart(workspaceID string, cycle func()) {
 			return // defer clears running
 		}
 		state.pending = false
+		if iteration > 0 {
+			// Re-stamp for drained iterations only; the false→true edge
+			// already stamped at the top of coalesceRestart.
+			state.restartStartedAt = time.Now()
+		}
 		state.mu.Unlock()
+		iteration++

 		cycle()
 	}
@@ -0,0 +1,316 @@
+package handlers
+
+// Tests for internal#624 — file-write → RestartByID tight-loop fix.
+//
+// Empirical chain (Loki 2026-05-20 22:00-22:11Z on workspace
+// 3fe84b89-eb65-42fc-ad1f-5c93582ca3e7, claude-code SEO Agent):
+//
+//   1. Canvas Save writes 10-17 files in a 30-60s window.
+//   2. Each successful PUT /files at templates.go:575 / 591 / 607 / 662 /
+//      682 / 697 (and template_import.go:239 / 275 / 297) fires
+//      `goAsync(func() { wh.RestartByID(wsID) })`.
+//   3. RestartByID's existing 60s self-fire debounce catches calls 1-60s
+//      after the cycle starts. But writes at T+65s+ pass the debounce,
+//      set pending=true on the still-running coalesceRestart cycle, and
+//      drain IMMEDIATELY into cycle 2 — no re-debounce because the
+//      original drain loop re-uses the same restartStartedAt.
+//   4. Cycle 2 DELETEs+recreates EC2 mid-burst → user sees
+//      EC2InstanceStateInvalidException 500 on the in-flight PUTs.
+//
+// Fix: two layers (both shipped in the same PR).
+//
+//   Path A (call-site debounce): every file-write trigger goes through
+//   maybeRestartAfterFileWrite, which silently drops re-fires within 15s
+//   of the last fire for the same workspace.
+//
+//   Path B (drain-loop re-stamp): coalesceRestart now re-stamps
+//   restartStartedAt at the top of each drained iteration, so any
+//   RestartByID arriving during a drained cycle hits a fresh 60s window
+//   and is dropped by shouldDebounceRestart instead of chaining further.
+
+import (
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+)
+
+// resetFileWriteDebounceState wipes the package-level sync.Map + drop
+// counter for the given workspace ID. Tests must call this between
+// scenarios because fileWriteRestartLastFireAt is shared.
+func resetFileWriteDebounceState(workspaceID string) {
+	fileWriteRestartLastFireAt.Delete(workspaceID)
+	fileWriteRestartDropCounter.Store(0)
+}
+
+// newFileWriteDebounceHandler constructs a minimal *WorkspaceHandler with
+// no provisioner so RestartByID short-circuits at HasProvisioner()=false
+// — we only care that maybeRestartAfterFileWrite reaches goAsync at all.
+// The asyncWG inside goAsync lets us wait for the goroutine to finish so
+// we can deterministically observe whether RestartByID was scheduled.
+func newFileWriteDebounceHandler(t *testing.T) *WorkspaceHandler {
+	t.Helper()
+	return NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+}
+
+// TestMaybeRestartAfterFileWrite_FirstWriteRestarts — the baseline case:
+// the very first call for a workspace must actually fire goAsync (i.e.
+// no debounce-drop on the first PUT). Without this the helper would
+// silently swallow every legitimate single-file save.
+func TestMaybeRestartAfterFileWrite_FirstWriteRestarts(t *testing.T) {
+	const wsID = "fw-debounce-first"
+	resetFileWriteDebounceState(wsID)
+
+	h := newFileWriteDebounceHandler(t)
+	h.maybeRestartAfterFileWrite(wsID)
+
+	// Drop counter must NOT have incremented — the call fired.
+	if got := fileWriteRestartDropCounter.Load(); got != 0 {
+		t.Errorf("first call to maybeRestartAfterFileWrite must fire (drop counter must stay 0), got %d", got)
+	}
+
+	// Last-fire timestamp must be populated (non-zero) so the next call
+	// will compare against it.
+	sv, ok := fileWriteRestartLastFireAt.Load(wsID)
+	if !ok {
+		t.Fatal("first call must register the workspace in fileWriteRestartLastFireAt")
+	}
+	stamp := sv.(*atomic.Int64).Load()
+	if stamp == 0 {
+		t.Error("first call must record a non-zero last-fire timestamp")
+	}
+
+	// Wait for the spawned goroutine to finish so it doesn't leak into
+	// the next test (RestartByID will short-circuit on no-provisioner).
+	h.waitAsyncForTest()
+}
+
+// TestMaybeRestartAfterFileWrite_SecondWriteWithin15sSkipped — the core
+// fix: a second call within fileWriteRestartDebounceWindow of the first
+// MUST NOT fire RestartByID. The drop counter must increment by exactly
+// one and the last-fire timestamp must remain the FIRST call's stamp
+// (proof that the second call did not overwrite it).
+func TestMaybeRestartAfterFileWrite_SecondWriteWithin15sSkipped(t *testing.T) {
+	const wsID = "fw-debounce-second-within"
+	resetFileWriteDebounceState(wsID)
+
+	h := newFileWriteDebounceHandler(t)
+
+	// First call — fires.
+	h.maybeRestartAfterFileWrite(wsID)
+	h.waitAsyncForTest()
+
+	sv, _ := fileWriteRestartLastFireAt.Load(wsID)
+	firstStamp := sv.(*atomic.Int64).Load()
+
+	// Second call immediately — must be dropped.
+	h.maybeRestartAfterFileWrite(wsID)
+
+	if got := fileWriteRestartDropCounter.Load(); got != 1 {
+		t.Errorf("second call within 15s must increment drop counter by exactly 1, got %d", got)
+	}
+
+	// The CAS-loop must NOT have overwritten the first-call stamp — the
+	// debounce branch short-circuits before the CompareAndSwap.
+	stampAfter := sv.(*atomic.Int64).Load()
+	if stampAfter != firstStamp {
+		t.Errorf("dropped call must NOT update last-fire stamp (preserves debounce window); "+
+			"first=%d after=%d", firstStamp, stampAfter)
+	}
+}
+
+// TestMaybeRestartAfterFileWrite_ManyWritesInBurstCoalesceToOne — the
+// "bonus" regression test called out in the issue: 10 simulated PUTs
+// over 60s (compressed to a tight loop, all within 15s) must produce
+// exactly 1 RestartByID schedule and 9 drops. Models the canvas Save
+// burst shape that triggered the prod incident.
+func TestMaybeRestartAfterFileWrite_ManyWritesInBurstCoalesceToOne(t *testing.T) {
+	const wsID = "fw-debounce-burst"
+	resetFileWriteDebounceState(wsID)
+
+	h := newFileWriteDebounceHandler(t)
+
+	// 10 rapid-fire calls — simulates 10 PUTs landing inside the canvas
+	// Save burst window.
+	const burstSize = 10
+	for i := 0; i < burstSize; i++ {
+		h.maybeRestartAfterFileWrite(wsID)
+	}
+	h.waitAsyncForTest()
+
+	// One fired (call #1) + 9 dropped.
+	if got := fileWriteRestartDropCounter.Load(); got != burstSize-1 {
+		t.Errorf("expected %d drops for a %d-call burst (only call #1 fires), got %d",
+			burstSize-1, burstSize, got)
+	}
+}
+
+// TestMaybeRestartAfterFileWrite_AfterWindowExpiresFiresAgain — outside
+// the debounce window, the helper must release and fire again. Shrinks
+// fileWriteRestartDebounceWindow to 5ms so we don't sleep 15s in CI.
+// Important: without this, a legitimate "user edited, walked away for
+// a minute, edited again" would never restart and config changes would
+// never reach the agent.
+func TestMaybeRestartAfterFileWrite_AfterWindowExpiresFiresAgain(t *testing.T) {
+	const wsID = "fw-debounce-window-expires"
+	resetFileWriteDebounceState(wsID)
+
+	orig := fileWriteRestartDebounceWindow
+	fileWriteRestartDebounceWindow = 5 * time.Millisecond
+	defer func() { fileWriteRestartDebounceWindow = orig }()
+
+	h := newFileWriteDebounceHandler(t)
+
+	h.maybeRestartAfterFileWrite(wsID) // fires
+	h.waitAsyncForTest()
+
+	// Wait past the window.
+	time.Sleep(20 * time.Millisecond)
+
+	h.maybeRestartAfterFileWrite(wsID) // must fire again
+	h.waitAsyncForTest()
+
+	// Drop counter must still be 0 — both calls fired.
+	if got := fileWriteRestartDropCounter.Load(); got != 0 {
+		t.Errorf("second call after window expiry must fire (not drop), got %d drops", got)
+	}
+}
+
+// TestMaybeRestartAfterFileWrite_DifferentWorkspacesIndependent — the
+// per-workspace state map must isolate: a burst on workspace A must not
+// affect workspace B's debounce. Pinning so a future "use a single
+// global atomic" refactor breaks loudly.
+func TestMaybeRestartAfterFileWrite_DifferentWorkspacesIndependent(t *testing.T) {
+	const wsA = "fw-debounce-ws-a"
+	const wsB = "fw-debounce-ws-b"
+	resetFileWriteDebounceState(wsA)
+	resetFileWriteDebounceState(wsB)
+
+	h := newFileWriteDebounceHandler(t)
+
+	// 5 calls on A, all but one drop.
+	for i := 0; i < 5; i++ {
+		h.maybeRestartAfterFileWrite(wsA)
+	}
+	h.waitAsyncForTest()
+
+	dropsAfterA := fileWriteRestartDropCounter.Load()
+
+	// First call on B — must fire (its own independent window).
+	h.maybeRestartAfterFileWrite(wsB)
+	h.waitAsyncForTest()
+
+	// B's call must not have incremented the drop counter — it fired.
+	if got := fileWriteRestartDropCounter.Load(); got != dropsAfterA {
+		t.Errorf("workspace B's first call must fire (not share workspace A's debounce); "+
+			"drops after A=%d, drops after B=%d", dropsAfterA, got)
+	}
+
+	// Both workspaces must have their own last-fire entries.
+	if _, ok := fileWriteRestartLastFireAt.Load(wsA); !ok {
+		t.Error("workspace A missing from fileWriteRestartLastFireAt")
+	}
+	if _, ok := fileWriteRestartLastFireAt.Load(wsB); !ok {
+		t.Error("workspace B missing from fileWriteRestartLastFireAt")
+	}
+}
+
+// TestMaybeRestartAfterFileWrite_ConcurrentCallsSafelyDebounced — the
+// CAS-loop contract: many goroutines hitting the helper concurrently
+// must still produce at most one fired call (drops = N-1). Pinning the
+// "thousands of writes, one restart" performance shape called out in
+// the helper's comment. Uses sync.WaitGroup to release all goroutines
+// in a tight burst so the CAS is genuinely contended.
+func TestMaybeRestartAfterFileWrite_ConcurrentCallsSafelyDebounced(t *testing.T) {
+	const wsID = "fw-debounce-concurrent"
+	resetFileWriteDebounceState(wsID)
+
+	h := newFileWriteDebounceHandler(t)
+
+	const goroutines = 50
+	start := make(chan struct{})
+	var wg sync.WaitGroup
+	for i := 0; i < goroutines; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			<-start // hold every goroutine at the gate
+			h.maybeRestartAfterFileWrite(wsID)
+		}()
+	}
+	close(start) // release the herd
+	wg.Wait()
+	h.waitAsyncForTest()
+
+	// Exactly N-1 drops: one goroutine wins the CAS and fires, all
+	// other N-1 see a fresh stamp and drop into the debounce branch.
+	if got := fileWriteRestartDropCounter.Load(); got != goroutines-1 {
+		t.Errorf("expected %d drops for %d concurrent callers (exactly one fires), got %d",
+			goroutines-1, goroutines, got)
+	}
+}
+
+// TestCoalesceRestart_DrainRespectsRestartedAtBetweenIterations —
+// Path B regression: when coalesceRestart drains a pending request into
+// a follow-up cycle, the restartStartedAt timestamp must be re-stamped
+// for that follow-up iteration. Without this, a RestartByID arriving
+// during cycle 2 would hit a stale 60s window (computed from cycle 1's
+// start) and could pass the debounce just because cycle 1 + cycle 2's
+// runtime exceeded 60s combined.
+//
+// The test fires cycle 1 → completes → sets pending=true to trigger
+// cycle 2 → asserts that restartStartedAt was advanced for the drained
+// iteration. The cycle function itself just records the wall-clock at
+// which it observed restartStartedAt, so the test can compare cycle 1's
+// stamp vs cycle 2's stamp.
+func TestCoalesceRestart_DrainRespectsRestartedAtBetweenIterations(t *testing.T) {
+	const wsID = "fw-debounce-drain-restamp"
+	resetRestartStatesFor(wsID)
+
+	// Capture the restartStartedAt observed at the top of each cycle
+	// iteration. The cycle reads it directly from the state map so we
+	// see what coalesceRestart wrote.
+	var stamps []time.Time
+	var stampsMu sync.Mutex
+	cycleCount := 0
+	cycle := func() {
+		sv, _ := restartStates.Load(wsID)
+		state := sv.(*restartState)
+		state.mu.Lock()
+		stampsMu.Lock()
+		stamps = append(stamps, state.restartStartedAt)
+		stampsMu.Unlock()
+		state.mu.Unlock()
+
+		cycleCount++
+		if cycleCount == 1 {
+			// While inside cycle 1, set pending=true so the drain loop
+			// runs cycle 2 next iteration. Mirrors the prod shape: a
+			// PUT lands during cycle 1, sets pending=true via
+			// RestartByID → coalesceRestart's pending branch.
+			state.mu.Lock()
+			state.pending = true
+			state.mu.Unlock()
+
+			// Sleep briefly so cycle 2's stamp is observably later
+			// than cycle 1's. Without a real wall-clock gap the
+			// assertion can't tell re-stamp from no-op.
+			time.Sleep(20 * time.Millisecond)
+		}
+	}
+
+	coalesceRestart(wsID, cycle)
+
+	stampsMu.Lock()
+	defer stampsMu.Unlock()
+	if len(stamps) != 2 {
+		t.Fatalf("expected 2 cycle iterations (original + drained pending), got %d", len(stamps))
+	}
+	if !stamps[1].After(stamps[0]) {
+		t.Errorf("Path B regression: cycle 2's restartStartedAt (%v) must be AFTER "+
+			"cycle 1's (%v) — drained iterations must re-stamp so the self-fire "+
+			"debounce window resets per cycle. Without this, a RestartByID arriving "+
+			"during cycle 2 sees a stale window and can chain into cycle 3.",
+			stamps[1], stamps[0])
+	}
+}