Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| c5c759227c |
@@ -234,9 +234,13 @@ func (h *TemplatesHandler) ReplaceFiles(c *gin.Context) {
|
||||
"source": "ec2-ssh",
|
||||
})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save / ReplaceFiles fires N PUTs
|
||||
// in a burst; without this each PUT chains into the
|
||||
// coalesceRestart drain loop. The helper still uses goAsync
|
||||
// internally (drains via h.wh.waitAsyncForTest), preserving
|
||||
// RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -270,9 +274,13 @@ func (h *TemplatesHandler) ReplaceFiles(c *gin.Context) {
|
||||
"source": "container",
|
||||
})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save / ReplaceFiles fires N PUTs
|
||||
// in a burst; without this each PUT chains into the
|
||||
// coalesceRestart drain loop. The helper still uses goAsync
|
||||
// internally (drains via h.wh.waitAsyncForTest), preserving
|
||||
// RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -292,8 +300,12 @@ func (h *TemplatesHandler) ReplaceFiles(c *gin.Context) {
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"status": "replaced", "workspace": workspaceID, "files": len(body.Files), "source": "volume"})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save / ReplaceFiles fires N PUTs
|
||||
// in a burst; without this each PUT chains into the
|
||||
// coalesceRestart drain loop. The helper still uses goAsync
|
||||
// internally (drains via h.wh.waitAsyncForTest), preserving
|
||||
// RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -570,9 +570,13 @@ func (h *TemplatesHandler) WriteFile(c *gin.Context) {
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "saved", "path": filePath})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
|
||||
// without this each PUT chains into the coalesceRestart drain
|
||||
// loop and produces back-to-back EC2 recreate cycles. The
|
||||
// helper still uses goAsync internally (drains via
|
||||
// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -586,9 +590,13 @@ func (h *TemplatesHandler) WriteFile(c *gin.Context) {
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "saved", "path": filePath})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
|
||||
// without this each PUT chains into the coalesceRestart drain
|
||||
// loop and produces back-to-back EC2 recreate cycles. The
|
||||
// helper still uses goAsync internally (drains via
|
||||
// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -602,9 +610,13 @@ func (h *TemplatesHandler) WriteFile(c *gin.Context) {
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "saved", "path": filePath})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
|
||||
// without this each PUT chains into the coalesceRestart drain
|
||||
// loop and produces back-to-back EC2 recreate cycles. The
|
||||
// helper still uses goAsync internally (drains via
|
||||
// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -657,9 +669,13 @@ func (h *TemplatesHandler) DeleteFile(c *gin.Context) {
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "deleted", "path": filePath})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
|
||||
// without this each PUT chains into the coalesceRestart drain
|
||||
// loop and produces back-to-back EC2 recreate cycles. The
|
||||
// helper still uses goAsync internally (drains via
|
||||
// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -677,9 +693,13 @@ func (h *TemplatesHandler) DeleteFile(c *gin.Context) {
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "deleted", "path": filePath})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
|
||||
// without this each PUT chains into the coalesceRestart drain
|
||||
// loop and produces back-to-back EC2 recreate cycles. The
|
||||
// helper still uses goAsync internally (drains via
|
||||
// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
return
|
||||
}
|
||||
@@ -692,8 +712,12 @@ func (h *TemplatesHandler) DeleteFile(c *gin.Context) {
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "deleted", "path": filePath})
|
||||
if h.wh != nil {
|
||||
// RFC internal#524 Layer 1: per-handler goAsync (drains via h.wh.waitAsyncForTest)
|
||||
wsID := workspaceID
|
||||
h.wh.goAsync(func() { h.wh.RestartByID(wsID) })
|
||||
// internal#624: 15s per-workspace debounce around the file-write
|
||||
// → RestartByID trigger. Canvas Save fires N PUTs in a burst;
|
||||
// without this each PUT chains into the coalesceRestart drain
|
||||
// loop and produces back-to-back EC2 recreate cycles. The
|
||||
// helper still uses goAsync internally (drains via
|
||||
// h.wh.waitAsyncForTest), preserving RFC internal#524 Layer 1.
|
||||
h.wh.maybeRestartAfterFileWrite(workspaceID)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,6 +70,97 @@ var restartDebounceWindow = 60 * time.Second
|
||||
// workspace-server yet — that's a separate RFC.
|
||||
var restartByIDDropCounter atomic.Uint64
|
||||
|
||||
// fileWriteRestartDebounceWindow is the per-workspace coalescing window for
|
||||
// the file-write → RestartByID trigger fired by templates.go's WriteFile,
|
||||
// DeleteFile, and ReplaceFiles handlers (and template_import.go's variants).
|
||||
//
|
||||
// Background (internal#624 2026-05-20): canvas Save fires N PUT /files
|
||||
// requests in a 30-60s burst (claude-code SEO agent observed 10-17 files in
|
||||
// 60s). Each successful write previously fired `goAsync(RestartByID)`. The
|
||||
// 60s self-fire debounce in RestartByID itself catches calls 1-60s, but
|
||||
// writes at T+65s+ pass the debounce, set pending=true on a still-running
|
||||
// coalesceRestart cycle, and drain immediately into cycle 2 — which DELETEs
|
||||
// + recreates EC2 mid-burst, returning 500 EC2InstanceStateInvalidException
|
||||
// on the in-flight user PUTs.
|
||||
//
|
||||
// 15s is sized to absorb a canvas Save burst (writes typically land within
|
||||
// a 5-10s window) while still letting a deliberate "edit, wait, edit again"
|
||||
// pattern restart twice. Bigger than that would silently swallow legitimate
|
||||
// rapid-iteration edits; smaller would let burst tails leak through.
|
||||
var fileWriteRestartDebounceWindow = 15 * time.Second
|
||||
|
||||
// fileWriteRestartLastFireAt records the last time `maybeRestartAfterFileWrite`
|
||||
// actually fired a restart for each workspace. sync.Map (not RWMutex+map)
|
||||
// because writes happen on every successful file-write handler, reads on
|
||||
// every subsequent file-write handler call — both per-workspace — and the
|
||||
// keys are sparse + long-lived. Stored as int64 unix-nano so the load/store
|
||||
// path can stay lock-free (atomic.Int64 inside sync.Map.Value is fine, but
|
||||
// time.Time itself isn't atomically loadable).
|
||||
var fileWriteRestartLastFireAt sync.Map // map[workspaceID]*atomic.Int64
|
||||
|
||||
// fileWriteRestartDropCounter counts how many file-write restart triggers
|
||||
// were silently coalesced. Same observability rationale as
|
||||
// restartByIDDropCounter — package-level atomic so tests can assert the
|
||||
// drop fired and ops can correlate with "user clicked Save 10 times,
|
||||
// only saw 1 restart cycle".
|
||||
var fileWriteRestartDropCounter atomic.Uint64
|
||||
|
||||
// maybeRestartAfterFileWrite is the call-site debounce wrapper for the 9
|
||||
// file-write trigger sites in templates.go + template_import.go. Replaces
|
||||
// the direct `goAsync(func() { wh.RestartByID(wsID) })` pattern with a
|
||||
// 15s per-workspace coalescing window:
|
||||
//
|
||||
// - First call (no prior fire OR last fire >15s ago): records the
|
||||
// current timestamp and fires goAsync(RestartByID).
|
||||
// - Subsequent calls within 15s of the last fire: silently dropped,
|
||||
// drop counter incremented.
|
||||
//
|
||||
// This is the call-site-layer protection (internal#624 Path A). The drain-
|
||||
// loop layer in coalesceRestart (Path B, re-stamping restartStartedAt per
|
||||
// iteration) is the platform-layer defense in depth — together they close
|
||||
// the file-write tight-loop class regardless of which entry point fires.
|
||||
//
|
||||
// Stateless on the handler so any handler with access to a WorkspaceHandler
|
||||
// can use it; the per-workspace state lives in the package-level sync.Map.
|
||||
func (h *WorkspaceHandler) maybeRestartAfterFileWrite(workspaceID string) {
|
||||
now := time.Now().UnixNano()
|
||||
|
||||
// LoadOrStore the per-workspace last-fire stamp. First write for a
|
||||
// brand-new workspace falls through the CompareAndSwap below because
|
||||
// the zero-init value (0) is far enough in the past to satisfy the
|
||||
// "last fire >15s ago" predicate.
|
||||
sv, _ := fileWriteRestartLastFireAt.LoadOrStore(workspaceID, new(atomic.Int64))
|
||||
stamp := sv.(*atomic.Int64)
|
||||
|
||||
// CAS loop: read last, decide, swap. We use CAS instead of Lock/Unlock
|
||||
// because the typical case is "thousands of writes, one restart per
|
||||
// 15s" — uncontended atomic is ~5ns vs ~30ns mutex. Bounded retry
|
||||
// because in the rare contended case (two writes finishing nanoseconds
|
||||
// apart) one will win the swap and the other will see the new stamp,
|
||||
// drop, and bail.
|
||||
for retry := 0; retry < 4; retry++ {
|
||||
last := stamp.Load()
|
||||
elapsed := time.Duration(now - last)
|
||||
if last != 0 && elapsed < fileWriteRestartDebounceWindow {
|
||||
// Within debounce window — drop silently.
|
||||
fileWriteRestartDropCounter.Add(1)
|
||||
log.Printf("maybeRestartAfterFileWrite: %s — coalesced "+
|
||||
"(last fire %s ago < %s window; total dropped=%d)",
|
||||
workspaceID, elapsed.Round(time.Millisecond),
|
||||
fileWriteRestartDebounceWindow,
|
||||
fileWriteRestartDropCounter.Load())
|
||||
return
|
||||
}
|
||||
if stamp.CompareAndSwap(last, now) {
|
||||
break
|
||||
}
|
||||
// Another writer beat us to the stamp update. Re-read and retry;
|
||||
// the retry will almost certainly see the new value and drop.
|
||||
}
|
||||
|
||||
h.goAsync(func() { h.RestartByID(workspaceID) })
|
||||
}
|
||||
|
||||
// isRestarting reports whether a restart cycle is currently in flight for
|
||||
// the workspace. Callers that have their own "container looks dead" probe
|
||||
// MUST consult this before triggering a restart, because during the
|
||||
@@ -513,6 +604,27 @@ func coalesceRestart(workspaceID string, cycle func()) {
|
||||
// inside provisionWorkspace, so any writes that committed since the
|
||||
// last cycle are picked up. Continues until no pending request was
|
||||
// observed at the top of an iteration.
|
||||
//
|
||||
// internal#624 Path B (defense in depth for the file-write tight-loop
|
||||
// class): re-stamp restartStartedAt at the top of every drain iteration
|
||||
// past the first. The original design (stamp only on false→true edge)
|
||||
// treated all drained pending as "one event from the debounce's POV",
|
||||
// which is correct for the secrets-batch use case but lets a file-write
|
||||
// burst at T+65s of a 60s drain pipe straight into another full cycle.
|
||||
// Re-stamping closes that hole — each drained cycle gets its own fresh
|
||||
// debounce window, so any RestartByID arriving during cycle N is
|
||||
// dropped by shouldDebounceRestart instead of accumulating into
|
||||
// pending=true for cycle N+1.
|
||||
//
|
||||
// The original "one cycle picks up everyone who arrived during it"
|
||||
// semantic still holds for the secrets-write path: callers that hit
|
||||
// coalesceRestart during cycle 1 still set pending=true and still get
|
||||
// their effects landed in cycle 2. What changes is that callers
|
||||
// arriving during cycle 2 (via RestartByID) now hit the re-stamped
|
||||
// debounce and are dropped instead of being chained into cycle 3,
|
||||
// which is exactly the chain that produced the 22:08-22:10 thrash on
|
||||
// 3fe84b89.
|
||||
iteration := 0
|
||||
for {
|
||||
state.mu.Lock()
|
||||
if !state.pending {
|
||||
@@ -520,7 +632,13 @@ func coalesceRestart(workspaceID string, cycle func()) {
|
||||
return // defer clears running
|
||||
}
|
||||
state.pending = false
|
||||
if iteration > 0 {
|
||||
// Re-stamp for drained iterations only; the false→true edge
|
||||
// already stamped at the top of coalesceRestart.
|
||||
state.restartStartedAt = time.Now()
|
||||
}
|
||||
state.mu.Unlock()
|
||||
iteration++
|
||||
|
||||
cycle()
|
||||
}
|
||||
|
||||
@@ -0,0 +1,316 @@
|
||||
package handlers
|
||||
|
||||
// Tests for internal#624 — file-write → RestartByID tight-loop fix.
|
||||
//
|
||||
// Empirical chain (Loki 2026-05-20 22:00-22:11Z on workspace
|
||||
// 3fe84b89-eb65-42fc-ad1f-5c93582ca3e7, claude-code SEO Agent):
|
||||
//
|
||||
// 1. Canvas Save writes 10-17 files in a 30-60s window.
|
||||
// 2. Each successful PUT /files at templates.go:575 / 591 / 607 / 662 /
|
||||
// 682 / 697 (and template_import.go:239 / 275 / 297) fires
|
||||
// `goAsync(func() { wh.RestartByID(wsID) })`.
|
||||
// 3. RestartByID's existing 60s self-fire debounce catches calls 1-60s
|
||||
// after the cycle starts. But writes at T+65s+ pass the debounce,
|
||||
// set pending=true on the still-running coalesceRestart cycle, and
|
||||
// drain IMMEDIATELY into cycle 2 — no re-debounce because the
|
||||
// original drain loop re-uses the same restartStartedAt.
|
||||
// 4. Cycle 2 DELETEs+recreates EC2 mid-burst → user sees
|
||||
// EC2InstanceStateInvalidException 500 on the in-flight PUTs.
|
||||
//
|
||||
// Fix: two layers (both shipped in the same PR).
|
||||
//
|
||||
// Path A (call-site debounce): every file-write trigger goes through
|
||||
// maybeRestartAfterFileWrite, which silently drops re-fires within 15s
|
||||
// of the last fire for the same workspace.
|
||||
//
|
||||
// Path B (drain-loop re-stamp): coalesceRestart now re-stamps
|
||||
// restartStartedAt at the top of each drained iteration, so any
|
||||
// RestartByID arriving during a drained cycle hits a fresh 60s window
|
||||
// and is dropped by shouldDebounceRestart instead of chaining further.
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// resetFileWriteDebounceState wipes the package-level sync.Map + drop
|
||||
// counter for the given workspace ID. Tests must call this between
|
||||
// scenarios because fileWriteRestartLastFireAt is shared.
|
||||
func resetFileWriteDebounceState(workspaceID string) {
|
||||
fileWriteRestartLastFireAt.Delete(workspaceID)
|
||||
fileWriteRestartDropCounter.Store(0)
|
||||
}
|
||||
|
||||
// newFileWriteDebounceHandler constructs a minimal *WorkspaceHandler with
|
||||
// no provisioner so RestartByID short-circuits at HasProvisioner()=false
|
||||
// — we only care that maybeRestartAfterFileWrite reaches goAsync at all.
|
||||
// The asyncWG inside goAsync lets us wait for the goroutine to finish so
|
||||
// we can deterministically observe whether RestartByID was scheduled.
|
||||
func newFileWriteDebounceHandler(t *testing.T) *WorkspaceHandler {
|
||||
t.Helper()
|
||||
return NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
|
||||
}
|
||||
|
||||
// TestMaybeRestartAfterFileWrite_FirstWriteRestarts — the baseline case:
|
||||
// the very first call for a workspace must actually fire goAsync (i.e.
|
||||
// no debounce-drop on the first PUT). Without this the helper would
|
||||
// silently swallow every legitimate single-file save.
|
||||
func TestMaybeRestartAfterFileWrite_FirstWriteRestarts(t *testing.T) {
|
||||
const wsID = "fw-debounce-first"
|
||||
resetFileWriteDebounceState(wsID)
|
||||
|
||||
h := newFileWriteDebounceHandler(t)
|
||||
h.maybeRestartAfterFileWrite(wsID)
|
||||
|
||||
// Drop counter must NOT have incremented — the call fired.
|
||||
if got := fileWriteRestartDropCounter.Load(); got != 0 {
|
||||
t.Errorf("first call to maybeRestartAfterFileWrite must fire (drop counter must stay 0), got %d", got)
|
||||
}
|
||||
|
||||
// Last-fire timestamp must be populated (non-zero) so the next call
|
||||
// will compare against it.
|
||||
sv, ok := fileWriteRestartLastFireAt.Load(wsID)
|
||||
if !ok {
|
||||
t.Fatal("first call must register the workspace in fileWriteRestartLastFireAt")
|
||||
}
|
||||
stamp := sv.(*atomic.Int64).Load()
|
||||
if stamp == 0 {
|
||||
t.Error("first call must record a non-zero last-fire timestamp")
|
||||
}
|
||||
|
||||
// Wait for the spawned goroutine to finish so it doesn't leak into
|
||||
// the next test (RestartByID will short-circuit on no-provisioner).
|
||||
h.waitAsyncForTest()
|
||||
}
|
||||
|
||||
// TestMaybeRestartAfterFileWrite_SecondWriteWithin15sSkipped — the core
|
||||
// fix: a second call within fileWriteRestartDebounceWindow of the first
|
||||
// MUST NOT fire RestartByID. The drop counter must increment by exactly
|
||||
// one and the last-fire timestamp must remain the FIRST call's stamp
|
||||
// (proof that the second call did not overwrite it).
|
||||
func TestMaybeRestartAfterFileWrite_SecondWriteWithin15sSkipped(t *testing.T) {
|
||||
const wsID = "fw-debounce-second-within"
|
||||
resetFileWriteDebounceState(wsID)
|
||||
|
||||
h := newFileWriteDebounceHandler(t)
|
||||
|
||||
// First call — fires.
|
||||
h.maybeRestartAfterFileWrite(wsID)
|
||||
h.waitAsyncForTest()
|
||||
|
||||
sv, _ := fileWriteRestartLastFireAt.Load(wsID)
|
||||
firstStamp := sv.(*atomic.Int64).Load()
|
||||
|
||||
// Second call immediately — must be dropped.
|
||||
h.maybeRestartAfterFileWrite(wsID)
|
||||
|
||||
if got := fileWriteRestartDropCounter.Load(); got != 1 {
|
||||
t.Errorf("second call within 15s must increment drop counter by exactly 1, got %d", got)
|
||||
}
|
||||
|
||||
// The CAS-loop must NOT have overwritten the first-call stamp — the
|
||||
// debounce branch short-circuits before the CompareAndSwap.
|
||||
stampAfter := sv.(*atomic.Int64).Load()
|
||||
if stampAfter != firstStamp {
|
||||
t.Errorf("dropped call must NOT update last-fire stamp (preserves debounce window); "+
|
||||
"first=%d after=%d", firstStamp, stampAfter)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMaybeRestartAfterFileWrite_ManyWritesInBurstCoalesceToOne — the
|
||||
// "bonus" regression test called out in the issue: 10 simulated PUTs
|
||||
// over 60s (compressed to a tight loop, all within 15s) must produce
|
||||
// exactly 1 RestartByID schedule and 9 drops. Models the canvas Save
|
||||
// burst shape that triggered the prod incident.
|
||||
func TestMaybeRestartAfterFileWrite_ManyWritesInBurstCoalesceToOne(t *testing.T) {
|
||||
const wsID = "fw-debounce-burst"
|
||||
resetFileWriteDebounceState(wsID)
|
||||
|
||||
h := newFileWriteDebounceHandler(t)
|
||||
|
||||
// 10 rapid-fire calls — simulates 10 PUTs landing inside the canvas
|
||||
// Save burst window.
|
||||
const burstSize = 10
|
||||
for i := 0; i < burstSize; i++ {
|
||||
h.maybeRestartAfterFileWrite(wsID)
|
||||
}
|
||||
h.waitAsyncForTest()
|
||||
|
||||
// One fired (call #1) + 9 dropped.
|
||||
if got := fileWriteRestartDropCounter.Load(); got != burstSize-1 {
|
||||
t.Errorf("expected %d drops for a %d-call burst (only call #1 fires), got %d",
|
||||
burstSize-1, burstSize, got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMaybeRestartAfterFileWrite_AfterWindowExpiresFiresAgain — outside
|
||||
// the debounce window, the helper must release and fire again. Shrinks
|
||||
// fileWriteRestartDebounceWindow to 5ms so we don't sleep 15s in CI.
|
||||
// Important: without this, a legitimate "user edited, walked away for
|
||||
// a minute, edited again" would never restart and config changes would
|
||||
// never reach the agent.
|
||||
func TestMaybeRestartAfterFileWrite_AfterWindowExpiresFiresAgain(t *testing.T) {
|
||||
const wsID = "fw-debounce-window-expires"
|
||||
resetFileWriteDebounceState(wsID)
|
||||
|
||||
orig := fileWriteRestartDebounceWindow
|
||||
fileWriteRestartDebounceWindow = 5 * time.Millisecond
|
||||
defer func() { fileWriteRestartDebounceWindow = orig }()
|
||||
|
||||
h := newFileWriteDebounceHandler(t)
|
||||
|
||||
h.maybeRestartAfterFileWrite(wsID) // fires
|
||||
h.waitAsyncForTest()
|
||||
|
||||
// Wait past the window.
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
h.maybeRestartAfterFileWrite(wsID) // must fire again
|
||||
h.waitAsyncForTest()
|
||||
|
||||
// Drop counter must still be 0 — both calls fired.
|
||||
if got := fileWriteRestartDropCounter.Load(); got != 0 {
|
||||
t.Errorf("second call after window expiry must fire (not drop), got %d drops", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMaybeRestartAfterFileWrite_DifferentWorkspacesIndependent — the
|
||||
// per-workspace state map must isolate: a burst on workspace A must not
|
||||
// affect workspace B's debounce. Pinning so a future "use a single
|
||||
// global atomic" refactor breaks loudly.
|
||||
func TestMaybeRestartAfterFileWrite_DifferentWorkspacesIndependent(t *testing.T) {
|
||||
const wsA = "fw-debounce-ws-a"
|
||||
const wsB = "fw-debounce-ws-b"
|
||||
resetFileWriteDebounceState(wsA)
|
||||
resetFileWriteDebounceState(wsB)
|
||||
|
||||
h := newFileWriteDebounceHandler(t)
|
||||
|
||||
// 5 calls on A, all but one drop.
|
||||
for i := 0; i < 5; i++ {
|
||||
h.maybeRestartAfterFileWrite(wsA)
|
||||
}
|
||||
h.waitAsyncForTest()
|
||||
|
||||
dropsAfterA := fileWriteRestartDropCounter.Load()
|
||||
|
||||
// First call on B — must fire (its own independent window).
|
||||
h.maybeRestartAfterFileWrite(wsB)
|
||||
h.waitAsyncForTest()
|
||||
|
||||
// B's call must not have incremented the drop counter — it fired.
|
||||
if got := fileWriteRestartDropCounter.Load(); got != dropsAfterA {
|
||||
t.Errorf("workspace B's first call must fire (not share workspace A's debounce); "+
|
||||
"drops after A=%d, drops after B=%d", dropsAfterA, got)
|
||||
}
|
||||
|
||||
// Both workspaces must have their own last-fire entries.
|
||||
if _, ok := fileWriteRestartLastFireAt.Load(wsA); !ok {
|
||||
t.Error("workspace A missing from fileWriteRestartLastFireAt")
|
||||
}
|
||||
if _, ok := fileWriteRestartLastFireAt.Load(wsB); !ok {
|
||||
t.Error("workspace B missing from fileWriteRestartLastFireAt")
|
||||
}
|
||||
}
|
||||
|
||||
// TestMaybeRestartAfterFileWrite_ConcurrentCallsSafelyDebounced — the
|
||||
// CAS-loop contract: many goroutines hitting the helper concurrently
|
||||
// must still produce at most one fired call (drops = N-1). Pinning the
|
||||
// "thousands of writes, one restart" performance shape called out in
|
||||
// the helper's comment. Uses sync.WaitGroup to release all goroutines
|
||||
// in a tight burst so the CAS is genuinely contended.
|
||||
func TestMaybeRestartAfterFileWrite_ConcurrentCallsSafelyDebounced(t *testing.T) {
|
||||
const wsID = "fw-debounce-concurrent"
|
||||
resetFileWriteDebounceState(wsID)
|
||||
|
||||
h := newFileWriteDebounceHandler(t)
|
||||
|
||||
const goroutines = 50
|
||||
start := make(chan struct{})
|
||||
var wg sync.WaitGroup
|
||||
for i := 0; i < goroutines; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
<-start // hold every goroutine at the gate
|
||||
h.maybeRestartAfterFileWrite(wsID)
|
||||
}()
|
||||
}
|
||||
close(start) // release the herd
|
||||
wg.Wait()
|
||||
h.waitAsyncForTest()
|
||||
|
||||
// Exactly N-1 drops: one goroutine wins the CAS and fires, all
|
||||
// other N-1 see a fresh stamp and drop into the debounce branch.
|
||||
if got := fileWriteRestartDropCounter.Load(); got != goroutines-1 {
|
||||
t.Errorf("expected %d drops for %d concurrent callers (exactly one fires), got %d",
|
||||
goroutines-1, goroutines, got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestCoalesceRestart_DrainRespectsRestartedAtBetweenIterations —
|
||||
// Path B regression: when coalesceRestart drains a pending request into
|
||||
// a follow-up cycle, the restartStartedAt timestamp must be re-stamped
|
||||
// for that follow-up iteration. Without this, a RestartByID arriving
|
||||
// during cycle 2 would hit a stale 60s window (computed from cycle 1's
|
||||
// start) and could pass the debounce just because cycle 1 + cycle 2's
|
||||
// runtime exceeded 60s combined.
|
||||
//
|
||||
// The test fires cycle 1 → completes → sets pending=true to trigger
|
||||
// cycle 2 → asserts that restartStartedAt was advanced for the drained
|
||||
// iteration. The cycle function itself just records the wall-clock at
|
||||
// which it observed restartStartedAt, so the test can compare cycle 1's
|
||||
// stamp vs cycle 2's stamp.
|
||||
func TestCoalesceRestart_DrainRespectsRestartedAtBetweenIterations(t *testing.T) {
|
||||
const wsID = "fw-debounce-drain-restamp"
|
||||
resetRestartStatesFor(wsID)
|
||||
|
||||
// Capture the restartStartedAt observed at the top of each cycle
|
||||
// iteration. The cycle reads it directly from the state map so we
|
||||
// see what coalesceRestart wrote.
|
||||
var stamps []time.Time
|
||||
var stampsMu sync.Mutex
|
||||
cycleCount := 0
|
||||
cycle := func() {
|
||||
sv, _ := restartStates.Load(wsID)
|
||||
state := sv.(*restartState)
|
||||
state.mu.Lock()
|
||||
stampsMu.Lock()
|
||||
stamps = append(stamps, state.restartStartedAt)
|
||||
stampsMu.Unlock()
|
||||
state.mu.Unlock()
|
||||
|
||||
cycleCount++
|
||||
if cycleCount == 1 {
|
||||
// While inside cycle 1, set pending=true so the drain loop
|
||||
// runs cycle 2 next iteration. Mirrors the prod shape: a
|
||||
// PUT lands during cycle 1, sets pending=true via
|
||||
// RestartByID → coalesceRestart's pending branch.
|
||||
state.mu.Lock()
|
||||
state.pending = true
|
||||
state.mu.Unlock()
|
||||
|
||||
// Sleep briefly so cycle 2's stamp is observably later
|
||||
// than cycle 1's. Without a real wall-clock gap the
|
||||
// assertion can't tell re-stamp from no-op.
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
coalesceRestart(wsID, cycle)
|
||||
|
||||
stampsMu.Lock()
|
||||
defer stampsMu.Unlock()
|
||||
if len(stamps) != 2 {
|
||||
t.Fatalf("expected 2 cycle iterations (original + drained pending), got %d", len(stamps))
|
||||
}
|
||||
if !stamps[1].After(stamps[0]) {
|
||||
t.Errorf("Path B regression: cycle 2's restartStartedAt (%v) must be AFTER "+
|
||||
"cycle 1's (%v) — drained iterations must re-stamp so the self-fire "+
|
||||
"debounce window resets per cycle. Without this, a RestartByID arriving "+
|
||||
"during cycle 2 sees a stale window and can chain into cycle 3.",
|
||||
stamps[1], stamps[0])
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user