From ec52d155f49d288a437794d0dbf06ea2f0e90e96 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Mon, 20 Apr 2026 20:38:41 -0700 Subject: [PATCH] fix(sweeper): emit WORKSPACE_PROVISION_FAILED so canvas updates UI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The provision-timeout sweeper was emitting a new WORKSPACE_PROVISION_TIMEOUT event type, but the canvas event handler (canvas-events.ts:234) only has a case for WORKSPACE_PROVISION_FAILED — the sweep's event fell through silently. DB was being marked 'failed' but the UI stayed on 'starting' indefinitely until the user hard-refreshed. Reusing the existing event name keeps the UI reaction uniform across both fail paths (runtime-crash via bootstrap-watcher and boot-timeout via sweeper). Operators who need to distinguish can read the `source` payload field — "bootstrap_watcher" vs "provision_timeout_sweep". Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/registry/provisiontimeout.go | 12 +++++++++--- .../internal/registry/provisiontimeout_test.go | 6 +++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/workspace-server/internal/registry/provisiontimeout.go b/workspace-server/internal/registry/provisiontimeout.go index 8c8bd5d6..0201eb9b 100644 --- a/workspace-server/internal/registry/provisiontimeout.go +++ b/workspace-server/internal/registry/provisiontimeout.go @@ -127,9 +127,15 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter continue } log.Printf("Provision-timeout sweep: %s stuck in provisioning > %s — marked failed", id, timeout) - if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_TIMEOUT", id, map[string]interface{}{ - "error": msg, - "timeout_secs": timeoutSec, + // Emit as WORKSPACE_PROVISION_FAILED, not _TIMEOUT, because the + // canvas event handler only flips node state on the _FAILED case. + // A separate event type was considered but the UI reaction is + // identical either way — operators who need to distinguish can + // tell from the `source` payload field. + if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", id, map[string]interface{}{ + "error": msg, + "timeout_secs": timeoutSec, + "source": "provision_timeout_sweep", }); emitErr != nil { log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", id, emitErr) } diff --git a/workspace-server/internal/registry/provisiontimeout_test.go b/workspace-server/internal/registry/provisiontimeout_test.go index 07a04a4e..a5009a56 100644 --- a/workspace-server/internal/registry/provisiontimeout_test.go +++ b/workspace-server/internal/registry/provisiontimeout_test.go @@ -58,8 +58,8 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) { if emit.count() != 1 { t.Fatalf("expected 1 event, got %d", emit.count()) } - if emit.events[0].Type != "WORKSPACE_PROVISION_TIMEOUT" { - t.Errorf("event type = %q, want WORKSPACE_PROVISION_TIMEOUT", emit.events[0].Type) + if emit.events[0].Type != "WORKSPACE_PROVISION_FAILED" { + t.Errorf("event type = %q, want WORKSPACE_PROVISION_FAILED", emit.events[0].Type) } if emit.events[0].WorkspaceID != "ws-stuck" { t.Errorf("workspace id = %q, want ws-stuck", emit.events[0].WorkspaceID) @@ -72,7 +72,7 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) { // TestSweepStuckProvisioning_RaceSafe covers the case where UPDATE affects // 0 rows because the workspace flipped to online (or got restarted) between // the SELECT and the UPDATE. We should skip the event, not emit a false -// WORKSPACE_PROVISION_TIMEOUT. +// WORKSPACE_PROVISION_FAILED. func TestSweepStuckProvisioning_RaceSafe(t *testing.T) { mock := setupTestDB(t)