fix(sweeper): emit WORKSPACE_PROVISION_FAILED so canvas updates UI

The provision-timeout sweeper was emitting a new WORKSPACE_PROVISION_TIMEOUT
event type, but the canvas event handler (canvas-events.ts:234) only
has a case for WORKSPACE_PROVISION_FAILED — the sweep's event fell
through silently. DB was being marked 'failed' but the UI stayed on
'starting' indefinitely until the user hard-refreshed.

Reusing the existing event name keeps the UI reaction uniform across
both fail paths (runtime-crash via bootstrap-watcher and boot-timeout
via sweeper). Operators who need to distinguish can read the `source`
payload field — "bootstrap_watcher" vs "provision_timeout_sweep".

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Hongming Wang 2026-04-20 20:38:41 -07:00
parent 7158f8f01c
commit ec52d155f4
2 changed files with 12 additions and 6 deletions

View File

@ -127,9 +127,15 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter
continue
}
log.Printf("Provision-timeout sweep: %s stuck in provisioning > %s — marked failed", id, timeout)
if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_TIMEOUT", id, map[string]interface{}{
"error": msg,
"timeout_secs": timeoutSec,
// Emit as WORKSPACE_PROVISION_FAILED, not _TIMEOUT, because the
// canvas event handler only flips node state on the _FAILED case.
// A separate event type was considered but the UI reaction is
// identical either way — operators who need to distinguish can
// tell from the `source` payload field.
if emitErr := emitter.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", id, map[string]interface{}{
"error": msg,
"timeout_secs": timeoutSec,
"source": "provision_timeout_sweep",
}); emitErr != nil {
log.Printf("Provision-timeout sweep: broadcast failed for %s: %v", id, emitErr)
}

View File

@ -58,8 +58,8 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
if emit.count() != 1 {
t.Fatalf("expected 1 event, got %d", emit.count())
}
if emit.events[0].Type != "WORKSPACE_PROVISION_TIMEOUT" {
t.Errorf("event type = %q, want WORKSPACE_PROVISION_TIMEOUT", emit.events[0].Type)
if emit.events[0].Type != "WORKSPACE_PROVISION_FAILED" {
t.Errorf("event type = %q, want WORKSPACE_PROVISION_FAILED", emit.events[0].Type)
}
if emit.events[0].WorkspaceID != "ws-stuck" {
t.Errorf("workspace id = %q, want ws-stuck", emit.events[0].WorkspaceID)
@ -72,7 +72,7 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
// TestSweepStuckProvisioning_RaceSafe covers the case where UPDATE affects
// 0 rows because the workspace flipped to online (or got restarted) between
// the SELECT and the UPDATE. We should skip the event, not emit a false
// WORKSPACE_PROVISION_TIMEOUT.
// WORKSPACE_PROVISION_FAILED.
func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
mock := setupTestDB(t)