From 7d602c3bfaf23557b580e798973f1accdf6369df Mon Sep 17 00:00:00 2001 From: core-devops Date: Wed, 17 Jun 2026 13:04:05 -0700 Subject: [PATCH] =?UTF-8?q?fix(reconcile):=20dedupe=20boot-installed=20plu?= =?UTF-8?q?gins=20=E2=80=94=20no=20EIC=20re-deliver+restart=20(#38)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Headline design-flaw fix from the #32 review. The runtime-image entrypoint boot-installs declared plugins to the box on EVERY boot, BEFORE the box registers online. The post-online ReconcileWorkspacePlugins then ran anyway: because boot-install writes no workspace_plugins tracking row, the reconcile saw every declared plugin as "missing" and re-delivered it via EIC + fired restartFunc — a full SaaS re-provision. That's the observed ~12-min reprovision churn (one wasted new instance per fresh workspace; converges after one cycle). Fix: before delivering, check if the plugin is already on the box (pluginPresentOnBox → readPluginManifestViaEIC). If present (boot-installed), record the workspace_plugins tracking row ONLY — no EIC push, no restart. If absent (boot-install disabled/failed, or a non-image path), deliver as before (safety net). pluginPresentOnBox is conservative: false on any uncertainty (not SaaS / no instance / read error / empty manifest), so a genuinely-missing install is never silently skipped — only a confirmed-present one is deduped. Tracking row is still written either way, so drift-sweeper + UI keep working. Test: TestReconcile_BootInstalled_RecordsWithoutDeliver — present-on-box → recorded but NOT delivered. Existing reconcile tests green. go build/vet/test ok. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../internal/handlers/plugins_reconcile.go | 39 +++++++++++++++++-- .../handlers/plugins_reconcile_test.go | 35 +++++++++++++++++ 2 files changed, 71 insertions(+), 3 deletions(-) diff --git a/workspace-server/internal/handlers/plugins_reconcile.go b/workspace-server/internal/handlers/plugins_reconcile.go index 56d07f68..73759723 100644 --- a/workspace-server/internal/handlers/plugins_reconcile.go +++ b/workspace-server/internal/handlers/plugins_reconcile.go @@ -110,13 +110,26 @@ func (h *PluginsHandler) ReconcileWorkspacePlugins(ctx context.Context, workspac continue } - deliverErr := h.deliver(ctx, workspaceID, stage) - stage.cleanup() - if deliverErr != nil { + // RFC#2843 #38: the runtime-image entrypoint boot-installs declared + // plugins to the box on EVERY boot, BEFORE this online-transition + // reconcile runs — so by now the plugin is typically already on the box. + // If present, record the tracking row WITHOUT re-delivering via EIC + + // restarting: the EIC push + restartFunc was the redundant churn (one + // wasted full re-provision per fresh workspace — the observed ~12-min + // reprovision). Only deliver when the plugin is NOT already present + // (boot-install disabled/failed, or a non-boot-install path) — the + // safety net. pluginPresentOnBox is conservative (false on any + // uncertainty) so a genuinely-missing install is never silently skipped. + if h.pluginPresentOnBox(ctx, workspaceID, stage.PluginName) { + log.Printf("Plugin reconcile: workspace=%s plugin=%s already on box (boot-installed) — recording tracking row only, no re-deliver/restart", + workspaceID, stage.PluginName) + } else if deliverErr := h.deliver(ctx, workspaceID, stage); deliverErr != nil { + stage.cleanup() log.Printf("Plugin reconcile: workspace=%s plugin=%s deliver failed: %v", workspaceID, d.PluginName, deliverErr) continue } + stage.cleanup() if recErr := recordWorkspacePluginInstall( ctx, workspaceID, stage.PluginName, stage.Source.Raw(), track, stage.InstalledSHA, @@ -141,6 +154,26 @@ func (h *PluginsHandler) ReconcileWorkspacePlugins(ctx context.Context, workspac } } +// pluginPresentOnBox reports whether the plugin is already installed on the +// workspace box — e.g. delivered by the runtime-image boot-install +// (RFC#2843 #32) before this online-transition reconcile runs. Used to skip the +// redundant EIC re-deliver + restart that caused the per-fresh-workspace +// reprovision churn (#38). CONSERVATIVE by design: returns false on any +// uncertainty (not SaaS / no instance / read error / empty manifest), so the +// caller falls back to delivering — a genuinely-missing install is never +// silently skipped, only a confirmed-present one is deduped. +func (h *PluginsHandler) pluginPresentOnBox(ctx context.Context, workspaceID, pluginName string) bool { + instanceID, runtime := h.lookupSaaSDispatch(workspaceID) + if instanceID == "" { + return false // not a SaaS box we can probe — deliver as before + } + data, err := readPluginManifestViaEIC(ctx, instanceID, runtime, pluginName) + if err != nil || len(data) == 0 { + return false // can't confirm presence — fall back to deliver + } + return true +} + // cleanup removes the staged tempdir. Mirrors the defer the interactive // install path uses; the reconcile must clean up explicitly because it stages // many plugins in a loop rather than one-per-request. diff --git a/workspace-server/internal/handlers/plugins_reconcile_test.go b/workspace-server/internal/handlers/plugins_reconcile_test.go index 3b08bda2..3549c266 100644 --- a/workspace-server/internal/handlers/plugins_reconcile_test.go +++ b/workspace-server/internal/handlers/plugins_reconcile_test.go @@ -228,3 +228,38 @@ func TestProvisioningChannelCarriesNoPlugins(t *testing.T) { "so the post-online reconcile can install them") } } + +// TestReconcile_BootInstalled_RecordsWithoutDeliver pins the RFC#2843 #38 fix: +// when the plugin is already on the box (boot-installed by the runtime-image +// entrypoint before this online-transition reconcile runs), the reconcile must +// record the tracking row but NOT re-deliver via EIC + restart (the churn that +// caused one wasted re-provision per fresh workspace). +func TestReconcile_BootInstalled_RecordsWithoutDeliver(t *testing.T) { + mock, cleanup := withMockDB(t) + defer cleanup() + h, delivered := newReconcileHandler(t) + // SaaS box present + manifest read returns non-empty → pluginPresentOnBox=true. + h.instanceIDLookup = func(string) (string, error) { return "i-boot", nil } + orig := readPluginManifestViaEIC + readPluginManifestViaEIC = func(ctx context.Context, instanceID, runtime, pluginName string) ([]byte, error) { + return []byte("name: seo-all\n"), nil + } + defer func() { readPluginManifestViaEIC = orig }() + + expectDeclared(mock, + []DeclaredPlugin{{PluginName: "seo-all", SourceRaw: "local://seo-all"}}, + nil, + ) + // Tracking row IS still recorded (so drift/UI work) — just no deliver/restart. + mock.ExpectExec(`INSERT INTO workspace_plugins`). + WillReturnResult(sqlmock.NewResult(1, 1)) + + h.ReconcileWorkspacePlugins(context.Background(), "ws-1") + + if len(*delivered) != 0 { + t.Fatalf("boot-installed plugin must NOT be re-delivered (no EIC push/restart churn), but delivered %v", *delivered) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet DB expectations (tracking row must still be recorded): %v", err) + } +} -- 2.52.0