Merge pull request #2803 from Molecule-AI/fix/memory-cutover-misconfig-warn

fix(memory v2): warn at boot when cutover env half-configured
This commit is contained in:
Hongming Wang 2026-05-05 00:27:24 +00:00 committed by GitHub
commit 461e5dcad0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 105 additions and 2 deletions

View File

@ -43,15 +43,37 @@ type Bundle struct {
// circuit breaker handles ongoing unavailability; we don't want to
// block workspace-server boot just because the memory plugin is
// briefly down.
//
// Silent-misconfig guard: if MEMORY_V2_CUTOVER=true is set without
// MEMORY_PLUGIN_URL, the cutoverActive() check in handlers silently
// returns false and the legacy SQL path serves every request. The
// operator sees no errors, no warnings, and assumes the cutover is
// live. Log a LOUD WARN at boot when the env is half-configured so
// the misconfig is visible in the boot log, not detectable only by
// observing that the legacy table is still being written to.
func Build(db *sql.DB) *Bundle {
if os.Getenv("MEMORY_PLUGIN_URL") == "" {
cutover := os.Getenv("MEMORY_V2_CUTOVER") == "true"
pluginURL := os.Getenv("MEMORY_PLUGIN_URL")
if pluginURL == "" {
if cutover {
log.Printf("memory-plugin: ⚠️ MEMORY_V2_CUTOVER=true but MEMORY_PLUGIN_URL is unset — cutover is INACTIVE, legacy SQL path is serving every request. Either unset MEMORY_V2_CUTOVER or point MEMORY_PLUGIN_URL at a reachable plugin server.")
}
return nil
}
plugin := mclient.New(mclient.Config{})
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if hr, err := plugin.Boot(ctx); err != nil {
log.Printf("memory-plugin: /v1/health probe failed (will retry per-request): %v", err)
// Log even louder when cutover is on — an unreachable plugin
// during cutover means writes that the operator THINKS are
// going to v2 will silently fall back to legacy via the
// circuit breaker on each request. Make it impossible to miss.
if cutover {
log.Printf("memory-plugin: ⚠️ MEMORY_V2_CUTOVER=true and MEMORY_PLUGIN_URL=%s but /v1/health probe failed (%v). Cutover writes will fall back to legacy via circuit breaker. Verify the plugin server is reachable.", pluginURL, err)
} else {
log.Printf("memory-plugin: /v1/health probe failed (will retry per-request): %v", err)
}
} else {
log.Printf("memory-plugin: ok, capabilities=%v", hr.Capabilities)
}

View File

@ -1,9 +1,12 @@
package wiring
import (
"bytes"
"context"
"log"
"net/http"
"net/http/httptest"
"strings"
"sync"
"testing"
@ -151,6 +154,84 @@ func TestNamespaceCleanupFn_PluginErrorDoesNotPanic(t *testing.T) {
cleanup(context.Background(), "ws-1")
}
// captureLogs runs fn with log output captured into a buffer, returns the
// captured text. Restores the prior log destination on exit.
func captureLogs(t *testing.T, fn func()) string {
t.Helper()
var buf bytes.Buffer
prev := log.Writer()
log.SetOutput(&buf)
t.Cleanup(func() { log.SetOutput(prev) })
fn()
return buf.String()
}
// TestBuild_WarnsWhenCutoverWithoutPluginURL pins the silent-misconfig
// guard: an operator who flips MEMORY_V2_CUTOVER=true without also
// pointing MEMORY_PLUGIN_URL at a plugin server has just disabled the
// cutover with no error visible. Without this WARN, the only signal
// is "the legacy table is still being written to" — invisible to
// every operator who doesn't explicitly check.
func TestBuild_WarnsWhenCutoverWithoutPluginURL(t *testing.T) {
t.Setenv("MEMORY_V2_CUTOVER", "true")
t.Setenv("MEMORY_PLUGIN_URL", "")
out := captureLogs(t, func() {
if got := Build(nil); got != nil {
t.Errorf("expected nil bundle, got %+v", got)
}
})
if !strings.Contains(out, "MEMORY_V2_CUTOVER=true") || !strings.Contains(out, "MEMORY_PLUGIN_URL is unset") {
t.Errorf("expected loud WARN about half-configured cutover; got log:\n%s", out)
}
}
// TestBuild_NoWarnWhenNeitherSet pins the happy default: an operator
// running without the v2 plugin should not see scary warnings.
func TestBuild_NoWarnWhenNeitherSet(t *testing.T) {
t.Setenv("MEMORY_V2_CUTOVER", "")
t.Setenv("MEMORY_PLUGIN_URL", "")
out := captureLogs(t, func() { _ = Build(nil) })
if strings.Contains(out, "MEMORY_V2_CUTOVER") {
t.Errorf("expected no MEMORY_V2_CUTOVER warning when env is unset; got log:\n%s", out)
}
}
// TestBuild_LoudWarnWhenCutoverAndProbeFails pins the second
// half-config case: cutover is on AND plugin URL is set, but the
// /v1/health probe fails (server down or wrong URL). Without this
// loud WARN, the operator sees only the generic "probe failed" line
// that gets emitted even when cutover is OFF — hiding the fact that
// real cutover writes will quietly fall back via circuit breaker.
func TestBuild_LoudWarnWhenCutoverAndProbeFails(t *testing.T) {
t.Setenv("MEMORY_V2_CUTOVER", "true")
t.Setenv("MEMORY_PLUGIN_URL", "http://127.0.0.1:1") // bogus port
db, _, _ := sqlmock.New()
defer db.Close()
out := captureLogs(t, func() { _ = Build(db) })
if !strings.Contains(out, "MEMORY_V2_CUTOVER=true") || !strings.Contains(out, "probe failed") {
t.Errorf("expected loud WARN about cutover-with-failing-probe; got log:\n%s", out)
}
}
// TestBuild_QuietProbeFailWhenCutoverOff: the operator is in PRE-cutover
// mode (plugin URL set, cutover off — they're warming up the plugin).
// A failing probe in this state is not a misconfig — it should log the
// generic message, NOT the loud cutover-specific one (so log noise
// doesn't drown out real cutover misconfigs in dashboards).
func TestBuild_QuietProbeFailWhenCutoverOff(t *testing.T) {
t.Setenv("MEMORY_V2_CUTOVER", "")
t.Setenv("MEMORY_PLUGIN_URL", "http://127.0.0.1:1")
db, _, _ := sqlmock.New()
defer db.Close()
out := captureLogs(t, func() { _ = Build(db) })
if strings.Contains(out, "MEMORY_V2_CUTOVER=true") {
t.Errorf("expected no cutover-specific warning when cutover is off; got log:\n%s", out)
}
if !strings.Contains(out, "probe failed") {
t.Errorf("expected generic probe-failed log; got log:\n%s", out)
}
}
func pathsAndMethods(paths, methods []string) []string {
out := make([]string, len(paths))
for i := range paths {