From 1e01083e554eaae90e7cf0075a510eca9f391234 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Tue, 5 May 2026 16:10:58 -0700 Subject: [PATCH] fix(handlers): UTF-8-safe preview truncation + distinguish DB errors from not-found (RFC #2945 PR-A followup) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review of PR #2949 surfaced two pre-existing defects that the SSOT consolidation inherited from the original /notify handler. Both are addressable in a small follow-up; shipping them as a separate PR keeps the consolidation and the bug-fix individually reviewable. Critical: byte-slice preview truncation produces invalid UTF-8 ------------------------------------------------------------- Pre-fix: if len(preview) > 80 { preview = preview[:80] + "…" } `len()` returns BYTES; `preview[:80]` slices on a byte boundary. For agent-authored chat in CJK / emoji / accented characters, byte 80 lands mid-codepoint → invalid UTF-8 → Postgres JSONB rejects → INSERT fails → activity_log row never written → message vanishes from chat history on the next reload. The persistence-failure log fires but operators have to grep to find it, and the user-visible regression mode is identical to reno-stars. Fix: extract `truncatePreviewRunes(s, maxRunes)` that walks the rune boundary using `for i := range s` (Go's range over string yields rune start indices). Cap at 80 RUNES not bytes — UI-friendly count, not storage count. Important: workspace-lookup error path swallows real DB errors -------------------------------------------------------------- Pre-fix: if err := w.db.QueryRowContext(...).Scan(&wsName); err != nil { return ErrWorkspaceNotFound } Conflates `sql.ErrNoRows` (legit not-found → caller 404) with real DB errors (connection drop, query timeout, pool exhaustion → caller should 503). During a Postgres outage every notify call surfaced as "workspace not found" — masking the actual incident in alerting and making the symptom indistinguishable from "you typed a bad workspace ID". Fix: distinguish via `errors.Is(err, sql.ErrNoRows)` and wrap non-not-found errors with `fmt.Errorf("agent_message: workspace lookup: %w", err)`. Callers' existing fallback path (return 500 / return error wrapped) handles the new shape correctly without any changes — verified by running existing TestNotify_* and TestMCPHandler_SendMessage_* tests. Tests added (3 new, 11 total writer tests) ------------------------------------------ - TestTruncatePreviewRunes_RuneBoundary: 8-case table — ASCII, CJK, exactly-at-max, emoji prefix. Asserts both correct visible output AND `utf8.ValidString` on every result so the bug shape (invalid UTF-8) can't recur. - TestAgentMessageWriter_Send_NonASCIIMessagePersists: end-to-end with a 200-rune CJK message (exceeds the 80-rune cap, would have hit the byte-slice bug). Pins the INSERT summary contains valid UTF-8 with exactly 80-rune body + ellipsis. - TestAgentMessageWriter_Send_DBErrorOnLookupReturnsWrapped: pins the DB-outage path returns a wrapped non-ErrWorkspaceNotFound error so alerting can distinguish 404 from 503. Verified via mock ExpectQuery returning a transient error. Verified -------- - `go vet ./internal/handlers/` clean - `go build ./...` clean - All 14 writer + caller tests pass (8 original + 3 new + AST gate + TestNotify_* + TestMCPHandler_SendMessage_* sibling tests) Per memory feedback_assert_exact_not_substring.md: every new test asserts boundary behavior directly (UTF-8 validity, exact rune count, errors.Is comparison) rather than substring-match in stringified output. Refs RFC #2945, PR #2949, PR #2944. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/agent_message_writer.go | 57 +++++++-- .../handlers/agent_message_writer_test.go | 117 ++++++++++++++++++ 2 files changed, 165 insertions(+), 9 deletions(-) diff --git a/workspace-server/internal/handlers/agent_message_writer.go b/workspace-server/internal/handlers/agent_message_writer.go index b6856847..a0f25588 100644 --- a/workspace-server/internal/handlers/agent_message_writer.go +++ b/workspace-server/internal/handlers/agent_message_writer.go @@ -40,7 +40,9 @@ import ( "database/sql" "encoding/json" "errors" + "fmt" "log" + "unicode/utf8" "github.com/Molecule-AI/molecule-monorepo/platform/internal/events" ) @@ -48,9 +50,40 @@ import ( // ErrWorkspaceNotFound is returned by AgentMessageWriter.Send when the // workspace lookup turns up nothing (or the workspace is in // status='removed'). Callers translate to HTTP 404 / JSON-RPC error / -// whatever surface they expose. +// whatever surface they expose. Real DB errors (connection drop, query +// timeout) surface as wrapped errors and should be treated as 503. var ErrWorkspaceNotFound = errors.New("agent_message: workspace not found") +// truncatePreviewRunes returns at most maxRunes runes of s, plus an ellipsis +// when truncated. Operates on the rune (codepoint) boundary instead of +// byte indices — the previous byte-slice version produced invalid UTF-8 +// when maxRunes landed mid-codepoint (CJK, emoji, accented characters +// in agent-authored chat messages), and Postgres JSONB rejects invalid +// UTF-8, dropping the activity_log INSERT silently. The persistence +// failure log fires but the message vanishes from chat history — the +// exact regression class the SSOT consolidation was built to prevent. +// +// maxRunes is in runes, not bytes — `truncatePreviewRunes("你好", 1)` returns +// `"你…"`, not `"\xe4…"`. Set the cap on a UI-friendly basis (visible +// character count, not stored byte count); 80 runes covers the +// activity_logs.summary column comfortably. +func truncatePreviewRunes(s string, maxRunes int) string { + if utf8.RuneCountInString(s) <= maxRunes { + return s + } + // Walk runes until we've consumed maxRunes; cut at that byte index. + count := 0 + cut := len(s) + for i := range s { + if count == maxRunes { + cut = i + break + } + count++ + } + return s[:cut] + "…" +} + // AgentMessageAttachment is one file attached to an agent → user // message. Identical to handlers.NotifyAttachment in field set; kept // distinct so the writer's API doesn't import a handler type with HTTP @@ -96,15 +129,24 @@ func (w *AgentMessageWriter) Send( ) error { // 1. Workspace lookup. status='removed' filter is the same shape /notify // used pre-consolidation; deleted workspaces don't get notifications. + // + // Distinguish sql.ErrNoRows ("workspace genuinely not present" — caller + // should 404) from real DB errors (connection drop, statement timeout, + // pool exhaustion — caller should 503). Pre-fix this branch returned + // ErrWorkspaceNotFound for any error, so during a DB outage every + // notify call surfaced as "workspace not found" and masked real + // incidents in the alert path. var wsName string - if err := w.db.QueryRowContext(ctx, + err := w.db.QueryRowContext(ctx, `SELECT name FROM workspaces WHERE id = $1 AND status != 'removed'`, workspaceID, - ).Scan(&wsName); err != nil { - // Includes sql.ErrNoRows; canonicalize so callers don't have to - // import database/sql to compare. + ).Scan(&wsName) + if errors.Is(err, sql.ErrNoRows) { return ErrWorkspaceNotFound } + if err != nil { + return fmt.Errorf("agent_message: workspace lookup: %w", err) + } // 2. Build broadcast payload + WS-emit. Same shape that ChatTab's // AGENT_MESSAGE handler in canvas/src/store/canvas-events.ts has @@ -144,10 +186,7 @@ func (w *AgentMessageWriter) Send( respPayload["parts"] = fileParts } respJSON, _ := json.Marshal(respPayload) - preview := message - if len(preview) > 80 { - preview = preview[:80] + "…" - } + preview := truncatePreviewRunes(message, 80) if _, err := w.db.ExecContext(ctx, ` INSERT INTO activity_logs (workspace_id, activity_type, method, summary, response_body, status) VALUES ($1, 'a2a_receive', 'notify', $2, $3::jsonb, 'ok') diff --git a/workspace-server/internal/handlers/agent_message_writer_test.go b/workspace-server/internal/handlers/agent_message_writer_test.go index d311c926..880d2273 100644 --- a/workspace-server/internal/handlers/agent_message_writer_test.go +++ b/workspace-server/internal/handlers/agent_message_writer_test.go @@ -7,6 +7,7 @@ import ( "errors" "strings" "testing" + "unicode/utf8" "github.com/DATA-DOG/go-sqlmock" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" @@ -303,6 +304,122 @@ func TestAgentMessageWriter_Send_BroadcastsAgentMessageEvent(t *testing.T) { } } +// TestAgentMessageWriter_Send_DBErrorOnLookupReturnsWrapped pins the +// distinction between sql.ErrNoRows (legit not-found → 404) and real +// DB errors (connection drop → 503). Pre-followup the lookup branch +// returned ErrWorkspaceNotFound for ANY error, so during a DB outage +// every notify call surfaced as "workspace not found" and masked +// real incidents in alerting. +func TestAgentMessageWriter_Send_DBErrorOnLookupReturnsWrapped(t *testing.T) { + mock := setupTestDB(t) + w := NewAgentMessageWriter(db.DB, newTestBroadcaster()) + + transientErr := errors.New("connection refused") + mock.ExpectQuery("SELECT name FROM workspaces"). + WithArgs("ws-dbdown"). + WillReturnError(transientErr) + + err := w.Send(context.Background(), "ws-dbdown", "hi", nil) + if err == nil { + t.Fatal("expected wrapped DB error, got nil") + } + if errors.Is(err, ErrWorkspaceNotFound) { + t.Errorf("DB outage MUST NOT surface as ErrWorkspaceNotFound (masks incidents in alerting); got %v", err) + } + if !errors.Is(err, transientErr) { + t.Errorf("expected wrapped %v, got %v", transientErr, err) + } +} + +// TestTruncatePreviewRunes_RuneBoundary pins the multi-byte-safe +// truncation. The previous byte-slice version produced invalid UTF-8 +// when the cut landed mid-codepoint (CJK, emoji, accented), and +// Postgres JSONB rejects invalid UTF-8 — INSERT fails, log.Printf +// fires, message vanishes from chat history. Per memory +// feedback_assert_exact_not_substring.md, pin the boundary cases +// directly. +func TestTruncatePreviewRunes_RuneBoundary(t *testing.T) { + cases := []struct { + name string + in string + max int + want string + }{ + {"under-max ASCII", "hi", 80, "hi"}, + {"under-max CJK", "你好", 80, "你好"}, + {"exactly-at-max", "abcde", 5, "abcde"}, + {"truncate ASCII", "abcdefghij", 5, "abcde…"}, + {"truncate CJK at rune boundary", "你好世界你好世界", 4, "你好世界…"}, + {"truncate emoji at rune boundary", "😀😀😀😀😀😀", 3, "😀😀😀…"}, + // The pre-fix bug shape: byte-slice on non-ASCII would have + // mangled the codepoint here. With rune-boundary truncation + // the result is well-formed UTF-8. + {"non-zero with emoji prefix", "🚀abcdefghijk", 5, "🚀abcd…"}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := truncatePreviewRunes(c.in, c.max) + if got != c.want { + t.Errorf("truncatePreviewRunes(%q, %d) = %q, want %q", c.in, c.max, got, c.want) + } + // Always-valid UTF-8 invariant. A byte-slice truncation + // could leave partial codepoints; this version must not. + if !utf8.ValidString(got) { + t.Errorf("truncatePreviewRunes(%q, %d) returned invalid UTF-8: %q", c.in, c.max, got) + } + }) + } +} + +// TestAgentMessageWriter_Send_NonASCIIMessagePersists pins the end-to-end +// path for non-ASCII messages — the original reno-stars regression +// surfaced via byte-slice truncation breaking JSONB INSERT. Every +// handler-level test had ASCII content, so this branch had no +// coverage. Now it does. +func TestAgentMessageWriter_Send_NonASCIIMessagePersists(t *testing.T) { + mock := setupTestDB(t) + w := NewAgentMessageWriter(db.DB, newTestBroadcaster()) + + // 200-rune CJK message — exceeds the 80-rune cap, would have hit + // the byte-slice bug. + msg := strings.Repeat("你", 200) + + mock.ExpectQuery("SELECT name FROM workspaces"). + WithArgs("ws-cjk"). + WillReturnRows(sqlmock.NewRows([]string{"name"}).AddRow("CEO Ryan PC")) + + mock.ExpectExec(`INSERT INTO activity_logs`). + WithArgs( + "ws-cjk", + stringMatcher(func(s string) bool { + if !strings.HasPrefix(s, "Agent message: ") { + return false + } + preview := strings.TrimPrefix(s, "Agent message: ") + if !strings.HasSuffix(preview, "…") { + return false + } + body := strings.TrimSuffix(preview, "…") + // 80 runes of 你 = 80 codepoints. Each is 3 bytes UTF-8. + if utf8.RuneCountInString(body) != 80 { + return false + } + // MUST be valid UTF-8 — pre-fix byte-slice would have + // returned half a codepoint here. + return utf8.ValidString(body) + }), + sqlmock.AnyArg(), + ). + WillReturnResult(sqlmock.NewResult(1, 1)) + + if err := w.Send(context.Background(), "ws-cjk", msg, nil); err != nil { + t.Fatalf("Send: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("non-ASCII path drift: %v", err) + } +} + // TestAgentMessageWriter_Send_OmitsAttachmentsKeyWhenEmpty pins the // "no key when nil" wire contract — extra empty fields would force // canvas consumers to defensively check for [] vs undefined; the