diff --git a/workspace-server/cmd/memory-backfill/verify.go b/workspace-server/cmd/memory-backfill/verify.go index e522e740..cf11435e 100644 --- a/workspace-server/cmd/memory-backfill/verify.go +++ b/workspace-server/cmd/memory-backfill/verify.go @@ -21,6 +21,7 @@ import ( "os" "github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/contract" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil" ) // verifyConfig is the typed dependency bundle for verifyParity. @@ -121,7 +122,7 @@ func verifyParity(ctx context.Context, cfg verifyConfig, stdout *os.File) (*veri matched := true for _, c := range legacy { if pluginContents[c] == 0 { - fmt.Fprintf(stdout, "[mismatch] workspace=%s missing-from-plugin content=%q\n", wsID, truncate(c, 80)) + fmt.Fprintf(stdout, "[mismatch] workspace=%s missing-from-plugin content=%q\n", wsID, textutil.TruncateBytes(c, 80)) matched = false break } @@ -192,9 +193,4 @@ func queryLegacyMemories(ctx context.Context, db *sql.DB, workspaceID string) ([ return out, rows.Err() } -func truncate(s string, n int) string { - if len(s) <= n { - return s - } - return s[:n] + "…" -} +// truncation moved to internal/textutil.TruncateBytes (#2962 SSOT). diff --git a/workspace-server/cmd/memory-backfill/verify_test.go b/workspace-server/cmd/memory-backfill/verify_test.go index 8ffe806a..1610da43 100644 --- a/workspace-server/cmd/memory-backfill/verify_test.go +++ b/workspace-server/cmd/memory-backfill/verify_test.go @@ -349,16 +349,8 @@ func TestVerifyParity_PickSampleError(t *testing.T) { } } -// --- Truncate --- - -func TestVerifyTruncate(t *testing.T) { - if got := truncate("short", 10); got != "short" { - t.Errorf("got %q", got) - } - if got := truncate(strings.Repeat("a", 200), 10); !strings.HasSuffix(got, "…") { - t.Errorf("expected ellipsis: %q", got) - } -} +// Truncate moved to internal/textutil — coverage in +// internal/textutil/truncate_test.go (TestTruncateBytes_RuneBoundary). // --- CLI: -verify mode --- diff --git a/workspace-server/internal/handlers/a2a_queue.go b/workspace-server/internal/handlers/a2a_queue.go index dd012e02..a3e25a9c 100644 --- a/workspace-server/internal/handlers/a2a_queue.go +++ b/workspace-server/internal/handlers/a2a_queue.go @@ -22,6 +22,7 @@ import ( "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" "github.com/Molecule-AI/molecule-monorepo/platform/internal/events" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil" ) // extractIdempotencyKey pulls params.message.messageId out of an A2A JSON-RPC @@ -420,7 +421,7 @@ func (h *WorkspaceHandler) stitchDrainResponseToDelegation(ctx context.Context, AND method = 'delegate_result' AND target_id = $4 AND response_body->>'delegation_id' = $5 - `, "Delegation completed ("+truncate(responseText, 80)+")", string(respJSON), + `, "Delegation completed ("+textutil.TruncateBytes(responseText, 80)+")", string(respJSON), sourceID, targetID, delegationID) if err != nil { log.Printf("A2AQueue drain stitch: update failed for delegation %s: %v", delegationID, err) @@ -439,7 +440,7 @@ func (h *WorkspaceHandler) stitchDrainResponseToDelegation(ctx context.Context, h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationComplete), sourceID, map[string]interface{}{ "delegation_id": delegationID, "target_id": targetID, - "response_preview": truncate(responseText, 200), + "response_preview": textutil.TruncateBytes(responseText, 200), "via": "queue_drain", }) } diff --git a/workspace-server/internal/handlers/agent_message_writer.go b/workspace-server/internal/handlers/agent_message_writer.go index dbbce3e7..6efea603 100644 --- a/workspace-server/internal/handlers/agent_message_writer.go +++ b/workspace-server/internal/handlers/agent_message_writer.go @@ -42,9 +42,9 @@ import ( "errors" "fmt" "log" - "unicode/utf8" "github.com/Molecule-AI/molecule-monorepo/platform/internal/events" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil" ) // ErrWorkspaceNotFound is returned by AgentMessageWriter.Send when the @@ -54,36 +54,6 @@ import ( // timeout) surface as wrapped errors and should be treated as 503. var ErrWorkspaceNotFound = errors.New("agent_message: workspace not found") -// truncatePreviewRunes returns at most maxRunes runes of s, plus an ellipsis -// when truncated. Operates on the rune (codepoint) boundary instead of -// byte indices — the previous byte-slice version produced invalid UTF-8 -// when maxRunes landed mid-codepoint (CJK, emoji, accented characters -// in agent-authored chat messages), and Postgres JSONB rejects invalid -// UTF-8, dropping the activity_log INSERT silently. The persistence -// failure log fires but the message vanishes from chat history — the -// exact regression class the SSOT consolidation was built to prevent. -// -// maxRunes is in runes, not bytes — `truncatePreviewRunes("你好", 1)` returns -// `"你…"`, not `"\xe4…"`. Set the cap on a UI-friendly basis (visible -// character count, not stored byte count); 80 runes covers the -// activity_logs.summary column comfortably. -func truncatePreviewRunes(s string, maxRunes int) string { - if utf8.RuneCountInString(s) <= maxRunes { - return s - } - // Walk runes until we've consumed maxRunes; cut at that byte index. - count := 0 - cut := len(s) - for i := range s { - if count == maxRunes { - cut = i - break - } - count++ - } - return s[:cut] + "…" -} - // AgentMessageAttachment is one file attached to an agent → user // message. Identical to handlers.NotifyAttachment in field set; kept // distinct so the writer's API doesn't import a handler type with HTTP @@ -186,7 +156,7 @@ func (w *AgentMessageWriter) Send( respPayload["parts"] = fileParts } respJSON, _ := json.Marshal(respPayload) - preview := truncatePreviewRunes(message, 80) + preview := textutil.TruncateRunes(message, 80) if _, err := w.db.ExecContext(ctx, ` INSERT INTO activity_logs (workspace_id, activity_type, method, summary, response_body, status) VALUES ($1, 'a2a_receive', 'notify', $2, $3::jsonb, 'ok') diff --git a/workspace-server/internal/handlers/agent_message_writer_test.go b/workspace-server/internal/handlers/agent_message_writer_test.go index 880d2273..20f5540f 100644 --- a/workspace-server/internal/handlers/agent_message_writer_test.go +++ b/workspace-server/internal/handlers/agent_message_writer_test.go @@ -331,45 +331,11 @@ func TestAgentMessageWriter_Send_DBErrorOnLookupReturnsWrapped(t *testing.T) { } } -// TestTruncatePreviewRunes_RuneBoundary pins the multi-byte-safe -// truncation. The previous byte-slice version produced invalid UTF-8 -// when the cut landed mid-codepoint (CJK, emoji, accented), and -// Postgres JSONB rejects invalid UTF-8 — INSERT fails, log.Printf -// fires, message vanishes from chat history. Per memory -// feedback_assert_exact_not_substring.md, pin the boundary cases -// directly. -func TestTruncatePreviewRunes_RuneBoundary(t *testing.T) { - cases := []struct { - name string - in string - max int - want string - }{ - {"under-max ASCII", "hi", 80, "hi"}, - {"under-max CJK", "你好", 80, "你好"}, - {"exactly-at-max", "abcde", 5, "abcde"}, - {"truncate ASCII", "abcdefghij", 5, "abcde…"}, - {"truncate CJK at rune boundary", "你好世界你好世界", 4, "你好世界…"}, - {"truncate emoji at rune boundary", "😀😀😀😀😀😀", 3, "😀😀😀…"}, - // The pre-fix bug shape: byte-slice on non-ASCII would have - // mangled the codepoint here. With rune-boundary truncation - // the result is well-formed UTF-8. - {"non-zero with emoji prefix", "🚀abcdefghijk", 5, "🚀abcd…"}, - } - for _, c := range cases { - t.Run(c.name, func(t *testing.T) { - got := truncatePreviewRunes(c.in, c.max) - if got != c.want { - t.Errorf("truncatePreviewRunes(%q, %d) = %q, want %q", c.in, c.max, got, c.want) - } - // Always-valid UTF-8 invariant. A byte-slice truncation - // could leave partial codepoints; this version must not. - if !utf8.ValidString(got) { - t.Errorf("truncatePreviewRunes(%q, %d) returned invalid UTF-8: %q", c.in, c.max, got) - } - }) - } -} +// Helper-level truncate tests now live in +// internal/textutil/truncate_test.go (TestTruncateRunes). The +// integration-level coverage that exercises the agent_message_writer +// path with non-ASCII content is TestAgentMessageWriter_Send_NonASCIIMessagePersists +// below. // TestAgentMessageWriter_Send_NonASCIIMessagePersists pins the end-to-end // path for non-ASCII messages — the original reno-stars regression diff --git a/workspace-server/internal/handlers/delegation.go b/workspace-server/internal/handlers/delegation.go index 247fc35f..1bda2c63 100644 --- a/workspace-server/internal/handlers/delegation.go +++ b/workspace-server/internal/handlers/delegation.go @@ -10,6 +10,7 @@ import ( "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" "github.com/Molecule-AI/molecule-monorepo/platform/internal/events" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil" "github.com/gin-gonic/gin" "github.com/google/uuid" ) @@ -167,7 +168,7 @@ func (h *DelegationHandler) Delegate(c *gin.Context) { h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationSent), sourceID, map[string]interface{}{ "delegation_id": delegationID, "target_id": body.TargetID, - "task_preview": truncate(body.Task, 100), + "task_preview": textutil.TruncateBytes(body.Task, 100), }) resp := gin.H{ @@ -407,7 +408,7 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s if _, err := db.DB.ExecContext(ctx, ` INSERT INTO activity_logs (workspace_id, activity_type, method, source_id, target_id, summary, response_body, status) VALUES ($1, 'delegation', 'delegate_result', $2, $3, $4, $5::jsonb, 'completed') - `, sourceID, sourceID, targetID, "Delegation completed ("+truncate(responseText, 80)+")", string(respJSON)); err != nil { + `, sourceID, sourceID, targetID, "Delegation completed ("+textutil.TruncateBytes(responseText, 80)+")", string(respJSON)); err != nil { log.Printf("Delegation %s: failed to insert success log: %v", delegationID, err) } @@ -423,7 +424,7 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationComplete), sourceID, map[string]interface{}{ "delegation_id": delegationID, "target_id": targetID, - "response_preview": truncate(responseText, 200), + "response_preview": textutil.TruncateBytes(responseText, 200), }) // RFC #2829 PR-2 result-push (see UpdateStatus for rationale). pushDelegationResultToInbox(ctx, sourceID, delegationID, "completed", responseText, "") @@ -506,7 +507,7 @@ func (h *DelegationHandler) Record(c *gin.Context) { h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationSent), sourceID, map[string]interface{}{ "delegation_id": body.DelegationID, "target_id": body.TargetID, - "task_preview": truncate(body.Task, 100), + "task_preview": textutil.TruncateBytes(body.Task, 100), }) c.JSON(http.StatusAccepted, gin.H{ @@ -555,12 +556,12 @@ func (h *DelegationHandler) UpdateStatus(c *gin.Context) { if _, err := db.DB.ExecContext(ctx, ` INSERT INTO activity_logs (workspace_id, activity_type, method, source_id, summary, response_body, status) VALUES ($1, 'delegation', 'delegate_result', $2, $3, $4::jsonb, 'completed') - `, sourceID, sourceID, "Delegation completed ("+truncate(body.ResponsePreview, 80)+")", string(respJSON)); err != nil { + `, sourceID, sourceID, "Delegation completed ("+textutil.TruncateBytes(body.ResponsePreview, 80)+")", string(respJSON)); err != nil { log.Printf("Delegation UpdateStatus: result insert failed for %s: %v", delegationID, err) } h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationComplete), sourceID, map[string]interface{}{ "delegation_id": delegationID, - "response_preview": truncate(body.ResponsePreview, 200), + "response_preview": textutil.TruncateBytes(body.ResponsePreview, 200), }) // RFC #2829 PR-2 result-push: when the gate is on, also write an // a2a_receive row so the caller's inbox poller surfaces this to @@ -626,7 +627,7 @@ func (h *DelegationHandler) ListDelegations(c *gin.Context) { entry["error"] = errorDetail } if responseBody != "" { - entry["response_preview"] = truncate(responseBody, 300) + entry["response_preview"] = textutil.TruncateBytes(responseBody, 300) } delegations = append(delegations, entry) } @@ -727,9 +728,3 @@ func extractResponseText(body []byte) string { return string(body) } -func truncate(s string, max int) string { - if len(s) <= max { - return s - } - return s[:max] + "..." -} diff --git a/workspace-server/internal/handlers/delegation_ledger.go b/workspace-server/internal/handlers/delegation_ledger.go index 621cdbd9..89ee2d80 100644 --- a/workspace-server/internal/handlers/delegation_ledger.go +++ b/workspace-server/internal/handlers/delegation_ledger.go @@ -8,6 +8,7 @@ import ( "time" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil" ) // delegation_ledger.go — durable per-task ledger for A2A delegation @@ -50,40 +51,15 @@ func NewDelegationLedger(handle *sql.DB) *DelegationLedger { return &DelegationLedger{db: handle} } -// truncatePreview caps stored preview at 4KB. The full prompt/response is -// already in activity_logs.{request,response}_body — this is the at-a-glance -// view for the dashboard, not a forensic record. +// previewCap caps stored preview at 4KB. The full prompt/response is +// already in activity_logs.{request,response}_body — this is the +// at-a-glance view for the dashboard, not a forensic record. // -// Rune-safe: previous byte-slice form (s[:previewCap]) split on a byte -// boundary, which on a multi-byte codepoint at byte 4096 produced -// invalid UTF-8 — Postgres JSONB rejects → ledger row not inserted → -// audit gap. Issue #2962. Walks the string by rune, stops at the last -// rune-boundary index that fits inside the cap. ASCII-only strings hit -// the cap exactly; CJK/emoji strings stop slightly under the cap, -// never over. -// -// Mirrors the truncatePreviewRunes fix from agent_message_writer.go -// (#2959). Both call sites should consume a shared helper after both -// fixes have landed — followup deduplication tracked in #2962's body. +// Truncation goes through textutil.TruncateBytesNoMarker so it's +// rune-safe (#2026 / #2959 / #2962 bug class: byte-slice mid-codepoint +// → Postgres JSONB rejects → silent INSERT failure → audit gap). const previewCap = 4096 -func truncatePreview(s string) string { - if len(s) <= previewCap { - return s - } - // Range over a string yields rune-boundary byte indices. Walk - // until the next index would exceed previewCap; the previous - // index is the safe truncation point. - end := 0 - for i := range s { - if i > previewCap { - break - } - end = i - } - return s[:end] -} - // InsertOpts is the agent's record-of-intent. Caller, callee, task preview, // and the chosen delegation_id are required; idempotency_key is optional. type InsertOpts struct { @@ -118,7 +94,7 @@ func (l *DelegationLedger) Insert(ctx context.Context, opts InsertOpts) { ) VALUES ($1, $2, $3, $4, 'queued', $5, $6) ON CONFLICT (delegation_id) DO NOTHING `, opts.DelegationID, opts.CallerID, opts.CalleeID, - truncatePreview(opts.TaskPreview), deadline, idemArg) + textutil.TruncateBytesNoMarker(opts.TaskPreview, previewCap), deadline, idemArg) if err != nil { log.Printf("delegation_ledger Insert(%s): %v", opts.DelegationID, err) } @@ -197,7 +173,7 @@ func (l *DelegationLedger) SetStatus(ctx context.Context, result_preview = NULLIF($4, ''), updated_at = now() WHERE delegation_id = $1 - `, delegationID, status, errorDetail, truncatePreview(resultPreview)) + `, delegationID, status, errorDetail, textutil.TruncateBytesNoMarker(resultPreview, previewCap)) return err } diff --git a/workspace-server/internal/handlers/delegation_ledger_test.go b/workspace-server/internal/handlers/delegation_ledger_test.go index 48f1a3cd..78c26def 100644 --- a/workspace-server/internal/handlers/delegation_ledger_test.go +++ b/workspace-server/internal/handlers/delegation_ledger_test.go @@ -2,6 +2,7 @@ package handlers import ( "context" + "database/sql/driver" "errors" "strings" "testing" @@ -74,15 +75,20 @@ func TestLedgerInsert_TruncatesOversizedPreview(t *testing.T) { mock := setupTestDB(t) l := NewDelegationLedger(nil) - huge := strings.Repeat("x", 10_000) // > previewCap + // 4096 / 3 = 1365 runes; +10 for margin so we cross the cap. + // '世' is 3 bytes in UTF-8 (worst case for byte-cap rune walking). + huge := strings.Repeat("世", (previewCap/3)+10) + if len(huge) <= previewCap { + t.Fatalf("test setup: input too short (%d bytes) — must exceed previewCap=%d", len(huge), previewCap) + } mock.ExpectExec(`INSERT INTO delegations`). WithArgs( "deleg-big", "c", "ca", - sqlmock.AnyArg(), // truncated preview — verify length below via custom matcher - sqlmock.AnyArg(), - sqlmock.AnyArg(), + capValidUTF8Matcher{cap: previewCap}, // truncated preview must fit cap AND be valid UTF-8 + sqlmock.AnyArg(), // deadline + sqlmock.AnyArg(), // idempotency_key ). WillReturnResult(sqlmock.NewResult(0, 1)) @@ -97,87 +103,28 @@ func TestLedgerInsert_TruncatesOversizedPreview(t *testing.T) { } } -// ---------- truncatePreview unit ---------- +// capValidUTF8Matcher pins #2962 at the integration boundary: the +// preview that lands in the INSERT MUST be valid UTF-8 (else Postgres +// JSONB rejects → silent audit gap) AND fit within the byte cap. Pre- +// migration this would have asserted on the corrupted "世" mid-codepoint +// byte slice; post-migration it asserts the truncated preview is a +// clean rune-aligned prefix. +type capValidUTF8Matcher struct{ cap int } -func TestTruncatePreview_UnderCap(t *testing.T) { - in := "short" - if got := truncatePreview(in); got != in { - t.Errorf("under-cap should passthrough; got %q", got) +func (m capValidUTF8Matcher) Match(v driver.Value) bool { + s, ok := v.(string) + if !ok { + return false } + return len(s) <= m.cap && utf8.ValidString(s) } -func TestTruncatePreview_OverCapTruncatesAtBoundary(t *testing.T) { - in := strings.Repeat("a", previewCap+100) - got := truncatePreview(in) - if len(got) != previewCap { - t.Errorf("expected len=%d got len=%d", previewCap, len(got)) - } -} - -func TestTruncatePreview_ExactlyAtCap(t *testing.T) { - in := strings.Repeat("a", previewCap) - got := truncatePreview(in) - if got != in { - t.Errorf("at-cap should passthrough unchanged") - } -} - -// TestTruncatePreview_NeverProducesInvalidUTF8 — pins #2962. The old -// byte-slice implementation (s[:previewCap]) split on a byte boundary, -// so a multi-byte codepoint straddling byte 4096 produced invalid -// UTF-8 → Postgres JSONB rejects → ledger row not inserted → audit -// gap. Test feeds a CJK / emoji-padded string longer than previewCap -// and asserts utf8.ValidString on the result. -func TestTruncatePreview_NeverProducesInvalidUTF8(t *testing.T) { - // Build a string of '世' (3 bytes per rune in UTF-8) that's just - // past the cap. With the old implementation, the slice at byte - // previewCap would land mid-rune and ValidString would fail. - // With the rune-aware implementation, the result is always valid - // UTF-8 even if the byte length is < previewCap. - rune3 := "世" // U+4E16, 3 bytes - // Need at least previewCap/3 + 1 runes so we cross the cap with - // margin to spare. - in := strings.Repeat(rune3, (previewCap/3)+10) - if len(in) <= previewCap { - t.Fatalf("test setup: input too short (%d bytes) — must exceed previewCap=%d", len(in), previewCap) - } - got := truncatePreview(in) - if !utf8.ValidString(got) { - t.Errorf("truncatePreview produced invalid UTF-8 — JSONB will reject this row. len(got)=%d", len(got)) - } - if len(got) > previewCap { - t.Errorf("truncatePreview exceeded cap: len(got)=%d > previewCap=%d", len(got), previewCap) - } - // Defense-in-depth: the result should also be a clean rune - // prefix of the input — not some garbled sequence. - if !strings.HasPrefix(in, got) { - t.Errorf("truncatePreview should return a prefix of the input") - } -} - -// TestTruncatePreview_MultiByteAtBoundary — most-targeted regression. -// Feeds an input where the cap byte falls EXACTLY in the middle of a -// 3-byte codepoint. Pre-fix, this is the case that produces invalid -// UTF-8; post-fix, the truncate stops at the previous rune boundary. -func TestTruncatePreview_MultiByteAtBoundary(t *testing.T) { - // Build a string that's `previewCap-1` ASCII bytes followed by - // '世' (3 bytes). Total = previewCap + 2. The old impl would - // slice at byte previewCap, landing inside the '世' codepoint. - prefix := strings.Repeat("a", previewCap-1) - in := prefix + "世" - if len(in) != previewCap+2 { - t.Fatalf("test setup: expected len %d, got %d", previewCap+2, len(in)) - } - got := truncatePreview(in) - if !utf8.ValidString(got) { - t.Errorf("truncatePreview produced invalid UTF-8 at the multi-byte boundary case") - } - // Result should be exactly the ASCII prefix — '世' was past - // the cap so it must be dropped entirely. - if got != prefix { - t.Errorf("expected exact ASCII prefix, got %q (len=%d)", got[len(got)-10:], len(got)) - } -} +// Helper-level truncation tests now live in +// internal/textutil/truncate_test.go. The integration-level path +// (TestLedgerInsert_TruncatesOversizedPreview above) still exercises +// the previewCap boundary through the SQL write so a regression in +// the wiring (wrong cap, wrong helper, missing call) would still go +// red here. // ---------- SetStatus lifecycle ---------- diff --git a/workspace-server/internal/memory/client/client.go b/workspace-server/internal/memory/client/client.go index 194ea21b..2dce9267 100644 --- a/workspace-server/internal/memory/client/client.go +++ b/workspace-server/internal/memory/client/client.go @@ -35,6 +35,7 @@ import ( "time" "github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/contract" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil" ) const ( @@ -340,7 +341,7 @@ func decodeError(resp *http.Response) error { // have rather than dropping it. return &contract.Error{ Code: httpStatusToCode(resp.StatusCode), - Message: fmt.Sprintf("status %d: %s", resp.StatusCode, truncate(string(body), 256)), + Message: fmt.Sprintf("status %d: %s", resp.StatusCode, textutil.TruncateBytes(string(body), 256)), } } return &e @@ -359,12 +360,7 @@ func httpStatusToCode(status int) contract.ErrorCode { } } -func truncate(s string, n int) string { - if len(s) <= n { - return s - } - return s[:n] + "…" -} +// truncation moved to internal/textutil.TruncateBytes (#2962 SSOT). // --- Circuit breaker --- diff --git a/workspace-server/internal/memory/client/client_test.go b/workspace-server/internal/memory/client/client_test.go index 9d18d23c..3f7830b4 100644 --- a/workspace-server/internal/memory/client/client_test.go +++ b/workspace-server/internal/memory/client/client_test.go @@ -499,14 +499,10 @@ func TestHttpStatusToCode(t *testing.T) { } } -func TestTruncate(t *testing.T) { - if got := truncate("short", 10); got != "short" { - t.Errorf("got %q", got) - } - if got := truncate(strings.Repeat("a", 300), 10); !strings.HasSuffix(got, "…") { - t.Errorf("expected ellipsis: %q", got) - } -} +// Truncate moved to internal/textutil — coverage lives in +// internal/textutil/truncate_test.go (TestTruncateBytes_RuneBoundary). +// memory/client just calls it as a wire-shape helper for error +// messages; no client-specific behavior to pin here. // --- Circuit breaker --- diff --git a/workspace-server/internal/scheduler/scheduler.go b/workspace-server/internal/scheduler/scheduler.go index a7a969ca..53f17e0c 100644 --- a/workspace-server/internal/scheduler/scheduler.go +++ b/workspace-server/internal/scheduler/scheduler.go @@ -17,6 +17,7 @@ import ( "github.com/Molecule-AI/molecule-monorepo/platform/internal/events" "github.com/Molecule-AI/molecule-monorepo/platform/internal/metrics" "github.com/Molecule-AI/molecule-monorepo/platform/internal/supervised" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil" ) const ( @@ -522,7 +523,7 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) { "schedule_id": sched.ID, "schedule_name": sched.Name, "cron_expr": sched.CronExpr, - "prompt": sanitizeUTF8(truncate(sched.Prompt, 200)), + "prompt": sanitizeUTF8(textutil.TruncateBytes(sched.Prompt, 200)), }) // #152: persist lastError into error_detail on the activity_logs row // so GET /workspaces/:id/schedules/:id/history can surface why a run @@ -807,27 +808,10 @@ func isEmptyResponse(body []byte) bool { return false } -// truncate shortens s to at most maxLen bytes, appending "..." if truncated. -// #2026: UTF-8 safe — byte-slicing at maxLen-3 would split multi-byte runes -// (observed: U+2026 `…` = 0xe2 0x80 0xa6, sliced mid-char, concatenated with -// "..." producing 0xe2 0x80 0x2e — rejected by Postgres as invalid UTF-8, -// which wedged the activity_logs INSERT with no deadline and stalled the -// scheduler). -func truncate(s string, maxLen int) string { - if len(s) <= maxLen { - return s - } - cut := maxLen - 3 - if cut < 0 { - cut = 0 - } - // Back up to a rune boundary — utf8.RuneStart returns true for any - // non-continuation byte (ASCII, or the lead byte of a multi-byte rune). - for cut > 0 && !utf8.RuneStart(s[cut]) { - cut-- - } - return s[:cut] + "..." -} +// truncation moved to internal/textutil.TruncateBytes (#2962 SSOT). +// The original #2026 fix lives in textutil's package docs as canonical +// prior art. Ellipsis was previously "..." (3 ASCII bytes); the SSOT +// uses "…" (3 UTF-8 bytes) — same byte budget, single-glyph display. // short returns up to n leading characters of s without panicking when s is // shorter than n. Used to safely display UUID prefixes in log lines where diff --git a/workspace-server/internal/scheduler/scheduler_test.go b/workspace-server/internal/scheduler/scheduler_test.go index 2367d721..742ec0ad 100644 --- a/workspace-server/internal/scheduler/scheduler_test.go +++ b/workspace-server/internal/scheduler/scheduler_test.go @@ -10,6 +10,7 @@ import ( sqlmock "github.com/DATA-DOG/go-sqlmock" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil" ) // errDBDown is a sentinel error used by tests to simulate a DB connection failure. @@ -618,7 +619,7 @@ func TestTruncate_utf8Safe_regression2026(t *testing.T) { filler += "a" } input := filler + "…xxx" // 195 ASCII + 3-byte rune + 3 trailing - out := truncate(input, 200) + out := textutil.TruncateBytes(input, 200) if !utf8.ValidString(out) { t.Fatalf("truncate produced invalid UTF-8: %x", []byte(out)) diff --git a/workspace-server/internal/textutil/truncate.go b/workspace-server/internal/textutil/truncate.go new file mode 100644 index 00000000..4a338598 --- /dev/null +++ b/workspace-server/internal/textutil/truncate.go @@ -0,0 +1,130 @@ +// Package textutil provides string-handling helpers that respect UTF-8 +// rune boundaries. +// +// Why this package exists +// ----------------------- +// `s[:max]` truncates by BYTES; for any string with a multi-byte +// codepoint at byte `max` (CJK, emoji, accented Latin), the slice +// produces invalid UTF-8. Postgres `text` and `jsonb` columns reject +// invalid UTF-8 with `invalid byte sequence for encoding "UTF8"`, +// which silently fails the INSERT and holds the surrounding tx open +// — a class of audit-gap that has bitten this codebase three times +// (scheduler.go #2026, agent_message_writer.go #2959, +// delegation_ledger.go #2962). Six per-package helpers had +// independently re-implemented this logic with varying correctness; +// this package is the single source of truth. +// +// Use sites +// --------- +// - DB writes whose column is bytes-bounded (jsonb preview field, +// varchar(N)): TruncateBytes / TruncateBytesNoMarker. +// - UI summaries whose cap is in display chars, not bytes: +// TruncateRunes. +// +// All functions guarantee `utf8.ValidString(out) == true` for any +// `s` where `utf8.ValidString(s) == true`. Inputs that are already +// invalid UTF-8 should be sanitized at the trust boundary (e.g. via +// `strings.ToValidUTF8`); this package does not silently fix +// upstream invalid input. +package textutil + +import "unicode/utf8" + +// ellipsis is the truncation marker. U+2026 HORIZONTAL ELLIPSIS — +// 3 bytes in UTF-8, 1 rune, 1 display column. Standardized across +// the codebase to avoid the "..." (3 ASCII chars) vs "…" (1 char) +// inconsistency the per-package helpers had drifted into. +const ellipsis = "…" + +// TruncateBytes returns s if `len(s) <= maxBytes`, otherwise returns +// the longest rune-aligned prefix of s that fits in `maxBytes - 3` +// bytes followed by the ellipsis marker. The returned string is +// always at most `maxBytes` bytes long. +// +// Example: TruncateBytes("你好世界你好", 10) returns "你好世…" (9 bytes) +// — three "你好" runes (each 3 bytes = 9 bytes) plus "…" (3 bytes) +// would be 12 bytes, so we walk back to "你好" (6 bytes) + "…" (3) = 9. +// +// Edge cases: +// - maxBytes <= 0: returns "" (no room even for input or marker) +// - maxBytes < len(ellipsis): returns "" (can't add marker without +// exceeding cap, and we won't return a marker-less truncation +// here — caller wanted a marker; use TruncateBytesNoMarker if +// they don't) +// - s contains invalid UTF-8: continuation bytes are walked over +// same as valid runes; the result preserves the (invalid) input +// bytes up to the truncation point. Caller is responsible for +// pre-sanitizing if Postgres validity is required. +func TruncateBytes(s string, maxBytes int) string { + if len(s) <= maxBytes { + return s + } + if maxBytes < len(ellipsis) { + return "" + } + // Reserve room for the marker, then walk back to the nearest + // rune boundary at or below the cut point. + cut := maxBytes - len(ellipsis) + for cut > 0 && !utf8.RuneStart(s[cut]) { + cut-- + } + return s[:cut] + ellipsis +} + +// TruncateBytesNoMarker returns s if `len(s) <= maxBytes`, otherwise +// returns the longest rune-aligned prefix of s that fits in +// `maxBytes` bytes. No marker is appended — useful when the caller's +// storage already conveys "preview" / "snippet" semantics and an +// extra ellipsis would push the result over a hard column cap. +// +// Example: TruncateBytesNoMarker("hello world", 5) returns "hello". +// +// Edge case: maxBytes <= 0 returns "". +func TruncateBytesNoMarker(s string, maxBytes int) string { + if len(s) <= maxBytes { + return s + } + if maxBytes <= 0 { + return "" + } + cut := maxBytes + for cut > 0 && !utf8.RuneStart(s[cut]) { + cut-- + } + return s[:cut] +} + +// TruncateRunes returns s if it has at most maxRunes runes, otherwise +// returns the first maxRunes runes followed by the ellipsis marker. +// Use this when the cap is in user-visible characters (UI summary, +// activity feed line) rather than bytes (DB column). +// +// Example: TruncateRunes("你好世界你好", 3) returns "你好世…" — three +// runes plus the marker, regardless of the resulting byte count. +// +// Edge case: maxRunes <= 0 returns "" (caller asked for no content). +func TruncateRunes(s string, maxRunes int) string { + if maxRunes <= 0 { + return "" + } + // Fast path: if every byte is a single-byte rune, the byte-length + // upper-bounds the rune count. This avoids a runes alloc for the + // common ASCII case where the input fits. + if len(s) <= maxRunes { + return s + } + // Walk by rune boundaries; stop at the (maxRunes+1)-th rune so we + // know the cut point and that truncation is needed. + count := 0 + for i := range s { + if count == maxRunes { + return s[:i] + ellipsis + } + count++ + } + // Reachable when the byte count exceeded maxRunes but the actual + // rune count didn't (e.g. all single-byte runes that just happen + // to be more than maxRunes). The fast path catches len(s) <= + // maxRunes; this catches maxRunes < runeCount(s) <= len(s). + return s +} diff --git a/workspace-server/internal/textutil/truncate_test.go b/workspace-server/internal/textutil/truncate_test.go new file mode 100644 index 00000000..fce0ae0f --- /dev/null +++ b/workspace-server/internal/textutil/truncate_test.go @@ -0,0 +1,222 @@ +package textutil + +import ( + "testing" + "unicode/utf8" +) + +// TestTruncateBytes_RuneBoundary pins the byte-cap, marker-bearing +// truncation path. Every case asserts both: +// 1. the exact expected output (so a refactor that flips ellipsis or +// drops a rune is caught), and +// 2. utf8.ValidString on the output (the invariant that the bug class +// in #2026/#2959/#2962 violated by slicing mid-codepoint). +// +// Per memory feedback_assert_exact_not_substring.md, asserts are exact +// equality, not substring matches. +func TestTruncateBytes_RuneBoundary(t *testing.T) { + cases := []struct { + name string + in string + maxBytes int + want string + }{ + // Under-cap: returns input verbatim. + {"empty", "", 10, ""}, + {"under-cap ASCII", "hi", 10, "hi"}, + {"exactly-at-cap ASCII", "hello", 5, "hello"}, + {"under-cap CJK", "你好", 10, "你好"}, // 6 bytes + {"exactly-at-cap CJK", "你好", 6, "你好"}, + + // Over-cap ASCII: trims to (maxBytes - 3) bytes + "…". + {"over-cap ASCII", "abcdefghij", 6, "abc…"}, + + // Over-cap CJK where cut would land mid-codepoint. The + // pre-fix bug shape: 7 - 3 = 4, but byte 4 is mid-"好" + // (好 is bytes 3..5 of "你好世界"). Walking back to byte 3 + // (start of 好 — wait, that IS the start). Actually 你=0..2, + // 好=3..5, 世=6..8, 界=9..11. Cut=4, walk back to 3 (start + // of 好), then s[:3]="你", + "…" = "你…" (3+3=6 bytes ≤ 7). + {"over-cap CJK lands mid-codepoint", "你好世界", 7, "你…"}, + + // Over-cap CJK where cut lands exactly on rune boundary. + // 9 - 3 = 6, byte 6 is start of 世. Walk-back is no-op. + // s[:6]="你好" + "…" = "你好…" (9 bytes). + {"over-cap CJK rune-aligned", "你好世界", 9, "你好…"}, + + // Emoji: 😀 is 4 bytes (U+1F600). 7 - 3 = 4, byte 4 is start + // of second 😀 — walk-back no-op. s[:4]="😀" + "…" = "😀…". + {"over-cap emoji", "😀😀😀", 7, "😀…"}, + + // Mixed ASCII + CJK. "ab你好世界": a(1) b(1) 你(3) 好(3) 世(3) 界(3) = 14 bytes. + // maxBytes=8, 8-3=5. byte 5 is mid-好. Walk back to start of 好 = byte 5? Let me + // recompute: a=0, b=1, 你=2..4, 好=5..7, 世=8..10. Byte 5 IS start of 好. + // Walk-back keeps cut at 5. s[:5] = "ab你" + "…" = "ab你…" (8 bytes). + {"mixed prefix ASCII over-cap CJK", "ab你好世界", 8, "ab你…"}, + + // Pathological: maxBytes too small to even fit the marker. + {"cap below ellipsis len", "hello", 2, ""}, + {"cap zero", "hello", 0, ""}, + {"cap negative", "hello", -1, ""}, + + // Cap exactly == ellipsis len: no room for content, but + // the marker fits. This returns "" (cut = 0, s[:0] = ""). + {"cap equals ellipsis len", "hello", 3, "…"}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := TruncateBytes(c.in, c.maxBytes) + if got != c.want { + t.Errorf("TruncateBytes(%q, %d) = %q, want %q", c.in, c.maxBytes, got, c.want) + } + if !utf8.ValidString(got) { + t.Errorf("TruncateBytes(%q, %d) returned invalid UTF-8: %q", c.in, c.maxBytes, got) + } + // Output never exceeds the byte cap (when one is set). + if c.maxBytes > 0 && len(got) > c.maxBytes { + t.Errorf("TruncateBytes(%q, %d) overflowed cap: len(out)=%d > %d", + c.in, c.maxBytes, len(got), c.maxBytes) + } + }) + } +} + +// TestTruncateBytesNoMarker pins the marker-less variant. Same +// boundary handling as TruncateBytes but no ellipsis cost — the cut +// happens at maxBytes itself, walking back only if that lands +// mid-codepoint. +func TestTruncateBytesNoMarker(t *testing.T) { + cases := []struct { + name string + in string + maxBytes int + want string + }{ + {"empty", "", 10, ""}, + {"under-cap ASCII", "hi", 10, "hi"}, + {"exactly-at-cap ASCII", "hello", 5, "hello"}, + {"over-cap ASCII", "abcdefghij", 5, "abcde"}, + + // Over-cap CJK rune-aligned: "你好世界", maxBytes=6, byte 6 is start of 世. + // s[:6]="你好" — perfect cut. + {"over-cap CJK rune-aligned", "你好世界", 6, "你好"}, + + // Over-cap CJK mid-codepoint: maxBytes=4, byte 4 is mid-好. + // Walk back to byte 3 (start of 好), s[:3]="你". + {"over-cap CJK mid-codepoint", "你好世界", 4, "你"}, + + // Emoji: maxBytes=5, "😀😀" is bytes 0..3 then 4..7. byte 5 is mid-second-😀. + // Walk back to byte 4 (start of second 😀), s[:4]="😀". + {"over-cap emoji", "😀😀", 5, "😀"}, + + // Edge: cap zero or negative → "". + {"cap zero", "hello", 0, ""}, + {"cap negative", "hello", -1, ""}, + + // Cap = 1 and first rune is multi-byte: walk-back to 0, return "". + {"cap one with leading CJK", "你hello", 1, ""}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := TruncateBytesNoMarker(c.in, c.maxBytes) + if got != c.want { + t.Errorf("TruncateBytesNoMarker(%q, %d) = %q, want %q", c.in, c.maxBytes, got, c.want) + } + if !utf8.ValidString(got) { + t.Errorf("TruncateBytesNoMarker(%q, %d) returned invalid UTF-8: %q", c.in, c.maxBytes, got) + } + if c.maxBytes > 0 && len(got) > c.maxBytes { + t.Errorf("TruncateBytesNoMarker(%q, %d) overflowed cap: len(out)=%d > %d", + c.in, c.maxBytes, len(got), c.maxBytes) + } + }) + } +} + +// TestTruncateRunes pins the rune-cap variant. The key contract is +// that maxRunes counts user-visible characters (Go runes, which line +// up with Unicode codepoints), not bytes — so "你好世界" with +// maxRunes=2 returns "你好…", regardless of the resulting byte count. +func TestTruncateRunes(t *testing.T) { + cases := []struct { + name string + in string + maxRunes int + want string + }{ + {"empty", "", 5, ""}, + {"under-cap ASCII", "hi", 5, "hi"}, + {"exactly-at-cap ASCII", "hello", 5, "hello"}, + {"over-cap ASCII", "abcdefghij", 5, "abcde…"}, + + {"under-cap CJK", "你好", 5, "你好"}, + {"exactly-at-cap CJK", "你好", 2, "你好"}, + + // Over-cap CJK: maxRunes=3, expect first 3 runes + marker. + {"over-cap CJK", "你好世界你好", 3, "你好世…"}, + + // Emoji is one rune per glyph in Go (no ZWJ here). + {"over-cap emoji", "😀😀😀😀😀", 2, "😀😀…"}, + + // Mixed: maxRunes=3 of "ab你好世界" → "ab你…". + {"mixed prefix", "ab你好世界", 3, "ab你…"}, + + // Edge: maxRunes 0 / negative → "". + {"cap zero", "hello", 0, ""}, + {"cap negative", "hello", -1, ""}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := TruncateRunes(c.in, c.maxRunes) + if got != c.want { + t.Errorf("TruncateRunes(%q, %d) = %q, want %q", c.in, c.maxRunes, got, c.want) + } + if !utf8.ValidString(got) { + t.Errorf("TruncateRunes(%q, %d) returned invalid UTF-8: %q", c.in, c.maxRunes, got) + } + }) + } +} + +// TestTruncate_FuzzInvariants stays as a property-style sanity check: +// for any rune-valid input and any cap, the output is rune-valid and +// (for byte-cap variants) within the cap. This catches off-by-one +// regressions in cuts that slip past the table-test cases above. +func TestTruncate_FuzzInvariants(t *testing.T) { + inputs := []string{ + "", + "a", + "hello world", + "你好世界", + "😀😀😀", + "ab你c好d世e界", + "日本語の文字列", + "🇺🇸🇯🇵", // flags: each is 2 codepoints (regional indicators) + } + for _, in := range inputs { + for cap := -1; cap <= len(in)+5; cap++ { + t.Run("", func(t *testing.T) { + gotB := TruncateBytes(in, cap) + if !utf8.ValidString(gotB) { + t.Errorf("TruncateBytes(%q, %d) invalid UTF-8: %q", in, cap, gotB) + } + if cap > 0 && len(gotB) > cap { + t.Errorf("TruncateBytes(%q, %d) overflowed: %q (%d bytes)", in, cap, gotB, len(gotB)) + } + + gotN := TruncateBytesNoMarker(in, cap) + if !utf8.ValidString(gotN) { + t.Errorf("TruncateBytesNoMarker(%q, %d) invalid UTF-8: %q", in, cap, gotN) + } + if cap > 0 && len(gotN) > cap { + t.Errorf("TruncateBytesNoMarker(%q, %d) overflowed: %q (%d bytes)", in, cap, gotN, len(gotN)) + } + + gotR := TruncateRunes(in, cap) + if !utf8.ValidString(gotR) { + t.Errorf("TruncateRunes(%q, %d) invalid UTF-8: %q", in, cap, gotR) + } + }) + } + } +}