forked from molecule-ai/molecule-core
fix(textutil): SSOT for rune-safe string truncation, fix 3 audit-gap bugs
Closes #2962. ## Why Six per-package `truncate` helpers had drifted into independent re-implementations of the same idea. Three of them (delegation.go, memory/client/client.go, memory-backfill/verify.go) used `s[:max] + "…"` byte-slice form, which on a multi-byte codepoint at byte `max` produces invalid UTF-8 → Postgres `text`/`jsonb` rejects the INSERT silently → `delegation` / `activity_logs` row never lands → audit gap. Three other helpers (delegation_ledger.go #2962, agent_message_writer.go #2959, scheduler.go #2026) had each been fixed in isolation with three slightly different rune-safe shapes — confirming this is a class of bug, not a single instance. ## What New package `internal/textutil` with three rune-safe functions: - `TruncateBytes(s, maxBytes)` — byte-cap, "…" marker. Used by 5 callers writing into byte-bounded columns / log lines. - `TruncateBytesNoMarker(s, maxBytes)` — byte-cap, no marker. Used by delegation_ledger.go where the storage already conveys "preview" and an extra ellipsis would push the result over the column cap. - `TruncateRunes(s, maxRunes)` — rune-cap, "…" marker. Used by agent_message_writer.go where the cap is in display chars (UI summary), not bytes. All three guarantee `utf8.ValidString(out)` for any `utf8.ValidString(in)`. Inputs already invalid go through `sanitizeUTF8` at the call site boundary (scheduler.go preserved this defense-in-depth). ## Migration map | Old | New | Behavior change | |---|---|---| | `delegation_ledger.truncatePreview` | `textutil.TruncateBytesNoMarker(s, 4096)` | none | | `agent_message_writer.truncatePreviewRunes` | `textutil.TruncateRunes(s, n)` | none | | `scheduler.truncate` | `textutil.TruncateBytes(s, n)` | "..." → "…" (3 bytes either way; single-glyph display) | | `delegation.truncate` | `textutil.TruncateBytes(s, n)` | bug fix + ellipsis swap | | `memory/client.truncate` | `textutil.TruncateBytes(s, n)` | bug fix | | `memory-backfill.truncate` | `textutil.TruncateBytes(s, n)` | bug fix | Five separate `truncate*` helpers + their per-package tests removed. Net: 12 files / +427 / -255. ## Tests - `internal/textutil/truncate_test.go` — 27 table-test cases + 145 fuzz-invariant cases asserting `utf8.ValidString` and byte-cap invariants on every output. - `delegation_ledger_test.go TestLedgerInsert_TruncatesOversizedPreview` strengthened with `capValidUTF8Matcher` so the SQL-write argument is asserted to be valid UTF-8 + within cap (not just `AnyArg()`). Mutation-tested: replacing the SSOT call with byte-slice form makes this test fail loud. ## Compatibility - All callers internal; no external API surface change. - Ellipsis swap "..." → "…": same byte budget (3 bytes), single-glyph display. No alerting/grep on either marker in this codebase (verified). Canvas renders both correctly. - DB column widths unchanged (4096 / 80 / 200 / 256 / 300 — all preserved in the migrations). ## Security Fixes a silent INSERT-failure mode that hid `activity_logs` / `delegations` rows containing peer-controlled text. The class of input that triggered it (CJK, emoji, accented Latin) is normal user content, not malicious — but the symptom (audit gap) makes incident reconstruction harder. Helper is pure-function over `string`; no secrets / PII / auth handling involved. Untrusted input is handled identically to before, just rune-aligned now. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c53155ec5f
commit
656a02fae4
@ -21,6 +21,7 @@ import (
|
||||
"os"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/contract"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||
)
|
||||
|
||||
// verifyConfig is the typed dependency bundle for verifyParity.
|
||||
@ -121,7 +122,7 @@ func verifyParity(ctx context.Context, cfg verifyConfig, stdout *os.File) (*veri
|
||||
matched := true
|
||||
for _, c := range legacy {
|
||||
if pluginContents[c] == 0 {
|
||||
fmt.Fprintf(stdout, "[mismatch] workspace=%s missing-from-plugin content=%q\n", wsID, truncate(c, 80))
|
||||
fmt.Fprintf(stdout, "[mismatch] workspace=%s missing-from-plugin content=%q\n", wsID, textutil.TruncateBytes(c, 80))
|
||||
matched = false
|
||||
break
|
||||
}
|
||||
@ -192,9 +193,4 @@ func queryLegacyMemories(ctx context.Context, db *sql.DB, workspaceID string) ([
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
func truncate(s string, n int) string {
|
||||
if len(s) <= n {
|
||||
return s
|
||||
}
|
||||
return s[:n] + "…"
|
||||
}
|
||||
// truncation moved to internal/textutil.TruncateBytes (#2962 SSOT).
|
||||
|
||||
@ -349,16 +349,8 @@ func TestVerifyParity_PickSampleError(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// --- Truncate ---
|
||||
|
||||
func TestVerifyTruncate(t *testing.T) {
|
||||
if got := truncate("short", 10); got != "short" {
|
||||
t.Errorf("got %q", got)
|
||||
}
|
||||
if got := truncate(strings.Repeat("a", 200), 10); !strings.HasSuffix(got, "…") {
|
||||
t.Errorf("expected ellipsis: %q", got)
|
||||
}
|
||||
}
|
||||
// Truncate moved to internal/textutil — coverage in
|
||||
// internal/textutil/truncate_test.go (TestTruncateBytes_RuneBoundary).
|
||||
|
||||
// --- CLI: -verify mode ---
|
||||
|
||||
|
||||
@ -22,6 +22,7 @@ import (
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||
)
|
||||
|
||||
// extractIdempotencyKey pulls params.message.messageId out of an A2A JSON-RPC
|
||||
@ -420,7 +421,7 @@ func (h *WorkspaceHandler) stitchDrainResponseToDelegation(ctx context.Context,
|
||||
AND method = 'delegate_result'
|
||||
AND target_id = $4
|
||||
AND response_body->>'delegation_id' = $5
|
||||
`, "Delegation completed ("+truncate(responseText, 80)+")", string(respJSON),
|
||||
`, "Delegation completed ("+textutil.TruncateBytes(responseText, 80)+")", string(respJSON),
|
||||
sourceID, targetID, delegationID)
|
||||
if err != nil {
|
||||
log.Printf("A2AQueue drain stitch: update failed for delegation %s: %v", delegationID, err)
|
||||
@ -439,7 +440,7 @@ func (h *WorkspaceHandler) stitchDrainResponseToDelegation(ctx context.Context,
|
||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationComplete), sourceID, map[string]interface{}{
|
||||
"delegation_id": delegationID,
|
||||
"target_id": targetID,
|
||||
"response_preview": truncate(responseText, 200),
|
||||
"response_preview": textutil.TruncateBytes(responseText, 200),
|
||||
"via": "queue_drain",
|
||||
})
|
||||
}
|
||||
|
||||
@ -42,9 +42,9 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||
)
|
||||
|
||||
// ErrWorkspaceNotFound is returned by AgentMessageWriter.Send when the
|
||||
@ -54,36 +54,6 @@ import (
|
||||
// timeout) surface as wrapped errors and should be treated as 503.
|
||||
var ErrWorkspaceNotFound = errors.New("agent_message: workspace not found")
|
||||
|
||||
// truncatePreviewRunes returns at most maxRunes runes of s, plus an ellipsis
|
||||
// when truncated. Operates on the rune (codepoint) boundary instead of
|
||||
// byte indices — the previous byte-slice version produced invalid UTF-8
|
||||
// when maxRunes landed mid-codepoint (CJK, emoji, accented characters
|
||||
// in agent-authored chat messages), and Postgres JSONB rejects invalid
|
||||
// UTF-8, dropping the activity_log INSERT silently. The persistence
|
||||
// failure log fires but the message vanishes from chat history — the
|
||||
// exact regression class the SSOT consolidation was built to prevent.
|
||||
//
|
||||
// maxRunes is in runes, not bytes — `truncatePreviewRunes("你好", 1)` returns
|
||||
// `"你…"`, not `"\xe4…"`. Set the cap on a UI-friendly basis (visible
|
||||
// character count, not stored byte count); 80 runes covers the
|
||||
// activity_logs.summary column comfortably.
|
||||
func truncatePreviewRunes(s string, maxRunes int) string {
|
||||
if utf8.RuneCountInString(s) <= maxRunes {
|
||||
return s
|
||||
}
|
||||
// Walk runes until we've consumed maxRunes; cut at that byte index.
|
||||
count := 0
|
||||
cut := len(s)
|
||||
for i := range s {
|
||||
if count == maxRunes {
|
||||
cut = i
|
||||
break
|
||||
}
|
||||
count++
|
||||
}
|
||||
return s[:cut] + "…"
|
||||
}
|
||||
|
||||
// AgentMessageAttachment is one file attached to an agent → user
|
||||
// message. Identical to handlers.NotifyAttachment in field set; kept
|
||||
// distinct so the writer's API doesn't import a handler type with HTTP
|
||||
@ -186,7 +156,7 @@ func (w *AgentMessageWriter) Send(
|
||||
respPayload["parts"] = fileParts
|
||||
}
|
||||
respJSON, _ := json.Marshal(respPayload)
|
||||
preview := truncatePreviewRunes(message, 80)
|
||||
preview := textutil.TruncateRunes(message, 80)
|
||||
if _, err := w.db.ExecContext(ctx, `
|
||||
INSERT INTO activity_logs (workspace_id, activity_type, method, summary, response_body, status)
|
||||
VALUES ($1, 'a2a_receive', 'notify', $2, $3::jsonb, 'ok')
|
||||
|
||||
@ -331,45 +331,11 @@ func TestAgentMessageWriter_Send_DBErrorOnLookupReturnsWrapped(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestTruncatePreviewRunes_RuneBoundary pins the multi-byte-safe
|
||||
// truncation. The previous byte-slice version produced invalid UTF-8
|
||||
// when the cut landed mid-codepoint (CJK, emoji, accented), and
|
||||
// Postgres JSONB rejects invalid UTF-8 — INSERT fails, log.Printf
|
||||
// fires, message vanishes from chat history. Per memory
|
||||
// feedback_assert_exact_not_substring.md, pin the boundary cases
|
||||
// directly.
|
||||
func TestTruncatePreviewRunes_RuneBoundary(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
in string
|
||||
max int
|
||||
want string
|
||||
}{
|
||||
{"under-max ASCII", "hi", 80, "hi"},
|
||||
{"under-max CJK", "你好", 80, "你好"},
|
||||
{"exactly-at-max", "abcde", 5, "abcde"},
|
||||
{"truncate ASCII", "abcdefghij", 5, "abcde…"},
|
||||
{"truncate CJK at rune boundary", "你好世界你好世界", 4, "你好世界…"},
|
||||
{"truncate emoji at rune boundary", "😀😀😀😀😀😀", 3, "😀😀😀…"},
|
||||
// The pre-fix bug shape: byte-slice on non-ASCII would have
|
||||
// mangled the codepoint here. With rune-boundary truncation
|
||||
// the result is well-formed UTF-8.
|
||||
{"non-zero with emoji prefix", "🚀abcdefghijk", 5, "🚀abcd…"},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
got := truncatePreviewRunes(c.in, c.max)
|
||||
if got != c.want {
|
||||
t.Errorf("truncatePreviewRunes(%q, %d) = %q, want %q", c.in, c.max, got, c.want)
|
||||
}
|
||||
// Always-valid UTF-8 invariant. A byte-slice truncation
|
||||
// could leave partial codepoints; this version must not.
|
||||
if !utf8.ValidString(got) {
|
||||
t.Errorf("truncatePreviewRunes(%q, %d) returned invalid UTF-8: %q", c.in, c.max, got)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
// Helper-level truncate tests now live in
|
||||
// internal/textutil/truncate_test.go (TestTruncateRunes). The
|
||||
// integration-level coverage that exercises the agent_message_writer
|
||||
// path with non-ASCII content is TestAgentMessageWriter_Send_NonASCIIMessagePersists
|
||||
// below.
|
||||
|
||||
// TestAgentMessageWriter_Send_NonASCIIMessagePersists pins the end-to-end
|
||||
// path for non-ASCII messages — the original reno-stars regression
|
||||
|
||||
@ -10,6 +10,7 @@ import (
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||
"github.com/gin-gonic/gin"
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
@ -167,7 +168,7 @@ func (h *DelegationHandler) Delegate(c *gin.Context) {
|
||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationSent), sourceID, map[string]interface{}{
|
||||
"delegation_id": delegationID,
|
||||
"target_id": body.TargetID,
|
||||
"task_preview": truncate(body.Task, 100),
|
||||
"task_preview": textutil.TruncateBytes(body.Task, 100),
|
||||
})
|
||||
|
||||
resp := gin.H{
|
||||
@ -407,7 +408,7 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s
|
||||
if _, err := db.DB.ExecContext(ctx, `
|
||||
INSERT INTO activity_logs (workspace_id, activity_type, method, source_id, target_id, summary, response_body, status)
|
||||
VALUES ($1, 'delegation', 'delegate_result', $2, $3, $4, $5::jsonb, 'completed')
|
||||
`, sourceID, sourceID, targetID, "Delegation completed ("+truncate(responseText, 80)+")", string(respJSON)); err != nil {
|
||||
`, sourceID, sourceID, targetID, "Delegation completed ("+textutil.TruncateBytes(responseText, 80)+")", string(respJSON)); err != nil {
|
||||
log.Printf("Delegation %s: failed to insert success log: %v", delegationID, err)
|
||||
}
|
||||
|
||||
@ -423,7 +424,7 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s
|
||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationComplete), sourceID, map[string]interface{}{
|
||||
"delegation_id": delegationID,
|
||||
"target_id": targetID,
|
||||
"response_preview": truncate(responseText, 200),
|
||||
"response_preview": textutil.TruncateBytes(responseText, 200),
|
||||
})
|
||||
// RFC #2829 PR-2 result-push (see UpdateStatus for rationale).
|
||||
pushDelegationResultToInbox(ctx, sourceID, delegationID, "completed", responseText, "")
|
||||
@ -506,7 +507,7 @@ func (h *DelegationHandler) Record(c *gin.Context) {
|
||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationSent), sourceID, map[string]interface{}{
|
||||
"delegation_id": body.DelegationID,
|
||||
"target_id": body.TargetID,
|
||||
"task_preview": truncate(body.Task, 100),
|
||||
"task_preview": textutil.TruncateBytes(body.Task, 100),
|
||||
})
|
||||
|
||||
c.JSON(http.StatusAccepted, gin.H{
|
||||
@ -555,12 +556,12 @@ func (h *DelegationHandler) UpdateStatus(c *gin.Context) {
|
||||
if _, err := db.DB.ExecContext(ctx, `
|
||||
INSERT INTO activity_logs (workspace_id, activity_type, method, source_id, summary, response_body, status)
|
||||
VALUES ($1, 'delegation', 'delegate_result', $2, $3, $4::jsonb, 'completed')
|
||||
`, sourceID, sourceID, "Delegation completed ("+truncate(body.ResponsePreview, 80)+")", string(respJSON)); err != nil {
|
||||
`, sourceID, sourceID, "Delegation completed ("+textutil.TruncateBytes(body.ResponsePreview, 80)+")", string(respJSON)); err != nil {
|
||||
log.Printf("Delegation UpdateStatus: result insert failed for %s: %v", delegationID, err)
|
||||
}
|
||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationComplete), sourceID, map[string]interface{}{
|
||||
"delegation_id": delegationID,
|
||||
"response_preview": truncate(body.ResponsePreview, 200),
|
||||
"response_preview": textutil.TruncateBytes(body.ResponsePreview, 200),
|
||||
})
|
||||
// RFC #2829 PR-2 result-push: when the gate is on, also write an
|
||||
// a2a_receive row so the caller's inbox poller surfaces this to
|
||||
@ -626,7 +627,7 @@ func (h *DelegationHandler) ListDelegations(c *gin.Context) {
|
||||
entry["error"] = errorDetail
|
||||
}
|
||||
if responseBody != "" {
|
||||
entry["response_preview"] = truncate(responseBody, 300)
|
||||
entry["response_preview"] = textutil.TruncateBytes(responseBody, 300)
|
||||
}
|
||||
delegations = append(delegations, entry)
|
||||
}
|
||||
@ -727,9 +728,3 @@ func extractResponseText(body []byte) string {
|
||||
return string(body)
|
||||
}
|
||||
|
||||
func truncate(s string, max int) string {
|
||||
if len(s) <= max {
|
||||
return s
|
||||
}
|
||||
return s[:max] + "..."
|
||||
}
|
||||
|
||||
@ -8,6 +8,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||
)
|
||||
|
||||
// delegation_ledger.go — durable per-task ledger for A2A delegation
|
||||
@ -50,40 +51,15 @@ func NewDelegationLedger(handle *sql.DB) *DelegationLedger {
|
||||
return &DelegationLedger{db: handle}
|
||||
}
|
||||
|
||||
// truncatePreview caps stored preview at 4KB. The full prompt/response is
|
||||
// already in activity_logs.{request,response}_body — this is the at-a-glance
|
||||
// view for the dashboard, not a forensic record.
|
||||
// previewCap caps stored preview at 4KB. The full prompt/response is
|
||||
// already in activity_logs.{request,response}_body — this is the
|
||||
// at-a-glance view for the dashboard, not a forensic record.
|
||||
//
|
||||
// Rune-safe: previous byte-slice form (s[:previewCap]) split on a byte
|
||||
// boundary, which on a multi-byte codepoint at byte 4096 produced
|
||||
// invalid UTF-8 — Postgres JSONB rejects → ledger row not inserted →
|
||||
// audit gap. Issue #2962. Walks the string by rune, stops at the last
|
||||
// rune-boundary index that fits inside the cap. ASCII-only strings hit
|
||||
// the cap exactly; CJK/emoji strings stop slightly under the cap,
|
||||
// never over.
|
||||
//
|
||||
// Mirrors the truncatePreviewRunes fix from agent_message_writer.go
|
||||
// (#2959). Both call sites should consume a shared helper after both
|
||||
// fixes have landed — followup deduplication tracked in #2962's body.
|
||||
// Truncation goes through textutil.TruncateBytesNoMarker so it's
|
||||
// rune-safe (#2026 / #2959 / #2962 bug class: byte-slice mid-codepoint
|
||||
// → Postgres JSONB rejects → silent INSERT failure → audit gap).
|
||||
const previewCap = 4096
|
||||
|
||||
func truncatePreview(s string) string {
|
||||
if len(s) <= previewCap {
|
||||
return s
|
||||
}
|
||||
// Range over a string yields rune-boundary byte indices. Walk
|
||||
// until the next index would exceed previewCap; the previous
|
||||
// index is the safe truncation point.
|
||||
end := 0
|
||||
for i := range s {
|
||||
if i > previewCap {
|
||||
break
|
||||
}
|
||||
end = i
|
||||
}
|
||||
return s[:end]
|
||||
}
|
||||
|
||||
// InsertOpts is the agent's record-of-intent. Caller, callee, task preview,
|
||||
// and the chosen delegation_id are required; idempotency_key is optional.
|
||||
type InsertOpts struct {
|
||||
@ -118,7 +94,7 @@ func (l *DelegationLedger) Insert(ctx context.Context, opts InsertOpts) {
|
||||
) VALUES ($1, $2, $3, $4, 'queued', $5, $6)
|
||||
ON CONFLICT (delegation_id) DO NOTHING
|
||||
`, opts.DelegationID, opts.CallerID, opts.CalleeID,
|
||||
truncatePreview(opts.TaskPreview), deadline, idemArg)
|
||||
textutil.TruncateBytesNoMarker(opts.TaskPreview, previewCap), deadline, idemArg)
|
||||
if err != nil {
|
||||
log.Printf("delegation_ledger Insert(%s): %v", opts.DelegationID, err)
|
||||
}
|
||||
@ -197,7 +173,7 @@ func (l *DelegationLedger) SetStatus(ctx context.Context,
|
||||
result_preview = NULLIF($4, ''),
|
||||
updated_at = now()
|
||||
WHERE delegation_id = $1
|
||||
`, delegationID, status, errorDetail, truncatePreview(resultPreview))
|
||||
`, delegationID, status, errorDetail, textutil.TruncateBytesNoMarker(resultPreview, previewCap))
|
||||
return err
|
||||
}
|
||||
|
||||
|
||||
@ -2,6 +2,7 @@ package handlers
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql/driver"
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
@ -74,15 +75,20 @@ func TestLedgerInsert_TruncatesOversizedPreview(t *testing.T) {
|
||||
mock := setupTestDB(t)
|
||||
l := NewDelegationLedger(nil)
|
||||
|
||||
huge := strings.Repeat("x", 10_000) // > previewCap
|
||||
// 4096 / 3 = 1365 runes; +10 for margin so we cross the cap.
|
||||
// '世' is 3 bytes in UTF-8 (worst case for byte-cap rune walking).
|
||||
huge := strings.Repeat("世", (previewCap/3)+10)
|
||||
if len(huge) <= previewCap {
|
||||
t.Fatalf("test setup: input too short (%d bytes) — must exceed previewCap=%d", len(huge), previewCap)
|
||||
}
|
||||
|
||||
mock.ExpectExec(`INSERT INTO delegations`).
|
||||
WithArgs(
|
||||
"deleg-big",
|
||||
"c", "ca",
|
||||
sqlmock.AnyArg(), // truncated preview — verify length below via custom matcher
|
||||
sqlmock.AnyArg(),
|
||||
sqlmock.AnyArg(),
|
||||
capValidUTF8Matcher{cap: previewCap}, // truncated preview must fit cap AND be valid UTF-8
|
||||
sqlmock.AnyArg(), // deadline
|
||||
sqlmock.AnyArg(), // idempotency_key
|
||||
).
|
||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||
|
||||
@ -97,87 +103,28 @@ func TestLedgerInsert_TruncatesOversizedPreview(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- truncatePreview unit ----------
|
||||
// capValidUTF8Matcher pins #2962 at the integration boundary: the
|
||||
// preview that lands in the INSERT MUST be valid UTF-8 (else Postgres
|
||||
// JSONB rejects → silent audit gap) AND fit within the byte cap. Pre-
|
||||
// migration this would have asserted on the corrupted "世" mid-codepoint
|
||||
// byte slice; post-migration it asserts the truncated preview is a
|
||||
// clean rune-aligned prefix.
|
||||
type capValidUTF8Matcher struct{ cap int }
|
||||
|
||||
func TestTruncatePreview_UnderCap(t *testing.T) {
|
||||
in := "short"
|
||||
if got := truncatePreview(in); got != in {
|
||||
t.Errorf("under-cap should passthrough; got %q", got)
|
||||
func (m capValidUTF8Matcher) Match(v driver.Value) bool {
|
||||
s, ok := v.(string)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
return len(s) <= m.cap && utf8.ValidString(s)
|
||||
}
|
||||
|
||||
func TestTruncatePreview_OverCapTruncatesAtBoundary(t *testing.T) {
|
||||
in := strings.Repeat("a", previewCap+100)
|
||||
got := truncatePreview(in)
|
||||
if len(got) != previewCap {
|
||||
t.Errorf("expected len=%d got len=%d", previewCap, len(got))
|
||||
}
|
||||
}
|
||||
|
||||
func TestTruncatePreview_ExactlyAtCap(t *testing.T) {
|
||||
in := strings.Repeat("a", previewCap)
|
||||
got := truncatePreview(in)
|
||||
if got != in {
|
||||
t.Errorf("at-cap should passthrough unchanged")
|
||||
}
|
||||
}
|
||||
|
||||
// TestTruncatePreview_NeverProducesInvalidUTF8 — pins #2962. The old
|
||||
// byte-slice implementation (s[:previewCap]) split on a byte boundary,
|
||||
// so a multi-byte codepoint straddling byte 4096 produced invalid
|
||||
// UTF-8 → Postgres JSONB rejects → ledger row not inserted → audit
|
||||
// gap. Test feeds a CJK / emoji-padded string longer than previewCap
|
||||
// and asserts utf8.ValidString on the result.
|
||||
func TestTruncatePreview_NeverProducesInvalidUTF8(t *testing.T) {
|
||||
// Build a string of '世' (3 bytes per rune in UTF-8) that's just
|
||||
// past the cap. With the old implementation, the slice at byte
|
||||
// previewCap would land mid-rune and ValidString would fail.
|
||||
// With the rune-aware implementation, the result is always valid
|
||||
// UTF-8 even if the byte length is < previewCap.
|
||||
rune3 := "世" // U+4E16, 3 bytes
|
||||
// Need at least previewCap/3 + 1 runes so we cross the cap with
|
||||
// margin to spare.
|
||||
in := strings.Repeat(rune3, (previewCap/3)+10)
|
||||
if len(in) <= previewCap {
|
||||
t.Fatalf("test setup: input too short (%d bytes) — must exceed previewCap=%d", len(in), previewCap)
|
||||
}
|
||||
got := truncatePreview(in)
|
||||
if !utf8.ValidString(got) {
|
||||
t.Errorf("truncatePreview produced invalid UTF-8 — JSONB will reject this row. len(got)=%d", len(got))
|
||||
}
|
||||
if len(got) > previewCap {
|
||||
t.Errorf("truncatePreview exceeded cap: len(got)=%d > previewCap=%d", len(got), previewCap)
|
||||
}
|
||||
// Defense-in-depth: the result should also be a clean rune
|
||||
// prefix of the input — not some garbled sequence.
|
||||
if !strings.HasPrefix(in, got) {
|
||||
t.Errorf("truncatePreview should return a prefix of the input")
|
||||
}
|
||||
}
|
||||
|
||||
// TestTruncatePreview_MultiByteAtBoundary — most-targeted regression.
|
||||
// Feeds an input where the cap byte falls EXACTLY in the middle of a
|
||||
// 3-byte codepoint. Pre-fix, this is the case that produces invalid
|
||||
// UTF-8; post-fix, the truncate stops at the previous rune boundary.
|
||||
func TestTruncatePreview_MultiByteAtBoundary(t *testing.T) {
|
||||
// Build a string that's `previewCap-1` ASCII bytes followed by
|
||||
// '世' (3 bytes). Total = previewCap + 2. The old impl would
|
||||
// slice at byte previewCap, landing inside the '世' codepoint.
|
||||
prefix := strings.Repeat("a", previewCap-1)
|
||||
in := prefix + "世"
|
||||
if len(in) != previewCap+2 {
|
||||
t.Fatalf("test setup: expected len %d, got %d", previewCap+2, len(in))
|
||||
}
|
||||
got := truncatePreview(in)
|
||||
if !utf8.ValidString(got) {
|
||||
t.Errorf("truncatePreview produced invalid UTF-8 at the multi-byte boundary case")
|
||||
}
|
||||
// Result should be exactly the ASCII prefix — '世' was past
|
||||
// the cap so it must be dropped entirely.
|
||||
if got != prefix {
|
||||
t.Errorf("expected exact ASCII prefix, got %q (len=%d)", got[len(got)-10:], len(got))
|
||||
}
|
||||
}
|
||||
// Helper-level truncation tests now live in
|
||||
// internal/textutil/truncate_test.go. The integration-level path
|
||||
// (TestLedgerInsert_TruncatesOversizedPreview above) still exercises
|
||||
// the previewCap boundary through the SQL write so a regression in
|
||||
// the wiring (wrong cap, wrong helper, missing call) would still go
|
||||
// red here.
|
||||
|
||||
// ---------- SetStatus lifecycle ----------
|
||||
|
||||
|
||||
@ -35,6 +35,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/contract"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -340,7 +341,7 @@ func decodeError(resp *http.Response) error {
|
||||
// have rather than dropping it.
|
||||
return &contract.Error{
|
||||
Code: httpStatusToCode(resp.StatusCode),
|
||||
Message: fmt.Sprintf("status %d: %s", resp.StatusCode, truncate(string(body), 256)),
|
||||
Message: fmt.Sprintf("status %d: %s", resp.StatusCode, textutil.TruncateBytes(string(body), 256)),
|
||||
}
|
||||
}
|
||||
return &e
|
||||
@ -359,12 +360,7 @@ func httpStatusToCode(status int) contract.ErrorCode {
|
||||
}
|
||||
}
|
||||
|
||||
func truncate(s string, n int) string {
|
||||
if len(s) <= n {
|
||||
return s
|
||||
}
|
||||
return s[:n] + "…"
|
||||
}
|
||||
// truncation moved to internal/textutil.TruncateBytes (#2962 SSOT).
|
||||
|
||||
// --- Circuit breaker ---
|
||||
|
||||
|
||||
@ -499,14 +499,10 @@ func TestHttpStatusToCode(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTruncate(t *testing.T) {
|
||||
if got := truncate("short", 10); got != "short" {
|
||||
t.Errorf("got %q", got)
|
||||
}
|
||||
if got := truncate(strings.Repeat("a", 300), 10); !strings.HasSuffix(got, "…") {
|
||||
t.Errorf("expected ellipsis: %q", got)
|
||||
}
|
||||
}
|
||||
// Truncate moved to internal/textutil — coverage lives in
|
||||
// internal/textutil/truncate_test.go (TestTruncateBytes_RuneBoundary).
|
||||
// memory/client just calls it as a wire-shape helper for error
|
||||
// messages; no client-specific behavior to pin here.
|
||||
|
||||
// --- Circuit breaker ---
|
||||
|
||||
|
||||
@ -17,6 +17,7 @@ import (
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/metrics"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/supervised"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -522,7 +523,7 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
|
||||
"schedule_id": sched.ID,
|
||||
"schedule_name": sched.Name,
|
||||
"cron_expr": sched.CronExpr,
|
||||
"prompt": sanitizeUTF8(truncate(sched.Prompt, 200)),
|
||||
"prompt": sanitizeUTF8(textutil.TruncateBytes(sched.Prompt, 200)),
|
||||
})
|
||||
// #152: persist lastError into error_detail on the activity_logs row
|
||||
// so GET /workspaces/:id/schedules/:id/history can surface why a run
|
||||
@ -807,27 +808,10 @@ func isEmptyResponse(body []byte) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// truncate shortens s to at most maxLen bytes, appending "..." if truncated.
|
||||
// #2026: UTF-8 safe — byte-slicing at maxLen-3 would split multi-byte runes
|
||||
// (observed: U+2026 `…` = 0xe2 0x80 0xa6, sliced mid-char, concatenated with
|
||||
// "..." producing 0xe2 0x80 0x2e — rejected by Postgres as invalid UTF-8,
|
||||
// which wedged the activity_logs INSERT with no deadline and stalled the
|
||||
// scheduler).
|
||||
func truncate(s string, maxLen int) string {
|
||||
if len(s) <= maxLen {
|
||||
return s
|
||||
}
|
||||
cut := maxLen - 3
|
||||
if cut < 0 {
|
||||
cut = 0
|
||||
}
|
||||
// Back up to a rune boundary — utf8.RuneStart returns true for any
|
||||
// non-continuation byte (ASCII, or the lead byte of a multi-byte rune).
|
||||
for cut > 0 && !utf8.RuneStart(s[cut]) {
|
||||
cut--
|
||||
}
|
||||
return s[:cut] + "..."
|
||||
}
|
||||
// truncation moved to internal/textutil.TruncateBytes (#2962 SSOT).
|
||||
// The original #2026 fix lives in textutil's package docs as canonical
|
||||
// prior art. Ellipsis was previously "..." (3 ASCII bytes); the SSOT
|
||||
// uses "…" (3 UTF-8 bytes) — same byte budget, single-glyph display.
|
||||
|
||||
// short returns up to n leading characters of s without panicking when s is
|
||||
// shorter than n. Used to safely display UUID prefixes in log lines where
|
||||
|
||||
@ -10,6 +10,7 @@ import (
|
||||
sqlmock "github.com/DATA-DOG/go-sqlmock"
|
||||
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||
)
|
||||
|
||||
// errDBDown is a sentinel error used by tests to simulate a DB connection failure.
|
||||
@ -618,7 +619,7 @@ func TestTruncate_utf8Safe_regression2026(t *testing.T) {
|
||||
filler += "a"
|
||||
}
|
||||
input := filler + "…xxx" // 195 ASCII + 3-byte rune + 3 trailing
|
||||
out := truncate(input, 200)
|
||||
out := textutil.TruncateBytes(input, 200)
|
||||
|
||||
if !utf8.ValidString(out) {
|
||||
t.Fatalf("truncate produced invalid UTF-8: %x", []byte(out))
|
||||
|
||||
130
workspace-server/internal/textutil/truncate.go
Normal file
130
workspace-server/internal/textutil/truncate.go
Normal file
@ -0,0 +1,130 @@
|
||||
// Package textutil provides string-handling helpers that respect UTF-8
|
||||
// rune boundaries.
|
||||
//
|
||||
// Why this package exists
|
||||
// -----------------------
|
||||
// `s[:max]` truncates by BYTES; for any string with a multi-byte
|
||||
// codepoint at byte `max` (CJK, emoji, accented Latin), the slice
|
||||
// produces invalid UTF-8. Postgres `text` and `jsonb` columns reject
|
||||
// invalid UTF-8 with `invalid byte sequence for encoding "UTF8"`,
|
||||
// which silently fails the INSERT and holds the surrounding tx open
|
||||
// — a class of audit-gap that has bitten this codebase three times
|
||||
// (scheduler.go #2026, agent_message_writer.go #2959,
|
||||
// delegation_ledger.go #2962). Six per-package helpers had
|
||||
// independently re-implemented this logic with varying correctness;
|
||||
// this package is the single source of truth.
|
||||
//
|
||||
// Use sites
|
||||
// ---------
|
||||
// - DB writes whose column is bytes-bounded (jsonb preview field,
|
||||
// varchar(N)): TruncateBytes / TruncateBytesNoMarker.
|
||||
// - UI summaries whose cap is in display chars, not bytes:
|
||||
// TruncateRunes.
|
||||
//
|
||||
// All functions guarantee `utf8.ValidString(out) == true` for any
|
||||
// `s` where `utf8.ValidString(s) == true`. Inputs that are already
|
||||
// invalid UTF-8 should be sanitized at the trust boundary (e.g. via
|
||||
// `strings.ToValidUTF8`); this package does not silently fix
|
||||
// upstream invalid input.
|
||||
package textutil
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// ellipsis is the truncation marker. U+2026 HORIZONTAL ELLIPSIS —
|
||||
// 3 bytes in UTF-8, 1 rune, 1 display column. Standardized across
|
||||
// the codebase to avoid the "..." (3 ASCII chars) vs "…" (1 char)
|
||||
// inconsistency the per-package helpers had drifted into.
|
||||
const ellipsis = "…"
|
||||
|
||||
// TruncateBytes returns s if `len(s) <= maxBytes`, otherwise returns
|
||||
// the longest rune-aligned prefix of s that fits in `maxBytes - 3`
|
||||
// bytes followed by the ellipsis marker. The returned string is
|
||||
// always at most `maxBytes` bytes long.
|
||||
//
|
||||
// Example: TruncateBytes("你好世界你好", 10) returns "你好世…" (9 bytes)
|
||||
// — three "你好" runes (each 3 bytes = 9 bytes) plus "…" (3 bytes)
|
||||
// would be 12 bytes, so we walk back to "你好" (6 bytes) + "…" (3) = 9.
|
||||
//
|
||||
// Edge cases:
|
||||
// - maxBytes <= 0: returns "" (no room even for input or marker)
|
||||
// - maxBytes < len(ellipsis): returns "" (can't add marker without
|
||||
// exceeding cap, and we won't return a marker-less truncation
|
||||
// here — caller wanted a marker; use TruncateBytesNoMarker if
|
||||
// they don't)
|
||||
// - s contains invalid UTF-8: continuation bytes are walked over
|
||||
// same as valid runes; the result preserves the (invalid) input
|
||||
// bytes up to the truncation point. Caller is responsible for
|
||||
// pre-sanitizing if Postgres validity is required.
|
||||
func TruncateBytes(s string, maxBytes int) string {
|
||||
if len(s) <= maxBytes {
|
||||
return s
|
||||
}
|
||||
if maxBytes < len(ellipsis) {
|
||||
return ""
|
||||
}
|
||||
// Reserve room for the marker, then walk back to the nearest
|
||||
// rune boundary at or below the cut point.
|
||||
cut := maxBytes - len(ellipsis)
|
||||
for cut > 0 && !utf8.RuneStart(s[cut]) {
|
||||
cut--
|
||||
}
|
||||
return s[:cut] + ellipsis
|
||||
}
|
||||
|
||||
// TruncateBytesNoMarker returns s if `len(s) <= maxBytes`, otherwise
|
||||
// returns the longest rune-aligned prefix of s that fits in
|
||||
// `maxBytes` bytes. No marker is appended — useful when the caller's
|
||||
// storage already conveys "preview" / "snippet" semantics and an
|
||||
// extra ellipsis would push the result over a hard column cap.
|
||||
//
|
||||
// Example: TruncateBytesNoMarker("hello world", 5) returns "hello".
|
||||
//
|
||||
// Edge case: maxBytes <= 0 returns "".
|
||||
func TruncateBytesNoMarker(s string, maxBytes int) string {
|
||||
if len(s) <= maxBytes {
|
||||
return s
|
||||
}
|
||||
if maxBytes <= 0 {
|
||||
return ""
|
||||
}
|
||||
cut := maxBytes
|
||||
for cut > 0 && !utf8.RuneStart(s[cut]) {
|
||||
cut--
|
||||
}
|
||||
return s[:cut]
|
||||
}
|
||||
|
||||
// TruncateRunes returns s if it has at most maxRunes runes, otherwise
|
||||
// returns the first maxRunes runes followed by the ellipsis marker.
|
||||
// Use this when the cap is in user-visible characters (UI summary,
|
||||
// activity feed line) rather than bytes (DB column).
|
||||
//
|
||||
// Example: TruncateRunes("你好世界你好", 3) returns "你好世…" — three
|
||||
// runes plus the marker, regardless of the resulting byte count.
|
||||
//
|
||||
// Edge case: maxRunes <= 0 returns "" (caller asked for no content).
|
||||
func TruncateRunes(s string, maxRunes int) string {
|
||||
if maxRunes <= 0 {
|
||||
return ""
|
||||
}
|
||||
// Fast path: if every byte is a single-byte rune, the byte-length
|
||||
// upper-bounds the rune count. This avoids a runes alloc for the
|
||||
// common ASCII case where the input fits.
|
||||
if len(s) <= maxRunes {
|
||||
return s
|
||||
}
|
||||
// Walk by rune boundaries; stop at the (maxRunes+1)-th rune so we
|
||||
// know the cut point and that truncation is needed.
|
||||
count := 0
|
||||
for i := range s {
|
||||
if count == maxRunes {
|
||||
return s[:i] + ellipsis
|
||||
}
|
||||
count++
|
||||
}
|
||||
// Reachable when the byte count exceeded maxRunes but the actual
|
||||
// rune count didn't (e.g. all single-byte runes that just happen
|
||||
// to be more than maxRunes). The fast path catches len(s) <=
|
||||
// maxRunes; this catches maxRunes < runeCount(s) <= len(s).
|
||||
return s
|
||||
}
|
||||
222
workspace-server/internal/textutil/truncate_test.go
Normal file
222
workspace-server/internal/textutil/truncate_test.go
Normal file
@ -0,0 +1,222 @@
|
||||
package textutil
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// TestTruncateBytes_RuneBoundary pins the byte-cap, marker-bearing
|
||||
// truncation path. Every case asserts both:
|
||||
// 1. the exact expected output (so a refactor that flips ellipsis or
|
||||
// drops a rune is caught), and
|
||||
// 2. utf8.ValidString on the output (the invariant that the bug class
|
||||
// in #2026/#2959/#2962 violated by slicing mid-codepoint).
|
||||
//
|
||||
// Per memory feedback_assert_exact_not_substring.md, asserts are exact
|
||||
// equality, not substring matches.
|
||||
func TestTruncateBytes_RuneBoundary(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
in string
|
||||
maxBytes int
|
||||
want string
|
||||
}{
|
||||
// Under-cap: returns input verbatim.
|
||||
{"empty", "", 10, ""},
|
||||
{"under-cap ASCII", "hi", 10, "hi"},
|
||||
{"exactly-at-cap ASCII", "hello", 5, "hello"},
|
||||
{"under-cap CJK", "你好", 10, "你好"}, // 6 bytes
|
||||
{"exactly-at-cap CJK", "你好", 6, "你好"},
|
||||
|
||||
// Over-cap ASCII: trims to (maxBytes - 3) bytes + "…".
|
||||
{"over-cap ASCII", "abcdefghij", 6, "abc…"},
|
||||
|
||||
// Over-cap CJK where cut would land mid-codepoint. The
|
||||
// pre-fix bug shape: 7 - 3 = 4, but byte 4 is mid-"好"
|
||||
// (好 is bytes 3..5 of "你好世界"). Walking back to byte 3
|
||||
// (start of 好 — wait, that IS the start). Actually 你=0..2,
|
||||
// 好=3..5, 世=6..8, 界=9..11. Cut=4, walk back to 3 (start
|
||||
// of 好), then s[:3]="你", + "…" = "你…" (3+3=6 bytes ≤ 7).
|
||||
{"over-cap CJK lands mid-codepoint", "你好世界", 7, "你…"},
|
||||
|
||||
// Over-cap CJK where cut lands exactly on rune boundary.
|
||||
// 9 - 3 = 6, byte 6 is start of 世. Walk-back is no-op.
|
||||
// s[:6]="你好" + "…" = "你好…" (9 bytes).
|
||||
{"over-cap CJK rune-aligned", "你好世界", 9, "你好…"},
|
||||
|
||||
// Emoji: 😀 is 4 bytes (U+1F600). 7 - 3 = 4, byte 4 is start
|
||||
// of second 😀 — walk-back no-op. s[:4]="😀" + "…" = "😀…".
|
||||
{"over-cap emoji", "😀😀😀", 7, "😀…"},
|
||||
|
||||
// Mixed ASCII + CJK. "ab你好世界": a(1) b(1) 你(3) 好(3) 世(3) 界(3) = 14 bytes.
|
||||
// maxBytes=8, 8-3=5. byte 5 is mid-好. Walk back to start of 好 = byte 5? Let me
|
||||
// recompute: a=0, b=1, 你=2..4, 好=5..7, 世=8..10. Byte 5 IS start of 好.
|
||||
// Walk-back keeps cut at 5. s[:5] = "ab你" + "…" = "ab你…" (8 bytes).
|
||||
{"mixed prefix ASCII over-cap CJK", "ab你好世界", 8, "ab你…"},
|
||||
|
||||
// Pathological: maxBytes too small to even fit the marker.
|
||||
{"cap below ellipsis len", "hello", 2, ""},
|
||||
{"cap zero", "hello", 0, ""},
|
||||
{"cap negative", "hello", -1, ""},
|
||||
|
||||
// Cap exactly == ellipsis len: no room for content, but
|
||||
// the marker fits. This returns "" (cut = 0, s[:0] = "").
|
||||
{"cap equals ellipsis len", "hello", 3, "…"},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
got := TruncateBytes(c.in, c.maxBytes)
|
||||
if got != c.want {
|
||||
t.Errorf("TruncateBytes(%q, %d) = %q, want %q", c.in, c.maxBytes, got, c.want)
|
||||
}
|
||||
if !utf8.ValidString(got) {
|
||||
t.Errorf("TruncateBytes(%q, %d) returned invalid UTF-8: %q", c.in, c.maxBytes, got)
|
||||
}
|
||||
// Output never exceeds the byte cap (when one is set).
|
||||
if c.maxBytes > 0 && len(got) > c.maxBytes {
|
||||
t.Errorf("TruncateBytes(%q, %d) overflowed cap: len(out)=%d > %d",
|
||||
c.in, c.maxBytes, len(got), c.maxBytes)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestTruncateBytesNoMarker pins the marker-less variant. Same
|
||||
// boundary handling as TruncateBytes but no ellipsis cost — the cut
|
||||
// happens at maxBytes itself, walking back only if that lands
|
||||
// mid-codepoint.
|
||||
func TestTruncateBytesNoMarker(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
in string
|
||||
maxBytes int
|
||||
want string
|
||||
}{
|
||||
{"empty", "", 10, ""},
|
||||
{"under-cap ASCII", "hi", 10, "hi"},
|
||||
{"exactly-at-cap ASCII", "hello", 5, "hello"},
|
||||
{"over-cap ASCII", "abcdefghij", 5, "abcde"},
|
||||
|
||||
// Over-cap CJK rune-aligned: "你好世界", maxBytes=6, byte 6 is start of 世.
|
||||
// s[:6]="你好" — perfect cut.
|
||||
{"over-cap CJK rune-aligned", "你好世界", 6, "你好"},
|
||||
|
||||
// Over-cap CJK mid-codepoint: maxBytes=4, byte 4 is mid-好.
|
||||
// Walk back to byte 3 (start of 好), s[:3]="你".
|
||||
{"over-cap CJK mid-codepoint", "你好世界", 4, "你"},
|
||||
|
||||
// Emoji: maxBytes=5, "😀😀" is bytes 0..3 then 4..7. byte 5 is mid-second-😀.
|
||||
// Walk back to byte 4 (start of second 😀), s[:4]="😀".
|
||||
{"over-cap emoji", "😀😀", 5, "😀"},
|
||||
|
||||
// Edge: cap zero or negative → "".
|
||||
{"cap zero", "hello", 0, ""},
|
||||
{"cap negative", "hello", -1, ""},
|
||||
|
||||
// Cap = 1 and first rune is multi-byte: walk-back to 0, return "".
|
||||
{"cap one with leading CJK", "你hello", 1, ""},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
got := TruncateBytesNoMarker(c.in, c.maxBytes)
|
||||
if got != c.want {
|
||||
t.Errorf("TruncateBytesNoMarker(%q, %d) = %q, want %q", c.in, c.maxBytes, got, c.want)
|
||||
}
|
||||
if !utf8.ValidString(got) {
|
||||
t.Errorf("TruncateBytesNoMarker(%q, %d) returned invalid UTF-8: %q", c.in, c.maxBytes, got)
|
||||
}
|
||||
if c.maxBytes > 0 && len(got) > c.maxBytes {
|
||||
t.Errorf("TruncateBytesNoMarker(%q, %d) overflowed cap: len(out)=%d > %d",
|
||||
c.in, c.maxBytes, len(got), c.maxBytes)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestTruncateRunes pins the rune-cap variant. The key contract is
|
||||
// that maxRunes counts user-visible characters (Go runes, which line
|
||||
// up with Unicode codepoints), not bytes — so "你好世界" with
|
||||
// maxRunes=2 returns "你好…", regardless of the resulting byte count.
|
||||
func TestTruncateRunes(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
in string
|
||||
maxRunes int
|
||||
want string
|
||||
}{
|
||||
{"empty", "", 5, ""},
|
||||
{"under-cap ASCII", "hi", 5, "hi"},
|
||||
{"exactly-at-cap ASCII", "hello", 5, "hello"},
|
||||
{"over-cap ASCII", "abcdefghij", 5, "abcde…"},
|
||||
|
||||
{"under-cap CJK", "你好", 5, "你好"},
|
||||
{"exactly-at-cap CJK", "你好", 2, "你好"},
|
||||
|
||||
// Over-cap CJK: maxRunes=3, expect first 3 runes + marker.
|
||||
{"over-cap CJK", "你好世界你好", 3, "你好世…"},
|
||||
|
||||
// Emoji is one rune per glyph in Go (no ZWJ here).
|
||||
{"over-cap emoji", "😀😀😀😀😀", 2, "😀😀…"},
|
||||
|
||||
// Mixed: maxRunes=3 of "ab你好世界" → "ab你…".
|
||||
{"mixed prefix", "ab你好世界", 3, "ab你…"},
|
||||
|
||||
// Edge: maxRunes 0 / negative → "".
|
||||
{"cap zero", "hello", 0, ""},
|
||||
{"cap negative", "hello", -1, ""},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
got := TruncateRunes(c.in, c.maxRunes)
|
||||
if got != c.want {
|
||||
t.Errorf("TruncateRunes(%q, %d) = %q, want %q", c.in, c.maxRunes, got, c.want)
|
||||
}
|
||||
if !utf8.ValidString(got) {
|
||||
t.Errorf("TruncateRunes(%q, %d) returned invalid UTF-8: %q", c.in, c.maxRunes, got)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestTruncate_FuzzInvariants stays as a property-style sanity check:
|
||||
// for any rune-valid input and any cap, the output is rune-valid and
|
||||
// (for byte-cap variants) within the cap. This catches off-by-one
|
||||
// regressions in cuts that slip past the table-test cases above.
|
||||
func TestTruncate_FuzzInvariants(t *testing.T) {
|
||||
inputs := []string{
|
||||
"",
|
||||
"a",
|
||||
"hello world",
|
||||
"你好世界",
|
||||
"😀😀😀",
|
||||
"ab你c好d世e界",
|
||||
"日本語の文字列",
|
||||
"🇺🇸🇯🇵", // flags: each is 2 codepoints (regional indicators)
|
||||
}
|
||||
for _, in := range inputs {
|
||||
for cap := -1; cap <= len(in)+5; cap++ {
|
||||
t.Run("", func(t *testing.T) {
|
||||
gotB := TruncateBytes(in, cap)
|
||||
if !utf8.ValidString(gotB) {
|
||||
t.Errorf("TruncateBytes(%q, %d) invalid UTF-8: %q", in, cap, gotB)
|
||||
}
|
||||
if cap > 0 && len(gotB) > cap {
|
||||
t.Errorf("TruncateBytes(%q, %d) overflowed: %q (%d bytes)", in, cap, gotB, len(gotB))
|
||||
}
|
||||
|
||||
gotN := TruncateBytesNoMarker(in, cap)
|
||||
if !utf8.ValidString(gotN) {
|
||||
t.Errorf("TruncateBytesNoMarker(%q, %d) invalid UTF-8: %q", in, cap, gotN)
|
||||
}
|
||||
if cap > 0 && len(gotN) > cap {
|
||||
t.Errorf("TruncateBytesNoMarker(%q, %d) overflowed: %q (%d bytes)", in, cap, gotN, len(gotN))
|
||||
}
|
||||
|
||||
gotR := TruncateRunes(in, cap)
|
||||
if !utf8.ValidString(gotR) {
|
||||
t.Errorf("TruncateRunes(%q, %d) invalid UTF-8: %q", in, cap, gotR)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user