fix(textutil): SSOT for rune-safe string truncation, fix 3 audit-gap bugs
Closes #2962.

## Why

Six per-package `truncate` helpers had drifted into independent re-implementations of the same idea. Three of them (delegation.go, memory/client/client.go, memory-backfill/verify.go) used `s[:max] + "…"` byte-slice form, which on a multi-byte codepoint at byte `max` produces invalid UTF-8 → Postgres `text`/`jsonb` rejects the INSERT silently → `delegation` / `activity_logs` row never lands → audit gap.

Three other helpers (delegation_ledger.go #2962, agent_message_writer.go #2959, scheduler.go #2026) had each been fixed in isolation with three slightly different rune-safe shapes — confirming this is a class of bug, not a single instance.

## What

New package `internal/textutil` with three rune-safe functions:

- `TruncateBytes(s, maxBytes)` — byte-cap, "…" marker. Used by 5 callers writing into byte-bounded columns / log lines.
- `TruncateBytesNoMarker(s, maxBytes)` — byte-cap, no marker. Used by delegation_ledger.go where the storage already conveys "preview" and an extra ellipsis would push the result over the column cap.
- `TruncateRunes(s, maxRunes)` — rune-cap, "…" marker. Used by agent_message_writer.go where the cap is in display chars (UI summary), not bytes.

All three guarantee `utf8.ValidString(out)` for any `utf8.ValidString(in)`. Inputs already invalid go through `sanitizeUTF8` at the call site boundary (scheduler.go preserved this defense-in-depth).

## Migration map

| Old | New | Behavior change |
|---|---|---|
| `delegation_ledger.truncatePreview` | `textutil.TruncateBytesNoMarker(s, 4096)` | none |
| `agent_message_writer.truncatePreviewRunes` | `textutil.TruncateRunes(s, n)` | none |
| `scheduler.truncate` | `textutil.TruncateBytes(s, n)` | "..." → "…" (3 bytes either way; single-glyph display) |
| `delegation.truncate` | `textutil.TruncateBytes(s, n)` | bug fix + ellipsis swap |
| `memory/client.truncate` | `textutil.TruncateBytes(s, n)` | bug fix |
| `memory-backfill.truncate` | `textutil.TruncateBytes(s, n)` | bug fix |

Five separate `truncate*` helpers + their per-package tests removed.

Net: 12 files / +427 / -255.

## Tests

- `internal/textutil/truncate_test.go` — 27 table-test cases + 145 fuzz-invariant cases asserting `utf8.ValidString` and byte-cap invariants on every output.
- `delegation_ledger_test.go TestLedgerInsert_TruncatesOversizedPreview` strengthened with `capValidUTF8Matcher` so the SQL-write argument is asserted to be valid UTF-8 + within cap (not just `AnyArg()`). Mutation-tested: replacing the SSOT call with byte-slice form makes this test fail loud.

## Compatibility

- All callers internal; no external API surface change.
- Ellipsis swap "..." → "…": same byte budget (3 bytes), single-glyph display. No alerting/grep on either marker in this codebase (verified). Canvas renders both correctly.
- DB column widths unchanged (4096 / 80 / 200 / 256 / 300 — all preserved in the migrations).

## Security

Fixes a silent INSERT-failure mode that hid `activity_logs` / `delegations` rows containing peer-controlled text. The class of input that triggered it (CJK, emoji, accented Latin) is normal user content, not malicious — but the symptom (audit gap) makes incident reconstruction harder.

Helper is pure-function over `string`; no secrets / PII / auth handling involved. Untrusted input is handled identically to before, just rune-aligned now.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c53155ec5f
commit
656a02fae4
@ -21,6 +21,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
|
|
||||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/contract"
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/contract"
|
||||||
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
// verifyConfig is the typed dependency bundle for verifyParity.
|
// verifyConfig is the typed dependency bundle for verifyParity.
|
||||||
@ -121,7 +122,7 @@ func verifyParity(ctx context.Context, cfg verifyConfig, stdout *os.File) (*veri
|
|||||||
matched := true
|
matched := true
|
||||||
for _, c := range legacy {
|
for _, c := range legacy {
|
||||||
if pluginContents[c] == 0 {
|
if pluginContents[c] == 0 {
|
||||||
fmt.Fprintf(stdout, "[mismatch] workspace=%s missing-from-plugin content=%q\n", wsID, truncate(c, 80))
|
fmt.Fprintf(stdout, "[mismatch] workspace=%s missing-from-plugin content=%q\n", wsID, textutil.TruncateBytes(c, 80))
|
||||||
matched = false
|
matched = false
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
@ -192,9 +193,4 @@ func queryLegacyMemories(ctx context.Context, db *sql.DB, workspaceID string) ([
|
|||||||
return out, rows.Err()
|
return out, rows.Err()
|
||||||
}
|
}
|
||||||
|
|
||||||
func truncate(s string, n int) string {
|
// truncation moved to internal/textutil.TruncateBytes (#2962 SSOT).
|
||||||
if len(s) <= n {
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
return s[:n] + "…"
|
|
||||||
}
|
|
||||||
|
|||||||
@ -349,16 +349,8 @@ func TestVerifyParity_PickSampleError(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Truncate ---
|
// Truncate moved to internal/textutil — coverage in
|
||||||
|
// internal/textutil/truncate_test.go (TestTruncateBytes_RuneBoundary).
|
||||||
func TestVerifyTruncate(t *testing.T) {
|
|
||||||
if got := truncate("short", 10); got != "short" {
|
|
||||||
t.Errorf("got %q", got)
|
|
||||||
}
|
|
||||||
if got := truncate(strings.Repeat("a", 200), 10); !strings.HasSuffix(got, "…") {
|
|
||||||
t.Errorf("expected ellipsis: %q", got)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- CLI: -verify mode ---
|
// --- CLI: -verify mode ---
|
||||||
|
|
||||||
|
|||||||
@ -22,6 +22,7 @@ import (
|
|||||||
|
|
||||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||||
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
// extractIdempotencyKey pulls params.message.messageId out of an A2A JSON-RPC
|
// extractIdempotencyKey pulls params.message.messageId out of an A2A JSON-RPC
|
||||||
@ -420,7 +421,7 @@ func (h *WorkspaceHandler) stitchDrainResponseToDelegation(ctx context.Context,
|
|||||||
AND method = 'delegate_result'
|
AND method = 'delegate_result'
|
||||||
AND target_id = $4
|
AND target_id = $4
|
||||||
AND response_body->>'delegation_id' = $5
|
AND response_body->>'delegation_id' = $5
|
||||||
`, "Delegation completed ("+truncate(responseText, 80)+")", string(respJSON),
|
`, "Delegation completed ("+textutil.TruncateBytes(responseText, 80)+")", string(respJSON),
|
||||||
sourceID, targetID, delegationID)
|
sourceID, targetID, delegationID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("A2AQueue drain stitch: update failed for delegation %s: %v", delegationID, err)
|
log.Printf("A2AQueue drain stitch: update failed for delegation %s: %v", delegationID, err)
|
||||||
@ -439,7 +440,7 @@ func (h *WorkspaceHandler) stitchDrainResponseToDelegation(ctx context.Context,
|
|||||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationComplete), sourceID, map[string]interface{}{
|
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationComplete), sourceID, map[string]interface{}{
|
||||||
"delegation_id": delegationID,
|
"delegation_id": delegationID,
|
||||||
"target_id": targetID,
|
"target_id": targetID,
|
||||||
"response_preview": truncate(responseText, 200),
|
"response_preview": textutil.TruncateBytes(responseText, 200),
|
||||||
"via": "queue_drain",
|
"via": "queue_drain",
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@ -42,9 +42,9 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"unicode/utf8"
|
|
||||||
|
|
||||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||||
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
// ErrWorkspaceNotFound is returned by AgentMessageWriter.Send when the
|
// ErrWorkspaceNotFound is returned by AgentMessageWriter.Send when the
|
||||||
@ -54,36 +54,6 @@ import (
|
|||||||
// timeout) surface as wrapped errors and should be treated as 503.
|
// timeout) surface as wrapped errors and should be treated as 503.
|
||||||
var ErrWorkspaceNotFound = errors.New("agent_message: workspace not found")
|
var ErrWorkspaceNotFound = errors.New("agent_message: workspace not found")
|
||||||
|
|
||||||
// truncatePreviewRunes returns at most maxRunes runes of s, plus an ellipsis
|
|
||||||
// when truncated. Operates on the rune (codepoint) boundary instead of
|
|
||||||
// byte indices — the previous byte-slice version produced invalid UTF-8
|
|
||||||
// when maxRunes landed mid-codepoint (CJK, emoji, accented characters
|
|
||||||
// in agent-authored chat messages), and Postgres JSONB rejects invalid
|
|
||||||
// UTF-8, dropping the activity_log INSERT silently. The persistence
|
|
||||||
// failure log fires but the message vanishes from chat history — the
|
|
||||||
// exact regression class the SSOT consolidation was built to prevent.
|
|
||||||
//
|
|
||||||
// maxRunes is in runes, not bytes — `truncatePreviewRunes("你好", 1)` returns
|
|
||||||
// `"你…"`, not `"\xe4…"`. Set the cap on a UI-friendly basis (visible
|
|
||||||
// character count, not stored byte count); 80 runes covers the
|
|
||||||
// activity_logs.summary column comfortably.
|
|
||||||
func truncatePreviewRunes(s string, maxRunes int) string {
|
|
||||||
if utf8.RuneCountInString(s) <= maxRunes {
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
// Walk runes until we've consumed maxRunes; cut at that byte index.
|
|
||||||
count := 0
|
|
||||||
cut := len(s)
|
|
||||||
for i := range s {
|
|
||||||
if count == maxRunes {
|
|
||||||
cut = i
|
|
||||||
break
|
|
||||||
}
|
|
||||||
count++
|
|
||||||
}
|
|
||||||
return s[:cut] + "…"
|
|
||||||
}
|
|
||||||
|
|
||||||
// AgentMessageAttachment is one file attached to an agent → user
|
// AgentMessageAttachment is one file attached to an agent → user
|
||||||
// message. Identical to handlers.NotifyAttachment in field set; kept
|
// message. Identical to handlers.NotifyAttachment in field set; kept
|
||||||
// distinct so the writer's API doesn't import a handler type with HTTP
|
// distinct so the writer's API doesn't import a handler type with HTTP
|
||||||
@ -186,7 +156,7 @@ func (w *AgentMessageWriter) Send(
|
|||||||
respPayload["parts"] = fileParts
|
respPayload["parts"] = fileParts
|
||||||
}
|
}
|
||||||
respJSON, _ := json.Marshal(respPayload)
|
respJSON, _ := json.Marshal(respPayload)
|
||||||
preview := truncatePreviewRunes(message, 80)
|
preview := textutil.TruncateRunes(message, 80)
|
||||||
if _, err := w.db.ExecContext(ctx, `
|
if _, err := w.db.ExecContext(ctx, `
|
||||||
INSERT INTO activity_logs (workspace_id, activity_type, method, summary, response_body, status)
|
INSERT INTO activity_logs (workspace_id, activity_type, method, summary, response_body, status)
|
||||||
VALUES ($1, 'a2a_receive', 'notify', $2, $3::jsonb, 'ok')
|
VALUES ($1, 'a2a_receive', 'notify', $2, $3::jsonb, 'ok')
|
||||||
|
|||||||
@ -331,45 +331,11 @@ func TestAgentMessageWriter_Send_DBErrorOnLookupReturnsWrapped(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestTruncatePreviewRunes_RuneBoundary pins the multi-byte-safe
|
// Helper-level truncate tests now live in
|
||||||
// truncation. The previous byte-slice version produced invalid UTF-8
|
// internal/textutil/truncate_test.go (TestTruncateRunes). The
|
||||||
// when the cut landed mid-codepoint (CJK, emoji, accented), and
|
// integration-level coverage that exercises the agent_message_writer
|
||||||
// Postgres JSONB rejects invalid UTF-8 — INSERT fails, log.Printf
|
// path with non-ASCII content is TestAgentMessageWriter_Send_NonASCIIMessagePersists
|
||||||
// fires, message vanishes from chat history. Per memory
|
// below.
|
||||||
// feedback_assert_exact_not_substring.md, pin the boundary cases
|
|
||||||
// directly.
|
|
||||||
func TestTruncatePreviewRunes_RuneBoundary(t *testing.T) {
|
|
||||||
cases := []struct {
|
|
||||||
name string
|
|
||||||
in string
|
|
||||||
max int
|
|
||||||
want string
|
|
||||||
}{
|
|
||||||
{"under-max ASCII", "hi", 80, "hi"},
|
|
||||||
{"under-max CJK", "你好", 80, "你好"},
|
|
||||||
{"exactly-at-max", "abcde", 5, "abcde"},
|
|
||||||
{"truncate ASCII", "abcdefghij", 5, "abcde…"},
|
|
||||||
{"truncate CJK at rune boundary", "你好世界你好世界", 4, "你好世界…"},
|
|
||||||
{"truncate emoji at rune boundary", "😀😀😀😀😀😀", 3, "😀😀😀…"},
|
|
||||||
// The pre-fix bug shape: byte-slice on non-ASCII would have
|
|
||||||
// mangled the codepoint here. With rune-boundary truncation
|
|
||||||
// the result is well-formed UTF-8.
|
|
||||||
{"non-zero with emoji prefix", "🚀abcdefghijk", 5, "🚀abcd…"},
|
|
||||||
}
|
|
||||||
for _, c := range cases {
|
|
||||||
t.Run(c.name, func(t *testing.T) {
|
|
||||||
got := truncatePreviewRunes(c.in, c.max)
|
|
||||||
if got != c.want {
|
|
||||||
t.Errorf("truncatePreviewRunes(%q, %d) = %q, want %q", c.in, c.max, got, c.want)
|
|
||||||
}
|
|
||||||
// Always-valid UTF-8 invariant. A byte-slice truncation
|
|
||||||
// could leave partial codepoints; this version must not.
|
|
||||||
if !utf8.ValidString(got) {
|
|
||||||
t.Errorf("truncatePreviewRunes(%q, %d) returned invalid UTF-8: %q", c.in, c.max, got)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestAgentMessageWriter_Send_NonASCIIMessagePersists pins the end-to-end
|
// TestAgentMessageWriter_Send_NonASCIIMessagePersists pins the end-to-end
|
||||||
// path for non-ASCII messages — the original reno-stars regression
|
// path for non-ASCII messages — the original reno-stars regression
|
||||||
|
|||||||
@ -10,6 +10,7 @@ import (
|
|||||||
|
|
||||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||||
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
)
|
)
|
||||||
@ -167,7 +168,7 @@ func (h *DelegationHandler) Delegate(c *gin.Context) {
|
|||||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationSent), sourceID, map[string]interface{}{
|
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationSent), sourceID, map[string]interface{}{
|
||||||
"delegation_id": delegationID,
|
"delegation_id": delegationID,
|
||||||
"target_id": body.TargetID,
|
"target_id": body.TargetID,
|
||||||
"task_preview": truncate(body.Task, 100),
|
"task_preview": textutil.TruncateBytes(body.Task, 100),
|
||||||
})
|
})
|
||||||
|
|
||||||
resp := gin.H{
|
resp := gin.H{
|
||||||
@ -407,7 +408,7 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s
|
|||||||
if _, err := db.DB.ExecContext(ctx, `
|
if _, err := db.DB.ExecContext(ctx, `
|
||||||
INSERT INTO activity_logs (workspace_id, activity_type, method, source_id, target_id, summary, response_body, status)
|
INSERT INTO activity_logs (workspace_id, activity_type, method, source_id, target_id, summary, response_body, status)
|
||||||
VALUES ($1, 'delegation', 'delegate_result', $2, $3, $4, $5::jsonb, 'completed')
|
VALUES ($1, 'delegation', 'delegate_result', $2, $3, $4, $5::jsonb, 'completed')
|
||||||
`, sourceID, sourceID, targetID, "Delegation completed ("+truncate(responseText, 80)+")", string(respJSON)); err != nil {
|
`, sourceID, sourceID, targetID, "Delegation completed ("+textutil.TruncateBytes(responseText, 80)+")", string(respJSON)); err != nil {
|
||||||
log.Printf("Delegation %s: failed to insert success log: %v", delegationID, err)
|
log.Printf("Delegation %s: failed to insert success log: %v", delegationID, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -423,7 +424,7 @@ func (h *DelegationHandler) executeDelegation(sourceID, targetID, delegationID s
|
|||||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationComplete), sourceID, map[string]interface{}{
|
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationComplete), sourceID, map[string]interface{}{
|
||||||
"delegation_id": delegationID,
|
"delegation_id": delegationID,
|
||||||
"target_id": targetID,
|
"target_id": targetID,
|
||||||
"response_preview": truncate(responseText, 200),
|
"response_preview": textutil.TruncateBytes(responseText, 200),
|
||||||
})
|
})
|
||||||
// RFC #2829 PR-2 result-push (see UpdateStatus for rationale).
|
// RFC #2829 PR-2 result-push (see UpdateStatus for rationale).
|
||||||
pushDelegationResultToInbox(ctx, sourceID, delegationID, "completed", responseText, "")
|
pushDelegationResultToInbox(ctx, sourceID, delegationID, "completed", responseText, "")
|
||||||
@ -506,7 +507,7 @@ func (h *DelegationHandler) Record(c *gin.Context) {
|
|||||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationSent), sourceID, map[string]interface{}{
|
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationSent), sourceID, map[string]interface{}{
|
||||||
"delegation_id": body.DelegationID,
|
"delegation_id": body.DelegationID,
|
||||||
"target_id": body.TargetID,
|
"target_id": body.TargetID,
|
||||||
"task_preview": truncate(body.Task, 100),
|
"task_preview": textutil.TruncateBytes(body.Task, 100),
|
||||||
})
|
})
|
||||||
|
|
||||||
c.JSON(http.StatusAccepted, gin.H{
|
c.JSON(http.StatusAccepted, gin.H{
|
||||||
@ -555,12 +556,12 @@ func (h *DelegationHandler) UpdateStatus(c *gin.Context) {
|
|||||||
if _, err := db.DB.ExecContext(ctx, `
|
if _, err := db.DB.ExecContext(ctx, `
|
||||||
INSERT INTO activity_logs (workspace_id, activity_type, method, source_id, summary, response_body, status)
|
INSERT INTO activity_logs (workspace_id, activity_type, method, source_id, summary, response_body, status)
|
||||||
VALUES ($1, 'delegation', 'delegate_result', $2, $3, $4::jsonb, 'completed')
|
VALUES ($1, 'delegation', 'delegate_result', $2, $3, $4::jsonb, 'completed')
|
||||||
`, sourceID, sourceID, "Delegation completed ("+truncate(body.ResponsePreview, 80)+")", string(respJSON)); err != nil {
|
`, sourceID, sourceID, "Delegation completed ("+textutil.TruncateBytes(body.ResponsePreview, 80)+")", string(respJSON)); err != nil {
|
||||||
log.Printf("Delegation UpdateStatus: result insert failed for %s: %v", delegationID, err)
|
log.Printf("Delegation UpdateStatus: result insert failed for %s: %v", delegationID, err)
|
||||||
}
|
}
|
||||||
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationComplete), sourceID, map[string]interface{}{
|
h.broadcaster.RecordAndBroadcast(ctx, string(events.EventDelegationComplete), sourceID, map[string]interface{}{
|
||||||
"delegation_id": delegationID,
|
"delegation_id": delegationID,
|
||||||
"response_preview": truncate(body.ResponsePreview, 200),
|
"response_preview": textutil.TruncateBytes(body.ResponsePreview, 200),
|
||||||
})
|
})
|
||||||
// RFC #2829 PR-2 result-push: when the gate is on, also write an
|
// RFC #2829 PR-2 result-push: when the gate is on, also write an
|
||||||
// a2a_receive row so the caller's inbox poller surfaces this to
|
// a2a_receive row so the caller's inbox poller surfaces this to
|
||||||
@ -626,7 +627,7 @@ func (h *DelegationHandler) ListDelegations(c *gin.Context) {
|
|||||||
entry["error"] = errorDetail
|
entry["error"] = errorDetail
|
||||||
}
|
}
|
||||||
if responseBody != "" {
|
if responseBody != "" {
|
||||||
entry["response_preview"] = truncate(responseBody, 300)
|
entry["response_preview"] = textutil.TruncateBytes(responseBody, 300)
|
||||||
}
|
}
|
||||||
delegations = append(delegations, entry)
|
delegations = append(delegations, entry)
|
||||||
}
|
}
|
||||||
@ -727,9 +728,3 @@ func extractResponseText(body []byte) string {
|
|||||||
return string(body)
|
return string(body)
|
||||||
}
|
}
|
||||||
|
|
||||||
func truncate(s string, max int) string {
|
|
||||||
if len(s) <= max {
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
return s[:max] + "..."
|
|
||||||
}
|
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||||
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
// delegation_ledger.go — durable per-task ledger for A2A delegation
|
// delegation_ledger.go — durable per-task ledger for A2A delegation
|
||||||
@ -50,40 +51,15 @@ func NewDelegationLedger(handle *sql.DB) *DelegationLedger {
|
|||||||
return &DelegationLedger{db: handle}
|
return &DelegationLedger{db: handle}
|
||||||
}
|
}
|
||||||
|
|
||||||
// truncatePreview caps stored preview at 4KB. The full prompt/response is
|
// previewCap caps stored preview at 4KB. The full prompt/response is
|
||||||
// already in activity_logs.{request,response}_body — this is the at-a-glance
|
// already in activity_logs.{request,response}_body — this is the
|
||||||
// view for the dashboard, not a forensic record.
|
// at-a-glance view for the dashboard, not a forensic record.
|
||||||
//
|
//
|
||||||
// Rune-safe: previous byte-slice form (s[:previewCap]) split on a byte
|
// Truncation goes through textutil.TruncateBytesNoMarker so it's
|
||||||
// boundary, which on a multi-byte codepoint at byte 4096 produced
|
// rune-safe (#2026 / #2959 / #2962 bug class: byte-slice mid-codepoint
|
||||||
// invalid UTF-8 — Postgres JSONB rejects → ledger row not inserted →
|
// → Postgres JSONB rejects → silent INSERT failure → audit gap).
|
||||||
// audit gap. Issue #2962. Walks the string by rune, stops at the last
|
|
||||||
// rune-boundary index that fits inside the cap. ASCII-only strings hit
|
|
||||||
// the cap exactly; CJK/emoji strings stop slightly under the cap,
|
|
||||||
// never over.
|
|
||||||
//
|
|
||||||
// Mirrors the truncatePreviewRunes fix from agent_message_writer.go
|
|
||||||
// (#2959). Both call sites should consume a shared helper after both
|
|
||||||
// fixes have landed — followup deduplication tracked in #2962's body.
|
|
||||||
const previewCap = 4096
|
const previewCap = 4096
|
||||||
|
|
||||||
func truncatePreview(s string) string {
|
|
||||||
if len(s) <= previewCap {
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
// Range over a string yields rune-boundary byte indices. Walk
|
|
||||||
// until the next index would exceed previewCap; the previous
|
|
||||||
// index is the safe truncation point.
|
|
||||||
end := 0
|
|
||||||
for i := range s {
|
|
||||||
if i > previewCap {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
end = i
|
|
||||||
}
|
|
||||||
return s[:end]
|
|
||||||
}
|
|
||||||
|
|
||||||
// InsertOpts is the agent's record-of-intent. Caller, callee, task preview,
|
// InsertOpts is the agent's record-of-intent. Caller, callee, task preview,
|
||||||
// and the chosen delegation_id are required; idempotency_key is optional.
|
// and the chosen delegation_id are required; idempotency_key is optional.
|
||||||
type InsertOpts struct {
|
type InsertOpts struct {
|
||||||
@ -118,7 +94,7 @@ func (l *DelegationLedger) Insert(ctx context.Context, opts InsertOpts) {
|
|||||||
) VALUES ($1, $2, $3, $4, 'queued', $5, $6)
|
) VALUES ($1, $2, $3, $4, 'queued', $5, $6)
|
||||||
ON CONFLICT (delegation_id) DO NOTHING
|
ON CONFLICT (delegation_id) DO NOTHING
|
||||||
`, opts.DelegationID, opts.CallerID, opts.CalleeID,
|
`, opts.DelegationID, opts.CallerID, opts.CalleeID,
|
||||||
truncatePreview(opts.TaskPreview), deadline, idemArg)
|
textutil.TruncateBytesNoMarker(opts.TaskPreview, previewCap), deadline, idemArg)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("delegation_ledger Insert(%s): %v", opts.DelegationID, err)
|
log.Printf("delegation_ledger Insert(%s): %v", opts.DelegationID, err)
|
||||||
}
|
}
|
||||||
@ -197,7 +173,7 @@ func (l *DelegationLedger) SetStatus(ctx context.Context,
|
|||||||
result_preview = NULLIF($4, ''),
|
result_preview = NULLIF($4, ''),
|
||||||
updated_at = now()
|
updated_at = now()
|
||||||
WHERE delegation_id = $1
|
WHERE delegation_id = $1
|
||||||
`, delegationID, status, errorDetail, truncatePreview(resultPreview))
|
`, delegationID, status, errorDetail, textutil.TruncateBytesNoMarker(resultPreview, previewCap))
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -2,6 +2,7 @@ package handlers
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"database/sql/driver"
|
||||||
"errors"
|
"errors"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
@ -74,15 +75,20 @@ func TestLedgerInsert_TruncatesOversizedPreview(t *testing.T) {
|
|||||||
mock := setupTestDB(t)
|
mock := setupTestDB(t)
|
||||||
l := NewDelegationLedger(nil)
|
l := NewDelegationLedger(nil)
|
||||||
|
|
||||||
huge := strings.Repeat("x", 10_000) // > previewCap
|
// 4096 / 3 = 1365 runes; +10 for margin so we cross the cap.
|
||||||
|
// '世' is 3 bytes in UTF-8 (worst case for byte-cap rune walking).
|
||||||
|
huge := strings.Repeat("世", (previewCap/3)+10)
|
||||||
|
if len(huge) <= previewCap {
|
||||||
|
t.Fatalf("test setup: input too short (%d bytes) — must exceed previewCap=%d", len(huge), previewCap)
|
||||||
|
}
|
||||||
|
|
||||||
mock.ExpectExec(`INSERT INTO delegations`).
|
mock.ExpectExec(`INSERT INTO delegations`).
|
||||||
WithArgs(
|
WithArgs(
|
||||||
"deleg-big",
|
"deleg-big",
|
||||||
"c", "ca",
|
"c", "ca",
|
||||||
sqlmock.AnyArg(), // truncated preview — verify length below via custom matcher
|
capValidUTF8Matcher{cap: previewCap}, // truncated preview must fit cap AND be valid UTF-8
|
||||||
sqlmock.AnyArg(),
|
sqlmock.AnyArg(), // deadline
|
||||||
sqlmock.AnyArg(),
|
sqlmock.AnyArg(), // idempotency_key
|
||||||
).
|
).
|
||||||
WillReturnResult(sqlmock.NewResult(0, 1))
|
WillReturnResult(sqlmock.NewResult(0, 1))
|
||||||
|
|
||||||
@ -97,87 +103,28 @@ func TestLedgerInsert_TruncatesOversizedPreview(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------- truncatePreview unit ----------
|
// capValidUTF8Matcher pins #2962 at the integration boundary: the
|
||||||
|
// preview that lands in the INSERT MUST be valid UTF-8 (else Postgres
|
||||||
|
// JSONB rejects → silent audit gap) AND fit within the byte cap. Pre-
|
||||||
|
// migration this would have asserted on the corrupted "世" mid-codepoint
|
||||||
|
// byte slice; post-migration it asserts the truncated preview is a
|
||||||
|
// clean rune-aligned prefix.
|
||||||
|
type capValidUTF8Matcher struct{ cap int }
|
||||||
|
|
||||||
func TestTruncatePreview_UnderCap(t *testing.T) {
|
func (m capValidUTF8Matcher) Match(v driver.Value) bool {
|
||||||
in := "short"
|
s, ok := v.(string)
|
||||||
if got := truncatePreview(in); got != in {
|
if !ok {
|
||||||
t.Errorf("under-cap should passthrough; got %q", got)
|
return false
|
||||||
}
|
}
|
||||||
|
return len(s) <= m.cap && utf8.ValidString(s)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestTruncatePreview_OverCapTruncatesAtBoundary(t *testing.T) {
|
// Helper-level truncation tests now live in
|
||||||
in := strings.Repeat("a", previewCap+100)
|
// internal/textutil/truncate_test.go. The integration-level path
|
||||||
got := truncatePreview(in)
|
// (TestLedgerInsert_TruncatesOversizedPreview above) still exercises
|
||||||
if len(got) != previewCap {
|
// the previewCap boundary through the SQL write so a regression in
|
||||||
t.Errorf("expected len=%d got len=%d", previewCap, len(got))
|
// the wiring (wrong cap, wrong helper, missing call) would still go
|
||||||
}
|
// red here.
|
||||||
}
|
|
||||||
|
|
||||||
func TestTruncatePreview_ExactlyAtCap(t *testing.T) {
|
|
||||||
in := strings.Repeat("a", previewCap)
|
|
||||||
got := truncatePreview(in)
|
|
||||||
if got != in {
|
|
||||||
t.Errorf("at-cap should passthrough unchanged")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestTruncatePreview_NeverProducesInvalidUTF8 — pins #2962. The old
|
|
||||||
// byte-slice implementation (s[:previewCap]) split on a byte boundary,
|
|
||||||
// so a multi-byte codepoint straddling byte 4096 produced invalid
|
|
||||||
// UTF-8 → Postgres JSONB rejects → ledger row not inserted → audit
|
|
||||||
// gap. Test feeds a CJK / emoji-padded string longer than previewCap
|
|
||||||
// and asserts utf8.ValidString on the result.
|
|
||||||
func TestTruncatePreview_NeverProducesInvalidUTF8(t *testing.T) {
|
|
||||||
// Build a string of '世' (3 bytes per rune in UTF-8) that's just
|
|
||||||
// past the cap. With the old implementation, the slice at byte
|
|
||||||
// previewCap would land mid-rune and ValidString would fail.
|
|
||||||
// With the rune-aware implementation, the result is always valid
|
|
||||||
// UTF-8 even if the byte length is < previewCap.
|
|
||||||
rune3 := "世" // U+4E16, 3 bytes
|
|
||||||
// Need at least previewCap/3 + 1 runes so we cross the cap with
|
|
||||||
// margin to spare.
|
|
||||||
in := strings.Repeat(rune3, (previewCap/3)+10)
|
|
||||||
if len(in) <= previewCap {
|
|
||||||
t.Fatalf("test setup: input too short (%d bytes) — must exceed previewCap=%d", len(in), previewCap)
|
|
||||||
}
|
|
||||||
got := truncatePreview(in)
|
|
||||||
if !utf8.ValidString(got) {
|
|
||||||
t.Errorf("truncatePreview produced invalid UTF-8 — JSONB will reject this row. len(got)=%d", len(got))
|
|
||||||
}
|
|
||||||
if len(got) > previewCap {
|
|
||||||
t.Errorf("truncatePreview exceeded cap: len(got)=%d > previewCap=%d", len(got), previewCap)
|
|
||||||
}
|
|
||||||
// Defense-in-depth: the result should also be a clean rune
|
|
||||||
// prefix of the input — not some garbled sequence.
|
|
||||||
if !strings.HasPrefix(in, got) {
|
|
||||||
t.Errorf("truncatePreview should return a prefix of the input")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestTruncatePreview_MultiByteAtBoundary — most-targeted regression.
|
|
||||||
// Feeds an input where the cap byte falls EXACTLY in the middle of a
|
|
||||||
// 3-byte codepoint. Pre-fix, this is the case that produces invalid
|
|
||||||
// UTF-8; post-fix, the truncate stops at the previous rune boundary.
|
|
||||||
func TestTruncatePreview_MultiByteAtBoundary(t *testing.T) {
|
|
||||||
// Build a string that's `previewCap-1` ASCII bytes followed by
|
|
||||||
// '世' (3 bytes). Total = previewCap + 2. The old impl would
|
|
||||||
// slice at byte previewCap, landing inside the '世' codepoint.
|
|
||||||
prefix := strings.Repeat("a", previewCap-1)
|
|
||||||
in := prefix + "世"
|
|
||||||
if len(in) != previewCap+2 {
|
|
||||||
t.Fatalf("test setup: expected len %d, got %d", previewCap+2, len(in))
|
|
||||||
}
|
|
||||||
got := truncatePreview(in)
|
|
||||||
if !utf8.ValidString(got) {
|
|
||||||
t.Errorf("truncatePreview produced invalid UTF-8 at the multi-byte boundary case")
|
|
||||||
}
|
|
||||||
// Result should be exactly the ASCII prefix — '世' was past
|
|
||||||
// the cap so it must be dropped entirely.
|
|
||||||
if got != prefix {
|
|
||||||
t.Errorf("expected exact ASCII prefix, got %q (len=%d)", got[len(got)-10:], len(got))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------- SetStatus lifecycle ----------
|
// ---------- SetStatus lifecycle ----------
|
||||||
|
|
||||||
|
|||||||
@ -35,6 +35,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/contract"
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/memory/contract"
|
||||||
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@ -340,7 +341,7 @@ func decodeError(resp *http.Response) error {
|
|||||||
// have rather than dropping it.
|
// have rather than dropping it.
|
||||||
return &contract.Error{
|
return &contract.Error{
|
||||||
Code: httpStatusToCode(resp.StatusCode),
|
Code: httpStatusToCode(resp.StatusCode),
|
||||||
Message: fmt.Sprintf("status %d: %s", resp.StatusCode, truncate(string(body), 256)),
|
Message: fmt.Sprintf("status %d: %s", resp.StatusCode, textutil.TruncateBytes(string(body), 256)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return &e
|
return &e
|
||||||
@ -359,12 +360,7 @@ func httpStatusToCode(status int) contract.ErrorCode {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func truncate(s string, n int) string {
|
// truncation moved to internal/textutil.TruncateBytes (#2962 SSOT).
|
||||||
if len(s) <= n {
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
return s[:n] + "…"
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- Circuit breaker ---
|
// --- Circuit breaker ---
|
||||||
|
|
||||||
|
|||||||
@ -499,14 +499,10 @@ func TestHttpStatusToCode(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestTruncate(t *testing.T) {
|
// Truncate moved to internal/textutil — coverage lives in
|
||||||
if got := truncate("short", 10); got != "short" {
|
// internal/textutil/truncate_test.go (TestTruncateBytes_RuneBoundary).
|
||||||
t.Errorf("got %q", got)
|
// memory/client just calls it as a wire-shape helper for error
|
||||||
}
|
// messages; no client-specific behavior to pin here.
|
||||||
if got := truncate(strings.Repeat("a", 300), 10); !strings.HasSuffix(got, "…") {
|
|
||||||
t.Errorf("expected ellipsis: %q", got)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- Circuit breaker ---
|
// --- Circuit breaker ---
|
||||||
|
|
||||||
|
|||||||
@ -17,6 +17,7 @@ import (
|
|||||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
|
||||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/metrics"
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/metrics"
|
||||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/supervised"
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/supervised"
|
||||||
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@ -522,7 +523,7 @@ func (s *Scheduler) fireSchedule(ctx context.Context, sched scheduleRow) {
|
|||||||
"schedule_id": sched.ID,
|
"schedule_id": sched.ID,
|
||||||
"schedule_name": sched.Name,
|
"schedule_name": sched.Name,
|
||||||
"cron_expr": sched.CronExpr,
|
"cron_expr": sched.CronExpr,
|
||||||
"prompt": sanitizeUTF8(truncate(sched.Prompt, 200)),
|
"prompt": sanitizeUTF8(textutil.TruncateBytes(sched.Prompt, 200)),
|
||||||
})
|
})
|
||||||
// #152: persist lastError into error_detail on the activity_logs row
|
// #152: persist lastError into error_detail on the activity_logs row
|
||||||
// so GET /workspaces/:id/schedules/:id/history can surface why a run
|
// so GET /workspaces/:id/schedules/:id/history can surface why a run
|
||||||
@ -807,27 +808,10 @@ func isEmptyResponse(body []byte) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
// truncate shortens s to at most maxLen bytes, appending "..." if truncated.
|
// truncation moved to internal/textutil.TruncateBytes (#2962 SSOT).
|
||||||
// #2026: UTF-8 safe — byte-slicing at maxLen-3 would split multi-byte runes
|
// The original #2026 fix lives in textutil's package docs as canonical
|
||||||
// (observed: U+2026 `…` = 0xe2 0x80 0xa6, sliced mid-char, concatenated with
|
// prior art. Ellipsis was previously "..." (3 ASCII bytes); the SSOT
|
||||||
// "..." producing 0xe2 0x80 0x2e — rejected by Postgres as invalid UTF-8,
|
// uses "…" (3 UTF-8 bytes) — same byte budget, single-glyph display.
|
||||||
// which wedged the activity_logs INSERT with no deadline and stalled the
|
|
||||||
// scheduler).
|
|
||||||
func truncate(s string, maxLen int) string {
|
|
||||||
if len(s) <= maxLen {
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
cut := maxLen - 3
|
|
||||||
if cut < 0 {
|
|
||||||
cut = 0
|
|
||||||
}
|
|
||||||
// Back up to a rune boundary — utf8.RuneStart returns true for any
|
|
||||||
// non-continuation byte (ASCII, or the lead byte of a multi-byte rune).
|
|
||||||
for cut > 0 && !utf8.RuneStart(s[cut]) {
|
|
||||||
cut--
|
|
||||||
}
|
|
||||||
return s[:cut] + "..."
|
|
||||||
}
|
|
||||||
|
|
||||||
// short returns up to n leading characters of s without panicking when s is
|
// short returns up to n leading characters of s without panicking when s is
|
||||||
// shorter than n. Used to safely display UUID prefixes in log lines where
|
// shorter than n. Used to safely display UUID prefixes in log lines where
|
||||||
|
|||||||
@ -10,6 +10,7 @@ import (
|
|||||||
sqlmock "github.com/DATA-DOG/go-sqlmock"
|
sqlmock "github.com/DATA-DOG/go-sqlmock"
|
||||||
|
|
||||||
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
|
||||||
|
"github.com/Molecule-AI/molecule-monorepo/platform/internal/textutil"
|
||||||
)
|
)
|
||||||
|
|
||||||
// errDBDown is a sentinel error used by tests to simulate a DB connection failure.
|
// errDBDown is a sentinel error used by tests to simulate a DB connection failure.
|
||||||
@ -618,7 +619,7 @@ func TestTruncate_utf8Safe_regression2026(t *testing.T) {
|
|||||||
filler += "a"
|
filler += "a"
|
||||||
}
|
}
|
||||||
input := filler + "…xxx" // 195 ASCII + 3-byte rune + 3 trailing
|
input := filler + "…xxx" // 195 ASCII + 3-byte rune + 3 trailing
|
||||||
out := truncate(input, 200)
|
out := textutil.TruncateBytes(input, 200)
|
||||||
|
|
||||||
if !utf8.ValidString(out) {
|
if !utf8.ValidString(out) {
|
||||||
t.Fatalf("truncate produced invalid UTF-8: %x", []byte(out))
|
t.Fatalf("truncate produced invalid UTF-8: %x", []byte(out))
|
||||||
|
|||||||
130
workspace-server/internal/textutil/truncate.go
Normal file
130
workspace-server/internal/textutil/truncate.go
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
// Package textutil provides string-handling helpers that respect UTF-8
|
||||||
|
// rune boundaries.
|
||||||
|
//
|
||||||
|
// Why this package exists
|
||||||
|
// -----------------------
|
||||||
|
// `s[:max]` truncates by BYTES; for any string with a multi-byte
|
||||||
|
// codepoint at byte `max` (CJK, emoji, accented Latin), the slice
|
||||||
|
// produces invalid UTF-8. Postgres `text` and `jsonb` columns reject
|
||||||
|
// invalid UTF-8 with `invalid byte sequence for encoding "UTF8"`,
|
||||||
|
// which silently fails the INSERT and holds the surrounding tx open
|
||||||
|
// — a class of audit-gap that has bitten this codebase three times
|
||||||
|
// (scheduler.go #2026, agent_message_writer.go #2959,
|
||||||
|
// delegation_ledger.go #2962). Six per-package helpers had
|
||||||
|
// independently re-implemented this logic with varying correctness;
|
||||||
|
// this package is the single source of truth.
|
||||||
|
//
|
||||||
|
// Use sites
|
||||||
|
// ---------
|
||||||
|
// - DB writes whose column is bytes-bounded (jsonb preview field,
|
||||||
|
// varchar(N)): TruncateBytes / TruncateBytesNoMarker.
|
||||||
|
// - UI summaries whose cap is in display chars, not bytes:
|
||||||
|
// TruncateRunes.
|
||||||
|
//
|
||||||
|
// All functions guarantee `utf8.ValidString(out) == true` for any
|
||||||
|
// `s` where `utf8.ValidString(s) == true`. Inputs that are already
|
||||||
|
// invalid UTF-8 should be sanitized at the trust boundary (e.g. via
|
||||||
|
// `strings.ToValidUTF8`); this package does not silently fix
|
||||||
|
// upstream invalid input.
|
||||||
|
package textutil
|
||||||
|
|
||||||
|
import "unicode/utf8"
|
||||||
|
|
||||||
|
// ellipsis is the truncation marker: U+2026 HORIZONTAL ELLIPSIS, which
// encodes as 3 bytes in UTF-8 and is a single rune. One shared marker
// replaces the mix of "..." (3 ASCII chars) and "…" (1 char) that the
// per-package helpers had drifted into.
const ellipsis = "…"

// TruncateBytes returns s unchanged when len(s) <= maxBytes. Otherwise
// it returns the longest rune-aligned prefix of s that fits in
// maxBytes - len(ellipsis) bytes, followed by the ellipsis marker, so
// the result is never longer than maxBytes bytes.
//
// Example: TruncateBytes("你好世界你好", 10) returns "你好…" (9 bytes).
// The content budget is 10 - 3 = 7 bytes; byte 7 falls inside "世"
// (bytes 6..8), so the cut walks back to byte 6 and keeps "你好"
// (6 bytes) before appending "…" (3 bytes).
//
// Edge cases (all apply only when s is longer than maxBytes; a string
// that already fits is always returned as-is):
//   - maxBytes < len(ellipsis), including zero and negative values:
//     returns "". The marker cannot fit, and a marker-less truncation
//     is not returned here — callers that want one should use
//     TruncateBytesNoMarker.
//   - maxBytes == len(ellipsis): returns just the marker.
//   - s contains invalid UTF-8: the walk-back only steps over
//     continuation bytes, so invalid bytes before the cut point are
//     preserved as-is. The caller is responsible for pre-sanitizing
//     if Postgres validity is required.
func TruncateBytes(s string, maxBytes int) string {
	if len(s) <= maxBytes {
		return s
	}
	if maxBytes < len(ellipsis) {
		return ""
	}
	// Reserve room for the marker, then walk back to the nearest rune
	// start at or below the cut point. cut < maxBytes < len(s), so
	// s[cut] is always in range.
	cut := maxBytes - len(ellipsis)
	for cut > 0 && !utf8.RuneStart(s[cut]) {
		cut--
	}
	return s[:cut] + ellipsis
}
|
||||||
|
|
||||||
|
// TruncateBytesNoMarker caps s at maxBytes bytes without appending a
// marker. A string that already fits is returned unchanged; a longer
// one is cut to its longest prefix of at most maxBytes bytes that ends
// on a rune boundary. Useful when the caller's storage already conveys
// "preview" / "snippet" semantics and an extra ellipsis would push the
// result over a hard column cap.
//
// Example: TruncateBytesNoMarker("hello world", 5) returns "hello".
//
// Edge case: maxBytes <= 0 returns "".
func TruncateBytesNoMarker(s string, maxBytes int) string {
	switch {
	case len(s) <= maxBytes:
		return s
	case maxBytes <= 0:
		return ""
	}
	// s is longer than the cap, so s[end] is always in range. Step
	// left over continuation bytes until end sits on a rune start.
	end := maxBytes
	for ; end > 0; end-- {
		if utf8.RuneStart(s[end]) {
			break
		}
	}
	return s[:end]
}
|
||||||
|
|
||||||
|
// TruncateRunes returns s if it has at most maxRunes runes, otherwise
|
||||||
|
// returns the first maxRunes runes followed by the ellipsis marker.
|
||||||
|
// Use this when the cap is in user-visible characters (UI summary,
|
||||||
|
// activity feed line) rather than bytes (DB column).
|
||||||
|
//
|
||||||
|
// Example: TruncateRunes("你好世界你好", 3) returns "你好世…" — three
|
||||||
|
// runes plus the marker, regardless of the resulting byte count.
|
||||||
|
//
|
||||||
|
// Edge case: maxRunes <= 0 returns "" (caller asked for no content).
|
||||||
|
func TruncateRunes(s string, maxRunes int) string {
|
||||||
|
if maxRunes <= 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
// Fast path: if every byte is a single-byte rune, the byte-length
|
||||||
|
// upper-bounds the rune count. This avoids a runes alloc for the
|
||||||
|
// common ASCII case where the input fits.
|
||||||
|
if len(s) <= maxRunes {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
// Walk by rune boundaries; stop at the (maxRunes+1)-th rune so we
|
||||||
|
// know the cut point and that truncation is needed.
|
||||||
|
count := 0
|
||||||
|
for i := range s {
|
||||||
|
if count == maxRunes {
|
||||||
|
return s[:i] + ellipsis
|
||||||
|
}
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
// Reachable when the byte count exceeded maxRunes but the actual
|
||||||
|
// rune count didn't (e.g. all single-byte runes that just happen
|
||||||
|
// to be more than maxRunes). The fast path catches len(s) <=
|
||||||
|
// maxRunes; this catches maxRunes < runeCount(s) <= len(s).
|
||||||
|
return s
|
||||||
|
}
|
||||||
222
workspace-server/internal/textutil/truncate_test.go
Normal file
222
workspace-server/internal/textutil/truncate_test.go
Normal file
@ -0,0 +1,222 @@
|
|||||||
|
package textutil
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"unicode/utf8"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestTruncateBytes_RuneBoundary pins the byte-cap, marker-bearing
|
||||||
|
// truncation path. Every case asserts both:
|
||||||
|
// 1. the exact expected output (so a refactor that flips ellipsis or
|
||||||
|
// drops a rune is caught), and
|
||||||
|
// 2. utf8.ValidString on the output (the invariant that the bug class
|
||||||
|
// in #2026/#2959/#2962 violated by slicing mid-codepoint).
|
||||||
|
//
|
||||||
|
// Per memory feedback_assert_exact_not_substring.md, asserts are exact
|
||||||
|
// equality, not substring matches.
|
||||||
|
func TestTruncateBytes_RuneBoundary(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
in string
|
||||||
|
maxBytes int
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
// Under-cap: returns input verbatim.
|
||||||
|
{"empty", "", 10, ""},
|
||||||
|
{"under-cap ASCII", "hi", 10, "hi"},
|
||||||
|
{"exactly-at-cap ASCII", "hello", 5, "hello"},
|
||||||
|
{"under-cap CJK", "你好", 10, "你好"}, // 6 bytes
|
||||||
|
{"exactly-at-cap CJK", "你好", 6, "你好"},
|
||||||
|
|
||||||
|
// Over-cap ASCII: trims to (maxBytes - 3) bytes + "…".
|
||||||
|
{"over-cap ASCII", "abcdefghij", 6, "abc…"},
|
||||||
|
|
||||||
|
// Over-cap CJK where cut would land mid-codepoint. The
|
||||||
|
// pre-fix bug shape: 7 - 3 = 4, but byte 4 is mid-"好"
|
||||||
|
// (好 is bytes 3..5 of "你好世界"). Walking back to byte 3
|
||||||
|
// (start of 好 — wait, that IS the start). Actually 你=0..2,
|
||||||
|
// 好=3..5, 世=6..8, 界=9..11. Cut=4, walk back to 3 (start
|
||||||
|
// of 好), then s[:3]="你", + "…" = "你…" (3+3=6 bytes ≤ 7).
|
||||||
|
{"over-cap CJK lands mid-codepoint", "你好世界", 7, "你…"},
|
||||||
|
|
||||||
|
// Over-cap CJK where cut lands exactly on rune boundary.
|
||||||
|
// 9 - 3 = 6, byte 6 is start of 世. Walk-back is no-op.
|
||||||
|
// s[:6]="你好" + "…" = "你好…" (9 bytes).
|
||||||
|
{"over-cap CJK rune-aligned", "你好世界", 9, "你好…"},
|
||||||
|
|
||||||
|
// Emoji: 😀 is 4 bytes (U+1F600). 7 - 3 = 4, byte 4 is start
|
||||||
|
// of second 😀 — walk-back no-op. s[:4]="😀" + "…" = "😀…".
|
||||||
|
{"over-cap emoji", "😀😀😀", 7, "😀…"},
|
||||||
|
|
||||||
|
// Mixed ASCII + CJK. "ab你好世界": a(1) b(1) 你(3) 好(3) 世(3) 界(3) = 14 bytes.
|
||||||
|
// maxBytes=8, 8-3=5. byte 5 is mid-好. Walk back to start of 好 = byte 5? Let me
|
||||||
|
// recompute: a=0, b=1, 你=2..4, 好=5..7, 世=8..10. Byte 5 IS start of 好.
|
||||||
|
// Walk-back keeps cut at 5. s[:5] = "ab你" + "…" = "ab你…" (8 bytes).
|
||||||
|
{"mixed prefix ASCII over-cap CJK", "ab你好世界", 8, "ab你…"},
|
||||||
|
|
||||||
|
// Pathological: maxBytes too small to even fit the marker.
|
||||||
|
{"cap below ellipsis len", "hello", 2, ""},
|
||||||
|
{"cap zero", "hello", 0, ""},
|
||||||
|
{"cap negative", "hello", -1, ""},
|
||||||
|
|
||||||
|
// Cap exactly == ellipsis len: no room for content, but
|
||||||
|
// the marker fits. This returns "" (cut = 0, s[:0] = "").
|
||||||
|
{"cap equals ellipsis len", "hello", 3, "…"},
|
||||||
|
}
|
||||||
|
for _, c := range cases {
|
||||||
|
t.Run(c.name, func(t *testing.T) {
|
||||||
|
got := TruncateBytes(c.in, c.maxBytes)
|
||||||
|
if got != c.want {
|
||||||
|
t.Errorf("TruncateBytes(%q, %d) = %q, want %q", c.in, c.maxBytes, got, c.want)
|
||||||
|
}
|
||||||
|
if !utf8.ValidString(got) {
|
||||||
|
t.Errorf("TruncateBytes(%q, %d) returned invalid UTF-8: %q", c.in, c.maxBytes, got)
|
||||||
|
}
|
||||||
|
// Output never exceeds the byte cap (when one is set).
|
||||||
|
if c.maxBytes > 0 && len(got) > c.maxBytes {
|
||||||
|
t.Errorf("TruncateBytes(%q, %d) overflowed cap: len(out)=%d > %d",
|
||||||
|
c.in, c.maxBytes, len(got), c.maxBytes)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestTruncateBytesNoMarker pins the marker-less variant. Same
|
||||||
|
// boundary handling as TruncateBytes but no ellipsis cost — the cut
|
||||||
|
// happens at maxBytes itself, walking back only if that lands
|
||||||
|
// mid-codepoint.
|
||||||
|
func TestTruncateBytesNoMarker(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
in string
|
||||||
|
maxBytes int
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{"empty", "", 10, ""},
|
||||||
|
{"under-cap ASCII", "hi", 10, "hi"},
|
||||||
|
{"exactly-at-cap ASCII", "hello", 5, "hello"},
|
||||||
|
{"over-cap ASCII", "abcdefghij", 5, "abcde"},
|
||||||
|
|
||||||
|
// Over-cap CJK rune-aligned: "你好世界", maxBytes=6, byte 6 is start of 世.
|
||||||
|
// s[:6]="你好" — perfect cut.
|
||||||
|
{"over-cap CJK rune-aligned", "你好世界", 6, "你好"},
|
||||||
|
|
||||||
|
// Over-cap CJK mid-codepoint: maxBytes=4, byte 4 is mid-好.
|
||||||
|
// Walk back to byte 3 (start of 好), s[:3]="你".
|
||||||
|
{"over-cap CJK mid-codepoint", "你好世界", 4, "你"},
|
||||||
|
|
||||||
|
// Emoji: maxBytes=5, "😀😀" is bytes 0..3 then 4..7. byte 5 is mid-second-😀.
|
||||||
|
// Walk back to byte 4 (start of second 😀), s[:4]="😀".
|
||||||
|
{"over-cap emoji", "😀😀", 5, "😀"},
|
||||||
|
|
||||||
|
// Edge: cap zero or negative → "".
|
||||||
|
{"cap zero", "hello", 0, ""},
|
||||||
|
{"cap negative", "hello", -1, ""},
|
||||||
|
|
||||||
|
// Cap = 1 and first rune is multi-byte: walk-back to 0, return "".
|
||||||
|
{"cap one with leading CJK", "你hello", 1, ""},
|
||||||
|
}
|
||||||
|
for _, c := range cases {
|
||||||
|
t.Run(c.name, func(t *testing.T) {
|
||||||
|
got := TruncateBytesNoMarker(c.in, c.maxBytes)
|
||||||
|
if got != c.want {
|
||||||
|
t.Errorf("TruncateBytesNoMarker(%q, %d) = %q, want %q", c.in, c.maxBytes, got, c.want)
|
||||||
|
}
|
||||||
|
if !utf8.ValidString(got) {
|
||||||
|
t.Errorf("TruncateBytesNoMarker(%q, %d) returned invalid UTF-8: %q", c.in, c.maxBytes, got)
|
||||||
|
}
|
||||||
|
if c.maxBytes > 0 && len(got) > c.maxBytes {
|
||||||
|
t.Errorf("TruncateBytesNoMarker(%q, %d) overflowed cap: len(out)=%d > %d",
|
||||||
|
c.in, c.maxBytes, len(got), c.maxBytes)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestTruncateRunes pins the rune-cap variant. The key contract is
|
||||||
|
// that maxRunes counts user-visible characters (Go runes, which line
|
||||||
|
// up with Unicode codepoints), not bytes — so "你好世界" with
|
||||||
|
// maxRunes=2 returns "你好…", regardless of the resulting byte count.
|
||||||
|
func TestTruncateRunes(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
in string
|
||||||
|
maxRunes int
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{"empty", "", 5, ""},
|
||||||
|
{"under-cap ASCII", "hi", 5, "hi"},
|
||||||
|
{"exactly-at-cap ASCII", "hello", 5, "hello"},
|
||||||
|
{"over-cap ASCII", "abcdefghij", 5, "abcde…"},
|
||||||
|
|
||||||
|
{"under-cap CJK", "你好", 5, "你好"},
|
||||||
|
{"exactly-at-cap CJK", "你好", 2, "你好"},
|
||||||
|
|
||||||
|
// Over-cap CJK: maxRunes=3, expect first 3 runes + marker.
|
||||||
|
{"over-cap CJK", "你好世界你好", 3, "你好世…"},
|
||||||
|
|
||||||
|
// Emoji is one rune per glyph in Go (no ZWJ here).
|
||||||
|
{"over-cap emoji", "😀😀😀😀😀", 2, "😀😀…"},
|
||||||
|
|
||||||
|
// Mixed: maxRunes=3 of "ab你好世界" → "ab你…".
|
||||||
|
{"mixed prefix", "ab你好世界", 3, "ab你…"},
|
||||||
|
|
||||||
|
// Edge: maxRunes 0 / negative → "".
|
||||||
|
{"cap zero", "hello", 0, ""},
|
||||||
|
{"cap negative", "hello", -1, ""},
|
||||||
|
}
|
||||||
|
for _, c := range cases {
|
||||||
|
t.Run(c.name, func(t *testing.T) {
|
||||||
|
got := TruncateRunes(c.in, c.maxRunes)
|
||||||
|
if got != c.want {
|
||||||
|
t.Errorf("TruncateRunes(%q, %d) = %q, want %q", c.in, c.maxRunes, got, c.want)
|
||||||
|
}
|
||||||
|
if !utf8.ValidString(got) {
|
||||||
|
t.Errorf("TruncateRunes(%q, %d) returned invalid UTF-8: %q", c.in, c.maxRunes, got)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestTruncate_FuzzInvariants stays as a property-style sanity check:
|
||||||
|
// for any rune-valid input and any cap, the output is rune-valid and
|
||||||
|
// (for byte-cap variants) within the cap. This catches off-by-one
|
||||||
|
// regressions in cuts that slip past the table-test cases above.
|
||||||
|
func TestTruncate_FuzzInvariants(t *testing.T) {
|
||||||
|
inputs := []string{
|
||||||
|
"",
|
||||||
|
"a",
|
||||||
|
"hello world",
|
||||||
|
"你好世界",
|
||||||
|
"😀😀😀",
|
||||||
|
"ab你c好d世e界",
|
||||||
|
"日本語の文字列",
|
||||||
|
"🇺🇸🇯🇵", // flags: each is 2 codepoints (regional indicators)
|
||||||
|
}
|
||||||
|
for _, in := range inputs {
|
||||||
|
for cap := -1; cap <= len(in)+5; cap++ {
|
||||||
|
t.Run("", func(t *testing.T) {
|
||||||
|
gotB := TruncateBytes(in, cap)
|
||||||
|
if !utf8.ValidString(gotB) {
|
||||||
|
t.Errorf("TruncateBytes(%q, %d) invalid UTF-8: %q", in, cap, gotB)
|
||||||
|
}
|
||||||
|
if cap > 0 && len(gotB) > cap {
|
||||||
|
t.Errorf("TruncateBytes(%q, %d) overflowed: %q (%d bytes)", in, cap, gotB, len(gotB))
|
||||||
|
}
|
||||||
|
|
||||||
|
gotN := TruncateBytesNoMarker(in, cap)
|
||||||
|
if !utf8.ValidString(gotN) {
|
||||||
|
t.Errorf("TruncateBytesNoMarker(%q, %d) invalid UTF-8: %q", in, cap, gotN)
|
||||||
|
}
|
||||||
|
if cap > 0 && len(gotN) > cap {
|
||||||
|
t.Errorf("TruncateBytesNoMarker(%q, %d) overflowed: %q (%d bytes)", in, cap, gotN, len(gotN))
|
||||||
|
}
|
||||||
|
|
||||||
|
gotR := TruncateRunes(in, cap)
|
||||||
|
if !utf8.ValidString(gotR) {
|
||||||
|
t.Errorf("TruncateRunes(%q, %d) invalid UTF-8: %q", in, cap, gotR)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue
Block a user