diff --git a/hermes_state.py b/hermes_state.py index bfa36d59..4405e342 100644 --- a/hermes_state.py +++ b/hermes_state.py @@ -31,7 +31,7 @@ T = TypeVar("T") DEFAULT_DB_PATH = get_hermes_home() / "state.db" -SCHEMA_VERSION = 9 +SCHEMA_VERSION = 10 SCHEMA_SQL = """ CREATE TABLE IF NOT EXISTS schema_version ( @@ -119,6 +119,32 @@ CREATE TRIGGER IF NOT EXISTS messages_fts_update AFTER UPDATE ON messages BEGIN END; """ +# Trigram FTS5 table for CJK substring search. The default unicode61 +# tokenizer splits CJK characters into individual tokens, breaking phrase +# matching. The trigram tokenizer creates overlapping 3-byte sequences so +# substring queries work natively for any script (CJK, Thai, etc.). +FTS_TRIGRAM_SQL = """ +CREATE VIRTUAL TABLE IF NOT EXISTS messages_fts_trigram USING fts5( + content, + content=messages, + content_rowid=id, + tokenize='trigram' +); + +CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_insert AFTER INSERT ON messages BEGIN + INSERT INTO messages_fts_trigram(rowid, content) VALUES (new.id, new.content); +END; + +CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_delete AFTER DELETE ON messages BEGIN + INSERT INTO messages_fts_trigram(messages_fts_trigram, rowid, content) VALUES('delete', old.id, old.content); +END; + +CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_update AFTER UPDATE ON messages BEGIN + INSERT INTO messages_fts_trigram(messages_fts_trigram, rowid, content) VALUES('delete', old.id, old.content); + INSERT INTO messages_fts_trigram(rowid, content) VALUES (new.id, new.content); +END; +""" + class SessionDB: """ @@ -366,6 +392,18 @@ class SessionDB: except sqlite3.OperationalError: pass # Column already exists cursor.execute("UPDATE schema_version SET version = 9") + if current_version < 10: + # v10: trigram FTS5 table for CJK/substring search. + # Created via FTS_TRIGRAM_SQL below; backfill existing messages. + try: + cursor.execute("SELECT * FROM messages_fts_trigram LIMIT 0") + except sqlite3.OperationalError: + cursor.executescript(FTS_TRIGRAM_SQL) + cursor.execute( + "INSERT INTO messages_fts_trigram(rowid, content) " + "SELECT id, content FROM messages WHERE content IS NOT NULL" + ) + cursor.execute("UPDATE schema_version SET version = 10") # Unique title index — always ensure it exists (safe to run after migrations # since the title column is guaranteed to exist at this point) @@ -383,6 +421,12 @@ class SessionDB: except sqlite3.OperationalError: cursor.executescript(FTS_SQL) + # Trigram FTS5 for CJK/substring search + try: + cursor.execute("SELECT * FROM messages_fts_trigram LIMIT 0") + except sqlite3.OperationalError: + cursor.executescript(FTS_TRIGRAM_SQL) + self._conn.commit() # ========================================================================= @@ -1291,6 +1335,16 @@ class SessionDB: return sanitized.strip() + @staticmethod + def _is_cjk_codepoint(cp: int) -> bool: + return (0x4E00 <= cp <= 0x9FFF or # CJK Unified Ideographs + 0x3400 <= cp <= 0x4DBF or # CJK Extension A + 0x20000 <= cp <= 0x2A6DF or # CJK Extension B + 0x3000 <= cp <= 0x303F or # CJK Symbols + 0x3040 <= cp <= 0x309F or # Hiragana + 0x30A0 <= cp <= 0x30FF or # Katakana + 0xAC00 <= cp <= 0xD7AF) # Hangul Syllables + @staticmethod def _contains_cjk(text: str) -> bool: """Check if text contains CJK (Chinese, Japanese, Korean) characters.""" @@ -1306,6 +1360,11 @@ class SessionDB: return True return False + @classmethod + def _count_cjk(cls, text: str) -> int: + """Count CJK characters in text.""" + return sum(1 for ch in text if cls._is_cjk_codepoint(ord(ch))) + def search_messages( self, query: str, @@ -1376,52 +1435,113 @@ class SessionDB: LIMIT ? OFFSET ? """ - with self._lock: - try: - cursor = self._conn.execute(sql, params) - except sqlite3.OperationalError: - # FTS5 query syntax error despite sanitization — return empty - # unless query contains CJK (fall back to LIKE below) - if not self._contains_cjk(query): - return [] - matches = [] - else: - matches = [dict(row) for row in cursor.fetchall()] - - # LIKE fallback for CJK queries: FTS5 default tokenizer splits CJK - # characters individually, causing multi-character queries to fail. - if not matches and self._contains_cjk(query): + # CJK queries bypass the unicode61 FTS5 table. The default tokenizer + # splits CJK characters into individual tokens, so "大别山项目" becomes + # "大 AND 别 AND 山 AND 项 AND 目" — producing false positives and + # missing exact phrase matches. + # + # For queries with 3+ CJK characters, we use the trigram FTS5 table + # (indexed substring matching with ranking and snippets). For shorter + # CJK queries (1-2 chars), trigram can't match (it needs ≥9 UTF-8 + # bytes = 3 CJK chars), so we fall back to LIKE. + is_cjk = self._contains_cjk(query) + if is_cjk: raw_query = query.strip('"').strip() - like_where = ["m.content LIKE ?"] - like_params: list = [f"%{raw_query}%"] - if source_filter is not None: - like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})") - like_params.extend(source_filter) - if exclude_sources is not None: - like_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})") - like_params.extend(exclude_sources) - if role_filter: - like_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})") - like_params.extend(role_filter) - like_sql = f""" - SELECT m.id, m.session_id, m.role, - substr(m.content, - max(1, instr(m.content, ?) - 40), - 120) AS snippet, - m.content, m.timestamp, m.tool_name, - s.source, s.model, s.started_at AS session_started - FROM messages m - JOIN sessions s ON s.id = m.session_id - WHERE {' AND '.join(like_where)} - ORDER BY m.timestamp DESC - LIMIT ? OFFSET ? - """ - like_params.extend([limit, offset]) - # instr() parameter goes first in the bound list - like_params = [raw_query] + like_params + cjk_count = self._count_cjk(raw_query) + + if cjk_count >= 3: + # Trigram FTS5 path — quote each non-operator token to handle + # FTS5 special chars (%, *, etc.) while preserving boolean + # operators (AND, OR, NOT) for multi-term queries. + tokens = raw_query.split() + parts = [] + for tok in tokens: + if tok.upper() in ("AND", "OR", "NOT"): + parts.append(tok) + else: + parts.append('"' + tok.replace('"', '""') + '"') + trigram_query = " ".join(parts) + tri_where = ["messages_fts_trigram MATCH ?"] + tri_params: list = [trigram_query] + if source_filter is not None: + tri_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})") + tri_params.extend(source_filter) + if exclude_sources is not None: + tri_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})") + tri_params.extend(exclude_sources) + if role_filter: + tri_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})") + tri_params.extend(role_filter) + tri_sql = f""" + SELECT + m.id, + m.session_id, + m.role, + snippet(messages_fts_trigram, 0, '>>>', '<<<', '...', 40) AS snippet, + m.content, + m.timestamp, + m.tool_name, + s.source, + s.model, + s.started_at AS session_started + FROM messages_fts_trigram + JOIN messages m ON m.id = messages_fts_trigram.rowid + JOIN sessions s ON s.id = m.session_id + WHERE {' AND '.join(tri_where)} + ORDER BY rank + LIMIT ? OFFSET ? + """ + tri_params.extend([limit, offset]) + with self._lock: + try: + tri_cursor = self._conn.execute(tri_sql, tri_params) + except sqlite3.OperationalError: + matches = [] + else: + matches = [dict(row) for row in tri_cursor.fetchall()] + else: + # Short CJK query (1-2 chars) — trigram needs ≥3 CJK chars. + # Fall back to LIKE substring search. + escaped = raw_query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") + like_where = ["m.content LIKE ? ESCAPE '\\'"] + like_params: list = [f"%{escaped}%"] + if source_filter is not None: + like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})") + like_params.extend(source_filter) + if exclude_sources is not None: + like_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})") + like_params.extend(exclude_sources) + if role_filter: + like_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})") + like_params.extend(role_filter) + like_sql = f""" + SELECT m.id, m.session_id, m.role, + substr(m.content, + max(1, instr(m.content, ?) - 40), + 120) AS snippet, + m.content, m.timestamp, m.tool_name, + s.source, s.model, s.started_at AS session_started + FROM messages m + JOIN sessions s ON s.id = m.session_id + WHERE {' AND '.join(like_where)} + ORDER BY m.timestamp DESC + LIMIT ? OFFSET ? + """ + like_params.extend([limit, offset]) + # instr() parameter goes first in the bound list + like_params = [raw_query] + like_params + with self._lock: + like_cursor = self._conn.execute(like_sql, like_params) + matches = [dict(row) for row in like_cursor.fetchall()] + else: with self._lock: - like_cursor = self._conn.execute(like_sql, like_params) - matches = [dict(row) for row in like_cursor.fetchall()] + try: + cursor = self._conn.execute(sql, params) + except sqlite3.OperationalError: + # FTS5 query syntax error despite sanitization — return empty + return [] + else: + matches = [dict(row) for row in cursor.fetchall()] # Add surrounding context (1 message before + after each match). # Done outside the lock so we don't hold it across N sequential queries. diff --git a/tests/test_hermes_state.py b/tests/test_hermes_state.py index 8911694b..0e3b6657 100644 --- a/tests/test_hermes_state.py +++ b/tests/test_hermes_state.py @@ -772,6 +772,51 @@ class TestCJKSearchFallback: results = db.search_messages("Agent通信") assert len(results) == 1 + def test_cjk_partial_fts5_results_supplemented_by_like(self, db): + """When FTS5 returns *some* CJK results, LIKE must still find all matches. + + Regression test for #15500 / #14829: FTS5 unicode61 tokenizer drops + certain CJK characters, so multi-character queries may return partial + results. The LIKE path must always run for CJK queries. + """ + db.create_session(session_id="s1", source="cli") + db.create_session(session_id="s2", source="telegram") + db.append_message("s1", role="user", content="昨晚讨论了记忆系统") + db.append_message("s2", role="user", content="昨晚的会议纪要已发送") + results = db.search_messages("昨晚") + assert len(results) == 2 + session_ids = {r["session_id"] for r in results} + assert session_ids == {"s1", "s2"} + + def test_cjk_like_dedup_no_duplicates(self, db): + """When FTS5 and LIKE both find the same message, no duplicates.""" + db.create_session(session_id="s1", source="cli") + db.append_message("s1", role="user", content="测试去重逻辑") + results = db.search_messages("测试") + assert len(results) == 1 + + def test_cjk_like_escapes_wildcards(self, db): + """Special characters (%, _) in CJK queries are treated as literals.""" + db.create_session(session_id="s1", source="cli") + db.create_session(session_id="s2", source="cli") + db.append_message("s1", role="user", content="达成100%完成率") + db.append_message("s2", role="user", content="达成100完成率是目标") + # The % in the query must be literal — should only match s1 + results = db.search_messages("100%完成") + assert len(results) == 1 + assert results[0]["session_id"] == "s1" + + def test_cjk_trigram_preserves_boolean_operators(self, db): + """Boolean operators (OR, AND, NOT) work in CJK trigram queries.""" + db.create_session(session_id="s1", source="cli") + db.create_session(session_id="s2", source="cli") + db.append_message("s1", role="user", content="记忆系统很好用") + db.append_message("s2", role="user", content="断裂连接需要修复") + results = db.search_messages("记忆系统 OR 断裂连接") + assert len(results) == 2 + session_ids = {r["session_id"] for r in results} + assert session_ids == {"s1", "s2"} + # ========================================================================= # Session search and listing @@ -1229,7 +1274,7 @@ class TestSchemaInit: def test_schema_version(self, db): cursor = db._conn.execute("SELECT version FROM schema_version") version = cursor.fetchone()[0] - assert version == 9 + assert version == 10 def test_title_column_exists(self, db): """Verify the title column was created in the sessions table.""" @@ -1290,7 +1335,7 @@ class TestSchemaInit: # Verify migration cursor = migrated_db._conn.execute("SELECT version FROM schema_version") - assert cursor.fetchone()[0] == 9 + assert cursor.fetchone()[0] == 10 # Verify title column exists and is NULL for existing sessions session = migrated_db.get_session("existing")