From 2a02b69e1d514b1f6662c14ccb1e65d98752f14e Mon Sep 17 00:00:00 2001 From: yangdx Date: Mon, 22 Dec 2025 00:19:37 +0800 Subject: [PATCH] Improve CJK detection and safely drop Neo4j indexes - Expand CJK regex to extensions A-F - Use DROP INDEX IF EXISTS - Add cleanup in multi-workspace test - Safely handle legacy index drops --- lightrag/kg/neo4j_impl.py | 23 ++++++++++++++++------- tests/test_neo4j_fulltext_index.py | 11 +++++++++++ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py index 046f57fc..6dbe3286 100644 --- a/lightrag/kg/neo4j_impl.py +++ b/lightrag/kg/neo4j_impl.py @@ -101,9 +101,18 @@ class Neo4JStorage(BaseGraphStorage): return f"entity_id_fulltext_idx_{suffix}" def _is_chinese_text(self, text: str) -> bool: - """Check if text contains Chinese characters.""" - chinese_pattern = re.compile(r"[\u4e00-\u9fff]+") - return bool(chinese_pattern.search(text)) + """Check if text contains Chinese/CJK characters. + + Covers: + - CJK Unified Ideographs (U+4E00-U+9FFF) + - CJK Extension A (U+3400-U+4DBF) + - CJK Compatibility Ideographs (U+F900-U+FAFF) + - CJK Extension B-F, Compatibility Supplement (U+20000-U+2FA1F) - supplementary planes + """ + cjk_pattern = re.compile( + r"[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]|[\U00020000-\U0002fa1f]" + ) + return bool(cjk_pattern.search(text)) async def initialize(self): async with get_data_init_lock(): @@ -288,8 +297,8 @@ class Neo4JStorage(BaseGraphStorage): f"[{self.workspace}] Found legacy index '{legacy_index_name}'. Migrating to '{index_name}'." 
) try: - # Drop the legacy index - drop_query = f"DROP INDEX {legacy_index_name}" + # Drop the legacy index (use IF EXISTS for safety) + drop_query = f"DROP INDEX {legacy_index_name} IF EXISTS" result = await session.run(drop_query) await result.consume() logger.info( @@ -329,10 +338,10 @@ class Neo4JStorage(BaseGraphStorage): needs_creation = existing_index is None if needs_recreation or needs_creation: - # Drop existing index if it needs recreation + # Drop existing index if it needs recreation (use IF EXISTS for safety) if needs_recreation: try: - drop_query = f"DROP INDEX {index_name}" + drop_query = f"DROP INDEX {index_name} IF EXISTS" result = await session.run(drop_query) await result.consume() logger.info( diff --git a/tests/test_neo4j_fulltext_index.py b/tests/test_neo4j_fulltext_index.py index ab6e33c1..da26a1c3 100644 --- a/tests/test_neo4j_fulltext_index.py +++ b/tests/test_neo4j_fulltext_index.py @@ -294,6 +294,17 @@ async def test_multiple_workspaces_have_separate_indexes(neo4j_storage): ), f"Workspace 2 index '{workspace2_index}' should exist" finally: + # Clean up: drop the fulltext index created for workspace 2 to prevent accumulation + try: + async with storage2._driver.session(database=storage2._DATABASE) as session: + index_name = storage2._get_fulltext_index_name( + storage2._get_workspace_label() + ) + drop_query = f"DROP INDEX {index_name} IF EXISTS" + result = await session.run(drop_query) + await result.consume() + except Exception: + pass # Ignore errors during cleanup await storage2.drop() await storage2.finalize()