From 2a02b69e1d514b1f6662c14ccb1e65d98752f14e Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Mon, 22 Dec 2025 00:19:37 +0800
Subject: [PATCH] Improve CJK detection and safely drop Neo4j indexes

- Expand CJK regex to Extensions A-F and Compatibility Ideographs
- Use DROP INDEX IF EXISTS
- Add cleanup in multi-workspace test
- Safely handle legacy index drops
---
 lightrag/kg/neo4j_impl.py          | 23 ++++++++++++++++-------
 tests/test_neo4j_fulltext_index.py | 11 +++++++++++
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/lightrag/kg/neo4j_impl.py b/lightrag/kg/neo4j_impl.py
index 046f57fc..6dbe3286 100644
--- a/lightrag/kg/neo4j_impl.py
+++ b/lightrag/kg/neo4j_impl.py
@@ -101,9 +101,18 @@ class Neo4JStorage(BaseGraphStorage):
         return f"entity_id_fulltext_idx_{suffix}"
 
     def _is_chinese_text(self, text: str) -> bool:
-        """Check if text contains Chinese characters."""
-        chinese_pattern = re.compile(r"[\u4e00-\u9fff]+")
-        return bool(chinese_pattern.search(text))
+        """Check if text contains Chinese/CJK characters.
+
+        Covers:
+        - CJK Unified Ideographs (U+4E00-U+9FFF)
+        - CJK Extension A (U+3400-U+4DBF)
+        - CJK Compatibility Ideographs (U+F900-U+FAFF)
+        - CJK Extension B-F and Compatibility Supplement (U+20000-U+2FA1F), plane 2
+        """
+        cjk_pattern = re.compile(
+            r"[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]|[\U00020000-\U0002fa1f]"
+        )
+        return bool(cjk_pattern.search(text))
 
     async def initialize(self):
         async with get_data_init_lock():
@@ -288,8 +297,8 @@ class Neo4JStorage(BaseGraphStorage):
                         f"[{self.workspace}] Found legacy index '{legacy_index_name}'. Migrating to '{index_name}'."
                     )
                     try:
-                        # Drop the legacy index
-                        drop_query = f"DROP INDEX {legacy_index_name}"
+                        # Drop the legacy index (use IF EXISTS for safety)
+                        drop_query = f"DROP INDEX {legacy_index_name} IF EXISTS"
                         result = await session.run(drop_query)
                         await result.consume()
                         logger.info(
@@ -329,10 +338,10 @@ class Neo4JStorage(BaseGraphStorage):
                 needs_creation = existing_index is None
 
                 if needs_recreation or needs_creation:
-                    # Drop existing index if it needs recreation
+                    # Drop existing index if it needs recreation (use IF EXISTS for safety)
                     if needs_recreation:
                         try:
-                            drop_query = f"DROP INDEX {index_name}"
+                            drop_query = f"DROP INDEX {index_name} IF EXISTS"
                             result = await session.run(drop_query)
                             await result.consume()
                             logger.info(
diff --git a/tests/test_neo4j_fulltext_index.py b/tests/test_neo4j_fulltext_index.py
index ab6e33c1..da26a1c3 100644
--- a/tests/test_neo4j_fulltext_index.py
+++ b/tests/test_neo4j_fulltext_index.py
@@ -294,6 +294,17 @@ async def test_multiple_workspaces_have_separate_indexes(neo4j_storage):
             ), f"Workspace 2 index '{workspace2_index}' should exist"
 
     finally:
+        # Clean up: drop the fulltext index created for workspace 2 to prevent accumulation
+        try:
+            async with storage2._driver.session(database=storage2._DATABASE) as session:
+                index_name = storage2._get_fulltext_index_name(
+                    storage2._get_workspace_label()
+                )
+                drop_query = f"DROP INDEX {index_name} IF EXISTS"
+                result = await session.run(drop_query)
+                await result.consume()
+        except Exception:
+            pass  # Ignore errors during cleanup
         await storage2.drop()
         await storage2.finalize()