Improve CJK detection and safely drop Neo4j indexes

- Expand the CJK regex to cover Extensions A-F
- Use DROP INDEX ... IF EXISTS when dropping indexes
- Add index cleanup to the multi-workspace test
- Safely handle legacy index drops
2025-12-22 00:19:37 +08:00
parent a6c365e81d
commit 2a02b69e1d
2 changed files with 27 additions and 7 deletions
--- a/lightrag/kg/neo4j_impl.py
+++ b/lightrag/kg/neo4j_impl.py
@@ -101,9 +101,18 @@ class Neo4JStorage(BaseGraphStorage):
        return f"entity_id_fulltext_idx_{suffix}"

    def _is_chinese_text(self, text: str) -> bool:
-        """Check if text contains Chinese characters."""
-        chinese_pattern = re.compile(r"[\u4e00-\u9fff]+")
-        return bool(chinese_pattern.search(text))
+        """Check if text contains Chinese/CJK characters.
+
+        Covers:
+        - CJK Unified Ideographs (U+4E00-U+9FFF)
+        - CJK Extension A (U+3400-U+4DBF)
+        - CJK Compatibility Ideographs (U+F900-U+FAFF)
+        - CJK Extensions B-F and Compatibility Ideographs Supplement (U+20000-U+2FA1F) - Supplementary Ideographic Plane
+        """
+        cjk_pattern = re.compile(
+            r"[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]|[\U00020000-\U0002fa1f]"
+        )
+        return bool(cjk_pattern.search(text))

    async def initialize(self):
        async with get_data_init_lock():
@@ -288,8 +297,8 @@ class Neo4JStorage(BaseGraphStorage):
                        f"[{self.workspace}] Found legacy index '{legacy_index_name}'. Migrating to '{index_name}'."
                    )
                    try:
-                        # Drop the legacy index
-                        drop_query = f"DROP INDEX {legacy_index_name}"
+                        # Drop the legacy index (use IF EXISTS for safety)
+                        drop_query = f"DROP INDEX {legacy_index_name} IF EXISTS"
                        result = await session.run(drop_query)
                        await result.consume()
                        logger.info(
@@ -329,10 +338,10 @@ class Neo4JStorage(BaseGraphStorage):
                needs_creation = existing_index is None

                if needs_recreation or needs_creation:
-                    # Drop existing index if it needs recreation
+                    # Drop existing index if it needs recreation (use IF EXISTS for safety)
                    if needs_recreation:
                        try:
-                            drop_query = f"DROP INDEX {index_name}"
+                            drop_query = f"DROP INDEX {index_name} IF EXISTS"
                            result = await session.run(drop_query)
                            await result.consume()
                            logger.info(
--- a/tests/test_neo4j_fulltext_index.py
+++ b/tests/test_neo4j_fulltext_index.py
@@ -294,6 +294,17 @@ async def test_multiple_workspaces_have_separate_indexes(neo4j_storage):
            ), f"Workspace 2 index '{workspace2_index}' should exist"

    finally:
+        # Clean up: drop the fulltext index created for workspace 2 to prevent accumulation
+        try:
+            async with storage2._driver.session(database=storage2._DATABASE) as session:
+                index_name = storage2._get_fulltext_index_name(
+                    storage2._get_workspace_label()
+                )
+                drop_query = f"DROP INDEX {index_name} IF EXISTS"
+                result = await session.run(drop_query)
+                await result.consume()
+        except Exception:
+            pass  # Ignore errors during cleanup
        await storage2.drop()
        await storage2.finalize()