Fix Neo4j fulltext index name mismatch between creation and query
This commit is contained in:
@@ -231,6 +231,7 @@ class Neo4JStorage(BaseGraphStorage):
|
||||
):
|
||||
"""Create a full-text index on the entity_id property with Chinese tokenizer support."""
|
||||
index_name = f"entity_id_fulltext_idx_{workspace_label}"
|
||||
legacy_index_name = "entity_id_fulltext_idx"
|
||||
try:
|
||||
async with driver.session(database=database) as session:
|
||||
# Check if the full-text index exists and get its configuration
|
||||
@@ -240,11 +241,34 @@ class Neo4JStorage(BaseGraphStorage):
|
||||
await result.consume()
|
||||
|
||||
existing_index = None
|
||||
legacy_index = None
|
||||
for idx in indexes:
|
||||
if idx["name"] == index_name:
|
||||
existing_index = idx
|
||||
elif idx["name"] == legacy_index_name:
|
||||
legacy_index = idx
|
||||
# Break early if we found both indexes
|
||||
if existing_index and legacy_index:
|
||||
break
|
||||
|
||||
# Handle legacy index migration
|
||||
if legacy_index and not existing_index:
|
||||
logger.info(
|
||||
f"[{self.workspace}] Found legacy index '{legacy_index_name}'. Migrating to '{index_name}'."
|
||||
)
|
||||
try:
|
||||
# Drop the legacy index
|
||||
drop_query = f"DROP INDEX {legacy_index_name}"
|
||||
result = await session.run(drop_query)
|
||||
await result.consume()
|
||||
logger.info(
|
||||
f"[{self.workspace}] Dropped legacy index '{legacy_index_name}'"
|
||||
)
|
||||
except Exception as drop_error:
|
||||
logger.warning(
|
||||
f"[{self.workspace}] Failed to drop legacy index: {str(drop_error)}"
|
||||
)
|
||||
|
||||
# Check if index exists and is online
|
||||
if existing_index:
|
||||
index_state = existing_index.get("state", "UNKNOWN")
|
||||
@@ -1641,7 +1665,7 @@ class Neo4JStorage(BaseGraphStorage):
|
||||
|
||||
query_lower = query_strip.lower()
|
||||
is_chinese = self._is_chinese_text(query_strip)
|
||||
index_name = "entity_id_fulltext_idx"
|
||||
index_name = f"entity_id_fulltext_idx_{workspace_label}"
|
||||
|
||||
# Attempt to use the full-text index first
|
||||
try:
|
||||
|
||||
299
tests/test_neo4j_fulltext_index.py
Normal file
299
tests/test_neo4j_fulltext_index.py
Normal file
@@ -0,0 +1,299 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Test Neo4j full-text index functionality, specifically:
|
||||
1. Workspace-specific index naming
|
||||
2. Legacy index migration
|
||||
3. search_labels functionality with workspace-specific indexes
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import pytest
|
||||
import numpy as np
|
||||
|
||||
# Add the project root directory to the Python path
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from lightrag.kg.shared_storage import initialize_share_data
|
||||
|
||||
|
||||
# Stand-in embedding function: the tests never inspect vector values,
# so uniformly random 10-dim embeddings are sufficient.
async def mock_embedding_func(texts):
    """Return one random 10-dimensional vector per input text."""
    batch_size = len(texts)
    return np.random.rand(batch_size, 10)
|
||||
|
||||
|
||||
@pytest.fixture
async def neo4j_storage():
    """
    Initialize Neo4j storage for testing.
    Requires Neo4j to be running and configured via environment variables.
    """
    # Skip outright when no Neo4j endpoint is configured.
    if not os.getenv("NEO4J_URI"):
        pytest.skip("Neo4j not configured (NEO4J_URI not set)")

    from lightrag.kg.neo4j_impl import Neo4JStorage

    # shared_storage must be set up first so storage-level locks exist.
    initialize_share_data()

    storage = Neo4JStorage(
        namespace="test_fulltext_index",
        workspace="test_workspace",
        global_config={
            "embedding_batch_num": 10,
            "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.5},
            "working_dir": os.environ.get("WORKING_DIR", "./rag_storage"),
        },
        embedding_func=mock_embedding_func,
    )

    # Open the driver connection, then start every test from an empty graph.
    await storage.initialize()
    await storage.drop()

    yield storage

    # Teardown: wipe test data and release the driver.
    await storage.drop()
    await storage.finalize()
|
||||
|
||||
|
||||
@pytest.mark.integration
@pytest.mark.requires_db
async def test_fulltext_index_creation(neo4j_storage):
    """
    Test that the full-text index is created with the workspace-specific name.
    """
    workspace_label = neo4j_storage._get_workspace_label()
    expected_index_name = f"entity_id_fulltext_idx_{workspace_label}"

    # Ask Neo4j directly which full-text indexes currently exist.
    async with neo4j_storage._driver.session(
        database=neo4j_storage._DATABASE
    ) as session:
        result = await session.run("SHOW FULLTEXT INDEXES")
        indexes = await result.data()
        await result.consume()

    index_names = [entry["name"] for entry in indexes]

    # The workspace-scoped index must be present.
    assert (
        expected_index_name in index_names
    ), f"Expected index '{expected_index_name}' not found. Found indexes: {index_names}"

    # A leftover unscoped (legacy) index is tolerated but flagged, since it
    # suggests migration did not run — or it belongs to another workspace.
    legacy_index_name = "entity_id_fulltext_idx"
    if legacy_index_name in index_names:
        print(
            f"Warning: Legacy index '{legacy_index_name}' still exists alongside '{expected_index_name}'"
        )
|
||||
|
||||
|
||||
@pytest.mark.integration
@pytest.mark.requires_db
async def test_search_labels_with_workspace_index(neo4j_storage):
    """
    Test that search_labels uses the workspace-specific index and returns results.
    """
    storage = neo4j_storage

    # Seed the graph with a few technology entities.
    seed = [
        ("Artificial Intelligence", "AI field", "AI,ML,DL"),
        ("Machine Learning", "ML subfield", "supervised,unsupervised"),
        ("Deep Learning", "DL subfield", "neural networks"),
        ("Natural Language Processing", "NLP field", "text,language"),
    ]
    for entity_id, description, keywords in seed:
        await storage.upsert_node(
            entity_id,
            {
                "entity_id": entity_id,
                "description": description,
                "keywords": keywords,
                "entity_type": "Technology",
            },
        )

    # Full-text indexes are eventually consistent; give Neo4j time to catch up.
    await asyncio.sleep(2)

    # Word search should hit the seeded nodes.
    results = await storage.search_labels("Learning", limit=10)
    assert len(results) > 0, "search_labels should return results for 'Learning'"
    assert any(
        "Learning" in result for result in results
    ), "Results should contain 'Learning'"

    # A lower-cased query must match the same nodes.
    results_lower = await storage.search_labels("learning", limit=10)
    assert len(results_lower) > 0, "search_labels should be case-insensitive"

    # A prefix query should match via wildcard expansion.
    results_partial = await storage.search_labels("Intelli", limit=10)
    assert (
        len(results_partial) > 0
    ), "search_labels should support partial matching with wildcard"
    assert any(
        "Intelligence" in result for result in results_partial
    ), "Should find 'Artificial Intelligence'"
|
||||
|
||||
|
||||
@pytest.mark.integration
@pytest.mark.requires_db
async def test_search_labels_chinese_text(neo4j_storage):
    """
    Test that search_labels works with Chinese text using the CJK analyzer.
    """
    storage = neo4j_storage

    # Seed the graph with Chinese-named entities.
    seed = [
        ("人工智能", "人工智能领域", "AI,机器学习"),
        ("机器学习", "机器学习子领域", "监督学习,无监督学习"),
        ("深度学习", "深度学习子领域", "神经网络"),
    ]
    for entity_id, description, keywords in seed:
        await storage.upsert_node(
            entity_id,
            {
                "entity_id": entity_id,
                "description": description,
                "keywords": keywords,
                "entity_type": "技术",
            },
        )

    # Allow the eventually-consistent index to pick up the new nodes.
    await asyncio.sleep(2)

    # A CJK query should match the seeded Chinese entity names.
    results = await storage.search_labels("学习", limit=10)

    assert len(results) > 0, "search_labels should return results for Chinese text"
    assert any(
        "学习" in result for result in results
    ), "Results should contain Chinese characters '学习'"
|
||||
|
||||
|
||||
@pytest.mark.integration
@pytest.mark.requires_db
async def test_search_labels_fallback_to_contains(neo4j_storage):
    """
    Test that search_labels falls back to CONTAINS search if the index fails.
    This can happen with older Neo4j versions or if the index is not yet available.
    """
    storage = neo4j_storage

    # Two minimal nodes are enough to exercise the substring path.
    for entity_id in ("Test Node Alpha", "Test Node Beta"):
        await storage.upsert_node(
            entity_id,
            {
                "entity_id": entity_id,
                "description": "Test node",
                "keywords": "test",
                "entity_type": "Test",
            },
        )

    # Deliberately no sleep: even if the full-text index has not caught up,
    # the CONTAINS fallback should still locate the node.
    results = await storage.search_labels("Alpha", limit=10)

    assert len(results) > 0, "Fallback CONTAINS search should return results"
    assert "Test Node Alpha" in results, "Should find 'Test Node Alpha'"
|
||||
|
||||
|
||||
@pytest.mark.integration
@pytest.mark.requires_db
async def test_multiple_workspaces_have_separate_indexes(neo4j_storage):
    """
    Test that different workspaces have their own separate indexes.
    """
    from lightrag.kg.neo4j_impl import Neo4JStorage

    storage1 = neo4j_storage

    # Spin up a second storage bound to a different workspace so we can
    # compare the indexes each one creates.
    storage2 = Neo4JStorage(
        namespace="test_fulltext_index",
        workspace="test_workspace_2",
        global_config={
            "embedding_batch_num": 10,
            "vector_db_storage_cls_kwargs": {"cosine_better_than_threshold": 0.5},
            "working_dir": os.environ.get("WORKING_DIR", "./rag_storage"),
        },
        embedding_func=mock_embedding_func,
    )
    await storage2.initialize()
    await storage2.drop()

    try:
        # List every full-text index known to the database.
        async with storage1._driver.session(
            database=storage1._DATABASE
        ) as session:
            result = await session.run("SHOW FULLTEXT INDEXES")
            indexes = await result.data()
            await result.consume()

        index_names = [entry["name"] for entry in indexes]
        workspace1_index = f"entity_id_fulltext_idx_{storage1._get_workspace_label()}"
        workspace2_index = f"entity_id_fulltext_idx_{storage2._get_workspace_label()}"

        # Both workspace-scoped indexes must coexist.
        assert (
            workspace1_index in index_names
        ), f"Workspace 1 index '{workspace1_index}' should exist"
        assert (
            workspace2_index in index_names
        ), f"Workspace 2 index '{workspace2_index}' should exist"
    finally:
        await storage2.drop()
        await storage2.finalize()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run tests with pytest
    # NOTE(review): "--run-integration" is not a built-in pytest flag; it is
    # presumably a custom option registered in the project's conftest.py to
    # enable the @pytest.mark.integration tests — confirm it exists there.
    pytest.main([__file__, "-v", "--run-integration"])
|
||||
Reference in New Issue
Block a user