Files
lightrag/tests/test_dimension_mismatch.py
yangdx 1c083c6699 Remove redundant pytest.mark.asyncio decorators
- Remove explicit asyncio markers
- Clean up unused imports in tests
2025-12-19 16:00:37 +08:00

389 lines
15 KiB
Python

"""
Tests for dimension mismatch handling during migration.
This test module verifies that both PostgreSQL and Qdrant storage backends
properly detect and handle vector dimension mismatches when migrating from
legacy collections/tables to new ones with different embedding models.
"""
import pytest
from unittest.mock import MagicMock, AsyncMock, patch
from lightrag.kg.qdrant_impl import QdrantVectorDBStorage
from lightrag.kg.postgres_impl import PGVectorStorage
from lightrag.exceptions import DataMigrationError
# Note: Tests should use proper table names that have DDL templates
# Valid base tables: LIGHTRAG_VDB_CHUNKS, LIGHTRAG_VDB_ENTITIES, LIGHTRAG_VDB_RELATIONSHIPS,
# LIGHTRAG_DOC_CHUNKS, LIGHTRAG_DOC_FULL_DOCS, LIGHTRAG_DOC_TEXT_CHUNKS
class TestQdrantDimensionMismatch:
    """Qdrant collection setup when legacy and new vector sizes differ or agree."""

    def test_qdrant_dimension_mismatch_raises_error(self):
        """
        A vector-size mismatch must abort collection setup.

        Scenario: the legacy collection reports 1536d vectors while the new
        collection is requested with 3072d.
        Expected: DataMigrationError is raised, and neither collection creation
        nor any data copy (scroll/upsert) is attempted.
        """
        from qdrant_client import models

        legacy_name = "lightrag_vdb_chunks"  # matches _find_legacy_collection pattern
        new_name = "lightrag_chunks_model_3072d"

        # Collection info returned for the legacy collection: 1536d vectors
        legacy_info = MagicMock()
        legacy_info.config.params.vectors.size = 1536

        qdrant = MagicMock()
        # Only the legacy collection exists; every other name is reported missing
        qdrant.collection_exists.side_effect = lambda name: name == legacy_name
        qdrant.get_collection.return_value = legacy_info
        qdrant.count.return_value.count = 100  # legacy holds data

        # Requested configuration uses 3072d, unlike the 1536d legacy collection
        requested_vectors = models.VectorParams(
            size=3072, distance=models.Distance.COSINE
        )
        requested_hnsw = models.HnswConfigDiff(payload_m=16, m=0)

        with (
            patch(
                "lightrag.kg.qdrant_impl._find_legacy_collection",
                return_value=legacy_name,
            ),
            pytest.raises(DataMigrationError) as exc_info,
        ):
            QdrantVectorDBStorage.setup_collection(
                qdrant,
                new_name,
                namespace="chunks",
                workspace="test",
                vectors_config=requested_vectors,
                hnsw_config=requested_hnsw,
            )

        # The error text should mention at least one of the two dimensions
        message = str(exc_info.value)
        assert "3072" in message or "1536" in message

        # Nothing may be created or copied once the mismatch is detected
        qdrant.create_collection.assert_not_called()
        qdrant.scroll.assert_not_called()
        qdrant.upsert.assert_not_called()

    def test_qdrant_dimension_match_proceed_migration(self):
        """
        Matching vector sizes must let the migration run.

        Scenario: the legacy collection reports 1536d vectors and the new
        collection is requested with 1536d as well.
        Expected: the new collection is created once and data is scrolled
        from the legacy collection and upserted.
        """
        from qdrant_client import models

        legacy_name = "lightrag_chunks"
        new_name = "lightrag_chunks_model_1536d"

        # Collection info returned for the legacy collection: 1536d, same as requested
        legacy_info = MagicMock()
        legacy_info.config.params.vectors.size = 1536

        qdrant = MagicMock()
        qdrant.collection_exists.side_effect = lambda name: name == legacy_name
        qdrant.get_collection.return_value = legacy_info

        # Flipped by the first upsert so that count() can report the new
        # collection as populated afterwards
        migrated = False

        def record_upsert(*args, **kwargs):
            nonlocal migrated
            migrated = True
            return MagicMock()

        qdrant.upsert.side_effect = record_upsert

        def fake_count(collection_name, **kwargs):
            # legacy: always 1 record; new: 0 before the upsert, 1 after;
            # any other collection: 0
            if collection_name == legacy_name:
                records = 1
            elif collection_name == new_name:
                records = 1 if migrated else 0
            else:
                records = 0
            outcome = MagicMock()
            outcome.count = records
            return outcome

        qdrant.count.side_effect = fake_count

        # scroll() yields a single point and no continuation offset
        point = MagicMock()
        point.id = "test_id"
        point.vector = [0.1] * 1536
        point.payload = {"id": "test"}
        qdrant.scroll.return_value = ([point], None)

        requested_vectors = models.VectorParams(
            size=1536, distance=models.Distance.COSINE
        )
        requested_hnsw = models.HnswConfigDiff(payload_m=16, m=0)

        with patch(
            "lightrag.kg.qdrant_impl._find_legacy_collection",
            return_value=legacy_name,
        ):
            QdrantVectorDBStorage.setup_collection(
                qdrant,
                new_name,
                namespace="chunks",
                workspace="test",
                vectors_config=requested_vectors,
                hnsw_config=requested_hnsw,
            )

        # Migration must have been attempted
        qdrant.create_collection.assert_called_once()
        qdrant.scroll.assert_called()
        qdrant.upsert.assert_called()
class TestPostgresDimensionMismatch:
    """Test suite for PostgreSQL dimension mismatch handling."""

    async def test_postgres_dimension_mismatch_skip_migration_metadata(self):
        """
        Test that PostgreSQL skips migration when dimensions don't match (via metadata).

        Scenario: Legacy table has 1536d vectors (reported by the pg_attribute
                  query), new model expects 3072d.
        Expected: Migration skipped, new empty table created, legacy preserved.
                  This test only asserts the first part: no INSERT INTO
                  statement is passed to db.execute.
        """
        # Setup mock database
        db = AsyncMock()

        # Mock table existence and dimension checks
        async def query_side_effect(query, params, **kwargs):
            if "information_schema.tables" in query:
                if params[0] == "LIGHTRAG_DOC_CHUNKS":  # legacy
                    return {"exists": True}
                elif params[0] == "LIGHTRAG_DOC_CHUNKS_model_3072d":  # new
                    return {"exists": False}
            elif "COUNT(*)" in query:
                return {"count": 100}  # Legacy has data
            elif "pg_attribute" in query:
                return {"vector_dim": 1536}  # Legacy has 1536d vectors
            return {}

        db.query.side_effect = query_side_effect
        db.execute = AsyncMock()
        db._create_vector_index = AsyncMock()

        # Call setup_table with 3072d (different from legacy 1536d)
        await PGVectorStorage.setup_table(
            db,
            "LIGHTRAG_DOC_CHUNKS_model_3072d",
            legacy_table_name="LIGHTRAG_DOC_CHUNKS",
            base_table="LIGHTRAG_DOC_CHUNKS",
            embedding_dim=3072,
            workspace="test",
        )

        # Verify migration was NOT attempted: a data copy would show up as an
        # INSERT INTO statement passed to db.execute.
        insert_calls = [
            call
            for call in db.execute.call_args_list
            if call[0][0] and "INSERT INTO" in call[0][0]
        ]
        assert (
            len(insert_calls) == 0
        ), "Migration should be skipped due to dimension mismatch"

    async def test_postgres_dimension_mismatch_skip_migration_sampling(self):
        """
        Test that PostgreSQL skips migration when dimensions don't match (via sampling).

        Scenario: Legacy table dimension detection fails via metadata
                  (pg_attribute query returns -1), falls back to vector
                  sampling, detects 1536d vs expected 3072d.
        Expected: Migration skipped, new empty table created, legacy preserved.
                  Asserted here as: a CREATE TABLE statement is executed and
                  no INSERT INTO statement is.
        """
        db = AsyncMock()

        # Mock table existence and dimension checks
        async def query_side_effect(query, params, **kwargs):
            if "information_schema.tables" in query:
                if params[0] == "LIGHTRAG_DOC_CHUNKS":  # legacy
                    return {"exists": True}
                elif params[0] == "LIGHTRAG_DOC_CHUNKS_model_3072d":  # new
                    return {"exists": False}
            elif "COUNT(*)" in query:
                return {"count": 100}  # Legacy has data
            elif "pg_attribute" in query:
                return {"vector_dim": -1}  # Metadata check fails
            elif "SELECT content_vector FROM" in query:
                # Return sample vector with 1536 dimensions
                return {"content_vector": [0.1] * 1536}
            return {}

        db.query.side_effect = query_side_effect
        db.execute = AsyncMock()
        db._create_vector_index = AsyncMock()

        # Call setup_table with 3072d (different from legacy 1536d)
        await PGVectorStorage.setup_table(
            db,
            "LIGHTRAG_DOC_CHUNKS_model_3072d",
            legacy_table_name="LIGHTRAG_DOC_CHUNKS",
            base_table="LIGHTRAG_DOC_CHUNKS",
            embedding_dim=3072,
            workspace="test",
        )

        # Verify new table was created
        create_table_calls = [
            call
            for call in db.execute.call_args_list
            if call[0][0] and "CREATE TABLE" in call[0][0]
        ]
        assert len(create_table_calls) > 0, "New table should be created"

        # Verify migration was NOT attempted
        insert_calls = [
            call
            for call in db.execute.call_args_list
            if call[0][0] and "INSERT INTO" in call[0][0]
        ]
        assert len(insert_calls) == 0, "Migration should be skipped"

    async def test_postgres_dimension_match_proceed_migration(self):
        """
        Test that PostgreSQL proceeds with migration when dimensions match.

        Scenario: Legacy table has 1536d vectors, new model also expects 1536d.
        Expected: Migration proceeds normally, i.e. the (patched)
                  _pg_migrate_workspace_data function is called exactly once.
        """
        db = AsyncMock()

        # Track migration state
        migration_done = {"value": False}

        # Define exactly 2 records for consistency
        mock_records = [
            {
                "id": "test1",
                "content_vector": [0.1] * 1536,
                "workspace": "test",
            },
            {
                "id": "test2",
                "content_vector": [0.2] * 1536,
                "workspace": "test",
            },
        ]

        async def query_side_effect(query, params, **kwargs):
            multirows = kwargs.get("multirows", False)
            if "information_schema.tables" in query:
                if params[0] == "LIGHTRAG_DOC_CHUNKS":  # legacy
                    return {"exists": True}
                elif params[0] == "LIGHTRAG_DOC_CHUNKS_model_1536d":  # new
                    return {"exists": False}
            elif "COUNT(*)" in query:
                # Return different counts based on table name in query and migration state
                if "LIGHTRAG_DOC_CHUNKS_model_1536d" in query:
                    # After migration: return migrated count, before: return 0
                    return {
                        "count": len(mock_records) if migration_done["value"] else 0
                    }
                # Legacy table always has 2 records (matching mock_records)
                return {"count": len(mock_records)}
            elif "pg_attribute" in query:
                return {"vector_dim": 1536}  # Legacy has matching 1536d
            elif "SELECT * FROM" in query and multirows:
                # Return sample data for migration (first batch)
                # Handle workspace filtering: params = [workspace, offset, limit]
                if "WHERE workspace" in query:
                    offset = params[1] if len(params) > 1 else 0
                else:
                    offset = params[0] if params else 0
                if offset == 0:  # First batch
                    return mock_records
                else:  # offset > 0
                    return []  # No more data
            return {}

        db.query.side_effect = query_side_effect

        # Mark the migration as done if setup_table ever routes work through
        # db._run_with_retry. The side effect must NOT delegate back to
        # db._run_with_retry: that attribute is the very mock this side effect
        # is installed on, so calling it from here would re-enter the side
        # effect and recurse until RecursionError.
        async def mock_run_with_retry(operation, *args, **kwargs):
            migration_done["value"] = True
            return MagicMock()

        db._run_with_retry.side_effect = mock_run_with_retry
        db.execute = AsyncMock()
        db._create_vector_index = AsyncMock()

        # Mock _pg_table_exists: only the legacy table exists
        async def mock_table_exists(db_inst, name):
            return name == "LIGHTRAG_DOC_CHUNKS"

        # Custom mock for _pg_migrate_workspace_data that updates migration_done
        async def mock_migrate_func(*args, **kwargs):
            # Set BEFORE returning so a later COUNT(*) on the new table
            # reports the migrated rows
            migration_done["value"] = True
            return len(mock_records)

        with (
            patch(
                "lightrag.kg.postgres_impl._pg_table_exists",
                side_effect=mock_table_exists,
            ),
            patch(
                "lightrag.kg.postgres_impl._pg_migrate_workspace_data",
                side_effect=mock_migrate_func,
            ) as mock_migrate,
        ):
            # Call setup_table with matching 1536d
            await PGVectorStorage.setup_table(
                db,
                "LIGHTRAG_DOC_CHUNKS_model_1536d",
                legacy_table_name="LIGHTRAG_DOC_CHUNKS",
                base_table="LIGHTRAG_DOC_CHUNKS",
                embedding_dim=1536,
                workspace="test",
            )

        # Verify migration function WAS called
        mock_migrate.assert_called_once()