Update PG mismatch tests to expect errors

- Assert DataMigrationError on mismatch
- Mock check_table_exists explicitly
- Return the sampled content_vector as a JSON string in the mocked query
- Assert that the error message includes dimension info
This commit is contained in:
yangdx
2025-12-21 18:54:17 +08:00
parent be744a28a7
commit afe3f3788a

View File

@@ -6,6 +6,7 @@ properly detect and handle vector dimension mismatches when migrating from
legacy collections/tables to new ones with different embedding models.
"""
import json
import pytest
from unittest.mock import MagicMock, AsyncMock, patch
@@ -172,28 +173,33 @@ class TestQdrantDimensionMismatch:
class TestPostgresDimensionMismatch:
"""Test suite for PostgreSQL dimension mismatch handling."""
async def test_postgres_dimension_mismatch_skip_migration_metadata(self):
async def test_postgres_dimension_mismatch_raises_error_metadata(self):
"""
Test that PostgreSQL skips migration when dimensions don't match (via metadata).
Test that PostgreSQL raises DataMigrationError when dimensions don't match.
Scenario: Legacy table has 1536d vectors (detected via pg_attribute),
new model expects 3072d.
Expected: Migration skipped, new empty table created, legacy preserved.
Scenario: Legacy table has 1536d vectors, new model expects 3072d.
Expected: DataMigrationError is raised to prevent data corruption.
"""
# Setup mock database
db = AsyncMock()
# Mock check_table_exists
async def mock_check_table_exists(table_name):
if table_name == "LIGHTRAG_DOC_CHUNKS": # legacy
return True
elif table_name == "LIGHTRAG_DOC_CHUNKS_model_3072d": # new
return False
return False
db.check_table_exists = AsyncMock(side_effect=mock_check_table_exists)
# Mock table existence and dimension checks
async def query_side_effect(query, params, **kwargs):
if "information_schema.tables" in query:
if params[0] == "LIGHTRAG_DOC_CHUNKS": # legacy
return {"exists": True}
elif params[0] == "LIGHTRAG_DOC_CHUNKS_model_3072d": # new
return {"exists": False}
elif "COUNT(*)" in query:
if "COUNT(*)" in query:
return {"count": 100} # Legacy has data
elif "pg_attribute" in query:
return {"vector_dim": 1536} # Legacy has 1536d vectors
elif "SELECT content_vector FROM" in query:
# Return sample vector with 1536 dimensions
return {"content_vector": [0.1] * 1536}
return {}
db.query.side_effect = query_side_effect
@@ -201,32 +207,25 @@ class TestPostgresDimensionMismatch:
db._create_vector_index = AsyncMock()
# Call setup_table with 3072d (different from legacy 1536d)
await PGVectorStorage.setup_table(
db,
"LIGHTRAG_DOC_CHUNKS_model_3072d",
legacy_table_name="LIGHTRAG_DOC_CHUNKS",
base_table="LIGHTRAG_DOC_CHUNKS",
embedding_dim=3072,
workspace="test",
)
# Should raise DataMigrationError due to dimension mismatch
with pytest.raises(DataMigrationError) as exc_info:
await PGVectorStorage.setup_table(
db,
"LIGHTRAG_DOC_CHUNKS_model_3072d",
legacy_table_name="LIGHTRAG_DOC_CHUNKS",
base_table="LIGHTRAG_DOC_CHUNKS",
embedding_dim=3072,
workspace="test",
)
# Verify migration was NOT attempted (no INSERT calls)
# Note: _pg_create_table is mocked, so we check INSERT calls to verify migration was skipped
insert_calls = [
call
for call in db.execute.call_args_list
if call[0][0] and "INSERT INTO" in call[0][0]
]
assert (
len(insert_calls) == 0
), "Migration should be skipped due to dimension mismatch"
# Verify error message contains dimension information
assert "3072" in str(exc_info.value) or "1536" in str(exc_info.value)
async def test_postgres_dimension_mismatch_skip_migration_sampling(self):
async def test_postgres_dimension_mismatch_raises_error_sampling(self):
"""
Test that PostgreSQL raises error when dimensions don't match (via sampling).
Scenario: Legacy table dimension detection fails via metadata,
falls back to vector sampling, detects 1536d vs expected 3072d.
Scenario: Legacy table vector sampling detects 1536d vs expected 3072d.
Expected: DataMigrationError is raised to prevent data corruption.
"""
db = AsyncMock()
@@ -250,11 +249,9 @@ class TestPostgresDimensionMismatch:
return {"exists": False}
elif "COUNT(*)" in query:
return {"count": 100} # Legacy has data
elif "pg_attribute" in query:
return {"vector_dim": 1536} # Legacy has 1536d vectors
elif "SELECT content_vector FROM" in query:
# Return sample vector with 1536 dimensions
return {"content_vector": [0.1] * 1536}
# Return sample vector with 1536 dimensions as a JSON string
return {"content_vector": json.dumps([0.1] * 1536)}
return {}
db.query.side_effect = query_side_effect