* Implement safe index name generation * Hash table names if index exceeds 63B * Fix index detection for long models * Define PG identifier limit constant * Add tests for index name safety
211 lines
8.1 KiB
Python
211 lines
8.1 KiB
Python
"""
|
|
Unit tests for PostgreSQL safe index name generation.
|
|
|
|
This module tests the _safe_index_name helper function which prevents
|
|
PostgreSQL's silent 63-byte identifier truncation from causing index
|
|
lookup failures.
|
|
"""
|
|
|
|
import pytest
|
|
|
|
# Mark all tests as offline (no external dependencies)
|
|
pytestmark = pytest.mark.offline
|
|
|
|
|
|
class TestSafeIndexName:
    """Test suite for the ``_safe_index_name`` helper.

    Every test imports the helper locally and checks one property of the
    generated index name: it is left alone when short, shortened when the
    naive ``idx_{table}_{suffix}`` form would exceed the identifier limit,
    deterministic, and measured in UTF-8 bytes rather than characters.
    """

    # Identifier length limit, in bytes, that generated names must respect.
    MAX_IDENTIFIER_BYTES = 63

    def test_short_name_unchanged(self):
        """Short index names should remain unchanged."""
        from lightrag.kg.postgres_impl import _safe_index_name

        # Short table name - should return unchanged
        result = _safe_index_name("lightrag_vdb_entity", "hnsw_cosine")
        assert result == "idx_lightrag_vdb_entity_hnsw_cosine"
        assert len(result.encode("utf-8")) <= self.MAX_IDENTIFIER_BYTES

    def test_long_name_gets_hashed(self):
        """Long table names exceeding 63 bytes should get hashed."""
        from lightrag.kg.postgres_impl import _safe_index_name

        long_table_name = "LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d"
        naive_name = f"idx_{long_table_name.lower()}_hnsw_cosine"

        # Precondition: the naive concatenation really is over the limit,
        # otherwise this test would not exercise the shortening path at all.
        assert len(naive_name.encode("utf-8")) > self.MAX_IDENTIFIER_BYTES

        result = _safe_index_name(long_table_name, "hnsw_cosine")

        # Should be within 63 bytes
        assert len(result.encode("utf-8")) <= self.MAX_IDENTIFIER_BYTES

        # Should start with idx_ prefix
        assert result.startswith("idx_")

        # Should contain the suffix
        assert result.endswith("_hnsw_cosine")

        # Should NOT be the naive concatenation (which would be truncated)
        assert result != naive_name

    def test_deterministic_output(self):
        """Same input should always produce same output (deterministic)."""
        from lightrag.kg.postgres_impl import _safe_index_name

        table_name = "LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d"
        suffix = "hnsw_cosine"

        result1 = _safe_index_name(table_name, suffix)
        result2 = _safe_index_name(table_name, suffix)

        assert result1 == result2

    def test_different_suffixes_different_results(self):
        """Different suffixes should produce different index names."""
        from lightrag.kg.postgres_impl import _safe_index_name

        table_name = "LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d"

        result1 = _safe_index_name(table_name, "hnsw_cosine")
        result2 = _safe_index_name(table_name, "ivfflat_cosine")

        assert result1 != result2

    def test_case_insensitive(self):
        """Table names should be normalized to lowercase."""
        from lightrag.kg.postgres_impl import _safe_index_name

        result_upper = _safe_index_name("LIGHTRAG_VDB_ENTITY", "hnsw_cosine")
        result_lower = _safe_index_name("lightrag_vdb_entity", "hnsw_cosine")

        assert result_upper == result_lower

    def test_boundary_case_exactly_63_bytes(self):
        """Test both sides of the 63-byte boundary."""
        from lightrag.kg.postgres_impl import _safe_index_name

        short_suffix = "id"

        # Exactly at the limit: idx_ (4) + 56 chars + _ (1) + id (2) = 63.
        # The name fits, so it must come back unchanged.
        table_56 = "a" * 56
        expected = f"idx_{table_56}_{short_suffix}"
        assert len(expected.encode("utf-8")) == self.MAX_IDENTIFIER_BYTES

        result = _safe_index_name(table_56, short_suffix)
        assert result == expected
        assert len(result.encode("utf-8")) == self.MAX_IDENTIFIER_BYTES

        # One byte over the limit: idx_ (4) + 57 chars + _ (1) + id (2) = 64.
        # The naive name no longer fits, so it must be replaced by a shorter one.
        table_57 = "a" * 57
        naive = f"idx_{table_57}_{short_suffix}"
        assert len(naive.encode("utf-8")) == self.MAX_IDENTIFIER_BYTES + 1

        shortened = _safe_index_name(table_57, short_suffix)
        assert shortened != naive
        assert len(shortened.encode("utf-8")) <= self.MAX_IDENTIFIER_BYTES

    def test_unicode_handling(self):
        """Unicode characters should be properly handled (bytes, not chars)."""
        from lightrag.kg.postgres_impl import _safe_index_name

        # Unicode characters can take more bytes than visible chars
        # Chinese characters are 3 bytes each in UTF-8
        table_name = "lightrag_测试_table"  # Contains Chinese chars
        result = _safe_index_name(table_name, "hnsw_cosine")

        # Should always be within 63 bytes
        assert len(result.encode("utf-8")) <= self.MAX_IDENTIFIER_BYTES

        # Discriminating case: 20 CJK characters make a naive name of only
        # 36 characters but 76 bytes. A length check based on characters
        # would wrongly let this through unchanged.
        wide_table = "测" * 20
        naive = f"idx_{wide_table}_hnsw_cosine"
        assert len(naive) <= self.MAX_IDENTIFIER_BYTES
        assert len(naive.encode("utf-8")) > self.MAX_IDENTIFIER_BYTES

        wide_result = _safe_index_name(wide_table, "hnsw_cosine")
        assert len(wide_result.encode("utf-8")) <= self.MAX_IDENTIFIER_BYTES

    def test_real_world_model_names(self):
        """Test with real-world embedding model names that cause issues."""
        from lightrag.kg.postgres_impl import _safe_index_name

        # These are actual model names that have caused issues
        test_cases = [
            ("LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d", "hnsw_cosine"),
            ("LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d", "hnsw_cosine"),
            ("LIGHTRAG_VDB_RELATION_text_embedding_3_large_3072d", "hnsw_cosine"),
            (
                "LIGHTRAG_VDB_ENTITY_bge_m3_1024d",
                "hnsw_cosine",
            ),  # Shorter model name
            (
                "LIGHTRAG_VDB_CHUNKS_nomic_embed_text_v1_768d",
                "ivfflat_cosine",
            ),  # Different index type
        ]

        for table_name, suffix in test_cases:
            result = _safe_index_name(table_name, suffix)

            # Critical: must be within PostgreSQL's 63-byte limit
            assert (
                len(result.encode("utf-8")) <= self.MAX_IDENTIFIER_BYTES
            ), f"Index name too long: {result} for table {table_name}"

            # Must have consistent format
            assert result.startswith("idx_"), f"Missing idx_ prefix: {result}"
            assert result.endswith(f"_{suffix}"), f"Missing suffix {suffix}: {result}"

    def test_hash_uniqueness_for_similar_tables(self):
        """Similar but different table names should produce different hashes."""
        from lightrag.kg.postgres_impl import _safe_index_name

        # Similar names that are short enough to be kept as-is.
        tables = [
            "LIGHTRAG_VDB_CHUNKS_model_a_1024d",
            "LIGHTRAG_VDB_CHUNKS_model_b_1024d",
            "LIGHTRAG_VDB_ENTITY_model_a_1024d",
        ]

        results = [_safe_index_name(t, "hnsw_cosine") for t in tables]

        # All results should be unique
        assert len(set(results)) == len(results), "Hash collision detected!"

        # Similar names whose naive index name is over the limit, so the
        # shortened form is what actually gets compared for uniqueness.
        long_tables = [
            "LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d",
            "LIGHTRAG_VDB_CHUNKS_text_embedding_3_small_3072d",
            "LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d",
        ]
        for t in long_tables:
            naive = f"idx_{t.lower()}_hnsw_cosine"
            assert len(naive.encode("utf-8")) > self.MAX_IDENTIFIER_BYTES

        long_results = [_safe_index_name(t, "hnsw_cosine") for t in long_tables]

        assert len(set(long_results)) == len(long_results), "Hash collision detected!"
|
|
|
|
|
|
class TestIndexNameIntegration:
    """Checks on how generated index names behave in typical usage patterns."""

    def test_pg_indexes_lookup_compatibility(self):
        """
        A generated index name must be usable for a pg_indexes lookup.

        The name is cut to its first 63 UTF-8 bytes, mimicking what would be
        stored after truncation. If the generated name already fits, the cut
        is a no-op and the stored name equals the name we would look up.
        """
        from lightrag.kg.postgres_impl import _safe_index_name

        index_name = _safe_index_name(
            "LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d",
            "hnsw_cosine",
        )

        # Emulate the 63-byte cut; a partially cut multi-byte character
        # is dropped rather than raising.
        encoded = index_name.encode("utf-8")
        stored_name = encoded[:63].decode("utf-8", errors="ignore")

        # Equality here means nothing was cut off.
        assert (
            index_name == stored_name
        ), "Index name would be truncated by PostgreSQL, causing lookup failures!"

    def test_backward_compatibility_short_names(self):
        """
        Legacy tables without a model suffix must keep their index names.

        For each legacy table and suffix, the plain
        ``idx_{table.lower()}_{suffix}`` form is expected back whenever that
        form fits within 63 bytes.
        """
        from lightrag.kg.postgres_impl import _safe_index_name

        suffixes = ("hnsw_cosine", "ivfflat_cosine", "id")
        legacy_tables = (
            "LIGHTRAG_VDB_ENTITY",
            "LIGHTRAG_VDB_RELATION",
            "LIGHTRAG_VDB_CHUNKS",
        )

        # Same visiting order as nested loops: tables outer, suffixes inner.
        combos = [(tbl, sfx) for tbl in legacy_tables for sfx in suffixes]

        for table, suffix in combos:
            result = _safe_index_name(table, suffix)
            expected = f"idx_{table.lower()}_{suffix}"

            # Only names that fit are required to be unchanged.
            if len(expected.encode("utf-8")) > 63:
                continue

            assert (
                result == expected
            ), f"Short name changed unexpectedly: {result} != {expected}"
|