Fix PostgreSQL index lookup failure for long table names

* Implement safe index name generation * Hash table names if index exceeds 63B * Fix index detection for long models * Define PG identifier limit constant * Add tests for index name safety
2025-12-20 05:40:59 +08:00
parent e12dfdb9e0
commit 1aa4a3a385
2 changed files with 252 additions and 3 deletions
--- a/lightrag/kg/postgres_impl.py
+++ b/lightrag/kg/postgres_impl.py
@@ -1,4 +1,5 @@
 import asyncio
+import hashlib
 import json
 import os
 import re
@@ -56,6 +57,42 @@ load_dotenv(dotenv_path=".env", override=False)

 T = TypeVar("T")

+# PostgreSQL identifier length limit (in bytes)
+PG_MAX_IDENTIFIER_LENGTH = 63
+
+
+def _safe_index_name(table_name: str, index_suffix: str) -> str:
+    """
+    Generate a PostgreSQL-safe index name that won't be truncated.
+
+    PostgreSQL silently truncates identifiers to 63 bytes. This function
+    ensures index names stay within that limit by hashing long table names.
+
+    Args:
+        table_name: The table name (may be long with model suffix)
+        index_suffix: The index type suffix (e.g., 'hnsw_cosine', 'id', 'workspace_id')
+
+    Returns:
+        A deterministic index name that fits within 63 bytes
+    """
+    # Construct the full index name
+    full_name = f"idx_{table_name.lower()}_{index_suffix}"
+
+    # If it fits within the limit, use it as-is
+    if len(full_name.encode("utf-8")) <= PG_MAX_IDENTIFIER_LENGTH:
+        return full_name
+
+    # Otherwise, hash the table name to create a shorter unique identifier
+    # Keep 'idx_' prefix and suffix readable, hash the middle
+    hash_input = table_name.lower().encode("utf-8")
+    table_hash = hashlib.md5(hash_input).hexdigest()[:12]  # 12 hex chars
+
+    # Format: idx_{hash}_{suffix} - guaranteed to fit
+    # Maximum: idx_ (4) + hash (12) + _ (1) + suffix (variable) = 17 + suffix
+    shortened_name = f"idx_{table_hash}_{index_suffix}"
+
+    return shortened_name
+

 class PostgreSQLDB:
    def __init__(self, config: dict[str, Any], **kwargs: Any):
@@ -1435,7 +1472,9 @@ class PostgreSQLDB:
            return

        k = table_name
-        vector_index_name = f"idx_{k.lower()}_{self.vector_index_type.lower()}_cosine"
+        # Use _safe_index_name to avoid PostgreSQL's 63-byte identifier truncation
+        index_suffix = f"{self.vector_index_type.lower()}_cosine"
+        vector_index_name = _safe_index_name(k, index_suffix)
        check_vector_index_sql = f"""
            SELECT 1 FROM pg_indexes
            WHERE indexname = '{vector_index_name}' AND tablename = '{k.lower()}'
@@ -2378,9 +2417,9 @@ class PGVectorStorage(BaseVectorStorage):
        logger.info(f"PostgreSQL table name: {self.table_name}")

        # Validate table name length (PostgreSQL identifier limit is 63 characters)
-        if len(self.table_name) > 63:
+        if len(self.table_name) > PG_MAX_IDENTIFIER_LENGTH:
            raise ValueError(
-                f"PostgreSQL table name exceeds 63 character limit: '{self.table_name}' "
+                f"PostgreSQL table name exceeds {PG_MAX_IDENTIFIER_LENGTH} character limit: '{self.table_name}' "
                f"(length: {len(self.table_name)}). "
                f"Consider using a shorter embedding model name or workspace name."
            )
--- a/tests/test_postgres_index_name.py
+++ b/tests/test_postgres_index_name.py
@@ -0,0 +1,210 @@
+"""
+Unit tests for PostgreSQL safe index name generation.
+
+This module tests the _safe_index_name helper function which prevents
+PostgreSQL's silent 63-byte identifier truncation from causing index
+lookup failures.
+"""
+
+import pytest
+
+# Mark all tests as offline (no external dependencies)
+pytestmark = pytest.mark.offline
+
+
+class TestSafeIndexName:
+    """Test suite for _safe_index_name function."""
+
+    def test_short_name_unchanged(self):
+        """Short index names should remain unchanged."""
+        from lightrag.kg.postgres_impl import _safe_index_name
+
+        # Short table name - should return unchanged
+        result = _safe_index_name("lightrag_vdb_entity", "hnsw_cosine")
+        assert result == "idx_lightrag_vdb_entity_hnsw_cosine"
+        assert len(result.encode("utf-8")) <= 63
+
+    def test_long_name_gets_hashed(self):
+        """Long table names exceeding 63 bytes should get hashed."""
+        from lightrag.kg.postgres_impl import _safe_index_name
+
+        # Long table name that would exceed 63 bytes
+        long_table_name = "LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d"
+        result = _safe_index_name(long_table_name, "hnsw_cosine")
+
+        # Should be within 63 bytes
+        assert len(result.encode("utf-8")) <= 63
+
+        # Should start with idx_ prefix
+        assert result.startswith("idx_")
+
+        # Should contain the suffix
+        assert result.endswith("_hnsw_cosine")
+
+        # Should NOT be the naive concatenation (which would be truncated)
+        naive_name = f"idx_{long_table_name.lower()}_hnsw_cosine"
+        assert result != naive_name
+
+    def test_deterministic_output(self):
+        """Same input should always produce same output (deterministic)."""
+        from lightrag.kg.postgres_impl import _safe_index_name
+
+        table_name = "LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d"
+        suffix = "hnsw_cosine"
+
+        result1 = _safe_index_name(table_name, suffix)
+        result2 = _safe_index_name(table_name, suffix)
+
+        assert result1 == result2
+
+    def test_different_suffixes_different_results(self):
+        """Different suffixes should produce different index names."""
+        from lightrag.kg.postgres_impl import _safe_index_name
+
+        table_name = "LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d"
+
+        result1 = _safe_index_name(table_name, "hnsw_cosine")
+        result2 = _safe_index_name(table_name, "ivfflat_cosine")
+
+        assert result1 != result2
+
+    def test_case_insensitive(self):
+        """Table names should be normalized to lowercase."""
+        from lightrag.kg.postgres_impl import _safe_index_name
+
+        result_upper = _safe_index_name("LIGHTRAG_VDB_ENTITY", "hnsw_cosine")
+        result_lower = _safe_index_name("lightrag_vdb_entity", "hnsw_cosine")
+
+        assert result_upper == result_lower
+
+    def test_boundary_case_exactly_63_bytes(self):
+        """Test boundary case where name is exactly at 63-byte limit."""
+        from lightrag.kg.postgres_impl import _safe_index_name
+
+        # Create a table name that results in exactly 63 bytes
+        # idx_ (4) + table_name + _ (1) + suffix = 63
+        # So table_name + suffix = 58
+
+        # Test a name that's just under the limit (should remain unchanged)
+        short_suffix = "id"
+        # idx_ (4) + 56 chars + _ (1) + id (2) = 63
+        table_56 = "a" * 56
+        result = _safe_index_name(table_56, short_suffix)
+        expected = f"idx_{table_56}_{short_suffix}"
+        assert result == expected
+        assert len(result.encode("utf-8")) == 63
+
+    def test_unicode_handling(self):
+        """Unicode characters should be properly handled (bytes, not chars)."""
+        from lightrag.kg.postgres_impl import _safe_index_name
+
+        # Unicode characters can take more bytes than visible chars
+        # Chinese characters are 3 bytes each in UTF-8
+        table_name = "lightrag_测试_table"  # Contains Chinese chars
+        result = _safe_index_name(table_name, "hnsw_cosine")
+
+        # Should always be within 63 bytes
+        assert len(result.encode("utf-8")) <= 63
+
+    def test_real_world_model_names(self):
+        """Test with real-world embedding model names that cause issues."""
+        from lightrag.kg.postgres_impl import _safe_index_name
+
+        # These are actual model names that have caused issues
+        test_cases = [
+            ("LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d", "hnsw_cosine"),
+            ("LIGHTRAG_VDB_ENTITY_text_embedding_3_large_3072d", "hnsw_cosine"),
+            ("LIGHTRAG_VDB_RELATION_text_embedding_3_large_3072d", "hnsw_cosine"),
+            (
+                "LIGHTRAG_VDB_ENTITY_bge_m3_1024d",
+                "hnsw_cosine",
+            ),  # Shorter model name
+            (
+                "LIGHTRAG_VDB_CHUNKS_nomic_embed_text_v1_768d",
+                "ivfflat_cosine",
+            ),  # Different index type
+        ]
+
+        for table_name, suffix in test_cases:
+            result = _safe_index_name(table_name, suffix)
+
+            # Critical: must be within PostgreSQL's 63-byte limit
+            assert (
+                len(result.encode("utf-8")) <= 63
+            ), f"Index name too long: {result} for table {table_name}"
+
+            # Must have consistent format
+            assert result.startswith("idx_"), f"Missing idx_ prefix: {result}"
+            assert result.endswith(f"_{suffix}"), f"Missing suffix {suffix}: {result}"
+
+    def test_hash_uniqueness_for_similar_tables(self):
+        """Similar but different table names should produce different hashes."""
+        from lightrag.kg.postgres_impl import _safe_index_name
+
+        # These tables have similar names but should have different hashes
+        tables = [
+            "LIGHTRAG_VDB_CHUNKS_model_a_1024d",
+            "LIGHTRAG_VDB_CHUNKS_model_b_1024d",
+            "LIGHTRAG_VDB_ENTITY_model_a_1024d",
+        ]
+
+        results = [_safe_index_name(t, "hnsw_cosine") for t in tables]
+
+        # All results should be unique
+        assert len(set(results)) == len(results), "Hash collision detected!"
+
+
+class TestIndexNameIntegration:
+    """Integration-style tests for index name usage patterns."""
+
+    def test_pg_indexes_lookup_compatibility(self):
+        """
+        Test that the generated index name will work with pg_indexes lookup.
+
+        This is the core problem: PostgreSQL stores the truncated name,
+        but we were looking up the untruncated name. Our fix ensures we
+        always use a name that fits within 63 bytes.
+        """
+        from lightrag.kg.postgres_impl import _safe_index_name
+
+        table_name = "LIGHTRAG_VDB_CHUNKS_text_embedding_3_large_3072d"
+        suffix = "hnsw_cosine"
+
+        # Generate the index name
+        index_name = _safe_index_name(table_name, suffix)
+
+        # Simulate what PostgreSQL would store (truncate at 63 bytes)
+        stored_name = index_name.encode("utf-8")[:63].decode("utf-8", errors="ignore")
+
+        # The key fix: our generated name should equal the stored name
+        # because it's already within the 63-byte limit
+        assert (
+            index_name == stored_name
+        ), "Index name would be truncated by PostgreSQL, causing lookup failures!"
+
+    def test_backward_compatibility_short_names(self):
+        """
+        Ensure backward compatibility with existing short index names.
+
+        For tables that have existing indexes with short names (pre-model-suffix era),
+        the function should not change their names.
+        """
+        from lightrag.kg.postgres_impl import _safe_index_name
+
+        # Legacy table names without model suffix
+        legacy_tables = [
+            "LIGHTRAG_VDB_ENTITY",
+            "LIGHTRAG_VDB_RELATION",
+            "LIGHTRAG_VDB_CHUNKS",
+        ]
+
+        for table in legacy_tables:
+            for suffix in ["hnsw_cosine", "ivfflat_cosine", "id"]:
+                result = _safe_index_name(table, suffix)
+                expected = f"idx_{table.lower()}_{suffix}"
+
+                # Short names should remain unchanged for backward compatibility
+                if len(expected.encode("utf-8")) <= 63:
+                    assert (
+                        result == expected
+                    ), f"Short name changed unexpectedly: {result} != {expected}"