Improve CJK detection and safely drop Neo4j indexes

- Expand CJK regex to extensions A-F
- Use DROP INDEX IF EXISTS
- Add cleanup in multi-workspace test
- Safely handle legacy index drops
This commit is contained in:
yangdx
2025-12-22 00:19:37 +08:00
parent a6c365e81d
commit 2a02b69e1d
2 changed files with 27 additions and 7 deletions

View File

@@ -101,9 +101,18 @@ class Neo4JStorage(BaseGraphStorage):
return f"entity_id_fulltext_idx_{suffix}"
def _is_chinese_text(self, text: str) -> bool:
"""Check if text contains Chinese characters."""
chinese_pattern = re.compile(r"[\u4e00-\u9fff]+")
return bool(chinese_pattern.search(text))
"""Check if text contains Chinese/CJK characters.
Covers:
- CJK Unified Ideographs (U+4E00-U+9FFF)
- CJK Extension A (U+3400-U+4DBF)
- CJK Compatibility Ideographs (U+F900-U+FAFF)
- CJK Extension B-F (U+20000-U+2FA1F) - supplementary planes
"""
cjk_pattern = re.compile(
r"[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]|[\U00020000-\U0002fa1f]"
)
return bool(cjk_pattern.search(text))
async def initialize(self):
async with get_data_init_lock():
@@ -288,8 +297,8 @@ class Neo4JStorage(BaseGraphStorage):
f"[{self.workspace}] Found legacy index '{legacy_index_name}'. Migrating to '{index_name}'."
)
try:
# Drop the legacy index
drop_query = f"DROP INDEX {legacy_index_name}"
# Drop the legacy index (use IF EXISTS for safety)
drop_query = f"DROP INDEX {legacy_index_name} IF EXISTS"
result = await session.run(drop_query)
await result.consume()
logger.info(
@@ -329,10 +338,10 @@ class Neo4JStorage(BaseGraphStorage):
needs_creation = existing_index is None
if needs_recreation or needs_creation:
# Drop existing index if it needs recreation
# Drop existing index if it needs recreation (use IF EXISTS for safety)
if needs_recreation:
try:
drop_query = f"DROP INDEX {index_name}"
drop_query = f"DROP INDEX {index_name} IF EXISTS"
result = await session.run(drop_query)
await result.consume()
logger.info(

View File

@@ -294,6 +294,17 @@ async def test_multiple_workspaces_have_separate_indexes(neo4j_storage):
), f"Workspace 2 index '{workspace2_index}' should exist"
finally:
# Clean up: drop the fulltext index created for workspace 2 to prevent accumulation
try:
async with storage2._driver.session(database=storage2._DATABASE) as session:
index_name = storage2._get_fulltext_index_name(
storage2._get_workspace_label()
)
drop_query = f"DROP INDEX {index_name} IF EXISTS"
result = await session.run(drop_query)
await result.consume()
except Exception:
pass # Ignore errors during cleanup
await storage2.drop()
await storage2.finalize()