Get max source Id config from .env and lightRAG init
This commit is contained in:
@@ -73,8 +73,8 @@ ENABLE_LLM_CACHE=true
|
||||
# MAX_RELATION_TOKENS=8000
|
||||
### control the maximum tokens sent to the LLM (including entities, relations and chunks)
|
||||
# MAX_TOTAL_TOKENS=30000
|
||||
### control the maximum chunk_ids stored in vector db
|
||||
# MAX_CHUNK_IDS_PER_ENTITY=500
|
||||
### control the maximum number of source (chunk) ids stored per entity (applies to both graph and vector DBs)
|
||||
# MAX_SOURCE_IDS_PER_ENTITY=500
|
||||
|
||||
### maximum number of related chunks per source entity or relation
|
||||
### The chunk picker uses this value to determine the total number of chunks selected from the KG (knowledge graph)
|
||||
|
||||
@@ -13,7 +13,7 @@ DEFAULT_MAX_GRAPH_NODES = 1000
|
||||
# Default values for extraction settings
|
||||
DEFAULT_SUMMARY_LANGUAGE = "English" # Default language for document processing
|
||||
DEFAULT_MAX_GLEANING = 1
|
||||
DEFAULT_MAX_CHUNK_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs
|
||||
DEFAULT_MAX_SOURCE_IDS_PER_ENTITY = 500 # Applies to Both Graph + Vector DBs
|
||||
|
||||
# Number of description fragments to trigger LLM summary
|
||||
DEFAULT_FORCE_LLM_SUMMARY_ON_MERGE = 8
|
||||
|
||||
@@ -39,6 +39,7 @@ from lightrag.constants import (
|
||||
DEFAULT_MAX_ASYNC,
|
||||
DEFAULT_MAX_PARALLEL_INSERT,
|
||||
DEFAULT_MAX_GRAPH_NODES,
|
||||
DEFAULT_MAX_SOURCE_IDS_PER_ENTITY,
|
||||
DEFAULT_ENTITY_TYPES,
|
||||
DEFAULT_SUMMARY_LANGUAGE,
|
||||
DEFAULT_LLM_TIMEOUT,
|
||||
@@ -359,6 +360,11 @@ class LightRAG:
|
||||
)
|
||||
"""Maximum number of graph nodes to return in knowledge graph queries."""
|
||||
|
||||
max_source_ids_per_entity: int = field(
|
||||
default=get_env_value("MAX_SOURCE_IDS_PER_ENTITY", DEFAULT_MAX_SOURCE_IDS_PER_ENTITY, int)
|
||||
)
|
||||
"""Maximum number of source (chunk) ids in entity Graph + VDB."""
|
||||
|
||||
addon_params: dict[str, Any] = field(
|
||||
default_factory=lambda: {
|
||||
"language": get_env_value(
|
||||
|
||||
@@ -1374,7 +1374,7 @@ async def _merge_nodes_then_upsert(
|
||||
|
||||
merged_source_ids: set = set([dp["source_id"] for dp in nodes_data] + already_source_ids)
|
||||
|
||||
source_ids = truncate_entity_source_id(merged_source_ids, entity_name)
|
||||
source_ids = truncate_entity_source_id(merged_source_ids, entity_name, global_config)
|
||||
source_id = GRAPH_FIELD_SEP.join(source_ids)
|
||||
|
||||
file_path = build_file_path(already_file_paths, nodes_data, entity_name)
|
||||
@@ -1661,7 +1661,7 @@ async def merge_nodes_and_edges(
|
||||
[entity_name], namespace=namespace, enable_logging=False
|
||||
):
|
||||
try:
|
||||
logger.info(f"Inserting {entity_name} in Graph")
|
||||
logger.debug(f"Inserting {entity_name} in Graph")
|
||||
# Graph database operation (critical path, must succeed)
|
||||
entity_data = await _merge_nodes_then_upsert(
|
||||
entity_name,
|
||||
@@ -1690,7 +1690,7 @@ async def merge_nodes_and_edges(
|
||||
}
|
||||
|
||||
|
||||
logger.info(f"Inserting {entity_name} in Graph")
|
||||
logger.debug(f"Inserting {entity_name} in Graph")
|
||||
# Use safe operation wrapper - VDB failure must throw exception
|
||||
await safe_vdb_operation_with_exception(
|
||||
operation=lambda: entity_vdb.upsert(data_for_vdb),
|
||||
|
||||
@@ -26,7 +26,6 @@ from lightrag.constants import (
|
||||
GRAPH_FIELD_SEP,
|
||||
DEFAULT_MAX_TOTAL_TOKENS,
|
||||
DEFAULT_MAX_FILE_PATH_LENGTH,
|
||||
DEFAULT_MAX_CHUNK_IDS_PER_ENTITY,
|
||||
)
|
||||
|
||||
# Initialize logger with basic configuration
|
||||
@@ -2465,23 +2464,25 @@ async def process_chunks_unified(
|
||||
|
||||
return final_chunks
|
||||
|
||||
def truncate_entity_source_id(chunk_ids: set, entity_name: str) -> set:
|
||||
def truncate_entity_source_id(chunk_ids: set, entity_name: str, global_config: dict) -> set:
|
||||
"""Limit chunk_ids for entities that appear a huge number of times (so as not to break VDB hard upper limits)"""
|
||||
already_len: int = len(chunk_ids)
|
||||
|
||||
max_chunk_ids_per_entity = get_env_value("MAX_CHUNK_IDS_PER_ENTITY", DEFAULT_MAX_CHUNK_IDS_PER_ENTITY, int)
|
||||
max_chunk_ids_per_entity = global_config["max_source_ids_per_entity"]
|
||||
|
||||
if already_len <= max_chunk_ids_per_entity:
|
||||
return chunk_ids
|
||||
|
||||
logger.warning(
|
||||
f"Source Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, "
|
||||
f"current size: {already_len}, truncating..."
|
||||
)
|
||||
|
||||
if already_len >= max_chunk_ids_per_entity:
|
||||
logger.warning(
|
||||
f"Chunk Ids already exceeds {max_chunk_ids_per_entity } for {entity_name}, "
|
||||
f"current size: {already_len} entries."
|
||||
)
|
||||
|
||||
truncated_chunk_ids = set(list(chunk_ids)[0:max_chunk_ids_per_entity ])
|
||||
|
||||
return truncated_chunk_ids
|
||||
|
||||
|
||||
|
||||
def build_file_path(already_file_paths, data_list, target):
|
||||
"""Build file path string with UTF-8 byte length limit and deduplication
|
||||
|
||||
|
||||
Reference in New Issue
Block a user