refactor: move document deps to api group, remove dynamic imports
- Merge offline-docs into api extras
- Remove pipmaster dynamic installs
- Add async document processing
- Pre-check docling availability
- Update offline deployment docs
@@ -23,10 +23,11 @@ LightRAG uses dynamic package installation (`pipmaster`) for optional features b
 LightRAG dynamically installs packages for:

 - **Document Processing**: `docling`, `pypdf2`, `python-docx`, `python-pptx`, `openpyxl`
 - **Storage Backends**: `redis`, `neo4j`, `pymilvus`, `pymongo`, `asyncpg`, `qdrant-client`
 - **LLM Providers**: `openai`, `anthropic`, `ollama`, `zhipuai`, `aioboto3`, `voyageai`, `llama-index`, `lmdeploy`, `transformers`, `torch`
-- Tiktoken Models**: BPE encoding models downloaded from OpenAI CDN
+- **Tiktoken Models**: BPE encoding models downloaded from OpenAI CDN

+**Note**: Document processing dependencies (`pypdf`, `python-docx`, `python-pptx`, `openpyxl`) are now pre-installed with the `api` extras group and no longer require dynamic installation.

 ## Quick Start

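The note above says the document-processing packages now ship with the `api` extras group. A minimal sketch (not part of this commit) for confirming they resolve without any runtime installer, assuming a standard `pip install lightrag-hku[api]` environment:

```python
# Hedged illustration: confirm the pre-installed document-processing
# dependencies are importable, with no pipmaster-style dynamic install.
import importlib.util

# Module names are assumptions based on the packages listed in the note.
required = ["pypdf", "docx", "pptx", "openpyxl", "Crypto"]  # Crypto == pycryptodome
missing = [name for name in required if importlib.util.find_spec(name) is None]
if missing:
    raise SystemExit(f"Missing document-processing modules: {missing}")
print("All document-processing dependencies are available.")
```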
@@ -75,32 +76,31 @@ LightRAG provides flexible dependency groups for different use cases:

 | Group | Description | Use Case |
 |-------|-------------|----------|
-| `offline-docs` | Document processing | PDF, DOCX, PPTX, XLSX files |
+| `api` | API server + document processing | FastAPI server with PDF, DOCX, PPTX, XLSX support |
 | `offline-storage` | Storage backends | Redis, Neo4j, MongoDB, PostgreSQL, etc. |
 | `offline-llm` | LLM providers | OpenAI, Anthropic, Ollama, etc. |
-| `offline` | All of the above | Complete offline deployment |
+| `offline` | Complete offline package | API + Storage + LLM (all features) |

+**Note**: Document processing (PDF, DOCX, PPTX, XLSX) is included in the `api` extras group. The previous `offline-docs` group has been merged into `api` for better integration.

 > Software packages requiring `transformers`, `torch`, or `cuda` will not be included in the offline dependency group.

 ### Installation Examples

 ```bash
-# Install only document processing dependencies
-pip install lightrag-hku[offline-docs]
+# Install API with document processing
+pip install lightrag-hku[api]

-# Install document processing and storage backends
-pip install lightrag-hku[offline-docs,offline-storage]
+# Install API and storage backends
+pip install lightrag-hku[api,offline-storage]

-# Install all offline dependencies
+# Install all offline dependencies (recommended for offline deployment)
 pip install lightrag-hku[offline]
 ```

 ### Using Individual Requirements Files

 ```bash
-# Document processing only
-pip install -r requirements-offline-docs.txt

 # Storage backends only
 pip install -r requirements-offline-storage.txt

@@ -244,8 +244,8 @@ ls -la ~/.tiktoken_cache/
 **Solution**:
 ```bash
 # Pre-install the specific package you need
-# For document processing:
-pip install lightrag-hku[offline-docs]
+# For API with document processing:
+pip install lightrag-hku[api]

 # For storage backends:
 pip install lightrag-hku[offline-storage]
@@ -297,9 +297,9 @@ mkdir -p ~/my_tiktoken_cache

 5. **Minimal Installation**: Only install what you need:
 ```bash
-# If you only process PDFs with OpenAI
-pip install lightrag-hku[offline-docs]
-# Then manually add: pip install openai
+# If you only need API with document processing
+pip install lightrag-hku[api]
+# Then manually add specific LLM: pip install openai
 ```

 ## Additional Resources

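As a companion to the minimal-installation tip above, a hedged sketch (not code from this repository) of failing fast with a clear message when the manually added LLM package is missing:

```python
# Hedged illustration: guard an optional LLM provider chosen under a
# minimal install ("pip install lightrag-hku[api]" plus one provider).
try:
    import openai  # only present if you ran: pip install openai
except ImportError as exc:
    raise RuntimeError(
        "LLM provider 'openai' is not installed. "
        "Add it with: pip install openai"
    ) from exc

print("openai client library version:", openai.__version__)
```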
@@ -7,10 +7,10 @@ from lightrag.utils import logger, get_pinyin_sort_key
 import aiofiles
 import shutil
 import traceback
-import pipmaster as pm
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Dict, List, Optional, Any, Literal
+from io import BytesIO
 from fastapi import (
     APIRouter,
     BackgroundTasks,
@@ -27,6 +27,20 @@ from lightrag.utils import generate_track_id
 from lightrag.api.utils_api import get_combined_auth_dependency
 from ..config import global_args

+# Check docling availability at module load time
+DOCLING_AVAILABLE = False
+try:
+    import docling  # noqa: F401 # type: ignore[import-not-found]
+
+    DOCLING_AVAILABLE = True
+except ImportError:
+    if global_args.document_loading_engine == "DOCLING":
+        logger.warning(
+            "DOCLING engine requested but 'docling' package not installed. "
+            "Falling back to standard document processing. "
+            "To use DOCLING, install with: pip install lightrag-hku[api,docling]"
+        )
+

 # Function to format datetime to ISO format string with timezone information
 def format_datetime(dt: Any) -> Optional[str]:
@@ -879,7 +893,6 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str
     Returns:
         str: Unique filename (may have numeric suffix added)
     """
-    from pathlib import Path
     import time

     original_path = Path(original_name)
@@ -902,6 +915,122 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str
     return f"{base_name}_{timestamp}{extension}"


+# Document processing helper functions (synchronous)
+# These functions run in thread pool via asyncio.to_thread() to avoid blocking the event loop
+
+
+def _convert_with_docling(file_path: Path) -> str:
+    """Convert document using docling (synchronous).
+
+    Args:
+        file_path: Path to the document file
+
+    Returns:
+        str: Extracted markdown content
+    """
+    from docling.document_converter import DocumentConverter  # type: ignore
+
+    converter = DocumentConverter()
+    result = converter.convert(file_path)
+    return result.document.export_to_markdown()
+
+
+def _extract_pdf_pypdf(file_bytes: bytes, password: str = None) -> str:
+    """Extract PDF content using pypdf (synchronous).
+
+    Args:
+        file_bytes: PDF file content as bytes
+        password: Optional password for encrypted PDFs
+
+    Returns:
+        str: Extracted text content
+
+    Raises:
+        Exception: If PDF is encrypted and password is incorrect or missing
+    """
+    from pypdf import PdfReader  # type: ignore
+
+    pdf_file = BytesIO(file_bytes)
+    reader = PdfReader(pdf_file)
+
+    # Check if PDF is encrypted
+    if reader.is_encrypted:
+        if not password:
+            raise Exception("PDF is encrypted but no password provided")
+
+        decrypt_result = reader.decrypt(password)
+        if decrypt_result == 0:
+            raise Exception("Incorrect PDF password")
+
+    # Extract text from all pages
+    content = ""
+    for page in reader.pages:
+        content += page.extract_text() + "\n"
+
+    return content
+
+
+def _extract_docx(file_bytes: bytes) -> str:
+    """Extract DOCX content (synchronous).
+
+    Args:
+        file_bytes: DOCX file content as bytes
+
+    Returns:
+        str: Extracted text content
+    """
+    from docx import Document  # type: ignore
+
+    docx_file = BytesIO(file_bytes)
+    doc = Document(docx_file)
+    return "\n".join([paragraph.text for paragraph in doc.paragraphs])
+
+
+def _extract_pptx(file_bytes: bytes) -> str:
+    """Extract PPTX content (synchronous).
+
+    Args:
+        file_bytes: PPTX file content as bytes
+
+    Returns:
+        str: Extracted text content
+    """
+    from pptx import Presentation  # type: ignore
+
+    pptx_file = BytesIO(file_bytes)
+    prs = Presentation(pptx_file)
+    content = ""
+    for slide in prs.slides:
+        for shape in slide.shapes:
+            if hasattr(shape, "text"):
+                content += shape.text + "\n"
+    return content
+
+
+def _extract_xlsx(file_bytes: bytes) -> str:
+    """Extract XLSX content (synchronous).
+
+    Args:
+        file_bytes: XLSX file content as bytes
+
+    Returns:
+        str: Extracted text content
+    """
+    from openpyxl import load_workbook  # type: ignore
+
+    xlsx_file = BytesIO(file_bytes)
+    wb = load_workbook(xlsx_file)
+    content = ""
+    for sheet in wb:
+        content += f"Sheet: {sheet.title}\n"
+        for row in sheet.iter_rows(values_only=True):
+            content += (
+                "\t".join(str(cell) if cell is not None else "" for cell in row) + "\n"
+            )
+        content += "\n"
+    return content
+
+
 async def pipeline_enqueue_file(
     rag: LightRAG, file_path: Path, track_id: str = None
 ) -> tuple[bool, str]:
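The helpers added above are deliberately synchronous; the route code in the hunks below offloads them with `asyncio.to_thread` so extraction work does not block the event loop. A minimal, self-contained sketch of that pattern (illustrative only, not code from this commit):

```python
import asyncio
import time


def _slow_extract(data: bytes) -> str:
    # Stand-in for a CPU/IO-heavy synchronous extractor such as _extract_docx.
    time.sleep(1)
    return f"extracted {len(data)} bytes"


async def handle_upload(data: bytes) -> str:
    # Runs the blocking function in the default thread pool, keeping the
    # event loop free to serve other requests in the meantime.
    return await asyncio.to_thread(_slow_extract, data)


async def main() -> None:
    results = await asyncio.gather(*(handle_upload(b"x" * n) for n in (10, 20, 30)))
    print(results)


if __name__ == "__main__":
    asyncio.run(main())
```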
@@ -1072,87 +1201,21 @@ async def pipeline_enqueue_file(

             case ".pdf":
                 try:
-                    if global_args.document_loading_engine == "DOCLING":
-                        if not pm.is_installed("docling"):  # type: ignore
-                            pm.install("docling")
-                        from docling.document_converter import DocumentConverter  # type: ignore
-
-                        converter = DocumentConverter()
-                        result = converter.convert(file_path)
-                        content = result.document.export_to_markdown()
+                    # Try DOCLING first if configured and available
+                    if (
+                        global_args.document_loading_engine == "DOCLING"
+                        and DOCLING_AVAILABLE
+                    ):
+                        content = await asyncio.to_thread(
+                            _convert_with_docling, file_path
+                        )
                     else:
-                        if not pm.is_installed("pypdf"):  # type: ignore
-                            pm.install("pypdf")
-                        if not pm.is_installed("pycryptodome"):  # type: ignore
-                            pm.install("pycryptodome")
-                        from pypdf import PdfReader  # type: ignore
-                        from io import BytesIO
-
-                        pdf_file = BytesIO(file)
-                        reader = PdfReader(pdf_file)
-
-                        # Check if PDF is encrypted
-                        if reader.is_encrypted:
-                            pdf_password = global_args.pdf_decrypt_password
-                            if not pdf_password:
-                                # PDF is encrypted but no password provided
-                                error_files = [
-                                    {
-                                        "file_path": str(file_path.name),
-                                        "error_description": "[File Extraction]PDF is encrypted but no password provided",
-                                        "original_error": "Please set PDF_DECRYPT_PASSWORD environment variable to decrypt this PDF file",
-                                        "file_size": file_size,
-                                    }
-                                ]
-                                await rag.apipeline_enqueue_error_documents(
-                                    error_files, track_id
-                                )
-                                logger.error(
-                                    f"[File Extraction]PDF is encrypted but no password provided: {file_path.name}"
-                                )
-                                return False, track_id
-
-                            # Try to decrypt with password
-                            try:
-                                decrypt_result = reader.decrypt(pdf_password)
-                                if decrypt_result == 0:
-                                    # Password is incorrect
-                                    error_files = [
-                                        {
-                                            "file_path": str(file_path.name),
-                                            "error_description": "[File Extraction]Failed to decrypt PDF - incorrect password",
-                                            "original_error": "The provided PDF_DECRYPT_PASSWORD is incorrect for this file",
-                                            "file_size": file_size,
-                                        }
-                                    ]
-                                    await rag.apipeline_enqueue_error_documents(
-                                        error_files, track_id
-                                    )
-                                    logger.error(
-                                        f"[File Extraction]Incorrect PDF password: {file_path.name}"
-                                    )
-                                    return False, track_id
-                            except Exception as decrypt_error:
-                                # Decryption process error
-                                error_files = [
-                                    {
-                                        "file_path": str(file_path.name),
-                                        "error_description": "[File Extraction]PDF decryption failed",
-                                        "original_error": f"Error during PDF decryption: {str(decrypt_error)}",
-                                        "file_size": file_size,
-                                    }
-                                ]
-                                await rag.apipeline_enqueue_error_documents(
-                                    error_files, track_id
-                                )
-                                logger.error(
-                                    f"[File Extraction]PDF decryption error for {file_path.name}: {str(decrypt_error)}"
-                                )
-                                return False, track_id
-
-                        # Extract text from PDF (encrypted PDFs are now decrypted, unencrypted PDFs proceed directly)
-                        for page in reader.pages:
-                            content += page.extract_text() + "\n"
+                        # Use pypdf (non-blocking via to_thread)
+                        content = await asyncio.to_thread(
+                            _extract_pdf_pypdf,
+                            file,
+                            global_args.pdf_decrypt_password,
+                        )
                 except Exception as e:
                     error_files = [
                         {
@@ -1172,28 +1235,17 @@ async def pipeline_enqueue_file(

             case ".docx":
                 try:
-                    if global_args.document_loading_engine == "DOCLING":
-                        if not pm.is_installed("docling"):  # type: ignore
-                            pm.install("docling")
-                        from docling.document_converter import DocumentConverter  # type: ignore
-
-                        converter = DocumentConverter()
-                        result = converter.convert(file_path)
-                        content = result.document.export_to_markdown()
-                    else:
-                        if not pm.is_installed("python-docx"):  # type: ignore
-                            try:
-                                pm.install("python-docx")
-                            except Exception:
-                                pm.install("docx")
-                        from docx import Document  # type: ignore
-                        from io import BytesIO
-
-                        docx_file = BytesIO(file)
-                        doc = Document(docx_file)
-                        content = "\n".join(
-                            [paragraph.text for paragraph in doc.paragraphs]
+                    # Try DOCLING first if configured and available
+                    if (
+                        global_args.document_loading_engine == "DOCLING"
+                        and DOCLING_AVAILABLE
+                    ):
+                        content = await asyncio.to_thread(
+                            _convert_with_docling, file_path
                         )
+                    else:
+                        # Use python-docx (non-blocking via to_thread)
+                        content = await asyncio.to_thread(_extract_docx, file)
                 except Exception as e:
                     error_files = [
                         {
@@ -1213,26 +1265,17 @@ async def pipeline_enqueue_file(

             case ".pptx":
                 try:
-                    if global_args.document_loading_engine == "DOCLING":
-                        if not pm.is_installed("docling"):  # type: ignore
-                            pm.install("docling")
-                        from docling.document_converter import DocumentConverter  # type: ignore
-
-                        converter = DocumentConverter()
-                        result = converter.convert(file_path)
-                        content = result.document.export_to_markdown()
+                    # Try DOCLING first if configured and available
+                    if (
+                        global_args.document_loading_engine == "DOCLING"
+                        and DOCLING_AVAILABLE
+                    ):
+                        content = await asyncio.to_thread(
+                            _convert_with_docling, file_path
+                        )
                     else:
-                        if not pm.is_installed("python-pptx"):  # type: ignore
-                            pm.install("pptx")
-                        from pptx import Presentation  # type: ignore
-                        from io import BytesIO
-
-                        pptx_file = BytesIO(file)
-                        prs = Presentation(pptx_file)
-                        for slide in prs.slides:
-                            for shape in slide.shapes:
-                                if hasattr(shape, "text"):
-                                    content += shape.text + "\n"
+                        # Use python-pptx (non-blocking via to_thread)
+                        content = await asyncio.to_thread(_extract_pptx, file)
                 except Exception as e:
                     error_files = [
                         {
@@ -1252,33 +1295,17 @@ async def pipeline_enqueue_file(

             case ".xlsx":
                 try:
-                    if global_args.document_loading_engine == "DOCLING":
-                        if not pm.is_installed("docling"):  # type: ignore
-                            pm.install("docling")
-                        from docling.document_converter import DocumentConverter  # type: ignore
-
-                        converter = DocumentConverter()
-                        result = converter.convert(file_path)
-                        content = result.document.export_to_markdown()
+                    # Try DOCLING first if configured and available
+                    if (
+                        global_args.document_loading_engine == "DOCLING"
+                        and DOCLING_AVAILABLE
+                    ):
+                        content = await asyncio.to_thread(
+                            _convert_with_docling, file_path
+                        )
                     else:
-                        if not pm.is_installed("openpyxl"):  # type: ignore
-                            pm.install("openpyxl")
-                        from openpyxl import load_workbook  # type: ignore
-                        from io import BytesIO
-
-                        xlsx_file = BytesIO(file)
-                        wb = load_workbook(xlsx_file)
-                        for sheet in wb:
-                            content += f"Sheet: {sheet.title}\n"
-                            for row in sheet.iter_rows(values_only=True):
-                                content += (
-                                    "\t".join(
-                                        str(cell) if cell is not None else ""
-                                        for cell in row
-                                    )
-                                    + "\n"
-                                )
-                            content += "\n"
+                        # Use openpyxl (non-blocking via to_thread)
+                        content = await asyncio.to_thread(_extract_xlsx, file)
                 except Exception as e:
                     error_files = [
                         {

@@ -79,18 +79,20 @@ api = [
     "python-multipart",
     "pytz",
     "uvicorn",
+    # Document processing dependencies (required for API document upload functionality)
+    "openpyxl>=3.0.0,<4.0.0",  # XLSX processing
+    "pycryptodome>=3.0.0,<4.0.0",  # PDF encryption support
+    "pypdf>=6.1.0",  # PDF processing
+    "python-docx>=0.8.11,<2.0.0",  # DOCX processing
+    "python-pptx>=0.6.21,<2.0.0",  # PPTX processing
 ]

+# Advanced document processing engine (optional)
+docling = [
+    "docling>=2.0.0,<3.0.0",
+]
+
 # Offline deployment dependencies (layered design for flexibility)
-offline-docs = [
-    # Document processing dependencies
-    "openpyxl>=3.0.0,<4.0.0",
-    "pycryptodome>=3.0.0,<4.0.0",
-    "pypdf>=6.1.0",
-    "python-docx>=0.8.11,<2.0.0",
-    "python-pptx>=0.6.21,<2.0.0",
-]
-
 offline-storage = [
     # Storage backend dependencies
     "redis>=5.0.0,<8.0.0",
@@ -115,8 +117,8 @@ offline-llm = [
 ]

 offline = [
-    # Complete offline package (includes all offline dependencies)
-    "lightrag-hku[offline-docs,offline-storage,offline-llm]",
+    # Complete offline package (includes api for document processing, plus storage and LLM)
+    "lightrag-hku[api,offline-storage,offline-llm]",
 ]

 evaluation = [

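Since `offline` is now a meta-extra that pulls in `api`, `offline-storage`, and `offline-llm`, one hedged way to see what an installed wheel actually requires per extra (a sketch, not project tooling) is to read the distribution metadata:

```python
# Hedged illustration: list the extra-gated requirements declared by the
# installed lightrag-hku distribution.
from importlib.metadata import requires

for req in requires("lightrag-hku") or []:
    # Extra-gated requirements carry an environment marker such as: extra == "offline-storage"
    if "extra ==" in req:
        print(req)
```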
@@ -1,15 +0,0 @@
-# LightRAG Offline Dependencies - Document Processing
-# Install with: pip install -r requirements-offline-docs.txt
-# For offline installation:
-# pip download -r requirements-offline-docs.txt -d ./packages
-# pip install --no-index --find-links=./packages -r requirements-offline-docs.txt
-#
-# Recommended: Use pip install lightrag-hku[offline-docs] for the same effect
-# Or use constraints: pip install --constraint constraints-offline.txt -r requirements-offline-docs.txt
-
-# Document processing dependencies (with version constraints matching pyproject.toml)
-openpyxl>=3.0.0,<4.0.0
-pycryptodome>=3.0.0,<4.0.0
-pypdf>=6.1.0
-python-docx>=0.8.11,<2.0.0
-python-pptx>=0.6.21,<2.0.0