refactor: move document deps to api group, remove dynamic imports

- Merge offline-docs into api extras
- Remove pipmaster dynamic installs
- Add async document processing
- Pre-check docling availability
- Update offline deployment docs
This commit is contained in:
yangdx
2025-11-13 13:34:09 +08:00
parent 7d394fb0a4
commit 69a0b74ce7
4 changed files with 205 additions and 191 deletions

View File

@@ -23,10 +23,11 @@ LightRAG uses dynamic package installation (`pipmaster`) for optional features b
LightRAG dynamically installs packages for:
- **Document Processing**: `docling`, `pypdf2`, `python-docx`, `python-pptx`, `openpyxl`
- **Storage Backends**: `redis`, `neo4j`, `pymilvus`, `pymongo`, `asyncpg`, `qdrant-client`
- **LLM Providers**: `openai`, `anthropic`, `ollama`, `zhipuai`, `aioboto3`, `voyageai`, `llama-index`, `lmdeploy`, `transformers`, `torch`
- Tiktoken Models**: BPE encoding models downloaded from OpenAI CDN
- **Tiktoken Models**: BPE encoding models downloaded from OpenAI CDN
**Note**: Document processing dependencies (`pypdf`, `python-docx`, `python-pptx`, `openpyxl`) are now pre-installed with the `api` extras group and no longer require dynamic installation.
## Quick Start
@@ -75,32 +76,31 @@ LightRAG provides flexible dependency groups for different use cases:
| Group | Description | Use Case |
|-------|-------------|----------|
| `offline-docs` | Document processing | PDF, DOCX, PPTX, XLSX files |
| `api` | API server + document processing | FastAPI server with PDF, DOCX, PPTX, XLSX support |
| `offline-storage` | Storage backends | Redis, Neo4j, MongoDB, PostgreSQL, etc. |
| `offline-llm` | LLM providers | OpenAI, Anthropic, Ollama, etc. |
| `offline` | All of the above | Complete offline deployment |
| `offline` | Complete offline package | API + Storage + LLM (all features) |
**Note**: Document processing (PDF, DOCX, PPTX, XLSX) is included in the `api` extras group. The previous `offline-docs` group has been merged into `api` for better integration.
> Software packages requiring `transformers`, `torch`, or `cuda` will not be included in the offline dependency group.
### Installation Examples
```bash
# Install only document processing dependencies
pip install lightrag-hku[offline-docs]
# Install API with document processing
pip install lightrag-hku[api]
# Install document processing and storage backends
pip install lightrag-hku[offline-docs,offline-storage]
# Install API and storage backends
pip install lightrag-hku[api,offline-storage]
# Install all offline dependencies
# Install all offline dependencies (recommended for offline deployment)
pip install lightrag-hku[offline]
```
### Using Individual Requirements Files
```bash
# Document processing only
pip install -r requirements-offline-docs.txt
# Storage backends only
pip install -r requirements-offline-storage.txt
@@ -244,8 +244,8 @@ ls -la ~/.tiktoken_cache/
**Solution**:
```bash
# Pre-install the specific package you need
# For document processing:
pip install lightrag-hku[offline-docs]
# For API with document processing:
pip install lightrag-hku[api]
# For storage backends:
pip install lightrag-hku[offline-storage]
@@ -297,9 +297,9 @@ mkdir -p ~/my_tiktoken_cache
5. **Minimal Installation**: Only install what you need:
```bash
# If you only process PDFs with OpenAI
pip install lightrag-hku[offline-docs]
# Then manually add: pip install openai
# If you only need API with document processing
pip install lightrag-hku[api]
# Then manually add specific LLM: pip install openai
```
## Additional Resources

View File

@@ -7,10 +7,10 @@ from lightrag.utils import logger, get_pinyin_sort_key
import aiofiles
import shutil
import traceback
import pipmaster as pm
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any, Literal
from io import BytesIO
from fastapi import (
APIRouter,
BackgroundTasks,
@@ -27,6 +27,20 @@ from lightrag.utils import generate_track_id
from lightrag.api.utils_api import get_combined_auth_dependency
from ..config import global_args
# Check docling availability at module load time
DOCLING_AVAILABLE = False
try:
import docling # noqa: F401 # type: ignore[import-not-found]
DOCLING_AVAILABLE = True
except ImportError:
if global_args.document_loading_engine == "DOCLING":
logger.warning(
"DOCLING engine requested but 'docling' package not installed. "
"Falling back to standard document processing. "
"To use DOCLING, install with: pip install lightrag-hku[api,docling]"
)
# Function to format datetime to ISO format string with timezone information
def format_datetime(dt: Any) -> Optional[str]:
@@ -879,7 +893,6 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str
Returns:
str: Unique filename (may have numeric suffix added)
"""
from pathlib import Path
import time
original_path = Path(original_name)
@@ -902,6 +915,122 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str
return f"{base_name}_{timestamp}{extension}"
# Document processing helper functions (synchronous)
# These functions run in thread pool via asyncio.to_thread() to avoid blocking the event loop
def _convert_with_docling(file_path: Path) -> str:
"""Convert document using docling (synchronous).
Args:
file_path: Path to the document file
Returns:
str: Extracted markdown content
"""
from docling.document_converter import DocumentConverter # type: ignore
converter = DocumentConverter()
result = converter.convert(file_path)
return result.document.export_to_markdown()
def _extract_pdf_pypdf(file_bytes: bytes, password: str = None) -> str:
"""Extract PDF content using pypdf (synchronous).
Args:
file_bytes: PDF file content as bytes
password: Optional password for encrypted PDFs
Returns:
str: Extracted text content
Raises:
Exception: If PDF is encrypted and password is incorrect or missing
"""
from pypdf import PdfReader # type: ignore
pdf_file = BytesIO(file_bytes)
reader = PdfReader(pdf_file)
# Check if PDF is encrypted
if reader.is_encrypted:
if not password:
raise Exception("PDF is encrypted but no password provided")
decrypt_result = reader.decrypt(password)
if decrypt_result == 0:
raise Exception("Incorrect PDF password")
# Extract text from all pages
content = ""
for page in reader.pages:
content += page.extract_text() + "\n"
return content
def _extract_docx(file_bytes: bytes) -> str:
"""Extract DOCX content (synchronous).
Args:
file_bytes: DOCX file content as bytes
Returns:
str: Extracted text content
"""
from docx import Document # type: ignore
docx_file = BytesIO(file_bytes)
doc = Document(docx_file)
return "\n".join([paragraph.text for paragraph in doc.paragraphs])
def _extract_pptx(file_bytes: bytes) -> str:
"""Extract PPTX content (synchronous).
Args:
file_bytes: PPTX file content as bytes
Returns:
str: Extracted text content
"""
from pptx import Presentation # type: ignore
pptx_file = BytesIO(file_bytes)
prs = Presentation(pptx_file)
content = ""
for slide in prs.slides:
for shape in slide.shapes:
if hasattr(shape, "text"):
content += shape.text + "\n"
return content
def _extract_xlsx(file_bytes: bytes) -> str:
"""Extract XLSX content (synchronous).
Args:
file_bytes: XLSX file content as bytes
Returns:
str: Extracted text content
"""
from openpyxl import load_workbook # type: ignore
xlsx_file = BytesIO(file_bytes)
wb = load_workbook(xlsx_file)
content = ""
for sheet in wb:
content += f"Sheet: {sheet.title}\n"
for row in sheet.iter_rows(values_only=True):
content += (
"\t".join(str(cell) if cell is not None else "" for cell in row) + "\n"
)
content += "\n"
return content
async def pipeline_enqueue_file(
rag: LightRAG, file_path: Path, track_id: str = None
) -> tuple[bool, str]:
@@ -1072,87 +1201,21 @@ async def pipeline_enqueue_file(
case ".pdf":
try:
if global_args.document_loading_engine == "DOCLING":
if not pm.is_installed("docling"): # type: ignore
pm.install("docling")
from docling.document_converter import DocumentConverter # type: ignore
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
# Try DOCLING first if configured and available
if (
global_args.document_loading_engine == "DOCLING"
and DOCLING_AVAILABLE
):
content = await asyncio.to_thread(
_convert_with_docling, file_path
)
else:
if not pm.is_installed("pypdf"): # type: ignore
pm.install("pypdf")
if not pm.is_installed("pycryptodome"): # type: ignore
pm.install("pycryptodome")
from pypdf import PdfReader # type: ignore
from io import BytesIO
pdf_file = BytesIO(file)
reader = PdfReader(pdf_file)
# Check if PDF is encrypted
if reader.is_encrypted:
pdf_password = global_args.pdf_decrypt_password
if not pdf_password:
# PDF is encrypted but no password provided
error_files = [
{
"file_path": str(file_path.name),
"error_description": "[File Extraction]PDF is encrypted but no password provided",
"original_error": "Please set PDF_DECRYPT_PASSWORD environment variable to decrypt this PDF file",
"file_size": file_size,
}
]
await rag.apipeline_enqueue_error_documents(
error_files, track_id
)
logger.error(
f"[File Extraction]PDF is encrypted but no password provided: {file_path.name}"
)
return False, track_id
# Try to decrypt with password
try:
decrypt_result = reader.decrypt(pdf_password)
if decrypt_result == 0:
# Password is incorrect
error_files = [
{
"file_path": str(file_path.name),
"error_description": "[File Extraction]Failed to decrypt PDF - incorrect password",
"original_error": "The provided PDF_DECRYPT_PASSWORD is incorrect for this file",
"file_size": file_size,
}
]
await rag.apipeline_enqueue_error_documents(
error_files, track_id
)
logger.error(
f"[File Extraction]Incorrect PDF password: {file_path.name}"
)
return False, track_id
except Exception as decrypt_error:
# Decryption process error
error_files = [
{
"file_path": str(file_path.name),
"error_description": "[File Extraction]PDF decryption failed",
"original_error": f"Error during PDF decryption: {str(decrypt_error)}",
"file_size": file_size,
}
]
await rag.apipeline_enqueue_error_documents(
error_files, track_id
)
logger.error(
f"[File Extraction]PDF decryption error for {file_path.name}: {str(decrypt_error)}"
)
return False, track_id
# Extract text from PDF (encrypted PDFs are now decrypted, unencrypted PDFs proceed directly)
for page in reader.pages:
content += page.extract_text() + "\n"
# Use pypdf (non-blocking via to_thread)
content = await asyncio.to_thread(
_extract_pdf_pypdf,
file,
global_args.pdf_decrypt_password,
)
except Exception as e:
error_files = [
{
@@ -1172,28 +1235,17 @@ async def pipeline_enqueue_file(
case ".docx":
try:
if global_args.document_loading_engine == "DOCLING":
if not pm.is_installed("docling"): # type: ignore
pm.install("docling")
from docling.document_converter import DocumentConverter # type: ignore
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
else:
if not pm.is_installed("python-docx"): # type: ignore
try:
pm.install("python-docx")
except Exception:
pm.install("docx")
from docx import Document # type: ignore
from io import BytesIO
docx_file = BytesIO(file)
doc = Document(docx_file)
content = "\n".join(
[paragraph.text for paragraph in doc.paragraphs]
# Try DOCLING first if configured and available
if (
global_args.document_loading_engine == "DOCLING"
and DOCLING_AVAILABLE
):
content = await asyncio.to_thread(
_convert_with_docling, file_path
)
else:
# Use python-docx (non-blocking via to_thread)
content = await asyncio.to_thread(_extract_docx, file)
except Exception as e:
error_files = [
{
@@ -1213,26 +1265,17 @@ async def pipeline_enqueue_file(
case ".pptx":
try:
if global_args.document_loading_engine == "DOCLING":
if not pm.is_installed("docling"): # type: ignore
pm.install("docling")
from docling.document_converter import DocumentConverter # type: ignore
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
# Try DOCLING first if configured and available
if (
global_args.document_loading_engine == "DOCLING"
and DOCLING_AVAILABLE
):
content = await asyncio.to_thread(
_convert_with_docling, file_path
)
else:
if not pm.is_installed("python-pptx"): # type: ignore
pm.install("pptx")
from pptx import Presentation # type: ignore
from io import BytesIO
pptx_file = BytesIO(file)
prs = Presentation(pptx_file)
for slide in prs.slides:
for shape in slide.shapes:
if hasattr(shape, "text"):
content += shape.text + "\n"
# Use python-pptx (non-blocking via to_thread)
content = await asyncio.to_thread(_extract_pptx, file)
except Exception as e:
error_files = [
{
@@ -1252,33 +1295,17 @@ async def pipeline_enqueue_file(
case ".xlsx":
try:
if global_args.document_loading_engine == "DOCLING":
if not pm.is_installed("docling"): # type: ignore
pm.install("docling")
from docling.document_converter import DocumentConverter # type: ignore
converter = DocumentConverter()
result = converter.convert(file_path)
content = result.document.export_to_markdown()
# Try DOCLING first if configured and available
if (
global_args.document_loading_engine == "DOCLING"
and DOCLING_AVAILABLE
):
content = await asyncio.to_thread(
_convert_with_docling, file_path
)
else:
if not pm.is_installed("openpyxl"): # type: ignore
pm.install("openpyxl")
from openpyxl import load_workbook # type: ignore
from io import BytesIO
xlsx_file = BytesIO(file)
wb = load_workbook(xlsx_file)
for sheet in wb:
content += f"Sheet: {sheet.title}\n"
for row in sheet.iter_rows(values_only=True):
content += (
"\t".join(
str(cell) if cell is not None else ""
for cell in row
)
+ "\n"
)
content += "\n"
# Use openpyxl (non-blocking via to_thread)
content = await asyncio.to_thread(_extract_xlsx, file)
except Exception as e:
error_files = [
{

View File

@@ -79,18 +79,20 @@ api = [
"python-multipart",
"pytz",
"uvicorn",
# Document processing dependencies (required for API document upload functionality)
"openpyxl>=3.0.0,<4.0.0", # XLSX processing
"pycryptodome>=3.0.0,<4.0.0", # PDF encryption support
"pypdf>=6.1.0", # PDF processing
"python-docx>=0.8.11,<2.0.0", # DOCX processing
"python-pptx>=0.6.21,<2.0.0", # PPTX processing
]
# Advanced document processing engine (optional)
docling = [
"docling>=2.0.0,<3.0.0",
]
# Offline deployment dependencies (layered design for flexibility)
offline-docs = [
# Document processing dependencies
"openpyxl>=3.0.0,<4.0.0",
"pycryptodome>=3.0.0,<4.0.0",
"pypdf>=6.1.0",
"python-docx>=0.8.11,<2.0.0",
"python-pptx>=0.6.21,<2.0.0",
]
offline-storage = [
# Storage backend dependencies
"redis>=5.0.0,<8.0.0",
@@ -115,8 +117,8 @@ offline-llm = [
]
offline = [
# Complete offline package (includes all offline dependencies)
"lightrag-hku[offline-docs,offline-storage,offline-llm]",
# Complete offline package (includes api for document processing, plus storage and LLM)
"lightrag-hku[api,offline-storage,offline-llm]",
]
evaluation = [

View File

@@ -1,15 +0,0 @@
# LightRAG Offline Dependencies - Document Processing
# Install with: pip install -r requirements-offline-docs.txt
# For offline installation:
# pip download -r requirements-offline-docs.txt -d ./packages
# pip install --no-index --find-links=./packages -r requirements-offline-docs.txt
#
# Recommended: Use pip install lightrag-hku[offline-docs] for the same effect
# Or use constraints: pip install --constraint constraints-offline.txt -r requirements-offline-docs.txt
# Document processing dependencies (with version constraints matching pyproject.toml)
openpyxl>=3.0.0,<4.0.0
pycryptodome>=3.0.0,<4.0.0
pypdf>=6.1.0
python-docx>=0.8.11,<2.0.0
python-pptx>=0.6.21,<2.0.0