From 69a0b74ce73e1438dc1ccb6fcb31c386b2d0c3be Mon Sep 17 00:00:00 2001 From: yangdx Date: Thu, 13 Nov 2025 13:34:09 +0800 Subject: [PATCH] refactor: move document deps to api group, remove dynamic imports - Merge offline-docs into api extras - Remove pipmaster dynamic installs - Add async document processing - Pre-check docling availability - Update offline deployment docs --- docs/OfflineDeployment.md | 34 +-- lightrag/api/routers/document_routes.py | 323 +++++++++++++----------- pyproject.toml | 24 +- requirements-offline-docs.txt | 15 -- 4 files changed, 205 insertions(+), 191 deletions(-) delete mode 100644 requirements-offline-docs.txt diff --git a/docs/OfflineDeployment.md b/docs/OfflineDeployment.md index 5307da6f..e186dda0 100644 --- a/docs/OfflineDeployment.md +++ b/docs/OfflineDeployment.md @@ -23,10 +23,11 @@ LightRAG uses dynamic package installation (`pipmaster`) for optional features b LightRAG dynamically installs packages for: -- **Document Processing**: `docling`, `pypdf2`, `python-docx`, `python-pptx`, `openpyxl` - **Storage Backends**: `redis`, `neo4j`, `pymilvus`, `pymongo`, `asyncpg`, `qdrant-client` - **LLM Providers**: `openai`, `anthropic`, `ollama`, `zhipuai`, `aioboto3`, `voyageai`, `llama-index`, `lmdeploy`, `transformers`, `torch` -- Tiktoken Models**: BPE encoding models downloaded from OpenAI CDN +- **Tiktoken Models**: BPE encoding models downloaded from OpenAI CDN + +**Note**: Document processing dependencies (`pypdf`, `python-docx`, `python-pptx`, `openpyxl`) are now pre-installed with the `api` extras group and no longer require dynamic installation. ## Quick Start @@ -75,32 +76,31 @@ LightRAG provides flexible dependency groups for different use cases: | Group | Description | Use Case | |-------|-------------|----------| -| `offline-docs` | Document processing | PDF, DOCX, PPTX, XLSX files | +| `api` | API server + document processing | FastAPI server with PDF, DOCX, PPTX, XLSX support | | `offline-storage` | Storage backends | Redis, Neo4j, MongoDB, PostgreSQL, etc. | | `offline-llm` | LLM providers | OpenAI, Anthropic, Ollama, etc. | -| `offline` | All of the above | Complete offline deployment | +| `offline` | Complete offline package | API + Storage + LLM (all features) | + +**Note**: Document processing (PDF, DOCX, PPTX, XLSX) is included in the `api` extras group. The previous `offline-docs` group has been merged into `api` for better integration. > Software packages requiring `transformers`, `torch`, or `cuda` will not be included in the offline dependency group. ### Installation Examples ```bash -# Install only document processing dependencies -pip install lightrag-hku[offline-docs] +# Install API with document processing +pip install lightrag-hku[api] -# Install document processing and storage backends -pip install lightrag-hku[offline-docs,offline-storage] +# Install API and storage backends +pip install lightrag-hku[api,offline-storage] -# Install all offline dependencies +# Install all offline dependencies (recommended for offline deployment) pip install lightrag-hku[offline] ``` ### Using Individual Requirements Files ```bash -# Document processing only -pip install -r requirements-offline-docs.txt - # Storage backends only pip install -r requirements-offline-storage.txt @@ -244,8 +244,8 @@ ls -la ~/.tiktoken_cache/ **Solution**: ```bash # Pre-install the specific package you need -# For document processing: -pip install lightrag-hku[offline-docs] +# For API with document processing: +pip install lightrag-hku[api] # For storage backends: pip install lightrag-hku[offline-storage] @@ -297,9 +297,9 @@ mkdir -p ~/my_tiktoken_cache 5. **Minimal Installation**: Only install what you need: ```bash - # If you only process PDFs with OpenAI - pip install lightrag-hku[offline-docs] - # Then manually add: pip install openai + # If you only need API with document processing + pip install lightrag-hku[api] + # Then manually add specific LLM: pip install openai ``` ## Additional Resources diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 3ba4e733..528e5aed 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -7,10 +7,10 @@ from lightrag.utils import logger, get_pinyin_sort_key import aiofiles import shutil import traceback -import pipmaster as pm from datetime import datetime, timezone from pathlib import Path from typing import Dict, List, Optional, Any, Literal +from io import BytesIO from fastapi import ( APIRouter, BackgroundTasks, @@ -27,6 +27,20 @@ from lightrag.utils import generate_track_id from lightrag.api.utils_api import get_combined_auth_dependency from ..config import global_args +# Check docling availability at module load time +DOCLING_AVAILABLE = False +try: + import docling # noqa: F401 # type: ignore[import-not-found] + + DOCLING_AVAILABLE = True +except ImportError: + if global_args.document_loading_engine == "DOCLING": + logger.warning( + "DOCLING engine requested but 'docling' package not installed. " + "Falling back to standard document processing. " + "To use DOCLING, install with: pip install lightrag-hku[api,docling]" + ) + # Function to format datetime to ISO format string with timezone information def format_datetime(dt: Any) -> Optional[str]: @@ -879,7 +893,6 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str Returns: str: Unique filename (may have numeric suffix added) """ - from pathlib import Path import time original_path = Path(original_name) @@ -902,6 +915,122 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str return f"{base_name}_{timestamp}{extension}" +# Document processing helper functions (synchronous) +# These functions run in thread pool via asyncio.to_thread() to avoid blocking the event loop + + +def _convert_with_docling(file_path: Path) -> str: + """Convert document using docling (synchronous). + + Args: + file_path: Path to the document file + + Returns: + str: Extracted markdown content + """ + from docling.document_converter import DocumentConverter # type: ignore + + converter = DocumentConverter() + result = converter.convert(file_path) + return result.document.export_to_markdown() + + +def _extract_pdf_pypdf(file_bytes: bytes, password: str = None) -> str: + """Extract PDF content using pypdf (synchronous). + + Args: + file_bytes: PDF file content as bytes + password: Optional password for encrypted PDFs + + Returns: + str: Extracted text content + + Raises: + Exception: If PDF is encrypted and password is incorrect or missing + """ + from pypdf import PdfReader # type: ignore + + pdf_file = BytesIO(file_bytes) + reader = PdfReader(pdf_file) + + # Check if PDF is encrypted + if reader.is_encrypted: + if not password: + raise Exception("PDF is encrypted but no password provided") + + decrypt_result = reader.decrypt(password) + if decrypt_result == 0: + raise Exception("Incorrect PDF password") + + # Extract text from all pages + content = "" + for page in reader.pages: + content += page.extract_text() + "\n" + + return content + + +def _extract_docx(file_bytes: bytes) -> str: + """Extract DOCX content (synchronous). + + Args: + file_bytes: DOCX file content as bytes + + Returns: + str: Extracted text content + """ + from docx import Document # type: ignore + + docx_file = BytesIO(file_bytes) + doc = Document(docx_file) + return "\n".join([paragraph.text for paragraph in doc.paragraphs]) + + +def _extract_pptx(file_bytes: bytes) -> str: + """Extract PPTX content (synchronous). + + Args: + file_bytes: PPTX file content as bytes + + Returns: + str: Extracted text content + """ + from pptx import Presentation # type: ignore + + pptx_file = BytesIO(file_bytes) + prs = Presentation(pptx_file) + content = "" + for slide in prs.slides: + for shape in slide.shapes: + if hasattr(shape, "text"): + content += shape.text + "\n" + return content + + +def _extract_xlsx(file_bytes: bytes) -> str: + """Extract XLSX content (synchronous). + + Args: + file_bytes: XLSX file content as bytes + + Returns: + str: Extracted text content + """ + from openpyxl import load_workbook # type: ignore + + xlsx_file = BytesIO(file_bytes) + wb = load_workbook(xlsx_file) + content = "" + for sheet in wb: + content += f"Sheet: {sheet.title}\n" + for row in sheet.iter_rows(values_only=True): + content += ( + "\t".join(str(cell) if cell is not None else "" for cell in row) + "\n" + ) + content += "\n" + return content + + async def pipeline_enqueue_file( rag: LightRAG, file_path: Path, track_id: str = None ) -> tuple[bool, str]: @@ -1072,87 +1201,21 @@ async def pipeline_enqueue_file( case ".pdf": try: - if global_args.document_loading_engine == "DOCLING": - if not pm.is_installed("docling"): # type: ignore - pm.install("docling") - from docling.document_converter import DocumentConverter # type: ignore - - converter = DocumentConverter() - result = converter.convert(file_path) - content = result.document.export_to_markdown() + # Try DOCLING first if configured and available + if ( + global_args.document_loading_engine == "DOCLING" + and DOCLING_AVAILABLE + ): + content = await asyncio.to_thread( + _convert_with_docling, file_path + ) else: - if not pm.is_installed("pypdf"): # type: ignore - pm.install("pypdf") - if not pm.is_installed("pycryptodome"): # type: ignore - pm.install("pycryptodome") - from pypdf import PdfReader # type: ignore - from io import BytesIO - - pdf_file = BytesIO(file) - reader = PdfReader(pdf_file) - - # Check if PDF is encrypted - if reader.is_encrypted: - pdf_password = global_args.pdf_decrypt_password - if not pdf_password: - # PDF is encrypted but no password provided - error_files = [ - { - "file_path": str(file_path.name), - "error_description": "[File Extraction]PDF is encrypted but no password provided", - "original_error": "Please set PDF_DECRYPT_PASSWORD environment variable to decrypt this PDF file", - "file_size": file_size, - } - ] - await rag.apipeline_enqueue_error_documents( - error_files, track_id - ) - logger.error( - f"[File Extraction]PDF is encrypted but no password provided: {file_path.name}" - ) - return False, track_id - - # Try to decrypt with password - try: - decrypt_result = reader.decrypt(pdf_password) - if decrypt_result == 0: - # Password is incorrect - error_files = [ - { - "file_path": str(file_path.name), - "error_description": "[File Extraction]Failed to decrypt PDF - incorrect password", - "original_error": "The provided PDF_DECRYPT_PASSWORD is incorrect for this file", - "file_size": file_size, - } - ] - await rag.apipeline_enqueue_error_documents( - error_files, track_id - ) - logger.error( - f"[File Extraction]Incorrect PDF password: {file_path.name}" - ) - return False, track_id - except Exception as decrypt_error: - # Decryption process error - error_files = [ - { - "file_path": str(file_path.name), - "error_description": "[File Extraction]PDF decryption failed", - "original_error": f"Error during PDF decryption: {str(decrypt_error)}", - "file_size": file_size, - } - ] - await rag.apipeline_enqueue_error_documents( - error_files, track_id - ) - logger.error( - f"[File Extraction]PDF decryption error for {file_path.name}: {str(decrypt_error)}" - ) - return False, track_id - - # Extract text from PDF (encrypted PDFs are now decrypted, unencrypted PDFs proceed directly) - for page in reader.pages: - content += page.extract_text() + "\n" + # Use pypdf (non-blocking via to_thread) + content = await asyncio.to_thread( + _extract_pdf_pypdf, + file, + global_args.pdf_decrypt_password, + ) except Exception as e: error_files = [ { @@ -1172,28 +1235,17 @@ async def pipeline_enqueue_file( case ".docx": try: - if global_args.document_loading_engine == "DOCLING": - if not pm.is_installed("docling"): # type: ignore - pm.install("docling") - from docling.document_converter import DocumentConverter # type: ignore - - converter = DocumentConverter() - result = converter.convert(file_path) - content = result.document.export_to_markdown() - else: - if not pm.is_installed("python-docx"): # type: ignore - try: - pm.install("python-docx") - except Exception: - pm.install("docx") - from docx import Document # type: ignore - from io import BytesIO - - docx_file = BytesIO(file) - doc = Document(docx_file) - content = "\n".join( - [paragraph.text for paragraph in doc.paragraphs] + # Try DOCLING first if configured and available + if ( + global_args.document_loading_engine == "DOCLING" + and DOCLING_AVAILABLE + ): + content = await asyncio.to_thread( + _convert_with_docling, file_path ) + else: + # Use python-docx (non-blocking via to_thread) + content = await asyncio.to_thread(_extract_docx, file) except Exception as e: error_files = [ { @@ -1213,26 +1265,17 @@ async def pipeline_enqueue_file( case ".pptx": try: - if global_args.document_loading_engine == "DOCLING": - if not pm.is_installed("docling"): # type: ignore - pm.install("docling") - from docling.document_converter import DocumentConverter # type: ignore - - converter = DocumentConverter() - result = converter.convert(file_path) - content = result.document.export_to_markdown() + # Try DOCLING first if configured and available + if ( + global_args.document_loading_engine == "DOCLING" + and DOCLING_AVAILABLE + ): + content = await asyncio.to_thread( + _convert_with_docling, file_path + ) else: - if not pm.is_installed("python-pptx"): # type: ignore - pm.install("pptx") - from pptx import Presentation # type: ignore - from io import BytesIO - - pptx_file = BytesIO(file) - prs = Presentation(pptx_file) - for slide in prs.slides: - for shape in slide.shapes: - if hasattr(shape, "text"): - content += shape.text + "\n" + # Use python-pptx (non-blocking via to_thread) + content = await asyncio.to_thread(_extract_pptx, file) except Exception as e: error_files = [ { @@ -1252,33 +1295,17 @@ async def pipeline_enqueue_file( case ".xlsx": try: - if global_args.document_loading_engine == "DOCLING": - if not pm.is_installed("docling"): # type: ignore - pm.install("docling") - from docling.document_converter import DocumentConverter # type: ignore - - converter = DocumentConverter() - result = converter.convert(file_path) - content = result.document.export_to_markdown() + # Try DOCLING first if configured and available + if ( + global_args.document_loading_engine == "DOCLING" + and DOCLING_AVAILABLE + ): + content = await asyncio.to_thread( + _convert_with_docling, file_path + ) else: - if not pm.is_installed("openpyxl"): # type: ignore - pm.install("openpyxl") - from openpyxl import load_workbook # type: ignore - from io import BytesIO - - xlsx_file = BytesIO(file) - wb = load_workbook(xlsx_file) - for sheet in wb: - content += f"Sheet: {sheet.title}\n" - for row in sheet.iter_rows(values_only=True): - content += ( - "\t".join( - str(cell) if cell is not None else "" - for cell in row - ) - + "\n" - ) - content += "\n" + # Use openpyxl (non-blocking via to_thread) + content = await asyncio.to_thread(_extract_xlsx, file) except Exception as e: error_files = [ { diff --git a/pyproject.toml b/pyproject.toml index 81e44aff..53378de0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,18 +79,20 @@ api = [ "python-multipart", "pytz", "uvicorn", + # Document processing dependencies (required for API document upload functionality) + "openpyxl>=3.0.0,<4.0.0", # XLSX processing + "pycryptodome>=3.0.0,<4.0.0", # PDF encryption support + "pypdf>=6.1.0", # PDF processing + "python-docx>=0.8.11,<2.0.0", # DOCX processing + "python-pptx>=0.6.21,<2.0.0", # PPTX processing +] + +# Advanced document processing engine (optional) +docling = [ + "docling>=2.0.0,<3.0.0", ] # Offline deployment dependencies (layered design for flexibility) -offline-docs = [ - # Document processing dependencies - "openpyxl>=3.0.0,<4.0.0", - "pycryptodome>=3.0.0,<4.0.0", - "pypdf>=6.1.0", - "python-docx>=0.8.11,<2.0.0", - "python-pptx>=0.6.21,<2.0.0", -] - offline-storage = [ # Storage backend dependencies "redis>=5.0.0,<8.0.0", @@ -115,8 +117,8 @@ offline-llm = [ ] offline = [ - # Complete offline package (includes all offline dependencies) - "lightrag-hku[offline-docs,offline-storage,offline-llm]", + # Complete offline package (includes api for document processing, plus storage and LLM) + "lightrag-hku[api,offline-storage,offline-llm]", ] evaluation = [ diff --git a/requirements-offline-docs.txt b/requirements-offline-docs.txt deleted file mode 100644 index 12f02080..00000000 --- a/requirements-offline-docs.txt +++ /dev/null @@ -1,15 +0,0 @@ -# LightRAG Offline Dependencies - Document Processing -# Install with: pip install -r requirements-offline-docs.txt -# For offline installation: -# pip download -r requirements-offline-docs.txt -d ./packages -# pip install --no-index --find-links=./packages -r requirements-offline-docs.txt -# -# Recommended: Use pip install lightrag-hku[offline-docs] for the same effect -# Or use constraints: pip install --constraint constraints-offline.txt -r requirements-offline-docs.txt - -# Document processing dependencies (with version constraints matching pyproject.toml) -openpyxl>=3.0.0,<4.0.0 -pycryptodome>=3.0.0,<4.0.0 -pypdf>=6.1.0 -python-docx>=0.8.11,<2.0.0 -python-pptx>=0.6.21,<2.0.0