refactor: move document deps to api group, remove dynamic imports

- Merge offline-docs into api extras
- Remove pipmaster dynamic installs
- Add async document processing
- Pre-check docling availability
- Update offline deployment docs
2025-11-13 13:34:09 +08:00
parent 7d394fb0a4
commit 69a0b74ce7
4 changed files with 205 additions and 191 deletions
--- a/docs/OfflineDeployment.md
+++ b/docs/OfflineDeployment.md
@@ -23,10 +23,11 @@ LightRAG uses dynamic package installation (`pipmaster`) for optional features b

 LightRAG dynamically installs packages for:

-- **Document Processing**: `docling`, `pypdf2`, `python-docx`, `python-pptx`, `openpyxl`
 - **Storage Backends**: `redis`, `neo4j`, `pymilvus`, `pymongo`, `asyncpg`, `qdrant-client`
 - **LLM Providers**: `openai`, `anthropic`, `ollama`, `zhipuai`, `aioboto3`, `voyageai`, `llama-index`, `lmdeploy`, `transformers`, `torch`
-- **Tiktoken Models**: BPE encoding models downloaded from OpenAI CDN
+- **Tiktoken Models**: BPE encoding models downloaded from OpenAI CDN
+
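+**Note**: Document processing dependencies (`pypdf`, `pycryptodome`, `python-docx`, `python-pptx`, `openpyxl`) are now installed as part of the `api` extras group and no longer require dynamic installation. The optional `docling` engine is not part of `api`; install it with `pip install lightrag-hku[api,docling]`.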

 ## Quick Start

@@ -75,32 +76,31 @@ LightRAG provides flexible dependency groups for different use cases:

 | Group | Description | Use Case |
 |-------|-------------|----------|
-| `offline-docs` | Document processing | PDF, DOCX, PPTX, XLSX files |
+| `api` | API server + document processing | FastAPI server with PDF, DOCX, PPTX, XLSX support |
 | `offline-storage` | Storage backends | Redis, Neo4j, MongoDB, PostgreSQL, etc. |
 | `offline-llm` | LLM providers | OpenAI, Anthropic, Ollama, etc. |
-| `offline` | All of the above | Complete offline deployment |
+| `offline` | Complete offline package | API + Storage + LLM (all features) |
+
+**Note**: Document processing (PDF, DOCX, PPTX, XLSX) is included in the `api` extras group. The previous `offline-docs` group has been merged into `api` for better integration.

 > Software packages requiring `transformers`, `torch`, or `cuda` will not be included in the offline dependency group.

 ### Installation Examples

 ```bash
-# Install only document processing dependencies
-pip install lightrag-hku[offline-docs]
+# Install API with document processing
+pip install lightrag-hku[api]

-# Install document processing and storage backends
-pip install lightrag-hku[offline-docs,offline-storage]
+# Install API and storage backends
+pip install lightrag-hku[api,offline-storage]

-# Install all offline dependencies
+# Install all offline dependencies (recommended for offline deployment)
 pip install lightrag-hku[offline]
 ```

 ### Using Individual Requirements Files

 ```bash
-# Document processing only
-pip install -r requirements-offline-docs.txt
-
 # Storage backends only
 pip install -r requirements-offline-storage.txt

@@ -244,8 +244,8 @@ ls -la ~/.tiktoken_cache/
 **Solution**:
 ```bash
 # Pre-install the specific package you need
-# For document processing:
-pip install lightrag-hku[offline-docs]
+# For API with document processing:
+pip install lightrag-hku[api]

 # For storage backends:
 pip install lightrag-hku[offline-storage]
@@ -297,9 +297,9 @@ mkdir -p ~/my_tiktoken_cache

 5. **Minimal Installation**: Only install what you need:
   ```bash
-   # If you only process PDFs with OpenAI
-   pip install lightrag-hku[offline-docs]
-   # Then manually add: pip install openai
+   # If you only need API with document processing
+   pip install lightrag-hku[api]
+   # Then manually add specific LLM: pip install openai
   ```

 ## Additional Resources
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -7,10 +7,10 @@ from lightrag.utils import logger, get_pinyin_sort_key
 import aiofiles
 import shutil
 import traceback
-import pipmaster as pm
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Dict, List, Optional, Any, Literal
+from io import BytesIO
 from fastapi import (
    APIRouter,
    BackgroundTasks,
@@ -27,6 +27,20 @@ from lightrag.utils import generate_track_id
 from lightrag.api.utils_api import get_combined_auth_dependency
 from ..config import global_args

+# Probe once, at import time, whether the optional 'docling' package can be imported
+DOCLING_AVAILABLE = False  # set to True only if the import below succeeds
+try:
+    import docling  # noqa: F401  # type: ignore[import-not-found]
+
+    DOCLING_AVAILABLE = True
+except ImportError:
+    if global_args.document_loading_engine == "DOCLING":  # warn only if DOCLING was configured
+        logger.warning(
+            "DOCLING engine requested but 'docling' package not installed. "
+            "Falling back to standard document processing. "
+            "To use DOCLING, install with: pip install lightrag-hku[api,docling]"
+        )
+

 # Function to format datetime to ISO format string with timezone information
 def format_datetime(dt: Any) -> Optional[str]:
@@ -879,7 +893,6 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str
    Returns:
        str: Unique filename (may have numeric suffix added)
    """
-    from pathlib import Path
    import time

    original_path = Path(original_name)
@@ -902,6 +915,122 @@ def get_unique_filename_in_enqueued(target_dir: Path, original_name: str) -> str
    return f"{base_name}_{timestamp}{extension}"


+# Document processing helper functions (synchronous)
+# They are invoked through asyncio.to_thread() so that parsing does not block the event loop
+
+
+def _convert_with_docling(file_path: Path) -> str:
+    """Convert a document on disk to markdown with docling (synchronous, blocking).
+
+    Args:
+        file_path: Path to the document file; passed to DocumentConverter.convert()
+
+    Returns:
+        str: The converted document exported as markdown
+    """
+    from docling.document_converter import DocumentConverter  # type: ignore  # lazy: docling is optional
+
+    converter = DocumentConverter()  # a new converter is created on every call
+    result = converter.convert(file_path)
+    return result.document.export_to_markdown()
+
+
+def _extract_pdf_pypdf(file_bytes: bytes, password: str = None) -> str:
+    """Extract the text of every page of an in-memory PDF using pypdf (synchronous).
+
+    Args:
+        file_bytes: PDF file content as bytes
+        password: Password for encrypted PDFs; None or "" is treated as "no password"
+
+    Returns:
+        str: Text of all pages in order, each page followed by a newline
+
+    Raises:
+        Exception: If the PDF is encrypted and no password was given, or decrypt() returns 0
+    """
+    from pypdf import PdfReader  # type: ignore
+
+    pdf_file = BytesIO(file_bytes)
+    reader = PdfReader(pdf_file)
+
+    # Encrypted PDFs must be unlocked before any page text is read
+    if reader.is_encrypted:
+        if not password:
+            raise Exception("PDF is encrypted but no password provided")
+
+        decrypt_result = reader.decrypt(password)
+        if decrypt_result == 0:  # a result of 0 is treated as "password rejected"
+            raise Exception("Incorrect PDF password")
+
+    # Concatenate the text of all pages, one trailing newline per page
+    content = ""
+    for page in reader.pages:
+        content += page.extract_text() + "\n"
+
+    return content
+
+
+def _extract_docx(file_bytes: bytes) -> str:
+    """Extract the paragraph text of an in-memory DOCX file (synchronous).
+
+    Args:
+        file_bytes: DOCX file content as bytes
+
+    Returns:
+        str: Text of every entry in doc.paragraphs, joined with newlines
+    """
+    from docx import Document  # type: ignore
+
+    docx_file = BytesIO(file_bytes)
+    doc = Document(docx_file)
+    return "\n".join([paragraph.text for paragraph in doc.paragraphs])  # NOTE(review): only doc.paragraphs is read; confirm tables/headers are not needed
+
+
+def _extract_pptx(file_bytes: bytes) -> str:
+    """Extract the shape text of an in-memory PPTX file (synchronous).
+
+    Args:
+        file_bytes: PPTX file content as bytes
+
+    Returns:
+        str: Text of each shape that has a `text` attribute, one shape per line, in slide order
+    """
+    from pptx import Presentation  # type: ignore
+
+    pptx_file = BytesIO(file_bytes)
+    prs = Presentation(pptx_file)
+    content = ""
+    for slide in prs.slides:
+        for shape in slide.shapes:
+            if hasattr(shape, "text"):  # shapes without a `text` attribute are skipped
+                content += shape.text + "\n"
+    return content
+
+
+def _extract_xlsx(file_bytes: bytes) -> str:
+    """Render every sheet of an in-memory XLSX workbook as tab-separated text (synchronous).
+
+    Args:
+        file_bytes: XLSX file content as bytes
+
+    Returns:
+        str: Per sheet, a "Sheet: <title>" line, one tab-separated line per row, then a blank line
+    """
+    from openpyxl import load_workbook  # type: ignore
+
+    xlsx_file = BytesIO(file_bytes)
+    wb = load_workbook(xlsx_file)
+    content = ""
+    for sheet in wb:
+        content += f"Sheet: {sheet.title}\n"
+        for row in sheet.iter_rows(values_only=True):  # values_only: cells arrive as plain values
+            content += (
+                "\t".join(str(cell) if cell is not None else "" for cell in row) + "\n"
+            )
+        content += "\n"  # blank line separates consecutive sheets
+    return content
+
+
 async def pipeline_enqueue_file(
    rag: LightRAG, file_path: Path, track_id: str = None
 ) -> tuple[bool, str]:
@@ -1072,87 +1201,21 @@ async def pipeline_enqueue_file(

                case ".pdf":
                    try:
-                        if global_args.document_loading_engine == "DOCLING":
-                            if not pm.is_installed("docling"):  # type: ignore
-                                pm.install("docling")
-                            from docling.document_converter import DocumentConverter  # type: ignore
-
-                            converter = DocumentConverter()
-                            result = converter.convert(file_path)
-                            content = result.document.export_to_markdown()
+                        # Try DOCLING first if configured and available
+                        if (
+                            global_args.document_loading_engine == "DOCLING"
+                            and DOCLING_AVAILABLE
+                        ):
+                            content = await asyncio.to_thread(
+                                _convert_with_docling, file_path
+                            )
                        else:
-                            if not pm.is_installed("pypdf"):  # type: ignore
-                                pm.install("pypdf")
-                            if not pm.is_installed("pycryptodome"):  # type: ignore
-                                pm.install("pycryptodome")
-                            from pypdf import PdfReader  # type: ignore
-                            from io import BytesIO
-
-                            pdf_file = BytesIO(file)
-                            reader = PdfReader(pdf_file)
-
-                            # Check if PDF is encrypted
-                            if reader.is_encrypted:
-                                pdf_password = global_args.pdf_decrypt_password
-                                if not pdf_password:
-                                    # PDF is encrypted but no password provided
-                                    error_files = [
-                                        {
-                                            "file_path": str(file_path.name),
-                                            "error_description": "[File Extraction]PDF is encrypted but no password provided",
-                                            "original_error": "Please set PDF_DECRYPT_PASSWORD environment variable to decrypt this PDF file",
-                                            "file_size": file_size,
-                                        }
-                                    ]
-                                    await rag.apipeline_enqueue_error_documents(
-                                        error_files, track_id
-                                    )
-                                    logger.error(
-                                        f"[File Extraction]PDF is encrypted but no password provided: {file_path.name}"
-                                    )
-                                    return False, track_id
-
-                                # Try to decrypt with password
-                                try:
-                                    decrypt_result = reader.decrypt(pdf_password)
-                                    if decrypt_result == 0:
-                                        # Password is incorrect
-                                        error_files = [
-                                            {
-                                                "file_path": str(file_path.name),
-                                                "error_description": "[File Extraction]Failed to decrypt PDF - incorrect password",
-                                                "original_error": "The provided PDF_DECRYPT_PASSWORD is incorrect for this file",
-                                                "file_size": file_size,
-                                            }
-                                        ]
-                                        await rag.apipeline_enqueue_error_documents(
-                                            error_files, track_id
-                                        )
-                                        logger.error(
-                                            f"[File Extraction]Incorrect PDF password: {file_path.name}"
-                                        )
-                                        return False, track_id
-                                except Exception as decrypt_error:
-                                    # Decryption process error
-                                    error_files = [
-                                        {
-                                            "file_path": str(file_path.name),
-                                            "error_description": "[File Extraction]PDF decryption failed",
-                                            "original_error": f"Error during PDF decryption: {str(decrypt_error)}",
-                                            "file_size": file_size,
-                                        }
-                                    ]
-                                    await rag.apipeline_enqueue_error_documents(
-                                        error_files, track_id
-                                    )
-                                    logger.error(
-                                        f"[File Extraction]PDF decryption error for {file_path.name}: {str(decrypt_error)}"
-                                    )
-                                    return False, track_id
-
-                            # Extract text from PDF (encrypted PDFs are now decrypted, unencrypted PDFs proceed directly)
-                            for page in reader.pages:
-                                content += page.extract_text() + "\n"
+                            # Use pypdf (non-blocking via to_thread)
+                            content = await asyncio.to_thread(
+                                _extract_pdf_pypdf,
+                                file,
+                                global_args.pdf_decrypt_password,
+                            )
                    except Exception as e:
                        error_files = [
                            {
@@ -1172,28 +1235,17 @@ async def pipeline_enqueue_file(

                case ".docx":
                    try:
-                        if global_args.document_loading_engine == "DOCLING":
-                            if not pm.is_installed("docling"):  # type: ignore
-                                pm.install("docling")
-                            from docling.document_converter import DocumentConverter  # type: ignore
-
-                            converter = DocumentConverter()
-                            result = converter.convert(file_path)
-                            content = result.document.export_to_markdown()
-                        else:
-                            if not pm.is_installed("python-docx"):  # type: ignore
-                                try:
-                                    pm.install("python-docx")
-                                except Exception:
-                                    pm.install("docx")
-                            from docx import Document  # type: ignore
-                            from io import BytesIO
-
-                            docx_file = BytesIO(file)
-                            doc = Document(docx_file)
-                            content = "\n".join(
-                                [paragraph.text for paragraph in doc.paragraphs]
+                        # Try DOCLING first if configured and available
+                        if (
+                            global_args.document_loading_engine == "DOCLING"
+                            and DOCLING_AVAILABLE
+                        ):
+                            content = await asyncio.to_thread(
+                                _convert_with_docling, file_path
                            )
+                        else:
+                            # Use python-docx (non-blocking via to_thread)
+                            content = await asyncio.to_thread(_extract_docx, file)
                    except Exception as e:
                        error_files = [
                            {
@@ -1213,26 +1265,17 @@ async def pipeline_enqueue_file(

                case ".pptx":
                    try:
-                        if global_args.document_loading_engine == "DOCLING":
-                            if not pm.is_installed("docling"):  # type: ignore
-                                pm.install("docling")
-                            from docling.document_converter import DocumentConverter  # type: ignore
-
-                            converter = DocumentConverter()
-                            result = converter.convert(file_path)
-                            content = result.document.export_to_markdown()
+                        # Try DOCLING first if configured and available
+                        if (
+                            global_args.document_loading_engine == "DOCLING"
+                            and DOCLING_AVAILABLE
+                        ):
+                            content = await asyncio.to_thread(
+                                _convert_with_docling, file_path
+                            )
                        else:
-                            if not pm.is_installed("python-pptx"):  # type: ignore
-                                pm.install("pptx")
-                            from pptx import Presentation  # type: ignore
-                            from io import BytesIO
-
-                            pptx_file = BytesIO(file)
-                            prs = Presentation(pptx_file)
-                            for slide in prs.slides:
-                                for shape in slide.shapes:
-                                    if hasattr(shape, "text"):
-                                        content += shape.text + "\n"
+                            # Use python-pptx (non-blocking via to_thread)
+                            content = await asyncio.to_thread(_extract_pptx, file)
                    except Exception as e:
                        error_files = [
                            {
@@ -1252,33 +1295,17 @@ async def pipeline_enqueue_file(

                case ".xlsx":
                    try:
-                        if global_args.document_loading_engine == "DOCLING":
-                            if not pm.is_installed("docling"):  # type: ignore
-                                pm.install("docling")
-                            from docling.document_converter import DocumentConverter  # type: ignore
-
-                            converter = DocumentConverter()
-                            result = converter.convert(file_path)
-                            content = result.document.export_to_markdown()
+                        # Try DOCLING first if configured and available
+                        if (
+                            global_args.document_loading_engine == "DOCLING"
+                            and DOCLING_AVAILABLE
+                        ):
+                            content = await asyncio.to_thread(
+                                _convert_with_docling, file_path
+                            )
                        else:
-                            if not pm.is_installed("openpyxl"):  # type: ignore
-                                pm.install("openpyxl")
-                            from openpyxl import load_workbook  # type: ignore
-                            from io import BytesIO
-
-                            xlsx_file = BytesIO(file)
-                            wb = load_workbook(xlsx_file)
-                            for sheet in wb:
-                                content += f"Sheet: {sheet.title}\n"
-                                for row in sheet.iter_rows(values_only=True):
-                                    content += (
-                                        "\t".join(
-                                            str(cell) if cell is not None else ""
-                                            for cell in row
-                                        )
-                                        + "\n"
-                                    )
-                                content += "\n"
+                            # Use openpyxl (non-blocking via to_thread)
+                            content = await asyncio.to_thread(_extract_xlsx, file)
                    except Exception as e:
                        error_files = [
                            {
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -79,18 +79,20 @@ api = [
    "python-multipart",
    "pytz",
    "uvicorn",
+    # Document processing dependencies (required for API document upload functionality)
+    "openpyxl>=3.0.0,<4.0.0",      # XLSX processing
+    "pycryptodome>=3.0.0,<4.0.0",  # PDF encryption support
+    "pypdf>=6.1.0",                 # PDF processing
+    "python-docx>=0.8.11,<2.0.0",  # DOCX processing
+    "python-pptx>=0.6.21,<2.0.0",  # PPTX processing
+]
+
+# Advanced document processing engine (optional)
+docling = [
+    "docling>=2.0.0,<3.0.0",
 ]

 # Offline deployment dependencies (layered design for flexibility)
-offline-docs = [
-    # Document processing dependencies
-    "openpyxl>=3.0.0,<4.0.0",
-    "pycryptodome>=3.0.0,<4.0.0",
-    "pypdf>=6.1.0",
-    "python-docx>=0.8.11,<2.0.0",
-    "python-pptx>=0.6.21,<2.0.0",
-]
-
 offline-storage = [
    # Storage backend dependencies
    "redis>=5.0.0,<8.0.0",
@@ -115,8 +117,8 @@ offline-llm = [
 ]

 offline = [
-    # Complete offline package (includes all offline dependencies)
-    "lightrag-hku[offline-docs,offline-storage,offline-llm]",
+    # Complete offline package (includes api for document processing, plus storage and LLM)
+    "lightrag-hku[api,offline-storage,offline-llm]",
 ]

 evaluation = [
--- a/requirements-offline-docs.txt
+++ b/requirements-offline-docs.txt
@@ -1,15 +0,0 @@
-# LightRAG Offline Dependencies - Document Processing
-# Install with: pip install -r requirements-offline-docs.txt
-# For offline installation:
-#   pip download -r requirements-offline-docs.txt -d ./packages
-#   pip install --no-index --find-links=./packages -r requirements-offline-docs.txt
-#
-# Recommended: Use pip install lightrag-hku[offline-docs] for the same effect
-# Or use constraints: pip install --constraint constraints-offline.txt -r requirements-offline-docs.txt
-
-# Document processing dependencies (with version constraints matching pyproject.toml)
-openpyxl>=3.0.0,<4.0.0
-pycryptodome>=3.0.0,<4.0.0
-pypdf>=6.1.0
-python-docx>=0.8.11,<2.0.0
-python-pptx>=0.6.21,<2.0.0