Merge branch 'main' into feature/vector-model-isolation

This commit is contained in:
yangdx
2025-12-12 10:28:59 +08:00
49 changed files with 4137 additions and 1760 deletions

.github/dependabot.yml (vendored, new file, 206 lines)
View File

@@ -0,0 +1,206 @@
# Keep GitHub Actions up to date with GitHub's Dependabot...
# https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot
# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#package-ecosystem
version: 2
updates:
# ============================================================
# GitHub Actions
# PR Strategy:
# - All updates (major/minor/patch): Grouped into a single PR
# ============================================================
- package-ecosystem: github-actions
directory: /
groups:
github-actions:
patterns:
- "*" # Group all Actions updates into a single larger pull request
schedule:
interval: weekly
day: monday
time: "02:00"
timezone: "Asia/Shanghai"
labels:
- "dependencies"
- "github-actions"
open-pull-requests-limit: 2
# ============================================================
# Python (pip) Dependencies
# PR Strategy:
# - Major updates: Individual PR per package (except numpy which is ignored)
# - Minor updates: Grouped by category (llm-providers, storage, etc.)
# - Patch updates: Grouped by category
# ============================================================
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "weekly"
day: "wednesday"
time: "02:00"
timezone: "Asia/Shanghai"
cooldown:
default-days: 5
semver-major-days: 30
semver-minor-days: 7
semver-patch-days: 3
groups:
# Core dependencies - LLM providers and embeddings
llm-providers:
patterns:
- "openai"
- "anthropic"
- "google-*"
- "boto3"
- "botocore"
- "ollama"
update-types:
- "minor"
- "patch"
# Storage backends
storage:
patterns:
- "neo4j"
- "pymongo"
- "redis"
- "psycopg*"
- "asyncpg"
- "milvus*"
- "qdrant*"
update-types:
- "minor"
- "patch"
# Data processing and ML
data-processing:
patterns:
- "numpy"
- "scipy"
- "pandas"
- "tiktoken"
- "transformers"
- "torch*"
update-types:
- "minor"
- "patch"
# Web framework and API
web-framework:
patterns:
- "fastapi"
- "uvicorn"
- "gunicorn"
- "starlette"
- "pydantic*"
update-types:
- "minor"
- "patch"
# Development and testing tools
dev-tools:
patterns:
- "pytest*"
- "ruff"
- "pre-commit"
- "black"
- "mypy"
update-types:
- "minor"
- "patch"
# Minor and patch updates for everything else
python-minor-patch:
patterns:
- "*"
update-types:
- "minor"
- "patch"
ignore:
- dependency-name: "numpy"
update-types:
- "version-update:semver-major"
labels:
- "dependencies"
- "python"
open-pull-requests-limit: 5
# ============================================================
# Frontend (bun) Dependencies
# PR Strategy:
# - Major updates: Individual PR per package
# - Minor updates: Grouped by category (react, ui-components, etc.)
# - Patch updates: Grouped by category
# ============================================================
- package-ecosystem: "bun"
directory: "/lightrag_webui"
schedule:
interval: "weekly"
day: "friday"
time: "02:00"
timezone: "Asia/Shanghai"
cooldown:
default-days: 5
semver-major-days: 30
semver-minor-days: 7
semver-patch-days: 3
groups:
# React ecosystem
react:
patterns:
- "react"
- "react-dom"
- "react-router*"
- "@types/react*"
update-types:
- "minor"
- "patch"
# UI components and styling
ui-components:
patterns:
- "@radix-ui/*"
- "tailwind*"
- "@tailwindcss/*"
- "lucide-react"
- "class-variance-authority"
- "clsx"
update-types:
- "minor"
- "patch"
# Graph visualization
graph-viz:
patterns:
- "sigma"
- "@sigma/*"
- "graphology*"
update-types:
- "minor"
- "patch"
# Build tools and dev dependencies
build-tools:
patterns:
- "vite"
- "@vitejs/*"
- "typescript"
- "eslint*"
- "@eslint/*"
- "typescript-eslint"
- "prettier"
- "prettier-*"
- "@types/bun"
update-types:
- "minor"
- "patch"
# Content rendering libraries (math, diagrams, etc.)
content-rendering:
patterns:
- "katex"
- "mermaid"
update-types:
- "minor"
- "patch"
# All other minor and patch updates
frontend-minor-patch:
patterns:
- "*"
update-types:
- "minor"
- "patch"
labels:
- "dependencies"
- "frontend"
open-pull-requests-limit: 5

View File

@@ -0,0 +1,58 @@
name: "Copilot Setup Steps"
# Automatically run the setup steps when they are changed to allow for easy validation, and
# allow manual testing through the repository's "Actions" tab
on:
workflow_dispatch:
push:
paths:
- .github/workflows/copilot-setup-steps.yml
pull_request:
paths:
- .github/workflows/copilot-setup-steps.yml
jobs:
# The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
copilot-setup-steps:
runs-on: ubuntu-latest
# Timeout after 30 minutes (maximum is 59)
timeout-minutes: 30
# You can define any steps you want, and they will run before the agent starts.
# If you do not check out your code, Copilot will do this for you.
steps:
- name: Checkout code
uses: actions/checkout@v6
- name: Set up Python 3.11
uses: actions/setup-python@v6
with:
python-version: '3.11'
- name: Cache pip packages
uses: actions/cache@v4
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-copilot-${{ hashFiles('**/pyproject.toml') }}
restore-keys: |
${{ runner.os }}-pip-copilot-
${{ runner.os }}-pip-
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install -e ".[api]"
pip install pytest pytest-asyncio httpx
- name: Create minimal frontend stub for Copilot agent
run: |
mkdir -p lightrag/api/webui
echo '<!DOCTYPE html><html><head><title>LightRAG - Copilot Agent</title></head><body><h1>Copilot Agent Mode</h1></body></html>' > lightrag/api/webui/index.html
echo "Created minimal frontend stub for Copilot agent environment"
- name: Verify installation
run: |
python --version
pip list | grep lightrag
lightrag-server --help || echo "Note: Server requires .env configuration to run"

View File

@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
fetch-depth: 0
@@ -66,7 +66,7 @@ jobs:
type=raw,value=lite
- name: Build and push lite Docker image
uses: docker/build-push-action@v5
uses: docker/build-push-action@v6
with:
context: .
file: ./Dockerfile.lite

View File

@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
fetch-depth: 0 # Fetch all history for tags
@@ -61,7 +61,7 @@ jobs:
type=raw,value=${{ steps.get_tag.outputs.tag }}
- name: Build and push Docker image
uses: docker/build-push-action@v5
uses: docker/build-push-action@v6
with:
context: .
file: ./Dockerfile

View File

@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
uses: actions/checkout@v6
with:
fetch-depth: 0 # Fetch all history for tags
@@ -63,7 +63,7 @@ jobs:
type=raw,value=latest,enable=${{ steps.check_prerelease.outputs.is_prerelease == 'false' }}
- name: Build and push Docker image
uses: docker/build-push-action@v5
uses: docker/build-push-action@v6
with:
context: .
file: ./Dockerfile

View File

@@ -10,14 +10,15 @@ on:
jobs:
lint-and-format:
name: Linting and Formatting
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
uses: actions/checkout@v6
- name: Set up Python
uses: actions/setup-python@v2
uses: actions/setup-python@v6
with:
python-version: '3.x'

View File

@@ -13,13 +13,13 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6
with:
fetch-depth: 0 # Fetch all history for tags
# Build frontend WebUI
- name: Setup Bun
uses: oven-sh/setup-bun@v1
uses: oven-sh/setup-bun@v2
with:
bun-version: latest
@@ -40,7 +40,7 @@ jobs:
echo "Frontend files:"
ls -lh lightrag/api/webui/ | head -10
- uses: actions/setup-python@v5
- uses: actions/setup-python@v6
with:
python-version: "3.x"
@@ -64,7 +64,7 @@ jobs:
python -m build
- name: Upload distributions
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v5
with:
name: release-dists
path: dist/
@@ -81,7 +81,7 @@ jobs:
steps:
- name: Retrieve release distributions
uses: actions/download-artifact@v4
uses: actions/download-artifact@v6
with:
name: release-dists
path: dist/

View File

@@ -13,7 +13,7 @@ jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@v9
- uses: actions/stale@v10
with:
days-before-stale: 90 # 90 days
days-before-close: 7 # 7 days after marked as stale

View File

@@ -13,13 +13,13 @@ jobs:
strategy:
matrix:
python-version: ['3.10', '3.11', '3.12']
python-version: ['3.12', '3.13', '3.14']
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
@@ -45,7 +45,7 @@ jobs:
- name: Upload test results
if: always()
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v5
with:
name: test-results-py${{ matrix.python-version }}
path: |

View File

@@ -12,7 +12,7 @@ LightRAG is an advanced Retrieval-Augmented Generation (RAG) framework designed
- `python -m venv .venv && source .venv/bin/activate`: set up the Python runtime.
- `pip install -e .` / `pip install -e .[api]`: install the package and API extras in editable mode.
- `lightrag-server` or `uvicorn lightrag.api.lightrag_server:app --reload`: start the API locally; ensure `.env` is present.
- `python -m pytest tests` or `python test_graph_storage.py`: run the full suite or a targeted script.
- `python -m pytest tests` (offline markers apply by default) or `python -m pytest tests --run-integration` / `python test_graph_storage.py`: run the full suite, opt into integration coverage, or target an individual script.
- `ruff check .`: lint Python sources before committing.
- `bun install`, `bun run dev`, `bun run build`, `bun test`: manage the web UI workflow (Bun is mandatory).
@@ -24,9 +24,11 @@ LightRAG is an advanced Retrieval-Augmented Generation (RAG) framework designed
- Front-end code should remain in TypeScript with two-space indentation, rely on functional React components with hooks, and follow Tailwind utility style.
## Testing Guidelines
- Add pytest cases beside the affected module or the relevant `test_*.py`; functions should start with `test_`.
- Export required `LIGHTRAG_*` environment variables before running integration or storage tests.
- For UI updates, pair code with Vitest specs and run `bun test`.
- Keep pytest additions close to the code you touch (`tests/` mirrors feature folders and there are root-level `test_*.py` helpers); functions must start with `test_`.
- Follow `tests/pytest.ini`: markers include `offline`, `integration`, `requires_db`, and `requires_api`, and the suite runs with `-m "not integration"` by default—pass `--run-integration` (or set `LIGHTRAG_RUN_INTEGRATION=true`) when external services are available.
- Use the custom CLI toggles from `tests/conftest.py`: `--keep-artifacts`/`LIGHTRAG_KEEP_ARTIFACTS=true`, `--stress-test`/`LIGHTRAG_STRESS_TEST=true`, and `--test-workers N`/`LIGHTRAG_TEST_WORKERS` to dial up workloads or preserve temp files during investigations.
- Export other required `LIGHTRAG_*` environment variables before running integration or storage tests so adapters can reach configured backends.
- For UI updates, pair changes with Vitest specs and run `bun test`.
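To make the marker and toggle guidance above concrete, here is a minimal sketch (marker names follow `tests/pytest.ini` as described; the test bodies are illustrative placeholders):
```python
import pytest

# Offline tests run in the default "-m 'not integration'" selection.
@pytest.mark.offline
def test_prompt_formatting_offline():
    assert "query: {q}".format(q="ping") == "query: ping"

# Integration tests only run when --run-integration is passed (or
# LIGHTRAG_RUN_INTEGRATION=true) and the LIGHTRAG_* variables point at a
# reachable backend.
@pytest.mark.integration
@pytest.mark.requires_db
def test_storage_roundtrip():
    ...
```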
## Commit & Pull Request Guidelines
- Use concise, imperative commit subjects (e.g., `Fix lock key normalization`) and add body context only when necessary.
@@ -37,3 +39,10 @@ LightRAG is an advanced Retrieval-Augmented Generation (RAG) framework designed
- Copy `.env.example` and `config.ini.example`; never commit secrets or real connection strings.
- Configure storage backends through `LIGHTRAG_*` variables and validate them with `docker-compose` services when needed.
- Treat `lightrag.log*` as local artefacts; purge sensitive information before sharing logs or outputs.
## Automation & Agent Workflow
- Use repo-relative `workdir` arguments for every shell command and prefer `rg`/`rg --files` for searches since they are faster under the CLI harness.
- Default edits to ASCII, rely on `apply_patch` for single-file changes, and only add concise comments that aid comprehension of complex logic.
- Honor existing local modifications; never revert or discard user changes (especially via `git reset --hard`) unless explicitly asked.
- Follow the planning tool guidance: skip it for trivial fixes, but provide multi-step plans for non-trivial work and keep the plan updated as steps progress.
- Validate changes by running the relevant `ruff`/`pytest`/`bun test` commands whenever feasible, and describe any unrun checks with follow-up guidance.

View File

@@ -53,28 +53,24 @@
## 🎉 新闻
- [x] [2025.11.05]🎯📢添加**基于RAGAS的**评估框架和**Langfuse**可观测性支持。
- [x] [2025.10.22]🎯📢消除处理**大规模数据集**的瓶颈。
- [x] [2025.09.15]🎯📢显著提升**小型LLM**如Qwen3-30B-A3B的知识图谱提取准确性。
- [x] [2025.08.29]🎯📢现已支持**Reranker**,显著提升混合查询性能。
- [x] [2025.08.04]🎯📢支持**文档删除**并重新生成知识图谱以确保查询性能。
- [x] [2025.06.16]🎯📢我们的团队发布了[RAG-Anything](https://github.com/HKUDS/RAG-Anything),一个用于无缝处理文本、图像、表格和方程式的全功能多模态 RAG 系统。
- [X] [2025.06.05]🎯📢LightRAG现已集成[RAG-Anything](https://github.com/HKUDS/RAG-Anything)支持全面的多模态文档解析与RAG能力PDF、图片、Office文档、表格、公式等。详见下方[多模态处理模块](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#多模态文档处理rag-anything集成)。
- [X] [2025.03.18]🎯📢LightRAG现已支持引文功能。
- [X] [2025.02.05]🎯📢我们团队发布了[VideoRAG](https://github.com/HKUDS/VideoRAG),用于理解超长上下文视频
- [X] [2025.01.13]🎯📢我们团队发布了[MiniRAG](https://github.com/HKUDS/MiniRAG)使用小型模型简化RAG
- [X] [2025.01.06]🎯📢现在您可以[使用PostgreSQL进行存储](#using-postgresql-for-storage)
- [X] [2024.11.25]🎯📢LightRAG现在支持无缝集成[自定义知识图谱](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg),使用户能够用自己的领域专业知识增强系统
- [X] [2024.11.19]🎯📢LightRAG的综合指南现已在[LearnOpenCV](https://learnopencv.com/lightrag)上发布。非常感谢博客作者。
- [X] [2024.11.11]🎯📢LightRAG现在支持[通过实体名称删除实体](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete)
- [X] [2024.11.09]🎯📢推出[LightRAG Gui](https://lightrag-gui.streamlit.app)允许您插入、查询、可视化和下载LightRAG知识
- [X] [2024.11.04]🎯📢现在您可以[使用Neo4J进行存储](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-neo4j-for-storage)。
- [X] [2024.10.29]🎯📢LightRAG现在通过`textract`支持多种文件类型包括PDF、DOC、PPT和CSV。
- [X] [2024.10.20]🎯📢我们为LightRAG添加了一个新功能图形可视化。
- [X] [2024.10.18]🎯📢我们添加了[LightRAG介绍视频](https://youtu.be/oageL-1I0GE)的链接。感谢作者!
- [X] [2024.10.17]🎯📢我们创建了一个[Discord频道](https://discord.gg/yF2MmDJyGJ)!欢迎加入分享和讨论!🎉🎉
- [X] [2024.10.16]🎯📢LightRAG现在支持[Ollama模型](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)
- [X] [2024.10.15]🎯📢LightRAG现在支持[Hugging Face模型](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)
- [2025.11.05]🎯添加**基于RAGAS的**评估框架和**Langfuse**可观测性支持API可随查询结果返回召回上下文
- [2025.10.22]🎯消除处理**大规模数据集**的性能瓶颈。
- [2025.09.15]🎯显著提升**小型LLM**如Qwen3-30B-A3B的知识图谱提取准确性。
- [2025.08.29]🎯现已支持**Reranker**,显著提升混合查询性能(现已设为默认查询模式)
- [2025.08.04]🎯支持**文档删除**并重新生成知识图谱以确保查询性能。
- [2025.06.16]🎯我们的团队发布了[RAG-Anything](https://github.com/HKUDS/RAG-Anything),一个用于无缝处理文本、图像、表格和方程式的全功能多模态 RAG 系统。
- [2025.06.05]🎯LightRAG现已集成[RAG-Anything](https://github.com/HKUDS/RAG-Anything)支持全面的多模态文档解析与RAG能力PDF、图片、Office文档、表格、公式等。详见下方[多模态处理模块](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#多模态文档处理rag-anything集成)。
- [2025.03.18]🎯LightRAG现已支持参考文献功能。
- [2025.02.12]🎯现在您可以使用MongoDB作为一体化存储解决方案
- [2025.02.05]🎯我们团队发布了[VideoRAG](https://github.com/HKUDS/VideoRAG)用于理解超长上下文视频
- [2025.01.13]🎯我们团队发布了[MiniRAG](https://github.com/HKUDS/MiniRAG)使用小型模型简化RAG
- [2025.01.06]🎯现在您可以使用PostgreSQL作为一体化存储解决方案
- [2024.11.19]🎯LightRAG的综合指南现已在[LearnOpenCV](https://learnopencv.com/lightrag)上发布。非常感谢博客作者。
- [2024.11.09]🎯推出LightRAG Webui允许您插入、查询、可视化LightRAG知识
- [2024.11.04]🎯现在您可以[使用Neo4J进行存储](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-neo4j-for-storage)
- [2024.10.18]🎯我们添加了[LightRAG介绍视频](https://youtu.be/oageL-1I0GE)的链接。感谢作者!
- [2024.10.17]🎯我们创建了一个[Discord频道](https://discord.gg/yF2MmDJyGJ)!欢迎加入分享和讨论!🎉🎉
- [2024.10.16]🎯LightRAG现在支持[Ollama模型](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)
<details>
<summary style="font-size: 1.4em; font-weight: bold; cursor: pointer; display: list-item;">
@@ -411,6 +407,11 @@ LightRAG 需要利用LLM和Embeding模型来完成文档索引和知识库查询
* LightRAG还支持类OpenAI的聊天/嵌入API
```python
import os
import numpy as np
from lightrag.utils import wrap_embedding_func_with_attrs
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
async def llm_model_func(
prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
) -> str:
@@ -424,8 +425,9 @@ async def llm_model_func(
**kwargs
)
@wrap_embedding_func_with_attrs(embedding_dim=4096, max_token_size=8192)
async def embedding_func(texts: list[str]) -> np.ndarray:
return await openai_embed(
return await openai_embed.func(
texts,
model="solar-embedding-1-large-query",
api_key=os.getenv("UPSTAGE_API_KEY"),
@@ -436,16 +438,17 @@ async def initialize_rag():
rag = LightRAG(
working_dir=WORKING_DIR,
llm_model_func=llm_model_func,
embedding_func=EmbeddingFunc(
embedding_dim=4096,
func=embedding_func
)
embedding_func=embedding_func # 直接传入装饰后的函数
)
await rag.initialize_storages()
return rag
```
> **关于嵌入函数封装的重要说明:**
>
> `EmbeddingFunc` 不能嵌套封装。已经被 `@wrap_embedding_func_with_attrs` 装饰过的嵌入函数(如 `openai_embed`、`ollama_embed` 等)不能再次使用 `EmbeddingFunc()` 封装。这就是为什么在创建自定义嵌入函数时,我们调用 `xxx_embed.func`(底层未封装的函数)而不是直接调用 `xxx_embed`。
</details>
<details>
@@ -477,24 +480,26 @@ rag = LightRAG(
<details>
<summary> <b>使用Ollama模型</b> </summary>
如果您想使用Ollama模型您需要拉取计划使用的模型和嵌入模型例如`nomic-embed-text`。
然后您只需要按如下方式设置LightRAG
```python
import numpy as np
from lightrag.utils import wrap_embedding_func_with_attrs
from lightrag.llm.ollama import ollama_model_complete, ollama_embed
@wrap_embedding_func_with_attrs(embedding_dim=768, max_token_size=8192)
async def embedding_func(texts: list[str]) -> np.ndarray:
return await ollama_embed.func(texts, embed_model="nomic-embed-text")
# 使用Ollama模型初始化LightRAG
rag = LightRAG(
working_dir=WORKING_DIR,
llm_model_func=ollama_model_complete, # 使用Ollama模型进行文本生成
llm_model_name='your_model_name', # 您的模型名称
# 使用Ollama嵌入函数
embedding_func=EmbeddingFunc(
embedding_dim=768,
func=lambda texts: ollama_embed(
texts,
embed_model="nomic-embed-text"
)
),
embedding_func=embedding_func, # 直接传入装饰后的函数
)
```
@@ -533,22 +538,27 @@ ollama create -f Modelfile qwen2m
您可以使用`llm_model_kwargs`参数配置ollama
```python
import numpy as np
from lightrag.utils import wrap_embedding_func_with_attrs
from lightrag.llm.ollama import ollama_model_complete, ollama_embed
@wrap_embedding_func_with_attrs(embedding_dim=768, max_token_size=8192)
async def embedding_func(texts: list[str]) -> np.ndarray:
return await ollama_embed.func(texts, embed_model="nomic-embed-text")
rag = LightRAG(
working_dir=WORKING_DIR,
llm_model_func=ollama_model_complete, # 使用Ollama模型进行文本生成
llm_model_name='your_model_name', # 您的模型名称
llm_model_kwargs={"options": {"num_ctx": 32768}},
# 使用Ollama嵌入函数
embedding_func=EmbeddingFunc(
embedding_dim=768,
func=lambda texts: ollama_embed(
texts,
embed_model="nomic-embed-text"
)
),
embedding_func=embedding_func, # 直接传入装饰后的函数
)
```
> **关于嵌入函数封装的重要说明:**
>
> `EmbeddingFunc` 不能嵌套封装。已经被 `@wrap_embedding_func_with_attrs` 装饰过的嵌入函数(如 `openai_embed`、`ollama_embed` 等)不能再次使用 `EmbeddingFunc()` 封装。这就是为什么在创建自定义嵌入函数时,我们调用 `xxx_embed.func`(底层未封装的函数)而不是直接调用 `xxx_embed`。
* **低RAM GPU**
为了在低RAM GPU上运行此实验您应该选择小型模型并调整上下文窗口增加上下文会增加内存消耗。例如在6Gb RAM的改装挖矿GPU上运行这个ollama示例需要将上下文大小设置为26k同时使用`gemma2:2b`。它能够在`book.txt`中找到197个实体和19个关系。
@@ -560,7 +570,7 @@ rag = LightRAG(
LightRAG支持与LlamaIndex集成 (`llm/llama_index_impl.py`):
- 通过LlamaIndex与OpenAI和其他提供商集成
- 详细设置和示例请参见[LlamaIndex文档](lightrag/llm/Readme.md)
- 详细设置和示例请参见[LlamaIndex文档](https://developers.llamaindex.ai/python/framework/)
**使用示例:**
@@ -622,9 +632,10 @@ if __name__ == "__main__":
**详细文档和示例,请参见:**
- [LlamaIndex文档](lightrag/llm/Readme.md)
- [直接OpenAI示例](examples/lightrag_llamaindex_direct_demo.py)
- [LiteLLM代理示例](examples/lightrag_llamaindex_litellm_demo.py)
- [LlamaIndex文档](https://developers.llamaindex.ai/python/framework/)
- [直接OpenAI示例](examples/unofficial-sample/lightrag_llamaindex_direct_demo.py)
- [LiteLLM代理示例](examples/unofficial-sample/lightrag_llamaindex_litellm_demo.py)
- [LiteLLM+OPIK代理示例](examples/unofficial-sample/lightrag_llamaindex_litellm_opik_demo.py)
</details>
@@ -885,7 +896,7 @@ rag = LightRAG(
对于生产级场景您很可能想要利用企业级解决方案。PostgreSQL可以为您提供一站式存储解决方案作为KV存储、向量数据库pgvector和图数据库apache AGE。支持 PostgreSQL 版本为16.6或以上。
* 如果您是初学者并想避免麻烦推荐使用docker请从这个镜像开始请务必阅读概述https://hub.docker.com/r/shangor/postgres-for-rag
* 如果您是初学者并想避免麻烦推荐使用docker请从这个镜像开始默认帐号密码:rag/raghttps://hub.docker.com/r/gzdaniel/postgres-for-rag
* Apache AGE的性能不如Neo4j。追求高性能的图数据库请使用Neo4j。
</details>
@@ -1527,7 +1538,7 @@ LANGFUSE_ENABLE_TRACE=true
## RAGAS评估
**RAGAS**Retrieval Augmented Generation Assessment检索增强生成评估是一个使用LLM对RAG系统进行无参考评估的框架。我们提供了基于RAGAS的评估脚本。详细信息请参阅[基于RAGAS的评估框架](lightrag/evaluation/README.md)。
**RAGAS**Retrieval Augmented Generation Assessment检索增强生成评估是一个使用LLM对RAG系统进行无参考评估的框架。我们提供了基于RAGAS的评估脚本。详细信息请参阅[基于RAGAS的评估框架](lightrag/evaluation/README_EVALUASTION_RAGAS.md)。
## 评估

README.md (117 lines changed)
View File

@@ -51,28 +51,24 @@
---
## 🎉 News
- [x] [2025.11.05]🎯📢Add **RAGAS-based** Evaluation Framework and **Langfuse** observability for LightRAG.
- [x] [2025.10.22]🎯📢Eliminate bottlenecks in processing **large-scale datasets**.
- [x] [2025.09.15]🎯📢Significantly enhances KG extraction accuracy for **small LLMs** like Qwen3-30B-A3B.
- [x] [2025.08.29]🎯📢**Reranker** is supported now , significantly boosting performance for mixed queries.
- [x] [2025.08.04]🎯📢**Document deletion** with KG regeneration to ensure query performance.
- [x] [2025.06.16]🎯📢Our team has released [RAG-Anything](https://github.com/HKUDS/RAG-Anything) an All-in-One Multimodal RAG System for seamless text, image, table, and equation processing.
- [X] [2025.06.05]🎯📢LightRAG now supports comprehensive multimodal data handling through [RAG-Anything](https://github.com/HKUDS/RAG-Anything) integration, enabling seamless document parsing and RAG capabilities across diverse formats including PDFs, images, Office documents, tables, and formulas. Please refer to the new [multimodal section](https://github.com/HKUDS/LightRAG/?tab=readme-ov-file#multimodal-document-processing-rag-anything-integration) for details.
- [X] [2025.03.18]🎯📢LightRAG now supports citation functionality, enabling proper source attribution.
- [X] [2025.02.05]🎯📢Our team has released [VideoRAG](https://github.com/HKUDS/VideoRAG) understanding extremely long-context videos.
- [X] [2025.01.13]🎯📢Our team has released [MiniRAG](https://github.com/HKUDS/MiniRAG) making RAG simpler with small models.
- [X] [2025.01.06]🎯📢You can now [use PostgreSQL for Storage](#using-postgresql-for-storage).
- [X] [2024.11.25]🎯📢LightRAG now supports seamless integration of [custom knowledge graphs](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg), empowering users to enhance the system with their own domain expertise.
- [X] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author.
- [X] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete).
- [X] [2024.11.09]🎯📢Introducing the [LightRAG Gui](https://lightrag-gui.streamlit.app), which allows you to insert, query, visualize, and download LightRAG knowledge.
- [X] [2024.11.04]🎯📢You can now [use Neo4J for Storage](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-neo4j-for-storage).
- [X] [2024.10.29]🎯📢LightRAG now supports multiple file types, including PDF, DOC, PPT, and CSV via `textract`.
- [X] [2024.10.20]🎯📢We've added a new feature to LightRAG: Graph Visualization.
- [X] [2024.10.18]🎯📢We've added a link to a [LightRAG Introduction Video](https://youtu.be/oageL-1I0GE). Thanks to the author!
- [X] [2024.10.17]🎯📢We have created a [Discord channel](https://discord.gg/yF2MmDJyGJ)! Welcome to join for sharing and discussions! 🎉🎉
- [X] [2024.10.16]🎯📢LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)!
- [X] [2024.10.15]🎯📢LightRAG now supports [Hugging Face models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)!
- [2025.11]🎯[New Feature]: Integrated **RAGAS for Evaluation** and **Langfuse for Tracing**. Updated the API to return retrieved contexts alongside query results to support context precision metrics.
- [2025.10]🎯[Scalability Enhancement]: Eliminated processing bottlenecks to support **Large-Scale Datasets Efficiently**.
- [2025.09]🎯[New Feature] Enhances knowledge graph extraction accuracy for **Open-Sourced LLMs** such as Qwen3-30B-A3B.
- [2025.08]🎯[New Feature] **Reranker** is now supported, significantly boosting performance for mixed queries (set as default query mode).
- [2025.08]🎯[New Feature] Added **Document Deletion** with automatic KG regeneration to ensure optimal query performance.
- [2025.06]🎯[New Release] Our team has released [RAG-Anything](https://github.com/HKUDS/RAG-Anything), an **All-in-One Multimodal RAG** system for seamless processing of text, images, tables, and equations.
- [2025.06]🎯[New Feature] LightRAG now supports comprehensive multimodal data handling through [RAG-Anything](https://github.com/HKUDS/RAG-Anything) integration, enabling seamless document parsing and RAG capabilities across diverse formats including PDFs, images, Office documents, tables, and formulas. Please refer to the new [multimodal section](https://github.com/HKUDS/LightRAG/?tab=readme-ov-file#multimodal-document-processing-rag-anything-integration) for details.
- [2025.03]🎯[New Feature] LightRAG now supports citation functionality, enabling proper source attribution and enhanced document traceability.
- [2025.02]🎯[New Feature] You can now use MongoDB as an all-in-one storage solution for unified data management.
- [2025.02]🎯[New Release] Our team has released [VideoRAG](https://github.com/HKUDS/VideoRAG), a RAG system for understanding extremely long-context videos.
- [2025.01]🎯[New Release] Our team has released [MiniRAG](https://github.com/HKUDS/MiniRAG) making RAG simpler with small models.
- [2025.01]🎯You can now use PostgreSQL as an all-in-one storage solution for data management.
- [2024.11]🎯[New Resource] A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag), with in-depth tutorials and best practices. Many thanks to the blog author for this excellent contribution!
- [2024.11]🎯[New Feature] Introducing the LightRAG WebUI — an interface that allows you to insert, query, and visualize LightRAG knowledge through an intuitive web-based dashboard.
- [2024.11]🎯[New Feature] You can now [use Neo4J for Storage](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#using-neo4j-for-storage), enabling graph database support.
- [2024.10]🎯[New Feature] We've added a link to a [LightRAG Introduction Video](https://youtu.be/oageL-1I0GE), a walkthrough of LightRAG's capabilities. Thanks to the author for this excellent contribution!
- [2024.10]🎯[New Channel] We have created a [Discord channel](https://discord.gg/yF2MmDJyGJ)!💬 Welcome to join our community for sharing, discussions, and collaboration! 🎉🎉
- [2024.10]🎯[New Feature] LightRAG now supports [Ollama models](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#quick-start)!
<details>
<summary style="font-size: 1.4em; font-weight: bold; cursor: pointer; display: list-item;">
@@ -218,7 +214,7 @@ For a streaming response implementation example, please see `examples/lightrag_o
**Note 2**: Only `lightrag_openai_demo.py` and `lightrag_openai_compatible_demo.py` are officially supported sample codes. Other sample files are community contributions that haven't undergone full testing and optimization.
## Programing with LightRAG Core
## Programming with LightRAG Core
> ⚠️ **If you would like to integrate LightRAG into your project, we recommend utilizing the REST API provided by the LightRAG Server**. LightRAG Core is typically intended for embedded applications or for researchers who wish to conduct studies and evaluations.
@@ -317,7 +313,7 @@ A full list of LightRAG init parameters:
| **vector_db_storage_cls_kwargs** | `dict` | Additional parameters for vector database, like setting the threshold for nodes and relations retrieval | cosine_better_than_threshold: 0.2 (default value changed by env var COSINE_THRESHOLD) |
| **enable_llm_cache** | `bool` | If `TRUE`, stores LLM results in cache; repeated prompts return cached responses | `TRUE` |
| **enable_llm_cache_for_entity_extract** | `bool` | If `TRUE`, stores LLM results in cache for entity extraction; Good for beginners to debug your application | `TRUE` |
| **addon_params** | `dict` | Additional parameters, e.g., `{"language": "Simplified Chinese", "entity_types": ["organization", "person", "location", "event"]}`: sets example limit, entiy/relation extraction output language | language: English` |
| **addon_params** | `dict` | Additional parameters, e.g., `{"language": "Simplified Chinese", "entity_types": ["organization", "person", "location", "event"]}`: sets example limit, entity/relation extraction output language | `language: English` |
| **embedding_cache_config** | `dict` | Configuration for question-answer caching. Contains three parameters: `enabled`: Boolean value to enable/disable cache lookup functionality. When enabled, the system will check cached responses before generating new answers. `similarity_threshold`: Float value (0-1), similarity threshold. When a new question's similarity with a cached question exceeds this threshold, the cached answer will be returned directly without calling the LLM. `use_llm_check`: Boolean value to enable/disable LLM similarity verification. When enabled, LLM will be used as a secondary check to verify the similarity between questions before returning cached answers. | Default: `{"enabled": False, "similarity_threshold": 0.95, "use_llm_check": False}` |
</details>
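To illustrate the `embedding_cache_config` row in the table above, here is a minimal sketch; the surrounding `LightRAG` arguments are borrowed from the earlier initialization examples, and the values shown are the documented defaults with caching switched on:
```python
rag = LightRAG(
    working_dir=WORKING_DIR,
    llm_model_func=llm_model_func,
    embedding_func=embedding_func,
    embedding_cache_config={
        "enabled": True,               # look up cached answers before calling the LLM
        "similarity_threshold": 0.95,  # reuse a cached answer above this similarity
        "use_llm_check": False,        # skip the secondary LLM similarity verification
    },
)
```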
@@ -368,7 +364,7 @@ class QueryParam:
max_total_tokens: int = int(os.getenv("MAX_TOTAL_TOKENS", "30000"))
"""Maximum total tokens budget for the entire query context (entities + relations + chunks + system prompt)."""
# History mesages is only send to LLM for context, not used for retrieval
# History messages are only sent to LLM for context, not used for retrieval
conversation_history: list[dict[str, str]] = field(default_factory=list)
"""Stores past conversation history to maintain context.
Format: [{"role": "user/assistant", "content": "message"}].
@@ -407,6 +403,11 @@ LightRAG requires the utilization of LLM and Embedding models to accomplish docu
* LightRAG also supports Open AI-like chat/embeddings APIs:
```python
import os
import numpy as np
from lightrag.utils import wrap_embedding_func_with_attrs
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
async def llm_model_func(
prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
) -> str:
@@ -420,8 +421,9 @@ async def llm_model_func(
**kwargs
)
@wrap_embedding_func_with_attrs(embedding_dim=4096, max_token_size=8192)
async def embedding_func(texts: list[str]) -> np.ndarray:
return await openai_embed(
return await openai_embed.func(
texts,
model="solar-embedding-1-large-query",
api_key=os.getenv("UPSTAGE_API_KEY"),
@@ -432,16 +434,17 @@ async def initialize_rag():
rag = LightRAG(
working_dir=WORKING_DIR,
llm_model_func=llm_model_func,
embedding_func=EmbeddingFunc(
embedding_dim=4096,
func=embedding_func
)
embedding_func=embedding_func # Pass the decorated function directly
)
await rag.initialize_storages()
return rag
```
> **Important Note on Embedding Function Wrapping:**
>
> `EmbeddingFunc` cannot be nested. Functions that have been decorated with `@wrap_embedding_func_with_attrs` (such as `openai_embed`, `ollama_embed`, etc.) cannot be wrapped again using `EmbeddingFunc()`. This is why we call `xxx_embed.func` (the underlying unwrapped function) instead of `xxx_embed` directly when creating custom embedding functions.
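A compact sketch of the rule stated in this note (imports mirror the example above; `text-embedding-3-small` and its 1536 dimensions are the documented OpenAI defaults):
```python
from lightrag.llm.openai import openai_embed
from lightrag.utils import wrap_embedding_func_with_attrs

# Already-decorated providers are passed through untouched:
#   LightRAG(..., embedding_func=openai_embed)
# Re-wrapping them with EmbeddingFunc(...) would nest wrappers and is not allowed.

# For a custom embedding function, apply the decorator yourself and call the
# undecorated implementation via `.func`:
@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192)
async def embedding_func(texts: list[str]):
    return await openai_embed.func(texts, model="text-embedding-3-small")
```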
</details>
<details>
@@ -473,6 +476,7 @@ rag = LightRAG(
<details>
<summary> <b>Using Ollama Models</b> </summary>
**Overview**
If you want to use Ollama models, you need to pull the model you plan to use and an embedding model, for example `nomic-embed-text`.
@@ -480,19 +484,20 @@ If you want to use Ollama models, you need to pull model you plan to use and emb
Then you only need to set LightRAG as follows:
```python
import numpy as np
from lightrag.utils import wrap_embedding_func_with_attrs
from lightrag.llm.ollama import ollama_model_complete, ollama_embed
@wrap_embedding_func_with_attrs(embedding_dim=768, max_token_size=8192)
async def embedding_func(texts: list[str]) -> np.ndarray:
return await ollama_embed.func(texts, embed_model="nomic-embed-text")
# Initialize LightRAG with Ollama model
rag = LightRAG(
working_dir=WORKING_DIR,
llm_model_func=ollama_model_complete, # Use Ollama model for text generation
llm_model_name='your_model_name', # Your model name
# Use Ollama embedding function
embedding_func=EmbeddingFunc(
embedding_dim=768,
func=lambda texts: ollama_embed(
texts,
embed_model="nomic-embed-text"
)
),
embedding_func=embedding_func, # Pass the decorated function directly
)
```
@@ -531,22 +536,27 @@ ollama create -f Modelfile qwen2m
You can use the `llm_model_kwargs` param to configure Ollama:
```python
import numpy as np
from lightrag.utils import wrap_embedding_func_with_attrs
from lightrag.llm.ollama import ollama_model_complete, ollama_embed
@wrap_embedding_func_with_attrs(embedding_dim=768, max_token_size=8192)
async def embedding_func(texts: list[str]) -> np.ndarray:
return await ollama_embed.func(texts, embed_model="nomic-embed-text")
rag = LightRAG(
working_dir=WORKING_DIR,
llm_model_func=ollama_model_complete, # Use Ollama model for text generation
llm_model_name='your_model_name', # Your model name
llm_model_kwargs={"options": {"num_ctx": 32768}},
# Use Ollama embedding function
embedding_func=EmbeddingFunc(
embedding_dim=768,
func=lambda texts: ollama_embed(
texts,
embed_model="nomic-embed-text"
)
),
embedding_func=embedding_func, # Pass the decorated function directly
)
```
> **Important Note on Embedding Function Wrapping:**
>
> `EmbeddingFunc` cannot be nested. Functions that have been decorated with `@wrap_embedding_func_with_attrs` (such as `openai_embed`, `ollama_embed`, etc.) cannot be wrapped again using `EmbeddingFunc()`. This is why we call `xxx_embed.func` (the underlying unwrapped function) instead of `xxx_embed` directly when creating custom embedding functions.
* **Low RAM GPUs**
In order to run this experiment on a low-RAM GPU you should select a small model and tune the context window (increasing the context increases memory consumption). For example, running this Ollama example on a repurposed mining GPU with 6 GB of RAM required setting the context size to 26k while using `gemma2:2b`. It was able to find 197 entities and 19 relations in `book.txt`.
@@ -558,7 +568,7 @@ In order to run this experiment on low RAM GPU you should select small model and
LightRAG supports integration with LlamaIndex (`llm/llama_index_impl.py`):
- Integrates with OpenAI and other providers through LlamaIndex
- See [LlamaIndex Documentation](lightrag/llm/Readme.md) for detailed setup and examples
- See [LlamaIndex Documentation](https://developers.llamaindex.ai/python/framework/) for detailed setup or the [examples](examples/unofficial-sample/)
**Example Usage**
@@ -620,9 +630,10 @@ if __name__ == "__main__":
**For detailed documentation and examples, see:**
- [LlamaIndex Documentation](lightrag/llm/Readme.md)
- [Direct OpenAI Example](examples/lightrag_llamaindex_direct_demo.py)
- [LiteLLM Proxy Example](examples/lightrag_llamaindex_litellm_demo.py)
- [LlamaIndex Documentation](https://developers.llamaindex.ai/python/framework/)
- [Direct OpenAI Example](examples/unofficial-sample/lightrag_llamaindex_direct_demo.py)
- [LiteLLM Proxy Example](examples/unofficial-sample/lightrag_llamaindex_litellm_demo.py)
- [LiteLLM Proxy with Opik Example](examples/unofficial-sample/lightrag_llamaindex_litellm_opik_demo.py)
</details>
@@ -849,7 +860,7 @@ see test_neo4j.py for a working example.
For production level scenarios you will most likely want to leverage an enterprise solution. PostgreSQL can provide a one-stop solution for you as KV store, VectorDB (pgvector) and GraphDB (apache AGE). PostgreSQL version 16.6 or higher is supported.
* PostgreSQL is lightweight; the whole binary distribution including all necessary plugins can be zipped to 40MB. Refer to the [Windows Release](https://github.com/ShanGor/apache-age-windows/releases/tag/PG17%2Fv1.5.0-rc0); installation on Linux/Mac is straightforward.
* If you prefer docker, please start with this image if you are a beginner to avoid hiccups (DO read the overview): https://hub.docker.com/r/shangor/postgres-for-rag
* If you prefer docker, please start with this image if you are a beginner to avoid hiccups (Default user password:rag/rag): https://hub.docker.com/r/gzdaniel/postgres-for-rag
* How to start? Ref to: [examples/lightrag_zhipu_postgres_demo.py](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_zhipu_postgres_demo.py)
* For high-performance graph database requirements, Neo4j is recommended as Apache AGE's performance is not as competitive.
@@ -1559,7 +1570,7 @@ Langfuse provides a drop-in replacement for the OpenAI client that automatically
pip install lightrag-hku
pip install lightrag-hku[observability]
# Or install from souce code with debug mode enabled
# Or install from source code with debug mode enabled
pip install -e .
pip install -e ".[observability]"
```
@@ -1595,7 +1606,7 @@ Once installed and configured, Langfuse automatically traces all OpenAI LLM call
## RAGAS-based Evaluation
**RAGAS** (Retrieval Augmented Generation Assessment) is a framework for reference-free evaluation of RAG systems using LLMs. There is an evaluation script based on RAGAS. For detailed information, please refer to [RAGAS-based Evaluation Framework](lightrag/evaluation/README.md).
**RAGAS** (Retrieval Augmented Generation Assessment) is a framework for reference-free evaluation of RAG systems using LLMs. There is an evaluation script based on RAGAS. For detailed information, please refer to [RAGAS-based Evaluation Framework](lightrag/evaluation/README_EVALUASTION_RAGAS.md).
## Evaluation

View File

@@ -102,6 +102,9 @@ RERANK_BINDING=null
# RERANK_MODEL=rerank-v3.5
# RERANK_BINDING_HOST=https://api.cohere.com/v2/rerank
# RERANK_BINDING_API_KEY=your_rerank_api_key_here
### Cohere rerank chunking configuration (useful for models with token limits like ColBERT)
# RERANK_ENABLE_CHUNKING=true
# RERANK_MAX_TOKENS_PER_DOC=480
### Default value for Jina AI
# RERANK_MODEL=jina-reranker-v2-base-multilingual
@@ -183,9 +186,13 @@ LLM_MODEL=gpt-4o
LLM_BINDING_HOST=https://api.openai.com/v1
LLM_BINDING_API_KEY=your_api_key
### Env vars for Azure openai
### Azure OpenAI example
### Use deployment name as model name or set AZURE_OPENAI_DEPLOYMENT instead
# AZURE_OPENAI_API_VERSION=2024-08-01-preview
# AZURE_OPENAI_DEPLOYMENT=gpt-4o
# LLM_BINDING=azure_openai
# LLM_BINDING_HOST=https://xxxx.openai.azure.com/
# LLM_BINDING_API_KEY=your_api_key
# LLM_MODEL=my-gpt-mini-deployment
### Openrouter example
# LLM_MODEL=google/gemini-2.5-flash
@@ -273,11 +280,14 @@ EMBEDDING_TOKEN_LIMIT=8192
EMBEDDING_BINDING_HOST=https://api.openai.com/v1
EMBEDDING_BINDING_API_KEY=your_api_key
### Optional for Azure
# AZURE_EMBEDDING_DEPLOYMENT=text-embedding-3-large
# AZURE_EMBEDDING_API_VERSION=2023-05-15
# AZURE_EMBEDDING_ENDPOINT=your_endpoint
# AZURE_EMBEDDING_API_KEY=your_api_key
### Optional for Azure embedding
### Use deployment name as model name or set AZURE_EMBEDDING_DEPLOYMENT instead
# AZURE_EMBEDDING_API_VERSION=2024-08-01-preview
# EMBEDDING_BINDING=azure_openai
# EMBEDDING_BINDING_HOST=https://xxxx.openai.azure.com/
# EMBEDDING_API_KEY=your_api_key
# EMBEDDING_MODEL=my-text-embedding-3-large-deployment
# EMBEDDING_DIM=3072
### Gemini embedding
# EMBEDDING_BINDING=gemini
@@ -440,6 +450,17 @@ MEMGRAPH_DATABASE=memgraph
### DB specific workspace should not be set, keep for compatible only
### MEMGRAPH_WORKSPACE=forced_workspace_name
###########################################################
### Langfuse Observability Configuration
### Only works with LLMs served through an OpenAI-compatible API
### Install with: pip install lightrag-hku[observability]
### Sign up at: https://cloud.langfuse.com or self-host
###########################################################
# LANGFUSE_SECRET_KEY=""
# LANGFUSE_PUBLIC_KEY=""
# LANGFUSE_HOST="https://cloud.langfuse.com" # or your self-hosted instance URL
# LANGFUSE_ENABLE_TRACE=true
############################
### Evaluation Configuration
############################

View File

@@ -1,195 +0,0 @@
################################################################################
# Autogenerated .env entries list for LightRAG binding options
#
# To generate run:
# $ python -m lightrag.llm.binding_options
################################################################################
# ollama_embedding -- Context window size (number of tokens)
# OLLAMA_EMBEDDING_NUM_CTX=4096
# ollama_embedding -- Maximum number of tokens to predict
# OLLAMA_EMBEDDING_NUM_PREDICT=128
# ollama_embedding -- Number of tokens to keep from the initial prompt
# OLLAMA_EMBEDDING_NUM_KEEP=0
# ollama_embedding -- Random seed for generation (-1 for random)
# OLLAMA_EMBEDDING_SEED=-1
# ollama_embedding -- Controls randomness (0.0-2.0, higher = more creative)
# OLLAMA_EMBEDDING_TEMPERATURE=0.8
# ollama_embedding -- Top-k sampling parameter (0 = disabled)
# OLLAMA_EMBEDDING_TOP_K=40
# ollama_embedding -- Top-p (nucleus) sampling parameter (0.0-1.0)
# OLLAMA_EMBEDDING_TOP_P=0.9
# ollama_embedding -- Tail free sampling parameter (1.0 = disabled)
# OLLAMA_EMBEDDING_TFS_Z=1.0
# ollama_embedding -- Typical probability mass (1.0 = disabled)
# OLLAMA_EMBEDDING_TYPICAL_P=1.0
# ollama_embedding -- Minimum probability threshold (0.0 = disabled)
# OLLAMA_EMBEDDING_MIN_P=0.0
# ollama_embedding -- Number of tokens to consider for repetition penalty
# OLLAMA_EMBEDDING_REPEAT_LAST_N=64
# ollama_embedding -- Penalty for repetition (1.0 = no penalty)
# OLLAMA_EMBEDDING_REPEAT_PENALTY=1.1
# ollama_embedding -- Penalty for token presence (-2.0 to 2.0)
# OLLAMA_EMBEDDING_PRESENCE_PENALTY=0.0
# ollama_embedding -- Penalty for token frequency (-2.0 to 2.0)
# OLLAMA_EMBEDDING_FREQUENCY_PENALTY=0.0
# ollama_embedding -- Mirostat sampling algorithm (0=disabled, 1=Mirostat 1.0, 2=Mirostat 2.0)
# OLLAMA_EMBEDDING_MIROSTAT=0
# ollama_embedding -- Mirostat target entropy
# OLLAMA_EMBEDDING_MIROSTAT_TAU=5.0
# ollama_embedding -- Mirostat learning rate
# OLLAMA_EMBEDDING_MIROSTAT_ETA=0.1
# ollama_embedding -- Enable NUMA optimization
# OLLAMA_EMBEDDING_NUMA=False
# ollama_embedding -- Batch size for processing
# OLLAMA_EMBEDDING_NUM_BATCH=512
# ollama_embedding -- Number of GPUs to use (-1 for auto)
# OLLAMA_EMBEDDING_NUM_GPU=-1
# ollama_embedding -- Main GPU index
# OLLAMA_EMBEDDING_MAIN_GPU=0
# ollama_embedding -- Optimize for low VRAM
# OLLAMA_EMBEDDING_LOW_VRAM=False
# ollama_embedding -- Number of CPU threads (0 for auto)
# OLLAMA_EMBEDDING_NUM_THREAD=0
# ollama_embedding -- Use half-precision for key/value cache
# OLLAMA_EMBEDDING_F16_KV=True
# ollama_embedding -- Return logits for all tokens
# OLLAMA_EMBEDDING_LOGITS_ALL=False
# ollama_embedding -- Only load vocabulary
# OLLAMA_EMBEDDING_VOCAB_ONLY=False
# ollama_embedding -- Use memory mapping for model files
# OLLAMA_EMBEDDING_USE_MMAP=True
# ollama_embedding -- Lock model in memory
# OLLAMA_EMBEDDING_USE_MLOCK=False
# ollama_embedding -- Only use for embeddings
# OLLAMA_EMBEDDING_EMBEDDING_ONLY=False
# ollama_embedding -- Penalize newline tokens
# OLLAMA_EMBEDDING_PENALIZE_NEWLINE=True
# ollama_embedding -- Stop sequences (comma-separated string)
# OLLAMA_EMBEDDING_STOP=
# ollama_llm -- Context window size (number of tokens)
# OLLAMA_LLM_NUM_CTX=4096
# ollama_llm -- Maximum number of tokens to predict
# OLLAMA_LLM_NUM_PREDICT=128
# ollama_llm -- Number of tokens to keep from the initial prompt
# OLLAMA_LLM_NUM_KEEP=0
# ollama_llm -- Random seed for generation (-1 for random)
# OLLAMA_LLM_SEED=-1
# ollama_llm -- Controls randomness (0.0-2.0, higher = more creative)
# OLLAMA_LLM_TEMPERATURE=0.8
# ollama_llm -- Top-k sampling parameter (0 = disabled)
# OLLAMA_LLM_TOP_K=40
# ollama_llm -- Top-p (nucleus) sampling parameter (0.0-1.0)
# OLLAMA_LLM_TOP_P=0.9
# ollama_llm -- Tail free sampling parameter (1.0 = disabled)
# OLLAMA_LLM_TFS_Z=1.0
# ollama_llm -- Typical probability mass (1.0 = disabled)
# OLLAMA_LLM_TYPICAL_P=1.0
# ollama_llm -- Minimum probability threshold (0.0 = disabled)
# OLLAMA_LLM_MIN_P=0.0
# ollama_llm -- Number of tokens to consider for repetition penalty
# OLLAMA_LLM_REPEAT_LAST_N=64
# ollama_llm -- Penalty for repetition (1.0 = no penalty)
# OLLAMA_LLM_REPEAT_PENALTY=1.1
# ollama_llm -- Penalty for token presence (-2.0 to 2.0)
# OLLAMA_LLM_PRESENCE_PENALTY=0.0
# ollama_llm -- Penalty for token frequency (-2.0 to 2.0)
# OLLAMA_LLM_FREQUENCY_PENALTY=0.0
# ollama_llm -- Mirostat sampling algorithm (0=disabled, 1=Mirostat 1.0, 2=Mirostat 2.0)
# OLLAMA_LLM_MIROSTAT=0
# ollama_llm -- Mirostat target entropy
# OLLAMA_LLM_MIROSTAT_TAU=5.0
# ollama_llm -- Mirostat learning rate
# OLLAMA_LLM_MIROSTAT_ETA=0.1
# ollama_llm -- Enable NUMA optimization
# OLLAMA_LLM_NUMA=False
# ollama_llm -- Batch size for processing
# OLLAMA_LLM_NUM_BATCH=512
# ollama_llm -- Number of GPUs to use (-1 for auto)
# OLLAMA_LLM_NUM_GPU=-1
# ollama_llm -- Main GPU index
# OLLAMA_LLM_MAIN_GPU=0
# ollama_llm -- Optimize for low VRAM
# OLLAMA_LLM_LOW_VRAM=False
# ollama_llm -- Number of CPU threads (0 for auto)
# OLLAMA_LLM_NUM_THREAD=0
# ollama_llm -- Use half-precision for key/value cache
# OLLAMA_LLM_F16_KV=True
# ollama_llm -- Return logits for all tokens
# OLLAMA_LLM_LOGITS_ALL=False
# ollama_llm -- Only load vocabulary
# OLLAMA_LLM_VOCAB_ONLY=False
# ollama_llm -- Use memory mapping for model files
# OLLAMA_LLM_USE_MMAP=True
# ollama_llm -- Lock model in memory
# OLLAMA_LLM_USE_MLOCK=False
# ollama_llm -- Only use for embeddings
# OLLAMA_LLM_EMBEDDING_ONLY=False
# ollama_llm -- Penalize newline tokens
# OLLAMA_LLM_PENALIZE_NEWLINE=True
# ollama_llm -- Stop sequences (comma-separated string)
# OLLAMA_LLM_STOP=
#
# End of .env entries for LightRAG binding options
################################################################################

View File

@@ -15,9 +15,12 @@ Configuration Required:
EMBEDDING_BINDING_HOST
EMBEDDING_BINDING_API_KEY
3. Configure your deployed rerank model (e.g., via vLLM) with env vars:
RERANK_MODEL
RERANK_BINDING_HOST
RERANK_BINDING=cohere
RERANK_MODEL (e.g., answerai-colbert-small-v1 or rerank-v3.5)
RERANK_BINDING_HOST (e.g., https://api.cohere.com/v2/rerank or LiteLLM proxy)
RERANK_BINDING_API_KEY
RERANK_ENABLE_CHUNKING=true (optional, for models with token limits)
RERANK_MAX_TOKENS_PER_DOC=480 (optional, default 4096)
Note: Rerank is controlled per query via the 'enable_rerank' parameter (default: True)
"""
@@ -66,9 +69,11 @@ async def embedding_func(texts: list[str]) -> np.ndarray:
rerank_model_func = partial(
cohere_rerank,
model=os.getenv("RERANK_MODEL"),
model=os.getenv("RERANK_MODEL", "rerank-v3.5"),
api_key=os.getenv("RERANK_BINDING_API_KEY"),
base_url=os.getenv("RERANK_BINDING_HOST"),
base_url=os.getenv("RERANK_BINDING_HOST", "https://api.cohere.com/v2/rerank"),
enable_chunking=os.getenv("RERANK_ENABLE_CHUNKING", "false").lower() == "true",
max_tokens_per_doc=int(os.getenv("RERANK_MAX_TOKENS_PER_DOC", "4096")),
)
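As the docstring above notes, reranking is controlled per query; a minimal sketch of that toggle (assuming the standard `QueryParam` import; `mode="mix"` is an example value):
```python
from lightrag import QueryParam

# Reranking is enabled by default for each query...
default_param = QueryParam(mode="mix")

# ...and can be disabled for an individual query when latency matters more.
no_rerank_param = QueryParam(mode="mix", enable_rerank=False)
```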

View File

@@ -1 +1 @@
__api_version__ = "0256"
__api_version__ = "0260"

View File

@@ -365,8 +365,12 @@ def parse_args() -> argparse.Namespace:
# Inject model configuration
args.llm_model = get_env_value("LLM_MODEL", "mistral-nemo:latest")
args.embedding_model = get_env_value("EMBEDDING_MODEL", "bge-m3:latest")
args.embedding_dim = get_env_value("EMBEDDING_DIM", 1024, int)
# EMBEDDING_MODEL defaults to None - each binding will use its own default model
# e.g., OpenAI uses "text-embedding-3-small", Jina uses "jina-embeddings-v4"
args.embedding_model = get_env_value("EMBEDDING_MODEL", None, special_none=True)
# EMBEDDING_DIM defaults to None - each binding will use its own default dimension
# Value is inherited from provider defaults via wrap_embedding_func_with_attrs decorator
args.embedding_dim = get_env_value("EMBEDDING_DIM", None, int, special_none=True)
args.embedding_send_dim = get_env_value("EMBEDDING_SEND_DIM", False, bool)
# Inject chunk configuration

View File

@@ -159,19 +159,22 @@ def check_frontend_build():
"""Check if frontend is built and optionally check if source is up-to-date
Returns:
bool: True if frontend is outdated, False if up-to-date or production environment
tuple: (assets_exist: bool, is_outdated: bool)
- assets_exist: True if WebUI build files exist
- is_outdated: True if source is newer than build (only in dev environment)
"""
webui_dir = Path(__file__).parent / "webui"
index_html = webui_dir / "index.html"
# 1. Check if build files exist (required)
# 1. Check if build files exist
if not index_html.exists():
ASCIIColors.red("\n" + "=" * 80)
ASCIIColors.red("ERROR: Frontend Not Built")
ASCIIColors.red("=" * 80)
ASCIIColors.yellow("\n" + "=" * 80)
ASCIIColors.yellow("WARNING: Frontend Not Built")
ASCIIColors.yellow("=" * 80)
ASCIIColors.yellow("The WebUI frontend has not been built yet.")
ASCIIColors.yellow("The API server will start without the WebUI interface.")
ASCIIColors.yellow(
"Please build the frontend code first using the following commands:\n"
"\nTo enable WebUI, build the frontend using these commands:\n"
)
ASCIIColors.cyan(" cd lightrag_webui")
ASCIIColors.cyan(" bun install --frozen-lockfile")
@@ -181,8 +184,8 @@ def check_frontend_build():
ASCIIColors.cyan(
"Note: Make sure you have Bun installed. Visit https://bun.sh for installation."
)
ASCIIColors.red("=" * 80 + "\n")
sys.exit(1) # Exit immediately
ASCIIColors.yellow("=" * 80 + "\n")
return (False, False) # Assets don't exist, not outdated
# 2. Check if this is a development environment (source directory exists)
try:
@@ -195,7 +198,7 @@ def check_frontend_build():
logger.debug(
"Production environment detected, skipping source freshness check"
)
return False
return (True, False) # Assets exist, not outdated (prod environment)
# Development environment, perform source code timestamp check
logger.debug("Development environment detected, checking source freshness")
@@ -270,20 +273,20 @@ def check_frontend_build():
ASCIIColors.cyan(" cd ..")
ASCIIColors.yellow("\nThe server will continue with the current build.")
ASCIIColors.yellow("=" * 80 + "\n")
return True # Frontend is outdated
return (True, True) # Assets exist, outdated
else:
logger.info("Frontend build is up-to-date")
return False # Frontend is up-to-date
return (True, False) # Assets exist, up-to-date
except Exception as e:
# If check fails, log warning but don't affect startup
logger.warning(f"Failed to check frontend source freshness: {e}")
return False # Assume up-to-date on error
return (True, False) # Assume assets exist and up-to-date on error
def create_app(args):
# Check frontend build first and get outdated status
is_frontend_outdated = check_frontend_build()
# Check frontend build first and get status
webui_assets_exist, is_frontend_outdated = check_frontend_build()
# Create unified API version display with warning symbol if frontend is outdated
api_version_display = (
@@ -651,6 +654,17 @@ def create_app(args):
2. Extracts max_token_size and embedding_dim from provider if it's an EmbeddingFunc
3. Creates an optimized wrapper that calls the underlying function directly (avoiding double-wrapping)
4. Returns a properly configured EmbeddingFunc instance
Configuration Rules:
- When EMBEDDING_MODEL is not set: Uses provider's default model and dimension
(e.g., jina-embeddings-v4 with 2048 dims, text-embedding-3-small with 1536 dims)
- When EMBEDDING_MODEL is set to a custom model: User MUST also set EMBEDDING_DIM
to match the custom model's dimension (e.g., for jina-embeddings-v3, set EMBEDDING_DIM=1024)
Note: The embedding_dim parameter is automatically injected by EmbeddingFunc wrapper
when send_dimensions=True (enabled for Jina and Gemini bindings). This wrapper calls
the underlying provider function directly (.func) to avoid double-wrapping, so we must
explicitly pass embedding_dim to the provider's underlying function.
"""
# Step 1: Import provider function and extract default attributes
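A small illustration of the configuration rule above (environment variable names come from `.env`; the model and dimension values mirror the docstring's examples):
```python
import os

# Case 1: leave EMBEDDING_MODEL unset and the binding falls back to its own
# default model and dimension (e.g. text-embedding-3-small / 1536 for OpenAI,
# jina-embeddings-v4 / 2048 for Jina).
os.environ.pop("EMBEDDING_MODEL", None)
os.environ.pop("EMBEDDING_DIM", None)

# Case 2: overriding the model requires pinning the matching dimension as well.
os.environ["EMBEDDING_MODEL"] = "jina-embeddings-v3"
os.environ["EMBEDDING_DIM"] = "1024"
```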
@@ -710,6 +724,7 @@ def create_app(args):
)
# Step 3: Create optimized embedding function (calls underlying function directly)
# Note: When model is None, each binding will use its own default model
async def optimized_embedding_function(texts, embedding_dim=None):
try:
if binding == "lollms":
@@ -721,9 +736,9 @@ def create_app(args):
if isinstance(lollms_embed, EmbeddingFunc)
else lollms_embed
)
return await actual_func(
texts, embed_model=model, host=host, api_key=api_key
)
# lollms embed_model is not used (server uses configured vectorizer)
# Only pass base_url and api_key
return await actual_func(texts, base_url=host, api_key=api_key)
elif binding == "ollama":
from lightrag.llm.ollama import ollama_embed
@@ -742,13 +757,16 @@ def create_app(args):
ollama_options = OllamaEmbeddingOptions.options_dict(args)
return await actual_func(
texts,
embed_model=model,
host=host,
api_key=api_key,
options=ollama_options,
)
# Pass embed_model only if provided, let function use its default (bge-m3:latest)
kwargs = {
"texts": texts,
"host": host,
"api_key": api_key,
"options": ollama_options,
}
if model:
kwargs["embed_model"] = model
return await actual_func(**kwargs)
elif binding == "azure_openai":
from lightrag.llm.azure_openai import azure_openai_embed
@@ -757,7 +775,11 @@ def create_app(args):
if isinstance(azure_openai_embed, EmbeddingFunc)
else azure_openai_embed
)
return await actual_func(texts, model=model, api_key=api_key)
# Pass model only if provided, let function use its default otherwise
kwargs = {"texts": texts, "api_key": api_key}
if model:
kwargs["model"] = model
return await actual_func(**kwargs)
elif binding == "aws_bedrock":
from lightrag.llm.bedrock import bedrock_embed
@@ -766,7 +788,11 @@ def create_app(args):
if isinstance(bedrock_embed, EmbeddingFunc)
else bedrock_embed
)
return await actual_func(texts, model=model)
# Pass model only if provided, let function use its default otherwise
kwargs = {"texts": texts}
if model:
kwargs["model"] = model
return await actual_func(**kwargs)
elif binding == "jina":
from lightrag.llm.jina import jina_embed
@@ -775,12 +801,16 @@ def create_app(args):
if isinstance(jina_embed, EmbeddingFunc)
else jina_embed
)
return await actual_func(
texts,
embedding_dim=embedding_dim,
base_url=host,
api_key=api_key,
)
# Pass model only if provided, let function use its default (jina-embeddings-v4)
kwargs = {
"texts": texts,
"embedding_dim": embedding_dim,
"base_url": host,
"api_key": api_key,
}
if model:
kwargs["model"] = model
return await actual_func(**kwargs)
elif binding == "gemini":
from lightrag.llm.gemini import gemini_embed
@@ -798,14 +828,19 @@ def create_app(args):
gemini_options = GeminiEmbeddingOptions.options_dict(args)
return await actual_func(
texts,
model=model,
base_url=host,
api_key=api_key,
embedding_dim=embedding_dim,
task_type=gemini_options.get("task_type", "RETRIEVAL_DOCUMENT"),
)
# Pass model only if provided, let function use its default (gemini-embedding-001)
kwargs = {
"texts": texts,
"base_url": host,
"api_key": api_key,
"embedding_dim": embedding_dim,
"task_type": gemini_options.get(
"task_type", "RETRIEVAL_DOCUMENT"
),
}
if model:
kwargs["model"] = model
return await actual_func(**kwargs)
else: # openai and compatible
from lightrag.llm.openai import openai_embed
@@ -814,13 +849,16 @@ def create_app(args):
if isinstance(openai_embed, EmbeddingFunc)
else openai_embed
)
return await actual_func(
texts,
model=model,
base_url=host,
api_key=api_key,
embedding_dim=embedding_dim,
)
# Pass model only if provided, let function use its default (text-embedding-3-small)
kwargs = {
"texts": texts,
"base_url": host,
"api_key": api_key,
"embedding_dim": embedding_dim,
}
if model:
kwargs["model"] = model
return await actual_func(**kwargs)
except ImportError as e:
raise Exception(f"Failed to import {binding} embedding: {e}")
@@ -967,15 +1005,27 @@ def create_app(args):
query: str, documents: list, top_n: int = None, extra_body: dict = None
):
"""Server rerank function with configuration from environment variables"""
return await selected_rerank_func(
query=query,
documents=documents,
top_n=top_n,
api_key=args.rerank_binding_api_key,
model=args.rerank_model,
base_url=args.rerank_binding_host,
extra_body=extra_body,
)
# Prepare kwargs for rerank function
kwargs = {
"query": query,
"documents": documents,
"top_n": top_n,
"api_key": args.rerank_binding_api_key,
"model": args.rerank_model,
"base_url": args.rerank_binding_host,
}
# Add Cohere-specific parameters if using cohere binding
if args.rerank_binding == "cohere":
# Enable chunking if configured (useful for models with token limits like ColBERT)
kwargs["enable_chunking"] = (
os.getenv("RERANK_ENABLE_CHUNKING", "false").lower() == "true"
)
kwargs["max_tokens_per_doc"] = int(
os.getenv("RERANK_MAX_TOKENS_PER_DOC", "4096")
)
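# Illustrative env settings for the cohere binding (example values, not defaults from this diff):
#   RERANK_ENABLE_CHUNKING=true
#   RERANK_MAX_TOKENS_PER_DOC=2048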
return await selected_rerank_func(**kwargs, extra_body=extra_body)
rerank_model_func = server_rerank_func
logger.info(
@@ -1067,8 +1117,11 @@ def create_app(args):
@app.get("/")
async def redirect_to_webui():
"""Redirect root path to /webui"""
return RedirectResponse(url="/webui")
"""Redirect root path based on WebUI availability"""
if webui_assets_exist:
return RedirectResponse(url="/webui")
else:
return RedirectResponse(url="/docs")
@app.get("/auth-status")
async def get_auth_status():
@@ -1135,9 +1188,41 @@ def create_app(args):
"webui_description": webui_description,
}
@app.get("/health", dependencies=[Depends(combined_auth)])
@app.get(
"/health",
dependencies=[Depends(combined_auth)],
summary="Get system health and configuration status",
description="Returns comprehensive system status including WebUI availability, configuration, and operational metrics",
response_description="System health status with configuration details",
responses={
200: {
"description": "Successful response with system status",
"content": {
"application/json": {
"example": {
"status": "healthy",
"webui_available": True,
"working_directory": "/path/to/working/dir",
"input_directory": "/path/to/input/dir",
"configuration": {
"llm_binding": "openai",
"llm_model": "gpt-4",
"embedding_binding": "openai",
"embedding_model": "text-embedding-ada-002",
"workspace": "default",
},
"auth_mode": "enabled",
"pipeline_busy": False,
"core_version": "0.0.1",
"api_version": "0.0.1",
}
}
},
}
},
)
async def get_status(request: Request):
"""Get current system status"""
"""Get current system status including WebUI availability"""
try:
workspace = get_workspace_from_request(request)
default_workspace = get_default_workspace()
@@ -1157,6 +1242,7 @@ def create_app(args):
return {
"status": "healthy",
"webui_available": webui_assets_exist,
"working_directory": str(args.working_dir),
"input_directory": str(args.input_dir),
"configuration": {
@@ -1246,16 +1332,27 @@ def create_app(args):
name="swagger-ui-static",
)
# Webui mount webui/index.html
static_dir = Path(__file__).parent / "webui"
static_dir.mkdir(exist_ok=True)
app.mount(
"/webui",
SmartStaticFiles(
directory=static_dir, html=True, check_dir=True
), # Use SmartStaticFiles
name="webui",
)
# Conditionally mount WebUI only if assets exist
if webui_assets_exist:
static_dir = Path(__file__).parent / "webui"
static_dir.mkdir(exist_ok=True)
app.mount(
"/webui",
SmartStaticFiles(
directory=static_dir, html=True, check_dir=True
), # Use SmartStaticFiles
name="webui",
)
logger.info("WebUI assets mounted at /webui")
else:
logger.info("WebUI assets not available, /webui route not mounted")
# Add redirect for /webui when assets are not available
@app.get("/webui")
@app.get("/webui/")
async def webui_redirect_to_docs():
"""Redirect /webui to /docs when WebUI is not available"""
return RedirectResponse(url="/docs")
return app

View File

@@ -24,7 +24,11 @@ from pydantic import BaseModel, Field, field_validator
from lightrag import LightRAG
from lightrag.base import DeletionResult, DocProcessingStatus, DocStatus
from lightrag.utils import generate_track_id
from lightrag.utils import (
generate_track_id,
compute_mdhash_id,
sanitize_text_for_encoding,
)
from lightrag.api.utils_api import get_combined_auth_dependency
from ..config import global_args
@@ -159,7 +163,7 @@ class ReprocessResponse(BaseModel):
Attributes:
status: Status of the reprocessing operation
message: Message describing the operation result
track_id: Tracking ID for monitoring reprocessing progress
track_id: Always an empty string; reprocessed documents retain their original track_id.
"""
status: Literal["reprocessing_started"] = Field(
@@ -167,7 +171,8 @@ class ReprocessResponse(BaseModel):
)
message: str = Field(description="Human-readable message describing the operation")
track_id: str = Field(
description="Tracking ID for monitoring reprocessing progress"
default="",
description="Always empty string. Reprocessed documents retain their original track_id from initial upload.",
)
class Config:
@@ -175,7 +180,7 @@ class ReprocessResponse(BaseModel):
"example": {
"status": "reprocessing_started",
"message": "Reprocessing of failed documents has been initiated in background",
"track_id": "retry_20250729_170612_def456",
"track_id": "",
}
}
@@ -976,19 +981,82 @@ def _extract_pdf_pypdf(file_bytes: bytes, password: str = None) -> str:
def _extract_docx(file_bytes: bytes) -> str:
"""Extract DOCX content (synchronous).
"""Extract DOCX content including tables in document order (synchronous).
Args:
file_bytes: DOCX file content as bytes
Returns:
str: Extracted text content
str: Extracted text content with tables in their original positions.
Tables are separated from paragraphs with blank lines for clarity.
"""
from docx import Document # type: ignore
from docx.table import Table # type: ignore
from docx.text.paragraph import Paragraph # type: ignore
docx_file = BytesIO(file_bytes)
doc = Document(docx_file)
return "\n".join([paragraph.text for paragraph in doc.paragraphs])
def escape_cell(cell_value: str | None) -> str:
"""Escape characters that would break tab-delimited layout.
Escape order is critical: backslashes first, then tabs/newlines.
This prevents double-escaping issues.
Args:
cell_value: The cell value to escape (can be None or str)
Returns:
str: Escaped cell value safe for tab-delimited format
"""
if cell_value is None:
return ""
text = str(cell_value)
# CRITICAL: Escape backslash first to avoid double-escaping
return (
text.replace("\\", "\\\\") # Must be first: \ -> \\
.replace("\t", "\\t") # Tab -> \t (visible)
.replace("\r\n", "\\n") # Windows newline -> \n
.replace("\r", "\\n") # Mac newline -> \n
.replace("\n", "\\n") # Unix newline -> \n
)
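# Sketch of the expected escaping (illustrative inputs, not taken from the diff):
#   escape_cell(None) == ""
#   escape_cell("a\tb") == "a\\tb"                  # tab becomes a visible \t
#   escape_cell("line1\r\nline2") == "line1\\nline2"
#   escape_cell("back\\slash") == "back\\\\slash"   # backslash escaped first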
content_parts = []
in_table = False # Track if we're currently processing a table
# Iterate through all body elements in document order
for element in doc.element.body:
# Check if element is a paragraph
if element.tag.endswith("p"):
# If coming out of a table, add blank line after table
if in_table:
content_parts.append("") # Blank line after table
in_table = False
paragraph = Paragraph(element, doc)
text = paragraph.text
# Always append to preserve document spacing (including blank paragraphs)
content_parts.append(text)
# Check if element is a table
elif element.tag.endswith("tbl"):
# Add blank line before table (if content exists)
if content_parts and not in_table:
content_parts.append("") # Blank line before table
in_table = True
table = Table(element, doc)
for row in table.rows:
row_text = []
for cell in row.cells:
cell_text = cell.text
# Escape special characters to preserve tab-delimited structure
row_text.append(escape_cell(cell_text))
# Only add row if at least one cell has content
if any(cell for cell in row_text):
content_parts.append("\t".join(row_text))
return "\n".join(content_parts)
def _extract_pptx(file_bytes: bytes) -> str:
@@ -1013,27 +1081,112 @@ def _extract_pptx(file_bytes: bytes) -> str:
def _extract_xlsx(file_bytes: bytes) -> str:
"""Extract XLSX content (synchronous).
"""Extract XLSX content in tab-delimited format with clear sheet separation.
This function processes Excel workbooks and converts them to a structured text format
suitable for LLM prompts and RAG systems. Each sheet is clearly delimited with
separator lines, and special characters are escaped to preserve the tab-delimited structure.
Features:
- Each sheet is wrapped with '====================' separators for visual distinction
- Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption
- Column alignment is preserved across all rows to maintain tabular structure
- Empty rows are preserved as blank lines to maintain row structure
- Uses sheet.max_column to determine the column count efficiently
Args:
file_bytes: XLSX file content as bytes
Returns:
str: Extracted text content
str: Extracted text content with all sheets in tab-delimited format.
Format: Sheet separators, sheet name, then tab-delimited rows.
Example output:
==================== Sheet: Data ====================
Name\tAge\tCity
Alice\t30\tNew York
Bob\t25\tLondon
==================== Sheet: Summary ====================
Total\t2
====================
"""
from openpyxl import load_workbook # type: ignore
xlsx_file = BytesIO(file_bytes)
wb = load_workbook(xlsx_file)
content = ""
for sheet in wb:
content += f"Sheet: {sheet.title}\n"
def escape_cell(cell_value: str | int | float | None) -> str:
"""Escape characters that would break tab-delimited layout.
Escape order is critical: backslashes first, then tabs/newlines.
This prevents double-escaping issues.
Args:
cell_value: The cell value to escape (can be None, str, int, or float)
Returns:
str: Escaped cell value safe for tab-delimited format
"""
if cell_value is None:
return ""
text = str(cell_value)
# CRITICAL: Escape backslash first to avoid double-escaping
return (
text.replace("\\", "\\\\") # Must be first: \ -> \\
.replace("\t", "\\t") # Tab -> \t (visible)
.replace("\r\n", "\\n") # Windows newline -> \n
.replace("\r", "\\n") # Mac newline -> \n
.replace("\n", "\\n") # Unix newline -> \n
)
def escape_sheet_title(title: str) -> str:
"""Escape sheet title to prevent formatting issues in separators.
Args:
title: Original sheet title
Returns:
str: Sanitized sheet title with tabs/newlines replaced
"""
return str(title).replace("\n", " ").replace("\t", " ").replace("\r", " ")
content_parts: list[str] = []
sheet_separator = "=" * 20
for idx, sheet in enumerate(wb):
if idx > 0:
content_parts.append("") # Blank line between sheets for readability
# Escape sheet title to handle edge cases with special characters
safe_title = escape_sheet_title(sheet.title)
content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}")
# Use sheet.max_column to get the number of columns directly
max_columns = sheet.max_column if sheet.max_column else 0
# Extract rows padded to a consistent column count to preserve alignment
for row in sheet.iter_rows(values_only=True):
content += (
"\t".join(str(cell) if cell is not None else "" for cell in row) + "\n"
)
content += "\n"
return content
row_parts = []
# Pad each row out to max_columns entries
for idx in range(max_columns):
if idx < len(row):
row_parts.append(escape_cell(row[idx]))
else:
row_parts.append("") # Pad short rows
# Check if row is completely empty
if all(part == "" for part in row_parts):
# Preserve empty rows as blank lines (maintains row structure)
content_parts.append("")
else:
# Join all columns to maintain consistent column count
content_parts.append("\t".join(row_parts))
# Final separator for symmetry (makes parsing easier)
content_parts.append(sheet_separator)
return "\n".join(content_parts)
async def pipeline_enqueue_file(
@@ -1949,12 +2102,14 @@ def create_document_routes(
# Check if filename already exists in doc_status storage
existing_doc_data = await rag.doc_status.get_doc_by_file_path(safe_filename)
if existing_doc_data:
# Get document status information for error message
# Get document status and track_id from existing document
status = existing_doc_data.get("status", "unknown")
# Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id)
existing_track_id = existing_doc_data.get("track_id") or ""
return InsertResponse(
status="duplicated",
message=f"File '{safe_filename}' already exists in document storage (Status: {status}).",
track_id="",
track_id=existing_track_id,
)
file_path = doc_manager.input_dir / safe_filename
@@ -2018,14 +2173,30 @@ def create_document_routes(
request.file_source
)
if existing_doc_data:
# Get document status information for error message
# Get document status and track_id from existing document
status = existing_doc_data.get("status", "unknown")
# Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id)
existing_track_id = existing_doc_data.get("track_id") or ""
return InsertResponse(
status="duplicated",
message=f"File source '{request.file_source}' already exists in document storage (Status: {status}).",
track_id="",
track_id=existing_track_id,
)
# Check if content already exists by computing content hash (doc_id)
sanitized_text = sanitize_text_for_encoding(request.text)
content_doc_id = compute_mdhash_id(sanitized_text, prefix="doc-")
existing_doc = await rag.doc_status.get_by_id(content_doc_id)
if existing_doc:
# Content already exists, return duplicated with existing track_id
status = existing_doc.get("status", "unknown")
existing_track_id = existing_doc.get("track_id") or ""
return InsertResponse(
status="duplicated",
message=f"Identical content already exists in document storage (doc_id: {content_doc_id}, Status: {status}).",
track_id=existing_track_id,
)
# Generate track_id for text insertion
track_id = generate_track_id("insert")
@@ -2084,14 +2255,31 @@ def create_document_routes(
file_source
)
if existing_doc_data:
# Get document status information for error message
# Get document status and track_id from existing document
status = existing_doc_data.get("status", "unknown")
# Use `or ""` to handle both missing key and None value (e.g., legacy rows without track_id)
existing_track_id = existing_doc_data.get("track_id") or ""
return InsertResponse(
status="duplicated",
message=f"File source '{file_source}' already exists in document storage (Status: {status}).",
track_id="",
track_id=existing_track_id,
)
# Check if any content already exists by computing content hash (doc_id)
for text in request.texts:
sanitized_text = sanitize_text_for_encoding(text)
content_doc_id = compute_mdhash_id(sanitized_text, prefix="doc-")
existing_doc = await rag.doc_status.get_by_id(content_doc_id)
if existing_doc:
# Content already exists, return duplicated with existing track_id
status = existing_doc.get("status", "unknown")
existing_track_id = existing_doc.get("track_id") or ""
return InsertResponse(
status="duplicated",
message=f"Identical content already exists in document storage (doc_id: {content_doc_id}, Status: {status}).",
track_id=existing_track_id,
)
# Generate track_id for texts insertion
track_id = generate_track_id("insert")
@@ -2910,29 +3098,27 @@ def create_document_routes(
This is useful for recovering from server crashes, network errors, LLM service
outages, or other temporary failures that caused document processing to fail.
The processing happens in the background and can be monitored using the
returned track_id or by checking the pipeline status.
The processing happens in the background and can be monitored by checking the
pipeline status. Reprocessed documents retain their original track_id from the
initial upload, which can be used to monitor their progress.
Returns:
ReprocessResponse: Response with status, message, and track_id
ReprocessResponse: Response with status and message.
track_id is always an empty string because reprocessed documents retain
their original track_id from the initial upload.
Raises:
HTTPException: If an error occurs while initiating reprocessing (500).
"""
try:
# Generate track_id with "retry" prefix for retry operation
track_id = generate_track_id("retry")
# Start the reprocessing in the background
# Note: Reprocessed documents retain their original track_id from initial upload
background_tasks.add_task(rag.apipeline_process_enqueue_documents)
logger.info(
f"Reprocessing of failed documents initiated with track_id: {track_id}"
)
logger.info("Reprocessing of failed documents initiated")
return ReprocessResponse(
status="reprocessing_started",
message="Reprocessing of failed documents has been initiated in background",
track_id=track_id,
message="Reprocessing of failed documents has been initiated in background. Documents retain their original track_id.",
)
except Exception as e:

View File

@@ -8,7 +8,6 @@ import re
from enum import Enum
from fastapi.responses import StreamingResponse
import asyncio
from ascii_colors import trace_exception
from lightrag import LightRAG, QueryParam
from lightrag.utils import TiktokenTokenizer
from lightrag.api.utils_api import get_combined_auth_dependency
@@ -309,118 +308,113 @@ class OllamaAPI:
)
async def stream_generator():
try:
first_chunk_time = None
first_chunk_time = None
last_chunk_time = time.time_ns()
total_response = ""
# Ensure response is an async generator
if isinstance(response, str):
# If it's a string, send in two parts
first_chunk_time = start_time
last_chunk_time = time.time_ns()
total_response = ""
total_response = response
# Ensure response is an async generator
if isinstance(response, str):
# If it's a string, send in two parts
first_chunk_time = start_time
last_chunk_time = time.time_ns()
total_response = response
data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"response": response,
"done": False,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
data = {
completion_tokens = estimate_tokens(total_response)
total_time = last_chunk_time - start_time
prompt_eval_time = first_chunk_time - start_time
eval_time = last_chunk_time - first_chunk_time
data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"response": "",
"done": True,
"done_reason": "stop",
"context": [],
"total_duration": total_time,
"load_duration": 0,
"prompt_eval_count": prompt_tokens,
"prompt_eval_duration": prompt_eval_time,
"eval_count": completion_tokens,
"eval_duration": eval_time,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
else:
try:
async for chunk in response:
if chunk:
if first_chunk_time is None:
first_chunk_time = time.time_ns()
last_chunk_time = time.time_ns()
total_response += chunk
data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"response": chunk,
"done": False,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
except (asyncio.CancelledError, Exception) as e:
error_msg = str(e)
if isinstance(e, asyncio.CancelledError):
error_msg = "Stream was cancelled by server"
else:
error_msg = f"Provider error: {error_msg}"
logger.error(f"Stream error: {error_msg}")
# Send error message to client
error_data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"response": response,
"response": f"\n\nError: {error_msg}",
"error": f"\n\nError: {error_msg}",
"done": False,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
yield f"{json.dumps(error_data, ensure_ascii=False)}\n"
completion_tokens = estimate_tokens(total_response)
total_time = last_chunk_time - start_time
prompt_eval_time = first_chunk_time - start_time
eval_time = last_chunk_time - first_chunk_time
data = {
# Send final message to close the stream
final_data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"response": "",
"done": True,
"done_reason": "stop",
"context": [],
"total_duration": total_time,
"load_duration": 0,
"prompt_eval_count": prompt_tokens,
"prompt_eval_duration": prompt_eval_time,
"eval_count": completion_tokens,
"eval_duration": eval_time,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
else:
try:
async for chunk in response:
if chunk:
if first_chunk_time is None:
first_chunk_time = time.time_ns()
last_chunk_time = time.time_ns()
total_response += chunk
data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"response": chunk,
"done": False,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
except (asyncio.CancelledError, Exception) as e:
error_msg = str(e)
if isinstance(e, asyncio.CancelledError):
error_msg = "Stream was cancelled by server"
else:
error_msg = f"Provider error: {error_msg}"
logger.error(f"Stream error: {error_msg}")
# Send error message to client
error_data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"response": f"\n\nError: {error_msg}",
"error": f"\n\nError: {error_msg}",
"done": False,
}
yield f"{json.dumps(error_data, ensure_ascii=False)}\n"
# Send final message to close the stream
final_data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"response": "",
"done": True,
}
yield f"{json.dumps(final_data, ensure_ascii=False)}\n"
return
if first_chunk_time is None:
first_chunk_time = start_time
completion_tokens = estimate_tokens(total_response)
total_time = last_chunk_time - start_time
prompt_eval_time = first_chunk_time - start_time
eval_time = last_chunk_time - first_chunk_time
data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"response": "",
"done": True,
"done_reason": "stop",
"context": [],
"total_duration": total_time,
"load_duration": 0,
"prompt_eval_count": prompt_tokens,
"prompt_eval_duration": prompt_eval_time,
"eval_count": completion_tokens,
"eval_duration": eval_time,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
yield f"{json.dumps(final_data, ensure_ascii=False)}\n"
return
if first_chunk_time is None:
first_chunk_time = start_time
completion_tokens = estimate_tokens(total_response)
total_time = last_chunk_time - start_time
prompt_eval_time = first_chunk_time - start_time
eval_time = last_chunk_time - first_chunk_time
except Exception as e:
trace_exception(e)
raise
data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"response": "",
"done": True,
"done_reason": "stop",
"context": [],
"total_duration": total_time,
"load_duration": 0,
"prompt_eval_count": prompt_tokens,
"prompt_eval_duration": prompt_eval_time,
"eval_count": completion_tokens,
"eval_duration": eval_time,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
return
return StreamingResponse(
stream_generator(),
@@ -462,7 +456,7 @@ class OllamaAPI:
"eval_duration": eval_time,
}
except Exception as e:
trace_exception(e)
logger.error(f"Ollama generate error: {str(e)}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@self.router.post(
@@ -535,36 +529,98 @@ class OllamaAPI:
)
async def stream_generator():
try:
first_chunk_time = None
first_chunk_time = None
last_chunk_time = time.time_ns()
total_response = ""
# Ensure response is an async generator
if isinstance(response, str):
# If it's a string, send in two parts
first_chunk_time = start_time
last_chunk_time = time.time_ns()
total_response = ""
total_response = response
# Ensure response is an async generator
if isinstance(response, str):
# If it's a string, send in two parts
first_chunk_time = start_time
last_chunk_time = time.time_ns()
total_response = response
data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"message": {
"role": "assistant",
"content": response,
"images": None,
},
"done": False,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
data = {
completion_tokens = estimate_tokens(total_response)
total_time = last_chunk_time - start_time
prompt_eval_time = first_chunk_time - start_time
eval_time = last_chunk_time - first_chunk_time
data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"message": {
"role": "assistant",
"content": "",
"images": None,
},
"done_reason": "stop",
"done": True,
"total_duration": total_time,
"load_duration": 0,
"prompt_eval_count": prompt_tokens,
"prompt_eval_duration": prompt_eval_time,
"eval_count": completion_tokens,
"eval_duration": eval_time,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
else:
try:
async for chunk in response:
if chunk:
if first_chunk_time is None:
first_chunk_time = time.time_ns()
last_chunk_time = time.time_ns()
total_response += chunk
data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"message": {
"role": "assistant",
"content": chunk,
"images": None,
},
"done": False,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
except (asyncio.CancelledError, Exception) as e:
error_msg = str(e)
if isinstance(e, asyncio.CancelledError):
error_msg = "Stream was cancelled by server"
else:
error_msg = f"Provider error: {error_msg}"
logger.error(f"Stream error: {error_msg}")
# Send error message to client
error_data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"message": {
"role": "assistant",
"content": response,
"content": f"\n\nError: {error_msg}",
"images": None,
},
"error": f"\n\nError: {error_msg}",
"done": False,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
yield f"{json.dumps(error_data, ensure_ascii=False)}\n"
completion_tokens = estimate_tokens(total_response)
total_time = last_chunk_time - start_time
prompt_eval_time = first_chunk_time - start_time
eval_time = last_chunk_time - first_chunk_time
data = {
# Send final message to close the stream
final_data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"message": {
@@ -572,103 +628,36 @@ class OllamaAPI:
"content": "",
"images": None,
},
"done_reason": "stop",
"done": True,
"total_duration": total_time,
"load_duration": 0,
"prompt_eval_count": prompt_tokens,
"prompt_eval_duration": prompt_eval_time,
"eval_count": completion_tokens,
"eval_duration": eval_time,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
else:
try:
async for chunk in response:
if chunk:
if first_chunk_time is None:
first_chunk_time = time.time_ns()
yield f"{json.dumps(final_data, ensure_ascii=False)}\n"
return
last_chunk_time = time.time_ns()
if first_chunk_time is None:
first_chunk_time = start_time
completion_tokens = estimate_tokens(total_response)
total_time = last_chunk_time - start_time
prompt_eval_time = first_chunk_time - start_time
eval_time = last_chunk_time - first_chunk_time
total_response += chunk
data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"message": {
"role": "assistant",
"content": chunk,
"images": None,
},
"done": False,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
except (asyncio.CancelledError, Exception) as e:
error_msg = str(e)
if isinstance(e, asyncio.CancelledError):
error_msg = "Stream was cancelled by server"
else:
error_msg = f"Provider error: {error_msg}"
logger.error(f"Stream error: {error_msg}")
# Send error message to client
error_data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"message": {
"role": "assistant",
"content": f"\n\nError: {error_msg}",
"images": None,
},
"error": f"\n\nError: {error_msg}",
"done": False,
}
yield f"{json.dumps(error_data, ensure_ascii=False)}\n"
# Send final message to close the stream
final_data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"message": {
"role": "assistant",
"content": "",
"images": None,
},
"done": True,
}
yield f"{json.dumps(final_data, ensure_ascii=False)}\n"
return
if first_chunk_time is None:
first_chunk_time = start_time
completion_tokens = estimate_tokens(total_response)
total_time = last_chunk_time - start_time
prompt_eval_time = first_chunk_time - start_time
eval_time = last_chunk_time - first_chunk_time
data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"message": {
"role": "assistant",
"content": "",
"images": None,
},
"done_reason": "stop",
"done": True,
"total_duration": total_time,
"load_duration": 0,
"prompt_eval_count": prompt_tokens,
"prompt_eval_duration": prompt_eval_time,
"eval_count": completion_tokens,
"eval_duration": eval_time,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
except Exception as e:
trace_exception(e)
raise
data = {
"model": self.ollama_server_infos.LIGHTRAG_MODEL,
"created_at": self.ollama_server_infos.LIGHTRAG_CREATED_AT,
"message": {
"role": "assistant",
"content": "",
"images": None,
},
"done_reason": "stop",
"done": True,
"total_duration": total_time,
"load_duration": 0,
"prompt_eval_count": prompt_tokens,
"prompt_eval_duration": prompt_eval_time,
"eval_count": completion_tokens,
"eval_duration": eval_time,
}
yield f"{json.dumps(data, ensure_ascii=False)}\n"
return StreamingResponse(
stream_generator(),
@@ -730,5 +719,5 @@ class OllamaAPI:
"eval_duration": eval_time,
}
except Exception as e:
trace_exception(e)
logger.error(f"Ollama chat error: {str(e)}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))

View File

@@ -3,16 +3,13 @@ This module contains all query-related routes for the LightRAG API.
"""
import json
import logging
from typing import Any, Dict, List, Literal, Optional
from fastapi import APIRouter, Depends, HTTPException
from lightrag.base import QueryParam
from lightrag.api.utils_api import get_combined_auth_dependency
from lightrag.utils import logger
from pydantic import BaseModel, Field, field_validator
from ascii_colors import trace_exception
router = APIRouter(tags=["query"])
@@ -453,7 +450,7 @@ def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
else:
return QueryResponse(response=response_content, references=None)
except Exception as e:
trace_exception(e)
logger.error(f"Error processing query: {str(e)}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@router.post(
@@ -713,7 +710,7 @@ def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
if chunk: # Only send non-empty content
yield f"{json.dumps({'response': chunk})}\n"
except Exception as e:
logging.error(f"Streaming error: {str(e)}")
logger.error(f"Streaming error: {str(e)}")
yield f"{json.dumps({'error': str(e)})}\n"
else:
# Non-streaming mode: send complete response in one message
@@ -739,7 +736,7 @@ def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
},
)
except Exception as e:
trace_exception(e)
logger.error(f"Error processing streaming query: {str(e)}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@router.post(
@@ -1156,7 +1153,7 @@ def create_query_routes(rag, api_key: Optional[str] = None, top_k: int = 60):
data={},
)
except Exception as e:
trace_exception(e)
logger.error(f"Error processing data query: {str(e)}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
return router

View File

@@ -100,7 +100,7 @@ def main():
print("\nHow to fix:")
print(" Option 1 - Set environment variable before starting (recommended):")
print(" export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES")
print(" lightrag-server")
print(" lightrag-gunicorn --workers 2")
print("\n Option 2 - Add to your shell profile (~/.zshrc or ~/.bash_profile):")
print(" echo 'export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES' >> ~/.zshrc")
print(" source ~/.zshrc")

View File

@@ -106,6 +106,28 @@ class PipelineCancelledException(Exception):
self.message = message
class ChunkTokenLimitExceededError(ValueError):
"""Raised when a chunk exceeds the configured token limit."""
def __init__(
self,
chunk_tokens: int,
chunk_token_limit: int,
chunk_preview: str | None = None,
) -> None:
preview = chunk_preview.strip() if chunk_preview else None
truncated_preview = preview[:80] if preview else None
preview_note = f" Preview: '{truncated_preview}'" if truncated_preview else ""
message = (
f"Chunk token length {chunk_tokens} exceeds chunk_token_size {chunk_token_limit}."
f"{preview_note}"
)
super().__init__(message)
self.chunk_tokens = chunk_tokens
self.chunk_token_limit = chunk_token_limit
self.chunk_preview = truncated_preview
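# Sketch of how a caller might surface the structured fields (check_chunk_size and chunk_text are hypothetical):
#   try:
#       check_chunk_size(chunk_text)  # hypothetical validation that raises ChunkTokenLimitExceededError
#   except ChunkTokenLimitExceededError as exc:
#       print(exc.chunk_tokens, exc.chunk_token_limit, exc.chunk_preview)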
class QdrantMigrationError(Exception):
"""Raised when Qdrant data migration from legacy collections fails."""

View File

@@ -44,6 +44,23 @@ config.read("config.ini", "utf-8")
logging.getLogger("neo4j").setLevel(logging.ERROR)
READ_RETRY_EXCEPTIONS = (
neo4jExceptions.ServiceUnavailable,
neo4jExceptions.TransientError,
neo4jExceptions.SessionExpired,
ConnectionResetError,
OSError,
AttributeError,
)
READ_RETRY = retry(
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=4, max=10),
retry=retry_if_exception_type(READ_RETRY_EXCEPTIONS),
reraise=True,
)
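# Net effect of READ_RETRY: up to 3 attempts with exponential backoff (4-10 second waits),
# retried only for the transient driver/network errors listed above, re-raising the last failure.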
@final
@dataclass
class Neo4JStorage(BaseGraphStorage):
@@ -352,6 +369,7 @@ class Neo4JStorage(BaseGraphStorage):
# Neo4J handles persistence automatically
pass
@READ_RETRY
async def has_node(self, node_id: str) -> bool:
"""
Check if a node with the given label exists in the database
@@ -385,6 +403,7 @@ class Neo4JStorage(BaseGraphStorage):
await result.consume() # Ensure results are consumed even on error
raise
@READ_RETRY
async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
"""
Check if an edge exists between two nodes
@@ -426,6 +445,7 @@ class Neo4JStorage(BaseGraphStorage):
await result.consume() # Ensure results are consumed even on error
raise
@READ_RETRY
async def get_node(self, node_id: str) -> dict[str, str] | None:
"""Get node by its label identifier, return only node properties
@@ -479,6 +499,7 @@ class Neo4JStorage(BaseGraphStorage):
)
raise
@READ_RETRY
async def get_nodes_batch(self, node_ids: list[str]) -> dict[str, dict]:
"""
Retrieve multiple nodes in one query using UNWIND.
@@ -515,6 +536,7 @@ class Neo4JStorage(BaseGraphStorage):
await result.consume() # Make sure to consume the result fully
return nodes
@READ_RETRY
async def node_degree(self, node_id: str) -> int:
"""Get the degree (number of relationships) of a node with the given label.
If multiple nodes have the same label, returns the degree of the first node.
@@ -563,6 +585,7 @@ class Neo4JStorage(BaseGraphStorage):
)
raise
@READ_RETRY
async def node_degrees_batch(self, node_ids: list[str]) -> dict[str, int]:
"""
Retrieve the degree for multiple nodes in a single query using UNWIND.
@@ -621,6 +644,7 @@ class Neo4JStorage(BaseGraphStorage):
degrees = int(src_degree) + int(trg_degree)
return degrees
@READ_RETRY
async def edge_degrees_batch(
self, edge_pairs: list[tuple[str, str]]
) -> dict[tuple[str, str], int]:
@@ -647,6 +671,7 @@ class Neo4JStorage(BaseGraphStorage):
edge_degrees[(src, tgt)] = degrees.get(src, 0) + degrees.get(tgt, 0)
return edge_degrees
@READ_RETRY
async def get_edge(
self, source_node_id: str, target_node_id: str
) -> dict[str, str] | None:
@@ -734,6 +759,7 @@ class Neo4JStorage(BaseGraphStorage):
)
raise
@READ_RETRY
async def get_edges_batch(
self, pairs: list[dict[str, str]]
) -> dict[tuple[str, str], dict]:
@@ -784,6 +810,7 @@ class Neo4JStorage(BaseGraphStorage):
await result.consume()
return edges_dict
@READ_RETRY
async def get_node_edges(self, source_node_id: str) -> list[tuple[str, str]] | None:
"""Retrieves all edges (relationships) for a particular node identified by its label.
@@ -851,6 +878,7 @@ class Neo4JStorage(BaseGraphStorage):
)
raise
@READ_RETRY
async def get_nodes_edges_batch(
self, node_ids: list[str]
) -> dict[str, list[tuple[str, str]]]:

View File

@@ -383,7 +383,7 @@ class PostgreSQLDB:
async def configure_age_extension(connection: asyncpg.Connection) -> None:
"""Create AGE extension if it doesn't exist for graph operations."""
try:
await connection.execute("CREATE EXTENSION IF NOT EXISTS age") # type: ignore
await connection.execute("CREATE EXTENSION IF NOT EXISTS AGE CASCADE") # type: ignore
logger.info("PostgreSQL, AGE extension enabled")
except Exception as e:
logger.warning(f"Could not create AGE extension: {e}")

View File

@@ -1700,3 +1700,17 @@ def get_default_workspace() -> str:
"""
global _default_workspace
return _default_workspace
def get_pipeline_status_lock(
enable_logging: bool = False, workspace: str = None
) -> NamespaceLock:
"""Return unified storage lock for pipeline status data consistency.
This function is for compatibility with legacy code only.
"""
global _default_workspace
actual_workspace = workspace if workspace else _default_workspace
return get_namespace_lock(
"pipeline_status", workspace=actual_workspace, enable_logging=enable_logging
)

View File

@@ -1,177 +1,22 @@
from collections.abc import Iterable
import os
import pipmaster as pm # Pipmaster for dynamic library install
"""
Azure OpenAI compatibility layer.
# install specific modules
if not pm.is_installed("openai"):
pm.install("openai")
This module provides backward compatibility by re-exporting Azure OpenAI functions
from the main openai module where the actual implementation resides.
from openai import (
AsyncAzureOpenAI,
APIConnectionError,
RateLimitError,
APITimeoutError,
)
from openai.types.chat import ChatCompletionMessageParam
All core logic for both OpenAI and Azure OpenAI now lives in lightrag.llm.openai,
with this module serving as a thin compatibility wrapper for existing code that
imports from lightrag.llm.azure_openai.
"""
from tenacity import (
retry,
stop_after_attempt,
wait_exponential,
retry_if_exception_type,
from lightrag.llm.openai import (
azure_openai_complete_if_cache,
azure_openai_complete,
azure_openai_embed,
)
from lightrag.utils import (
wrap_embedding_func_with_attrs,
safe_unicode_decode,
logger,
)
import numpy as np
@retry(
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=4, max=10),
retry=retry_if_exception_type(
(RateLimitError, APIConnectionError, APIConnectionError)
),
)
async def azure_openai_complete_if_cache(
model,
prompt,
system_prompt: str | None = None,
history_messages: Iterable[ChatCompletionMessageParam] | None = None,
enable_cot: bool = False,
base_url: str | None = None,
api_key: str | None = None,
api_version: str | None = None,
**kwargs,
):
if enable_cot:
logger.debug(
"enable_cot=True is not supported for the Azure OpenAI API and will be ignored."
)
deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT") or model or os.getenv("LLM_MODEL")
base_url = (
base_url or os.getenv("AZURE_OPENAI_ENDPOINT") or os.getenv("LLM_BINDING_HOST")
)
api_key = (
api_key or os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("LLM_BINDING_API_KEY")
)
api_version = (
api_version
or os.getenv("AZURE_OPENAI_API_VERSION")
or os.getenv("OPENAI_API_VERSION")
)
kwargs.pop("hashing_kv", None)
kwargs.pop("keyword_extraction", None)
timeout = kwargs.pop("timeout", None)
openai_async_client = AsyncAzureOpenAI(
azure_endpoint=base_url,
azure_deployment=deployment,
api_key=api_key,
api_version=api_version,
timeout=timeout,
)
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
if history_messages:
messages.extend(history_messages)
if prompt is not None:
messages.append({"role": "user", "content": prompt})
if "response_format" in kwargs:
response = await openai_async_client.beta.chat.completions.parse(
model=model, messages=messages, **kwargs
)
else:
response = await openai_async_client.chat.completions.create(
model=model, messages=messages, **kwargs
)
if hasattr(response, "__aiter__"):
async def inner():
async for chunk in response:
if len(chunk.choices) == 0:
continue
content = chunk.choices[0].delta.content
if content is None:
continue
if r"\u" in content:
content = safe_unicode_decode(content.encode("utf-8"))
yield content
return inner()
else:
content = response.choices[0].message.content
if r"\u" in content:
content = safe_unicode_decode(content.encode("utf-8"))
return content
async def azure_openai_complete(
prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
) -> str:
kwargs.pop("keyword_extraction", None)
result = await azure_openai_complete_if_cache(
os.getenv("LLM_MODEL", "gpt-4o-mini"),
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
**kwargs,
)
return result
@wrap_embedding_func_with_attrs(embedding_dim=1536)
@retry(
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=4, max=10),
retry=retry_if_exception_type(
(RateLimitError, APIConnectionError, APITimeoutError)
),
)
async def azure_openai_embed(
texts: list[str],
model: str | None = None,
base_url: str | None = None,
api_key: str | None = None,
api_version: str | None = None,
) -> np.ndarray:
deployment = (
os.getenv("AZURE_EMBEDDING_DEPLOYMENT")
or model
or os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
)
base_url = (
base_url
or os.getenv("AZURE_EMBEDDING_ENDPOINT")
or os.getenv("EMBEDDING_BINDING_HOST")
)
api_key = (
api_key
or os.getenv("AZURE_EMBEDDING_API_KEY")
or os.getenv("EMBEDDING_BINDING_API_KEY")
)
api_version = (
api_version
or os.getenv("AZURE_EMBEDDING_API_VERSION")
or os.getenv("OPENAI_API_VERSION")
)
openai_async_client = AsyncAzureOpenAI(
azure_endpoint=base_url,
azure_deployment=deployment,
api_key=api_key,
api_version=api_version,
)
response = await openai_async_client.embeddings.create(
model=model, input=texts, encoding_format="float"
)
return np.array([dp.embedding for dp in response.data])
__all__ = [
"azure_openai_complete_if_cache",
"azure_openai_complete",
"azure_openai_embed",
]

View File

@@ -69,6 +69,7 @@ async def fetch_data(url, headers, data):
)
async def jina_embed(
texts: list[str],
model: str = "jina-embeddings-v4",
embedding_dim: int = 2048,
late_chunking: bool = False,
base_url: str = None,
@@ -78,6 +79,8 @@ async def jina_embed(
Args:
texts: List of texts to embed.
model: The Jina embedding model to use (default: jina-embeddings-v4).
Supported models: jina-embeddings-v3, jina-embeddings-v4, etc.
embedding_dim: The embedding dimensions (default: 2048 for jina-embeddings-v4).
**IMPORTANT**: This parameter is automatically injected by the EmbeddingFunc wrapper.
Do NOT manually pass this parameter when calling the function directly.
@@ -107,7 +110,7 @@ async def jina_embed(
"Authorization": f"Bearer {os.environ['JINA_API_KEY']}",
}
data = {
"model": "jina-embeddings-v4",
"model": model,
"task": "text-matching",
"dimensions": embedding_dim,
"embedding_type": "base64",

View File

@@ -173,7 +173,9 @@ async def ollama_model_complete(
@wrap_embedding_func_with_attrs(embedding_dim=1024, max_token_size=8192)
async def ollama_embed(texts: list[str], embed_model, **kwargs) -> np.ndarray:
async def ollama_embed(
texts: list[str], embed_model: str = "bge-m3:latest", **kwargs
) -> np.ndarray:
api_key = kwargs.pop("api_key", None)
if not api_key:
api_key = os.getenv("OLLAMA_API_KEY")

View File

@@ -77,46 +77,86 @@ class InvalidResponseError(Exception):
def create_openai_async_client(
api_key: str | None = None,
base_url: str | None = None,
use_azure: bool = False,
azure_deployment: str | None = None,
api_version: str | None = None,
timeout: int | None = None,
client_configs: dict[str, Any] | None = None,
) -> AsyncOpenAI:
"""Create an AsyncOpenAI client with the given configuration.
"""Create an AsyncOpenAI or AsyncAzureOpenAI client with the given configuration.
Args:
api_key: OpenAI API key. If None, uses the OPENAI_API_KEY environment variable.
base_url: Base URL for the OpenAI API. If None, uses the default OpenAI API URL.
use_azure: Whether to create an Azure OpenAI client. Default is False.
azure_deployment: Azure OpenAI deployment name (only used when use_azure=True).
api_version: Azure OpenAI API version (only used when use_azure=True).
timeout: Request timeout in seconds.
client_configs: Additional configuration options for the AsyncOpenAI client.
These will override any default configurations but will be overridden by
explicit parameters (api_key, base_url).
Returns:
An AsyncOpenAI client instance.
An AsyncOpenAI or AsyncAzureOpenAI client instance.
"""
if not api_key:
api_key = os.environ["OPENAI_API_KEY"]
if use_azure:
from openai import AsyncAzureOpenAI
default_headers = {
"User-Agent": f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_8) LightRAG/{__api_version__}",
"Content-Type": "application/json",
}
if not api_key:
api_key = os.environ.get("AZURE_OPENAI_API_KEY") or os.environ.get(
"LLM_BINDING_API_KEY"
)
if client_configs is None:
client_configs = {}
if client_configs is None:
client_configs = {}
# Create a merged config dict with precedence: explicit params > client_configs > defaults
merged_configs = {
**client_configs,
"default_headers": default_headers,
"api_key": api_key,
}
# Create a merged config dict with precedence: explicit params > client_configs
merged_configs = {
**client_configs,
"api_key": api_key,
}
if base_url is not None:
merged_configs["base_url"] = base_url
# Add explicit parameters (override client_configs)
if base_url is not None:
merged_configs["azure_endpoint"] = base_url
if azure_deployment is not None:
merged_configs["azure_deployment"] = azure_deployment
if api_version is not None:
merged_configs["api_version"] = api_version
if timeout is not None:
merged_configs["timeout"] = timeout
return AsyncAzureOpenAI(**merged_configs)
else:
merged_configs["base_url"] = os.environ.get(
"OPENAI_API_BASE", "https://api.openai.com/v1"
)
if not api_key:
api_key = os.environ["OPENAI_API_KEY"]
return AsyncOpenAI(**merged_configs)
default_headers = {
"User-Agent": f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_8) LightRAG/{__api_version__}",
"Content-Type": "application/json",
}
if client_configs is None:
client_configs = {}
# Create a merged config dict with precedence: explicit params > client_configs > defaults
merged_configs = {
**client_configs,
"default_headers": default_headers,
"api_key": api_key,
}
if base_url is not None:
merged_configs["base_url"] = base_url
else:
merged_configs["base_url"] = os.environ.get(
"OPENAI_API_BASE", "https://api.openai.com/v1"
)
if timeout is not None:
merged_configs["timeout"] = timeout
return AsyncOpenAI(**merged_configs)
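# Illustrative Azure call based on the signature above (endpoint, deployment, and version are made up):
#   client = create_openai_async_client(
#       base_url="https://my-resource.openai.azure.com/",
#       use_azure=True,
#       azure_deployment="my-gpt-4o-mini",
#       api_version="2024-08-01-preview",
#   )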
@retry(
@@ -141,6 +181,9 @@ async def openai_complete_if_cache(
stream: bool | None = None,
timeout: int | None = None,
keyword_extraction: bool = False,
use_azure: bool = False,
azure_deployment: str | None = None,
api_version: str | None = None,
**kwargs: Any,
) -> str:
"""Complete a prompt using OpenAI's API with caching support and Chain of Thought (COT) integration.
@@ -162,23 +205,33 @@ async def openai_complete_if_cache(
6. For non-streaming: COT content is prepended to regular content with <think> tags.
Args:
model: The OpenAI model to use.
model: The OpenAI model to use. For Azure, this can be the deployment name.
prompt: The prompt to complete.
system_prompt: Optional system prompt to include.
history_messages: Optional list of previous messages in the conversation.
base_url: Optional base URL for the OpenAI API.
api_key: Optional OpenAI API key. If None, uses the OPENAI_API_KEY environment variable.
token_tracker: Optional token usage tracker for monitoring API usage.
enable_cot: Whether to enable Chain of Thought (COT) processing. Default is False.
base_url: Optional base URL for the OpenAI API. For Azure, this should be the
Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com/).
api_key: Optional API key. For standard OpenAI, uses OPENAI_API_KEY environment
variable if None. For Azure, uses AZURE_OPENAI_API_KEY if None.
token_tracker: Optional token usage tracker for monitoring API usage.
stream: Whether to stream the response. Default is False.
timeout: Request timeout in seconds. Default is None.
keyword_extraction: Whether to enable keyword extraction mode. When True, triggers
special response formatting for keyword extraction. Default is False.
use_azure: Whether to use Azure OpenAI service instead of standard OpenAI.
When True, creates an AsyncAzureOpenAI client. Default is False.
azure_deployment: Azure OpenAI deployment name. Only used when use_azure=True.
If not specified, falls back to AZURE_OPENAI_DEPLOYMENT environment variable.
api_version: Azure OpenAI API version (e.g., "2024-02-15-preview"). Only used
when use_azure=True. If not specified, falls back to AZURE_OPENAI_API_VERSION
environment variable.
**kwargs: Additional keyword arguments to pass to the OpenAI API.
Special kwargs:
- openai_client_configs: Dict of configuration options for the AsyncOpenAI client.
These will be passed to the client constructor but will be overridden by
explicit parameters (api_key, base_url).
explicit parameters (api_key, base_url). Supports proxy configuration,
custom headers, retry policies, etc.
Returns:
The completed text (with integrated COT content if available) or an async iterator
@@ -203,10 +256,18 @@ async def openai_complete_if_cache(
# Extract client configuration options
client_configs = kwargs.pop("openai_client_configs", {})
# Create the OpenAI client
# Handle keyword extraction mode
if keyword_extraction:
kwargs["response_format"] = GPTKeywordExtractionFormat
# Create the OpenAI client (supports both OpenAI and Azure)
openai_async_client = create_openai_async_client(
api_key=api_key,
base_url=base_url,
use_azure=use_azure,
azure_deployment=azure_deployment,
api_version=api_version,
timeout=timeout,
client_configs=client_configs,
)
@@ -234,16 +295,24 @@ async def openai_complete_if_cache(
if timeout is not None:
kwargs["timeout"] = timeout
# Determine the correct model identifier to use
# For Azure OpenAI, we must use the deployment name instead of the model name
api_model = azure_deployment if use_azure and azure_deployment else model
try:
# Don't use async with context manager, use client directly
if "response_format" in kwargs:
response = await openai_async_client.beta.chat.completions.parse(
model=model, messages=messages, **kwargs
response = await openai_async_client.chat.completions.parse(
model=api_model, messages=messages, **kwargs
)
else:
response = await openai_async_client.chat.completions.create(
model=model, messages=messages, **kwargs
model=api_model, messages=messages, **kwargs
)
except APITimeoutError as e:
logger.error(f"OpenAI API Timeout Error: {e}")
await openai_async_client.close() # Ensure client is closed
raise
except APIConnectionError as e:
logger.error(f"OpenAI API Connection Error: {e}")
await openai_async_client.close() # Ensure client is closed
@@ -252,10 +321,6 @@ async def openai_complete_if_cache(
logger.error(f"OpenAI API Rate Limit Error: {e}")
await openai_async_client.close() # Ensure client is closed
raise
except APITimeoutError as e:
logger.error(f"OpenAI API Timeout Error: {e}")
await openai_async_client.close() # Ensure client is closed
raise
except Exception as e:
logger.error(
f"OpenAI API Call Failed,\nModel: {model},\nParams: {kwargs}, Got: {e}"
@@ -287,7 +352,10 @@ async def openai_complete_if_cache(
# Check if choices exists and is not empty
if not hasattr(chunk, "choices") or not chunk.choices:
logger.warning(f"Received chunk without choices: {chunk}")
# Azure OpenAI sends content filter results in first chunk without choices
logger.debug(
f"Received chunk without choices (likely Azure content filter): {chunk}"
)
continue
# Check if delta exists
@@ -449,46 +517,57 @@ async def openai_complete_if_cache(
raise InvalidResponseError("Invalid response from OpenAI API")
message = response.choices[0].message
content = getattr(message, "content", None)
reasoning_content = getattr(message, "reasoning_content", "")
# Handle COT logic for non-streaming responses (only if enabled)
final_content = ""
# Handle parsed responses (structured output via response_format)
# When using beta.chat.completions.parse(), the response is in message.parsed
if hasattr(message, "parsed") and message.parsed is not None:
# Serialize the parsed structured response to JSON
final_content = message.parsed.model_dump_json()
logger.debug("Using parsed structured response from API")
else:
# Handle regular content responses
content = getattr(message, "content", None)
reasoning_content = getattr(message, "reasoning_content", "")
if enable_cot:
# Check if we should include reasoning content
should_include_reasoning = False
if reasoning_content and reasoning_content.strip():
if not content or content.strip() == "":
# Case 1: Only reasoning content, should include COT
should_include_reasoning = True
final_content = (
content or ""
) # Use empty string if content is None
# Handle COT logic for non-streaming responses (only if enabled)
final_content = ""
if enable_cot:
# Check if we should include reasoning content
should_include_reasoning = False
if reasoning_content and reasoning_content.strip():
if not content or content.strip() == "":
# Case 1: Only reasoning content, should include COT
should_include_reasoning = True
final_content = (
content or ""
) # Use empty string if content is None
else:
# Case 3: Both content and reasoning_content present, ignore reasoning
should_include_reasoning = False
final_content = content
else:
# Case 3: Both content and reasoning_content present, ignore reasoning
should_include_reasoning = False
final_content = content
# No reasoning content, use regular content
final_content = content or ""
# Apply COT wrapping if needed
if should_include_reasoning:
if r"\u" in reasoning_content:
reasoning_content = safe_unicode_decode(
reasoning_content.encode("utf-8")
)
final_content = (
f"<think>{reasoning_content}</think>{final_content}"
)
else:
# No reasoning content, use regular content
# COT disabled, only use regular content
final_content = content or ""
# Apply COT wrapping if needed
if should_include_reasoning:
if r"\u" in reasoning_content:
reasoning_content = safe_unicode_decode(
reasoning_content.encode("utf-8")
)
final_content = f"<think>{reasoning_content}</think>{final_content}"
else:
# COT disabled, only use regular content
final_content = content or ""
# Validate final content
if not final_content or final_content.strip() == "":
logger.error("Received empty content from OpenAI API")
await openai_async_client.close() # Ensure client is closed
raise InvalidResponseError("Received empty content from OpenAI API")
# Validate final content
if not final_content or final_content.strip() == "":
logger.error("Received empty content from OpenAI API")
await openai_async_client.close() # Ensure client is closed
raise InvalidResponseError("Received empty content from OpenAI API")
# Apply Unicode decoding to final content if needed
if r"\u" in final_content:
@@ -522,8 +601,6 @@ async def openai_complete(
) -> Union[str, AsyncIterator[str]]:
if history_messages is None:
history_messages = []
if keyword_extraction:
kwargs["response_format"] = "json"
model_name = kwargs["hashing_kv"].global_config["llm_model_name"]
return await openai_complete_if_cache(
model_name,
@@ -545,8 +622,6 @@ async def gpt_4o_complete(
) -> str:
if history_messages is None:
history_messages = []
if keyword_extraction:
kwargs["response_format"] = GPTKeywordExtractionFormat
return await openai_complete_if_cache(
"gpt-4o",
prompt,
@@ -568,8 +643,6 @@ async def gpt_4o_mini_complete(
) -> str:
if history_messages is None:
history_messages = []
if keyword_extraction:
kwargs["response_format"] = GPTKeywordExtractionFormat
return await openai_complete_if_cache(
"gpt-4o-mini",
prompt,
@@ -622,24 +695,40 @@ async def openai_embed(
embedding_dim: int | None = None,
client_configs: dict[str, Any] | None = None,
token_tracker: Any | None = None,
use_azure: bool = False,
azure_deployment: str | None = None,
api_version: str | None = None,
) -> np.ndarray:
"""Generate embeddings for a list of texts using OpenAI's API.
This function supports both standard OpenAI and Azure OpenAI services.
Args:
texts: List of texts to embed.
model: The OpenAI embedding model to use.
base_url: Optional base URL for the OpenAI API.
api_key: Optional OpenAI API key. If None, uses the OPENAI_API_KEY environment variable.
model: The embedding model to use, e.g., "text-embedding-3-small" for standard OpenAI.
For Azure, this can be the deployment name.
base_url: Optional base URL for the API. For standard OpenAI, uses default OpenAI endpoint.
For Azure, this should be the Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com/).
api_key: Optional API key. For standard OpenAI, uses OPENAI_API_KEY environment variable if None.
For Azure, uses AZURE_EMBEDDING_API_KEY environment variable if None.
embedding_dim: Optional embedding dimension for dynamic dimension reduction.
**IMPORTANT**: This parameter is automatically injected by the EmbeddingFunc wrapper.
Do NOT manually pass this parameter when calling the function directly.
The dimension is controlled by the @wrap_embedding_func_with_attrs decorator.
Manually passing a different value will trigger a warning and be ignored.
When provided (by EmbeddingFunc), it will be passed to the OpenAI API for dimension reduction.
client_configs: Additional configuration options for the AsyncOpenAI client.
client_configs: Additional configuration options for the AsyncOpenAI/AsyncAzureOpenAI client.
These will override any default configurations but will be overridden by
explicit parameters (api_key, base_url).
explicit parameters (api_key, base_url). Supports proxy configuration,
custom headers, retry policies, etc.
token_tracker: Optional token usage tracker for monitoring API usage.
use_azure: Whether to use Azure OpenAI service instead of standard OpenAI.
When True, creates an AsyncAzureOpenAI client. Default is False.
azure_deployment: Azure OpenAI deployment name. Only used when use_azure=True.
If not specified, falls back to AZURE_EMBEDDING_DEPLOYMENT environment variable.
api_version: Azure OpenAI API version (e.g., "2024-02-15-preview"). Only used
when use_azure=True. If not specified, falls back to AZURE_EMBEDDING_API_VERSION
environment variable.
Returns:
A numpy array of embeddings, one per input text.
@@ -649,15 +738,24 @@ async def openai_embed(
RateLimitError: If the OpenAI API rate limit is exceeded.
APITimeoutError: If the OpenAI API request times out.
"""
# Create the OpenAI client
# Create the OpenAI client (supports both OpenAI and Azure)
openai_async_client = create_openai_async_client(
api_key=api_key, base_url=base_url, client_configs=client_configs
api_key=api_key,
base_url=base_url,
use_azure=use_azure,
azure_deployment=azure_deployment,
api_version=api_version,
client_configs=client_configs,
)
async with openai_async_client:
# Determine the correct model identifier to use
# For Azure OpenAI, we must use the deployment name instead of the model name
api_model = azure_deployment if use_azure and azure_deployment else model
# Prepare API call parameters
api_params = {
"model": model,
"model": api_model,
"input": texts,
"encoding_format": "base64",
}
@@ -684,3 +782,172 @@ async def openai_embed(
for dp in response.data
]
)
# Azure OpenAI wrapper functions for backward compatibility
async def azure_openai_complete_if_cache(
model,
prompt,
system_prompt: str | None = None,
history_messages: list[dict[str, Any]] | None = None,
enable_cot: bool = False,
base_url: str | None = None,
api_key: str | None = None,
token_tracker: Any | None = None,
stream: bool | None = None,
timeout: int | None = None,
api_version: str | None = None,
keyword_extraction: bool = False,
**kwargs,
):
"""Azure OpenAI completion wrapper function.
This function provides backward compatibility by wrapping the unified
openai_complete_if_cache implementation with Azure-specific parameter handling.
All parameters from the underlying openai_complete_if_cache are exposed to ensure
full feature parity and API consistency.
"""
# Handle Azure-specific environment variables and parameters
deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT") or model or os.getenv("LLM_MODEL")
base_url = (
base_url or os.getenv("AZURE_OPENAI_ENDPOINT") or os.getenv("LLM_BINDING_HOST")
)
api_key = (
api_key or os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("LLM_BINDING_API_KEY")
)
api_version = (
api_version
or os.getenv("AZURE_OPENAI_API_VERSION")
or os.getenv("OPENAI_API_VERSION")
or "2024-08-01-preview"
)
# Call the unified implementation with Azure-specific parameters
return await openai_complete_if_cache(
model=deployment,
prompt=prompt,
system_prompt=system_prompt,
history_messages=history_messages,
enable_cot=enable_cot,
base_url=base_url,
api_key=api_key,
token_tracker=token_tracker,
stream=stream,
timeout=timeout,
use_azure=True,
azure_deployment=deployment,
api_version=api_version,
keyword_extraction=keyword_extraction,
**kwargs,
)
async def azure_openai_complete(
prompt,
system_prompt=None,
history_messages=None,
keyword_extraction=False,
**kwargs,
) -> str:
"""Azure OpenAI complete wrapper function.
Provides backward compatibility for azure_openai_complete calls.
"""
if history_messages is None:
history_messages = []
result = await azure_openai_complete_if_cache(
os.getenv("LLM_MODEL", "gpt-4o-mini"),
prompt,
system_prompt=system_prompt,
history_messages=history_messages,
keyword_extraction=keyword_extraction,
**kwargs,
)
return result
@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192)
async def azure_openai_embed(
texts: list[str],
model: str | None = None,
base_url: str | None = None,
api_key: str | None = None,
token_tracker: Any | None = None,
client_configs: dict[str, Any] | None = None,
api_version: str | None = None,
) -> np.ndarray:
"""Azure OpenAI embedding wrapper function.
This function provides backward compatibility by wrapping the unified
openai_embed implementation with Azure-specific parameter handling.
All parameters from the underlying openai_embed are exposed to ensure
full feature parity and API consistency.
IMPORTANT - Decorator Usage:
1. This function is decorated with @wrap_embedding_func_with_attrs to provide
the EmbeddingFunc interface for users who need to access embedding_dim
and other attributes.
2. This function does NOT use @retry decorator to avoid double-wrapping,
since the underlying openai_embed.func already has retry logic.
3. This function calls openai_embed.func (the unwrapped function) instead of
openai_embed (the EmbeddingFunc instance) to avoid double decoration issues:
✅ Correct: await openai_embed.func(...) # Calls unwrapped function with retry
❌ Wrong: await openai_embed(...) # Would cause double EmbeddingFunc wrapping
Double decoration causes:
- Double injection of embedding_dim parameter
- Incorrect parameter passing to the underlying implementation
- Runtime errors due to parameter conflicts
The call chain with correct implementation:
azure_openai_embed(texts)
→ EmbeddingFunc.__call__(texts) # azure's decorator
→ azure_openai_embed_impl(texts, embedding_dim=1536)
→ openai_embed.func(texts, ...)
→ @retry_wrapper(texts, ...) # openai's retry (only one layer)
→ openai_embed_impl(texts, ...)
→ actual embedding computation
"""
# Handle Azure-specific environment variables and parameters
deployment = (
os.getenv("AZURE_EMBEDDING_DEPLOYMENT")
or model
or os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
)
base_url = (
base_url
or os.getenv("AZURE_EMBEDDING_ENDPOINT")
or os.getenv("EMBEDDING_BINDING_HOST")
)
api_key = (
api_key
or os.getenv("AZURE_EMBEDDING_API_KEY")
or os.getenv("EMBEDDING_BINDING_API_KEY")
)
api_version = (
api_version
or os.getenv("AZURE_EMBEDDING_API_VERSION")
or os.getenv("AZURE_OPENAI_API_VERSION")
or os.getenv("OPENAI_API_VERSION")
or "2024-08-01-preview"
)
# CRITICAL: Call openai_embed.func (unwrapped) to avoid double decoration
# openai_embed is an EmbeddingFunc instance, .func accesses the underlying function
return await openai_embed.func(
texts=texts,
model=deployment,
base_url=base_url,
api_key=api_key,
token_tracker=token_tracker,
client_configs=client_configs,
use_azure=True,
azure_deployment=deployment,
api_version=api_version,
)
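A minimal usage sketch for the Azure wrappers above, assuming this module is importable as `lightrag.llm.openai` and that real Azure resources back the placeholder endpoint, key, and deployment values set below; nothing here beyond the function names and environment-variable names comes from the diff itself:
```python
# Hedged sketch: the endpoint/key/deployment values are placeholders, and the import
# path lightrag.llm.openai is an assumption about where this module lives.
import asyncio
import os

from lightrag.llm.openai import azure_openai_complete, azure_openai_embed

# Chat completion fallbacks (see azure_openai_complete_if_cache above)
os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "https://your-resource.openai.azure.com/")
os.environ.setdefault("AZURE_OPENAI_API_KEY", "your-azure-key")
os.environ.setdefault("AZURE_OPENAI_DEPLOYMENT", "gpt-4o-mini")

# Embedding fallbacks (see azure_openai_embed above)
os.environ.setdefault("AZURE_EMBEDDING_ENDPOINT", "https://your-resource.openai.azure.com/")
os.environ.setdefault("AZURE_EMBEDDING_API_KEY", "your-azure-key")
os.environ.setdefault("AZURE_EMBEDDING_DEPLOYMENT", "text-embedding-3-small")

async def demo() -> None:
    answer = await azure_openai_complete("Summarize LightRAG in one sentence.")
    print(answer)

    vectors = await azure_openai_embed(["hello world", "LightRAG"])
    # azure_openai_embed is an EmbeddingFunc (embedding_dim=1536), so this is a numpy array
    print(vectors.shape)

asyncio.run(demo())
```
Because the wrappers route through `openai_complete_if_cache` and `openai_embed.func`, only a single retry layer and a single `EmbeddingFunc` wrapper are involved, as the docstrings above describe.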

View File

@@ -8,7 +8,10 @@ import json_repair
from typing import Any, AsyncIterator, overload, Literal
from collections import Counter, defaultdict
from lightrag.exceptions import PipelineCancelledException
from lightrag.exceptions import (
PipelineCancelledException,
ChunkTokenLimitExceededError,
)
from lightrag.utils import (
logger,
compute_mdhash_id,
@@ -109,6 +112,17 @@ def chunking_by_token_size(
if split_by_character_only:
for chunk in raw_chunks:
_tokens = tokenizer.encode(chunk)
if len(_tokens) > chunk_token_size:
logger.warning(
"Chunk split_by_character exceeds token limit: len=%d limit=%d",
len(_tokens),
chunk_token_size,
)
raise ChunkTokenLimitExceededError(
chunk_tokens=len(_tokens),
chunk_token_limit=chunk_token_size,
chunk_preview=chunk[:120],
)
new_chunks.append((len(_tokens), chunk))
else:
for chunk in raw_chunks:
@@ -383,8 +397,8 @@ async def _handle_single_entity_extraction(
# Validate entity name after all cleaning steps
if not entity_name or not entity_name.strip():
logger.warning(
f"Entity extraction error: entity name became empty after cleaning. Original: '{record_attributes[1]}'"
logger.info(
f"Empty entity name found after sanitization. Original: '{record_attributes[1]}'"
)
return None
@@ -460,14 +474,14 @@ async def _handle_single_relationship_extraction(
# Validate entity names after all cleaning steps
if not source:
logger.warning(
f"Relationship extraction error: source entity became empty after cleaning. Original: '{record_attributes[1]}'"
logger.info(
f"Empty source entity found after sanitization. Original: '{record_attributes[1]}'"
)
return None
if not target:
logger.warning(
f"Relationship extraction error: target entity became empty after cleaning. Original: '{record_attributes[2]}'"
logger.info(
f"Empty target entity found after sanitization. Original: '{record_attributes[2]}'"
)
return None
@@ -2818,9 +2832,11 @@ async def extract_entities(
cache_keys_collector = []
# Get initial extraction
# Format system prompt without input_text for each chunk (enables OpenAI prompt caching across chunks)
entity_extraction_system_prompt = PROMPTS[
"entity_extraction_system_prompt"
].format(**{**context_base, "input_text": content})
].format(**context_base)
# Format user prompts with input_text for each chunk
entity_extraction_user_prompt = PROMPTS["entity_extraction_user_prompt"].format(
**{**context_base, "input_text": content}
)
@@ -3250,10 +3266,16 @@ async def extract_keywords_only(
It ONLY extracts keywords (hl_keywords, ll_keywords).
"""
# 1. Handle cache if needed - add cache type for keywords
# 1. Build the examples
examples = "\n".join(PROMPTS["keywords_extraction_examples"])
language = global_config["addon_params"].get("language", DEFAULT_SUMMARY_LANGUAGE)
# 2. Handle cache if needed - add cache type for keywords
args_hash = compute_args_hash(
param.mode,
text,
language,
)
cached_result = await handle_cache(
hashing_kv, args_hash, text, param.mode, cache_type="keywords"
@@ -3270,11 +3292,6 @@ async def extract_keywords_only(
"Invalid cache format for keywords, proceeding with extraction"
)
# 2. Build the examples
examples = "\n".join(PROMPTS["keywords_extraction_examples"])
language = global_config["addon_params"].get("language", DEFAULT_SUMMARY_LANGUAGE)
# 3. Build the keyword-extraction prompt
kw_prompt = PROMPTS["keywords_extraction"].format(
query=text,

View File

@@ -58,18 +58,10 @@ You are a Knowledge Graph Specialist responsible for extracting entities and rel
---Examples---
{examples}
---Real Data to be Processed---
<Input>
Entity_types: [{entity_types}]
Text:
```
{input_text}
```
"""
PROMPTS["entity_extraction_user_prompt"] = """---Task---
Extract entities and relationships from the input text to be processed.
Extract entities and relationships from the input text in the Data to be Processed section below.
---Instructions---
1. **Strict Adherence to Format:** Strictly adhere to all format requirements for entity and relationship lists, including output order, field delimiters, and proper noun handling, as specified in the system prompt.
@@ -77,6 +69,15 @@ Extract entities and relationships from the input text to be processed.
3. **Completion Signal:** Output `{completion_delimiter}` as the final line after all relevant entities and relationships have been extracted and presented.
4. **Output Language:** Ensure the output language is {language}. Proper nouns (e.g., personal names, place names, organization names) must be kept in their original language and not translated.
---Data to be Processed---
<Entity_types>
[{entity_types}]
<Input Text>
```
{input_text}
```
<Output>
"""
@@ -99,7 +100,10 @@ Based on the last extraction task, identify and extract any **missed or incorrec
"""
PROMPTS["entity_extraction_examples"] = [
"""<Input Text>
"""<Entity_types>
["Person","Creature","Organization","Location","Event","Concept","Method","Content","Data","Artifact","NaturalObject"]
<Input Text>
```
while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.
@@ -124,7 +128,10 @@ relation{tuple_delimiter}Taylor{tuple_delimiter}The Device{tuple_delimiter}rever
{completion_delimiter}
""",
"""<Input Text>
"""<Entity_types>
["Person","Creature","Organization","Location","Event","Concept","Method","Content","Data","Artifact","NaturalObject"]
<Input Text>
```
Stock markets faced a sharp downturn today as tech giants saw significant declines, with the global tech index dropping by 3.4% in midday trading. Analysts attribute the selloff to investor concerns over rising interest rates and regulatory uncertainty.
@@ -151,7 +158,10 @@ relation{tuple_delimiter}Federal Reserve Policy Announcement{tuple_delimiter}Mar
{completion_delimiter}
""",
"""<Input Text>
"""<Entity_types>
["Person","Creature","Organization","Location","Event","Concept","Method","Content","Data","Artifact","NaturalObject"]
<Input Text>
```
At the World Athletics Championship in Tokyo, Noah Carter broke the 100m sprint record using cutting-edge carbon-fiber spikes.
```
@@ -374,6 +384,7 @@ Given a user query, your task is to extract two distinct types of keywords:
2. **Source of Truth**: All keywords must be explicitly derived from the user query, and both the high-level and low-level keyword categories are required to contain content.
3. **Concise & Meaningful**: Keywords should be concise words or meaningful phrases. Prioritize multi-word phrases when they represent a single concept. For example, from "latest financial report of Apple Inc.", you should extract "latest financial report" and "Apple Inc." rather than "latest", "financial", "report", and "Apple".
4. **Handle Edge Cases**: For queries that are too simple, vague, or nonsensical (e.g., "hello", "ok", "asdfghjkl"), you must return a JSON object with empty lists for both keyword types.
5. **Language**: All extracted keywords MUST be in {language}. Proper nouns (e.g., personal names, place names, organization names) should be kept in their original language.
---Examples---
{examples}

View File

@@ -2,7 +2,7 @@ from __future__ import annotations
import os
import aiohttp
from typing import Any, List, Dict, Optional
from typing import Any, List, Dict, Optional, Tuple
from tenacity import (
retry,
stop_after_attempt,
@@ -19,6 +19,158 @@ from dotenv import load_dotenv
load_dotenv(dotenv_path=".env", override=False)
def chunk_documents_for_rerank(
documents: List[str],
max_tokens: int = 480,
overlap_tokens: int = 32,
tokenizer_model: str = "gpt-4o-mini",
) -> Tuple[List[str], List[int]]:
"""
Chunk documents that exceed token limit for reranking.
Args:
documents: List of document strings to chunk
max_tokens: Maximum tokens per chunk (default 480, leaving margin under the 512-token limit)
overlap_tokens: Number of tokens to overlap between chunks
tokenizer_model: Model name for tiktoken tokenizer
Returns:
Tuple of (chunked_documents, original_doc_indices)
- chunked_documents: List of document chunks (may be more than input)
- original_doc_indices: Maps each chunk back to its original document index
"""
# Clamp overlap_tokens to ensure the loop always advances
# If overlap_tokens >= max_tokens, the chunking loop would hang
if overlap_tokens >= max_tokens:
original_overlap = overlap_tokens
# Ensure overlap is at least 1 token less than max to guarantee progress
# For very small max_tokens (e.g., 1), set overlap to 0
overlap_tokens = max(0, max_tokens - 1)
logger.warning(
f"overlap_tokens ({original_overlap}) must be less than max_tokens ({max_tokens}). "
f"Clamping to {overlap_tokens} to prevent infinite loop."
)
try:
from .utils import TiktokenTokenizer
tokenizer = TiktokenTokenizer(model_name=tokenizer_model)
except Exception as e:
logger.warning(
f"Failed to initialize tokenizer: {e}. Using character-based approximation."
)
# Fallback: approximate 1 token ≈ 4 characters
max_chars = max_tokens * 4
overlap_chars = overlap_tokens * 4
chunked_docs = []
doc_indices = []
for idx, doc in enumerate(documents):
if len(doc) <= max_chars:
chunked_docs.append(doc)
doc_indices.append(idx)
else:
# Split into overlapping chunks
start = 0
while start < len(doc):
end = min(start + max_chars, len(doc))
chunk = doc[start:end]
chunked_docs.append(chunk)
doc_indices.append(idx)
if end >= len(doc):
break
start = end - overlap_chars
return chunked_docs, doc_indices
# Use tokenizer for accurate chunking
chunked_docs = []
doc_indices = []
for idx, doc in enumerate(documents):
tokens = tokenizer.encode(doc)
if len(tokens) <= max_tokens:
# Document fits in one chunk
chunked_docs.append(doc)
doc_indices.append(idx)
else:
# Split into overlapping chunks
start = 0
while start < len(tokens):
end = min(start + max_tokens, len(tokens))
chunk_tokens = tokens[start:end]
chunk_text = tokenizer.decode(chunk_tokens)
chunked_docs.append(chunk_text)
doc_indices.append(idx)
if end >= len(tokens):
break
start = end - overlap_tokens
return chunked_docs, doc_indices
def aggregate_chunk_scores(
chunk_results: List[Dict[str, Any]],
doc_indices: List[int],
num_original_docs: int,
aggregation: str = "max",
) -> List[Dict[str, Any]]:
"""
Aggregate rerank scores from document chunks back to original documents.
Args:
chunk_results: Rerank results for chunks [{"index": chunk_idx, "relevance_score": score}, ...]
doc_indices: Maps each chunk index to original document index
num_original_docs: Total number of original documents
aggregation: Strategy for aggregating scores ("max", "mean", "first")
Returns:
List of results for original documents [{"index": doc_idx, "relevance_score": score}, ...]
"""
# Group scores by original document index
doc_scores: Dict[int, List[float]] = {i: [] for i in range(num_original_docs)}
for result in chunk_results:
chunk_idx = result["index"]
score = result["relevance_score"]
if 0 <= chunk_idx < len(doc_indices):
original_doc_idx = doc_indices[chunk_idx]
doc_scores[original_doc_idx].append(score)
# Aggregate scores
aggregated_results = []
for doc_idx, scores in doc_scores.items():
if not scores:
continue
if aggregation == "max":
final_score = max(scores)
elif aggregation == "mean":
final_score = sum(scores) / len(scores)
elif aggregation == "first":
final_score = scores[0]
else:
logger.warning(f"Unknown aggregation strategy: {aggregation}, using max")
final_score = max(scores)
aggregated_results.append(
{
"index": doc_idx,
"relevance_score": final_score,
}
)
# Sort by relevance score (descending)
aggregated_results.sort(key=lambda x: x["relevance_score"], reverse=True)
return aggregated_results
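A short sketch of how the two helpers above compose; the relevance scores are synthetic stand-ins for a rerank API response, not real model output:
```python
# Sketch of the chunk -> rerank -> aggregate round trip using the helpers above.
docs = ["short passage", " ".join(f"tok{i}" for i in range(2000))]

chunks, doc_indices = chunk_documents_for_rerank(docs, max_tokens=480, overlap_tokens=32)

# Pretend each chunk got a score back from the rerank endpoint.
fake_results = [
    {"index": i, "relevance_score": 1.0 / (i + 1)} for i in range(len(chunks))
]

doc_level = aggregate_chunk_scores(
    fake_results, doc_indices, num_original_docs=len(docs), aggregation="max"
)
# One entry per original document, sorted by relevance_score in descending order.
print(doc_level)
```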
@retry(
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=4, max=60),
@@ -38,6 +190,8 @@ async def generic_rerank_api(
extra_body: Optional[Dict[str, Any]] = None,
response_format: str = "standard", # "standard" (Jina/Cohere) or "aliyun"
request_format: str = "standard", # "standard" (Jina/Cohere) or "aliyun"
enable_chunking: bool = False,
max_tokens_per_doc: int = 480,
) -> List[Dict[str, Any]]:
"""
Generic rerank API call for Jina/Cohere/Aliyun models.
@@ -52,6 +206,9 @@ async def generic_rerank_api(
return_documents: Whether to return document text (Jina only)
extra_body: Additional body parameters
response_format: Response format type ("standard" for Jina/Cohere, "aliyun" for Aliyun)
request_format: Request format type ("standard" for Jina/Cohere, "aliyun" for Aliyun)
enable_chunking: Whether to chunk documents exceeding token limit
max_tokens_per_doc: Maximum tokens per document for chunking
Returns:
List of dictionary of ["index": int, "relevance_score": float]
@@ -63,6 +220,27 @@ async def generic_rerank_api(
if api_key is not None:
headers["Authorization"] = f"Bearer {api_key}"
# Handle document chunking if enabled
original_documents = documents
doc_indices = None
original_top_n = top_n # Save original top_n for post-aggregation limiting
if enable_chunking:
documents, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=max_tokens_per_doc
)
logger.debug(
f"Chunked {len(original_documents)} documents into {len(documents)} chunks"
)
# When chunking is enabled, disable top_n at API level to get all chunk scores
# This ensures proper document-level coverage after aggregation
# We'll apply top_n to aggregated document results instead
if top_n is not None:
logger.debug(
f"Chunking enabled: disabled API-level top_n={top_n} to ensure complete document coverage"
)
top_n = None
# Build request payload based on request format
if request_format == "aliyun":
# Aliyun format: nested input/parameters structure
@@ -86,7 +264,7 @@ async def generic_rerank_api(
if extra_body:
payload["parameters"].update(extra_body)
else:
# Standard format for Jina/Cohere
# Standard format for Jina/Cohere/OpenAI
payload = {
"model": model,
"query": query,
@@ -98,7 +276,7 @@ async def generic_rerank_api(
payload["top_n"] = top_n
# Only Jina API supports return_documents parameter
if return_documents is not None:
if return_documents is not None and response_format in ("standard",):
payload["return_documents"] = return_documents
# Add extra parameters
@@ -147,7 +325,6 @@ async def generic_rerank_api(
f"Expected 'output.results' to be list, got {type(results)}: {results}"
)
results = []
elif response_format == "standard":
# Standard format: {"results": [...]}
results = response_json.get("results", [])
@@ -158,16 +335,35 @@ async def generic_rerank_api(
results = []
else:
raise ValueError(f"Unsupported response format: {response_format}")
if not results:
logger.warning("Rerank API returned empty results")
return []
# Standardize return format
return [
standardized_results = [
{"index": result["index"], "relevance_score": result["relevance_score"]}
for result in results
]
# Aggregate chunk scores back to original documents if chunking was enabled
if enable_chunking and doc_indices:
standardized_results = aggregate_chunk_scores(
standardized_results,
doc_indices,
len(original_documents),
aggregation="max",
)
# Apply original top_n limit at document level (post-aggregation)
# This preserves document-level semantics: top_n limits documents, not chunks
if (
original_top_n is not None
and len(standardized_results) > original_top_n
):
standardized_results = standardized_results[:original_top_n]
return standardized_results
async def cohere_rerank(
query: str,
@@ -177,21 +373,46 @@ async def cohere_rerank(
model: str = "rerank-v3.5",
base_url: str = "https://api.cohere.com/v2/rerank",
extra_body: Optional[Dict[str, Any]] = None,
enable_chunking: bool = False,
max_tokens_per_doc: int = 4096,
) -> List[Dict[str, Any]]:
"""
Rerank documents using Cohere API.
Supports both the standard Cohere API and Cohere-compatible proxies.
Args:
query: The search query
documents: List of strings to rerank
top_n: Number of top results to return
api_key: API key
model: rerank model name
api_key: API key for authentication
model: rerank model name (default: rerank-v3.5)
base_url: API endpoint
extra_body: Additional body parameters for the HTTP request (reserved for extra params)
enable_chunking: Whether to chunk documents exceeding max_tokens_per_doc
max_tokens_per_doc: Maximum tokens per document (default: 4096 for Cohere v3.5)
Returns:
List of dictionary of ["index": int, "relevance_score": float]
Example:
>>> # Standard Cohere API
>>> results = await cohere_rerank(
... query="What is the meaning of life?",
... documents=["Doc1", "Doc2"],
... api_key="your-cohere-key"
... )
>>> # LiteLLM proxy with user authentication
>>> results = await cohere_rerank(
... query="What is vector search?",
... documents=["Doc1", "Doc2"],
... model="answerai-colbert-small-v1",
... base_url="https://llm-proxy.example.com/v2/rerank",
... api_key="your-proxy-key",
... enable_chunking=True,
... max_tokens_per_doc=480
... )
"""
if api_key is None:
api_key = os.getenv("COHERE_API_KEY") or os.getenv("RERANK_BINDING_API_KEY")
@@ -206,6 +427,8 @@ async def cohere_rerank(
return_documents=None, # Cohere doesn't support this parameter
extra_body=extra_body,
response_format="standard",
enable_chunking=enable_chunking,
max_tokens_per_doc=max_tokens_per_doc,
)

View File

@@ -1129,11 +1129,16 @@ class CleanupTool:
pass
async def main():
"""Main entry point"""
async def async_main():
"""Async main entry point"""
tool = CleanupTool()
await tool.run()
def main():
"""Synchronous entry point for CLI command"""
asyncio.run(async_main())
if __name__ == "__main__":
asyncio.run(main())
main()

View File

@@ -1,6 +1,8 @@
from __future__ import annotations
import weakref
import sys
import asyncio
import html
import csv
@@ -40,6 +42,35 @@ from lightrag.constants import (
SOURCE_IDS_LIMIT_METHOD_FIFO,
)
# Precompile regex pattern for JSON sanitization (module-level, compiled once)
_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]")
class SafeStreamHandler(logging.StreamHandler):
"""StreamHandler that gracefully handles closed streams during shutdown.
This handler prevents "ValueError: I/O operation on closed file" errors
that can occur when pytest or other test frameworks close stdout/stderr
before Python's logging cleanup runs.
"""
def flush(self):
"""Flush the stream, ignoring errors if the stream is closed."""
try:
super().flush()
except (ValueError, OSError):
# Stream is closed or otherwise unavailable, silently ignore
pass
def close(self):
"""Close the handler, ignoring errors if the stream is already closed."""
try:
super().close()
except (ValueError, OSError):
# Stream is closed or otherwise unavailable, silently ignore
pass
# Initialize logger with basic configuration
logger = logging.getLogger("lightrag")
logger.propagate = False # prevent log message send to root logger
@@ -47,7 +78,7 @@ logger.setLevel(logging.INFO)
# Add console handler if no handlers exist
if not logger.handlers:
console_handler = logging.StreamHandler()
console_handler = SafeStreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(levelname)s: %(message)s")
console_handler.setFormatter(formatter)
@@ -56,8 +87,32 @@ if not logger.handlers:
# Set httpx logging level to WARNING
logging.getLogger("httpx").setLevel(logging.WARNING)
# Precompile regex pattern for JSON sanitization (module-level, compiled once)
_SURROGATE_PATTERN = re.compile(r"[\uD800-\uDFFF\uFFFE\uFFFF]")
def _patch_ascii_colors_console_handler() -> None:
"""Prevent ascii_colors from printing flush errors during interpreter exit."""
try:
from ascii_colors import ConsoleHandler
except ImportError:
return
if getattr(ConsoleHandler, "_lightrag_patched", False):
return
original_handle_error = ConsoleHandler.handle_error
def _safe_handle_error(self, message: str) -> None: # type: ignore[override]
exc_type, _, _ = sys.exc_info()
if exc_type in (ValueError, OSError) and "close" in message.lower():
return
original_handle_error(self, message)
ConsoleHandler.handle_error = _safe_handle_error # type: ignore[assignment]
ConsoleHandler._lightrag_patched = True # type: ignore[attr-defined]
_patch_ascii_colors_console_handler()
# Global import for pypinyin with startup-time logging
try:
@@ -286,8 +341,8 @@ def setup_logger(
logger_instance.handlers = [] # Clear existing handlers
logger_instance.propagate = False
# Add console handler
console_handler = logging.StreamHandler()
# Add console handler with safe stream handling
console_handler = SafeStreamHandler()
console_handler.setFormatter(simple_formatter)
console_handler.setLevel(level)
logger_instance.addHandler(console_handler)
@@ -963,7 +1018,76 @@ def priority_limit_async_func_call(
def wrap_embedding_func_with_attrs(**kwargs):
"""Wrap a function with attributes"""
"""Decorator to add embedding dimension and token limit attributes to embedding functions.
This decorator wraps an async embedding function and returns an EmbeddingFunc instance
that automatically handles dimension parameter injection and attribute management.
WARNING: DO NOT apply this decorator to wrapper functions that call other
decorated embedding functions. This will cause double decoration and parameter
injection conflicts.
Correct usage patterns:
1. Direct implementation (decorated):
```python
@wrap_embedding_func_with_attrs(embedding_dim=1536)
async def my_embed(texts, embedding_dim=None):
# Direct implementation
return embeddings
```
2. Wrapper calling decorated function (DO NOT decorate wrapper):
```python
# my_embed is already decorated above
async def my_wrapper(texts, **kwargs): # ❌ DO NOT decorate this!
# Must call .func to access unwrapped implementation
return await my_embed.func(texts, **kwargs)
```
3. Wrapper calling decorated function (properly decorated):
```python
@wrap_embedding_func_with_attrs(embedding_dim=1536)
async def my_wrapper(texts, **kwargs): # ✅ Can decorate if calling .func
# Calling .func avoids double decoration
return await my_embed.func(texts, **kwargs)
```
The decorated function becomes an EmbeddingFunc instance with:
- embedding_dim: The embedding dimension
- max_token_size: Maximum token limit (optional)
- func: The original unwrapped function (access via .func)
- __call__: Wrapper that injects embedding_dim parameter
Double decoration causes:
- Double injection of embedding_dim parameter
- Incorrect parameter passing to the underlying implementation
- Runtime errors due to parameter conflicts
Args:
embedding_dim: The dimension of embedding vectors
max_token_size: Maximum number of tokens (optional)
send_dimensions: Whether to inject embedding_dim as a keyword argument (optional)
Returns:
A decorator that wraps the function as an EmbeddingFunc instance
Example of correct wrapper implementation:
```python
@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8192)
@retry(...)
async def openai_embed(texts, ...):
# Base implementation
pass
@wrap_embedding_func_with_attrs(embedding_dim=1536) # Note: No @retry here!
async def azure_openai_embed(texts, ...):
# CRITICAL: Call .func to access unwrapped function
return await openai_embed.func(texts, ...) # ✅ Correct
# return await openai_embed(texts, ...) # ❌ Wrong - double decoration!
```
"""
def final_decro(func) -> EmbeddingFunc:
new_func = EmbeddingFunc(**kwargs, func=func)

File diff suppressed because it is too large Load Diff

View File

@@ -16,32 +16,32 @@
"preview-no-bun": "vite preview"
},
"dependencies": {
"@faker-js/faker": "^9.9.0",
"@faker-js/faker": "^10.1.0",
"@radix-ui/react-alert-dialog": "^1.1.15",
"@radix-ui/react-checkbox": "^1.3.3",
"@radix-ui/react-dialog": "^1.1.15",
"@radix-ui/react-popover": "^1.1.15",
"@radix-ui/react-progress": "^1.1.7",
"@radix-ui/react-progress": "^1.1.8",
"@radix-ui/react-scroll-area": "^1.2.10",
"@radix-ui/react-select": "^2.2.6",
"@radix-ui/react-separator": "^1.1.7",
"@radix-ui/react-slot": "^1.2.3",
"@radix-ui/react-separator": "^1.1.8",
"@radix-ui/react-slot": "^1.2.4",
"@radix-ui/react-tabs": "^1.1.13",
"@radix-ui/react-tooltip": "^1.2.8",
"@radix-ui/react-use-controllable-state": "^1.2.2",
"@react-sigma/core": "^5.0.4",
"@react-sigma/graph-search": "^5.0.4",
"@react-sigma/layout-circlepack": "^5.0.4",
"@react-sigma/layout-circular": "^5.0.4",
"@react-sigma/layout-force": "^5.0.4",
"@react-sigma/layout-forceatlas2": "^5.0.4",
"@react-sigma/layout-noverlap": "^5.0.4",
"@react-sigma/layout-random": "^5.0.4",
"@react-sigma/minimap": "^5.0.5",
"@react-sigma/core": "^5.0.6",
"@react-sigma/graph-search": "^5.0.6",
"@react-sigma/layout-circlepack": "^5.0.6",
"@react-sigma/layout-circular": "^5.0.6",
"@react-sigma/layout-force": "^5.0.6",
"@react-sigma/layout-forceatlas2": "^5.0.6",
"@react-sigma/layout-noverlap": "^5.0.6",
"@react-sigma/layout-random": "^5.0.6",
"@react-sigma/minimap": "^5.0.6",
"@sigma/edge-curve": "^3.1.0",
"@sigma/node-border": "^3.0.0",
"@tanstack/react-table": "^8.21.3",
"axios": "^1.12.2",
"axios": "^1.13.2",
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"cmdk": "^1.1.1",
@@ -51,21 +51,21 @@
"graphology-layout-force": "^0.2.4",
"graphology-layout-forceatlas2": "^0.10.1",
"graphology-layout-noverlap": "^0.4.2",
"i18next": "^24.2.3",
"katex": "^0.16.23",
"lucide-react": "^0.475.0",
"mermaid": "^11.12.0",
"i18next": "^25.6.3",
"katex": "^0.16.25",
"mermaid": "^11.12.1",
"lucide-react": "^0.555.0",
"minisearch": "^7.2.0",
"react": "^19.2.0",
"react-dom": "^19.2.0",
"react": "^19.2.1",
"react-dom": "^19.2.1",
"react-dropzone": "^14.3.8",
"react-error-boundary": "^5.0.0",
"react-i18next": "^15.7.4",
"react-markdown": "^9.1.0",
"react-error-boundary": "^6.0.0",
"react-i18next": "^16.3.5",
"react-markdown": "^10.1.0",
"react-number-format": "^5.4.4",
"react-router-dom": "^7.9.4",
"react-router-dom": "^7.9.6",
"react-select": "^5.10.2",
"react-syntax-highlighter": "^15.6.6",
"react-syntax-highlighter": "^16.1.0",
"rehype-katex": "^7.0.1",
"rehype-raw": "^7.0.0",
"rehype-react": "^8.0.0",
@@ -73,40 +73,40 @@
"remark-math": "^6.0.0",
"seedrandom": "^3.0.5",
"sigma": "^3.0.2",
"sonner": "^1.7.4",
"tailwind-merge": "^3.3.1",
"sonner": "^2.0.7",
"tailwind-merge": "^3.4.0",
"tailwind-scrollbar": "^4.0.2",
"typography": "^0.16.24",
"unist-util-visit": "^5.0.0",
"zustand": "^5.0.8"
"zustand": "^5.0.9"
},
"devDependencies": {
"@eslint/js": "^9.37.0",
"@stylistic/eslint-plugin-js": "^3.1.0",
"@tailwindcss/vite": "^4.1.14",
"@types/bun": "^1.2.23",
"@eslint/js": "^9.39.1",
"@stylistic/eslint-plugin-js": "^4.4.1",
"@types/bun": "^1.3.3",
"@tailwindcss/vite": "^4.1.17",
"@types/katex": "^0.16.7",
"@types/node": "^22.18.9",
"@types/node": "^24.10.1",
"@tailwindcss/typography": "^0.5.15",
"@types/react": "^19.2.2",
"@types/react-dom": "^19.2.1",
"@types/react": "^19.2.7",
"@types/react-dom": "^19.2.3",
"@types/react-i18next": "^8.1.0",
"@types/react-syntax-highlighter": "^15.5.13",
"@types/seedrandom": "^3.0.8",
"@vitejs/plugin-react-swc": "^3.11.0",
"eslint": "^9.37.0",
"@vitejs/plugin-react-swc": "^4.2.2",
"eslint": "^9.39.1",
"eslint-config-prettier": "^10.1.8",
"eslint-plugin-react": "^7.37.5",
"eslint-plugin-react-hooks": "^5.2.0",
"eslint-plugin-react-refresh": "^0.4.23",
"globals": "^15.15.0",
"eslint-plugin-react-hooks": "^7.0.1",
"eslint-plugin-react-refresh": "^0.4.24",
"globals": "^16.5.0",
"graphology-types": "^0.24.8",
"prettier": "^3.6.2",
"prettier-plugin-tailwindcss": "^0.6.14",
"tailwindcss": "^4.1.14",
"prettier": "^3.7.1",
"prettier-plugin-tailwindcss": "^0.7.2",
"typescript-eslint": "^8.48.0",
"tailwindcss": "^4.1.17",
"tailwindcss-animate": "^1.0.7",
"typescript": "~5.7.3",
"typescript-eslint": "^8.46.0",
"vite": "^6.3.6"
"typescript": "~5.9.3",
"vite": "^7.2.6"
}
}

View File

@@ -76,7 +76,8 @@ export const ChatMessage = ({
? message.content
: (displayContent !== undefined ? displayContent : (message.content || ''))
// Load KaTeX dynamically
// Load KaTeX rehype plugin dynamically
// Note: KaTeX extensions (mhchem, copy-tex) are imported statically in main.tsx
useEffect(() => {
const loadKaTeX = async () => {
try {
@@ -84,7 +85,6 @@ export const ChatMessage = ({
setKatexPlugin(() => rehypeKatex);
} catch (error) {
console.error('Failed to load KaTeX plugin:', error);
// Set to null to ensure we don't try to use a failed plugin
setKatexPlugin(null);
}
};

View File

@@ -4,6 +4,9 @@ import './index.css'
import AppRouter from './AppRouter'
import './i18n.ts';
import 'katex/dist/katex.min.css';
// Import KaTeX extensions at app startup to ensure they are registered before any rendering
import 'katex/contrib/mhchem'; // Chemistry formulas: \ce{} and \pu{}
import 'katex/contrib/copy-tex'; // Allow copying rendered formulas as LaTeX source

View File

@@ -1 +1,2 @@
declare module 'katex/contrib/mhchem';
declare module 'katex/contrib/copy-tex';

View File

@@ -10,7 +10,10 @@ export default defineConfig({
resolve: {
alias: {
'@': path.resolve(__dirname, './src')
}
},
// Force all modules to use the same katex instance
// This ensures mhchem extension registered in main.tsx is available to rehype-katex
dedupe: ['katex']
},
// base: import.meta.env.VITE_BASE_URL || '/webui/',
base: webuiPrefix,

251
paging.md
View File

@@ -1,251 +0,0 @@
# Pagination Plan for the Document List Page
## I. Objectives
### Current Problems
- The document page currently loads every document at once, so the UI becomes slow when there are many documents
- Frontend memory usage is excessive, and the user experience suffers
- Status filtering and sorting are performed entirely on the frontend, which is inefficient
### Goals
- Implement backend paginated queries to reduce the amount of data transferred per request
- Add a pagination control component with page navigation and jump-to-page support
- Let users set the number of rows per page (10-200)
- Keep the existing status filtering and sorting behavior unchanged
- Improve performance with large document sets
## II. Overall Architecture
### Design Principles
1. **Unified pagination interface**: the backend exposes a single pagination API that supports status filtering and sorting
2. **Smart refresh strategy**: choose the refresh frequency and scope based on document processing status
3. **Immediate user feedback**: status switches and pagination actions respond instantly
4. **Backward compatibility**: keep existing functionality intact without disrupting current workflows
5. **Performance**: reduce memory usage and optimize network requests
### Technical Approach
- **Backend**: add paginated query interfaces on top of the existing storage layer
- **Frontend**: rework the DocumentManager component and add pagination controls
- **Data flow**: a unified paginated query plus an independent status-count query
## III. Backend Changes
### Step 1: Extend the Storage Layer Interface
**Files changed**: `lightrag/kg/base.py`
**Key ideas**
- Add paginated query methods to the BaseDocStatusStorage abstract class
- Design a unified pagination interface that accepts status filter, sorting, and pagination parameters
- Return a tuple of the document list and the total count
**Interface design** (a Python sketch follows the signature block below)
```
get_docs_paginated(status_filter, page, page_size, sort_field, sort_direction) -> (documents, total_count)
count_by_status(status) -> int
get_all_status_counts() -> Dict[str, int]
```
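A minimal Python sketch of the interface above, keeping the method and parameter names listed in this plan; the concrete type annotations are illustrative assumptions rather than the repository's actual signatures:
```python
# Hedged sketch of the abstract interface described above (lightrag/kg/base.py).
from abc import ABC, abstractmethod
from typing import Any

class BaseDocStatusStorage(ABC):
    @abstractmethod
    async def get_docs_paginated(
        self,
        status_filter: str | None,
        page: int,
        page_size: int,
        sort_field: str = "updated_at",
        sort_direction: str = "desc",
    ) -> tuple[list[dict[str, Any]], int]:
        """Return one page of documents plus the total count for the filter."""

    @abstractmethod
    async def count_by_status(self, status: str) -> int:
        """Return how many documents are in the given status."""

    @abstractmethod
    async def get_all_status_counts(self) -> dict[str, int]:
        """Return a mapping of status name -> document count."""
```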
### Step 2: Implement Each Storage Backend
**Files changed**:
- `lightrag/kg/postgres_impl.py`
- `lightrag/kg/mongo_impl.py`
- `lightrag/kg/redis_impl.py`
- `lightrag/kg/json_doc_status_impl.py`
**PostgreSQL notes**
- Use LIMIT and OFFSET for pagination
- Build dynamic WHERE clauses to support status filtering
- Use a COUNT query to obtain the total count
- Add suitable database indexes to optimize query performance
**MongoDB notes**
- Use skip() and limit() for pagination
- Use aggregation pipelines for status counts
- Optimize query filters and indexes
**Redis and JSON notes**
- Start with a simple implementation: load the full document list into memory, then filter and sort
**Key considerations**
- Keep pagination logic consistent across storage backends
- Handle edge cases (empty results, out-of-range page numbers, etc.)
- Optimize query performance and avoid full table scans
### Step 3: Update the API Routing Layer
**Files changed**: `lightrag/api/routers/document_routes.py`
**New endpoints**
1. `POST /documents/paginated` - paginated document query
2. `GET /documents/status_counts` - fetch status counts
**Data models** (sketched after this step)
- DocumentsRequest: pagination request parameters
- PaginatedDocsResponse: paginated response payload
- PaginationInfo: pagination metadata
**Key logic**
- Parameter validation (page range, page-size limits)
- Query the page of documents and the status counts in parallel
- Error handling and exception responses
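A hedged sketch of the request and response models named above; only the model names and the 10-200 page-size range come from this plan, and every other field choice is illustrative:
```python
# Hedged sketch of the data models for POST /documents/paginated.
from typing import Any
from pydantic import BaseModel, Field

class DocumentsRequest(BaseModel):
    status_filter: str | None = None
    page: int = Field(1, ge=1)
    page_size: int = Field(50, ge=10, le=200)
    sort_field: str = "updated_at"
    sort_direction: str = "desc"

class PaginationInfo(BaseModel):
    page: int
    page_size: int
    total_count: int
    total_pages: int

class PaginatedDocsResponse(BaseModel):
    documents: list[dict[str, Any]]
    pagination: PaginationInfo
    status_counts: dict[str, int]
```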
### Step 4: Database Optimization
**Indexing strategy**
- Create a composite index on workspace + status + updated_at
- Create a composite index on workspace + status + created_at
- Create an index on workspace + updated_at
- Create an index on workspace + created_at
**Performance considerations**
- Avoid the performance pitfalls of deep pagination
- Consider adding a cache layer to optimize status-count queries
- Monitor query performance and adjust the indexing strategy when necessary
## IV. Frontend Changes
### Step 1: Extend the API Client
**Files changed**: `lightrag_webui/src/api/lightrag.ts`
**New functions**
- `getDocumentsPaginated()` - paginated document query
- `getDocumentStatusCounts()` - fetch status counts
**Type definitions**
- Define TypeScript types for the pagination request and response
- Ensure type safety and editor autocompletion
### Step 2: Build the Pagination Control Component
**New file**: `lightrag_webui/src/components/ui/PaginationControls.tsx`
**Component features**
- Compact and full display modes
- Page-number input and jump-to-page
- Page-size selector (10-200)
- Total-count display
- Disabled-state handling
**Design notes**
- Responsive design that adapts to different screen sizes
- Debouncing to avoid excessive requests
- Error handling with state rollback
- Placement: above the status buttons, centered on the same row as the scan button
### Step 3: Improve the Status Filter Buttons
**Files changed**: the existing status-filter components
**Improvements**
- Add loading indicators
- Smart hints when the data is insufficient
- Periodic data refresh; when switching status, refresh immediately if the current status data is more than 5 seconds older than the last refresh
- Prevent duplicate clicks and concurrent requests
### Step 4: Rework the DocumentManager Component
**Files changed**: `lightrag_webui/src/features/DocumentManager.tsx`
**Core changes**
**State management refactor**
- Replace the docs state with currentPageDocs, holding only the current page of data
- Add a pagination state for pagination metadata
- Add a statusCounts state that manages status counts independently
- Add loading-state flags (isStatusChanging, isRefreshing)
**Data fetching strategy**
- Smart refresh: full refresh while processing is active, lightweight refresh once stable
- Refresh immediately when the status filter changes
- Update data immediately on pagination actions
- Coordinate periodic refresh with manual operations
**Layout adjustments**
- Place the pagination control in the middle of the top action bar
- Keep the status filter buttons above the table
- Ensure the responsive layout still fits
**Event handling**
- On status switch, reset to the first page if the current page no longer has enough data
- Recalculate the page number intelligently when the page size changes
- Roll back state on errors
## V. User Experience
### Immediate Feedback
- Show a loading animation while switching status
- Provide visual feedback during pagination actions
- Hint the user when the data is insufficient
### Error Handling
- Retry automatically on network errors
- Roll back state when an operation fails
- Friendly error messages
### Performance Measures
- Debounce frequent operations
- Use the smart refresh strategy to cut unnecessary requests
- Clean up timers and in-flight requests when components unmount
## VI. Compatibility
### Backward Compatibility
- Keep the original /documents endpoint as a fallback
- Existing features (sorting, filtering, selection) remain unchanged
- Progressive rollout, gated by a configuration switch
### Data Consistency
- Keep paginated data and status counts in sync
- Handle consistency issues caused by concurrent updates
- Refresh periodically to keep data current
## VII. Testing Strategy
### Functional Tests
- Cover a range of pagination scenarios
- Test combinations of status filters
- Verify sorting behavior
- Test boundary conditions
### Performance Tests
- Large data-volume scenarios
- Concurrent-access stress tests
- Memory-usage monitoring
- Response-time measurements
### Compatibility Tests
- Test across the different storage backends
- Cross-browser compatibility
- Responsive behavior on mobile devices
## VIII. Key Implementation Details
### Backend Pagination Query Design
- **Unified interface**: every storage backend implements the same pagination interface signature
- **Parameter validation**: strictly validate page number, page size, and sort parameters
- **Performance**: use native database pagination rather than paginating in the application layer
- **Error handling**: a unified error response format and exception handling mechanism
### Frontend State Management
- **Data separation**: manage current-page data and status counts separately
- **Smart refresh**: pick the refresh strategy based on document processing status
- **State sync**: keep UI state consistent with backend data
- **Error recovery**: automatically roll back to the previous state when an operation fails
### Pagination Control Component Design
- **Compact layout**: fits the limited space of the top action bar
- **Responsive design**: adapts its layout to different screen sizes
- **Interaction**: debouncing, loading states, and disabled-state management
- **Accessibility**: keyboard navigation and screen-reader support
### Database Index Optimization
- **Composite indexes**: combined indexes on workspace + status + sort_field
- **Covering indexes**: use covering indexes where possible to reduce extra lookups
- **Index monitoring**: regularly monitor index usage and query performance
- **Progressive tuning**: adjust the indexing strategy based on real-world usage

View File

@@ -23,7 +23,6 @@ classifiers = [
dependencies = [
"aiohttp",
"configparser",
"future",
"google-api-core>=2.0.0,<3.0.0",
"google-genai>=1.0.0,<2.0.0",
"json_repair",
@@ -47,18 +46,18 @@ pytest = [
"pytest>=8.4.2",
"pytest-asyncio>=1.2.0",
"pre-commit",
"ruff",
]
api = [
# Core dependencies
"aiohttp",
"configparser",
"future",
"json_repair",
"nano-vectordb",
"networkx",
"numpy>=1.24.0,<2.0.0",
"openai>=1.0.0,<3.0.0",
"openai>=2.0.0,<3.0.0",
"pandas>=2.0.0,<2.4.0",
"pipmaster",
"pydantic",
@@ -77,9 +76,9 @@ api = [
"distro",
"fastapi",
"httpcore",
"httpx",
"httpx>=0.28.1",
"jiter",
"passlib[bcrypt]",
"bcrypt>=4.0.0",
"psutil",
"PyJWT>=2.8.0,<3.0.0",
"python-jose[cryptography]",
@@ -115,7 +114,7 @@ offline-storage = [
offline-llm = [
# LLM provider dependencies
"openai>=1.0.0,<3.0.0",
"openai>=2.0.0,<3.0.0",
"anthropic>=0.18.0,<1.0.0",
"ollama>=0.1.0,<1.0.0",
"zhipuai>=2.0.0,<3.0.0",
@@ -131,15 +130,18 @@ offline = [
"lightrag-hku[api,offline-storage,offline-llm]",
]
evaluation = [
# Test framework dependencies (for evaluation)
test = [
"lightrag-hku[api]",
"pytest>=8.4.2",
"pytest-asyncio>=1.2.0",
"pre-commit",
# RAG evaluation dependencies (RAGAS framework)
"ruff",
]
evaluation = [
"lightrag-hku[api]",
"ragas>=0.3.7",
"datasets>=4.3.0",
"httpx>=0.28.1",
]
observability = [
@@ -151,6 +153,7 @@ observability = [
lightrag-server = "lightrag.api.lightrag_server:main"
lightrag-gunicorn = "lightrag.api.run_with_gunicorn:main"
lightrag-download-cache = "lightrag.tools.download_cache:main"
lightrag-clean-llmqc = "lightrag.tools.clean_llm_query_cache:main"
[project.urls]
Homepage = "https://github.com/HKUDS/LightRAG"

View File

@@ -14,6 +14,6 @@ google-api-core>=2.0.0,<3.0.0
google-genai>=1.0.0,<2.0.0
llama-index>=0.9.0,<1.0.0
ollama>=0.1.0,<1.0.0
openai>=1.0.0,<3.0.0
openai>=2.0.0,<3.0.0
voyageai>=0.2.0,<1.0.0
zhipuai>=2.0.0,<3.0.0

View File

@@ -19,7 +19,7 @@ google-genai>=1.0.0,<2.0.0
llama-index>=0.9.0,<1.0.0
neo4j>=5.0.0,<7.0.0
ollama>=0.1.0,<1.0.0
openai>=1.0.0,<3.0.0
openai>=2.0.0,<3.0.0
openpyxl>=3.0.0,<4.0.0
pycryptodome>=3.0.0,<4.0.0
pymilvus>=2.6.2,<3.0.0

1066
tests/test_chunking.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,113 @@
"""
Test for overlap_tokens validation to prevent infinite loop.
This test validates the fix for the bug where overlap_tokens >= max_tokens
causes an infinite loop in the chunking function.
"""
from lightrag.rerank import chunk_documents_for_rerank
class TestOverlapValidation:
"""Test suite for overlap_tokens validation"""
def test_overlap_greater_than_max_tokens(self):
"""Test that overlap_tokens > max_tokens is clamped and doesn't hang"""
documents = [" ".join([f"word{i}" for i in range(100)])]
# This should clamp overlap_tokens to 29 (max_tokens - 1)
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=30, overlap_tokens=32
)
# Should complete without hanging
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_overlap_equal_to_max_tokens(self):
"""Test that overlap_tokens == max_tokens is clamped and doesn't hang"""
documents = [" ".join([f"word{i}" for i in range(100)])]
# This should clamp overlap_tokens to 29 (max_tokens - 1)
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=30, overlap_tokens=30
)
# Should complete without hanging
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_overlap_slightly_less_than_max_tokens(self):
"""Test that overlap_tokens < max_tokens works normally"""
documents = [" ".join([f"word{i}" for i in range(100)])]
# This should work without clamping
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=30, overlap_tokens=29
)
# Should complete successfully
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_small_max_tokens_with_large_overlap(self):
"""Test edge case with very small max_tokens"""
documents = [" ".join([f"word{i}" for i in range(50)])]
# max_tokens=5, overlap_tokens=10 should clamp to 4
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=5, overlap_tokens=10
)
# Should complete without hanging
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)
def test_multiple_documents_with_invalid_overlap(self):
"""Test multiple documents with overlap_tokens >= max_tokens"""
documents = [
" ".join([f"word{i}" for i in range(50)]),
"short document",
" ".join([f"word{i}" for i in range(75)]),
]
# overlap_tokens > max_tokens
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=25, overlap_tokens=30
)
# Should complete successfully and chunk the long documents
assert len(chunked_docs) >= len(documents)
# Short document should not be chunked
assert "short document" in chunked_docs
def test_normal_operation_unaffected(self):
"""Test that normal cases continue to work correctly"""
documents = [
" ".join([f"word{i}" for i in range(100)]),
"short doc",
]
# Normal case: overlap_tokens (10) < max_tokens (50)
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=50, overlap_tokens=10
)
# Long document should be chunked, short one should not
assert len(chunked_docs) > 2 # At least 3 chunks (2 from long doc + 1 short)
assert "short doc" in chunked_docs
# Verify doc_indices maps correctly
assert doc_indices[-1] == 1 # Last chunk is from second document
def test_edge_case_max_tokens_one(self):
"""Test edge case where max_tokens=1"""
documents = [" ".join([f"word{i}" for i in range(20)])]
# max_tokens=1, overlap_tokens=5 should clamp to 0
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=1, overlap_tokens=5
)
# Should complete without hanging
assert len(chunked_docs) > 0
assert all(idx == 0 for idx in doc_indices)

View File

@@ -0,0 +1,564 @@
"""
Unit tests for rerank document chunking functionality.
Tests the chunk_documents_for_rerank and aggregate_chunk_scores functions
in lightrag/rerank.py to ensure proper document splitting and score aggregation.
"""
import pytest
from unittest.mock import Mock, patch, AsyncMock
from lightrag.rerank import (
chunk_documents_for_rerank,
aggregate_chunk_scores,
cohere_rerank,
)
class TestChunkDocumentsForRerank:
"""Test suite for chunk_documents_for_rerank function"""
def test_no_chunking_needed_for_short_docs(self):
"""Documents shorter than max_tokens should not be chunked"""
documents = [
"Short doc 1",
"Short doc 2",
"Short doc 3",
]
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=100, overlap_tokens=10
)
# No chunking should occur
assert len(chunked_docs) == 3
assert chunked_docs == documents
assert doc_indices == [0, 1, 2]
def test_chunking_with_character_fallback(self):
"""Test chunking falls back to character-based when tokenizer unavailable"""
# Create a very long document that exceeds character limit
long_doc = "a" * 2000 # 2000 characters
documents = [long_doc, "short doc"]
with patch("lightrag.utils.TiktokenTokenizer", side_effect=ImportError):
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents,
max_tokens=100, # 100 tokens = ~400 chars
overlap_tokens=10, # 10 tokens = ~40 chars
)
# First doc should be split into chunks, second doc stays whole
assert len(chunked_docs) > 2 # At least one chunk from first doc + second doc
assert chunked_docs[-1] == "short doc" # Last chunk is the short doc
# Verify doc_indices maps chunks to correct original document
assert doc_indices[-1] == 1 # Last chunk maps to document 1
def test_chunking_with_tiktoken_tokenizer(self):
"""Test chunking with actual tokenizer"""
# Create document with known token count
# Approximate: "word " = ~1 token, so 200 words ~ 200 tokens
long_doc = " ".join([f"word{i}" for i in range(200)])
documents = [long_doc, "short"]
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=50, overlap_tokens=10
)
# Long doc should be split, short doc should remain
assert len(chunked_docs) > 2
assert doc_indices[-1] == 1 # Last chunk is from second document
# Verify overlapping chunks contain overlapping content
if len(chunked_docs) > 2:
# Check that consecutive chunks from same doc have some overlap
for i in range(len(doc_indices) - 1):
if doc_indices[i] == doc_indices[i + 1] == 0:
# Both chunks from first doc, should have overlap
chunk1_words = chunked_docs[i].split()
chunk2_words = chunked_docs[i + 1].split()
# At least one word should be common due to overlap
assert any(word in chunk2_words for word in chunk1_words[-5:])
def test_empty_documents(self):
"""Test handling of empty document list"""
documents = []
chunked_docs, doc_indices = chunk_documents_for_rerank(documents)
assert chunked_docs == []
assert doc_indices == []
def test_single_document_chunking(self):
"""Test chunking of a single long document"""
# Create document with ~100 tokens
long_doc = " ".join([f"token{i}" for i in range(100)])
documents = [long_doc]
chunked_docs, doc_indices = chunk_documents_for_rerank(
documents, max_tokens=30, overlap_tokens=5
)
# Should create multiple chunks
assert len(chunked_docs) > 1
# All chunks should map to document 0
assert all(idx == 0 for idx in doc_indices)
class TestAggregateChunkScores:
"""Test suite for aggregate_chunk_scores function"""
def test_no_chunking_simple_aggregation(self):
"""Test aggregation when no chunking occurred (1:1 mapping)"""
chunk_results = [
{"index": 0, "relevance_score": 0.9},
{"index": 1, "relevance_score": 0.7},
{"index": 2, "relevance_score": 0.5},
]
doc_indices = [0, 1, 2] # 1:1 mapping
num_original_docs = 3
aggregated = aggregate_chunk_scores(
chunk_results, doc_indices, num_original_docs, aggregation="max"
)
# Results should be sorted by score
assert len(aggregated) == 3
assert aggregated[0]["index"] == 0
assert aggregated[0]["relevance_score"] == 0.9
assert aggregated[1]["index"] == 1
assert aggregated[1]["relevance_score"] == 0.7
assert aggregated[2]["index"] == 2
assert aggregated[2]["relevance_score"] == 0.5
def test_max_aggregation_with_chunks(self):
"""Test max aggregation strategy with multiple chunks per document"""
# 5 chunks: first 3 from doc 0, last 2 from doc 1
chunk_results = [
{"index": 0, "relevance_score": 0.5},
{"index": 1, "relevance_score": 0.8},
{"index": 2, "relevance_score": 0.6},
{"index": 3, "relevance_score": 0.7},
{"index": 4, "relevance_score": 0.4},
]
doc_indices = [0, 0, 0, 1, 1]
num_original_docs = 2
aggregated = aggregate_chunk_scores(
chunk_results, doc_indices, num_original_docs, aggregation="max"
)
# Should take max score for each document
assert len(aggregated) == 2
assert aggregated[0]["index"] == 0
assert aggregated[0]["relevance_score"] == 0.8 # max of 0.5, 0.8, 0.6
assert aggregated[1]["index"] == 1
assert aggregated[1]["relevance_score"] == 0.7 # max of 0.7, 0.4
def test_mean_aggregation_with_chunks(self):
"""Test mean aggregation strategy"""
chunk_results = [
{"index": 0, "relevance_score": 0.6},
{"index": 1, "relevance_score": 0.8},
{"index": 2, "relevance_score": 0.4},
]
doc_indices = [0, 0, 1] # First two chunks from doc 0, last from doc 1
num_original_docs = 2
aggregated = aggregate_chunk_scores(
chunk_results, doc_indices, num_original_docs, aggregation="mean"
)
assert len(aggregated) == 2
assert aggregated[0]["index"] == 0
assert aggregated[0]["relevance_score"] == pytest.approx(0.7) # (0.6 + 0.8) / 2
assert aggregated[1]["index"] == 1
assert aggregated[1]["relevance_score"] == 0.4
def test_first_aggregation_with_chunks(self):
"""Test first aggregation strategy"""
chunk_results = [
{"index": 0, "relevance_score": 0.6},
{"index": 1, "relevance_score": 0.8},
{"index": 2, "relevance_score": 0.4},
]
doc_indices = [0, 0, 1]
num_original_docs = 2
aggregated = aggregate_chunk_scores(
chunk_results, doc_indices, num_original_docs, aggregation="first"
)
assert len(aggregated) == 2
# First should use first score seen for each doc
assert aggregated[0]["index"] == 0
assert aggregated[0]["relevance_score"] == 0.6 # First score for doc 0
assert aggregated[1]["index"] == 1
assert aggregated[1]["relevance_score"] == 0.4
def test_empty_chunk_results(self):
"""Test handling of empty results"""
aggregated = aggregate_chunk_scores([], [], 3, aggregation="max")
assert aggregated == []
def test_documents_with_no_scores(self):
"""Test when some documents have no chunks/scores"""
chunk_results = [
{"index": 0, "relevance_score": 0.9},
{"index": 1, "relevance_score": 0.7},
]
doc_indices = [0, 0] # Both chunks from document 0
num_original_docs = 3 # But we have 3 documents total
aggregated = aggregate_chunk_scores(
chunk_results, doc_indices, num_original_docs, aggregation="max"
)
# Only doc 0 should appear in results
assert len(aggregated) == 1
assert aggregated[0]["index"] == 0
def test_unknown_aggregation_strategy(self):
"""Test that unknown strategy falls back to max"""
chunk_results = [
{"index": 0, "relevance_score": 0.6},
{"index": 1, "relevance_score": 0.8},
]
doc_indices = [0, 0]
num_original_docs = 1
# Use invalid strategy
aggregated = aggregate_chunk_scores(
chunk_results, doc_indices, num_original_docs, aggregation="invalid"
)
# Should fall back to max
assert aggregated[0]["relevance_score"] == 0.8
@pytest.mark.offline
class TestTopNWithChunking:
"""Tests for top_n behavior when chunking is enabled (Bug fix verification)"""
@pytest.mark.asyncio
async def test_top_n_limits_documents_not_chunks(self):
"""
Test that top_n correctly limits documents (not chunks) when chunking is enabled.
Bug scenario: 10 docs expand to 50 chunks. With old behavior, top_n=5 would
return scores for only 5 chunks (possibly all from 1-2 docs). After aggregation,
fewer than 5 documents would be returned.
Fixed behavior: top_n=5 should return exactly 5 documents after aggregation.
"""
# Setup: 5 documents, each producing multiple chunks when chunked
# Using small max_tokens to force chunking
long_docs = [" ".join([f"doc{i}_word{j}" for j in range(50)]) for i in range(5)]
query = "test query"
# First, determine how many chunks will be created by actual chunking
_, doc_indices = chunk_documents_for_rerank(
long_docs, max_tokens=50, overlap_tokens=10
)
num_chunks = len(doc_indices)
# Mock API returns scores for ALL chunks (simulating disabled API-level top_n)
# Give different scores to ensure doc 0 gets highest, doc 1 second, etc.
# Assign scores based on original document index (lower doc index = higher score)
mock_chunk_scores = []
for i in range(num_chunks):
original_doc = doc_indices[i]
# Higher score for lower doc index, with small variation per chunk
base_score = 0.9 - (original_doc * 0.1)
mock_chunk_scores.append({"index": i, "relevance_score": base_score})
mock_response = Mock()
mock_response.status = 200
mock_response.json = AsyncMock(return_value={"results": mock_chunk_scores})
mock_response.request_info = None
mock_response.history = None
mock_response.headers = {}
mock_response.__aenter__ = AsyncMock(return_value=mock_response)
mock_response.__aexit__ = AsyncMock(return_value=None)
mock_session = Mock()
mock_session.post = Mock(return_value=mock_response)
mock_session.__aenter__ = AsyncMock(return_value=mock_session)
mock_session.__aexit__ = AsyncMock(return_value=None)
with patch("lightrag.rerank.aiohttp.ClientSession", return_value=mock_session):
result = await cohere_rerank(
query=query,
documents=long_docs,
api_key="test-key",
base_url="http://test.com/rerank",
enable_chunking=True,
max_tokens_per_doc=50, # Match chunking above
top_n=3, # Request top 3 documents
)
# Verify: should get exactly 3 documents (not unlimited chunks)
assert len(result) == 3
# All results should have valid document indices (0-4)
assert all(0 <= r["index"] < 5 for r in result)
# Results should be sorted by score (descending)
assert all(
result[i]["relevance_score"] >= result[i + 1]["relevance_score"]
for i in range(len(result) - 1)
)
# The top 3 docs should be 0, 1, 2 (highest scores)
result_indices = [r["index"] for r in result]
assert set(result_indices) == {0, 1, 2}
@pytest.mark.asyncio
async def test_api_receives_no_top_n_when_chunking_enabled(self):
"""
Test that the API request does NOT include top_n when chunking is enabled.
This ensures all chunk scores are retrieved for proper aggregation.
"""
documents = [" ".join([f"word{i}" for i in range(100)]), "short doc"]
query = "test query"
captured_payload = {}
mock_response = Mock()
mock_response.status = 200
mock_response.json = AsyncMock(
return_value={
"results": [
{"index": 0, "relevance_score": 0.9},
{"index": 1, "relevance_score": 0.8},
{"index": 2, "relevance_score": 0.7},
]
}
)
mock_response.request_info = None
mock_response.history = None
mock_response.headers = {}
mock_response.__aenter__ = AsyncMock(return_value=mock_response)
mock_response.__aexit__ = AsyncMock(return_value=None)
def capture_post(*args, **kwargs):
captured_payload.update(kwargs.get("json", {}))
return mock_response
mock_session = Mock()
mock_session.post = Mock(side_effect=capture_post)
mock_session.__aenter__ = AsyncMock(return_value=mock_session)
mock_session.__aexit__ = AsyncMock(return_value=None)
with patch("lightrag.rerank.aiohttp.ClientSession", return_value=mock_session):
await cohere_rerank(
query=query,
documents=documents,
api_key="test-key",
base_url="http://test.com/rerank",
enable_chunking=True,
max_tokens_per_doc=30,
top_n=1, # User wants top 1 document
)
# Verify: API payload should NOT have top_n (disabled for chunking)
assert "top_n" not in captured_payload
@pytest.mark.asyncio
async def test_top_n_not_modified_when_chunking_disabled(self):
"""
Test that top_n is passed through to API when chunking is disabled.
"""
documents = ["doc1", "doc2"]
query = "test query"
captured_payload = {}
mock_response = Mock()
mock_response.status = 200
mock_response.json = AsyncMock(
return_value={
"results": [
{"index": 0, "relevance_score": 0.9},
]
}
)
mock_response.request_info = None
mock_response.history = None
mock_response.headers = {}
mock_response.__aenter__ = AsyncMock(return_value=mock_response)
mock_response.__aexit__ = AsyncMock(return_value=None)
def capture_post(*args, **kwargs):
captured_payload.update(kwargs.get("json", {}))
return mock_response
mock_session = Mock()
mock_session.post = Mock(side_effect=capture_post)
mock_session.__aenter__ = AsyncMock(return_value=mock_session)
mock_session.__aexit__ = AsyncMock(return_value=None)
with patch("lightrag.rerank.aiohttp.ClientSession", return_value=mock_session):
await cohere_rerank(
query=query,
documents=documents,
api_key="test-key",
base_url="http://test.com/rerank",
enable_chunking=False, # Chunking disabled
top_n=1,
)
# Verify: API payload should have top_n when chunking is disabled
assert captured_payload.get("top_n") == 1
@pytest.mark.offline
class TestCohereRerankChunking:
"""Integration tests for cohere_rerank with chunking enabled"""
@pytest.mark.asyncio
async def test_cohere_rerank_with_chunking_disabled(self):
"""Test that chunking can be disabled"""
documents = ["doc1", "doc2"]
query = "test query"
# Mock the generic_rerank_api
with patch(
"lightrag.rerank.generic_rerank_api", new_callable=AsyncMock
) as mock_api:
mock_api.return_value = [
{"index": 0, "relevance_score": 0.9},
{"index": 1, "relevance_score": 0.7},
]
result = await cohere_rerank(
query=query,
documents=documents,
api_key="test-key",
enable_chunking=False,
max_tokens_per_doc=100,
)
# Verify generic_rerank_api was called with correct parameters
mock_api.assert_called_once()
call_kwargs = mock_api.call_args[1]
assert call_kwargs["enable_chunking"] is False
assert call_kwargs["max_tokens_per_doc"] == 100
# Result should mirror mocked scores
assert len(result) == 2
assert result[0]["index"] == 0
assert result[0]["relevance_score"] == 0.9
assert result[1]["index"] == 1
assert result[1]["relevance_score"] == 0.7
@pytest.mark.asyncio
async def test_cohere_rerank_with_chunking_enabled(self):
"""Test that chunking parameters are passed through"""
documents = ["doc1", "doc2"]
query = "test query"
with patch(
"lightrag.rerank.generic_rerank_api", new_callable=AsyncMock
) as mock_api:
mock_api.return_value = [
{"index": 0, "relevance_score": 0.9},
{"index": 1, "relevance_score": 0.7},
]
result = await cohere_rerank(
query=query,
documents=documents,
api_key="test-key",
enable_chunking=True,
max_tokens_per_doc=480,
)
# Verify parameters were passed
call_kwargs = mock_api.call_args[1]
assert call_kwargs["enable_chunking"] is True
assert call_kwargs["max_tokens_per_doc"] == 480
# Result should mirror mocked scores
assert len(result) == 2
assert result[0]["index"] == 0
assert result[0]["relevance_score"] == 0.9
assert result[1]["index"] == 1
assert result[1]["relevance_score"] == 0.7
@pytest.mark.asyncio
async def test_cohere_rerank_default_parameters(self):
"""Test default parameter values for cohere_rerank"""
documents = ["doc1"]
query = "test"
with patch(
"lightrag.rerank.generic_rerank_api", new_callable=AsyncMock
) as mock_api:
mock_api.return_value = [{"index": 0, "relevance_score": 0.9}]
result = await cohere_rerank(
query=query, documents=documents, api_key="test-key"
)
# Verify default values
call_kwargs = mock_api.call_args[1]
assert call_kwargs["enable_chunking"] is False
assert call_kwargs["max_tokens_per_doc"] == 4096
assert call_kwargs["model"] == "rerank-v3.5"
# Result should mirror mocked scores
assert len(result) == 1
assert result[0]["index"] == 0
assert result[0]["relevance_score"] == 0.9
@pytest.mark.offline
class TestEndToEndChunking:
"""End-to-end tests for chunking workflow"""
@pytest.mark.asyncio
async def test_end_to_end_chunking_workflow(self):
"""Test complete chunking workflow from documents to aggregated results"""
# Create documents where first one needs chunking
long_doc = " ".join([f"word{i}" for i in range(100)])
documents = [long_doc, "short doc"]
query = "test query"
# Mock the HTTP call inside generic_rerank_api
mock_response = Mock()
mock_response.status = 200
mock_response.json = AsyncMock(
return_value={
"results": [
{"index": 0, "relevance_score": 0.5}, # chunk 0 from doc 0
{"index": 1, "relevance_score": 0.8}, # chunk 1 from doc 0
{"index": 2, "relevance_score": 0.6}, # chunk 2 from doc 0
{"index": 3, "relevance_score": 0.7}, # doc 1 (short)
]
}
)
mock_response.request_info = None
mock_response.history = None
mock_response.headers = {}
# Make mock_response an async context manager (for `async with session.post() as response`)
mock_response.__aenter__ = AsyncMock(return_value=mock_response)
mock_response.__aexit__ = AsyncMock(return_value=None)
mock_session = Mock()
# session.post() must return an async context manager, so return mock_response (configured as one above)
mock_session.post = Mock(return_value=mock_response)
mock_session.__aenter__ = AsyncMock(return_value=mock_session)
mock_session.__aexit__ = AsyncMock(return_value=None)
with patch("lightrag.rerank.aiohttp.ClientSession", return_value=mock_session):
result = await cohere_rerank(
query=query,
documents=documents,
api_key="test-key",
base_url="http://test.com/rerank",
enable_chunking=True,
max_tokens_per_doc=30, # Force chunking of long doc
)
# Should get at most one result per original document (len <= 2 here):
# the long doc's chunks are aggregated back into a single entry
assert len(result) <= len(documents)
# Results should be sorted by score
assert all(
result[i]["relevance_score"] >= result[i + 1]["relevance_score"]
for i in range(len(result) - 1)
)

72
uv.lock generated
View File

@@ -1334,15 +1334,6 @@ http = [
{ name = "aiohttp" },
]
[[package]]
name = "future"
version = "1.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a7/b2/4140c69c6a66432916b26158687e821ba631a4c9273c474343badf84d3ba/future-1.0.0.tar.gz", hash = "sha256:bd2968309307861edae1458a4f8a4f3598c03be43b97521076aebf5d94c07b05", size = 1228490, upload-time = "2024-02-21T11:52:38.461Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/da/71/ae30dadffc90b9006d77af76b393cb9dfbfc9629f339fc1574a1c52e6806/future-1.0.0-py3-none-any.whl", hash = "sha256:929292d34f5872e70396626ef385ec22355a1fae8ad29e1a734c3e43f9fbc216", size = 491326, upload-time = "2024-02-21T11:52:35.956Z" },
]
[[package]]
name = "gitdb"
version = "4.0.12"
@@ -2542,7 +2533,6 @@ source = { editable = "." }
dependencies = [
{ name = "aiohttp" },
{ name = "configparser" },
{ name = "future" },
{ name = "google-api-core" },
{ name = "google-genai" },
{ name = "json-repair" },
@@ -2567,10 +2557,10 @@ api = [
{ name = "aiohttp" },
{ name = "ascii-colors" },
{ name = "asyncpg" },
{ name = "bcrypt" },
{ name = "configparser" },
{ name = "distro" },
{ name = "fastapi" },
{ name = "future" },
{ name = "google-api-core" },
{ name = "google-genai" },
{ name = "gunicorn" },
@@ -2585,7 +2575,6 @@ api = [
{ name = "openai" },
{ name = "openpyxl" },
{ name = "pandas" },
{ name = "passlib", extra = ["bcrypt"] },
{ name = "pipmaster" },
{ name = "psutil" },
{ name = "pycryptodome" },
@@ -2615,6 +2604,7 @@ evaluation = [
{ name = "pytest" },
{ name = "pytest-asyncio" },
{ name = "ragas" },
{ name = "ruff" },
]
observability = [
{ name = "langfuse" },
@@ -2626,10 +2616,10 @@ offline = [
{ name = "anthropic" },
{ name = "ascii-colors" },
{ name = "asyncpg" },
{ name = "bcrypt" },
{ name = "configparser" },
{ name = "distro" },
{ name = "fastapi" },
{ name = "future" },
{ name = "google-api-core" },
{ name = "google-genai" },
{ name = "gunicorn" },
@@ -2647,7 +2637,6 @@ offline = [
{ name = "openai" },
{ name = "openpyxl" },
{ name = "pandas" },
{ name = "passlib", extra = ["bcrypt"] },
{ name = "pipmaster" },
{ name = "psutil" },
{ name = "pycryptodome" },
@@ -2700,6 +2689,7 @@ pytest = [
{ name = "pre-commit" },
{ name = "pytest" },
{ name = "pytest-asyncio" },
{ name = "ruff" },
]
[package.metadata]
@@ -2712,14 +2702,13 @@ requires-dist = [
{ name = "ascii-colors", marker = "extra == 'api'" },
{ name = "asyncpg", marker = "extra == 'api'" },
{ name = "asyncpg", marker = "extra == 'offline-storage'", specifier = ">=0.29.0,<1.0.0" },
{ name = "bcrypt", marker = "extra == 'api'", specifier = ">=4.0.0" },
{ name = "configparser" },
{ name = "configparser", marker = "extra == 'api'" },
{ name = "datasets", marker = "extra == 'evaluation'", specifier = ">=4.3.0" },
{ name = "distro", marker = "extra == 'api'" },
{ name = "docling", marker = "sys_platform != 'darwin' and extra == 'docling'", specifier = ">=2.0.0,<3.0.0" },
{ name = "fastapi", marker = "extra == 'api'" },
{ name = "future" },
{ name = "future", marker = "extra == 'api'" },
{ name = "google-api-core", specifier = ">=2.0.0,<3.0.0" },
{ name = "google-api-core", marker = "extra == 'api'", specifier = ">=2.0.0,<3.0.0" },
{ name = "google-api-core", marker = "extra == 'offline-llm'", specifier = ">=2.0.0,<3.0.0" },
@@ -2735,7 +2724,6 @@ requires-dist = [
{ name = "json-repair", marker = "extra == 'api'" },
{ name = "langfuse", marker = "extra == 'observability'", specifier = ">=3.8.1" },
{ name = "lightrag-hku", extras = ["api", "offline-llm", "offline-storage"], marker = "extra == 'offline'" },
{ name = "lightrag-hku", extras = ["pytest"], marker = "extra == 'evaluation'" },
{ name = "llama-index", marker = "extra == 'offline-llm'", specifier = ">=0.9.0,<1.0.0" },
{ name = "nano-vectordb" },
{ name = "nano-vectordb", marker = "extra == 'api'" },
@@ -2745,14 +2733,14 @@ requires-dist = [
{ name = "numpy", specifier = ">=1.24.0,<2.0.0" },
{ name = "numpy", marker = "extra == 'api'", specifier = ">=1.24.0,<2.0.0" },
{ name = "ollama", marker = "extra == 'offline-llm'", specifier = ">=0.1.0,<1.0.0" },
{ name = "openai", marker = "extra == 'api'", specifier = ">=1.0.0,<3.0.0" },
{ name = "openai", marker = "extra == 'offline-llm'", specifier = ">=1.0.0,<3.0.0" },
{ name = "openai", marker = "extra == 'api'", specifier = ">=2.0.0,<3.0.0" },
{ name = "openai", marker = "extra == 'offline-llm'", specifier = ">=2.0.0,<3.0.0" },
{ name = "openpyxl", marker = "extra == 'api'", specifier = ">=3.0.0,<4.0.0" },
{ name = "pandas", specifier = ">=2.0.0,<2.4.0" },
{ name = "pandas", marker = "extra == 'api'", specifier = ">=2.0.0,<2.4.0" },
{ name = "passlib", extras = ["bcrypt"], marker = "extra == 'api'" },
{ name = "pipmaster" },
{ name = "pipmaster", marker = "extra == 'api'" },
{ name = "pre-commit", marker = "extra == 'evaluation'" },
{ name = "pre-commit", marker = "extra == 'pytest'" },
{ name = "psutil", marker = "extra == 'api'" },
{ name = "pycryptodome", marker = "extra == 'api'", specifier = ">=3.0.0,<4.0.0" },
@@ -2764,7 +2752,9 @@ requires-dist = [
{ name = "pypdf", marker = "extra == 'api'", specifier = ">=6.1.0" },
{ name = "pypinyin" },
{ name = "pypinyin", marker = "extra == 'api'" },
{ name = "pytest", marker = "extra == 'evaluation'", specifier = ">=8.4.2" },
{ name = "pytest", marker = "extra == 'pytest'", specifier = ">=8.4.2" },
{ name = "pytest-asyncio", marker = "extra == 'evaluation'", specifier = ">=1.2.0" },
{ name = "pytest-asyncio", marker = "extra == 'pytest'", specifier = ">=1.2.0" },
{ name = "python-docx", marker = "extra == 'api'", specifier = ">=0.8.11,<2.0.0" },
{ name = "python-dotenv" },
@@ -2776,6 +2766,8 @@ requires-dist = [
{ name = "qdrant-client", marker = "extra == 'offline-storage'", specifier = ">=1.11.0,<2.0.0" },
{ name = "ragas", marker = "extra == 'evaluation'", specifier = ">=0.3.7" },
{ name = "redis", marker = "extra == 'offline-storage'", specifier = ">=5.0.0,<8.0.0" },
{ name = "ruff", marker = "extra == 'evaluation'" },
{ name = "ruff", marker = "extra == 'pytest'" },
{ name = "setuptools" },
{ name = "setuptools", marker = "extra == 'api'" },
{ name = "tenacity" },
@@ -4104,20 +4096,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436, upload-time = "2024-09-20T13:09:48.112Z" },
]
[[package]]
name = "passlib"
version = "1.7.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b6/06/9da9ee59a67fae7761aab3ccc84fa4f3f33f125b370f1ccdb915bf967c11/passlib-1.7.4.tar.gz", hash = "sha256:defd50f72b65c5402ab2c573830a6978e5f202ad0d984793c8dde2c4152ebe04", size = 689844, upload-time = "2020-10-08T19:00:52.121Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3b/a4/ab6b7589382ca3df236e03faa71deac88cae040af60c071a78d254a62172/passlib-1.7.4-py2.py3-none-any.whl", hash = "sha256:aa6bca462b8d8bda89c70b382f0c298a20b5560af6cbfa2dce410c0a2fb669f1", size = 525554, upload-time = "2020-10-08T19:00:49.856Z" },
]
[package.optional-dependencies]
bcrypt = [
{ name = "bcrypt" },
]
[[package]]
name = "pillow"
version = "11.3.0"
@@ -5635,6 +5613,32 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3f/50/0a9e7e7afe7339bd5e36911f0ceb15fed51945836ed803ae5afd661057fd/rtree-1.4.1-py3-none-win_arm64.whl", hash = "sha256:3d46f55729b28138e897ffef32f7ce93ac335cb67f9120125ad3742a220800f0", size = 355253, upload-time = "2025-08-13T19:32:00.296Z" },
]
[[package]]
name = "ruff"
version = "0.14.6"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/52/f0/62b5a1a723fe183650109407fa56abb433b00aa1c0b9ba555f9c4efec2c6/ruff-0.14.6.tar.gz", hash = "sha256:6f0c742ca6a7783a736b867a263b9a7a80a45ce9bee391eeda296895f1b4e1cc", size = 5669501, upload-time = "2025-11-21T14:26:17.903Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/67/d2/7dd544116d107fffb24a0064d41a5d2ed1c9d6372d142f9ba108c8e39207/ruff-0.14.6-py3-none-linux_armv6l.whl", hash = "sha256:d724ac2f1c240dbd01a2ae98db5d1d9a5e1d9e96eba999d1c48e30062df578a3", size = 13326119, upload-time = "2025-11-21T14:25:24.2Z" },
{ url = "https://files.pythonhosted.org/packages/36/6a/ad66d0a3315d6327ed6b01f759d83df3c4d5f86c30462121024361137b6a/ruff-0.14.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:9f7539ea257aa4d07b7ce87aed580e485c40143f2473ff2f2b75aee003186004", size = 13526007, upload-time = "2025-11-21T14:25:26.906Z" },
{ url = "https://files.pythonhosted.org/packages/a3/9d/dae6db96df28e0a15dea8e986ee393af70fc97fd57669808728080529c37/ruff-0.14.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:7f6007e55b90a2a7e93083ba48a9f23c3158c433591c33ee2e99a49b889c6332", size = 12676572, upload-time = "2025-11-21T14:25:29.826Z" },
{ url = "https://files.pythonhosted.org/packages/76/a4/f319e87759949062cfee1b26245048e92e2acce900ad3a909285f9db1859/ruff-0.14.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a8e7b9d73d8728b68f632aa8e824ef041d068d231d8dbc7808532d3629a6bef", size = 13140745, upload-time = "2025-11-21T14:25:32.788Z" },
{ url = "https://files.pythonhosted.org/packages/95/d3/248c1efc71a0a8ed4e8e10b4b2266845d7dfc7a0ab64354afe049eaa1310/ruff-0.14.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d50d45d4553a3ebcbd33e7c5e0fe6ca4aafd9a9122492de357205c2c48f00775", size = 13076486, upload-time = "2025-11-21T14:25:35.601Z" },
{ url = "https://files.pythonhosted.org/packages/a5/19/b68d4563fe50eba4b8c92aa842149bb56dd24d198389c0ed12e7faff4f7d/ruff-0.14.6-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:118548dd121f8a21bfa8ab2c5b80e5b4aed67ead4b7567790962554f38e598ce", size = 13727563, upload-time = "2025-11-21T14:25:38.514Z" },
{ url = "https://files.pythonhosted.org/packages/47/ac/943169436832d4b0e867235abbdb57ce3a82367b47e0280fa7b4eabb7593/ruff-0.14.6-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:57256efafbfefcb8748df9d1d766062f62b20150691021f8ab79e2d919f7c11f", size = 15199755, upload-time = "2025-11-21T14:25:41.516Z" },
{ url = "https://files.pythonhosted.org/packages/c9/b9/288bb2399860a36d4bb0541cb66cce3c0f4156aaff009dc8499be0c24bf2/ruff-0.14.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ff18134841e5c68f8e5df1999a64429a02d5549036b394fafbe410f886e1989d", size = 14850608, upload-time = "2025-11-21T14:25:44.428Z" },
{ url = "https://files.pythonhosted.org/packages/ee/b1/a0d549dd4364e240f37e7d2907e97ee80587480d98c7799d2d8dc7a2f605/ruff-0.14.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:29c4b7ec1e66a105d5c27bd57fa93203637d66a26d10ca9809dc7fc18ec58440", size = 14118754, upload-time = "2025-11-21T14:25:47.214Z" },
{ url = "https://files.pythonhosted.org/packages/13/ac/9b9fe63716af8bdfddfacd0882bc1586f29985d3b988b3c62ddce2e202c3/ruff-0.14.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:167843a6f78680746d7e226f255d920aeed5e4ad9c03258094a2d49d3028b105", size = 13949214, upload-time = "2025-11-21T14:25:50.002Z" },
{ url = "https://files.pythonhosted.org/packages/12/27/4dad6c6a77fede9560b7df6802b1b697e97e49ceabe1f12baf3ea20862e9/ruff-0.14.6-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:16a33af621c9c523b1ae006b1b99b159bf5ac7e4b1f20b85b2572455018e0821", size = 14106112, upload-time = "2025-11-21T14:25:52.841Z" },
{ url = "https://files.pythonhosted.org/packages/6a/db/23e322d7177873eaedea59a7932ca5084ec5b7e20cb30f341ab594130a71/ruff-0.14.6-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:1432ab6e1ae2dc565a7eea707d3b03a0c234ef401482a6f1621bc1f427c2ff55", size = 13035010, upload-time = "2025-11-21T14:25:55.536Z" },
{ url = "https://files.pythonhosted.org/packages/a8/9c/20e21d4d69dbb35e6a1df7691e02f363423658a20a2afacf2a2c011800dc/ruff-0.14.6-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:4c55cfbbe7abb61eb914bfd20683d14cdfb38a6d56c6c66efa55ec6570ee4e71", size = 13054082, upload-time = "2025-11-21T14:25:58.625Z" },
{ url = "https://files.pythonhosted.org/packages/66/25/906ee6a0464c3125c8d673c589771a974965c2be1a1e28b5c3b96cb6ef88/ruff-0.14.6-py3-none-musllinux_1_2_i686.whl", hash = "sha256:efea3c0f21901a685fff4befda6d61a1bf4cb43de16da87e8226a281d614350b", size = 13303354, upload-time = "2025-11-21T14:26:01.816Z" },
{ url = "https://files.pythonhosted.org/packages/4c/58/60577569e198d56922b7ead07b465f559002b7b11d53f40937e95067ca1c/ruff-0.14.6-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:344d97172576d75dc6afc0e9243376dbe1668559c72de1864439c4fc95f78185", size = 14054487, upload-time = "2025-11-21T14:26:05.058Z" },
{ url = "https://files.pythonhosted.org/packages/67/0b/8e4e0639e4cc12547f41cb771b0b44ec8225b6b6a93393176d75fe6f7d40/ruff-0.14.6-py3-none-win32.whl", hash = "sha256:00169c0c8b85396516fdd9ce3446c7ca20c2a8f90a77aa945ba6b8f2bfe99e85", size = 13013361, upload-time = "2025-11-21T14:26:08.152Z" },
{ url = "https://files.pythonhosted.org/packages/fb/02/82240553b77fd1341f80ebb3eaae43ba011c7a91b4224a9f317d8e6591af/ruff-0.14.6-py3-none-win_amd64.whl", hash = "sha256:390e6480c5e3659f8a4c8d6a0373027820419ac14fa0d2713bd8e6c3e125b8b9", size = 14432087, upload-time = "2025-11-21T14:26:10.891Z" },
{ url = "https://files.pythonhosted.org/packages/a5/1f/93f9b0fad9470e4c829a5bb678da4012f0c710d09331b860ee555216f4ea/ruff-0.14.6-py3-none-win_arm64.whl", hash = "sha256:d43c81fbeae52cfa8728d8766bbf46ee4298c888072105815b392da70ca836b2", size = 13520930, upload-time = "2025-11-21T14:26:13.951Z" },
]
[[package]]
name = "s3transfer"
version = "0.14.0"