From 72b29659c94e0dc9874ce34531ec383547e98e3f Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 29 Oct 2025 13:33:21 +0800 Subject: [PATCH 1/5] Fix worker process cleanup to prevent shared resource conflicts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Add worker_exit hook in gunicorn config • Add shutdown_manager parameter in finalize_share_data of share_storage • Prevent Manager shutdown in workers • Remove custom signal handlers --- lightrag/api/gunicorn_config.py | 21 +++++++++++++++++++++ lightrag/api/lightrag_server.py | 25 ++++--------------------- lightrag/api/run_with_gunicorn.py | 24 +++++------------------- lightrag/kg/shared_storage.py | 10 +++++++--- 4 files changed, 37 insertions(+), 43 deletions(-) diff --git a/lightrag/api/gunicorn_config.py b/lightrag/api/gunicorn_config.py index 7b25b5b9..a19a0b39 100644 --- a/lightrag/api/gunicorn_config.py +++ b/lightrag/api/gunicorn_config.py @@ -162,3 +162,24 @@ def post_fork(server, worker): uvicorn_error_logger.handlers = [] uvicorn_error_logger.setLevel(logging.CRITICAL) uvicorn_error_logger.propagate = False + + +def worker_exit(server, worker): + """ + Executed when a worker is about to exit. + + This is called for each worker process when it exits. We should only + clean up worker-local resources here, NOT the shared Manager. + The Manager should only be shut down by the main process in on_exit(). 
+ """ + print("=" * 80) + print(f"GUNICORN WORKER PROCESS: Shutting down worker {worker.pid}") + print(f"Process ID: {os.getpid()}") + print("=" * 80) + + # Clean up worker-local resources without shutting down the Manager + # Pass shutdown_manager=False to prevent Manager shutdown + finalize_share_data(shutdown_manager=False) + + print(f"Worker {worker.pid} cleanup complete") + print("=" * 80) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index 4dd5edaa..cee831d0 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -12,7 +12,6 @@ from fastapi.openapi.docs import ( import os import logging import logging.config -import signal import sys import uvicorn import pipmaster as pm @@ -82,24 +81,6 @@ config.read("config.ini") auth_configured = bool(auth_handler.accounts) -def setup_signal_handlers(): - """Setup signal handlers for graceful shutdown""" - - def signal_handler(sig, frame): - print(f"\n\nReceived signal {sig}, shutting down gracefully...") - print(f"Process ID: {os.getpid()}") - - # Release shared resources - finalize_share_data() - - # Exit with success status - sys.exit(0) - - # Register signal handlers - signal.signal(signal.SIGINT, signal_handler) # Ctrl+C - signal.signal(signal.SIGTERM, signal_handler) # kill command - - class LLMConfigCache: """Smart LLM and Embedding configuration cache class""" @@ -1108,8 +1089,10 @@ def main(): update_uvicorn_mode_config() display_splash_screen(global_args) - # Setup signal handlers for graceful shutdown - setup_signal_handlers() + # Note: Signal handlers are NOT registered here because: + # - Uvicorn has built-in signal handling that properly calls lifespan shutdown + # - Custom signal handlers can interfere with uvicorn's graceful shutdown + # - Cleanup is handled by the lifespan context manager's finally block # Create application instance directly instead of using factory function app = create_app(global_args) diff --git 
a/lightrag/api/run_with_gunicorn.py b/lightrag/api/run_with_gunicorn.py index 929db019..5ad0c5b5 100644 --- a/lightrag/api/run_with_gunicorn.py +++ b/lightrag/api/run_with_gunicorn.py @@ -5,12 +5,11 @@ Start LightRAG server with Gunicorn import os import sys -import signal import pipmaster as pm from lightrag.api.utils_api import display_splash_screen, check_env_file from lightrag.api.config import global_args from lightrag.utils import get_env_value -from lightrag.kg.shared_storage import initialize_share_data, finalize_share_data +from lightrag.kg.shared_storage import initialize_share_data from lightrag.constants import ( DEFAULT_WOKERS, @@ -34,20 +33,6 @@ def check_and_install_dependencies(): print(f"{package} installed successfully") -# Signal handler for graceful shutdown -def signal_handler(sig, frame): - print("\n\n" + "=" * 80) - print("RECEIVED TERMINATION SIGNAL") - print(f"Process ID: {os.getpid()}") - print("=" * 80 + "\n") - - # Release shared resources - finalize_share_data() - - # Exit with success status - sys.exit(0) - - def main(): # Check .env file if not check_env_file(): @@ -56,9 +41,10 @@ def main(): # Check and install dependencies check_and_install_dependencies() - # Register signal handlers for graceful shutdown - signal.signal(signal.SIGINT, signal_handler) # Ctrl+C - signal.signal(signal.SIGTERM, signal_handler) # kill command + # Note: Signal handlers are NOT registered here because: + # - Worker cleanup is handled by gunicorn_config.worker_exit() + # - Master cleanup is handled by gunicorn_config.on_exit() + # This prevents race conditions when multiple processes try to finalize shared data # Display startup information display_splash_screen(global_args) diff --git a/lightrag/kg/shared_storage.py b/lightrag/kg/shared_storage.py index 33d43bfa..e7c170d8 100644 --- a/lightrag/kg/shared_storage.py +++ b/lightrag/kg/shared_storage.py @@ -1443,7 +1443,7 @@ async def get_namespace_data( return _shared_dicts[namespace] -def 
finalize_share_data(): +def finalize_share_data(shutdown_manager: bool = True): """ Release shared resources and clean up. @@ -1452,6 +1452,10 @@ def finalize_share_data(): In multi-process mode, it shuts down the Manager and releases all shared objects. In single-process mode, it simply resets the global variables. + + Args: + shutdown_manager: If True, shut down the multiprocessing Manager. + Should be True only for the main process, False for worker processes. """ global \ _manager, \ @@ -1478,8 +1482,8 @@ def finalize_share_data(): f"Process {os.getpid()} finalizing storage data (multiprocess={_is_multiprocess})" ) - # In multi-process mode, shut down the Manager - if _is_multiprocess and _manager is not None: + # In multi-process mode, shut down the Manager only if requested + if _is_multiprocess and _manager is not None and shutdown_manager: try: # Clear shared resources before shutting down Manager if _shared_dicts is not None: From 816feefd84629f79e331b070cf7aa1950fa686df Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 29 Oct 2025 13:53:46 +0800 Subject: [PATCH 2/5] Fix cleanup coordination between Gunicorn and UvicornWorker lifecycles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Document UvicornWorker hook limitations • Add GUNICORN_CMD_ARGS cleanup guard • Prevent double cleanup in workers --- lightrag/api/gunicorn_config.py | 17 +++++++++++++++-- lightrag/api/lightrag_server.py | 8 ++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/lightrag/api/gunicorn_config.py b/lightrag/api/gunicorn_config.py index a19a0b39..f22788a5 100644 --- a/lightrag/api/gunicorn_config.py +++ b/lightrag/api/gunicorn_config.py @@ -168,8 +168,21 @@ def worker_exit(server, worker): """ Executed when a worker is about to exit. - This is called for each worker process when it exits. We should only - clean up worker-local resources here, NOT the shared Manager. 
+ NOTE: When using UvicornWorker (worker_class = "uvicorn.workers.UvicornWorker"), + this hook may NOT be called reliably. UvicornWorker has its own lifecycle + management that prioritizes ASGI lifespan shutdown events. + + The primary cleanup mechanism is handled by: + 1. FastAPI lifespan context manager with GUNICORN_CMD_ARGS check (in lightrag_server.py) + - Workers skip cleanup when GUNICORN_CMD_ARGS is set + 2. on_exit() hook for main process cleanup + + This function serves as a defensive fallback for: + - Non-UvicornWorker scenarios + - Future Gunicorn/Uvicorn behavior changes + - Additional safety layer + + When called, we should only clean up worker-local resources, NOT the shared Manager. The Manager should only be shut down by the main process in on_exit(). """ print("=" * 80) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index cee831d0..47513b77 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -326,8 +326,12 @@ def create_app(args): # Clean up database connections await rag.finalize_storages() - # Clean up shared data - finalize_share_data() + # In Gunicorn mode with preload_app=True, cleanup is handled by worker_exit/on_exit hooks + # Only perform cleanup in Uvicorn single-process mode + if "GUNICORN_CMD_ARGS" not in os.environ: + + # Clean up shared data + finalize_share_data() # Initialize FastAPI base_description = ( From 4a46d39c93e67158a906bbce6291f6512f532ea8 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 29 Oct 2025 14:06:03 +0800 Subject: [PATCH 3/5] Replace GUNICORN_CMD_ARGS with custom LIGHTRAG_GUNICORN_MODE flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Use custom env var for mode detection • Improve Gunicorn mode reliability --- lightrag/api/lightrag_server.py | 3 +-- lightrag/api/run_with_gunicorn.py | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lightrag/api/lightrag_server.py 
b/lightrag/api/lightrag_server.py index 47513b77..c4f4a3b1 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -328,8 +328,7 @@ def create_app(args): # In Gunicorn mode with preload_app=True, cleanup is handled by worker_exit/on_exit hooks # Only perform cleanup in Uvicorn single-process mode - if "GUNICORN_CMD_ARGS" not in os.environ: - + if "LIGHTRAG_GUNICORN_MODE" not in os.environ: # Clean up shared data finalize_share_data() diff --git a/lightrag/api/run_with_gunicorn.py b/lightrag/api/run_with_gunicorn.py index 5ad0c5b5..c5f7cb5c 100644 --- a/lightrag/api/run_with_gunicorn.py +++ b/lightrag/api/run_with_gunicorn.py @@ -34,6 +34,9 @@ def check_and_install_dependencies(): def main(): + # Set Gunicorn mode flag for lifespan cleanup detection + os.environ["LIGHTRAG_GUNICORN_MODE"] = "1" + # Check .env file if not check_env_file(): sys.exit(1) From 6489aaa7f03913d8b4909e3812cabffc92bc8f5b Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 29 Oct 2025 15:15:13 +0800 Subject: [PATCH 4/5] Remove worker_exit hook and improve cleanup logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Remove unreliable worker_exit function • Add debug logs for cleanup modes • Move DEBUG_LOCKS to top of file --- lightrag/api/gunicorn_config.py | 34 --------------------------------- lightrag/api/lightrag_server.py | 5 +++++ lightrag/kg/shared_storage.py | 3 ++- 3 files changed, 7 insertions(+), 35 deletions(-) diff --git a/lightrag/api/gunicorn_config.py b/lightrag/api/gunicorn_config.py index f22788a5..7b25b5b9 100644 --- a/lightrag/api/gunicorn_config.py +++ b/lightrag/api/gunicorn_config.py @@ -162,37 +162,3 @@ def post_fork(server, worker): uvicorn_error_logger.handlers = [] uvicorn_error_logger.setLevel(logging.CRITICAL) uvicorn_error_logger.propagate = False - - -def worker_exit(server, worker): - """ - Executed when a worker is about to exit. 
- - NOTE: When using UvicornWorker (worker_class = "uvicorn.workers.UvicornWorker"), - this hook may NOT be called reliably. UvicornWorker has its own lifecycle - management that prioritizes ASGI lifespan shutdown events. - - The primary cleanup mechanism is handled by: - 1. FastAPI lifespan context manager with GUNICORN_CMD_ARGS check (in lightrag_server.py) - - Workers skip cleanup when GUNICORN_CMD_ARGS is set - 2. on_exit() hook for main process cleanup - - This function serves as a defensive fallback for: - - Non-UvicornWorker scenarios - - Future Gunicorn/Uvicorn behavior changes - - Additional safety layer - - When called, we should only clean up worker-local resources, NOT the shared Manager. - The Manager should only be shut down by the main process in on_exit(). - """ - print("=" * 80) - print(f"GUNICORN WORKER PROCESS: Shutting down worker {worker.pid}") - print(f"Process ID: {os.getpid()}") - print("=" * 80) - - # Clean up worker-local resources without shutting down the Manager - # Pass shutdown_manager=False to prevent Manager shutdown - finalize_share_data(shutdown_manager=False) - - print(f"Worker {worker.pid} cleanup complete") - print("=" * 80) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index c4f4a3b1..ec8b2f45 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -330,7 +330,12 @@ def create_app(args): # Only perform cleanup in Uvicorn single-process mode if "LIGHTRAG_GUNICORN_MODE" not in os.environ: # Clean up shared data + logger.debug("Unvicorn Mode: finalize shared storage...") finalize_share_data() + else: + logger.debug( + "Gunicorn Mode: don not finalize shared storage in worker process" + ) # Initialize FastAPI base_description = ( diff --git a/lightrag/kg/shared_storage.py b/lightrag/kg/shared_storage.py index e7c170d8..c7dc03e5 100644 --- a/lightrag/kg/shared_storage.py +++ b/lightrag/kg/shared_storage.py @@ -10,6 +10,8 @@ from typing import Any, Dict, List, Optional, 
Union, TypeVar, Generic from lightrag.exceptions import PipelineNotInitializedError +DEBUG_LOCKS = False + # Define a direct print function for critical logs that must be visible in all processes def direct_log(message, enable_output: bool = True, level: str = "DEBUG"): @@ -90,7 +92,6 @@ _storage_keyed_lock: Optional["KeyedUnifiedLock"] = None # async locks for coroutine synchronization in multiprocess mode _async_locks: Optional[Dict[str, asyncio.Lock]] = None -DEBUG_LOCKS = False _debug_n_locks_acquired: int = 0 From d5bcd14c6f0f867ed38ae88c48487bd7b3533398 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 29 Oct 2025 18:55:47 +0800 Subject: [PATCH 5/5] Refactor service deployment to use direct process execution - Remove bash wrapper script - Update systemd service configuration - Improve process management for gunicorn - Simplify shared storage cleanup logic - Update documentation for deployment --- lightrag-api | 4 ---- lightrag.service.example | 16 +++++++++++++--- lightrag/api/README-zh.md | 22 +++++++--------------- lightrag/api/README.md | 22 ++++++++-------------- lightrag/api/gunicorn_config.py | 6 ++++-- lightrag/api/lightrag_server.py | 9 ++++----- lightrag/api/run_with_gunicorn.py | 4 +--- lightrag/kg/shared_storage.py | 10 +++------- 8 files changed, 40 insertions(+), 53 deletions(-) delete mode 100644 lightrag-api diff --git a/lightrag-api b/lightrag-api deleted file mode 100644 index 89c814fb..00000000 --- a/lightrag-api +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -source /home/netman/lightrag-xyj/venv/bin/activate -lightrag-server diff --git a/lightrag.service.example b/lightrag.service.example index 3c96e0b4..3a342c5b 100644 --- a/lightrag.service.example +++ b/lightrag.service.example @@ -1,5 +1,5 @@ [Unit] -Description=LightRAG XYJ Ollama Service +Description=LightRAG XYJ Service After=network.target [Service] @@ -8,10 +8,20 @@ User=netman # Memory settings MemoryHigh=8G MemoryMax=12G + +# Using virtual environment created by miniconda 
+Environment="PATH=/home/netman/miniconda3/bin:/home/netman/lightrag-xyj/venv/bin" WorkingDirectory=/home/netman/lightrag-xyj -ExecStart=/home/netman/lightrag-xyj/lightrag-api +# ExecStart=/home/netman/lightrag-xyj/venv/bin/lightrag-server +ExecStart=/home/netman/lightrag-xyj/venv/bin/lightrag-gunicorn + +# KillMode=process requires ExecStart to be the gunicorn or uvicorn main process +KillMode=process +ExecStop=/bin/kill -s TERM $MAINPID +TimeoutStopSec=60 + Restart=always -RestartSec=10 +RestartSec=30 [Install] WantedBy=multi-user.target diff --git a/lightrag/api/README-zh.md b/lightrag/api/README-zh.md index 692c589d..bd2bbd62 100644 --- a/lightrag/api/README-zh.md +++ b/lightrag/api/README-zh.md @@ -184,24 +184,16 @@ MAX_ASYNC=4 ### 将 Lightrag 安装为 Linux 服务 -从示例文件 `lightrag.service.example` 创建您的服务文件 `lightrag.service`。修改服务文件中的 WorkingDirectory 和 ExecStart: +从示例文件 `lightrag.service.example` 创建您的服务文件 `lightrag.service`。修改服务文件中的服务启动定义: ```text -Description=LightRAG Ollama Service -WorkingDirectory= -ExecStart=/lightrag/api/lightrag-api -``` - -修改您的服务启动脚本:`lightrag-api`。根据需要更改 python 虚拟环境激活命令: - -```shell -#!/bin/bash - -# 您的 python 虚拟环境激活命令 -source /home/netman/lightrag-xyj/venv/bin/activate -# 启动 lightrag api 服务器 -lightrag-server +# Set Environment to your Python virtual environment +Environment="PATH=/home/netman/lightrag-xyj/venv/bin" +WorkingDirectory=/home/netman/lightrag-xyj +# ExecStart=/home/netman/lightrag-xyj/venv/bin/lightrag-server +ExecStart=/home/netman/lightrag-xyj/venv/bin/lightrag-gunicorn ``` +> ExecStart命令必须是 lightrag-gunicorn 或 lightrag-server 中的一个,不能使用其它脚本包裹它们。因为停止服务必须要求主进程必须是这两个进程。 安装 LightRAG 服务。如果您的系统是 Ubuntu,以下命令将生效: diff --git a/lightrag/api/README.md b/lightrag/api/README.md index aa24576e..8bf9e281 100644 --- a/lightrag/api/README.md +++ b/lightrag/api/README.md @@ -188,24 +188,18 @@ MAX_ASYNC=4 ### Install LightRAG as a Linux Service -Create your service file `lightrag.service` from the sample file: `lightrag.service.example`. 
Modify the `WorkingDirectory` and `ExecStart` in the service file: +Create your service file `lightrag.service` from the sample file: `lightrag.service.example`. Modify the start options in the service file: ```text -Description=LightRAG Ollama Service -WorkingDirectory= -ExecStart=/lightrag/api/lightrag-api +# Set Environment to your Python virtual environment +Environment="PATH=/home/netman/lightrag-xyj/venv/bin" +WorkingDirectory=/home/netman/lightrag-xyj +# ExecStart=/home/netman/lightrag-xyj/venv/bin/lightrag-server +ExecStart=/home/netman/lightrag-xyj/venv/bin/lightrag-gunicorn + ``` -Modify your service startup script: `lightrag-api`. Change your Python virtual environment activation command as needed: - -```shell -#!/bin/bash - -# your python virtual environment activation -source /home/netman/lightrag-xyj/venv/bin/activate -# start lightrag api server -lightrag-server -``` +> The ExecStart command must be either `lightrag-gunicorn` or `lightrag-server`; no wrapper scripts are allowed. This is because service termination requires the main process to be one of these two executables. Install LightRAG service. 
If your system is Ubuntu, the following commands will work: diff --git a/lightrag/api/gunicorn_config.py b/lightrag/api/gunicorn_config.py index 7b25b5b9..e000b46c 100644 --- a/lightrag/api/gunicorn_config.py +++ b/lightrag/api/gunicorn_config.py @@ -129,11 +129,13 @@ def on_exit(server): print("=" * 80) print("GUNICORN MASTER PROCESS: Shutting down") print(f"Process ID: {os.getpid()}") - print("=" * 80) - # Release shared resources + print("Finalizing shared storage...") finalize_share_data() + print("Gunicorn shutdown complete") + print("=" * 80) + print("=" * 80) print("Gunicorn shutdown complete") print("=" * 80) diff --git a/lightrag/api/lightrag_server.py b/lightrag/api/lightrag_server.py index ec8b2f45..3269fbb5 100644 --- a/lightrag/api/lightrag_server.py +++ b/lightrag/api/lightrag_server.py @@ -326,15 +326,14 @@ def create_app(args): # Clean up database connections await rag.finalize_storages() - # In Gunicorn mode with preload_app=True, cleanup is handled by worker_exit/on_exit hooks - # Only perform cleanup in Uvicorn single-process mode if "LIGHTRAG_GUNICORN_MODE" not in os.environ: - # Clean up shared data - logger.debug("Unvicorn Mode: finalize shared storage...") + # Only perform cleanup in Uvicorn single-process mode + logger.debug("Unvicorn Mode: finalizing shared storage...") finalize_share_data() else: + # In Gunicorn mode with preload_app=True, cleanup is handled by on_exit hooks logger.debug( - "Gunicorn Mode: don not finalize shared storage in worker process" + "Gunicorn Mode: postpone shared storage finalization to master process" ) # Initialize FastAPI diff --git a/lightrag/api/run_with_gunicorn.py b/lightrag/api/run_with_gunicorn.py index c5f7cb5c..f2d4d859 100644 --- a/lightrag/api/run_with_gunicorn.py +++ b/lightrag/api/run_with_gunicorn.py @@ -45,9 +45,7 @@ def main(): check_and_install_dependencies() # Note: Signal handlers are NOT registered here because: - # - Worker cleanup is handled by gunicorn_config.worker_exit() - # - Master 
cleanup is handled by gunicorn_config.on_exit() - # This prevents race conditions when multiple processes try to finalize shared data + # - Master cleanup already handled by gunicorn_config.on_exit() # Display startup information display_splash_screen(global_args) diff --git a/lightrag/kg/shared_storage.py b/lightrag/kg/shared_storage.py index c7dc03e5..0abcf719 100644 --- a/lightrag/kg/shared_storage.py +++ b/lightrag/kg/shared_storage.py @@ -1444,7 +1444,7 @@ async def get_namespace_data( return _shared_dicts[namespace] -def finalize_share_data(shutdown_manager: bool = True): +def finalize_share_data(): """ Release shared resources and clean up. @@ -1453,10 +1453,6 @@ def finalize_share_data(shutdown_manager: bool = True): In multi-process mode, it shuts down the Manager and releases all shared objects. In single-process mode, it simply resets the global variables. - - Args: - shutdown_manager: If True, shut down the multiprocessing Manager. - Should be True only for the main process, False for worker processes. """ global \ _manager, \ @@ -1483,8 +1479,8 @@ def finalize_share_data(shutdown_manager: bool = True): f"Process {os.getpid()} finalizing storage data (multiprocess={_is_multiprocess})" ) - # In multi-process mode, shut down the Manager only if requested - if _is_multiprocess and _manager is not None and shutdown_manager: + # In multi-process mode, shut down the Manager + if _is_multiprocess and _manager is not None: try: # Clear shared resources before shutting down Manager if _shared_dicts is not None: