From 484e441d0ca19dba4817a75fa0167b46a4114985 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Wed, 31 Dec 2025 16:03:46 +0800
Subject: [PATCH 1/3] Optimize Postgres retry logic for HA switchover

- Increase default retries and backoff
- Raise connection retry parameter caps
- Update env example with HA defaults
- Extend frontend timeouts for updates
- Update integration test limits
---
 env.example                               | 21 +++++++++++++------
 lightrag/kg/postgres_impl.py              | 12 +++++------
 .../src/features/DocumentManager.tsx      | 20 +++++++++++-------
 tests/test_postgres_retry_integration.py  |  8 +++----
 4 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/env.example b/env.example
index 894e7e85..b7e6b70f 100644
--- a/env.example
+++ b/env.example
@@ -403,13 +403,22 @@ POSTGRES_VCHORDRQ_PROBES=
 POSTGRES_VCHORDRQ_EPSILON=1.9
 
 ### PostgreSQL Connection Retry Configuration (Network Robustness)
-### Number of retry attempts (1-10, default: 3)
-### Initial retry backoff in seconds (0.1-5.0, default: 0.5)
-### Maximum retry backoff in seconds (backoff-60.0, default: 5.0)
+### NEW DEFAULTS (v1.4.10+): Optimized for HA deployments with ~30s switchover time
+### These defaults provide out-of-the-box support for PostgreSQL High Availability setups
+###
+### Number of retry attempts (1-100, default: 10)
+### - Default 10 attempts allows ~225s total retry time (sufficient for most HA scenarios)
+### - For extreme cases: increase up to 20-50
+### Initial retry backoff in seconds (0.1-300.0, default: 3.0)
+### - Default 3.0s provides reasonable initial delay for switchover detection
+### - For faster recovery: decrease to 1.0-2.0
+### Maximum retry backoff in seconds (must be >= backoff, max: 600.0, default: 30.0)
+### - Default 30.0s matches typical switchover completion time
+### - For longer switchovers: increase to 60-90
 ### Connection pool close timeout in seconds (1.0-30.0, default: 5.0)
-# POSTGRES_CONNECTION_RETRIES=3
-# POSTGRES_CONNECTION_RETRY_BACKOFF=0.5
-# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=5.0
+# POSTGRES_CONNECTION_RETRIES=10
+# POSTGRES_CONNECTION_RETRY_BACKOFF=3.0
+# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=30.0
 # POSTGRES_POOL_CLOSE_TIMEOUT=5.0
 
 ### PostgreSQL SSL Configuration (Optional)
diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py
index 71be88b9..1fd76276 100644
--- a/lightrag/kg/postgres_impl.py
+++ b/lightrag/kg/postgres_impl.py
@@ -1797,34 +1797,34 @@ class ClientManager:
             ),
             # Connection retry configuration
             "connection_retry_attempts": min(
-                10,
+                100,  # Increased from 10 to 100 for long-running operations
                 int(
                     os.environ.get(
                         "POSTGRES_CONNECTION_RETRIES",
-                        config.get("postgres", "connection_retries", fallback=3),
+                        config.get("postgres", "connection_retries", fallback=10),
                     )
                 ),
             ),
             "connection_retry_backoff": min(
-                5.0,
+                300.0,  # Increased from 5.0 to 300.0 (5 minutes) for PG switchover scenarios
                 float(
                     os.environ.get(
                         "POSTGRES_CONNECTION_RETRY_BACKOFF",
                         config.get(
-                            "postgres", "connection_retry_backoff", fallback=0.5
+                            "postgres", "connection_retry_backoff", fallback=3.0
                         ),
                     )
                 ),
             ),
             "connection_retry_backoff_max": min(
-                60.0,
+                600.0,  # Increased from 60.0 to 600.0 (10 minutes) for PG switchover scenarios
                 float(
                     os.environ.get(
                         "POSTGRES_CONNECTION_RETRY_BACKOFF_MAX",
                         config.get(
                             "postgres",
                             "connection_retry_backoff_max",
-                            fallback=5.0,
+                            fallback=30.0,
                         ),
                     )
                 ),
diff --git a/lightrag_webui/src/features/DocumentManager.tsx b/lightrag_webui/src/features/DocumentManager.tsx
index 406faf2b..a646a398 100644
--- a/lightrag_webui/src/features/DocumentManager.tsx
+++ b/lightrag_webui/src/features/DocumentManager.tsx
@@ -584,7 +584,7 @@ export default function DocumentManager() {
   // Utility function to create timeout wrapper for API calls
   const withTimeout = useCallback((
     promise: Promise,
-    timeoutMs: number = 30000,
+    timeoutMs: number = 30000, // Default 30s timeout for normal operations
     errorMsg: string = 'Request timeout'
   ): Promise => {
     const timeoutPromise = new Promise((_, reject) => {
@@ -676,7 +676,8 @@ export default function DocumentManager() {
   // Intelligent refresh function: handles all boundary cases
   const handleIntelligentRefresh = useCallback(async (
     targetPage?: number, // Optional target page, defaults to current page
-    resetToFirst?: boolean // Whether to force reset to first page
+    resetToFirst?: boolean, // Whether to force reset to first page
+    customTimeout?: number // Optional custom timeout in milliseconds (uses withTimeout default if not provided)
   ) => {
     try {
       if (!isMountedRef.current) return;
@@ -694,10 +695,10 @@ export default function DocumentManager() {
         sort_direction: sortDirection
       };
 
-      // Use timeout wrapper for the API call
+      // Use timeout wrapper for the API call (uses customTimeout if provided, otherwise withTimeout default)
       const response = await withTimeout(
         getDocumentsPaginated(request),
-        30000, // 30 second timeout
+        customTimeout, // Pass undefined to use default 30s, or explicit timeout for special cases
         'Document fetch timeout'
       );
 
@@ -717,7 +718,7 @@ export default function DocumentManager() {
 
         const lastPageResponse = await withTimeout(
           getDocumentsPaginated(lastPageRequest),
-          30000,
+          customTimeout, // Use same timeout for consistency
          'Document fetch timeout'
         );
 
@@ -847,7 +848,10 @@ export default function DocumentManager() {
       // Reset health check timer with 1 second delay to avoid race condition
       useBackendState.getState().resetHealthCheckTimerDelayed(1000);
 
-      // Start fast refresh with 2-second interval immediately after scan
+      // Perform immediate refresh with 90s timeout after scan (tolerates PostgreSQL switchover)
+      await handleIntelligentRefresh(undefined, false, 90000);
+
+      // Start fast refresh with 2-second interval after initial refresh
       startPollingInterval(2000);
 
       // Set recovery timer to restore normal polling interval after 15 seconds
@@ -865,7 +869,7 @@ export default function DocumentManager() {
         toast.error(t('documentPanel.documentManager.errors.scanFailed', { error: errorMessage(err) }));
       }
     }
-  }, [t, startPollingInterval, currentTab, health, statusCounts])
+  }, [t, startPollingInterval, currentTab, health, statusCounts, handleIntelligentRefresh])
 
   // Handle page size change - update state and save to store
   const handlePageSizeChange = useCallback((newPageSize: number) => {
@@ -1184,7 +1188,7 @@ export default function DocumentManager() {
           ) : !isSelectionMode ? (
           ) : null}
-
+           handleIntelligentRefresh(undefined, false, 120000)} />

Date: Wed, 31 Dec 2025 16:37:08 +0800
Subject: [PATCH 2/3] Update Postgres integration tests to use new HA retry defaults

- Increase retry count to 10
- Raise initial backoff to 3.0s
- Raise max backoff to 30.0s
- Remove obsolete test_env fixture
- Align tests with HA config
---
 tests/test_postgres_retry_integration.py | 37 +++++++++++++-----------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/tests/test_postgres_retry_integration.py b/tests/test_postgres_retry_integration.py
index a215647e..30ec4d9c 100644
--- a/tests/test_postgres_retry_integration.py
+++ b/tests/test_postgres_retry_integration.py
@@ -31,7 +31,13 @@ class TestPostgresRetryIntegration:
 
     @pytest.fixture
     def db_config(self):
-        """Load database configuration from environment variables."""
+        """Load database configuration from environment variables.
+
+        Uses new HA-optimized defaults that match postgres_impl.py ClientManager.get_config():
+        - 10 retry attempts (up from 3)
+        - 3.0s initial backoff (up from 0.5s)
+        - 30.0s max backoff (up from 5.0s)
+        """
         return {
             "host": os.getenv("POSTGRES_HOST", "localhost"),
             "port": int(os.getenv("POSTGRES_PORT", "5432")),
@@ -41,30 +47,23 @@ class TestPostgresRetryIntegration:
             "workspace": os.getenv("POSTGRES_WORKSPACE", "test_retry"),
             "max_connections": int(os.getenv("POSTGRES_MAX_CONNECTIONS", "10")),
             # Connection retry configuration - mirrors postgres_impl.py ClientManager.get_config()
+            # NEW DEFAULTS optimized for HA deployments
             "connection_retry_attempts": min(
-                100, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "3"))
+                100, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "10"))  # 3 → 10
             ),
             "connection_retry_backoff": min(
-                300.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "0.5"))
+                300.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "3.0"))  # 0.5 → 3.0
             ),
             "connection_retry_backoff_max": min(
-                600.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "5.0"))
+                600.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "30.0"))  # 5.0 → 30.0
             ),
             "pool_close_timeout": min(
                 30.0, float(os.getenv("POSTGRES_POOL_CLOSE_TIMEOUT", "5.0"))
             ),
         }
 
-    @pytest.fixture
-    def test_env(self, monkeypatch):
-        """Set up test environment variables for retry configuration."""
-        monkeypatch.setenv("POSTGRES_CONNECTION_RETRIES", "3")
-        monkeypatch.setenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "0.5")
-        monkeypatch.setenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "2.0")
-        monkeypatch.setenv("POSTGRES_POOL_CLOSE_TIMEOUT", "3.0")
-
     @pytest.mark.asyncio
-    async def test_real_connection_success(self, db_config, test_env):
+    async def test_real_connection_success(self, db_config):
         """
         Test successful connection to real PostgreSQL database.
@@ -100,11 +99,12 @@ class TestPostgresRetryIntegration:
         await db.pool.close()
 
     @pytest.mark.asyncio
-    async def test_simulated_transient_error_with_real_db(self, db_config, test_env):
+    async def test_simulated_transient_error_with_real_db(self, db_config):
         """
         Test retry mechanism with simulated transient errors on real database.
 
         Simulates connection failures on first 2 attempts, then succeeds.
+        Uses new HA defaults (10 retries, 3s backoff).
""" print("\n" + "=" * 80) print("INTEGRATION TEST 2: Simulated Transient Errors") @@ -155,12 +155,13 @@ class TestPostgresRetryIntegration: await db.pool.close() @pytest.mark.asyncio - async def test_query_retry_with_real_db(self, db_config, test_env): + async def test_query_retry_with_real_db(self, db_config): """ Test query-level retry with simulated connection issues. Tests that queries retry on transient failures by simulating a temporary database unavailability. + Uses new HA defaults (10 retries, 3s backoff). """ print("\n" + "=" * 80) print("INTEGRATION TEST 3: Query-Level Retry") @@ -193,11 +194,12 @@ class TestPostgresRetryIntegration: await db.pool.close() @pytest.mark.asyncio - async def test_concurrent_queries_with_real_db(self, db_config, test_env): + async def test_concurrent_queries_with_real_db(self, db_config): """ Test concurrent queries to validate thread safety and connection pooling. Runs multiple concurrent queries to ensure no deadlocks or race conditions. + Uses new HA defaults (10 retries, 3s backoff). """ print("\n" + "=" * 80) print("INTEGRATION TEST 4: Concurrent Queries") @@ -243,9 +245,10 @@ class TestPostgresRetryIntegration: await db.pool.close() @pytest.mark.asyncio - async def test_pool_close_timeout_real(self, db_config, test_env): + async def test_pool_close_timeout_real(self, db_config): """ Test pool close timeout protection with real database. + Uses new HA defaults (10 retries, 3s backoff). """ print("\n" + "=" * 80) print("INTEGRATION TEST 5: Pool Close Timeout") From 5ccb5ec980f58566776864b34631c49a2a8fb4a1 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 31 Dec 2025 19:09:46 +0800 Subject: [PATCH 3/3] Fix linting --- tests/test_postgres_retry_integration.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/test_postgres_retry_integration.py b/tests/test_postgres_retry_integration.py index 30ec4d9c..6871713e 100644 --- a/tests/test_postgres_retry_integration.py +++ b/tests/test_postgres_retry_integration.py @@ -32,7 +32,7 @@ class TestPostgresRetryIntegration: @pytest.fixture def db_config(self): """Load database configuration from environment variables. - + Uses new HA-optimized defaults that match postgres_impl.py ClientManager.get_config(): - 10 retry attempts (up from 3) - 3.0s initial backoff (up from 0.5s) @@ -49,13 +49,20 @@ class TestPostgresRetryIntegration: # Connection retry configuration - mirrors postgres_impl.py ClientManager.get_config() # NEW DEFAULTS optimized for HA deployments "connection_retry_attempts": min( - 100, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "10")) # 3 → 10 + 100, + int(os.getenv("POSTGRES_CONNECTION_RETRIES", "10")), # 3 → 10 ), "connection_retry_backoff": min( - 300.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "3.0")) # 0.5 → 3.0 + 300.0, + float( + os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "3.0") + ), # 0.5 → 3.0 ), "connection_retry_backoff_max": min( - 600.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "30.0")) # 5.0 → 30.0 + 600.0, + float( + os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "30.0") + ), # 5.0 → 30.0 ), "pool_close_timeout": min( 30.0, float(os.getenv("POSTGRES_POOL_CLOSE_TIMEOUT", "5.0"))