From 484e441d0ca19dba4817a75fa0167b46a4114985 Mon Sep 17 00:00:00 2001
From: yangdx
Date: Wed, 31 Dec 2025 16:03:46 +0800
Subject: [PATCH 1/3] Optimize Postgres retry logic for HA switchover

- Increase default retries and backoff
- Raise connection retry parameter caps
- Update env example with HA defaults
- Extend frontend timeouts for updates
- Update integration test limits
---
 env.example                               | 21 +++++++++++++------
 lightrag/kg/postgres_impl.py              | 12 +++++------
 .../src/features/DocumentManager.tsx      | 20 +++++++++++-------
 tests/test_postgres_retry_integration.py  |  8 +++----
 4 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/env.example b/env.example
index 894e7e85..b7e6b70f 100644
--- a/env.example
+++ b/env.example
@@ -403,13 +403,22 @@ POSTGRES_VCHORDRQ_PROBES=
 POSTGRES_VCHORDRQ_EPSILON=1.9
 
 ### PostgreSQL Connection Retry Configuration (Network Robustness)
-### Number of retry attempts (1-10, default: 3)
-### Initial retry backoff in seconds (0.1-5.0, default: 0.5)
-### Maximum retry backoff in seconds (backoff-60.0, default: 5.0)
+### NEW DEFAULTS (v1.4.10+): Optimized for HA deployments with ~30s switchover time
+### These defaults provide out-of-the-box support for PostgreSQL High Availability setups
+###
+### Number of retry attempts (1-100, default: 10)
+### - Default 10 attempts allows ~225s total retry time (sufficient for most HA scenarios)
+### - For extreme cases: increase up to 20-50
+### Initial retry backoff in seconds (0.1-300.0, default: 3.0)
+### - Default 3.0s provides reasonable initial delay for switchover detection
+### - For faster recovery: decrease to 1.0-2.0
+### Maximum retry backoff in seconds (must be >= backoff, max: 600.0, default: 30.0)
+### - Default 30.0s matches typical switchover completion time
+### - For longer switchovers: increase to 60-90
 ### Connection pool close timeout in seconds (1.0-30.0, default: 5.0)
-# POSTGRES_CONNECTION_RETRIES=3
-# POSTGRES_CONNECTION_RETRY_BACKOFF=0.5
-# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=5.0
+# POSTGRES_CONNECTION_RETRIES=10
+# POSTGRES_CONNECTION_RETRY_BACKOFF=3.0
+# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=30.0
 # POSTGRES_POOL_CLOSE_TIMEOUT=5.0
 
 ### PostgreSQL SSL Configuration (Optional)
diff --git a/lightrag/kg/postgres_impl.py b/lightrag/kg/postgres_impl.py
index 71be88b9..1fd76276 100644
--- a/lightrag/kg/postgres_impl.py
+++ b/lightrag/kg/postgres_impl.py
@@ -1797,34 +1797,34 @@ class ClientManager:
             ),
             # Connection retry configuration
             "connection_retry_attempts": min(
-                10,
+                100,  # Increased from 10 to 100 for long-running operations
                 int(
                     os.environ.get(
                         "POSTGRES_CONNECTION_RETRIES",
-                        config.get("postgres", "connection_retries", fallback=3),
+                        config.get("postgres", "connection_retries", fallback=10),
                     )
                 ),
             ),
             "connection_retry_backoff": min(
-                5.0,
+                300.0,  # Increased from 5.0 to 300.0 (5 minutes) for PG switchover scenarios
                 float(
                     os.environ.get(
                         "POSTGRES_CONNECTION_RETRY_BACKOFF",
                         config.get(
-                            "postgres", "connection_retry_backoff", fallback=0.5
+                            "postgres", "connection_retry_backoff", fallback=3.0
                         ),
                     )
                 ),
             ),
             "connection_retry_backoff_max": min(
-                60.0,
+                600.0,  # Increased from 60.0 to 600.0 (10 minutes) for PG switchover scenarios
                 float(
                     os.environ.get(
                         "POSTGRES_CONNECTION_RETRY_BACKOFF_MAX",
                         config.get(
                             "postgres",
                             "connection_retry_backoff_max",
-                            fallback=5.0,
+                            fallback=30.0,
                         ),
                     )
                 ),
diff --git a/lightrag_webui/src/features/DocumentManager.tsx b/lightrag_webui/src/features/DocumentManager.tsx
index 406faf2b..a646a398 100644
--- a/lightrag_webui/src/features/DocumentManager.tsx
+++ b/lightrag_webui/src/features/DocumentManager.tsx
@@ -584,7 +584,7 @@ export default function DocumentManager() {
   // Utility function to create timeout wrapper for API calls
   const withTimeout = useCallback((
     promise: Promise,
-    timeoutMs: number = 30000,
+    timeoutMs: number = 30000, // Default 30s timeout for normal operations
     errorMsg: string = 'Request timeout'
   ): Promise => {
     const timeoutPromise = new Promise((_, reject) => {
@@ -676,7 +676,8 @@ export default function DocumentManager() {
   // Intelligent refresh function: handles all boundary cases
   const handleIntelligentRefresh = useCallback(async (
     targetPage?: number, // Optional target page, defaults to current page
-    resetToFirst?: boolean // Whether to force reset to first page
+    resetToFirst?: boolean, // Whether to force reset to first page
+    customTimeout?: number // Optional custom timeout in milliseconds (uses withTimeout default if not provided)
   ) => {
     try {
       if (!isMountedRef.current) return;
@@ -694,10 +695,10 @@ export default function DocumentManager() {
         sort_direction: sortDirection
       };
 
-      // Use timeout wrapper for the API call
+      // Use timeout wrapper for the API call (uses customTimeout if provided, otherwise withTimeout default)
       const response = await withTimeout(
         getDocumentsPaginated(request),
-        30000, // 30 second timeout
+        customTimeout, // Pass undefined to use default 30s, or explicit timeout for special cases
         'Document fetch timeout'
       );
 
@@ -717,7 +718,7 @@ export default function DocumentManager() {
 
         const lastPageResponse = await withTimeout(
           getDocumentsPaginated(lastPageRequest),
-          30000,
+          customTimeout, // Use same timeout for consistency
          'Document fetch timeout'
         );
 
@@ -847,7 +848,10 @@ export default function DocumentManager() {
       // Reset health check timer with 1 second delay to avoid race condition
       useBackendState.getState().resetHealthCheckTimerDelayed(1000);
 
-      // Start fast refresh with 2-second interval immediately after scan
+      // Perform immediate refresh with 90s timeout after scan (tolerates PostgreSQL switchover)
+      await handleIntelligentRefresh(undefined, false, 90000);
+
+      // Start fast refresh with 2-second interval after initial refresh
       startPollingInterval(2000);
 
       // Set recovery timer to restore normal polling interval after 15 seconds
@@ -865,7 +869,7 @@ export default function DocumentManager() {
         toast.error(t('documentPanel.documentManager.errors.scanFailed', { error: errorMessage(err) }));
       }
     }
-  }, [t, startPollingInterval, currentTab, health, statusCounts])
+  }, [t, startPollingInterval, currentTab, health, statusCounts, handleIntelligentRefresh])
 
   // Handle page size change - update state and save to store
   const handlePageSizeChange = useCallback((newPageSize: number) => {
@@ -1184,7 +1188,7 @@ export default function DocumentManager() {
           ) : !isSelectionMode ? (
           ) : null}
-
+           handleIntelligentRefresh(undefined, false, 120000)} />

Date: Wed, 31 Dec 2025 16:37:08 +0800
Subject: [PATCH 2/3] Update Postgres integration tests to use new HA retry defaults

- Increase retry count to 10
- Raise initial backoff to 3.0s
- Raise max backoff to 30.0s
- Remove obsolete test_env fixture
- Align tests with HA config
---
 tests/test_postgres_retry_integration.py | 37 +++++++++++++-----------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/tests/test_postgres_retry_integration.py b/tests/test_postgres_retry_integration.py
index a215647e..30ec4d9c 100644
--- a/tests/test_postgres_retry_integration.py
+++ b/tests/test_postgres_retry_integration.py
@@ -31,7 +31,13 @@ class TestPostgresRetryIntegration:
 
     @pytest.fixture
     def db_config(self):
-        """Load database configuration from environment variables."""
+        """Load database configuration from environment variables.
+
+        Uses new HA-optimized defaults that match postgres_impl.py ClientManager.get_config():
+        - 10 retry attempts (up from 3)
+        - 3.0s initial backoff (up from 0.5s)
+        - 30.0s max backoff (up from 5.0s)
+        """
         return {
             "host": os.getenv("POSTGRES_HOST", "localhost"),
             "port": int(os.getenv("POSTGRES_PORT", "5432")),
@@ -41,30 +47,23 @@ class TestPostgresRetryIntegration:
             "workspace": os.getenv("POSTGRES_WORKSPACE", "test_retry"),
             "max_connections": int(os.getenv("POSTGRES_MAX_CONNECTIONS", "10")),
             # Connection retry configuration - mirrors postgres_impl.py ClientManager.get_config()
+            # NEW DEFAULTS optimized for HA deployments
             "connection_retry_attempts": min(
-                100, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "3"))
+                100, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "10"))  # 3 → 10
             ),
             "connection_retry_backoff": min(
-                300.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "0.5"))
+                300.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "3.0"))  # 0.5 → 3.0
             ),
             "connection_retry_backoff_max": min(
-                600.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "5.0"))
+                600.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "30.0"))  # 5.0 → 30.0
             ),
             "pool_close_timeout": min(
                 30.0, float(os.getenv("POSTGRES_POOL_CLOSE_TIMEOUT", "5.0"))
             ),
         }
 
-    @pytest.fixture
-    def test_env(self, monkeypatch):
-        """Set up test environment variables for retry configuration."""
-        monkeypatch.setenv("POSTGRES_CONNECTION_RETRIES", "3")
-        monkeypatch.setenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "0.5")
-        monkeypatch.setenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "2.0")
-        monkeypatch.setenv("POSTGRES_POOL_CLOSE_TIMEOUT", "3.0")
-
     @pytest.mark.asyncio
-    async def test_real_connection_success(self, db_config, test_env):
+    async def test_real_connection_success(self, db_config):
         """
         Test successful connection to real PostgreSQL database.
@@ -100,11 +99,12 @@ class TestPostgresRetryIntegration:
         await db.pool.close()
 
     @pytest.mark.asyncio
-    async def test_simulated_transient_error_with_real_db(self, db_config, test_env):
+    async def test_simulated_transient_error_with_real_db(self, db_config):
         """
         Test retry mechanism with simulated transient errors on real database.
 
         Simulates connection failures on first 2 attempts, then succeeds.
+        Uses new HA defaults (10 retries, 3s backoff).
""" print("\n" + "=" * 80) print("INTEGRATION TEST 2: Simulated Transient Errors") @@ -155,12 +155,13 @@ class TestPostgresRetryIntegration: await db.pool.close() @pytest.mark.asyncio - async def test_query_retry_with_real_db(self, db_config, test_env): + async def test_query_retry_with_real_db(self, db_config): """ Test query-level retry with simulated connection issues. Tests that queries retry on transient failures by simulating a temporary database unavailability. + Uses new HA defaults (10 retries, 3s backoff). """ print("\n" + "=" * 80) print("INTEGRATION TEST 3: Query-Level Retry") @@ -193,11 +194,12 @@ class TestPostgresRetryIntegration: await db.pool.close() @pytest.mark.asyncio - async def test_concurrent_queries_with_real_db(self, db_config, test_env): + async def test_concurrent_queries_with_real_db(self, db_config): """ Test concurrent queries to validate thread safety and connection pooling. Runs multiple concurrent queries to ensure no deadlocks or race conditions. + Uses new HA defaults (10 retries, 3s backoff). """ print("\n" + "=" * 80) print("INTEGRATION TEST 4: Concurrent Queries") @@ -243,9 +245,10 @@ class TestPostgresRetryIntegration: await db.pool.close() @pytest.mark.asyncio - async def test_pool_close_timeout_real(self, db_config, test_env): + async def test_pool_close_timeout_real(self, db_config): """ Test pool close timeout protection with real database. + Uses new HA defaults (10 retries, 3s backoff). """ print("\n" + "=" * 80) print("INTEGRATION TEST 5: Pool Close Timeout") From 5ccb5ec980f58566776864b34631c49a2a8fb4a1 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 31 Dec 2025 19:09:46 +0800 Subject: [PATCH 3/3] Fix linting --- tests/test_postgres_retry_integration.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/test_postgres_retry_integration.py b/tests/test_postgres_retry_integration.py index 30ec4d9c..6871713e 100644 --- a/tests/test_postgres_retry_integration.py +++ b/tests/test_postgres_retry_integration.py @@ -32,7 +32,7 @@ class TestPostgresRetryIntegration: @pytest.fixture def db_config(self): """Load database configuration from environment variables. - + Uses new HA-optimized defaults that match postgres_impl.py ClientManager.get_config(): - 10 retry attempts (up from 3) - 3.0s initial backoff (up from 0.5s) @@ -49,13 +49,20 @@ class TestPostgresRetryIntegration: # Connection retry configuration - mirrors postgres_impl.py ClientManager.get_config() # NEW DEFAULTS optimized for HA deployments "connection_retry_attempts": min( - 100, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "10")) # 3 → 10 + 100, + int(os.getenv("POSTGRES_CONNECTION_RETRIES", "10")), # 3 → 10 ), "connection_retry_backoff": min( - 300.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "3.0")) # 0.5 → 3.0 + 300.0, + float( + os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "3.0") + ), # 0.5 → 3.0 ), "connection_retry_backoff_max": min( - 600.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "30.0")) # 5.0 → 30.0 + 600.0, + float( + os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "30.0") + ), # 5.0 → 30.0 ), "pool_close_timeout": min( 30.0, float(os.getenv("POSTGRES_POOL_CLOSE_TIMEOUT", "5.0"))