Merge pull request #2562 from danielaskdd/fix-pg-timetout

Fix: Enhance PostgreSQL Reconnection Tolerance for HA Deployments
Daniel.y committed by GitHub on 2025-12-31 23:42:28 +08:00
4 changed files with 61 additions and 38 deletions

View File

@@ -403,13 +403,22 @@ POSTGRES_VCHORDRQ_PROBES=
 POSTGRES_VCHORDRQ_EPSILON=1.9

 ### PostgreSQL Connection Retry Configuration (Network Robustness)
-### Number of retry attempts (1-10, default: 3)
-### Initial retry backoff in seconds (0.1-5.0, default: 0.5)
-### Maximum retry backoff in seconds (backoff-60.0, default: 5.0)
+### NEW DEFAULTS (v1.4.10+): Optimized for HA deployments with ~30s switchover time
+### These defaults provide out-of-the-box support for PostgreSQL High Availability setups
+###
+### Number of retry attempts (1-100, default: 10)
+###   - Default 10 attempts allows ~225s total retry time (sufficient for most HA scenarios)
+###   - For extreme cases: increase up to 20-50
+### Initial retry backoff in seconds (0.1-300.0, default: 3.0)
+###   - Default 3.0s provides reasonable initial delay for switchover detection
+###   - For faster recovery: decrease to 1.0-2.0
+### Maximum retry backoff in seconds (must be >= backoff, max: 600.0, default: 30.0)
+###   - Default 30.0s matches typical switchover completion time
+###   - For longer switchovers: increase to 60-90
 ### Connection pool close timeout in seconds (1.0-30.0, default: 5.0)
-# POSTGRES_CONNECTION_RETRIES=3
-# POSTGRES_CONNECTION_RETRY_BACKOFF=0.5
-# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=5.0
+# POSTGRES_CONNECTION_RETRIES=10
+# POSTGRES_CONNECTION_RETRY_BACKOFF=3.0
+# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=30.0
 # POSTGRES_POOL_CLOSE_TIMEOUT=5.0

 ### PostgreSQL SSL Configuration (Optional)
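
Reviewer note: the "~225s total retry time" figure follows from a delay that starts at 3.0s, doubles each attempt, and is capped at 30.0s. The short sketch below only reproduces that arithmetic; the doubling-with-cap schedule is an assumption stated for illustration, not code taken from the implementation.

# Back-of-the-envelope check of the "~225s" note, assuming the delay doubles from
# POSTGRES_CONNECTION_RETRY_BACKOFF and is capped at POSTGRES_CONNECTION_RETRY_BACKOFF_MAX.
def total_retry_time(attempts=10, backoff=3.0, backoff_max=30.0):
    total, delay = 0.0, backoff
    for _ in range(attempts):
        total += delay
        delay = min(delay * 2, backoff_max)
    return total

print(total_retry_time())  # 3 + 6 + 12 + 24 + 30 * 6 = 225.0 seconds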

View File

@@ -1797,34 +1797,34 @@ class ClientManager:
            ),
            # Connection retry configuration
            "connection_retry_attempts": min(
-                10,
+                100,  # Increased from 10 to 100 for long-running operations
                int(
                    os.environ.get(
                        "POSTGRES_CONNECTION_RETRIES",
-                        config.get("postgres", "connection_retries", fallback=3),
+                        config.get("postgres", "connection_retries", fallback=10),
                    )
                ),
            ),
            "connection_retry_backoff": min(
-                5.0,
+                300.0,  # Increased from 5.0 to 300.0 (5 minutes) for PG switchover scenarios
                float(
                    os.environ.get(
                        "POSTGRES_CONNECTION_RETRY_BACKOFF",
                        config.get(
-                            "postgres", "connection_retry_backoff", fallback=0.5
+                            "postgres", "connection_retry_backoff", fallback=3.0
                        ),
                    )
                ),
            ),
            "connection_retry_backoff_max": min(
-                60.0,
+                600.0,  # Increased from 60.0 to 600.0 (10 minutes) for PG switchover scenarios
                float(
                    os.environ.get(
                        "POSTGRES_CONNECTION_RETRY_BACKOFF_MAX",
                        config.get(
                            "postgres",
                            "connection_retry_backoff_max",
-                            fallback=5.0,
+                            fallback=30.0,
                        ),
                    )
                ),
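
Reviewer note: for orientation, a reconnect loop consistent with these capped settings might look like the sketch below. The helper name, its signature, and the doubling backoff are illustrative assumptions, not the repository's actual retry implementation.

import asyncio

# Illustrative sketch only: retry an async connection factory using values shaped like
# connection_retry_attempts / connection_retry_backoff / connection_retry_backoff_max.
async def connect_with_retry(connect, attempts: int, backoff: float, backoff_max: float):
    delay = backoff
    for attempt in range(1, attempts + 1):
        try:
            return await connect()
        except (ConnectionError, OSError):
            if attempt == attempts:
                raise  # retries exhausted; surface the last error
            await asyncio.sleep(delay)           # wait out a possible HA switchover
            delay = min(delay * 2, backoff_max)  # exponential backoff, capped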

View File

@@ -584,7 +584,7 @@ export default function DocumentManager() {
  // Utility function to create timeout wrapper for API calls
  const withTimeout = useCallback((
    promise: Promise<any>,
-    timeoutMs: number = 30000,
+    timeoutMs: number = 30000, // Default 30s timeout for normal operations
    errorMsg: string = 'Request timeout'
  ): Promise<any> => {
    const timeoutPromise = new Promise((_, reject) => {

@@ -676,7 +676,8 @@ export default function DocumentManager() {
  // Intelligent refresh function: handles all boundary cases
  const handleIntelligentRefresh = useCallback(async (
    targetPage?: number, // Optional target page, defaults to current page
-    resetToFirst?: boolean // Whether to force reset to first page
+    resetToFirst?: boolean, // Whether to force reset to first page
+    customTimeout?: number // Optional custom timeout in milliseconds (uses withTimeout default if not provided)
  ) => {
    try {
      if (!isMountedRef.current) return;

@@ -694,10 +695,10 @@ export default function DocumentManager() {
        sort_direction: sortDirection
      };

-      // Use timeout wrapper for the API call
+      // Use timeout wrapper for the API call (uses customTimeout if provided, otherwise withTimeout default)
      const response = await withTimeout(
        getDocumentsPaginated(request),
-        30000, // 30 second timeout
+        customTimeout, // Pass undefined to use default 30s, or explicit timeout for special cases
        'Document fetch timeout'
      );

@@ -717,7 +718,7 @@ export default function DocumentManager() {
        const lastPageResponse = await withTimeout(
          getDocumentsPaginated(lastPageRequest),
-          30000,
+          customTimeout, // Use same timeout for consistency
          'Document fetch timeout'
        );

@@ -847,7 +848,10 @@ export default function DocumentManager() {
      // Reset health check timer with 1 second delay to avoid race condition
      useBackendState.getState().resetHealthCheckTimerDelayed(1000);

-      // Start fast refresh with 2-second interval immediately after scan
+      // Perform immediate refresh with 90s timeout after scan (tolerates PostgreSQL switchover)
+      await handleIntelligentRefresh(undefined, false, 90000);
+
+      // Start fast refresh with 2-second interval after initial refresh
      startPollingInterval(2000);

      // Set recovery timer to restore normal polling interval after 15 seconds

@@ -865,7 +869,7 @@ export default function DocumentManager() {
        toast.error(t('documentPanel.documentManager.errors.scanFailed', { error: errorMessage(err) }));
      }
    }
-  }, [t, startPollingInterval, currentTab, health, statusCounts])
+  }, [t, startPollingInterval, currentTab, health, statusCounts, handleIntelligentRefresh])

  // Handle page size change - update state and save to store
  const handlePageSizeChange = useCallback((newPageSize: number) => {

@@ -1184,7 +1188,7 @@ export default function DocumentManager() {
          ) : !isSelectionMode ? (
            <ClearDocumentsDialog onDocumentsCleared={handleDocumentsCleared} />
          ) : null}
-          <UploadDocumentsDialog onDocumentsUploaded={fetchDocuments} />
+          <UploadDocumentsDialog onDocumentsUploaded={() => handleIntelligentRefresh(undefined, false, 120000)} />
          <PipelineStatusDialog
            open={showPipelineStatus}
            onOpenChange={setShowPipelineStatus}

View File

@@ -31,7 +31,13 @@ class TestPostgresRetryIntegration:
    @pytest.fixture
    def db_config(self):
-        """Load database configuration from environment variables."""
+        """Load database configuration from environment variables.
+
+        Uses new HA-optimized defaults that match postgres_impl.py ClientManager.get_config():
+        - 10 retry attempts (up from 3)
+        - 3.0s initial backoff (up from 0.5s)
+        - 30.0s max backoff (up from 5.0s)
+        """
        return {
            "host": os.getenv("POSTGRES_HOST", "localhost"),
            "port": int(os.getenv("POSTGRES_PORT", "5432")),

@@ -40,31 +46,31 @@ class TestPostgresRetryIntegration:
            "database": os.getenv("POSTGRES_DATABASE", "postgres"),
            "workspace": os.getenv("POSTGRES_WORKSPACE", "test_retry"),
            "max_connections": int(os.getenv("POSTGRES_MAX_CONNECTIONS", "10")),
-            # Connection retry configuration
+            # Connection retry configuration - mirrors postgres_impl.py ClientManager.get_config()
+            # NEW DEFAULTS optimized for HA deployments
            "connection_retry_attempts": min(
-                10, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "3"))
+                100,
+                int(os.getenv("POSTGRES_CONNECTION_RETRIES", "10")),  # 3 → 10
            ),
            "connection_retry_backoff": min(
-                5.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "0.5"))
+                300.0,
+                float(
+                    os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "3.0")
+                ),  # 0.5 → 3.0
            ),
            "connection_retry_backoff_max": min(
-                60.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "5.0"))
+                600.0,
+                float(
+                    os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "30.0")
+                ),  # 5.0 → 30.0
            ),
            "pool_close_timeout": min(
                30.0, float(os.getenv("POSTGRES_POOL_CLOSE_TIMEOUT", "5.0"))
            ),
        }

-    @pytest.fixture
-    def test_env(self, monkeypatch):
-        """Set up test environment variables for retry configuration."""
-        monkeypatch.setenv("POSTGRES_CONNECTION_RETRIES", "3")
-        monkeypatch.setenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "0.5")
-        monkeypatch.setenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "2.0")
-        monkeypatch.setenv("POSTGRES_POOL_CLOSE_TIMEOUT", "3.0")
-
    @pytest.mark.asyncio
-    async def test_real_connection_success(self, db_config, test_env):
+    async def test_real_connection_success(self, db_config):
        """
        Test successful connection to real PostgreSQL database.

@@ -100,11 +106,12 @@ class TestPostgresRetryIntegration:
            await db.pool.close()

    @pytest.mark.asyncio
-    async def test_simulated_transient_error_with_real_db(self, db_config, test_env):
+    async def test_simulated_transient_error_with_real_db(self, db_config):
        """
        Test retry mechanism with simulated transient errors on real database.

        Simulates connection failures on first 2 attempts, then succeeds.
+        Uses new HA defaults (10 retries, 3s backoff).
        """
        print("\n" + "=" * 80)
        print("INTEGRATION TEST 2: Simulated Transient Errors")

@@ -155,12 +162,13 @@ class TestPostgresRetryIntegration:
            await db.pool.close()

    @pytest.mark.asyncio
-    async def test_query_retry_with_real_db(self, db_config, test_env):
+    async def test_query_retry_with_real_db(self, db_config):
        """
        Test query-level retry with simulated connection issues.

        Tests that queries retry on transient failures by simulating
        a temporary database unavailability.
+        Uses new HA defaults (10 retries, 3s backoff).
        """
        print("\n" + "=" * 80)
        print("INTEGRATION TEST 3: Query-Level Retry")

@@ -193,11 +201,12 @@ class TestPostgresRetryIntegration:
            await db.pool.close()

    @pytest.mark.asyncio
-    async def test_concurrent_queries_with_real_db(self, db_config, test_env):
+    async def test_concurrent_queries_with_real_db(self, db_config):
        """
        Test concurrent queries to validate thread safety and connection pooling.

        Runs multiple concurrent queries to ensure no deadlocks or race conditions.
+        Uses new HA defaults (10 retries, 3s backoff).
        """
        print("\n" + "=" * 80)
        print("INTEGRATION TEST 4: Concurrent Queries")

@@ -243,9 +252,10 @@ class TestPostgresRetryIntegration:
            await db.pool.close()

    @pytest.mark.asyncio
-    async def test_pool_close_timeout_real(self, db_config, test_env):
+    async def test_pool_close_timeout_real(self, db_config):
        """
        Test pool close timeout protection with real database.
+        Uses new HA defaults (10 retries, 3s backoff).
        """
        print("\n" + "=" * 80)
        print("INTEGRATION TEST 5: Pool Close Timeout")