Optimize Postgres retry logic for HA switchover
- Increase default retries and backoff
- Raise connection retry parameter caps
- Update env example with HA defaults
- Extend frontend timeouts for updates
- Update integration test limits
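For context, the three retry parameters feed a reconnect loop that retries pool creation with exponential backoff. A minimal sketch of such a loop is shown below; the function name connect_with_retry and the doubling-with-cap schedule are assumptions for illustration, not the actual postgres_impl.py code:

    import asyncio

    async def connect_with_retry(create_pool, attempts=10, backoff=3.0, backoff_max=30.0):
        """Retry pool creation with exponential backoff, capped at backoff_max (sketch)."""
        delay = backoff
        for attempt in range(1, attempts + 1):
            try:
                return await create_pool()
            except Exception:
                if attempt == attempts:
                    raise  # out of retries, surface the error
                await asyncio.sleep(delay)
                delay = min(delay * 2, backoff_max)  # double the wait, up to the cap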
env.example
@@ -403,13 +403,22 @@ POSTGRES_VCHORDRQ_PROBES=
 POSTGRES_VCHORDRQ_EPSILON=1.9
 
 ### PostgreSQL Connection Retry Configuration (Network Robustness)
-### Number of retry attempts (1-10, default: 3)
-### Initial retry backoff in seconds (0.1-5.0, default: 0.5)
-### Maximum retry backoff in seconds (backoff-60.0, default: 5.0)
-# POSTGRES_CONNECTION_RETRIES=3
-# POSTGRES_CONNECTION_RETRY_BACKOFF=0.5
-# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=5.0
+### NEW DEFAULTS (v1.4.10+): Optimized for HA deployments with ~30s switchover time
+### These defaults provide out-of-the-box support for PostgreSQL High Availability setups
+###
+### Number of retry attempts (1-100, default: 10)
+### - Default 10 attempts allows ~225s total retry time (sufficient for most HA scenarios)
+### - For extreme cases: increase up to 20-50
+### Initial retry backoff in seconds (0.1-300.0, default: 3.0)
+### - Default 3.0s provides reasonable initial delay for switchover detection
+### - For faster recovery: decrease to 1.0-2.0
+### Maximum retry backoff in seconds (must be >= backoff, max: 600.0, default: 30.0)
+### - Default 30.0s matches typical switchover completion time
+### - For longer switchovers: increase to 60-90
+### Connection pool close timeout in seconds (1.0-30.0, default: 5.0)
+# POSTGRES_CONNECTION_RETRIES=10
+# POSTGRES_CONNECTION_RETRY_BACKOFF=3.0
+# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=30.0
+# POSTGRES_POOL_CLOSE_TIMEOUT=5.0
 
 ### PostgreSQL SSL Configuration (Optional)
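The "~225s total retry time" comment can be checked by summing the backoff delays for 10 attempts, assuming the doubling-with-cap schedule sketched above:

    # Rough total retry window for the new defaults (assumes the delay doubles
    # each attempt and is capped at the maximum backoff):
    delays = []
    d = 3.0
    for _ in range(10):
        delays.append(d)
        d = min(d * 2, 30.0)
    print(delays)       # [3.0, 6.0, 12.0, 24.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0]
    print(sum(delays))  # 225.0 seconds, matching the "~225s total retry time" note above

The same defaults and caps are applied in ClientManager.get_config() in the next hunk.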
@@ -1797,34 +1797,34 @@ class ClientManager:
             ),
             # Connection retry configuration
             "connection_retry_attempts": min(
-                10,
+                100,  # Increased from 10 to 100 for long-running operations
                 int(
                     os.environ.get(
                         "POSTGRES_CONNECTION_RETRIES",
-                        config.get("postgres", "connection_retries", fallback=3),
+                        config.get("postgres", "connection_retries", fallback=10),
                     )
                 ),
             ),
             "connection_retry_backoff": min(
-                5.0,
+                300.0,  # Increased from 5.0 to 300.0 (5 minutes) for PG switchover scenarios
                 float(
                     os.environ.get(
                         "POSTGRES_CONNECTION_RETRY_BACKOFF",
                         config.get(
-                            "postgres", "connection_retry_backoff", fallback=0.5
+                            "postgres", "connection_retry_backoff", fallback=3.0
                         ),
                     )
                 ),
             ),
             "connection_retry_backoff_max": min(
-                60.0,
+                600.0,  # Increased from 60.0 to 600.0 (10 minutes) for PG switchover scenarios
                 float(
                     os.environ.get(
                         "POSTGRES_CONNECTION_RETRY_BACKOFF_MAX",
                         config.get(
                             "postgres",
                             "connection_retry_backoff_max",
-                            fallback=5.0,
+                            fallback=30.0,
                         ),
                     )
                 ),

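Note the pattern here: min(cap, value_from_env_or_config) makes the first argument a hard upper bound, so a value above the cap is silently clamped rather than rejected. A quick illustration with a hypothetical over-the-cap value:

    import os

    os.environ["POSTGRES_CONNECTION_RETRIES"] = "500"  # hypothetical value above the cap
    retries = min(100, int(os.environ.get("POSTGRES_CONNECTION_RETRIES", "10")))
    print(retries)  # 100 -- the raised cap, not the requested 500

The remaining hunks extend the DocumentManager frontend timeouts and the integration test limits to match.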
@@ -584,7 +584,7 @@ export default function DocumentManager() {
   // Utility function to create timeout wrapper for API calls
   const withTimeout = useCallback((
     promise: Promise<any>,
-    timeoutMs: number = 30000,
+    timeoutMs: number = 30000, // Default 30s timeout for normal operations
     errorMsg: string = 'Request timeout'
   ): Promise<any> => {
     const timeoutPromise = new Promise((_, reject) => {
@@ -676,7 +676,8 @@ export default function DocumentManager() {
   // Intelligent refresh function: handles all boundary cases
   const handleIntelligentRefresh = useCallback(async (
     targetPage?: number, // Optional target page, defaults to current page
-    resetToFirst?: boolean // Whether to force reset to first page
+    resetToFirst?: boolean, // Whether to force reset to first page
+    customTimeout?: number // Optional custom timeout in milliseconds (uses withTimeout default if not provided)
   ) => {
     try {
       if (!isMountedRef.current) return;
@@ -694,10 +695,10 @@ export default function DocumentManager() {
         sort_direction: sortDirection
       };
 
-      // Use timeout wrapper for the API call
+      // Use timeout wrapper for the API call (uses customTimeout if provided, otherwise withTimeout default)
       const response = await withTimeout(
         getDocumentsPaginated(request),
-        30000, // 30 second timeout
+        customTimeout, // Pass undefined to use default 30s, or explicit timeout for special cases
         'Document fetch timeout'
       );
 
@@ -717,7 +718,7 @@ export default function DocumentManager() {
 
       const lastPageResponse = await withTimeout(
         getDocumentsPaginated(lastPageRequest),
-        30000,
+        customTimeout, // Use same timeout for consistency
         'Document fetch timeout'
       );
 
@@ -847,7 +848,10 @@ export default function DocumentManager() {
       // Reset health check timer with 1 second delay to avoid race condition
       useBackendState.getState().resetHealthCheckTimerDelayed(1000);
 
-      // Start fast refresh with 2-second interval immediately after scan
+      // Perform immediate refresh with 90s timeout after scan (tolerates PostgreSQL switchover)
+      await handleIntelligentRefresh(undefined, false, 90000);
+
+      // Start fast refresh with 2-second interval after initial refresh
       startPollingInterval(2000);
 
       // Set recovery timer to restore normal polling interval after 15 seconds
@@ -865,7 +869,7 @@ export default function DocumentManager() {
       toast.error(t('documentPanel.documentManager.errors.scanFailed', { error: errorMessage(err) }));
     }
   }
-  }, [t, startPollingInterval, currentTab, health, statusCounts])
+  }, [t, startPollingInterval, currentTab, health, statusCounts, handleIntelligentRefresh])
 
   // Handle page size change - update state and save to store
   const handlePageSizeChange = useCallback((newPageSize: number) => {
@@ -1184,7 +1188,7 @@ export default function DocumentManager() {
         ) : !isSelectionMode ? (
           <ClearDocumentsDialog onDocumentsCleared={handleDocumentsCleared} />
         ) : null}
-        <UploadDocumentsDialog onDocumentsUploaded={fetchDocuments} />
+        <UploadDocumentsDialog onDocumentsUploaded={() => handleIntelligentRefresh(undefined, false, 120000)} />
         <PipelineStatusDialog
           open={showPipelineStatus}
           onOpenChange={setShowPipelineStatus}
@@ -40,15 +40,15 @@ class TestPostgresRetryIntegration:
             "database": os.getenv("POSTGRES_DATABASE", "postgres"),
             "workspace": os.getenv("POSTGRES_WORKSPACE", "test_retry"),
             "max_connections": int(os.getenv("POSTGRES_MAX_CONNECTIONS", "10")),
-            # Connection retry configuration
+            # Connection retry configuration - mirrors postgres_impl.py ClientManager.get_config()
             "connection_retry_attempts": min(
-                10, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "3"))
+                100, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "3"))
             ),
             "connection_retry_backoff": min(
-                5.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "0.5"))
+                300.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "0.5"))
             ),
             "connection_retry_backoff_max": min(
-                60.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "5.0"))
+                600.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "5.0"))
             ),
             "pool_close_timeout": min(
                 30.0, float(os.getenv("POSTGRES_POOL_CLOSE_TIMEOUT", "5.0"))