Optimize Postgres retry logic for HA switchover

- Increase default retries and backoff
- Raise connection retry parameter caps
- Update env example with HA defaults
- Extend frontend timeouts for updates
- Update integration test limits
Author: yangdx
Date: 2025-12-31 16:03:46 +08:00
parent 62c71a9128
commit 484e441d0c
4 changed files with 37 additions and 24 deletions

View File

@@ -403,13 +403,22 @@ POSTGRES_VCHORDRQ_PROBES=
 POSTGRES_VCHORDRQ_EPSILON=1.9
 ### PostgreSQL Connection Retry Configuration (Network Robustness)
-### Number of retry attempts (1-10, default: 3)
-### Initial retry backoff in seconds (0.1-5.0, default: 0.5)
-### Maximum retry backoff in seconds (backoff-60.0, default: 5.0)
+### NEW DEFAULTS (v1.4.10+): Optimized for HA deployments with ~30s switchover time
+### These defaults provide out-of-the-box support for PostgreSQL High Availability setups
+###
+### Number of retry attempts (1-100, default: 10)
+### - Default 10 attempts allows ~225s total retry time (sufficient for most HA scenarios)
+### - For extreme cases: increase up to 20-50
+### Initial retry backoff in seconds (0.1-300.0, default: 3.0)
+### - Default 3.0s provides reasonable initial delay for switchover detection
+### - For faster recovery: decrease to 1.0-2.0
+### Maximum retry backoff in seconds (must be >= backoff, max: 600.0, default: 30.0)
+### - Default 30.0s matches typical switchover completion time
+### - For longer switchovers: increase to 60-90
 ### Connection pool close timeout in seconds (1.0-30.0, default: 5.0)
-# POSTGRES_CONNECTION_RETRIES=3
-# POSTGRES_CONNECTION_RETRY_BACKOFF=0.5
-# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=5.0
+# POSTGRES_CONNECTION_RETRIES=10
+# POSTGRES_CONNECTION_RETRY_BACKOFF=3.0
+# POSTGRES_CONNECTION_RETRY_BACKOFF_MAX=30.0
 # POSTGRES_POOL_CLOSE_TIMEOUT=5.0
 ### PostgreSQL SSL Configuration (Optional)
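
Note: the "~225s total retry time" figure above can be sanity-checked with a minimal sketch. This assumes the delay doubles from POSTGRES_CONNECTION_RETRY_BACKOFF on each attempt and is capped at POSTGRES_CONNECTION_RETRY_BACKOFF_MAX; the actual backoff curve in the client code may differ.

```python
# Hypothetical sketch: estimate the total wait for the new HA defaults,
# assuming the delay doubles per attempt and is capped at backoff_max.
def retry_schedule(attempts: int = 10, backoff: float = 3.0, backoff_max: float = 30.0) -> list[float]:
    delays, delay = [], backoff
    for _ in range(attempts - 1):  # no sleep after the final attempt
        delays.append(min(delay, backoff_max))
        delay *= 2
    return delays

delays = retry_schedule()
print(delays)       # [3.0, 6.0, 12.0, 24.0, 30.0, 30.0, 30.0, 30.0, 30.0]
print(sum(delays))  # 195.0
```

Under these assumptions the sleeps alone total 195s; each attempt's own connection timeout accounts for the remaining margin up to the documented ~225s.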

View File

@@ -1797,34 +1797,34 @@ class ClientManager:
 ),
 # Connection retry configuration
 "connection_retry_attempts": min(
-    10,
+    100,  # Increased from 10 to 100 for long-running operations
     int(
         os.environ.get(
             "POSTGRES_CONNECTION_RETRIES",
-            config.get("postgres", "connection_retries", fallback=3),
+            config.get("postgres", "connection_retries", fallback=10),
         )
     ),
 ),
 "connection_retry_backoff": min(
-    5.0,
+    300.0,  # Increased from 5.0 to 300.0 (5 minutes) for PG switchover scenarios
     float(
         os.environ.get(
             "POSTGRES_CONNECTION_RETRY_BACKOFF",
             config.get(
-                "postgres", "connection_retry_backoff", fallback=0.5
+                "postgres", "connection_retry_backoff", fallback=3.0
             ),
         )
     ),
 ),
 "connection_retry_backoff_max": min(
-    60.0,
+    600.0,  # Increased from 60.0 to 600.0 (10 minutes) for PG switchover scenarios
     float(
         os.environ.get(
             "POSTGRES_CONNECTION_RETRY_BACKOFF_MAX",
             config.get(
                 "postgres",
                 "connection_retry_backoff_max",
-                fallback=5.0,
+                fallback=30.0,
             ),
         )
     ),
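
Note: for orientation, this is roughly how a reconnect loop consumes the capped parameters above. A minimal sketch, not the actual postgres_impl.py implementation: connect_once is an illustrative stand-in for pool creation, and real code would likely catch driver-specific errors rather than bare OSError.

```python
import asyncio
from typing import Any, Awaitable, Callable

async def connect_with_retry(
    connect_once: Callable[[], Awaitable[Any]],
    attempts: int = 10,
    backoff: float = 3.0,
    backoff_max: float = 30.0,
) -> Any:
    delay = backoff
    for attempt in range(1, attempts + 1):
        try:
            return await connect_once()
        except OSError:
            if attempt == attempts:
                raise  # retry budget exhausted; surface the failure
            await asyncio.sleep(delay)
            delay = min(delay * 2, backoff_max)  # exponential backoff, capped
```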

View File

@@ -584,7 +584,7 @@ export default function DocumentManager() {
 // Utility function to create timeout wrapper for API calls
 const withTimeout = useCallback((
   promise: Promise<any>,
-  timeoutMs: number = 30000,
+  timeoutMs: number = 30000, // Default 30s timeout for normal operations
   errorMsg: string = 'Request timeout'
 ): Promise<any> => {
   const timeoutPromise = new Promise((_, reject) => {
@@ -676,7 +676,8 @@ export default function DocumentManager() {
 // Intelligent refresh function: handles all boundary cases
 const handleIntelligentRefresh = useCallback(async (
   targetPage?: number, // Optional target page, defaults to current page
-  resetToFirst?: boolean // Whether to force reset to first page
+  resetToFirst?: boolean, // Whether to force reset to first page
+  customTimeout?: number // Optional custom timeout in milliseconds (uses withTimeout default if not provided)
 ) => {
   try {
     if (!isMountedRef.current) return;
@@ -694,10 +695,10 @@ export default function DocumentManager() {
   sort_direction: sortDirection
 };
-// Use timeout wrapper for the API call
+// Use timeout wrapper for the API call (uses customTimeout if provided, otherwise withTimeout default)
 const response = await withTimeout(
   getDocumentsPaginated(request),
-  30000, // 30 second timeout
+  customTimeout, // Pass undefined to use default 30s, or explicit timeout for special cases
   'Document fetch timeout'
 );
@@ -717,7 +718,7 @@ export default function DocumentManager() {
 const lastPageResponse = await withTimeout(
   getDocumentsPaginated(lastPageRequest),
-  30000,
+  customTimeout, // Use same timeout for consistency
   'Document fetch timeout'
 );
@@ -847,7 +848,10 @@ export default function DocumentManager() {
 // Reset health check timer with 1 second delay to avoid race condition
 useBackendState.getState().resetHealthCheckTimerDelayed(1000);
-// Start fast refresh with 2-second interval immediately after scan
+// Perform immediate refresh with 90s timeout after scan (tolerates PostgreSQL switchover)
+await handleIntelligentRefresh(undefined, false, 90000);
+// Start fast refresh with 2-second interval after initial refresh
 startPollingInterval(2000);
 // Set recovery timer to restore normal polling interval after 15 seconds
@@ -865,7 +869,7 @@ export default function DocumentManager() {
       toast.error(t('documentPanel.documentManager.errors.scanFailed', { error: errorMessage(err) }));
     }
   }
-}, [t, startPollingInterval, currentTab, health, statusCounts])
+}, [t, startPollingInterval, currentTab, health, statusCounts, handleIntelligentRefresh])
 // Handle page size change - update state and save to store
 const handlePageSizeChange = useCallback((newPageSize: number) => {
@@ -1184,7 +1188,7 @@ export default function DocumentManager() {
 ) : !isSelectionMode ? (
   <ClearDocumentsDialog onDocumentsCleared={handleDocumentsCleared} />
 ) : null}
-<UploadDocumentsDialog onDocumentsUploaded={fetchDocuments} />
+<UploadDocumentsDialog onDocumentsUploaded={() => handleIntelligentRefresh(undefined, false, 120000)} />
 <PipelineStatusDialog
   open={showPipelineStatus}
   onOpenChange={setShowPipelineStatus}
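
Note: a rough cross-check of the 90000 ms and 120000 ms choices against the backend retry schedule, under the same doubling-with-cap assumption as the earlier sketch:

```python
# How many backend retry delays fit inside each frontend timeout?
delays = [3.0, 6.0, 12.0, 24.0, 30.0, 30.0, 30.0, 30.0, 30.0]
for budget_ms in (30000, 90000, 120000):
    budget, elapsed, covered = budget_ms / 1000, 0.0, 0
    for d in delays:
        if elapsed + d > budget:
            break
        elapsed += d
        covered += 1
    print(f"{budget_ms} ms covers the first {covered} retry delays")
# 30000 ms covers 3; 90000 ms covers 5; 120000 ms covers 6
```

A 90s budget thus outlasts the first five backend delays (75s of sleep), which comfortably spans a ~30s switchover.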

View File

@@ -40,15 +40,15 @@ class TestPostgresRetryIntegration:
"database": os.getenv("POSTGRES_DATABASE", "postgres"), "database": os.getenv("POSTGRES_DATABASE", "postgres"),
"workspace": os.getenv("POSTGRES_WORKSPACE", "test_retry"), "workspace": os.getenv("POSTGRES_WORKSPACE", "test_retry"),
"max_connections": int(os.getenv("POSTGRES_MAX_CONNECTIONS", "10")), "max_connections": int(os.getenv("POSTGRES_MAX_CONNECTIONS", "10")),
# Connection retry configuration # Connection retry configuration - mirrors postgres_impl.py ClientManager.get_config()
"connection_retry_attempts": min( "connection_retry_attempts": min(
10, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "3")) 100, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "3"))
), ),
"connection_retry_backoff": min( "connection_retry_backoff": min(
5.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "0.5")) 300.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF", "0.5"))
), ),
"connection_retry_backoff_max": min( "connection_retry_backoff_max": min(
60.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "5.0")) 600.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "5.0"))
), ),
"pool_close_timeout": min( "pool_close_timeout": min(
30.0, float(os.getenv("POSTGRES_POOL_CLOSE_TIMEOUT", "5.0")) 30.0, float(os.getenv("POSTGRES_POOL_CLOSE_TIMEOUT", "5.0"))
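
Note: the min() caps above clamp oversized values rather than rejecting them, so an out-of-range environment setting degrades gracefully. A quick illustration with hypothetical values:

```python
import os

# Values beyond the cap are silently reduced, not rejected.
os.environ["POSTGRES_CONNECTION_RETRIES"] = "250"
attempts = min(100, int(os.getenv("POSTGRES_CONNECTION_RETRIES", "3")))
print(attempts)  # 100 -- clamped to the raised cap

os.environ["POSTGRES_CONNECTION_RETRY_BACKOFF_MAX"] = "900.0"
backoff_max = min(600.0, float(os.getenv("POSTGRES_CONNECTION_RETRY_BACKOFF_MAX", "5.0")))
print(backoff_max)  # 600.0
```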