""" Error Handling System for Discord Voice Chat Quote Bot Implements comprehensive error handling, API fallbacks, circuit breakers, retry mechanisms, and resilience patterns for robust operation. """ import asyncio import functools import json import logging import time from dataclasses import dataclass from datetime import datetime, timedelta, timezone from enum import Enum from typing import Any, Callable, Dict, List, Optional logger = logging.getLogger(__name__) class ErrorSeverity(Enum): """Error severity levels""" LOW = "low" # Minor issues, no user impact MEDIUM = "medium" # Some functionality affected HIGH = "high" # Major functionality impacted CRITICAL = "critical" # System-wide failures class ErrorCategory(Enum): """Error categories for classification""" API_ERROR = "api_error" DATABASE_ERROR = "database_error" NETWORK_ERROR = "network_error" VALIDATION_ERROR = "validation_error" AUTHENTICATION_ERROR = "authentication_error" PERMISSION_ERROR = "permission_error" RESOURCE_ERROR = "resource_error" TIMEOUT_ERROR = "timeout_error" UNKNOWN_ERROR = "unknown_error" class CircuitState(Enum): """Circuit breaker states""" CLOSED = "closed" # Normal operation OPEN = "open" # Failing, requests blocked HALF_OPEN = "half_open" # Testing if service recovered @dataclass class ErrorContext: """Context information for error handling""" error: Exception error_id: str severity: ErrorSeverity category: ErrorCategory component: str operation: str user_id: Optional[int] = None guild_id: Optional[int] = None metadata: Optional[Dict[str, Any]] = None timestamp: Optional[datetime] = None def __post_init__(self): if self.timestamp is None: self.timestamp = datetime.now(timezone.utc) @dataclass class RetryConfig: """Configuration for retry mechanisms""" max_attempts: int = 3 base_delay: float = 1.0 max_delay: float = 60.0 exponential_base: float = 2.0 jitter: bool = True retry_on: Optional[List[type]] = None @dataclass class CircuitBreakerConfig: """Configuration for circuit breaker""" failure_threshold: int = 5 recovery_timeout: float = 60.0 expected_exception: type = Exception class ErrorHandler: """ Comprehensive error handling system Features: - Error classification and severity assessment - Automatic retry with exponential backoff - Circuit breaker pattern for failing services - API fallback mechanisms - Error aggregation and reporting - User-friendly error messages - Performance impact monitoring - Recovery mechanisms """ def __init__(self): # Error tracking self.error_counts: Dict[str, int] = {} self.error_history: List[ErrorContext] = [] self.circuit_breakers: Dict[str, "CircuitBreaker"] = {} # Configuration self.max_error_history = 1000 self.error_aggregation_window = timedelta(minutes=5) self.alert_threshold = 10 # errors per window # Fallback strategies self.fallback_strategies: Dict[str, Callable] = {} # Statistics self.total_errors = 0 self.handled_errors = 0 self.unhandled_errors = 0 self._initialized = False async def initialize(self): """Initialize error handling system""" if self._initialized: return try: logger.info("Initializing error handling system...") # Register default fallback strategies self._register_default_fallbacks() # Setup circuit breakers for external services self._setup_circuit_breakers() self._initialized = True logger.info("Error handling system initialized successfully") except Exception as e: logger.error(f"Failed to initialize error handling system: {e}") raise def handle_error( self, error: Exception, component: str, operation: str, severity: ErrorSeverity = ErrorSeverity.MEDIUM, user_id: Optional[int] = None, guild_id: Optional[int] = None, metadata: Optional[Dict[str, Any]] = None, ) -> ErrorContext: """Handle an error with full context""" try: # Generate unique error ID error_id = f"{component}_{operation}_{int(time.time())}" # Classify error category = self._classify_error(error) # Create error context error_context = ErrorContext( error=error, error_id=error_id, severity=severity, category=category, component=component, operation=operation, user_id=user_id, guild_id=guild_id, metadata=metadata or {}, ) # Record error self._record_error(error_context) # Log error with appropriate level self._log_error(error_context) # Update statistics self.total_errors += 1 self.handled_errors += 1 return error_context except Exception as handling_error: logger.critical(f"Error in error handler: {handling_error}") self.unhandled_errors += 1 raise def retry_with_backoff(self, config: Optional[RetryConfig] = None): """Decorator for retry with exponential backoff""" if config is None: config = RetryConfig() def decorator(func): @functools.wraps(func) async def wrapper(*args, **kwargs): last_exception = None for attempt in range(config.max_attempts): try: return await func(*args, **kwargs) except Exception as e: last_exception = e # Check if we should retry this exception if config.retry_on and not any( isinstance(e, exc_type) for exc_type in config.retry_on ): raise # Don't retry on last attempt if attempt == config.max_attempts - 1: break # Calculate delay delay = min( config.base_delay * (config.exponential_base**attempt), config.max_delay, ) # Add jitter if enabled if config.jitter: import random delay *= 0.5 + random.random() * 0.5 logger.warning( f"Retry attempt {attempt + 1}/{config.max_attempts} for {func.__name__} after {delay:.2f}s: {e}" ) await asyncio.sleep(delay) # All retries exhausted if last_exception is not None: self.handle_error( last_exception, component=func.__module__ or "unknown", operation=func.__name__, severity=ErrorSeverity.HIGH, ) raise last_exception else: # This shouldn't happen, but handle the case raise RuntimeError( f"Function {func.__name__} failed but no exception was captured" ) return wrapper return decorator def with_circuit_breaker( self, service_name: str, config: Optional[CircuitBreakerConfig] = None ): """Decorator for circuit breaker pattern""" if config is None: config = CircuitBreakerConfig() if service_name not in self.circuit_breakers: self.circuit_breakers[service_name] = CircuitBreaker(service_name, config) circuit_breaker = self.circuit_breakers[service_name] def decorator(func): @functools.wraps(func) async def wrapper(*args, **kwargs): return await circuit_breaker.call(func, *args, **kwargs) return wrapper return decorator def with_fallback(self, fallback_strategy: str): """Decorator to apply fallback strategy on error""" def decorator(func): @functools.wraps(func) async def wrapper(*args, **kwargs): try: return await func(*args, **kwargs) except Exception as e: # Handle error self.handle_error( e, component=func.__module__ or "unknown", operation=func.__name__, severity=ErrorSeverity.MEDIUM, ) # Try fallback if fallback_strategy in self.fallback_strategies: try: fallback_func = self.fallback_strategies[fallback_strategy] logger.info( f"Applying fallback strategy '{fallback_strategy}' for {func.__name__}" ) return await fallback_func(*args, **kwargs) except Exception as fallback_error: logger.error( f"Fallback strategy '{fallback_strategy}' failed: {fallback_error}" ) # Re-raise original error if no fallback or fallback failed raise return wrapper return decorator def get_user_friendly_message(self, error_context: ErrorContext) -> str: """Generate user-friendly error message""" try: category_messages = { ErrorCategory.API_ERROR: "There was a problem connecting to external services. Please try again in a moment.", ErrorCategory.DATABASE_ERROR: "There was a temporary database issue. Your data is safe, please try again.", ErrorCategory.NETWORK_ERROR: "Network connection issue detected. Please check your connection and try again.", ErrorCategory.VALIDATION_ERROR: "The information provided is not valid. Please check your input and try again.", ErrorCategory.AUTHENTICATION_ERROR: "Authentication failed. Please check your permissions.", ErrorCategory.PERMISSION_ERROR: "You don't have permission to perform this action.", ErrorCategory.RESOURCE_ERROR: "System resources are temporarily unavailable. Please try again later.", ErrorCategory.TIMEOUT_ERROR: "The operation took too long to complete. Please try again.", ErrorCategory.UNKNOWN_ERROR: "An unexpected error occurred. Our team has been notified.", } base_message = category_messages.get( error_context.category, "An error occurred. Please try again." ) # Add error ID for support if error_context.severity in [ErrorSeverity.HIGH, ErrorSeverity.CRITICAL]: base_message += f" (Error ID: {error_context.error_id})" return base_message except Exception as e: logger.error(f"Error generating user-friendly message: {e}") return "An unexpected error occurred. Please try again." def _classify_error(self, error: Exception) -> ErrorCategory: """Classify error by type and content""" try: type(error).__name__ error_message = str(error).lower() # Classification logic if "connection" in error_message or "network" in error_message: return ErrorCategory.NETWORK_ERROR elif "database" in error_message or "sql" in error_message: return ErrorCategory.DATABASE_ERROR elif "api" in error_message or "http" in error_message: return ErrorCategory.API_ERROR elif "timeout" in error_message or "timed out" in error_message: return ErrorCategory.TIMEOUT_ERROR elif "permission" in error_message or "forbidden" in error_message: return ErrorCategory.PERMISSION_ERROR elif "authentication" in error_message or "unauthorized" in error_message: return ErrorCategory.AUTHENTICATION_ERROR elif "validation" in error_message or "invalid" in error_message: return ErrorCategory.VALIDATION_ERROR elif "memory" in error_message or "resource" in error_message: return ErrorCategory.RESOURCE_ERROR else: return ErrorCategory.UNKNOWN_ERROR except Exception: return ErrorCategory.UNKNOWN_ERROR def _record_error(self, error_context: ErrorContext): """Record error for tracking and analysis""" try: # Add to history self.error_history.append(error_context) # Trim history if too long if len(self.error_history) > self.max_error_history: self.error_history = self.error_history[-self.max_error_history :] # Update counts key = f"{error_context.component}_{error_context.category.value}" self.error_counts[key] = self.error_counts.get(key, 0) + 1 except Exception as e: logger.error(f"Failed to record error: {e}") def _log_error(self, error_context: ErrorContext): """Log error with appropriate level""" try: log_message = f"[{error_context.error_id}] {error_context.component}.{error_context.operation}: {error_context.error}" if error_context.metadata: log_message += f" | Metadata: {json.dumps(error_context.metadata)}" if error_context.severity == ErrorSeverity.CRITICAL: logger.critical(log_message, exc_info=error_context.error) elif error_context.severity == ErrorSeverity.HIGH: logger.error(log_message, exc_info=error_context.error) elif error_context.severity == ErrorSeverity.MEDIUM: logger.warning(log_message) else: logger.info(log_message) except Exception as e: logger.error(f"Failed to log error: {e}") def _register_default_fallbacks(self): """Register default fallback strategies""" try: # API fallback - return cached or default response async def api_fallback(*args, **kwargs): return { "status": "degraded", "message": "Service temporarily unavailable", "data": None, } # Database fallback - return empty result async def database_fallback(*args, **kwargs): return [] # AI service fallback - return simple response async def ai_fallback(*args, **kwargs): return { "choices": [ { "message": { "content": "I apologize, but I'm having trouble processing your request right now. Please try again in a moment." } } ] } self.fallback_strategies.update( { "api_fallback": api_fallback, "database_fallback": database_fallback, "ai_fallback": ai_fallback, } ) except Exception as e: logger.error(f"Failed to register default fallbacks: {e}") def _setup_circuit_breakers(self): """Setup circuit breakers for external services""" try: # API services self.circuit_breakers["openai_api"] = CircuitBreaker( "openai_api", CircuitBreakerConfig(failure_threshold=3, recovery_timeout=30.0), ) self.circuit_breakers["anthropic_api"] = CircuitBreaker( "anthropic_api", CircuitBreakerConfig(failure_threshold=3, recovery_timeout=30.0), ) # Database self.circuit_breakers["database"] = CircuitBreaker( "database", CircuitBreakerConfig(failure_threshold=5, recovery_timeout=60.0), ) # External APIs self.circuit_breakers["discord_api"] = CircuitBreaker( "discord_api", CircuitBreakerConfig(failure_threshold=10, recovery_timeout=120.0), ) except Exception as e: logger.error(f"Failed to setup circuit breakers: {e}") def get_error_stats(self) -> Dict[str, Any]: """Get error handling statistics""" try: # Recent errors (last hour) recent_cutoff = datetime.now(timezone.utc) - timedelta(hours=1) recent_errors = [ e for e in self.error_history if e.timestamp and e.timestamp > recent_cutoff ] # Error distribution by category category_counts = {} for error in recent_errors: category = error.category.value category_counts[category] = category_counts.get(category, 0) + 1 # Error distribution by severity severity_counts = {} for error in recent_errors: severity = error.severity.value severity_counts[severity] = severity_counts.get(severity, 0) + 1 # Circuit breaker states circuit_states = {} for name, cb in self.circuit_breakers.items(): circuit_states[name] = { "state": cb.state.value, "failure_count": cb.failure_count, "last_failure": ( cb.last_failure_time.isoformat() if cb.last_failure_time else None ), } return { "total_errors": self.total_errors, "handled_errors": self.handled_errors, "unhandled_errors": self.unhandled_errors, "recent_errors": len(recent_errors), "error_rate": len(recent_errors) / 60, # errors per minute "category_distribution": category_counts, "severity_distribution": severity_counts, "circuit_breakers": circuit_states, "fallback_strategies": list(self.fallback_strategies.keys()), } except Exception as e: logger.error(f"Failed to get error stats: {e}") return {} async def check_health(self) -> Dict[str, Any]: """Check health of error handling system""" try: # Check circuit breaker states circuit_issues = [] for name, cb in self.circuit_breakers.items(): if cb.state != CircuitState.CLOSED: circuit_issues.append( { "service": name, "state": cb.state.value, "failure_count": cb.failure_count, } ) # Recent error rate recent_cutoff = datetime.now(timezone.utc) - timedelta(minutes=5) recent_errors = [ e for e in self.error_history if e.timestamp and e.timestamp > recent_cutoff ] error_rate = len(recent_errors) / 5 # errors per minute health_status = "healthy" if circuit_issues or error_rate > 5: health_status = "degraded" if len(circuit_issues) > 2 or error_rate > 10: health_status = "unhealthy" return { "status": health_status, "initialized": self._initialized, "total_errors": self.total_errors, "error_rate": error_rate, "circuit_issues": circuit_issues, "fallback_strategies": len(self.fallback_strategies), } except Exception as e: return {"status": "error", "error": str(e)} class CircuitBreaker: """Circuit breaker implementation for failing services""" def __init__(self, name: str, config: CircuitBreakerConfig): self.name = name self.config = config self.state = CircuitState.CLOSED self.failure_count = 0 self.last_failure_time: Optional[datetime] = None self.last_success_time: Optional[datetime] = None async def call(self, func: Callable, *args, **kwargs): """Call function through circuit breaker""" if self.state == CircuitState.OPEN: if self._should_attempt_reset(): self.state = CircuitState.HALF_OPEN logger.info(f"Circuit breaker {self.name} moved to HALF_OPEN") else: raise Exception(f"Circuit breaker {self.name} is OPEN") try: result = await func(*args, **kwargs) self._on_success() return result except Exception: self._on_failure() raise def _should_attempt_reset(self) -> bool: """Check if circuit breaker should attempt reset""" if not self.last_failure_time: return True time_since_failure = datetime.now(timezone.utc) - self.last_failure_time return time_since_failure.total_seconds() >= self.config.recovery_timeout def _on_success(self): """Handle successful call""" self.failure_count = 0 self.last_success_time = datetime.now(timezone.utc) if self.state == CircuitState.HALF_OPEN: self.state = CircuitState.CLOSED logger.info(f"Circuit breaker {self.name} reset to CLOSED") def _on_failure(self): """Handle failed call""" self.failure_count += 1 self.last_failure_time = datetime.now(timezone.utc) if self.failure_count >= self.config.failure_threshold: self.state = CircuitState.OPEN logger.warning( f"Circuit breaker {self.name} opened after {self.failure_count} failures" ) # Global error handler instance - will be initialized in main.py error_handler: Optional[ErrorHandler] = None async def initialize_error_handler() -> ErrorHandler: """Initialize the global error handler instance""" global error_handler if error_handler is None: error_handler = ErrorHandler() await error_handler.initialize() return error_handler def get_error_handler() -> ErrorHandler: """Get the global error handler instance""" if error_handler is None: raise RuntimeError( "Error handler not initialized. Call initialize_error_handler() first." ) return error_handler # Convenience decorators def handle_errors( component: str, operation: Optional[str] = None, severity: ErrorSeverity = ErrorSeverity.MEDIUM, ): """Decorator for automatic error handling""" def decorator(func): @functools.wraps(func) async def wrapper(*args, **kwargs): try: return await func(*args, **kwargs) except Exception as e: handler = get_error_handler() handler.handle_error( e, component=component, operation=operation or func.__name__, severity=severity, ) raise return wrapper return decorator def with_retry(max_attempts: int = 3, base_delay: float = 1.0): """Decorator for retry with exponential backoff""" config = RetryConfig(max_attempts=max_attempts, base_delay=base_delay) handler = get_error_handler() return handler.retry_with_backoff(config) def with_circuit_breaker(service_name: str): """Decorator for circuit breaker pattern""" handler = get_error_handler() return handler.with_circuit_breaker(service_name) def with_fallback(strategy: str): """Decorator for fallback strategy""" handler = get_error_handler() return handler.with_fallback(strategy)