Files
disbord/core/error_handler.py
Travis Vasceannie 3acb779569 chore: remove .env.example and add new files for project structure
- Deleted .env.example file as it is no longer needed.
- Added .gitignore to manage ignored files and directories.
- Introduced CLAUDE.md for AI provider integration documentation.
- Created dev.sh for development setup and scripts.
- Updated Dockerfile and Dockerfile.production for improved build processes.
- Added multiple test files and directories for comprehensive testing.
- Introduced new utility and service files for enhanced functionality.
- Organized codebase with new directories and files for better maintainability.
2025-08-27 23:00:19 -04:00

701 lines
24 KiB
Python

"""
Error Handling System for Discord Voice Chat Quote Bot
Implements comprehensive error handling, API fallbacks, circuit breakers,
retry mechanisms, and resilience patterns for robust operation.
"""
import asyncio
import functools
import json
import logging
import random
import time
import uuid
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from enum import Enum
from typing import Any, Callable, Dict, List, Optional
# Module-level logger named after this module; the handlers below log through it.
logger = logging.getLogger(__name__)
class ErrorSeverity(Enum):
    """How badly an error affects the bot, from least to most severe."""

    # Minor issues, no user impact
    LOW = "low"
    # Some functionality affected
    MEDIUM = "medium"
    # Major functionality impacted
    HIGH = "high"
    # System-wide failures
    CRITICAL = "critical"
class ErrorCategory(Enum):
    """Buckets an error can be classified into; each value is the snake_case name."""

    API_ERROR = "api_error"
    DATABASE_ERROR = "database_error"
    NETWORK_ERROR = "network_error"
    VALIDATION_ERROR = "validation_error"
    AUTHENTICATION_ERROR = "authentication_error"
    PERMISSION_ERROR = "permission_error"
    RESOURCE_ERROR = "resource_error"
    TIMEOUT_ERROR = "timeout_error"
    # Used when no other category applies.
    UNKNOWN_ERROR = "unknown_error"
class CircuitState(Enum):
    """The three states a circuit breaker can be in."""

    # Normal operation
    CLOSED = "closed"
    # Failing, requests blocked
    OPEN = "open"
    # Testing if service recovered
    HALF_OPEN = "half_open"
@dataclass
class ErrorContext:
    """Everything recorded about one handled error.

    ``timestamp`` is filled in with the current UTC time when it is left
    as ``None`` at construction.
    """

    # The exception that was raised, plus its generated identifier.
    error: Exception
    error_id: str
    # Classification of the error.
    severity: ErrorSeverity
    category: ErrorCategory
    # Where the error happened.
    component: str
    operation: str
    # Optional caller-supplied context.
    user_id: Optional[int] = None
    guild_id: Optional[int] = None
    metadata: Optional[Dict[str, Any]] = None
    timestamp: Optional[datetime] = None

    def __post_init__(self):
        # An explicitly supplied timestamp is kept untouched.
        if self.timestamp is not None:
            return
        self.timestamp = datetime.now(timezone.utc)
@dataclass
class RetryConfig:
    """Tunable knobs for retrying a call with exponential backoff."""

    # Number of attempts made before giving up.
    max_attempts: int = 3
    # Delay before the first retry, in seconds.
    base_delay: float = 1.0
    # Upper bound on any single delay, in seconds.
    max_delay: float = 60.0
    # Multiplier applied to the delay for each further attempt.
    exponential_base: float = 2.0
    # Whether each delay is randomly scaled.
    jitter: bool = True
    # Exception types worth retrying; None (or empty) means retry on any Exception.
    retry_on: Optional[List[type]] = None
@dataclass
class CircuitBreakerConfig:
    """Tunable knobs for a single circuit breaker."""

    # Failure count at which the breaker opens.
    failure_threshold: int = 5
    # Seconds to wait after the last failure before a trial call is allowed.
    recovery_timeout: float = 60.0
    # Exception type associated with a failing service.
    expected_exception: type = Exception
class ErrorHandler:
    """
    Central error handling service.

    Provides:
    - Error classification, recording and severity-based logging
    - A retry decorator with exponential backoff
    - A circuit breaker decorator backed by a per-service registry
    - A fallback decorator backed by named fallback strategies
    - User-friendly error messages
    - Statistics and a health summary
    """

    # Ordered (category, keywords) pairs used by _classify_error; the first
    # pair with a matching keyword wins, so the order is significant.
    _CATEGORY_KEYWORDS = (
        (ErrorCategory.NETWORK_ERROR, ("connection", "network")),
        (ErrorCategory.DATABASE_ERROR, ("database", "sql")),
        (ErrorCategory.API_ERROR, ("api", "http")),
        (ErrorCategory.TIMEOUT_ERROR, ("timeout", "timed out")),
        (ErrorCategory.PERMISSION_ERROR, ("permission", "forbidden")),
        (ErrorCategory.AUTHENTICATION_ERROR, ("authentication", "unauthorized")),
        (ErrorCategory.VALIDATION_ERROR, ("validation", "invalid")),
        (ErrorCategory.RESOURCE_ERROR, ("memory", "resource")),
    )

    def __init__(self):
        # Error tracking
        self.error_counts: Dict[str, int] = {}
        self.error_history: List[ErrorContext] = []
        self.circuit_breakers: Dict[str, "CircuitBreaker"] = {}
        # Configuration
        self.max_error_history = 1000
        # NOTE(review): the next two settings are not read anywhere in this class.
        self.error_aggregation_window = timedelta(minutes=5)
        self.alert_threshold = 10  # errors per window
        # Fallback strategies, keyed by strategy name
        self.fallback_strategies: Dict[str, Callable] = {}
        # Statistics
        self.total_errors = 0
        self.handled_errors = 0
        self.unhandled_errors = 0
        self._initialized = False

    async def initialize(self):
        """Register default fallbacks and circuit breakers (idempotent).

        Raises:
            Exception: whatever the setup steps raise, after logging it.
        """
        if self._initialized:
            return
        try:
            logger.info("Initializing error handling system...")
            # Register default fallback strategies
            self._register_default_fallbacks()
            # Setup circuit breakers for external services
            self._setup_circuit_breakers()
            self._initialized = True
            logger.info("Error handling system initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize error handling system: {e}")
            raise

    def handle_error(
        self,
        error: Exception,
        component: str,
        operation: str,
        severity: ErrorSeverity = ErrorSeverity.MEDIUM,
        user_id: Optional[int] = None,
        guild_id: Optional[int] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> ErrorContext:
        """Classify, record and log an error, returning its context.

        Args:
            error: The exception being handled.
            component: Name of the component where the error occurred.
            operation: Name of the operation that failed.
            severity: Severity used to pick the log level.
            user_id: Optional user identifier stored on the context.
            guild_id: Optional guild identifier stored on the context.
            metadata: Optional extra data; stored as ``{}`` when falsy.

        Returns:
            The ErrorContext created for this error.

        Raises:
            Exception: re-raised if the handling itself fails.
        """
        try:
            # The timestamp alone repeats for errors within the same second,
            # so a random suffix keeps IDs distinct for support lookups.
            error_id = (
                f"{component}_{operation}_{int(time.time())}_{uuid.uuid4().hex[:8]}"
            )
            # Classify error
            category = self._classify_error(error)
            # Create error context
            error_context = ErrorContext(
                error=error,
                error_id=error_id,
                severity=severity,
                category=category,
                component=component,
                operation=operation,
                user_id=user_id,
                guild_id=guild_id,
                metadata=metadata or {},
            )
            # Record error
            self._record_error(error_context)
            # Log error with appropriate level
            self._log_error(error_context)
            # Update statistics
            self.total_errors += 1
            self.handled_errors += 1
            return error_context
        except Exception as handling_error:
            logger.critical(f"Error in error handler: {handling_error}")
            # Count the error in the total too, so that
            # total_errors == handled_errors + unhandled_errors.
            self.total_errors += 1
            self.unhandled_errors += 1
            raise

    def retry_with_backoff(self, config: Optional[RetryConfig] = None):
        """Decorator factory: retry an async callable with exponential backoff.

        The wrapped coroutine is always attempted at least once, even when
        ``config.max_attempts`` is zero or negative. Exceptions not listed in
        ``config.retry_on`` (when it is non-empty) propagate immediately.
        When every attempt fails, the last exception is reported through
        handle_error with HIGH severity and re-raised.

        Args:
            config: Retry settings; defaults to ``RetryConfig()``.

        Returns:
            A decorator for async functions.
        """
        if config is None:
            config = RetryConfig()

        def decorator(func):
            @functools.wraps(func)
            async def wrapper(*args, **kwargs):
                # Clamp so the loop body runs at least once and
                # last_exception is always set when the loop is exhausted.
                max_attempts = max(1, config.max_attempts)
                retryable = tuple(config.retry_on) if config.retry_on else None
                last_exception = None
                for attempt in range(max_attempts):
                    try:
                        return await func(*args, **kwargs)
                    except Exception as e:
                        last_exception = e
                        # Check if we should retry this exception
                        if retryable is not None and not isinstance(e, retryable):
                            raise
                        # Don't sleep after the last attempt
                        if attempt == max_attempts - 1:
                            break
                        # Exponential delay, capped at max_delay
                        delay = min(
                            config.base_delay * (config.exponential_base**attempt),
                            config.max_delay,
                        )
                        # Jitter scales the delay into [50%, 100%) of its value
                        if config.jitter:
                            delay *= 0.5 + random.random() * 0.5
                        logger.warning(
                            f"Retry attempt {attempt + 1}/{max_attempts} for {func.__name__} after {delay:.2f}s: {e}"
                        )
                        await asyncio.sleep(delay)
                # All retries exhausted
                self.handle_error(
                    last_exception,
                    component=func.__module__ or "unknown",
                    operation=func.__name__,
                    severity=ErrorSeverity.HIGH,
                )
                raise last_exception

            return wrapper

        return decorator

    def with_circuit_breaker(
        self, service_name: str, config: Optional[CircuitBreakerConfig] = None
    ):
        """Decorator factory: route an async callable through a circuit breaker.

        A breaker is created and registered for ``service_name`` if none
        exists yet; when one already exists, it is reused and ``config`` is
        ignored.

        Args:
            service_name: Registry key of the breaker.
            config: Settings for a newly created breaker.

        Returns:
            A decorator for async functions.
        """
        if config is None:
            config = CircuitBreakerConfig()
        if service_name not in self.circuit_breakers:
            self.circuit_breakers[service_name] = CircuitBreaker(service_name, config)
        circuit_breaker = self.circuit_breakers[service_name]

        def decorator(func):
            @functools.wraps(func)
            async def wrapper(*args, **kwargs):
                return await circuit_breaker.call(func, *args, **kwargs)

            return wrapper

        return decorator

    def with_fallback(self, fallback_strategy: str):
        """Decorator factory: run a named fallback when the async callable fails.

        The failure is reported through handle_error with MEDIUM severity.
        If the strategy is registered, it is awaited with the same arguments
        and its result returned. If it is missing or fails itself, the
        original exception is re-raised.

        Args:
            fallback_strategy: Key into ``self.fallback_strategies``.

        Returns:
            A decorator for async functions.
        """

        def decorator(func):
            @functools.wraps(func)
            async def wrapper(*args, **kwargs):
                try:
                    return await func(*args, **kwargs)
                except Exception as e:
                    # Handle error
                    self.handle_error(
                        e,
                        component=func.__module__ or "unknown",
                        operation=func.__name__,
                        severity=ErrorSeverity.MEDIUM,
                    )
                    # Try fallback
                    if fallback_strategy in self.fallback_strategies:
                        try:
                            fallback_func = self.fallback_strategies[fallback_strategy]
                            logger.info(
                                f"Applying fallback strategy '{fallback_strategy}' for {func.__name__}"
                            )
                            return await fallback_func(*args, **kwargs)
                        except Exception as fallback_error:
                            logger.error(
                                f"Fallback strategy '{fallback_strategy}' failed: {fallback_error}"
                            )
                    # Re-raise original error if no fallback or fallback failed
                    raise

            return wrapper

        return decorator

    def get_user_friendly_message(self, error_context: ErrorContext) -> str:
        """Return a message suitable for showing to an end user.

        The text is chosen by error category. For HIGH and CRITICAL
        severities the error ID is appended so it can be quoted to support.
        A generic message is returned if anything goes wrong.
        """
        try:
            category_messages = {
                ErrorCategory.API_ERROR: "There was a problem connecting to external services. Please try again in a moment.",
                ErrorCategory.DATABASE_ERROR: "There was a temporary database issue. Your data is safe, please try again.",
                ErrorCategory.NETWORK_ERROR: "Network connection issue detected. Please check your connection and try again.",
                ErrorCategory.VALIDATION_ERROR: "The information provided is not valid. Please check your input and try again.",
                ErrorCategory.AUTHENTICATION_ERROR: "Authentication failed. Please check your permissions.",
                ErrorCategory.PERMISSION_ERROR: "You don't have permission to perform this action.",
                ErrorCategory.RESOURCE_ERROR: "System resources are temporarily unavailable. Please try again later.",
                ErrorCategory.TIMEOUT_ERROR: "The operation took too long to complete. Please try again.",
                ErrorCategory.UNKNOWN_ERROR: "An unexpected error occurred. Our team has been notified.",
            }
            base_message = category_messages.get(
                error_context.category, "An error occurred. Please try again."
            )
            # Add error ID for support
            if error_context.severity in [ErrorSeverity.HIGH, ErrorSeverity.CRITICAL]:
                base_message += f" (Error ID: {error_context.error_id})"
            return base_message
        except Exception as e:
            logger.error(f"Error generating user-friendly message: {e}")
            return "An unexpected error occurred. Please try again."

    def _classify_error(self, error: Exception) -> ErrorCategory:
        """Classify an error by keyword, using its message and then its type name.

        The lower-cased message is searched first. Only when it matches no
        keyword is the lower-cased exception class name searched, so errors
        with an empty message (for example a bare ``TimeoutError()``) are
        still categorized. Matching is by substring.

        Returns:
            The first matching category, or UNKNOWN_ERROR if nothing matches
            or classification itself fails.
        """
        try:
            candidates = (str(error).lower(), type(error).__name__.lower())
            for text in candidates:
                for category, keywords in self._CATEGORY_KEYWORDS:
                    if any(keyword in text for keyword in keywords):
                        return category
            return ErrorCategory.UNKNOWN_ERROR
        except Exception:
            return ErrorCategory.UNKNOWN_ERROR

    def _record_error(self, error_context: ErrorContext):
        """Append the error to the bounded history and bump its counter.

        Counters are keyed by ``"<component>_<category value>"``. Failures
        are logged and swallowed so recording never masks the original error.
        """
        try:
            # Add to history
            self.error_history.append(error_context)
            # Drop the oldest entries beyond the limit. Computing the overflow
            # explicitly also handles a limit of 0, where a [-0:] slice would
            # keep the whole list.
            overflow = len(self.error_history) - self.max_error_history
            if overflow > 0:
                del self.error_history[:overflow]
            # Update counts
            key = f"{error_context.component}_{error_context.category.value}"
            self.error_counts[key] = self.error_counts.get(key, 0) + 1
        except Exception as e:
            logger.error(f"Failed to record error: {e}")

    def _log_error(self, error_context: ErrorContext):
        """Log the error at a level matching its severity.

        CRITICAL and HIGH include the exception traceback; MEDIUM logs a
        warning; anything else logs at info level. Failures are logged and
        swallowed.
        """
        try:
            log_message = f"[{error_context.error_id}] {error_context.component}.{error_context.operation}: {error_context.error}"
            if error_context.metadata:
                # Metadata is caller-supplied and may not be JSON-serializable;
                # degrade to str()/repr() rather than lose the whole log entry.
                try:
                    metadata_text = json.dumps(error_context.metadata, default=str)
                except (TypeError, ValueError):
                    metadata_text = repr(error_context.metadata)
                log_message += f" | Metadata: {metadata_text}"
            if error_context.severity == ErrorSeverity.CRITICAL:
                logger.critical(log_message, exc_info=error_context.error)
            elif error_context.severity == ErrorSeverity.HIGH:
                logger.error(log_message, exc_info=error_context.error)
            elif error_context.severity == ErrorSeverity.MEDIUM:
                logger.warning(log_message)
            else:
                logger.info(log_message)
        except Exception as e:
            logger.error(f"Failed to log error: {e}")

    def _register_default_fallbacks(self):
        """Register the built-in ``api``, ``database`` and ``ai`` fallbacks.

        Each fallback ignores its arguments and returns a fixed placeholder
        value. Failures are logged and swallowed.
        """
        try:
            # API fallback - fixed "degraded" response
            async def api_fallback(*args, **kwargs):
                return {
                    "status": "degraded",
                    "message": "Service temporarily unavailable",
                    "data": None,
                }

            # Database fallback - return empty result
            async def database_fallback(*args, **kwargs):
                return []

            # AI service fallback - return simple response
            async def ai_fallback(*args, **kwargs):
                return {
                    "choices": [
                        {
                            "message": {
                                "content": "I apologize, but I'm having trouble processing your request right now. Please try again in a moment."
                            }
                        }
                    ]
                }

            self.fallback_strategies.update(
                {
                    "api_fallback": api_fallback,
                    "database_fallback": database_fallback,
                    "ai_fallback": ai_fallback,
                }
            )
        except Exception as e:
            logger.error(f"Failed to register default fallbacks: {e}")

    def _setup_circuit_breakers(self):
        """Register the default breakers for external services.

        A breaker already registered under one of these names (for example
        through with_circuit_breaker before initialize ran) is kept:
        replacing it would leave decorated functions bound to an object that
        statistics and health checks can no longer see. Failures are logged
        and swallowed.
        """
        try:
            default_configs = {
                # API services
                "openai_api": CircuitBreakerConfig(
                    failure_threshold=3, recovery_timeout=30.0
                ),
                "anthropic_api": CircuitBreakerConfig(
                    failure_threshold=3, recovery_timeout=30.0
                ),
                # Database
                "database": CircuitBreakerConfig(
                    failure_threshold=5, recovery_timeout=60.0
                ),
                # External APIs
                "discord_api": CircuitBreakerConfig(
                    failure_threshold=10, recovery_timeout=120.0
                ),
            }
            for service_name, breaker_config in default_configs.items():
                if service_name not in self.circuit_breakers:
                    self.circuit_breakers[service_name] = CircuitBreaker(
                        service_name, breaker_config
                    )
        except Exception as e:
            logger.error(f"Failed to setup circuit breakers: {e}")

    def get_error_stats(self) -> Dict[str, Any]:
        """Return counters plus a breakdown of errors from the last hour.

        Returns:
            A dict of totals, recent-error count and per-minute rate,
            category and severity distributions, circuit breaker states and
            registered fallback names; ``{}`` if gathering the data fails.
        """
        try:
            # Recent errors (last hour)
            recent_cutoff = datetime.now(timezone.utc) - timedelta(hours=1)
            recent_errors = [
                e
                for e in self.error_history
                if e.timestamp and e.timestamp > recent_cutoff
            ]
            # Error distribution by category and by severity
            category_counts: Dict[str, int] = {}
            severity_counts: Dict[str, int] = {}
            for entry in recent_errors:
                category = entry.category.value
                category_counts[category] = category_counts.get(category, 0) + 1
                severity = entry.severity.value
                severity_counts[severity] = severity_counts.get(severity, 0) + 1
            # Circuit breaker states
            circuit_states = {
                name: {
                    "state": cb.state.value,
                    "failure_count": cb.failure_count,
                    "last_failure": (
                        cb.last_failure_time.isoformat()
                        if cb.last_failure_time
                        else None
                    ),
                }
                for name, cb in self.circuit_breakers.items()
            }
            return {
                "total_errors": self.total_errors,
                "handled_errors": self.handled_errors,
                "unhandled_errors": self.unhandled_errors,
                "recent_errors": len(recent_errors),
                "error_rate": len(recent_errors) / 60,  # errors per minute
                "category_distribution": category_counts,
                "severity_distribution": severity_counts,
                "circuit_breakers": circuit_states,
                "fallback_strategies": list(self.fallback_strategies.keys()),
            }
        except Exception as e:
            logger.error(f"Failed to get error stats: {e}")
            return {}

    async def check_health(self) -> Dict[str, Any]:
        """Summarize health from breaker states and the 5-minute error rate.

        Status is ``"degraded"`` when any breaker is not CLOSED or the rate
        exceeds 5 errors/minute, and ``"unhealthy"`` when more than two
        breakers are not CLOSED or the rate exceeds 10 errors/minute.

        Returns:
            A status dict, or ``{"status": "error", "error": ...}`` if the
            check itself fails.
        """
        try:
            # Breakers that are not in normal operation
            circuit_issues = [
                {
                    "service": name,
                    "state": cb.state.value,
                    "failure_count": cb.failure_count,
                }
                for name, cb in self.circuit_breakers.items()
                if cb.state != CircuitState.CLOSED
            ]
            # Recent error rate
            recent_cutoff = datetime.now(timezone.utc) - timedelta(minutes=5)
            recent_errors = [
                e
                for e in self.error_history
                if e.timestamp and e.timestamp > recent_cutoff
            ]
            error_rate = len(recent_errors) / 5  # errors per minute
            health_status = "healthy"
            if circuit_issues or error_rate > 5:
                health_status = "degraded"
            if len(circuit_issues) > 2 or error_rate > 10:
                health_status = "unhealthy"
            return {
                "status": health_status,
                "initialized": self._initialized,
                "total_errors": self.total_errors,
                "error_rate": error_rate,
                "circuit_issues": circuit_issues,
                "fallback_strategies": len(self.fallback_strategies),
            }
        except Exception as e:
            return {"status": "error", "error": str(e)}
class CircuitBreakerOpenError(Exception):
    """Raised by CircuitBreaker.call when the circuit is OPEN and the call is rejected."""


class CircuitBreaker:
    """Circuit breaker for a single service.

    State machine: CLOSED moves to OPEN once ``failure_threshold`` failures
    have accumulated. OPEN moves to HALF_OPEN once ``recovery_timeout``
    seconds have passed since the last failure. HALF_OPEN moves to CLOSED on
    the next success and back to OPEN on the next failure.
    """

    def __init__(self, name: str, config: CircuitBreakerConfig):
        self.name = name
        self.config = config
        self.state = CircuitState.CLOSED
        # Failures since the last success
        self.failure_count = 0
        self.last_failure_time: Optional[datetime] = None
        self.last_success_time: Optional[datetime] = None

    async def call(self, func: Callable, *args, **kwargs):
        """Await ``func(*args, **kwargs)`` through the breaker.

        Only exceptions matching ``config.expected_exception`` count as
        failures of the protected service. Any other exception propagates
        without changing breaker state.

        Returns:
            Whatever ``func`` returns.

        Raises:
            CircuitBreakerOpenError: the circuit is OPEN and the recovery
                timeout has not elapsed; ``func`` is not called.
            Exception: anything raised by ``func`` is re-raised.
        """
        if self.state == CircuitState.OPEN:
            if self._should_attempt_reset():
                self.state = CircuitState.HALF_OPEN
                logger.info(f"Circuit breaker {self.name} moved to HALF_OPEN")
            else:
                # A dedicated Exception subclass lets callers tell a rejected
                # call apart from a real service failure.
                raise CircuitBreakerOpenError(f"Circuit breaker {self.name} is OPEN")
        try:
            result = await func(*args, **kwargs)
        except self.config.expected_exception:
            self._on_failure()
            raise
        self._on_success()
        return result

    def _should_attempt_reset(self) -> bool:
        """Return True once ``recovery_timeout`` seconds have passed since the last failure."""
        if not self.last_failure_time:
            return True
        time_since_failure = datetime.now(timezone.utc) - self.last_failure_time
        return time_since_failure.total_seconds() >= self.config.recovery_timeout

    def _on_success(self):
        """Reset the failure count and close the circuit if it was HALF_OPEN."""
        self.failure_count = 0
        self.last_success_time = datetime.now(timezone.utc)
        if self.state == CircuitState.HALF_OPEN:
            self.state = CircuitState.CLOSED
            logger.info(f"Circuit breaker {self.name} reset to CLOSED")

    def _on_failure(self):
        """Record a failure and open the circuit when warranted."""
        self.failure_count += 1
        self.last_failure_time = datetime.now(timezone.utc)
        # A failed trial call in HALF_OPEN reopens immediately, independent of
        # the counter; otherwise the threshold decides.
        if (
            self.state == CircuitState.HALF_OPEN
            or self.failure_count >= self.config.failure_threshold
        ):
            self.state = CircuitState.OPEN
            logger.warning(
                f"Circuit breaker {self.name} opened after {self.failure_count} failures"
            )
# Global error handler instance - will be initialized in main.py.
# Stays None until initialize_error_handler() has been awaited;
# get_error_handler() raises RuntimeError while it is still None.
error_handler: Optional[ErrorHandler] = None
async def initialize_error_handler() -> ErrorHandler:
    """Create and initialize the global error handler on first use.

    The global is only assigned after ``initialize()`` has succeeded, so a
    failed initialization leaves it as ``None`` and a later call retries
    instead of returning a half-initialized handler.

    Returns:
        The global ErrorHandler instance.

    Raises:
        Exception: whatever ``ErrorHandler.initialize`` raises.
    """
    global error_handler
    if error_handler is None:
        candidate = ErrorHandler()
        await candidate.initialize()
        # Publish only a fully initialized handler.
        error_handler = candidate
    return error_handler
def get_error_handler() -> ErrorHandler:
    """Return the global error handler.

    Raises:
        RuntimeError: if the global handler has not been set yet.
    """
    handler = error_handler
    if handler is not None:
        return handler
    raise RuntimeError(
        "Error handler not initialized. Call initialize_error_handler() first."
    )
# Convenience decorators
def handle_errors(
    component: str,
    operation: Optional[str] = None,
    severity: ErrorSeverity = ErrorSeverity.MEDIUM,
):
    """Decorator factory: report exceptions from an async function, then re-raise.

    Args:
        component: Component name passed to the global handler.
        operation: Operation name; falls back to the wrapped function's name.
        severity: Severity passed to the global handler.
    """

    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                result = await func(*args, **kwargs)
            except Exception as exc:
                # The global handler is looked up only when an error occurs.
                get_error_handler().handle_error(
                    exc,
                    component=component,
                    operation=operation or func.__name__,
                    severity=severity,
                )
                raise
            return result

        return wrapper

    return decorator
def with_retry(max_attempts: int = 3, base_delay: float = 1.0):
    """Decorator factory: retry an async function with exponential backoff.

    The global error handler is resolved on each call rather than at
    decoration time, so the decorator can be applied at import time, before
    the global handler has been initialized.

    Args:
        max_attempts: Number of attempts before giving up.
        base_delay: Delay before the first retry, in seconds.

    Raises:
        RuntimeError: when the decorated function is called while the global
            handler is still uninitialized.
    """
    config = RetryConfig(max_attempts=max_attempts, base_delay=base_delay)

    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            retrying = get_error_handler().retry_with_backoff(config)(func)
            return await retrying(*args, **kwargs)

        return wrapper

    return decorator
def with_circuit_breaker(service_name: str):
    """Decorator factory: route an async function through a named circuit breaker.

    The global error handler is resolved on each call rather than at
    decoration time, so the decorator can be applied at import time, before
    the global handler has been initialized. Breaker state persists across
    calls because the breaker lives in the handler's registry.

    Args:
        service_name: Registry key of the breaker to use.

    Raises:
        RuntimeError: when the decorated function is called while the global
            handler is still uninitialized.
    """

    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            guarded = get_error_handler().with_circuit_breaker(service_name)(func)
            return await guarded(*args, **kwargs)

        return wrapper

    return decorator
def with_fallback(strategy: str):
    """Decorator factory: apply a named fallback strategy when an async function fails.

    The global error handler is resolved on each call rather than at
    decoration time, so the decorator can be applied at import time, before
    the global handler has been initialized.

    Args:
        strategy: Name of the fallback strategy registered on the handler.

    Raises:
        RuntimeError: when the decorated function is called while the global
            handler is still uninitialized.
    """

    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            protected = get_error_handler().with_fallback(strategy)(func)
            return await protected(*args, **kwargs)

        return wrapper

    return decorator