Files
disbord/services/monitoring/health_monitor.py
Travis Vasceannie 3acb779569 chore: remove .env.example and add new files for project structure
- Deleted .env.example file as it is no longer needed.
- Added .gitignore to manage ignored files and directories.
- Introduced CLAUDE.md for AI provider integration documentation.
- Created dev.sh for development setup and scripts.
- Updated Dockerfile and Dockerfile.production for improved build processes.
- Added multiple test files and directories for comprehensive testing.
- Introduced new utility and service files for enhanced functionality.
- Organized codebase with new directories and files for better maintainability.
2025-08-27 23:00:19 -04:00

760 lines
27 KiB
Python

"""
Health Monitoring System for Discord Voice Chat Quote Bot
Implements comprehensive system monitoring with Prometheus metrics,
health checks, and performance tracking for all bot components.
"""
import asyncio
import json
import logging
import time
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta, timezone
from enum import Enum
from typing import Callable, Optional
import psutil
try:
from prometheus_client import (CollectorRegistry, Counter, Gauge,
Histogram, generate_latest)
PROMETHEUS_AVAILABLE = True
except ImportError:
# Fallback for environments without prometheus_client
PROMETHEUS_AVAILABLE = False
Counter = Histogram = Gauge = CollectorRegistry = None
from core.database import DatabaseManager
logger = logging.getLogger(__name__)
class HealthStatus(Enum):
    """Health status levels, ordered from best to worst."""
    HEALTHY = "healthy"    # component operating normally
    WARNING = "warning"    # degraded but still functional
    CRITICAL = "critical"  # failing; needs immediate attention
    DOWN = "down"          # component unavailable
class MetricType(Enum):
    """Kinds of Prometheus-style metrics this module can track."""
    COUNTER = "counter"      # monotonically increasing count
    HISTOGRAM = "histogram"  # distribution of observed values
    GAUGE = "gauge"          # value that can go up or down
@dataclass
class HealthCheckResult:
    """Result of a single component health check."""
    component: str        # name of the checked component (e.g. "database", "system")
    status: HealthStatus  # outcome severity
    message: str          # human-readable summary of the check outcome
    response_time: float  # seconds the check took to run
    metadata: dict[str, str | float | int]  # check-specific details (e.g. query_time, error)
    timestamp: datetime   # when the check completed (UTC in this module)
@dataclass
class SystemMetrics:
    """Point-in-time snapshot of host resource usage."""
    cpu_usage: float          # CPU utilisation percent
    memory_usage: float       # virtual-memory usage percent
    disk_usage: float         # root filesystem usage percent
    network_connections: int  # open network connections (0 if access was denied)
    uptime: float             # seconds since system boot
    timestamp: datetime       # collection time (UTC)
@dataclass
class ComponentMetrics:
    """Aggregated performance metrics for one bot component."""
    component_name: str      # component identifier
    requests_total: int      # cumulative requests handled
    errors_total: int        # cumulative errors raised
    response_time_avg: float  # average response time in seconds
    active_connections: int  # currently open connections
    # Modernized from Optional[str] to match the `X | None` style used elsewhere in this file.
    last_error: str | None   # most recent error message, if any
    uptime: float            # component uptime in seconds
class HealthMonitor:
    """
    Comprehensive health monitoring system
    Features:
    - Prometheus metrics collection and export
    - Component health checks with automatic recovery
    - System resource monitoring (CPU, memory, disk)
    - Performance metrics tracking
    - Alert thresholds and notifications
    - Health dashboard generation
    - Automatic metric cleanup and rotation
    - Integration with Discord notifications
    """
    def __init__(self, db_manager: DatabaseManager):
        """Create the monitor; call ``initialize()`` to start background workers."""
        self.db_manager = db_manager
        # Prometheus setup — registry stays None when prometheus_client is absent.
        self.registry = (
            CollectorRegistry() if PROMETHEUS_AVAILABLE and CollectorRegistry else None
        )
        self.metrics = {}  # metric key -> Prometheus metric object
        # Health check components
        self.health_checks: dict[str, Callable] = {}  # component -> async check callable
        self.health_results: dict[str, HealthCheckResult] = {}  # component -> latest result
        # Performance tracking
        self.system_metrics_history: list[SystemMetrics] = []
        self.component_metrics: dict[str, ComponentMetrics] = {}
        # Configuration
        self.check_interval = 30  # seconds between health-check sweeps
        self.metrics_retention_hours = 24  # in-memory system-metrics window
        # Thresholds consulted by the system resource check (cpu/memory/disk);
        # error_rate and response_time are not referenced in this file — TODO confirm usage.
        self.alert_thresholds = {
            "cpu_usage": 80.0,
            "memory_usage": 85.0,
            "disk_usage": 90.0,
            "error_rate": 5.0,
            "response_time": 5.0,
        }
        # Background tasks (created in initialize(), cancelled in close()).
        self._health_check_task = None
        self._metrics_collection_task = None
        self._cleanup_task = None
        # Statistics
        self.total_checks = 0
        self.failed_checks = 0
        self.alerts_sent = 0
        self._initialized = False
        # Initialize Prometheus metrics if available
        if PROMETHEUS_AVAILABLE:
            self._setup_prometheus_metrics()
    async def initialize(self):
        """Initialize the health monitoring system.

        Creates the database tables, registers the default health checks,
        then starts the three background workers. Idempotent: returns
        immediately if already initialized. Logs and re-raises on failure.
        """
        if self._initialized:
            return
        try:
            logger.info("Initializing health monitoring system...")
            # Setup database tables
            await self._setup_monitoring_tables()
            # Register default health checks
            await self._register_default_health_checks()
            # Start background tasks
            self._health_check_task = asyncio.create_task(self._health_check_worker())
            self._metrics_collection_task = asyncio.create_task(
                self._metrics_collection_worker()
            )
            self._cleanup_task = asyncio.create_task(self._cleanup_worker())
            self._initialized = True
            logger.info("Health monitoring system initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize health monitoring: {e}")
            raise
def _setup_prometheus_metrics(self):
"""Setup Prometheus metrics"""
if not PROMETHEUS_AVAILABLE or not Gauge or not self.registry:
return
try:
# System metrics
self.metrics["cpu_usage"] = Gauge(
"bot_cpu_usage_percent", "CPU usage percentage", registry=self.registry
)
self.metrics["memory_usage"] = Gauge(
"bot_memory_usage_percent",
"Memory usage percentage",
registry=self.registry,
)
self.metrics["disk_usage"] = Gauge(
"bot_disk_usage_percent",
"Disk usage percentage",
registry=self.registry,
)
# Component metrics
self.metrics["requests_total"] = Counter(
"bot_requests_total",
"Total number of requests",
["component"],
registry=self.registry,
)
self.metrics["errors_total"] = Counter(
"bot_errors_total",
"Total number of errors",
["component", "error_type"],
registry=self.registry,
)
self.metrics["response_time"] = Histogram(
"bot_response_time_seconds",
"Response time in seconds",
["component"],
registry=self.registry,
)
self.metrics["health_status"] = Gauge(
"bot_component_health",
"Component health status (1=healthy, 0=unhealthy)",
["component"],
registry=self.registry,
)
# Bot-specific metrics
self.metrics["quotes_processed"] = Counter(
"bot_quotes_processed_total",
"Total quotes processed",
registry=self.registry,
)
self.metrics["users_active"] = Gauge(
"bot_users_active", "Number of active users", registry=self.registry
)
self.metrics["voice_sessions"] = Gauge(
"bot_voice_sessions_active",
"Number of active voice sessions",
registry=self.registry,
)
logger.info("Prometheus metrics initialized")
except Exception as e:
logger.error(f"Failed to setup Prometheus metrics: {e}")
    async def register_health_check(self, component: str, check_func: Callable):
        """Register *check_func* as the health check for *component*.

        *check_func* is an async callable returning a HealthCheckResult;
        registering the same component again replaces the previous check.
        Errors are logged, never raised.
        """
        try:
            self.health_checks[component] = check_func
            logger.info(f"Registered health check for component: {component}")
        except Exception as e:
            logger.error(f"Failed to register health check for {component}: {e}")
async def record_metric(
self, metric_name: str, value: float, labels: dict[str, str] | None = None
):
"""Record a metric value"""
try:
if not PROMETHEUS_AVAILABLE or metric_name not in self.metrics:
return
metric = self.metrics[metric_name]
if labels:
if hasattr(metric, "labels"):
metric.labels(**labels).set(value)
else:
# For metrics without labels
metric.set(value)
else:
metric.set(value)
except Exception as e:
logger.error(f"Failed to record metric {metric_name}: {e}")
async def increment_counter(
self,
metric_name: str,
labels: dict[str, str] | None = None,
amount: float = 1.0,
):
"""Increment a counter metric"""
try:
if not PROMETHEUS_AVAILABLE or metric_name not in self.metrics:
return
metric = self.metrics[metric_name]
if labels and hasattr(metric, "labels"):
metric.labels(**labels).inc(amount)
else:
metric.inc(amount)
except Exception as e:
logger.error(f"Failed to increment counter {metric_name}: {e}")
async def observe_histogram(
self, metric_name: str, value: float, labels: dict[str, str] | None = None
):
"""Observe a value in a histogram metric"""
try:
if not PROMETHEUS_AVAILABLE or metric_name not in self.metrics:
return
metric = self.metrics[metric_name]
if labels and hasattr(metric, "labels"):
metric.labels(**labels).observe(value)
else:
metric.observe(value)
except Exception as e:
logger.error(f"Failed to observe histogram {metric_name}: {e}")
async def get_health_status(self) -> dict[str, str | dict | float | int]:
"""Get overall system health status"""
try:
overall_status = HealthStatus.HEALTHY
component_statuses = {}
# Check each component
for component, result in self.health_results.items():
component_statuses[component] = {
"status": result.status.value,
"message": result.message,
"response_time": result.response_time,
"last_check": result.timestamp.isoformat(),
}
# Determine overall status
if result.status == HealthStatus.CRITICAL:
overall_status = HealthStatus.CRITICAL
elif (
result.status == HealthStatus.WARNING
and overall_status == HealthStatus.HEALTHY
):
overall_status = HealthStatus.WARNING
# Get system metrics
system_metrics = await self._collect_system_metrics()
return {
"overall_status": overall_status.value,
"components": component_statuses,
"system_metrics": asdict(system_metrics) if system_metrics else {},
"uptime": time.time() - psutil.boot_time(),
"total_checks": self.total_checks,
"failed_checks": self.failed_checks,
"success_rate": (1 - self.failed_checks / max(self.total_checks, 1))
* 100,
}
except Exception as e:
logger.error(f"Failed to get health status: {e}")
return {"overall_status": HealthStatus.CRITICAL.value, "error": str(e)}
async def get_metrics_export(self) -> str:
"""Get Prometheus metrics export"""
try:
if not PROMETHEUS_AVAILABLE or not self.registry:
return "# Prometheus not available\n"
return generate_latest(self.registry).decode("utf-8")
except Exception as e:
logger.error(f"Failed to export metrics: {e}")
return f"# Error exporting metrics: {e}\n"
    async def _register_default_health_checks(self):
        """Register the built-in health checks: "database" and "system".

        Each check is an async closure that always returns a
        HealthCheckResult — failures are reported as CRITICAL results
        rather than raised to the caller.
        """
        try:
            # Database health check: run a trivial query and time it.
            async def database_check():
                start_time = time.time()
                try:
                    await self.db_manager.execute_query("SELECT 1", fetch_one=True)
                    response_time = time.time() - start_time
                    # Slow (>2s) but successful queries downgrade to WARNING.
                    if response_time > 2.0:
                        return HealthCheckResult(
                            component="database",
                            status=HealthStatus.WARNING,
                            message=f"Database responding slowly ({response_time:.2f}s)",
                            response_time=response_time,
                            metadata={"query_time": response_time},
                            timestamp=datetime.now(timezone.utc),
                        )
                    return HealthCheckResult(
                        component="database",
                        status=HealthStatus.HEALTHY,
                        message="Database is responding normally",
                        response_time=response_time,
                        metadata={"query_time": response_time},
                        timestamp=datetime.now(timezone.utc),
                    )
                except Exception as e:
                    response_time = time.time() - start_time
                    return HealthCheckResult(
                        component="database",
                        status=HealthStatus.CRITICAL,
                        message=f"Database connection failed: {str(e)}",
                        response_time=response_time,
                        metadata={"error": str(e)},
                        timestamp=datetime.now(timezone.utc),
                    )
            # System resources check: CPU, memory and disk against thresholds.
            async def system_check():
                start_time = time.time()
                try:
                    # Use non-blocking CPU measurement to avoid conflicts
                    cpu_percent = psutil.cpu_percent(interval=None)
                    if (
                        cpu_percent == 0.0
                    ):  # First call returns 0.0, get blocking measurement
                        await asyncio.sleep(0.1)  # Short sleep instead of blocking
                        cpu_percent = psutil.cpu_percent(interval=None)
                    memory_percent = psutil.virtual_memory().percent
                    disk_percent = psutil.disk_usage("/").percent
                    response_time = time.time() - start_time
                    status = HealthStatus.HEALTHY
                    messages = []
                    # CPU/memory breaches -> WARNING; disk breach -> CRITICAL.
                    if cpu_percent > self.alert_thresholds["cpu_usage"]:
                        status = HealthStatus.WARNING
                        messages.append(f"High CPU usage: {cpu_percent:.1f}%")
                    if memory_percent > self.alert_thresholds["memory_usage"]:
                        status = HealthStatus.WARNING
                        messages.append(f"High memory usage: {memory_percent:.1f}%")
                    if disk_percent > self.alert_thresholds["disk_usage"]:
                        status = HealthStatus.CRITICAL
                        messages.append(f"High disk usage: {disk_percent:.1f}%")
                    message = (
                        "; ".join(messages)
                        if messages
                        else "System resources are normal"
                    )
                    return HealthCheckResult(
                        component="system",
                        status=status,
                        message=message,
                        response_time=response_time,
                        metadata={
                            "cpu_percent": cpu_percent,
                            "memory_percent": memory_percent,
                            "disk_percent": disk_percent,
                        },
                        timestamp=datetime.now(timezone.utc),
                    )
                except Exception as e:
                    response_time = time.time() - start_time
                    return HealthCheckResult(
                        component="system",
                        status=HealthStatus.CRITICAL,
                        message=f"System check failed: {str(e)}",
                        response_time=response_time,
                        metadata={"error": str(e)},
                        timestamp=datetime.now(timezone.utc),
                    )
            # Register the health checks
            await self.register_health_check("database", database_check)
            await self.register_health_check("system", system_check)
        except Exception as e:
            logger.error(f"Failed to register default health checks: {e}")
async def _health_check_worker(self):
"""Background worker to perform health checks"""
while True:
try:
logger.debug("Running health checks...")
# Run all registered health checks
for component, check_func in self.health_checks.items():
try:
result = await check_func()
self.health_results[component] = result
# Update Prometheus metrics
if PROMETHEUS_AVAILABLE and "health_status" in self.metrics:
health_value = (
1 if result.status == HealthStatus.HEALTHY else 0
)
await self.record_metric(
"health_status", health_value, {"component": component}
)
self.total_checks += 1
if result.status in [
HealthStatus.WARNING,
HealthStatus.CRITICAL,
]:
self.failed_checks += 1
logger.warning(
f"Health check failed for {component}: {result.message}"
)
except Exception as e:
logger.error(f"Health check error for {component}: {e}")
self.failed_checks += 1
# Create error result
self.health_results[component] = HealthCheckResult(
component=component,
status=HealthStatus.CRITICAL,
message=f"Health check error: {str(e)}",
response_time=0.0,
metadata={"error": str(e)},
timestamp=datetime.now(timezone.utc),
)
# Store health check results
await self._store_health_results()
# Sleep until next check
await asyncio.sleep(self.check_interval)
except asyncio.CancelledError:
break
except Exception as e:
logger.error(f"Error in health check worker: {e}")
await asyncio.sleep(self.check_interval)
    async def _metrics_collection_worker(self):
        """Background loop: sample system metrics once per minute.

        Appends each sample to the in-memory history (pruned to
        ``metrics_retention_hours``) and mirrors cpu/memory/disk into the
        Prometheus gauges. Runs until the task is cancelled.
        """
        while True:
            try:
                # Collect system metrics
                system_metrics = await self._collect_system_metrics()
                if system_metrics:
                    # Store in history
                    self.system_metrics_history.append(system_metrics)
                    # Drop samples older than the retention window.
                    cutoff_time = datetime.now(timezone.utc) - timedelta(
                        hours=self.metrics_retention_hours
                    )
                    self.system_metrics_history = [
                        m
                        for m in self.system_metrics_history
                        if m.timestamp > cutoff_time
                    ]
                    # Update Prometheus metrics
                    if PROMETHEUS_AVAILABLE:
                        await self.record_metric("cpu_usage", system_metrics.cpu_usage)
                        await self.record_metric(
                            "memory_usage", system_metrics.memory_usage
                        )
                        await self.record_metric(
                            "disk_usage", system_metrics.disk_usage
                        )
                # Sleep for 1 minute
                await asyncio.sleep(60)
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in metrics collection worker: {e}")
                await asyncio.sleep(60)
async def _collect_system_metrics(self) -> SystemMetrics | None:
"""Collect current system metrics"""
try:
# Use non-blocking CPU measurement
cpu_usage = psutil.cpu_percent(interval=None)
if cpu_usage == 0.0: # First call, wait briefly and try again
await asyncio.sleep(0.1)
cpu_usage = psutil.cpu_percent(interval=None)
memory = psutil.virtual_memory()
disk = psutil.disk_usage("/")
# Handle potential network connection errors gracefully
try:
network_connections = len(psutil.net_connections())
except (psutil.AccessDenied, OSError):
network_connections = 0 # Fallback if access denied
return SystemMetrics(
cpu_usage=cpu_usage,
memory_usage=memory.percent,
disk_usage=(disk.used / disk.total) * 100,
network_connections=network_connections,
uptime=time.time() - psutil.boot_time(),
timestamp=datetime.now(timezone.utc),
)
except Exception as e:
logger.error(f"Failed to collect system metrics: {e}")
return None
    async def _setup_monitoring_tables(self):
        """Create the monitoring tables if they do not already exist.

        NOTE(review): the DDL uses SERIAL, JSONB and TIMESTAMP WITH TIME ZONE,
        which implies a PostgreSQL backend — confirm against DatabaseManager.
        Errors are logged, not raised.
        """
        try:
            # Health check results table
            await self.db_manager.execute_query(
                """
                CREATE TABLE IF NOT EXISTS health_check_results (
                id SERIAL PRIMARY KEY,
                component VARCHAR(100) NOT NULL,
                status VARCHAR(20) NOT NULL,
                message TEXT,
                response_time DECIMAL(8,3),
                metadata JSONB,
                timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
                )
                """
            )
            # System metrics table
            await self.db_manager.execute_query(
                """
                CREATE TABLE IF NOT EXISTS system_metrics (
                id SERIAL PRIMARY KEY,
                cpu_usage DECIMAL(5,2),
                memory_usage DECIMAL(5,2),
                disk_usage DECIMAL(5,2),
                network_connections INTEGER,
                uptime DECIMAL(12,2),
                timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
                )
                """
            )
            # Component metrics table
            await self.db_manager.execute_query(
                """
                CREATE TABLE IF NOT EXISTS component_metrics (
                id SERIAL PRIMARY KEY,
                component_name VARCHAR(100) NOT NULL,
                requests_total INTEGER DEFAULT 0,
                errors_total INTEGER DEFAULT 0,
                response_time_avg DECIMAL(8,3),
                active_connections INTEGER DEFAULT 0,
                last_error TEXT,
                uptime DECIMAL(12,2),
                timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
                )
                """
            )
        except Exception as e:
            logger.error(f"Failed to setup monitoring tables: {e}")
    async def _store_health_results(self):
        """Persist the latest health check results to the database.

        Inserts one row per component on every call (one per check cycle);
        metadata is serialized to JSON. Errors are logged, never raised.
        """
        try:
            for component, result in self.health_results.items():
                await self.db_manager.execute_query(
                    """
                    INSERT INTO health_check_results
                    (component, status, message, response_time, metadata, timestamp)
                    VALUES ($1, $2, $3, $4, $5, $6)
                    """,
                    component,
                    result.status.value,
                    result.message,
                    result.response_time,
                    json.dumps(result.metadata),
                    result.timestamp,
                )
        except Exception as e:
            logger.error(f"Failed to store health results: {e}")
    async def _cleanup_worker(self):
        """Background loop: once a day, purge monitoring rows older than 7 days.

        Cleans health_check_results and system_metrics tables. Runs until the
        task is cancelled; errors are logged and the loop continues.
        """
        while True:
            try:
                # Clean up old health check results (keep 7 days)
                cutoff_date = datetime.now(timezone.utc) - timedelta(days=7)
                deleted_health = await self.db_manager.execute_query(
                    """
                    DELETE FROM health_check_results
                    WHERE timestamp < $1
                    """,
                    cutoff_date,
                )
                # Clean up old system metrics (keep 7 days)
                deleted_metrics = await self.db_manager.execute_query(
                    """
                    DELETE FROM system_metrics
                    WHERE timestamp < $1
                    """,
                    cutoff_date,
                )
                # NOTE(review): assumes execute_query returns a truthy value
                # when rows were deleted — confirm DatabaseManager's contract.
                if deleted_health or deleted_metrics:
                    logger.info("Cleaned up old monitoring data")
                # Sleep for 24 hours
                await asyncio.sleep(86400)
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in cleanup worker: {e}")
                await asyncio.sleep(86400)
async def check_health(self) -> dict[str, str | bool | int | float]:
"""Check health of monitoring system"""
try:
return {
"initialized": self._initialized,
"prometheus_available": PROMETHEUS_AVAILABLE,
"registered_checks": len(self.health_checks),
"total_checks": self.total_checks,
"failed_checks": self.failed_checks,
"success_rate": (1 - self.failed_checks / max(self.total_checks, 1))
* 100,
}
except Exception as e:
return {"error": str(e), "healthy": False}
async def close(self):
"""Close health monitoring system"""
try:
logger.info("Closing health monitoring system...")
# Cancel background tasks
tasks = [
self._health_check_task,
self._metrics_collection_task,
self._cleanup_task,
]
for task in tasks:
if task:
task.cancel()
# Wait for tasks to complete
await asyncio.gather(*[t for t in tasks if t], return_exceptions=True)
logger.info("Health monitoring system closed")
except Exception as e:
logger.error(f"Error closing health monitoring: {e}")