"""
|
|
Health Monitoring System for Discord Voice Chat Quote Bot
|
|
|
|
Implements comprehensive system monitoring with Prometheus metrics,
|
|
health checks, and performance tracking for all bot components.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import time
|
|
from dataclasses import asdict, dataclass
|
|
from datetime import datetime, timedelta, timezone
|
|
from enum import Enum
|
|
from typing import Callable, Optional
|
|
|
|
import psutil
|
|
|
|
try:
|
|
from prometheus_client import (CollectorRegistry, Counter, Gauge,
|
|
Histogram, generate_latest)
|
|
|
|
PROMETHEUS_AVAILABLE = True
|
|
except ImportError:
|
|
# Fallback for environments without prometheus_client
|
|
PROMETHEUS_AVAILABLE = False
|
|
Counter = Histogram = Gauge = CollectorRegistry = None
|
|
|
|
from core.database import DatabaseManager
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class HealthStatus(Enum):
    """Health status levels"""

    HEALTHY = "healthy"
    WARNING = "warning"
    CRITICAL = "critical"
    DOWN = "down"


class MetricType(Enum):
    """Types of metrics to track"""

    COUNTER = "counter"
    HISTOGRAM = "histogram"
    GAUGE = "gauge"


@dataclass
class HealthCheckResult:
    """Result of a health check"""

    component: str
    status: HealthStatus
    message: str
    response_time: float
    metadata: dict[str, str | float | int]
    timestamp: datetime


@dataclass
class SystemMetrics:
    """System performance metrics"""

    cpu_usage: float
    memory_usage: float
    disk_usage: float
    network_connections: int
    uptime: float
    timestamp: datetime


@dataclass
class ComponentMetrics:
    """Metrics for a specific component"""

    component_name: str
    requests_total: int
    errors_total: int
    response_time_avg: float
    active_connections: int
    last_error: Optional[str]
    uptime: float


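# Illustrative sketch (not used anywhere in the bot): health checks registered
# with HealthMonitor.register_health_check() below are async callables that
# return a HealthCheckResult like this one. The "cache" component name and the
# 0.5s threshold are made up for the example.
async def _example_cache_check() -> HealthCheckResult:
    started = time.time()
    # A real check would ping the component here (e.g. a cache round-trip).
    elapsed = time.time() - started
    return HealthCheckResult(
        component="cache",
        status=HealthStatus.HEALTHY if elapsed < 0.5 else HealthStatus.WARNING,
        message="Cache is responding" if elapsed < 0.5 else f"Cache is slow ({elapsed:.2f}s)",
        response_time=elapsed,
        metadata={"latency": elapsed},
        timestamp=datetime.now(timezone.utc),
    )

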
class HealthMonitor:
    """
    Comprehensive health monitoring system

    Features:
    - Prometheus metrics collection and export
    - Component health checks with automatic recovery
    - System resource monitoring (CPU, memory, disk)
    - Performance metrics tracking
    - Alert thresholds and notifications
    - Health dashboard generation
    - Automatic metric cleanup and rotation
    - Integration with Discord notifications

    See the usage sketch at the end of this module for how these pieces fit together.
    """

    def __init__(self, db_manager: DatabaseManager):
        self.db_manager = db_manager

        # Prometheus setup
        self.registry = (
            CollectorRegistry() if PROMETHEUS_AVAILABLE and CollectorRegistry else None
        )
        self.metrics = {}

        # Health check components
        self.health_checks: dict[str, Callable] = {}
        self.health_results: dict[str, HealthCheckResult] = {}

        # Performance tracking
        self.system_metrics_history: list[SystemMetrics] = []
        self.component_metrics: dict[str, ComponentMetrics] = {}

        # Configuration
        self.check_interval = 30  # seconds
        self.metrics_retention_hours = 24
        self.alert_thresholds = {
            "cpu_usage": 80.0,
            "memory_usage": 85.0,
            "disk_usage": 90.0,
            "error_rate": 5.0,
            "response_time": 5.0,
        }

        # Background tasks
        self._health_check_task = None
        self._metrics_collection_task = None
        self._cleanup_task = None

        # Statistics
        self.total_checks = 0
        self.failed_checks = 0
        self.alerts_sent = 0

        self._initialized = False

        # Initialize Prometheus metrics if available
        if PROMETHEUS_AVAILABLE:
            self._setup_prometheus_metrics()

    async def initialize(self):
        """Initialize the health monitoring system"""
        if self._initialized:
            return

        try:
            logger.info("Initializing health monitoring system...")

            # Setup database tables
            await self._setup_monitoring_tables()

            # Register default health checks
            await self._register_default_health_checks()

            # Start background tasks
            self._health_check_task = asyncio.create_task(self._health_check_worker())
            self._metrics_collection_task = asyncio.create_task(
                self._metrics_collection_worker()
            )
            self._cleanup_task = asyncio.create_task(self._cleanup_worker())

            self._initialized = True
            logger.info("Health monitoring system initialized successfully")

        except Exception as e:
            logger.error(f"Failed to initialize health monitoring: {e}")
            raise

    def _setup_prometheus_metrics(self):
        """Setup Prometheus metrics"""
        if not PROMETHEUS_AVAILABLE or not Gauge or not self.registry:
            return

        try:
            # System metrics
            self.metrics["cpu_usage"] = Gauge(
                "bot_cpu_usage_percent", "CPU usage percentage", registry=self.registry
            )

            self.metrics["memory_usage"] = Gauge(
                "bot_memory_usage_percent",
                "Memory usage percentage",
                registry=self.registry,
            )

            self.metrics["disk_usage"] = Gauge(
                "bot_disk_usage_percent",
                "Disk usage percentage",
                registry=self.registry,
            )

            # Component metrics
            self.metrics["requests_total"] = Counter(
                "bot_requests_total",
                "Total number of requests",
                ["component"],
                registry=self.registry,
            )

            self.metrics["errors_total"] = Counter(
                "bot_errors_total",
                "Total number of errors",
                ["component", "error_type"],
                registry=self.registry,
            )

            self.metrics["response_time"] = Histogram(
                "bot_response_time_seconds",
                "Response time in seconds",
                ["component"],
                registry=self.registry,
            )

            self.metrics["health_status"] = Gauge(
                "bot_component_health",
                "Component health status (1=healthy, 0=unhealthy)",
                ["component"],
                registry=self.registry,
            )

            # Bot-specific metrics
            self.metrics["quotes_processed"] = Counter(
                "bot_quotes_processed_total",
                "Total quotes processed",
                registry=self.registry,
            )

            self.metrics["users_active"] = Gauge(
                "bot_users_active", "Number of active users", registry=self.registry
            )

            self.metrics["voice_sessions"] = Gauge(
                "bot_voice_sessions_active",
                "Number of active voice sessions",
                registry=self.registry,
            )

            logger.info("Prometheus metrics initialized")

        except Exception as e:
            logger.error(f"Failed to setup Prometheus metrics: {e}")

    async def register_health_check(self, component: str, check_func: Callable):
        """Register a health check for a component"""
        try:
            self.health_checks[component] = check_func
            logger.info(f"Registered health check for component: {component}")

        except Exception as e:
            logger.error(f"Failed to register health check for {component}: {e}")

    async def record_metric(
        self, metric_name: str, value: float, labels: dict[str, str] | None = None
    ):
        """Record a metric value"""
        try:
            if not PROMETHEUS_AVAILABLE or metric_name not in self.metrics:
                return

            metric = self.metrics[metric_name]

            if labels:
                if hasattr(metric, "labels"):
                    metric.labels(**labels).set(value)
                else:
                    # For metrics without labels
                    metric.set(value)
            else:
                metric.set(value)

        except Exception as e:
            logger.error(f"Failed to record metric {metric_name}: {e}")

    async def increment_counter(
        self,
        metric_name: str,
        labels: dict[str, str] | None = None,
        amount: float = 1.0,
    ):
        """Increment a counter metric"""
        try:
            if not PROMETHEUS_AVAILABLE or metric_name not in self.metrics:
                return

            metric = self.metrics[metric_name]

            if labels and hasattr(metric, "labels"):
                metric.labels(**labels).inc(amount)
            else:
                metric.inc(amount)

        except Exception as e:
            logger.error(f"Failed to increment counter {metric_name}: {e}")

    async def observe_histogram(
        self, metric_name: str, value: float, labels: dict[str, str] | None = None
    ):
        """Observe a value in a histogram metric"""
        try:
            if not PROMETHEUS_AVAILABLE or metric_name not in self.metrics:
                return

            metric = self.metrics[metric_name]

            if labels and hasattr(metric, "labels"):
                metric.labels(**labels).observe(value)
            else:
                metric.observe(value)

        except Exception as e:
            logger.error(f"Failed to observe histogram {metric_name}: {e}")

    async def get_health_status(self) -> dict[str, str | dict | float | int]:
        """Get overall system health status"""
        try:
            overall_status = HealthStatus.HEALTHY
            component_statuses = {}

            # Check each component
            for component, result in self.health_results.items():
                component_statuses[component] = {
                    "status": result.status.value,
                    "message": result.message,
                    "response_time": result.response_time,
                    "last_check": result.timestamp.isoformat(),
                }

                # Determine overall status
                if result.status == HealthStatus.CRITICAL:
                    overall_status = HealthStatus.CRITICAL
                elif (
                    result.status == HealthStatus.WARNING
                    and overall_status == HealthStatus.HEALTHY
                ):
                    overall_status = HealthStatus.WARNING

            # Get system metrics
            system_metrics = await self._collect_system_metrics()

            return {
                "overall_status": overall_status.value,
                "components": component_statuses,
                "system_metrics": asdict(system_metrics) if system_metrics else {},
                "uptime": time.time() - psutil.boot_time(),
                "total_checks": self.total_checks,
                "failed_checks": self.failed_checks,
                "success_rate": (1 - self.failed_checks / max(self.total_checks, 1))
                * 100,
            }

        except Exception as e:
            logger.error(f"Failed to get health status: {e}")
            return {"overall_status": HealthStatus.CRITICAL.value, "error": str(e)}

    async def get_metrics_export(self) -> str:
        """Get Prometheus metrics export"""
        try:
            if not PROMETHEUS_AVAILABLE or not self.registry:
                return "# Prometheus not available\n"

            return generate_latest(self.registry).decode("utf-8")

        except Exception as e:
            logger.error(f"Failed to export metrics: {e}")
            return f"# Error exporting metrics: {e}\n"

    async def _register_default_health_checks(self):
        """Register default health checks for core components"""
        try:
            # Database health check
            async def database_check():
                start_time = time.time()
                try:
                    await self.db_manager.execute_query("SELECT 1", fetch_one=True)
                    response_time = time.time() - start_time

                    if response_time > 2.0:
                        return HealthCheckResult(
                            component="database",
                            status=HealthStatus.WARNING,
                            message=f"Database responding slowly ({response_time:.2f}s)",
                            response_time=response_time,
                            metadata={"query_time": response_time},
                            timestamp=datetime.now(timezone.utc),
                        )

                    return HealthCheckResult(
                        component="database",
                        status=HealthStatus.HEALTHY,
                        message="Database is responding normally",
                        response_time=response_time,
                        metadata={"query_time": response_time},
                        timestamp=datetime.now(timezone.utc),
                    )

                except Exception as e:
                    response_time = time.time() - start_time
                    return HealthCheckResult(
                        component="database",
                        status=HealthStatus.CRITICAL,
                        message=f"Database connection failed: {str(e)}",
                        response_time=response_time,
                        metadata={"error": str(e)},
                        timestamp=datetime.now(timezone.utc),
                    )

            # System resources check
            async def system_check():
                start_time = time.time()
                try:
                    # Use non-blocking CPU measurement to avoid conflicts
                    cpu_percent = psutil.cpu_percent(interval=None)
                    if (
                        cpu_percent == 0.0
                    ):  # First call returns 0.0, take a second sample
                        await asyncio.sleep(0.1)  # Short sleep instead of blocking
                        cpu_percent = psutil.cpu_percent(interval=None)

                    memory_percent = psutil.virtual_memory().percent
                    disk_percent = psutil.disk_usage("/").percent

                    response_time = time.time() - start_time

                    status = HealthStatus.HEALTHY
                    messages = []

                    if cpu_percent > self.alert_thresholds["cpu_usage"]:
                        status = HealthStatus.WARNING
                        messages.append(f"High CPU usage: {cpu_percent:.1f}%")

                    if memory_percent > self.alert_thresholds["memory_usage"]:
                        status = HealthStatus.WARNING
                        messages.append(f"High memory usage: {memory_percent:.1f}%")

                    if disk_percent > self.alert_thresholds["disk_usage"]:
                        status = HealthStatus.CRITICAL
                        messages.append(f"High disk usage: {disk_percent:.1f}%")

                    message = (
                        "; ".join(messages)
                        if messages
                        else "System resources are normal"
                    )

                    return HealthCheckResult(
                        component="system",
                        status=status,
                        message=message,
                        response_time=response_time,
                        metadata={
                            "cpu_percent": cpu_percent,
                            "memory_percent": memory_percent,
                            "disk_percent": disk_percent,
                        },
                        timestamp=datetime.now(timezone.utc),
                    )

                except Exception as e:
                    response_time = time.time() - start_time
                    return HealthCheckResult(
                        component="system",
                        status=HealthStatus.CRITICAL,
                        message=f"System check failed: {str(e)}",
                        response_time=response_time,
                        metadata={"error": str(e)},
                        timestamp=datetime.now(timezone.utc),
                    )

            # Register the health checks
            await self.register_health_check("database", database_check)
            await self.register_health_check("system", system_check)

        except Exception as e:
            logger.error(f"Failed to register default health checks: {e}")

    async def _health_check_worker(self):
        """Background worker to perform health checks"""
        while True:
            try:
                logger.debug("Running health checks...")

                # Run all registered health checks
                for component, check_func in self.health_checks.items():
                    try:
                        result = await check_func()
                        self.health_results[component] = result

                        # Update Prometheus metrics
                        if PROMETHEUS_AVAILABLE and "health_status" in self.metrics:
                            health_value = (
                                1 if result.status == HealthStatus.HEALTHY else 0
                            )
                            await self.record_metric(
                                "health_status", health_value, {"component": component}
                            )

                        self.total_checks += 1

                        if result.status in [
                            HealthStatus.WARNING,
                            HealthStatus.CRITICAL,
                        ]:
                            self.failed_checks += 1
                            logger.warning(
                                f"Health check failed for {component}: {result.message}"
                            )

                    except Exception as e:
                        logger.error(f"Health check error for {component}: {e}")
                        self.failed_checks += 1

                        # Create error result
                        self.health_results[component] = HealthCheckResult(
                            component=component,
                            status=HealthStatus.CRITICAL,
                            message=f"Health check error: {str(e)}",
                            response_time=0.0,
                            metadata={"error": str(e)},
                            timestamp=datetime.now(timezone.utc),
                        )

                # Store health check results
                await self._store_health_results()

                # Sleep until next check
                await asyncio.sleep(self.check_interval)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in health check worker: {e}")
                await asyncio.sleep(self.check_interval)

    async def _metrics_collection_worker(self):
        """Background worker to collect system metrics"""
        while True:
            try:
                # Collect system metrics
                system_metrics = await self._collect_system_metrics()

                if system_metrics:
                    # Store in history
                    self.system_metrics_history.append(system_metrics)

                    # Keep only recent metrics
                    cutoff_time = datetime.now(timezone.utc) - timedelta(
                        hours=self.metrics_retention_hours
                    )
                    self.system_metrics_history = [
                        m
                        for m in self.system_metrics_history
                        if m.timestamp > cutoff_time
                    ]

                    # Update Prometheus metrics
                    if PROMETHEUS_AVAILABLE:
                        await self.record_metric("cpu_usage", system_metrics.cpu_usage)
                        await self.record_metric(
                            "memory_usage", system_metrics.memory_usage
                        )
                        await self.record_metric(
                            "disk_usage", system_metrics.disk_usage
                        )

                # Sleep for 1 minute
                await asyncio.sleep(60)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in metrics collection worker: {e}")
                await asyncio.sleep(60)

    async def _collect_system_metrics(self) -> SystemMetrics | None:
        """Collect current system metrics"""
        try:
            # Use non-blocking CPU measurement
            cpu_usage = psutil.cpu_percent(interval=None)
            if cpu_usage == 0.0:  # First call, wait briefly and try again
                await asyncio.sleep(0.1)
                cpu_usage = psutil.cpu_percent(interval=None)

            memory = psutil.virtual_memory()
            disk = psutil.disk_usage("/")

            # Handle potential network connection errors gracefully
            try:
                network_connections = len(psutil.net_connections())
            except (psutil.AccessDenied, OSError):
                network_connections = 0  # Fallback if access denied

            return SystemMetrics(
                cpu_usage=cpu_usage,
                memory_usage=memory.percent,
                disk_usage=(disk.used / disk.total) * 100,
                network_connections=network_connections,
                uptime=time.time() - psutil.boot_time(),
                timestamp=datetime.now(timezone.utc),
            )

        except Exception as e:
            logger.error(f"Failed to collect system metrics: {e}")
            return None

    async def _setup_monitoring_tables(self):
        """Setup database tables for monitoring data"""
        try:
            # Health check results table
            await self.db_manager.execute_query(
                """
                CREATE TABLE IF NOT EXISTS health_check_results (
                    id SERIAL PRIMARY KEY,
                    component VARCHAR(100) NOT NULL,
                    status VARCHAR(20) NOT NULL,
                    message TEXT,
                    response_time DECIMAL(8,3),
                    metadata JSONB,
                    timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
                )
                """
            )

            # System metrics table
            await self.db_manager.execute_query(
                """
                CREATE TABLE IF NOT EXISTS system_metrics (
                    id SERIAL PRIMARY KEY,
                    cpu_usage DECIMAL(5,2),
                    memory_usage DECIMAL(5,2),
                    disk_usage DECIMAL(5,2),
                    network_connections INTEGER,
                    uptime DECIMAL(12,2),
                    timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
                )
                """
            )

            # Component metrics table
            await self.db_manager.execute_query(
                """
                CREATE TABLE IF NOT EXISTS component_metrics (
                    id SERIAL PRIMARY KEY,
                    component_name VARCHAR(100) NOT NULL,
                    requests_total INTEGER DEFAULT 0,
                    errors_total INTEGER DEFAULT 0,
                    response_time_avg DECIMAL(8,3),
                    active_connections INTEGER DEFAULT 0,
                    last_error TEXT,
                    uptime DECIMAL(12,2),
                    timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
                )
                """
            )

        except Exception as e:
            logger.error(f"Failed to setup monitoring tables: {e}")

    async def _store_health_results(self):
        """Store health check results in database"""
        try:
            for component, result in self.health_results.items():
                await self.db_manager.execute_query(
                    """
                    INSERT INTO health_check_results
                        (component, status, message, response_time, metadata, timestamp)
                    VALUES ($1, $2, $3, $4, $5, $6)
                    """,
                    component,
                    result.status.value,
                    result.message,
                    result.response_time,
                    json.dumps(result.metadata),
                    result.timestamp,
                )

        except Exception as e:
            logger.error(f"Failed to store health results: {e}")

    async def _cleanup_worker(self):
        """Background worker to clean up old monitoring data"""
        while True:
            try:
                # Clean up old health check results (keep 7 days)
                cutoff_date = datetime.now(timezone.utc) - timedelta(days=7)

                deleted_health = await self.db_manager.execute_query(
                    """
                    DELETE FROM health_check_results
                    WHERE timestamp < $1
                    """,
                    cutoff_date,
                )

                # Clean up old system metrics (keep 7 days)
                deleted_metrics = await self.db_manager.execute_query(
                    """
                    DELETE FROM system_metrics
                    WHERE timestamp < $1
                    """,
                    cutoff_date,
                )

                if deleted_health or deleted_metrics:
                    logger.info("Cleaned up old monitoring data")

                # Sleep for 24 hours
                await asyncio.sleep(86400)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in cleanup worker: {e}")
                await asyncio.sleep(86400)

    async def check_health(self) -> dict[str, str | bool | int | float]:
        """Check health of the monitoring system itself"""
        try:
            return {
                "initialized": self._initialized,
                "prometheus_available": PROMETHEUS_AVAILABLE,
                "registered_checks": len(self.health_checks),
                "total_checks": self.total_checks,
                "failed_checks": self.failed_checks,
                "success_rate": (1 - self.failed_checks / max(self.total_checks, 1))
                * 100,
            }

        except Exception as e:
            return {"error": str(e), "healthy": False}

    async def close(self):
        """Close health monitoring system"""
        try:
            logger.info("Closing health monitoring system...")

            # Cancel background tasks
            tasks = [
                self._health_check_task,
                self._metrics_collection_task,
                self._cleanup_task,
            ]

            for task in tasks:
                if task:
                    task.cancel()

            # Wait for tasks to complete
            await asyncio.gather(*[t for t in tasks if t], return_exceptions=True)

            logger.info("Health monitoring system closed")

        except Exception as e:
            logger.error(f"Error closing health monitoring: {e}")
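

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not executed by the bot): a rough outline of
# how the monitor is typically wired up, assuming an already-constructed
# DatabaseManager. The "transcriber" label and the 0.42s observation are
# made-up example values, not part of this codebase.
# ---------------------------------------------------------------------------
async def _health_monitor_demo(db_manager: DatabaseManager) -> None:
    monitor = HealthMonitor(db_manager)
    await monitor.initialize()

    # Additional checks can be registered at any time with
    # monitor.register_health_check(name, async_callable_returning_HealthCheckResult).

    # Record application metrics; these become no-ops if prometheus_client is missing.
    await monitor.increment_counter("quotes_processed")
    await monitor.observe_histogram(
        "response_time", 0.42, {"component": "transcriber"}
    )

    # Inspect aggregated health and export the Prometheus text format.
    status = await monitor.get_health_status()
    logger.info("Overall status: %s", status["overall_status"])
    print(await monitor.get_metrics_export())

    await monitor.close()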