Files
disbord/services/monitoring/health_monitor.py
Travis Vasceannie 3acb779569 chore: remove .env.example and add new files for project structure
- Deleted .env.example file as it is no longer needed.
- Added .gitignore to manage ignored files and directories.
- Introduced CLAUDE.md for AI provider integration documentation.
- Created dev.sh for development setup and scripts.
- Updated Dockerfile and Dockerfile.production for improved build processes.
- Added multiple test files and directories for comprehensive testing.
- Introduced new utility and service files for enhanced functionality.
- Organized codebase with new directories and files for better maintainability.
2025-08-27 23:00:19 -04:00

760 lines
27 KiB
Python

"""
Health Monitoring System for Discord Voice Chat Quote Bot
Implements comprehensive system monitoring with Prometheus metrics,
health checks, and performance tracking for all bot components.
"""
import asyncio
import json
import logging
import time
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta, timezone
from enum import Enum
from typing import Callable, Optional
import psutil
try:
from prometheus_client import (CollectorRegistry, Counter, Gauge,
Histogram, generate_latest)
PROMETHEUS_AVAILABLE = True
except ImportError:
# Fallback for environments without prometheus_client
PROMETHEUS_AVAILABLE = False
Counter = Histogram = Gauge = CollectorRegistry = None
from core.database import DatabaseManager
logger = logging.getLogger(__name__)
class HealthStatus(Enum):
    """Health status levels, ordered from best to worst."""
    HEALTHY = "healthy"    # component operating normally
    WARNING = "warning"    # degraded but still functional
    CRITICAL = "critical"  # failing; needs immediate attention
    DOWN = "down"          # component unavailable
class MetricType(Enum):
    """Kinds of Prometheus-style metrics this module can track."""
    COUNTER = "counter"      # monotonically increasing count
    HISTOGRAM = "histogram"  # distribution of observed values
    GAUGE = "gauge"          # value that can go up or down
@dataclass
class HealthCheckResult:
    """Result of a single component health check."""
    component: str        # name of the checked component (e.g. "database", "system")
    status: HealthStatus  # outcome severity
    message: str          # human-readable summary of the check outcome
    response_time: float  # seconds the check took to run
    metadata: dict[str, str | float | int]  # check-specific details (e.g. query_time, error)
    timestamp: datetime   # when the check completed (UTC in this module)
@dataclass
class SystemMetrics:
    """Point-in-time snapshot of host resource usage."""
    cpu_usage: float          # CPU utilisation percent
    memory_usage: float       # virtual-memory usage percent
    disk_usage: float         # root filesystem usage percent
    network_connections: int  # open network connections (0 if access was denied)
    uptime: float             # seconds since system boot
    timestamp: datetime       # collection time (UTC)
@dataclass
class ComponentMetrics:
    """Aggregated performance metrics for one bot component."""
    component_name: str      # component identifier
    requests_total: int      # cumulative requests handled
    errors_total: int        # cumulative errors raised
    response_time_avg: float  # average response time in seconds
    active_connections: int  # currently open connections
    # Modernized from Optional[str] to match the `X | None` style used elsewhere in this file.
    last_error: str | None   # most recent error message, if any
    uptime: float            # component uptime in seconds
class HealthMonitor:
    """
    Comprehensive health monitoring system
    Features:
    - Prometheus metrics collection and export
    - Component health checks with automatic recovery
    - System resource monitoring (CPU, memory, disk)
    - Performance metrics tracking
    - Alert thresholds and notifications
    - Health dashboard generation
    - Automatic metric cleanup and rotation
    - Integration with Discord notifications
    """
    def __init__(self, db_manager: DatabaseManager):
        """Create the monitor; call ``initialize()`` to start background workers."""
        self.db_manager = db_manager
        # Prometheus setup — registry stays None when prometheus_client is absent.
        self.registry = (
            CollectorRegistry() if PROMETHEUS_AVAILABLE and CollectorRegistry else None
        )
        self.metrics = {}  # metric key -> Prometheus metric object
        # Health check components
        self.health_checks: dict[str, Callable] = {}  # component -> async check callable
        self.health_results: dict[str, HealthCheckResult] = {}  # component -> latest result
        # Performance tracking
        self.system_metrics_history: list[SystemMetrics] = []
        self.component_metrics: dict[str, ComponentMetrics] = {}
        # Configuration
        self.check_interval = 30  # seconds between health-check sweeps
        self.metrics_retention_hours = 24  # in-memory system-metrics window
        # Thresholds consulted by the system resource check (cpu/memory/disk);
        # error_rate and response_time are not referenced in this file — TODO confirm usage.
        self.alert_thresholds = {
            "cpu_usage": 80.0,
            "memory_usage": 85.0,
            "disk_usage": 90.0,
            "error_rate": 5.0,
            "response_time": 5.0,
        }
        # Background tasks (created in initialize(), cancelled in close()).
        self._health_check_task = None
        self._metrics_collection_task = None
        self._cleanup_task = None
        # Statistics
        self.total_checks = 0
        self.failed_checks = 0
        self.alerts_sent = 0
        self._initialized = False
        # Initialize Prometheus metrics if available
        if PROMETHEUS_AVAILABLE:
            self._setup_prometheus_metrics()
    async def initialize(self):
        """Initialize the health monitoring system.

        Creates the database tables, registers the default health checks,
        then starts the three background workers. Idempotent: returns
        immediately if already initialized. Logs and re-raises on failure.
        """
        if self._initialized:
            return
        try:
            logger.info("Initializing health monitoring system...")
            # Setup database tables
            await self._setup_monitoring_tables()
            # Register default health checks
            await self._register_default_health_checks()
            # Start background tasks
            self._health_check_task = asyncio.create_task(self._health_check_worker())
            self._metrics_collection_task = asyncio.create_task(
                self._metrics_collection_worker()
            )
            self._cleanup_task = asyncio.create_task(self._cleanup_worker())
            self._initialized = True
            logger.info("Health monitoring system initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize health monitoring: {e}")
            raise
def _setup_prometheus_metrics(self):
"""Setup Prometheus metrics"""
if not PROMETHEUS_AVAILABLE or not Gauge or not self.registry:
return
try:
# System metrics
self.metrics["cpu_usage"] = Gauge(
"bot_cpu_usage_percent", "CPU usage percentage", registry=self.registry
)
self.metrics["memory_usage"] = Gauge(
"bot_memory_usage_percent",
"Memory usage percentage",
registry=self.registry,
)
self.metrics["disk_usage"] = Gauge(
"bot_disk_usage_percent",
"Disk usage percentage",
registry=self.registry,
)
# Component metrics
self.metrics["requests_total"] = Counter(
"bot_requests_total",
"Total number of requests",
["component"],
registry=self.registry,
)
self.metrics["errors_total"] = Counter(
"bot_errors_total",
"Total number of errors",
["component", "error_type"],
registry=self.registry,
)
self.metrics["response_time"] = Histogram(
"bot_response_time_seconds",
"Response time in seconds",
["component"],
registry=self.registry,
)
self.metrics["health_status"] = Gauge(
"bot_component_health",
"Component health status (1=healthy, 0=unhealthy)",
["component"],
registry=self.registry,
)
# Bot-specific metrics
self.metrics["quotes_processed"] = Counter(
"bot_quotes_processed_total",
"Total quotes processed",
registry=self.registry,
)
self.metrics["users_active"] = Gauge(
"bot_users_active", "Number of active users", registry=self.registry
)
self.metrics["voice_sessions"] = Gauge(
"bot_voice_sessions_active",
"Number of active voice sessions",
registry=self.registry,
)
logger.info("Prometheus metrics initialized")
except Exception as e:
logger.error(f"Failed to setup Prometheus metrics: {e}")
    async def register_health_check(self, component: str, check_func: Callable):
        """Register *check_func* as the health check for *component*.

        *check_func* is an async callable returning a HealthCheckResult;
        registering the same component again replaces the previous check.
        Errors are logged, never raised.
        """
        try:
            self.health_checks[component] = check_func
            logger.info(f"Registered health check for component: {component}")
        except Exception as e:
            logger.error(f"Failed to register health check for {component}: {e}")
async def record_metric(
self, metric_name: str, value: float, labels: dict[str, str] | None = None
):
"""Record a metric value"""
try:
if not PROMETHEUS_AVAILABLE or metric_name not in self.metrics:
return
metric = self.metrics[metric_name]
if labels:
if hasattr(metric, "labels"):
metric.labels(**labels).set(value)
else:
# For metrics without labels
metric.set(value)
else:
metric.set(value)
except Exception as e:
logger.error(f"Failed to record metric {metric_name}: {e}")
async def increment_counter(
self,
metric_name: str,
labels: dict[str, str] | None = None,
amount: float = 1.0,
):
"""Increment a counter metric"""
try:
if not PROMETHEUS_AVAILABLE or metric_name not in self.metrics:
return
metric = self.metrics[metric_name]
if labels and hasattr(metric, "labels"):
metric.labels(**labels).inc(amount)
else:
metric.inc(amount)
except Exception as e:
logger.error(f"Failed to increment counter {metric_name}: {e}")
async def observe_histogram(
self, metric_name: str, value: float, labels: dict[str, str] | None = None
):
"""Observe a value in a histogram metric"""
try:
if not PROMETHEUS_AVAILABLE or metric_name not in self.metrics:
return
metric = self.metrics[metric_name]
if labels and hasattr(metric, "labels"):
metric.labels(**labels).observe(value)
else:
metric.observe(value)
except Exception as e:
logger.error(f"Failed to observe histogram {metric_name}: {e}")
async def get_health_status(self) -> dict[str, str | dict | float | int]:
"""Get overall system health status"""
try:
overall_status = HealthStatus.HEALTHY
component_statuses = {}
# Check each component
for component, result in self.health_results.items():
component_statuses[component] = {
"status": result.status.value,
"message": result.message,
"response_time": result.response_time,
"last_check": result.timestamp.isoformat(),
}
# Determine overall status
if result.status == HealthStatus.CRITICAL:
overall_status = HealthStatus.CRITICAL
elif (
result.status == HealthStatus.WARNING
and overall_status == HealthStatus.HEALTHY
):
overall_status = HealthStatus.WARNING
# Get system metrics
system_metrics = await self._collect_system_metrics()
return {
"overall_status": overall_status.value,
"components": component_statuses,
"system_metrics": asdict(system_metrics) if system_metrics else {},
"uptime": time.time() - psutil.boot_time(),
"total_checks": self.total_checks,
"failed_checks": self.failed_checks,
"success_rate": (1 - self.failed_checks / max(self.total_checks, 1))
* 100,
}
except Exception as e:
logger.error(f"Failed to get health status: {e}")
return {"overall_status": HealthStatus.CRITICAL.value, "error": str(e)}
async def get_metrics_export(self) -> str:
"""Get Prometheus metrics export"""
try:
if not PROMETHEUS_AVAILABLE or not self.registry:
return "# Prometheus not available\n"
return generate_latest(self.registry).decode("utf-8")
except Exception as e:
logger.error(f"Failed to export metrics: {e}")
return f"# Error exporting metrics: {e}\n"
    async def _register_default_health_checks(self):
        """Register the built-in health checks: "database" and "system".

        Each check is an async closure that always returns a
        HealthCheckResult — failures are reported as CRITICAL results
        rather than raised to the caller.
        """
        try:
            # Database health check: run a trivial query and time it.
            async def database_check():
                start_time = time.time()
                try:
                    await self.db_manager.execute_query("SELECT 1", fetch_one=True)
                    response_time = time.time() - start_time
                    # Slow (>2s) but successful queries downgrade to WARNING.
                    if response_time > 2.0:
                        return HealthCheckResult(
                            component="database",
                            status=HealthStatus.WARNING,
                            message=f"Database responding slowly ({response_time:.2f}s)",
                            response_time=response_time,
                            metadata={"query_time": response_time},
                            timestamp=datetime.now(timezone.utc),
                        )
                    return HealthCheckResult(
                        component="database",
                        status=HealthStatus.HEALTHY,
                        message="Database is responding normally",
                        response_time=response_time,
                        metadata={"query_time": response_time},
                        timestamp=datetime.now(timezone.utc),
                    )
                except Exception as e:
                    response_time = time.time() - start_time
                    return HealthCheckResult(
                        component="database",
                        status=HealthStatus.CRITICAL,
                        message=f"Database connection failed: {str(e)}",
                        response_time=response_time,
                        metadata={"error": str(e)},
                        timestamp=datetime.now(timezone.utc),
                    )
            # System resources check: CPU, memory and disk against thresholds.
            async def system_check():
                start_time = time.time()
                try:
                    # Use non-blocking CPU measurement to avoid conflicts
                    cpu_percent = psutil.cpu_percent(interval=None)
                    if (
                        cpu_percent == 0.0
                    ):  # First call returns 0.0, get blocking measurement
                        await asyncio.sleep(0.1)  # Short sleep instead of blocking
                        cpu_percent = psutil.cpu_percent(interval=None)
                    memory_percent = psutil.virtual_memory().percent
                    disk_percent = psutil.disk_usage("/").percent
                    response_time = time.time() - start_time
                    status = HealthStatus.HEALTHY
                    messages = []
                    # CPU/memory breaches -> WARNING; disk breach -> CRITICAL.
                    if cpu_percent > self.alert_thresholds["cpu_usage"]:
                        status = HealthStatus.WARNING
                        messages.append(f"High CPU usage: {cpu_percent:.1f}%")
                    if memory_percent > self.alert_thresholds["memory_usage"]:
                        status = HealthStatus.WARNING
                        messages.append(f"High memory usage: {memory_percent:.1f}%")
                    if disk_percent > self.alert_thresholds["disk_usage"]:
                        status = HealthStatus.CRITICAL
                        messages.append(f"High disk usage: {disk_percent:.1f}%")
                    message = (
                        "; ".join(messages)
                        if messages
                        else "System resources are normal"
                    )
                    return HealthCheckResult(
                        component="system",
                        status=status,
                        message=message,
                        response_time=response_time,
                        metadata={
                            "cpu_percent": cpu_percent,
                            "memory_percent": memory_percent,
                            "disk_percent": disk_percent,
                        },
                        timestamp=datetime.now(timezone.utc),
                    )
                except Exception as e:
                    response_time = time.time() - start_time
                    return HealthCheckResult(
                        component="system",
                        status=HealthStatus.CRITICAL,
                        message=f"System check failed: {str(e)}",
                        response_time=response_time,
                        metadata={"error": str(e)},
                        timestamp=datetime.now(timezone.utc),
                    )
            # Register the health checks
            await self.register_health_check("database", database_check)
            await self.register_health_check("system", system_check)
        except Exception as e:
            logger.error(f"Failed to register default health checks: {e}")
async def _health_check_worker(self):
"""Background worker to perform health checks"""
while True:
try:
logger.debug("Running health checks...")
# Run all registered health checks
for component, check_func in self.health_checks.items():
try:
result = await check_func()
self.health_results[component] = result
# Update Prometheus metrics
if PROMETHEUS_AVAILABLE and "health_status" in self.metrics:
health_value = (
1 if result.status == HealthStatus.HEALTHY else 0
)
await self.record_metric(
"health_status", health_value, {"component": component}
)
self.total_checks += 1
if result.status in [
HealthStatus.WARNING,
HealthStatus.CRITICAL,
]:
self.failed_checks += 1
logger.warning(
f"Health check failed for {component}: {result.message}"
)
except Exception as e:
logger.error(f"Health check error for {component}: {e}")
self.failed_checks += 1
# Create error result
self.health_results[component] = HealthCheckResult(
component=component,
status=HealthStatus.CRITICAL,
message=f"Health check error: {str(e)}",
response_time=0.0,
metadata={"error": str(e)},
timestamp=datetime.now(timezone.utc),
)
# Store health check results
await self._store_health_results()
# Sleep until next check
await asyncio.sleep(self.check_interval)
except asyncio.CancelledError:
break
except Exception as e:
logger.error(f"Error in health check worker: {e}")
await asyncio.sleep(self.check_interval)
    async def _metrics_collection_worker(self):
        """Background loop: sample system metrics once per minute.

        Appends each sample to the in-memory history (pruned to
        ``metrics_retention_hours``) and mirrors cpu/memory/disk into the
        Prometheus gauges. Runs until the task is cancelled.
        """
        while True:
            try:
                # Collect system metrics
                system_metrics = await self._collect_system_metrics()
                if system_metrics:
                    # Store in history
                    self.system_metrics_history.append(system_metrics)
                    # Drop samples older than the retention window.
                    cutoff_time = datetime.now(timezone.utc) - timedelta(
                        hours=self.metrics_retention_hours
                    )
                    self.system_metrics_history = [
                        m
                        for m in self.system_metrics_history
                        if m.timestamp > cutoff_time
                    ]
                    # Update Prometheus metrics
                    if PROMETHEUS_AVAILABLE:
                        await self.record_metric("cpu_usage", system_metrics.cpu_usage)
                        await self.record_metric(
                            "memory_usage", system_metrics.memory_usage
                        )
                        await self.record_metric(
                            "disk_usage", system_metrics.disk_usage
                        )
                # Sleep for 1 minute
                await asyncio.sleep(60)
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in metrics collection worker: {e}")
                await asyncio.sleep(60)
async def _collect_system_metrics(self) -> SystemMetrics | None:
"""Collect current system metrics"""
try:
# Use non-blocking CPU measurement
cpu_usage = psutil.cpu_percent(interval=None)
if cpu_usage == 0.0: # First call, wait briefly and try again
await asyncio.sleep(0.1)
cpu_usage = psutil.cpu_percent(interval=None)
memory = psutil.virtual_memory()
disk = psutil.disk_usage("/")
# Handle potential network connection errors gracefully
try:
network_connections = len(psutil.net_connections())
except (psutil.AccessDenied, OSError):
network_connections = 0 # Fallback if access denied
return SystemMetrics(
cpu_usage=cpu_usage,
memory_usage=memory.percent,
disk_usage=(disk.used / disk.total) * 100,
network_connections=network_connections,
uptime=time.time() - psutil.boot_time(),
timestamp=datetime.now(timezone.utc),
)
except Exception as e:
logger.error(f"Failed to collect system metrics: {e}")
return None
    async def _setup_monitoring_tables(self):
        """Create the monitoring tables if they do not already exist.

        NOTE(review): the DDL uses SERIAL, JSONB and TIMESTAMP WITH TIME ZONE,
        which implies a PostgreSQL backend — confirm against DatabaseManager.
        Errors are logged, not raised.
        """
        try:
            # Health check results table
            await self.db_manager.execute_query(
                """
                CREATE TABLE IF NOT EXISTS health_check_results (
                id SERIAL PRIMARY KEY,
                component VARCHAR(100) NOT NULL,
                status VARCHAR(20) NOT NULL,
                message TEXT,
                response_time DECIMAL(8,3),
                metadata JSONB,
                timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
                )
                """
            )
            # System metrics table
            await self.db_manager.execute_query(
                """
                CREATE TABLE IF NOT EXISTS system_metrics (
                id SERIAL PRIMARY KEY,
                cpu_usage DECIMAL(5,2),
                memory_usage DECIMAL(5,2),
                disk_usage DECIMAL(5,2),
                network_connections INTEGER,
                uptime DECIMAL(12,2),
                timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
                )
                """
            )
            # Component metrics table
            await self.db_manager.execute_query(
                """
                CREATE TABLE IF NOT EXISTS component_metrics (
                id SERIAL PRIMARY KEY,
                component_name VARCHAR(100) NOT NULL,
                requests_total INTEGER DEFAULT 0,
                errors_total INTEGER DEFAULT 0,
                response_time_avg DECIMAL(8,3),
                active_connections INTEGER DEFAULT 0,
                last_error TEXT,
                uptime DECIMAL(12,2),
                timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
                )
                """
            )
        except Exception as e:
            logger.error(f"Failed to setup monitoring tables: {e}")
    async def _store_health_results(self):
        """Persist the latest health check results to the database.

        Inserts one row per component on every call (one per check cycle);
        metadata is serialized to JSON. Errors are logged, never raised.
        """
        try:
            for component, result in self.health_results.items():
                await self.db_manager.execute_query(
                    """
                    INSERT INTO health_check_results
                    (component, status, message, response_time, metadata, timestamp)
                    VALUES ($1, $2, $3, $4, $5, $6)
                    """,
                    component,
                    result.status.value,
                    result.message,
                    result.response_time,
                    json.dumps(result.metadata),
                    result.timestamp,
                )
        except Exception as e:
            logger.error(f"Failed to store health results: {e}")
    async def _cleanup_worker(self):
        """Background loop: once a day, purge monitoring rows older than 7 days.

        Cleans health_check_results and system_metrics tables. Runs until the
        task is cancelled; errors are logged and the loop continues.
        """
        while True:
            try:
                # Clean up old health check results (keep 7 days)
                cutoff_date = datetime.now(timezone.utc) - timedelta(days=7)
                deleted_health = await self.db_manager.execute_query(
                    """
                    DELETE FROM health_check_results
                    WHERE timestamp < $1
                    """,
                    cutoff_date,
                )
                # Clean up old system metrics (keep 7 days)
                deleted_metrics = await self.db_manager.execute_query(
                    """
                    DELETE FROM system_metrics
                    WHERE timestamp < $1
                    """,
                    cutoff_date,
                )
                # NOTE(review): assumes execute_query returns a truthy value
                # when rows were deleted — confirm DatabaseManager's contract.
                if deleted_health or deleted_metrics:
                    logger.info("Cleaned up old monitoring data")
                # Sleep for 24 hours
                await asyncio.sleep(86400)
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in cleanup worker: {e}")
                await asyncio.sleep(86400)
async def check_health(self) -> dict[str, str | bool | int | float]:
"""Check health of monitoring system"""
try:
return {
"initialized": self._initialized,
"prometheus_available": PROMETHEUS_AVAILABLE,
"registered_checks": len(self.health_checks),
"total_checks": self.total_checks,
"failed_checks": self.failed_checks,
"success_rate": (1 - self.failed_checks / max(self.total_checks, 1))
* 100,
}
except Exception as e:
return {"error": str(e), "healthy": False}
async def close(self):
"""Close health monitoring system"""
try:
logger.info("Closing health monitoring system...")
# Cancel background tasks
tasks = [
self._health_check_task,
self._metrics_collection_task,
self._cleanup_task,
]
for task in tasks:
if task:
task.cancel()
# Wait for tasks to complete
await asyncio.gather(*[t for t in tasks if t], return_exceptions=True)
logger.info("Health monitoring system closed")
except Exception as e:
logger.error(f"Error closing health monitoring: {e}")