feat: enhance system resource monitoring and HuggingFace token management
- Added system resource metrics to the gRPC `ServerInfo` message, including total and available RAM as well as GPU VRAM metrics.
- Implemented a `SystemResources` class to encapsulate resource data and provide a method for retrieving current system metrics.
- Updated `HfTokenService` to bootstrap token storage from settings if not already configured.
- Refactored the gRPC service to return system resource information in server responses, improving observability and performance tracking.
This commit is contained in: client (x2)
Submodule client updated: d8e84e27c3...429e02c3af
@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Final, cast
|
||||
|
||||
import httpx
|
||||
|
||||
from noteflow.config.settings import get_settings
|
||||
from noteflow.domain.utils.time import utc_now
|
||||
from noteflow.infrastructure.logging import get_logger
|
||||
from noteflow.infrastructure.security.protocols import EncryptedChunk
|
||||
@@ -121,6 +122,9 @@ class HfTokenService:
|
||||
metadata = await uow.preferences.get(_HF_TOKEN_META_KEY)
|
||||
|
||||
if encrypted_data is None:
|
||||
seeded = await self._bootstrap_from_settings()
|
||||
if seeded:
|
||||
return HfTokenStatus(True, False, "", None)
|
||||
return HfTokenStatus(False, False, "", None)
|
||||
|
||||
if metadata is not None and isinstance(metadata, dict):
|
||||
@@ -134,6 +138,16 @@ class HfTokenService:
|
||||
|
||||
return HfTokenStatus(True, is_validated, username, validated_at)
|
||||
|
||||
async def _bootstrap_from_settings(self) -> bool:
    """Seed token storage from settings when not yet configured.

    Returns:
        True when a token from the environment settings was stored.
    """
    env_token = get_settings().diarization_hf_token
    if not env_token:
        return False
    stored, _ = await self.set_token(env_token, validate=False)
    if stored:
        logger.info("hf_token_seeded_from_env")
    return stored
|
||||
|
||||
async def delete_token(self) -> bool:
|
||||
"""Delete the stored HuggingFace token."""
|
||||
async with self._uow_factory() as uow:
|
||||
|
||||
@@ -77,6 +77,11 @@ class ProtoServerInfoResponse(Protocol):
|
||||
active_meetings: int
|
||||
diarization_enabled: bool
|
||||
diarization_ready: bool
|
||||
system_ram_total_bytes: int
|
||||
system_ram_available_bytes: int
|
||||
gpu_vram_total_bytes: int
|
||||
gpu_vram_available_bytes: int
|
||||
def HasField(self, field_name: str) -> bool: ...
|
||||
|
||||
|
||||
class NoteFlowServiceStubProtocol(Protocol):
|
||||
|
||||
@@ -24,6 +24,7 @@ from noteflow.infrastructure.logging import (
|
||||
user_id_var,
|
||||
workspace_id_var,
|
||||
)
|
||||
from noteflow.infrastructure.metrics import get_system_resources
|
||||
from noteflow.infrastructure.persistence.memory import MemoryUnitOfWork
|
||||
from noteflow.infrastructure.persistence.unit_of_work import SqlAlchemyUnitOfWork
|
||||
from noteflow.infrastructure.security.crypto import AesGcmCryptoBox
|
||||
@@ -304,7 +305,8 @@ class ServicerInfoMixin:
|
||||
else:
|
||||
active = self.get_memory_store().active_count
|
||||
|
||||
return noteflow_pb2.ServerInfo(
|
||||
resources = get_system_resources()
|
||||
response = noteflow_pb2.ServerInfo(
|
||||
version=self.VERSION,
|
||||
asr_model=asr_model,
|
||||
asr_ready=asr_ready,
|
||||
@@ -316,6 +318,16 @@ class ServicerInfoMixin:
|
||||
diarization_ready=diarization_ready,
|
||||
state_version=self.STATE_VERSION,
|
||||
)
|
||||
if resources.ram_total_bytes is not None:
|
||||
response.system_ram_total_bytes = resources.ram_total_bytes
|
||||
if resources.ram_available_bytes is not None:
|
||||
response.system_ram_available_bytes = resources.ram_available_bytes
|
||||
if resources.gpu_vram_total_bytes is not None:
|
||||
response.gpu_vram_total_bytes = resources.gpu_vram_total_bytes
|
||||
if resources.gpu_vram_available_bytes is not None:
|
||||
response.gpu_vram_available_bytes = resources.gpu_vram_available_bytes
|
||||
|
||||
return response
|
||||
|
||||
|
||||
class ServicerLifecycleMixin:
|
||||
|
||||
@@ -31,6 +31,10 @@ class ServerInfo:
|
||||
active_meetings: int
|
||||
diarization_enabled: bool = False
|
||||
diarization_ready: bool = False
|
||||
system_ram_total_bytes: int | None = None
|
||||
system_ram_available_bytes: int | None = None
|
||||
gpu_vram_total_bytes: int | None = None
|
||||
gpu_vram_available_bytes: int | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -197,6 +197,26 @@ class NoteFlowClient(
|
||||
active_meetings=response.active_meetings,
|
||||
diarization_enabled=response.diarization_enabled,
|
||||
diarization_ready=response.diarization_ready,
|
||||
system_ram_total_bytes=(
|
||||
response.system_ram_total_bytes
|
||||
if response.HasField("system_ram_total_bytes")
|
||||
else None
|
||||
),
|
||||
system_ram_available_bytes=(
|
||||
response.system_ram_available_bytes
|
||||
if response.HasField("system_ram_available_bytes")
|
||||
else None
|
||||
),
|
||||
gpu_vram_total_bytes=(
|
||||
response.gpu_vram_total_bytes
|
||||
if response.HasField("gpu_vram_total_bytes")
|
||||
else None
|
||||
),
|
||||
gpu_vram_available_bytes=(
|
||||
response.gpu_vram_available_bytes
|
||||
if response.HasField("gpu_vram_available_bytes")
|
||||
else None
|
||||
),
|
||||
)
|
||||
except grpc.RpcError as e:
|
||||
logger.error("Failed to get server info: %s", e)
|
||||
|
||||
@@ -622,6 +622,18 @@ message ServerInfo {
|
||||
// Server state version for cache invalidation (Sprint GAP-002)
|
||||
// Increment when breaking state changes require client cache invalidation
|
||||
int64 state_version = 10;
|
||||
|
||||
// Total system RAM in bytes
|
||||
optional int64 system_ram_total_bytes = 11;
|
||||
|
||||
// Available system RAM in bytes
|
||||
optional int64 system_ram_available_bytes = 12;
|
||||
|
||||
// Total GPU VRAM in bytes (primary device)
|
||||
optional int64 gpu_vram_total_bytes = 13;
|
||||
|
||||
// Available GPU VRAM in bytes (primary device)
|
||||
optional int64 gpu_vram_available_bytes = 14;
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -1,9 +1,12 @@
|
||||
"""Metrics infrastructure for NoteFlow."""
|
||||
|
||||
from .collector import MetricsCollector, PerformanceMetrics, get_metrics_collector
|
||||
from .system_resources import SystemResources, get_system_resources
|
||||
|
||||
# Public API of the metrics package, sorted per RUF022 convention.
__all__ = [
    "MetricsCollector",
    "PerformanceMetrics",
    "SystemResources",
    "get_metrics_collector",
    "get_system_resources",
]
|
||||
|
||||
107
src/noteflow/infrastructure/metrics/system_resources.py
Normal file
107
src/noteflow/infrastructure/metrics/system_resources.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""System resource discovery helpers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Final, Protocol, cast
|
||||
|
||||
from noteflow.infrastructure.logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)

# Primary CUDA device index used for all VRAM queries below.
_GPU_DEVICE_INDEX: Final = 0
|
||||
|
||||
|
||||
class _TorchCudaProperties(Protocol):
|
||||
total_memory: int
|
||||
|
||||
|
||||
class _TorchCudaModule(Protocol):
    """Structural view of the ``torch.cuda`` module surface used here."""

    def is_available(self) -> bool: ...

    def mem_get_info(self) -> tuple[int, int]: ...

    def get_device_properties(self, device: int) -> _TorchCudaProperties: ...

    def memory_reserved(self, device: int) -> int: ...

    def memory_allocated(self, device: int) -> int: ...
|
||||
|
||||
|
||||
class _TorchModule(Protocol):
    """Structural view of the ``torch`` module; only ``cuda`` is accessed."""

    cuda: _TorchCudaModule
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class SystemResources:
|
||||
"""Snapshot of server memory resources."""
|
||||
|
||||
ram_total_bytes: int | None
|
||||
ram_available_bytes: int | None
|
||||
gpu_vram_total_bytes: int | None
|
||||
gpu_vram_available_bytes: int | None
|
||||
|
||||
|
||||
def get_system_resources() -> SystemResources:
    """Collect current system memory resources.

    Returns:
        SystemResources with byte counts when available.
    """
    total_ram, free_ram = _get_ram_bytes()
    total_vram, free_vram = _get_gpu_bytes()
    return SystemResources(
        ram_total_bytes=total_ram,
        ram_available_bytes=free_ram,
        gpu_vram_total_bytes=total_vram,
        gpu_vram_available_bytes=free_vram,
    )
|
||||
|
||||
|
||||
def _get_ram_bytes() -> tuple[int | None, int | None]:
    """Return (total, available) host RAM in bytes, or (None, None).

    Degrades gracefully: a missing psutil dependency or a failed read is
    logged at debug level rather than raised.
    """
    try:
        import psutil
    except ImportError as exc:
        logger.debug("psutil unavailable; cannot read system RAM: %s", exc)
        return None, None

    try:
        stats = psutil.virtual_memory()
    except (OSError, RuntimeError) as exc:
        logger.debug("Failed to read system RAM: %s", exc)
        return None, None

    return int(stats.total), int(stats.available)
|
||||
|
||||
|
||||
def _get_gpu_bytes() -> tuple[int | None, int | None]:
    """Return (total, free) VRAM in bytes for the primary CUDA device.

    Returns (None, None) when torch is not installed, CUDA is unavailable,
    or every query path fails; failures are logged at debug level.
    """
    try:
        import torch
    except ImportError as exc:
        logger.debug("torch unavailable; cannot read GPU memory: %s", exc)
        return None, None

    cuda = cast(_TorchModule, torch).cuda

    try:
        cuda_ok = cuda.is_available()
    except RuntimeError as exc:
        logger.debug("Failed to query CUDA availability: %s", exc)
        return None, None
    if not cuda_ok:
        return None, None

    # Preferred path: driver-reported (free, total) for the current device.
    if hasattr(cuda, "mem_get_info"):
        try:
            free_bytes, total_bytes = cuda.mem_get_info()
        except RuntimeError as exc:
            logger.debug("Failed to read CUDA mem info: %s", exc)
        else:
            return int(total_bytes), int(free_bytes)

    # Fallback: estimate free VRAM from the caching allocator's counters.
    try:
        props = cuda.get_device_properties(_GPU_DEVICE_INDEX)
        capacity = int(props.total_memory)
        in_use = max(
            int(cuda.memory_reserved(_GPU_DEVICE_INDEX)),
            int(cuda.memory_allocated(_GPU_DEVICE_INDEX)),
        )
        return capacity, max(capacity - in_use, 0)
    except RuntimeError as exc:
        logger.debug("Failed to estimate GPU memory: %s", exc)

    return None, None
|
||||
Reference in New Issue
Block a user