feat: enhance system resource monitoring and HuggingFace token management

- Added system resource metrics to the gRPC `ServerInfo` message: total and available system RAM, plus total and available GPU VRAM for the primary device.
- Implemented a `SystemResources` dataclass and a `get_system_resources()` helper that capture current RAM and GPU memory (see the sketch below).
- Updated `HfTokenService` to bootstrap token storage from settings when no token has been configured yet.
- Refactored the gRPC servicer to include system resource information in server info responses, improving observability and performance tracking.
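
As a rough illustration of the new surface (a hedged sketch, not code from this commit: only `get_system_resources()` and the `SystemResources` field names come from the diffs below; the logging hook and GiB formatting are assumptions):

```python
# Illustrative only: log the resource snapshot, e.g. at server startup.
from noteflow.infrastructure.metrics import get_system_resources


def log_resource_snapshot() -> None:
    res = get_system_resources()

    def gib(value: int | None) -> str:
        # Fields are None when psutil/torch are unavailable or probing failed.
        return "unknown" if value is None else f"{value / 2**30:.1f} GiB"

    print(
        f"RAM {gib(res.ram_available_bytes)} free of {gib(res.ram_total_bytes)}; "
        f"VRAM {gib(res.gpu_vram_available_bytes)} free of {gib(res.gpu_vram_total_bytes)}"
    )
```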
2026-01-13 16:06:30 +00:00
parent ee82183fd7
commit 7df238c531
10 changed files with 505 additions and 328 deletions

client

Submodule client updated: d8e84e27c3...429e02c3af


@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Final, cast
import httpx
from noteflow.config.settings import get_settings
from noteflow.domain.utils.time import utc_now
from noteflow.infrastructure.logging import get_logger
from noteflow.infrastructure.security.protocols import EncryptedChunk
@@ -121,6 +122,9 @@ class HfTokenService:
            metadata = await uow.preferences.get(_HF_TOKEN_META_KEY)
            if encrypted_data is None:
                seeded = await self._bootstrap_from_settings()
                if seeded:
                    return HfTokenStatus(True, False, "", None)
                return HfTokenStatus(False, False, "", None)
            if metadata is not None and isinstance(metadata, dict):
@@ -134,6 +138,16 @@ class HfTokenService:
            return HfTokenStatus(True, is_validated, username, validated_at)

    async def _bootstrap_from_settings(self) -> bool:
        """Seed token storage from settings when not yet configured."""
        token = get_settings().diarization_hf_token
        if not token:
            return False
        success, _ = await self.set_token(token, validate=False)
        if success:
            logger.info("hf_token_seeded_from_env")
        return success

    async def delete_token(self) -> bool:
        """Delete the stored HuggingFace token."""
        async with self._uow_factory() as uow:
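
The bootstrap only runs when no encrypted token is stored, so a settings-provided token is seeded only when nothing has been saved yet and never overwrites a token set later through the API. A minimal, self-contained sketch of that seed-once pattern (the class and field names below are stand-ins, not the real `HfTokenService`/`HfTokenStatus` API):

```python
# Illustrative stand-in for the seed-once flow; not the real service classes.
from dataclasses import dataclass


@dataclass
class TokenStatus:
    configured: bool
    validated: bool


class TokenStore:
    def __init__(self, settings_token: str | None) -> None:
        self._settings_token = settings_token
        self._stored: str | None = None

    async def status(self) -> TokenStatus:
        if self._stored is None and self._settings_token:
            # Mirrors _bootstrap_from_settings(): persist once, skip validation.
            self._stored = self._settings_token
        return TokenStatus(configured=self._stored is not None, validated=False)
```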


@@ -77,6 +77,11 @@ class ProtoServerInfoResponse(Protocol):
    active_meetings: int
    diarization_enabled: bool
    diarization_ready: bool
    system_ram_total_bytes: int
    system_ram_available_bytes: int
    gpu_vram_total_bytes: int
    gpu_vram_available_bytes: int

    def HasField(self, field_name: str) -> bool: ...


class NoteFlowServiceStubProtocol(Protocol):


@@ -24,6 +24,7 @@ from noteflow.infrastructure.logging import (
    user_id_var,
    workspace_id_var,
)
from noteflow.infrastructure.metrics import get_system_resources
from noteflow.infrastructure.persistence.memory import MemoryUnitOfWork
from noteflow.infrastructure.persistence.unit_of_work import SqlAlchemyUnitOfWork
from noteflow.infrastructure.security.crypto import AesGcmCryptoBox
@@ -304,7 +305,8 @@ class ServicerInfoMixin:
        else:
            active = self.get_memory_store().active_count
-       return noteflow_pb2.ServerInfo(
+       resources = get_system_resources()
+       response = noteflow_pb2.ServerInfo(
            version=self.VERSION,
            asr_model=asr_model,
            asr_ready=asr_ready,
@@ -316,6 +318,16 @@ class ServicerInfoMixin:
            diarization_ready=diarization_ready,
            state_version=self.STATE_VERSION,
        )
        if resources.ram_total_bytes is not None:
            response.system_ram_total_bytes = resources.ram_total_bytes
        if resources.ram_available_bytes is not None:
            response.system_ram_available_bytes = resources.ram_available_bytes
        if resources.gpu_vram_total_bytes is not None:
            response.gpu_vram_total_bytes = resources.gpu_vram_total_bytes
        if resources.gpu_vram_available_bytes is not None:
            response.gpu_vram_available_bytes = resources.gpu_vram_available_bytes
        return response
class ServicerLifecycleMixin:


@@ -31,6 +31,10 @@ class ServerInfo:
    active_meetings: int
    diarization_enabled: bool = False
    diarization_ready: bool = False
    system_ram_total_bytes: int | None = None
    system_ram_available_bytes: int | None = None
    gpu_vram_total_bytes: int | None = None
    gpu_vram_available_bytes: int | None = None


@dataclass


@@ -197,6 +197,26 @@ class NoteFlowClient(
                active_meetings=response.active_meetings,
                diarization_enabled=response.diarization_enabled,
                diarization_ready=response.diarization_ready,
                system_ram_total_bytes=(
                    response.system_ram_total_bytes
                    if response.HasField("system_ram_total_bytes")
                    else None
                ),
                system_ram_available_bytes=(
                    response.system_ram_available_bytes
                    if response.HasField("system_ram_available_bytes")
                    else None
                ),
                gpu_vram_total_bytes=(
                    response.gpu_vram_total_bytes
                    if response.HasField("gpu_vram_total_bytes")
                    else None
                ),
                gpu_vram_available_bytes=(
                    response.gpu_vram_available_bytes
                    if response.HasField("gpu_vram_available_bytes")
                    else None
                ),
            )
        except grpc.RpcError as e:
            logger.error("Failed to get server info: %s", e)


@@ -622,6 +622,18 @@ message ServerInfo {
  // Server state version for cache invalidation (Sprint GAP-002)
  // Increment when breaking state changes require client cache invalidation
  int64 state_version = 10;

  // Total system RAM in bytes
  optional int64 system_ram_total_bytes = 11;

  // Available system RAM in bytes
  optional int64 system_ram_available_bytes = 12;

  // Total GPU VRAM in bytes (primary device)
  optional int64 gpu_vram_total_bytes = 13;

  // Available GPU VRAM in bytes (primary device)
  optional int64 gpu_vram_available_bytes = 14;
}
// =============================================================================
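
The new fields use proto3 `optional`, which adds explicit presence tracking: the servicer can leave them unset when a probe fails, and the client's `HasField` checks can distinguish "not reported" from a genuine zero. A small illustration of that behavior (standard protobuf semantics; the generated module's import path is an assumption):

```python
from noteflow.grpc import noteflow_pb2  # generated module; import path assumed

info = noteflow_pb2.ServerInfo()
assert not info.HasField("gpu_vram_total_bytes")  # never assigned
assert info.gpu_vram_total_bytes == 0             # unset scalar still reads as 0

info.gpu_vram_total_bytes = 0                     # explicit zero is now "present"
assert info.HasField("gpu_vram_total_bytes")

info.ClearField("gpu_vram_total_bytes")
assert not info.HasField("gpu_vram_total_bytes")
```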

File diff suppressed because one or more lines are too long


@@ -1,9 +1,12 @@
"""Metrics infrastructure for NoteFlow."""
from .collector import MetricsCollector, PerformanceMetrics, get_metrics_collector
from .system_resources import SystemResources, get_system_resources
__all__ = [
"MetricsCollector",
"PerformanceMetrics",
"get_metrics_collector",
"SystemResources",
"get_system_resources",
]


@@ -0,0 +1,107 @@
"""System resource discovery helpers."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Final, Protocol, cast
from noteflow.infrastructure.logging import get_logger
logger = get_logger(__name__)
_GPU_DEVICE_INDEX: Final = 0
class _TorchCudaProperties(Protocol):
total_memory: int
class _TorchCudaModule(Protocol):
def is_available(self) -> bool: ...
def mem_get_info(self) -> tuple[int, int]: ...
def get_device_properties(self, device: int) -> _TorchCudaProperties: ...
def memory_reserved(self, device: int) -> int: ...
def memory_allocated(self, device: int) -> int: ...
class _TorchModule(Protocol):
cuda: _TorchCudaModule
@dataclass(frozen=True, slots=True)
class SystemResources:
"""Snapshot of server memory resources."""
ram_total_bytes: int | None
ram_available_bytes: int | None
gpu_vram_total_bytes: int | None
gpu_vram_available_bytes: int | None
def get_system_resources() -> SystemResources:
"""Collect current system memory resources.
Returns:
SystemResources with byte counts when available.
"""
ram_total, ram_available = _get_ram_bytes()
gpu_total, gpu_available = _get_gpu_bytes()
return SystemResources(
ram_total_bytes=ram_total,
ram_available_bytes=ram_available,
gpu_vram_total_bytes=gpu_total,
gpu_vram_available_bytes=gpu_available,
)
def _get_ram_bytes() -> tuple[int | None, int | None]:
try:
import psutil
except ImportError as exc:
logger.debug("psutil unavailable; cannot read system RAM: %s", exc)
return None, None
try:
memory = psutil.virtual_memory()
except (OSError, RuntimeError) as exc:
logger.debug("Failed to read system RAM: %s", exc)
return None, None
return int(memory.total), int(memory.available)
def _get_gpu_bytes() -> tuple[int | None, int | None]:
try:
import torch
except ImportError as exc:
logger.debug("torch unavailable; cannot read GPU memory: %s", exc)
return None, None
torch_typed = cast(_TorchModule, torch)
cuda = torch_typed.cuda
try:
if not cuda.is_available():
return None, None
except RuntimeError as exc:
logger.debug("Failed to query CUDA availability: %s", exc)
return None, None
if hasattr(cuda, "mem_get_info"):
try:
free_bytes, total_bytes = cuda.mem_get_info()
return int(total_bytes), int(free_bytes)
except RuntimeError as exc:
logger.debug("Failed to read CUDA mem info: %s", exc)
try:
props = cuda.get_device_properties(_GPU_DEVICE_INDEX)
total_bytes = int(props.total_memory)
reserved = int(cuda.memory_reserved(_GPU_DEVICE_INDEX))
allocated = int(cuda.memory_allocated(_GPU_DEVICE_INDEX))
used_bytes = max(reserved, allocated)
free_bytes = max(total_bytes - used_bytes, 0)
return total_bytes, free_bytes
except RuntimeError as exc:
logger.debug("Failed to estimate GPU memory: %s", exc)
return None, None