feat: enhance system resource monitoring and HuggingFace token management

- Added system resource metrics to the gRPC `ServerInfo` message: total and available system RAM, plus total and available GPU VRAM for the primary device.
- Implemented a `SystemResources` dataclass and a `get_system_resources()` helper that capture current RAM and GPU memory (see the sketch below).
- Updated `HfTokenService` to bootstrap token storage from settings when no token has been configured yet.
- Refactored the gRPC servicer to include system resource information in server info responses, improving observability and performance tracking.
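
As a rough illustration of the new surface (a hedged sketch, not code from this commit: only `get_system_resources()` and the `SystemResources` field names come from the diffs below; the logging hook and GiB formatting are assumptions):

```python
# Illustrative only: log the resource snapshot, e.g. at server startup.
from noteflow.infrastructure.metrics import get_system_resources


def log_resource_snapshot() -> None:
    res = get_system_resources()

    def gib(value: int | None) -> str:
        # Fields are None when psutil/torch are unavailable or probing failed.
        return "unknown" if value is None else f"{value / 2**30:.1f} GiB"

    print(
        f"RAM {gib(res.ram_available_bytes)} free of {gib(res.ram_total_bytes)}; "
        f"VRAM {gib(res.gpu_vram_available_bytes)} free of {gib(res.gpu_vram_total_bytes)}"
    )
```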
2026-01-13 16:06:30 +00:00
parent ee82183fd7
commit 7df238c531
10 changed files with 505 additions and 328 deletions

client

Submodule client updated: d8e84e27c3...429e02c3af


@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Final, cast
import httpx
from noteflow.config.settings import get_settings
from noteflow.domain.utils.time import utc_now
from noteflow.infrastructure.logging import get_logger
from noteflow.infrastructure.security.protocols import EncryptedChunk
@@ -121,6 +122,9 @@ class HfTokenService:
            metadata = await uow.preferences.get(_HF_TOKEN_META_KEY)
            if encrypted_data is None:
                seeded = await self._bootstrap_from_settings()
                if seeded:
                    return HfTokenStatus(True, False, "", None)
                return HfTokenStatus(False, False, "", None)
            if metadata is not None and isinstance(metadata, dict):
@@ -134,6 +138,16 @@ class HfTokenService:
            return HfTokenStatus(True, is_validated, username, validated_at)

    async def _bootstrap_from_settings(self) -> bool:
        """Seed token storage from settings when not yet configured."""
        token = get_settings().diarization_hf_token
        if not token:
            return False
        success, _ = await self.set_token(token, validate=False)
        if success:
            logger.info("hf_token_seeded_from_env")
        return success

    async def delete_token(self) -> bool:
        """Delete the stored HuggingFace token."""
        async with self._uow_factory() as uow:
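
The bootstrap only runs when no encrypted token is stored, so a settings-provided token is seeded only when nothing has been saved yet and never overwrites a token set later through the API. A minimal, self-contained sketch of that seed-once pattern (the class and field names below are stand-ins, not the real `HfTokenService`/`HfTokenStatus` API):

```python
# Illustrative stand-in for the seed-once flow; not the real service classes.
from dataclasses import dataclass


@dataclass
class TokenStatus:
    configured: bool
    validated: bool


class TokenStore:
    def __init__(self, settings_token: str | None) -> None:
        self._settings_token = settings_token
        self._stored: str | None = None

    async def status(self) -> TokenStatus:
        if self._stored is None and self._settings_token:
            # Mirrors _bootstrap_from_settings(): persist once, skip validation.
            self._stored = self._settings_token
        return TokenStatus(configured=self._stored is not None, validated=False)
```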


@@ -77,6 +77,11 @@ class ProtoServerInfoResponse(Protocol):
    active_meetings: int
    diarization_enabled: bool
    diarization_ready: bool
    system_ram_total_bytes: int
    system_ram_available_bytes: int
    gpu_vram_total_bytes: int
    gpu_vram_available_bytes: int

    def HasField(self, field_name: str) -> bool: ...


class NoteFlowServiceStubProtocol(Protocol):


@@ -24,6 +24,7 @@ from noteflow.infrastructure.logging import (
    user_id_var,
    workspace_id_var,
)
from noteflow.infrastructure.metrics import get_system_resources
from noteflow.infrastructure.persistence.memory import MemoryUnitOfWork
from noteflow.infrastructure.persistence.unit_of_work import SqlAlchemyUnitOfWork
from noteflow.infrastructure.security.crypto import AesGcmCryptoBox
@@ -304,7 +305,8 @@ class ServicerInfoMixin:
        else:
            active = self.get_memory_store().active_count
-       return noteflow_pb2.ServerInfo(
+       resources = get_system_resources()
+       response = noteflow_pb2.ServerInfo(
            version=self.VERSION,
            asr_model=asr_model,
            asr_ready=asr_ready,
@@ -316,6 +318,16 @@ class ServicerInfoMixin:
            diarization_ready=diarization_ready,
            state_version=self.STATE_VERSION,
        )
        if resources.ram_total_bytes is not None:
            response.system_ram_total_bytes = resources.ram_total_bytes
        if resources.ram_available_bytes is not None:
            response.system_ram_available_bytes = resources.ram_available_bytes
        if resources.gpu_vram_total_bytes is not None:
            response.gpu_vram_total_bytes = resources.gpu_vram_total_bytes
        if resources.gpu_vram_available_bytes is not None:
            response.gpu_vram_available_bytes = resources.gpu_vram_available_bytes
        return response
class ServicerLifecycleMixin:


@@ -31,6 +31,10 @@ class ServerInfo:
    active_meetings: int
    diarization_enabled: bool = False
    diarization_ready: bool = False
    system_ram_total_bytes: int | None = None
    system_ram_available_bytes: int | None = None
    gpu_vram_total_bytes: int | None = None
    gpu_vram_available_bytes: int | None = None


@dataclass


@@ -197,6 +197,26 @@ class NoteFlowClient(
                active_meetings=response.active_meetings,
                diarization_enabled=response.diarization_enabled,
                diarization_ready=response.diarization_ready,
                system_ram_total_bytes=(
                    response.system_ram_total_bytes
                    if response.HasField("system_ram_total_bytes")
                    else None
                ),
                system_ram_available_bytes=(
                    response.system_ram_available_bytes
                    if response.HasField("system_ram_available_bytes")
                    else None
                ),
                gpu_vram_total_bytes=(
                    response.gpu_vram_total_bytes
                    if response.HasField("gpu_vram_total_bytes")
                    else None
                ),
                gpu_vram_available_bytes=(
                    response.gpu_vram_available_bytes
                    if response.HasField("gpu_vram_available_bytes")
                    else None
                ),
            )
        except grpc.RpcError as e:
            logger.error("Failed to get server info: %s", e)


@@ -622,6 +622,18 @@ message ServerInfo {
  // Server state version for cache invalidation (Sprint GAP-002)
  // Increment when breaking state changes require client cache invalidation
  int64 state_version = 10;

  // Total system RAM in bytes
  optional int64 system_ram_total_bytes = 11;

  // Available system RAM in bytes
  optional int64 system_ram_available_bytes = 12;

  // Total GPU VRAM in bytes (primary device)
  optional int64 gpu_vram_total_bytes = 13;

  // Available GPU VRAM in bytes (primary device)
  optional int64 gpu_vram_available_bytes = 14;
}
// =============================================================================
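
The new fields use proto3 `optional`, which adds explicit presence tracking: the servicer can leave them unset when a probe fails, and the client's `HasField` checks can distinguish "not reported" from a genuine zero. A small illustration of that behavior (standard protobuf semantics; the generated module's import path is an assumption):

```python
from noteflow.grpc import noteflow_pb2  # generated module; import path assumed

info = noteflow_pb2.ServerInfo()
assert not info.HasField("gpu_vram_total_bytes")  # never assigned
assert info.gpu_vram_total_bytes == 0             # unset scalar still reads as 0

info.gpu_vram_total_bytes = 0                     # explicit zero is now "present"
assert info.HasField("gpu_vram_total_bytes")

info.ClearField("gpu_vram_total_bytes")
assert not info.HasField("gpu_vram_total_bytes")
```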

File diff suppressed because one or more lines are too long


@@ -1,9 +1,12 @@
"""Metrics infrastructure for NoteFlow."""
from .collector import MetricsCollector, PerformanceMetrics, get_metrics_collector
from .system_resources import SystemResources, get_system_resources
__all__ = [
"MetricsCollector",
"PerformanceMetrics",
"get_metrics_collector",
"SystemResources",
"get_system_resources",
]


@@ -0,0 +1,107 @@
"""System resource discovery helpers."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Final, Protocol, cast
from noteflow.infrastructure.logging import get_logger
logger = get_logger(__name__)
_GPU_DEVICE_INDEX: Final = 0
class _TorchCudaProperties(Protocol):
total_memory: int
class _TorchCudaModule(Protocol):
def is_available(self) -> bool: ...
def mem_get_info(self) -> tuple[int, int]: ...
def get_device_properties(self, device: int) -> _TorchCudaProperties: ...
def memory_reserved(self, device: int) -> int: ...
def memory_allocated(self, device: int) -> int: ...
class _TorchModule(Protocol):
cuda: _TorchCudaModule
@dataclass(frozen=True, slots=True)
class SystemResources:
"""Snapshot of server memory resources."""
ram_total_bytes: int | None
ram_available_bytes: int | None
gpu_vram_total_bytes: int | None
gpu_vram_available_bytes: int | None
def get_system_resources() -> SystemResources:
"""Collect current system memory resources.
Returns:
SystemResources with byte counts when available.
"""
ram_total, ram_available = _get_ram_bytes()
gpu_total, gpu_available = _get_gpu_bytes()
return SystemResources(
ram_total_bytes=ram_total,
ram_available_bytes=ram_available,
gpu_vram_total_bytes=gpu_total,
gpu_vram_available_bytes=gpu_available,
)
def _get_ram_bytes() -> tuple[int | None, int | None]:
try:
import psutil
except ImportError as exc:
logger.debug("psutil unavailable; cannot read system RAM: %s", exc)
return None, None
try:
memory = psutil.virtual_memory()
except (OSError, RuntimeError) as exc:
logger.debug("Failed to read system RAM: %s", exc)
return None, None
return int(memory.total), int(memory.available)
def _get_gpu_bytes() -> tuple[int | None, int | None]:
try:
import torch
except ImportError as exc:
logger.debug("torch unavailable; cannot read GPU memory: %s", exc)
return None, None
torch_typed = cast(_TorchModule, torch)
cuda = torch_typed.cuda
try:
if not cuda.is_available():
return None, None
except RuntimeError as exc:
logger.debug("Failed to query CUDA availability: %s", exc)
return None, None
if hasattr(cuda, "mem_get_info"):
try:
free_bytes, total_bytes = cuda.mem_get_info()
return int(total_bytes), int(free_bytes)
except RuntimeError as exc:
logger.debug("Failed to read CUDA mem info: %s", exc)
try:
props = cuda.get_device_properties(_GPU_DEVICE_INDEX)
total_bytes = int(props.total_memory)
reserved = int(cuda.memory_reserved(_GPU_DEVICE_INDEX))
allocated = int(cuda.memory_allocated(_GPU_DEVICE_INDEX))
used_bytes = max(reserved, allocated)
free_bytes = max(total_bytes - used_bytes, 0)
return total_bytes, free_bytes
except RuntimeError as exc:
logger.debug("Failed to estimate GPU memory: %s", exc)
return None, None