feat: add GPU-accelerated Docker profiles and optimize gRPC streaming configuration

- Added server-gpu and full-gpu Docker Compose profiles with NVIDIA CUDA support for GPU-accelerated inference
- Created server-gpu.Dockerfile that installs CUDA-enabled PyTorch on a Python 3.12 base, with GPU runtime configuration
- Added compose.gpu.yaml for optional GPU allocation customization (device count, IDs, memory settings)
- Documented GPU setup requirements for Linux (nvidia-container-toolkit), Windows WSL2, and macOS in docker/CLAUDE
2026-01-15 23:45:00 -05:00
parent 1d2bc25024
commit 389eba0a95
7 changed files with 253 additions and 8 deletions


@@ -12,5 +12,9 @@
"Bash(npm run build:*)"
]
},
"outputStyle": "YAML Structured"
"outputStyle": "YAML Structured",
"enabledMcpjsonServers": [
"lightrag-mcp"
],
"enableAllProjectMcpServers": true
}


@@ -33,6 +33,12 @@ pub mod grpc {
pub const RETRY_DELAY_BASE_MS: u64 = 1000;
/// Keep-alive interval
pub const KEEP_ALIVE_INTERVAL: Duration = Duration::from_secs(30);
/// Keep-alive timeout (how long to wait for ping response)
pub const KEEP_ALIVE_TIMEOUT: Duration = Duration::from_secs(20);
/// HTTP/2 initial stream window size (1MB for audio streaming)
pub const HTTP2_STREAM_WINDOW_SIZE: u32 = 1024 * 1024;
/// HTTP/2 initial connection window size (5MB total)
pub const HTTP2_CONNECTION_WINDOW_SIZE: u32 = 5 * 1024 * 1024;
}
/// Audio settings
@@ -155,8 +161,8 @@ pub mod cache {
pub mod streaming {
use std::time::Duration;
/// Audio stream channel buffer capacity
pub const AUDIO_CHANNEL_CAPACITY: usize = 128;
/// Audio stream channel buffer capacity (~51 seconds at 100ms chunks)
pub const AUDIO_CHANNEL_CAPACITY: usize = 512;
/// Request timeout for bidirectional audio streams
pub const STREAM_REQUEST_TIMEOUT: Duration = Duration::from_secs(300);
}


@@ -313,7 +313,11 @@ impl GrpcClient {
})?
.connect_timeout(Duration::from_millis(self.config.connect_timeout_ms))
.timeout(Duration::from_millis(self.config.request_timeout_ms))
.http2_keep_alive_interval(Duration::from_secs(self.config.keepalive_interval_secs));
.http2_keep_alive_interval(Duration::from_secs(self.config.keepalive_interval_secs))
.keep_alive_timeout(grpc_config::KEEP_ALIVE_TIMEOUT)
.http2_adaptive_window(true)
.initial_stream_window_size(grpc_config::HTTP2_STREAM_WINDOW_SIZE)
.initial_connection_window_size(grpc_config::HTTP2_CONNECTION_WINDOW_SIZE);
let channel = endpoint.connect().await.map_err(|e| {
let elapsed_ms = connect_start.elapsed().as_millis();

compose.gpu.yaml (new file)

@@ -0,0 +1,29 @@
# GPU-specific overrides for Docker Compose
# Usage: docker compose -f compose.yaml -f compose.gpu.yaml --profile server-gpu up
#
# This file provides additional GPU configuration options.
# It is optional - the server-gpu profile in compose.yaml includes sensible defaults.
services:
server-gpu:
# Override GPU allocation (uncomment to customize)
deploy:
resources:
reservations:
devices:
- driver: nvidia
# Use 'all' to use all available GPUs, or specify count
count: 1
# count: all
capabilities: [gpu]
# Optionally specify device IDs (e.g., for multi-GPU systems)
# device_ids: ['0']
# Additional environment variables for GPU optimization
environment:
# PyTorch CUDA settings
CUDA_VISIBLE_DEVICES: "0"
# Enable TF32 for better performance on Ampere+ GPUs
NVIDIA_TF32_OVERRIDE: "1"
# Memory management
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"


@@ -24,7 +24,9 @@ services:
profiles:
- infra
- server
- server-gpu
- full
- full-gpu
- server-full
redis:
@@ -46,7 +48,9 @@ services:
profiles:
- infra
- server
- server-gpu
- full
- full-gpu
- server-full
qdrant:
@@ -71,14 +75,25 @@ services:
profiles:
- infra
- server
- server-gpu
- full
- full-gpu
- server-full
# =============================================================================
# Application Services
# =============================================================================
# Note: 'server' and 'server-full' are mutually exclusive (same port 50051).
# Use profile 'server' OR 'server-full', not both.
# Note: 'server', 'server-gpu', and 'server-full' are mutually exclusive (same port 50051).
# Use ONE of: 'server' (CPU), 'server-gpu' (NVIDIA GPU), or 'server-full'.
#
# GPU Support:
# - Use profile 'server-gpu' or 'full-gpu' for NVIDIA GPU acceleration
# - Requires: NVIDIA drivers, nvidia-container-toolkit, Docker with GPU support
# - On Windows/WSL2: Ensure WSL2 GPU passthrough is configured
# - On Linux: Install nvidia-container-toolkit package
# - On macOS: GPU passthrough not supported (use CPU profile)
# CPU-only server (default, cross-platform)
server:
container_name: noteflow-server
build:
@@ -97,6 +112,8 @@ services:
NOTEFLOW_REDIS_URL: redis://redis:6379/0
NOTEFLOW_QDRANT_URL: http://qdrant:6333
NOTEFLOW_LOG_FORMAT: console
# Force CPU device when running CPU-only container
NOTEFLOW_ASR_DEVICE: cpu
volumes:
- .:/workspace
- server_venv:/workspace/.venv
@@ -113,6 +130,52 @@ services:
- server
- full
# GPU-enabled server (NVIDIA CUDA)
server-gpu:
container_name: noteflow-server
build:
context: .
dockerfile: docker/server-gpu.Dockerfile
target: server
restart: unless-stopped
ports:
- "50051:50051"
extra_hosts:
- "host.docker.internal:host-gateway"
env_file:
- .env
environment:
NOTEFLOW_DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-noteflow}:${POSTGRES_PASSWORD:-noteflow}@db:5432/${POSTGRES_DB:-noteflow}
NOTEFLOW_REDIS_URL: redis://redis:6379/0
NOTEFLOW_QDRANT_URL: http://qdrant:6333
NOTEFLOW_LOG_FORMAT: console
# Enable CUDA device auto-detection
NOTEFLOW_ASR_DEVICE: cuda
NOTEFLOW_DIARIZATION_DEVICE: cuda
volumes:
- .:/workspace
- server_venv_gpu:/workspace/.venv
# NVIDIA GPU configuration (Linux and Windows/WSL2; see GPU Support notes above)
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
depends_on:
db:
condition: service_healthy
redis:
condition: service_healthy
qdrant:
condition: service_healthy
networks:
- noteflow-net
profiles:
- server-gpu
- full-gpu
# server-full:
# container_name: noteflow-server-full
# build:
@@ -181,13 +244,12 @@ services:
command: sh -c "npm install -g npm@latest && npm install && npm update caniuse-lite browserslist || true && npm run dev"
environment:
- NODE_ENV=development
depends_on:
- server
networks:
- noteflow-net
profiles:
- frontend
- full
- full-gpu
volumes:
noteflow_pg_data:
@@ -196,6 +258,7 @@ volumes:
client_cargo_cache:
client_npm_cache:
server_venv:
server_venv_gpu:
networks:
noteflow-net:


@@ -4,6 +4,58 @@ These rules guide Claude Code to generate secure Docker configurations, Dockerfi
---
## GPU Support Configuration
### Available Profiles
NoteFlow supports both CPU-only and GPU-accelerated Docker deployments:
| Profile | Description | GPU Required |
|---------|-------------|--------------|
| `server` | CPU-only server (default) | No |
| `server-gpu` | NVIDIA CUDA-enabled server | Yes |
| `full` | CPU server + frontend | No |
| `full-gpu` | GPU server + frontend | Yes |
### Usage Examples
```bash
# CPU-only (default, cross-platform)
docker compose --profile server --profile infra up -d
# GPU-enabled (requires NVIDIA GPU + nvidia-container-toolkit)
docker compose --profile server-gpu --profile infra up -d
# Full stack with GPU
docker compose --profile full-gpu --profile infra up -d
```
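The optional `compose.gpu.yaml` override can be layered on top when the default single-GPU allocation needs adjusting (device count, device IDs, CUDA memory settings); a minimal sketch:
```bash
# Combine the base file with the GPU override and start the GPU server plus infra
docker compose -f compose.yaml -f compose.gpu.yaml --profile server-gpu --profile infra up -d
```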
### Platform-Specific GPU Setup
**Linux:**
```bash
# Install NVIDIA container toolkit
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
sudo systemctl restart docker
```
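After installation, a quick sanity check that containers can actually see the GPU (the CUDA image tag below is only an example; any image that ships `nvidia-smi` works, and the same check applies under WSL2 once Docker Desktop GPU support is enabled):
```bash
# Should print the nvidia-smi table listing your GPU
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi

# If Docker does not pick up the NVIDIA runtime, registering it explicitly may help
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
```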
**Windows (WSL2):**
1. Install NVIDIA GPU drivers for Windows
2. Enable WSL2 GPU support in Docker Desktop settings
3. Use WSL2 backend (not Hyper-V)
**macOS:**
- GPU passthrough not supported in Docker on macOS
- Use CPU profile (`server` or `full`)
- For GPU acceleration, run the server natively outside Docker
---
## Rule: Minimal Base Images
**Level**: `strict`


@@ -0,0 +1,87 @@
# syntax=docker/dockerfile:1
# GPU-enabled server Dockerfile with CUDA support
# Use this for systems with NVIDIA GPUs
# =============================================================================
# Base Stage - Python 3.12 with CUDA support via PyTorch's bundled CUDA libraries
# =============================================================================
# Uses python:3.12-bookworm; the CUDA runtime ships with the CUDA-enabled PyTorch
# wheels, and the GPU device itself is injected by nvidia-container-runtime
FROM python:3.12-bookworm AS base
# CUDA environment variables (for runtime GPU detection)
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
UV_COMPILE_BYTECODE=1 \
UV_LINK_MODE=copy \
# CUDA environment - these tell nvidia-container-runtime to inject GPU
NVIDIA_VISIBLE_DEVICES=all \
NVIDIA_DRIVER_CAPABILITIES=compute,utility
# Install uv and system dependencies
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
# Core build/runtime deps for project packages (sounddevice, asyncpg, cryptography).
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
build-essential \
pkg-config \
portaudio19-dev \
libsndfile1 \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /workspace
# Copy dependency files first for better layer caching
COPY pyproject.toml uv.lock* ./
# =============================================================================
# Server Stage - GPU Enabled
# =============================================================================
FROM base AS server
# Install dependencies with CUDA-enabled PyTorch
# (the torch build is pinned by uv.lock; no CPU-only index override is applied here)
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --no-install-project --group dev --all-extras
# Copy source code
COPY . .
# Install the project itself
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --group dev --all-extras
# Install spaCy small English model for NER (baked into image)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
# Report PyTorch/CUDA status (non-fatal: GPUs are not visible at build time, so
# this only confirms the CUDA-enabled torch install, not runtime GPU access)
RUN uv run python -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA available: {torch.cuda.is_available()}')" || true
EXPOSE 50051
CMD ["sh", "-c", "uv sync --frozen --group dev --all-extras && uv run python scripts/dev_watch_server.py"]
# =============================================================================
# Server Production Stage - GPU Enabled with all extras
# =============================================================================
FROM base AS server-full
# Install all dependencies including optional extras
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --no-install-project --group dev --all-extras
# Copy source code
COPY . .
# Install the project itself
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --group dev --all-extras
# Install spaCy small English model for NER (baked into image)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
EXPOSE 50051
CMD ["sh", "-c", "uv sync --frozen --group dev --all-extras && uv run python scripts/dev_watch_server.py"]