feat: add GPU-accelerated Docker profiles and optimize gRPC streaming configuration

- Added server-gpu and full-gpu Docker Compose profiles with NVIDIA CUDA support for GPU-accelerated inference
- Created server-gpu.Dockerfile that installs CUDA-enabled PyTorch on a Python 3.12 base, with GPU runtime configuration
- Added compose.gpu.yaml for optional GPU allocation customization (device count, IDs, memory settings)
- Documented GPU setup requirements for Linux (nvidia-container-toolkit), Windows WSL2, and macOS in docker/CLAUDE
2026-01-15 23:45:00 -05:00
parent 1d2bc25024
commit 389eba0a95
7 changed files with 253 additions and 8 deletions


@@ -12,5 +12,9 @@
"Bash(npm run build:*)"
]
},
"outputStyle": "YAML Structured"
"outputStyle": "YAML Structured",
"enabledMcpjsonServers": [
"lightrag-mcp"
],
"enableAllProjectMcpServers": true
}


@@ -33,6 +33,12 @@ pub mod grpc {
pub const RETRY_DELAY_BASE_MS: u64 = 1000;
/// Keep-alive interval
pub const KEEP_ALIVE_INTERVAL: Duration = Duration::from_secs(30);
/// Keep-alive timeout (how long to wait for ping response)
pub const KEEP_ALIVE_TIMEOUT: Duration = Duration::from_secs(20);
/// HTTP/2 initial stream window size (1MB for audio streaming)
pub const HTTP2_STREAM_WINDOW_SIZE: u32 = 1024 * 1024;
/// HTTP/2 initial connection window size (5MB total)
pub const HTTP2_CONNECTION_WINDOW_SIZE: u32 = 5 * 1024 * 1024;
}
/// Audio settings
@@ -155,8 +161,8 @@ pub mod cache {
pub mod streaming {
use std::time::Duration;
/// Audio stream channel buffer capacity
pub const AUDIO_CHANNEL_CAPACITY: usize = 128;
/// Audio stream channel buffer capacity (~51 seconds at 100ms chunks)
pub const AUDIO_CHANNEL_CAPACITY: usize = 512;
/// Request timeout for bidirectional audio streams
pub const STREAM_REQUEST_TIMEOUT: Duration = Duration::from_secs(300);
}


@@ -313,7 +313,11 @@ impl GrpcClient {
})?
.connect_timeout(Duration::from_millis(self.config.connect_timeout_ms))
.timeout(Duration::from_millis(self.config.request_timeout_ms))
.http2_keep_alive_interval(Duration::from_secs(self.config.keepalive_interval_secs));
.http2_keep_alive_interval(Duration::from_secs(self.config.keepalive_interval_secs))
.keep_alive_timeout(grpc_config::KEEP_ALIVE_TIMEOUT)
.http2_adaptive_window(true)
.initial_stream_window_size(grpc_config::HTTP2_STREAM_WINDOW_SIZE)
.initial_connection_window_size(grpc_config::HTTP2_CONNECTION_WINDOW_SIZE);
let channel = endpoint.connect().await.map_err(|e| {
let elapsed_ms = connect_start.elapsed().as_millis();

compose.gpu.yaml (new file)

@@ -0,0 +1,29 @@
# GPU-specific overrides for Docker Compose
# Usage: docker compose -f compose.yaml -f compose.gpu.yaml --profile server-gpu up
#
# This file provides additional GPU configuration options.
# It is optional - the server-gpu profile in compose.yaml includes sensible defaults.
services:
server-gpu:
# Override GPU allocation (uncomment to customize)
deploy:
resources:
reservations:
devices:
- driver: nvidia
# Use 'all' to use all available GPUs, or specify count
count: 1
# count: all
capabilities: [gpu]
# Optionally specify device IDs (e.g., for multi-GPU systems)
# device_ids: ['0']
# Additional environment variables for GPU optimization
environment:
# PyTorch CUDA settings
CUDA_VISIBLE_DEVICES: "0"
# Enable TF32 for better performance on Ampere+ GPUs
NVIDIA_TF32_OVERRIDE: "1"
# Memory management
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"


@@ -24,7 +24,9 @@ services:
profiles:
- infra
- server
- server-gpu
- full
- full-gpu
- server-full
redis:
@@ -46,7 +48,9 @@ services:
profiles:
- infra
- server
- server-gpu
- full
- full-gpu
- server-full
qdrant:
@@ -71,14 +75,25 @@ services:
profiles:
- infra
- server
- server-gpu
- full
- full-gpu
- server-full
# =============================================================================
# Application Services
# =============================================================================
# Note: 'server' and 'server-full' are mutually exclusive (same port 50051).
# Use profile 'server' OR 'server-full', not both.
# Note: 'server', 'server-gpu', and 'server-full' are mutually exclusive (same port 50051).
# Use ONE of: 'server' (CPU), 'server-gpu' (NVIDIA GPU), or 'server-full'.
#
# GPU Support:
# - Use profile 'server-gpu' or 'full-gpu' for NVIDIA GPU acceleration
# - Requires: NVIDIA drivers, nvidia-container-toolkit, Docker with GPU support
# - On Windows/WSL2: Ensure WSL2 GPU passthrough is configured
# - On Linux: Install nvidia-container-toolkit package
# - On macOS: GPU passthrough not supported (use CPU profile)
# CPU-only server (default, cross-platform)
server:
container_name: noteflow-server
build:
@@ -97,6 +112,8 @@ services:
NOTEFLOW_REDIS_URL: redis://redis:6379/0
NOTEFLOW_QDRANT_URL: http://qdrant:6333
NOTEFLOW_LOG_FORMAT: console
# Force CPU device when running CPU-only container
NOTEFLOW_ASR_DEVICE: cpu
volumes:
- .:/workspace
- server_venv:/workspace/.venv
@@ -113,6 +130,52 @@ services:
- server
- full
# GPU-enabled server (NVIDIA CUDA)
server-gpu:
container_name: noteflow-server
build:
context: .
dockerfile: docker/server-gpu.Dockerfile
target: server
restart: unless-stopped
ports:
- "50051:50051"
extra_hosts:
- "host.docker.internal:host-gateway"
env_file:
- .env
environment:
NOTEFLOW_DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-noteflow}:${POSTGRES_PASSWORD:-noteflow}@db:5432/${POSTGRES_DB:-noteflow}
NOTEFLOW_REDIS_URL: redis://redis:6379/0
NOTEFLOW_QDRANT_URL: http://qdrant:6333
NOTEFLOW_LOG_FORMAT: console
# Enable CUDA device auto-detection
NOTEFLOW_ASR_DEVICE: cuda
NOTEFLOW_DIARIZATION_DEVICE: cuda
volumes:
- .:/workspace
- server_venv_gpu:/workspace/.venv
# NVIDIA GPU configuration (Linux and Windows/WSL2; see GPU Support notes above)
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
depends_on:
db:
condition: service_healthy
redis:
condition: service_healthy
qdrant:
condition: service_healthy
networks:
- noteflow-net
profiles:
- server-gpu
- full-gpu
# server-full:
# container_name: noteflow-server-full
# build:
@@ -181,13 +244,12 @@ services:
command: sh -c "npm install -g npm@latest && npm install && npm update caniuse-lite browserslist || true && npm run dev"
environment:
- NODE_ENV=development
depends_on:
- server
networks:
- noteflow-net
profiles:
- frontend
- full
- full-gpu
volumes:
noteflow_pg_data:
@@ -196,6 +258,7 @@ volumes:
client_cargo_cache:
client_npm_cache:
server_venv:
server_venv_gpu:
networks:
noteflow-net:


@@ -4,6 +4,58 @@ These rules guide Claude Code to generate secure Docker configurations, Dockerfi
---
## GPU Support Configuration
### Available Profiles
NoteFlow supports both CPU-only and GPU-accelerated Docker deployments:
| Profile | Description | GPU Required |
|---------|-------------|--------------|
| `server` | CPU-only server (default) | No |
| `server-gpu` | NVIDIA CUDA-enabled server | Yes |
| `full` | CPU server + frontend | No |
| `full-gpu` | GPU server + frontend | Yes |
### Usage Examples
```bash
# CPU-only (default, cross-platform)
docker compose --profile server --profile infra up -d
# GPU-enabled (requires NVIDIA GPU + nvidia-container-toolkit)
docker compose --profile server-gpu --profile infra up -d
# Full stack with GPU
docker compose --profile full-gpu --profile infra up -d
```
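The optional `compose.gpu.yaml` override can be layered on top when the default single-GPU allocation needs adjusting (device count, device IDs, CUDA memory settings); a minimal sketch:
```bash
# Combine the base file with the GPU override and start the GPU server plus infra
docker compose -f compose.yaml -f compose.gpu.yaml --profile server-gpu --profile infra up -d
```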
### Platform-Specific GPU Setup
**Linux:**
```bash
# Install NVIDIA container toolkit
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
sudo systemctl restart docker
```
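After installation, a quick sanity check that containers can actually see the GPU (the CUDA image tag below is only an example; any image that ships `nvidia-smi` works, and the same check applies under WSL2 once Docker Desktop GPU support is enabled):
```bash
# Should print the nvidia-smi table listing your GPU
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi

# If Docker does not pick up the NVIDIA runtime, registering it explicitly may help
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
```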
**Windows (WSL2):**
1. Install NVIDIA GPU drivers for Windows
2. Enable WSL2 GPU support in Docker Desktop settings
3. Use WSL2 backend (not Hyper-V)
**macOS:**
- GPU passthrough not supported in Docker on macOS
- Use CPU profile (`server` or `full`)
- For GPU acceleration, run the server natively outside Docker
---
## Rule: Minimal Base Images
**Level**: `strict`


@@ -0,0 +1,87 @@
# syntax=docker/dockerfile:1
# GPU-enabled server Dockerfile with CUDA support
# Use this for systems with NVIDIA GPUs
# =============================================================================
# Base Stage - Python 3.12 with CUDA support via PyTorch's bundled CUDA libraries
# =============================================================================
# Uses python:3.12-bookworm; the CUDA runtime ships with the CUDA-enabled PyTorch
# wheels, and the GPU device itself is injected by nvidia-container-runtime
FROM python:3.12-bookworm AS base
# CUDA environment variables (for runtime GPU detection)
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
UV_COMPILE_BYTECODE=1 \
UV_LINK_MODE=copy \
# CUDA environment - these tell nvidia-container-runtime to inject GPU
NVIDIA_VISIBLE_DEVICES=all \
NVIDIA_DRIVER_CAPABILITIES=compute,utility
# Install uv and system dependencies
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
# Core build/runtime deps for project packages (sounddevice, asyncpg, cryptography).
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
build-essential \
pkg-config \
portaudio19-dev \
libsndfile1 \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /workspace
# Copy dependency files first for better layer caching
COPY pyproject.toml uv.lock* ./
# =============================================================================
# Server Stage - GPU Enabled
# =============================================================================
FROM base AS server
# Install dependencies with CUDA-enabled PyTorch
# (the torch build is pinned by uv.lock; no CPU-only index override is applied here)
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --no-install-project --group dev --all-extras
# Copy source code
COPY . .
# Install the project itself
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --group dev --all-extras
# Install spaCy small English model for NER (baked into image)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
# Report PyTorch/CUDA status (non-fatal: GPUs are not visible at build time, so
# this only confirms the CUDA-enabled torch install, not runtime GPU access)
RUN uv run python -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA available: {torch.cuda.is_available()}')" || true
EXPOSE 50051
CMD ["sh", "-c", "uv sync --frozen --group dev --all-extras && uv run python scripts/dev_watch_server.py"]
# =============================================================================
# Server Production Stage - GPU Enabled with all extras
# =============================================================================
FROM base AS server-full
# Install all dependencies including optional extras
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --no-install-project --group dev --all-extras
# Copy source code
COPY . .
# Install the project itself
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --group dev --all-extras
# Install spaCy small English model for NER (baked into image)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
EXPOSE 50051
CMD ["sh", "-c", "uv sync --frozen --group dev --all-extras && uv run python scripts/dev_watch_server.py"]