feat: add GPU-accelerated Docker profiles and optimize gRPC streaming configuration
- Added server-gpu and full-gpu Docker Compose profiles with NVIDIA CUDA support for GPU-accelerated inference
- Created server-gpu.Dockerfile with CUDA-enabled PyTorch and GPU runtime configuration
- Added compose.gpu.yaml for optional GPU allocation customization (device count, IDs, memory settings)
- Documented GPU setup requirements for Linux (nvidia-container-toolkit), Windows WSL2, and macOS in docker/CLAUDE
@@ -12,5 +12,9 @@
       "Bash(npm run build:*)"
     ]
   },
-  "outputStyle": "YAML Structured"
+  "outputStyle": "YAML Structured",
+  "enabledMcpjsonServers": [
+    "lightrag-mcp"
+  ],
+  "enableAllProjectMcpServers": true
 }
@@ -33,6 +33,12 @@ pub mod grpc {
     pub const RETRY_DELAY_BASE_MS: u64 = 1000;
     /// Keep-alive interval
     pub const KEEP_ALIVE_INTERVAL: Duration = Duration::from_secs(30);
+    /// Keep-alive timeout (how long to wait for ping response)
+    pub const KEEP_ALIVE_TIMEOUT: Duration = Duration::from_secs(20);
+    /// HTTP/2 initial stream window size (1MB for audio streaming)
+    pub const HTTP2_STREAM_WINDOW_SIZE: u32 = 1024 * 1024;
+    /// HTTP/2 initial connection window size (5MB total)
+    pub const HTTP2_CONNECTION_WINDOW_SIZE: u32 = 5 * 1024 * 1024;
 }

 /// Audio settings

@@ -155,8 +161,8 @@ pub mod cache {
 pub mod streaming {
     use std::time::Duration;

-    /// Audio stream channel buffer capacity
-    pub const AUDIO_CHANNEL_CAPACITY: usize = 128;
+    /// Audio stream channel buffer capacity (~51 seconds at 100ms chunks)
+    pub const AUDIO_CHANNEL_CAPACITY: usize = 512;
     /// Request timeout for bidirectional audio streams
     pub const STREAM_REQUEST_TIMEOUT: Duration = Duration::from_secs(300);
 }

@@ -313,7 +313,11 @@ impl GrpcClient {
             })?
             .connect_timeout(Duration::from_millis(self.config.connect_timeout_ms))
             .timeout(Duration::from_millis(self.config.request_timeout_ms))
-            .http2_keep_alive_interval(Duration::from_secs(self.config.keepalive_interval_secs));
+            .http2_keep_alive_interval(Duration::from_secs(self.config.keepalive_interval_secs))
+            .keep_alive_timeout(grpc_config::KEEP_ALIVE_TIMEOUT)
+            .http2_adaptive_window(true)
+            .initial_stream_window_size(grpc_config::HTTP2_STREAM_WINDOW_SIZE)
+            .initial_connection_window_size(grpc_config::HTTP2_CONNECTION_WINDOW_SIZE);

         let channel = endpoint.connect().await.map_err(|e| {
             let elapsed_ms = connect_start.elapsed().as_millis();
compose.gpu.yaml (new file, 29 lines)
@@ -0,0 +1,29 @@
# GPU-specific overrides for Docker Compose
# Usage: docker compose -f compose.yaml -f compose.gpu.yaml --profile server-gpu up
#
# This file provides additional GPU configuration options.
# It is optional - the server-gpu profile in compose.yaml includes sensible defaults.

services:
  server-gpu:
    # Override GPU allocation (uncomment to customize)
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Use 'all' to use all available GPUs, or specify count
              count: 1
              # count: all
              capabilities: [gpu]
              # Optionally specify device IDs (e.g., for multi-GPU systems)
              # device_ids: ['0']

    # Additional environment variables for GPU optimization
    environment:
      # PyTorch CUDA settings
      CUDA_VISIBLE_DEVICES: "0"
      # Enable TF32 for better performance on Ampere+ GPUs
      NVIDIA_TF32_OVERRIDE: "1"
      # Memory management
      PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
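The override file is merged on top of compose.yaml at runtime. As a quick sanity check, the merged result can be rendered before starting anything; a minimal sketch using the profile and file names from the files above:

```bash
# Render the merged configuration without starting containers
docker compose -f compose.yaml -f compose.gpu.yaml --profile server-gpu config

# Then bring up the GPU stack with the overrides applied
docker compose -f compose.yaml -f compose.gpu.yaml --profile server-gpu --profile infra up -d
```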
compose.yaml (71 changed lines)
@@ -24,7 +24,9 @@ services:
     profiles:
       - infra
       - server
+      - server-gpu
       - full
+      - full-gpu
       - server-full

   redis:

@@ -46,7 +48,9 @@ services:
     profiles:
       - infra
       - server
+      - server-gpu
       - full
+      - full-gpu
       - server-full

   qdrant:

@@ -71,14 +75,25 @@ services:
     profiles:
       - infra
       - server
+      - server-gpu
       - full
+      - full-gpu
       - server-full

 # =============================================================================
 # Application Services
 # =============================================================================
-# Note: 'server' and 'server-full' are mutually exclusive (same port 50051).
-# Use profile 'server' OR 'server-full', not both.
+# Note: 'server', 'server-gpu', and 'server-full' are mutually exclusive (same port 50051).
+# Use ONE of: 'server' (CPU), 'server-gpu' (NVIDIA GPU), or 'server-full'.
+#
+# GPU Support:
+# - Use profile 'server-gpu' or 'full-gpu' for NVIDIA GPU acceleration
+# - Requires: NVIDIA drivers, nvidia-container-toolkit, Docker with GPU support
+# - On Windows/WSL2: Ensure WSL2 GPU passthrough is configured
+# - On Linux: Install nvidia-container-toolkit package
+# - On macOS: GPU passthrough not supported (use CPU profile)
+
+  # CPU-only server (default, cross-platform)
   server:
     container_name: noteflow-server
     build:

@@ -97,6 +112,8 @@ services:
       NOTEFLOW_REDIS_URL: redis://redis:6379/0
       NOTEFLOW_QDRANT_URL: http://qdrant:6333
       NOTEFLOW_LOG_FORMAT: console
+      # Force CPU device when running CPU-only container
+      NOTEFLOW_ASR_DEVICE: cpu
     volumes:
       - .:/workspace
       - server_venv:/workspace/.venv

@@ -113,6 +130,52 @@ services:
       - server
       - full

+  # GPU-enabled server (NVIDIA CUDA)
+  server-gpu:
+    container_name: noteflow-server
+    build:
+      context: .
+      dockerfile: docker/server-gpu.Dockerfile
+      target: server
+    restart: unless-stopped
+    ports:
+      - "50051:50051"
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    env_file:
+      - .env
+    environment:
+      NOTEFLOW_DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-noteflow}:${POSTGRES_PASSWORD:-noteflow}@db:5432/${POSTGRES_DB:-noteflow}
+      NOTEFLOW_REDIS_URL: redis://redis:6379/0
+      NOTEFLOW_QDRANT_URL: http://qdrant:6333
+      NOTEFLOW_LOG_FORMAT: console
+      # Enable CUDA device auto-detection
+      NOTEFLOW_ASR_DEVICE: cuda
+      NOTEFLOW_DIARIZATION_DEVICE: cuda
+    volumes:
+      - .:/workspace
+      - server_venv_gpu:/workspace/.venv
+    # NVIDIA GPU configuration (cross-platform compatible)
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    depends_on:
+      db:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      qdrant:
+        condition: service_healthy
+    networks:
+      - noteflow-net
+    profiles:
+      - server-gpu
+      - full-gpu
+
 # server-full:
 #   container_name: noteflow-server-full
 #   build:
@@ -181,13 +244,12 @@ services:
     command: sh -c "npm install -g npm@latest && npm install && npm update caniuse-lite browserslist || true && npm run dev"
     environment:
       - NODE_ENV=development
-    depends_on:
-      - server
     networks:
       - noteflow-net
     profiles:
       - frontend
       - full
+      - full-gpu

 volumes:
   noteflow_pg_data:
@@ -196,6 +258,7 @@ volumes:
   client_cargo_cache:
   client_npm_cache:
   server_venv:
+  server_venv_gpu:

 networks:
   noteflow-net:
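With the GPU service defined, a typical bring-up and first check looks like the sketch below. It assumes the NVIDIA container runtime injects the driver utilities into the container (which the Dockerfile's NVIDIA_DRIVER_CAPABILITIES=compute,utility setting is intended to allow), so nvidia-smi is available inside:

```bash
# Start infrastructure plus the CUDA-enabled server
docker compose --profile server-gpu --profile infra up -d

# Confirm the GPU is visible from inside the running container
docker compose exec server-gpu nvidia-smi
```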
docker/CLAUDE.md
@@ -4,6 +4,58 @@ These rules guide Claude Code to generate secure Docker configurations, Dockerfi

 ---

+## GPU Support Configuration
+
+### Available Profiles
+
+NoteFlow supports both CPU-only and GPU-accelerated Docker deployments:
+
+| Profile | Description | GPU Required |
+|---------|-------------|--------------|
+| `server` | CPU-only server (default) | No |
+| `server-gpu` | NVIDIA CUDA-enabled server | Yes |
+| `full` | CPU server + frontend | No |
+| `full-gpu` | GPU server + frontend | Yes |
+
+### Usage Examples
+
+```bash
+# CPU-only (default, cross-platform)
+docker compose --profile server --profile infra up -d
+
+# GPU-enabled (requires NVIDIA GPU + nvidia-container-toolkit)
+docker compose --profile server-gpu --profile infra up -d
+
+# Full stack with GPU
+docker compose --profile full-gpu --profile infra up -d
+```
+
+### Platform-Specific GPU Setup
+
+**Linux:**
+```bash
+# Install NVIDIA container toolkit
+distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \
+  sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+  sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
+sudo systemctl restart docker
+```
+
+**Windows (WSL2):**
+1. Install NVIDIA GPU drivers for Windows
+2. Enable WSL2 GPU support in Docker Desktop settings
+3. Use WSL2 backend (not Hyper-V)
+
+**macOS:**
+- GPU passthrough not supported in Docker on macOS
+- Use CPU profile (`server` or `full`)
+- For GPU acceleration, run server natively outside Docker
+
+---
+
 ## Rule: Minimal Base Images

 **Level**: `strict`
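Before relying on the `server-gpu` or `full-gpu` profiles, it helps to confirm that the host's Docker-to-GPU path described above works at all. A minimal sketch; the CUDA image tag is only an example and any CUDA base image will do:

```bash
# Should print the same device table as running nvidia-smi on the host
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
```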
docker/server-gpu.Dockerfile (new file, 87 lines)
@@ -0,0 +1,87 @@
# syntax=docker/dockerfile:1
# GPU-enabled server Dockerfile with CUDA support
# Use this for systems with NVIDIA GPUs

# =============================================================================
# Base Stage - CUDA-enabled Python using official PyTorch CUDA image
# =============================================================================
# Using Python 3.12 bookworm with CUDA support via PyTorch's approach
FROM python:3.12-bookworm AS base

# CUDA environment variables (for runtime GPU detection)
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    UV_COMPILE_BYTECODE=1 \
    UV_LINK_MODE=copy \
    # CUDA environment - these tell nvidia-container-runtime to inject GPU
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility

# Install uv and system dependencies
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

# Core build/runtime deps for project packages (sounddevice, asyncpg, cryptography).
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
    build-essential \
    pkg-config \
    portaudio19-dev \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# Copy dependency files first for better layer caching
COPY pyproject.toml uv.lock* ./

# =============================================================================
# Server Stage - GPU Enabled
# =============================================================================
FROM base AS server

# Install dependencies with CUDA-enabled PyTorch
# The --extra-index-url ensures we get CUDA-enabled torch
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --frozen --no-install-project --group dev --all-extras

# Copy source code
COPY . .

# Install the project itself
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --frozen --group dev --all-extras

# Install spaCy small English model for NER (baked into image)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl

# Verify CUDA is accessible (will fail build if CUDA libs missing)
RUN python -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA available: {torch.cuda.is_available()}')" || true

EXPOSE 50051

CMD ["sh", "-c", "uv sync --frozen --group dev --all-extras && uv run python scripts/dev_watch_server.py"]

# =============================================================================
# Server Production Stage - GPU Enabled with all extras
# =============================================================================
FROM base AS server-full

# Install all dependencies including optional extras
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --frozen --no-install-project --group dev --all-extras

# Copy source code
COPY . .

# Install the project itself
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --frozen --group dev --all-extras

# Install spaCy small English model for NER (baked into image)
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl

EXPOSE 50051

CMD ["sh", "-c", "uv sync --frozen --group dev --all-extras && uv run python scripts/dev_watch_server.py"]
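Note that the base image here carries no CUDA libraries of its own; GPU support comes from the PyTorch wheels plus the NVIDIA runtime injected by the container toolkit. A quick post-build check, sketched with the service name from compose.yaml above (--no-deps skips starting the database containers):

```bash
# Build the GPU server image defined in compose.yaml
docker compose --profile server-gpu build server-gpu

# Run a one-off container and ask PyTorch whether it sees a CUDA device
docker compose --profile server-gpu run --rm --no-deps server-gpu \
  uv run python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
```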