Files
rag-manager/ingest_pipeline/config/settings.py
2025-09-18 09:44:16 +00:00

150 lines
4.7 KiB
Python

"""Application settings and configuration."""
from functools import lru_cache
from typing import Annotated, Literal
from pydantic import Field, HttpUrl, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Application settings.

    Values are read from environment variables and from an optional ``.env``
    file. Variable names are matched case-insensitively and variables that do
    not correspond to a field are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",  # Ignore extra environment variables
    )

    # API Keys (all optional; None means "not configured")
    firecrawl_api_key: str | None = None
    openwebui_api_key: str | None = None
    weaviate_api_key: str | None = None
    r2r_api_key: str | None = None

    # Endpoints
    llm_endpoint: HttpUrl = HttpUrl("http://llm.lab")
    weaviate_endpoint: HttpUrl = HttpUrl("http://weaviate.yo")
    openwebui_endpoint: HttpUrl = HttpUrl("http://chat.lab")  # This will be the API URL
    firecrawl_endpoint: HttpUrl = HttpUrl("http://crawl.lab:30002")
    # No default: declared under the alias "r2r_api_url", i.e. the
    # R2R_API_URL variable named in the error messages below.
    r2r_endpoint: HttpUrl | None = Field(default=None, alias="r2r_api_url")

    # Model Configuration
    embedding_model: str = "ollama/bge-m3:latest"
    embedding_dimension: int = 1024

    # Ingestion Settings
    default_batch_size: Annotated[int, Field(gt=0, le=500)] = 50
    max_file_size: int = 1_000_000  # NOTE(review): unit not stated here (presumably bytes) - confirm
    max_crawl_depth: Annotated[int, Field(ge=1, le=20)] = 5
    max_crawl_pages: Annotated[int, Field(ge=1, le=1000)] = 100

    # Storage Settings
    default_storage_backend: Literal["weaviate", "open_webui", "r2r"] = "weaviate"
    default_collection_prefix: str = "docs"

    # Prefect Settings
    prefect_api_url: HttpUrl | None = None
    prefect_api_key: str | None = None
    prefect_work_pool: str = "default"

    # Scheduling Defaults
    default_schedule_interval: Annotated[int, Field(ge=1, le=10080)] = 60  # Max 1 week

    # Performance Settings
    max_concurrent_tasks: Annotated[int, Field(ge=1, le=20)] = 5
    request_timeout: Annotated[int, Field(ge=10, le=300)] = 60

    # Logging
    log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"] = "INFO"

    def get_storage_endpoint(self, backend: str) -> HttpUrl:
        """
        Get endpoint for storage backend.

        Args:
            backend: Storage backend name ("weaviate", "open_webui" or "r2r")

        Returns:
            Endpoint URL

        Raises:
            ValueError: If backend is unknown or R2R endpoint not configured
        """
        # Backends whose endpoint always has a value (they carry defaults).
        endpoints = {
            "weaviate": self.weaviate_endpoint,
            "open_webui": self.openwebui_endpoint,
        }
        if backend in endpoints:
            return endpoints[backend]

        if backend != "r2r":
            raise ValueError(
                f"Unknown backend: {backend}. Supported: weaviate, open_webui, r2r"
            )

        # The R2R endpoint has no default, and the model validator only
        # enforces it when R2R is the *default* backend, so check it here too.
        if not self.r2r_endpoint:
            raise ValueError(
                "R2R_API_URL must be set in environment variables. "
                "This should have been caught during settings validation."
            )
        return self.r2r_endpoint

    def get_api_key(self, service: str) -> str | None:
        """
        Get API key for service.

        Args:
            service: Service name ("firecrawl", "openwebui", "weaviate" or
                "r2r"). The storage-backend spelling "open_webui" is accepted
                as well, so a backend name can be passed in directly.

        Returns:
            API key, or None if the service is unknown or its key is not set
        """
        service_map = {
            "firecrawl": self.firecrawl_api_key,
            "openwebui": self.openwebui_api_key,
            # Same key under the spelling used by default_storage_backend and
            # get_storage_endpoint; without it a configured key would be
            # silently reported as missing.
            "open_webui": self.openwebui_api_key,
            "weaviate": self.weaviate_api_key,
            "r2r": self.r2r_api_key,
        }
        return service_map.get(service)

    @model_validator(mode="after")
    def validate_backend_configuration(self) -> "Settings":
        """Validate that required configuration is present for the default backend.

        Returns:
            The validated settings instance

        Raises:
            ValueError: If R2R is the default backend and no R2R endpoint is set
        """
        backend = self.default_storage_backend

        # Validate R2R backend configuration
        if backend == "r2r" and not self.r2r_endpoint:
            raise ValueError(
                "R2R_API_URL must be set in environment variables when using R2R as default backend"
            )

        # Validate API key requirements (optional warning for missing keys)
        required_keys = {
            "weaviate": ("WEAVIATE_API_KEY", self.weaviate_api_key),
            "open_webui": ("OPENWEBUI_API_KEY", self.openwebui_api_key),
            "r2r": ("R2R_API_KEY", self.r2r_api_key),
        }
        if backend in required_keys:
            key_name, key_value = required_keys[backend]
            if not key_value:
                # Local import: warnings is only needed on this path.
                import warnings

                # A missing key is reported but does not fail validation.
                warnings.warn(
                    f"{key_name} not set - authentication may fail for {backend} backend",
                    UserWarning,
                    stacklevel=2,
                )

        return self
@lru_cache
def get_settings() -> Settings:
    """
    Build the application settings once and reuse them.

    The function takes no arguments, so the cache holds a single entry:
    the first call constructs ``Settings`` and later calls return that
    same object.

    Returns:
        Cached Settings instance
    """
    settings = Settings()
    return settings