# rag-manager/ingest_pipeline/config/settings.py

"""Application settings and configuration."""

from functools import lru_cache
from typing import Annotated, Literal

from pydantic import Field, HttpUrl, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Application settings.

    Values are read by pydantic-settings from the environment and from a
    UTF-8 ``.env`` file. Variable names are matched case-insensitively and
    variables that do not correspond to a field are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",  # Ignore extra environment variables
    )

    # API Keys (all optional; a missing key for the default backend only warns)
    firecrawl_api_key: str | None = None
    openwebui_api_key: str | None = None
    weaviate_api_key: str | None = None
    r2r_api_key: str | None = None

    # Endpoints
    llm_endpoint: HttpUrl = HttpUrl("http://llm.lab")
    weaviate_endpoint: HttpUrl = HttpUrl("http://weaviate.yo")
    openwebui_endpoint: HttpUrl = HttpUrl("http://chat.lab")  # This will be the API URL
    firecrawl_endpoint: HttpUrl = HttpUrl("http://crawl.lab:30002")
    # No default: populated through the ``r2r_api_url`` alias (R2R_API_URL).
    r2r_endpoint: HttpUrl | None = Field(default=None, alias="r2r_api_url")

    # Model Configuration
    embedding_model: str = "ollama/bge-m3:latest"
    embedding_dimension: int = 1024

    # Ingestion Settings
    default_batch_size: Annotated[int, Field(gt=0, le=500)] = 50
    max_file_size: int = 1_000_000
    max_crawl_depth: Annotated[int, Field(ge=1, le=20)] = 5
    max_crawl_pages: Annotated[int, Field(ge=1, le=1000)] = 100

    # Storage Settings
    default_storage_backend: Literal["weaviate", "open_webui", "r2r"] = "weaviate"
    default_collection_prefix: str = "docs"

    # Prefect Settings
    prefect_api_url: HttpUrl | None = None
    prefect_api_key: str | None = None
    prefect_work_pool: str = "default"

    # Scheduling Defaults
    default_schedule_interval: Annotated[int, Field(ge=1, le=10080)] = 60  # Max 1 week

    # Performance Settings
    max_concurrent_tasks: Annotated[int, Field(ge=1, le=20)] = 5
    request_timeout: Annotated[int, Field(ge=10, le=300)] = 60

    # Logging
    log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"] = "INFO"

    def get_storage_endpoint(self, backend: str) -> HttpUrl:
        """
        Get endpoint for storage backend.

        Args:
            backend: Storage backend name; one of "weaviate", "open_webui"
                or "r2r".

        Returns:
            Endpoint URL configured for that backend.

        Raises:
            ValueError: If backend is unknown or R2R endpoint not configured
        """
        endpoints = {
            "weaviate": self.weaviate_endpoint,
            "open_webui": self.openwebui_endpoint,
        }

        if backend in endpoints:
            return endpoints[backend]

        if backend == "r2r":
            # The model validator only enforces this when r2r is the *default*
            # backend, so an explicit request for r2r can still find it unset.
            if not self.r2r_endpoint:
                raise ValueError(
                    "R2R_API_URL must be set in environment variables. "
                    "This should have been caught during settings validation."
                )
            return self.r2r_endpoint

        raise ValueError(f"Unknown backend: {backend}. Supported: weaviate, open_webui, r2r")

    def get_api_key(self, service: str) -> str | None:
        """
        Get API key for service.

        Args:
            service: Service name; one of "firecrawl", "openwebui"
                (or its backend spelling "open_webui"), "weaviate" or "r2r".

        Returns:
            The configured API key, or None when the key is unset or the
            service name is not recognised.
        """
        service_map = {
            "firecrawl": self.firecrawl_api_key,
            "openwebui": self.openwebui_api_key,
            # Backend-style spelling used by default_storage_backend and
            # get_storage_endpoint. Without this entry, passing a backend name
            # straight through silently returned None for Open WebUI.
            "open_webui": self.openwebui_api_key,
            "weaviate": self.weaviate_api_key,
            "r2r": self.r2r_api_key,
        }
        return service_map.get(service)

    @model_validator(mode="after")
    def validate_backend_configuration(self) -> "Settings":
        """Validate that required configuration is present for the default backend.

        Returns:
            The validated settings instance, unchanged.

        Raises:
            ValueError: If the default backend is "r2r" and no R2R endpoint
                is configured.

        Warns:
            UserWarning: If the API key for the default backend is not set.
        """
        backend = self.default_storage_backend

        # Validate R2R backend configuration
        if backend == "r2r" and not self.r2r_endpoint:
            raise ValueError(
                "R2R_API_URL must be set in environment variables when using R2R as default backend"
            )

        # Validate API key requirements (optional warning for missing keys)
        required_keys = {
            "weaviate": ("WEAVIATE_API_KEY", self.weaviate_api_key),
            "open_webui": ("OPENWEBUI_API_KEY", self.openwebui_api_key),
            "r2r": ("R2R_API_KEY", self.r2r_api_key),
        }

        if backend in required_keys:
            key_name, key_value = required_keys[backend]
            if not key_value:
                import warnings

                # A missing key is not fatal: warn and let the settings load.
                warnings.warn(
                    f"{key_name} not set - authentication may fail for {backend} backend",
                    UserWarning,
                    stacklevel=2,
                )

        return self


@lru_cache
def get_settings() -> Settings:
    """
    Return the shared settings object.

    The first call constructs ``Settings()``; ``lru_cache`` memoizes that
    result, so later calls return the same instance without building it again.

    Returns:
        Cached Settings instance
    """
    settings = Settings()
    return settings