# Files
# rag-manager/ingest_pipeline/core/models.py
# 2025-09-19 08:31:36 +00:00
#
# 198 lines
# 5.9 KiB
# Python

"""Core data models with strict typing."""
from datetime import UTC, datetime
from enum import Enum
from typing import Annotated, ClassVar, TypedDict
from uuid import UUID, uuid4

from prefect.blocks.core import Block
from pydantic import BaseModel, Field, HttpUrl, SecretStr, model_validator
class IngestionStatus(str, Enum):
    """Status of an ingestion job.

    Members also subclass ``str``, so each one compares equal to its
    lowercase string value (for example ``IngestionStatus.FAILED == "failed"``)
    and can be looked up from that value with ``IngestionStatus("failed")``.
    """

    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    PARTIAL = "partial"  # Some documents succeeded, some failed
    FAILED = "failed"
    CANCELLED = "cancelled"
class StorageBackend(str, Enum):
    """Available storage backends.

    Members also subclass ``str``, so each one compares equal to its
    string value (for example ``StorageBackend.R2R == "r2r"``).
    """

    WEAVIATE = "weaviate"
    OPEN_WEBUI = "open_webui"
    R2R = "r2r"
class IngestionSource(str, Enum):
    """Types of ingestion sources.

    Members also subclass ``str``, so each one compares equal to its
    string value (for example ``IngestionSource.WEB == "web"``).
    """

    WEB = "web"
    REPOSITORY = "repository"
    DOCUMENTATION = "documentation"
class VectorConfig(BaseModel):
    """Configuration for vectorization.

    Every field has a default, so ``VectorConfig()`` is a valid instance.
    """

    # Name of the embedding model; defaults to "ollama/bge-m3:latest".
    model: str = Field(default="ollama/bge-m3:latest")
    # URL validated as an HTTP(S) URL; defaults to "http://llm.lab".
    embedding_endpoint: HttpUrl = Field(default=HttpUrl("http://llm.lab"))
    # Vector dimension. A zero or negative size can never describe a vector,
    # so it is rejected at validation time instead of surfacing downstream.
    dimension: Annotated[int, Field(gt=0)] = 1024
    # Batch size, validated to lie in the range 1..1000.
    batch_size: Annotated[int, Field(gt=0, le=1000)] = 100
class StorageConfig(Block):
    """Configuration for storage backend.

    Defined as a Prefect ``Block`` subclass; ``backend`` and ``endpoint`` are
    required, the remaining fields have defaults.
    """

    # Block registration metadata (name, slug, and description).
    _block_type_name: ClassVar[str] = "Storage Configuration"
    _block_type_slug: ClassVar[str] = "storage-config"
    _description: ClassVar[str] = (
        "Configures storage backend connections and settings for document ingestion"
    )

    # One of the StorageBackend members.
    backend: StorageBackend
    # Validated as an HTTP(S) URL.
    endpoint: HttpUrl
    # Optional credential, held as a SecretStr; None when not supplied.
    api_key: SecretStr | None = Field(default=None)
    collection_name: str = Field(default="documents")
    # Batch size, validated to lie in the range 1..1000.
    batch_size: Annotated[int, Field(gt=0, le=1000)] = 100
class FirecrawlConfig(Block):
    """Configuration for Firecrawl ingestion (operational parameters only).

    Defined as a Prefect ``Block`` subclass; every field has a default.
    """

    # Block registration metadata (name, slug, and description).
    _block_type_name: ClassVar[str] = "Firecrawl Configuration"
    _block_type_slug: ClassVar[str] = "firecrawl-config"
    _description: ClassVar[str] = (
        "Configures Firecrawl web scraping and crawling parameters"
    )

    # A fresh ["markdown", "html"] list is built for each instance.
    formats: list[str] = Field(default_factory=lambda: ["markdown", "html"])
    # Validated to lie in the range 1..20.
    max_depth: Annotated[int, Field(ge=1, le=20)] = 5
    # Validated to lie in the range 1..1000.
    limit: Annotated[int, Field(ge=1, le=1000)] = 100
    only_main_content: bool = Field(default=True)
    include_subdomains: bool = Field(default=False)
class RepomixConfig(Block):
    """Configuration for Repomix ingestion.

    Defined as a Prefect ``Block`` subclass; every field has a default.
    """

    # Block registration metadata (name, slug, and description).
    _block_type_name: ClassVar[str] = "Repomix Configuration"
    _block_type_slug: ClassVar[str] = "repomix-config"
    _description: ClassVar[str] = (
        "Configures repository ingestion patterns and file processing settings"
    )

    # Glob patterns; a fresh list is built for each instance.
    include_patterns: list[str] = Field(
        default_factory=lambda: ["*.py", "*.js", "*.ts", "*.md", "*.yaml", "*.json"]
    )
    exclude_patterns: list[str] = Field(
        default_factory=lambda: ["**/node_modules/**", "**/__pycache__/**", "**/.git/**"]
    )
    max_file_size: int = Field(default=1_000_000)  # 1MB
    respect_gitignore: bool = Field(default=True)
class R2RConfig(Block):
    """Configuration for R2R ingestion.

    Defined as a Prefect ``Block`` subclass; every field has a default.
    Besides the per-field bounds, ``chunk_overlap`` must be strictly smaller
    than ``chunk_size``.
    """

    # Block registration metadata (name, slug, and description).
    _block_type_name: ClassVar[str] = "R2R Configuration"
    _block_type_slug: ClassVar[str] = "r2r-config"
    _description: ClassVar[str] = (
        "Configures R2R-specific ingestion settings including chunking and graph enrichment"
    )

    # Validated to lie in the range 100..8192.
    chunk_size: Annotated[int, Field(ge=100, le=8192)] = 1000
    # Validated to lie in the range 0..1000.
    chunk_overlap: Annotated[int, Field(ge=0, le=1000)] = 200
    enable_graph_enrichment: bool = Field(default=False)
    # Free-form settings mapping; None when not supplied.
    graph_creation_settings: dict[str, object] | None = Field(default=None)

    @model_validator(mode="after")
    def _check_overlap_fits_chunk(self) -> "R2RConfig":
        """Reject an overlap that is not smaller than the chunk size.

        The individual field bounds admit combinations such as
        ``chunk_size=100, chunk_overlap=1000``; this cross-field check
        catches them once both values have been validated.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: If ``chunk_overlap >= chunk_size``. pydantic reports
                it as a validation error, like any other field violation.
        """
        if self.chunk_overlap >= self.chunk_size:
            raise ValueError(
                f"chunk_overlap ({self.chunk_overlap}) must be smaller than "
                f"chunk_size ({self.chunk_size})"
            )
        return self
class DocumentMetadataRequired(TypedDict):
    """Required metadata fields for a document.

    All five keys are mandatory because the class uses the default
    ``total=True``.
    """

    source_url: str
    timestamp: datetime
    content_type: str
    word_count: int
    char_count: int
class DocumentMetadata(DocumentMetadataRequired, total=False):
    """Rich metadata for a document with R2R-compatible fields.

    Keys inherited from ``DocumentMetadataRequired`` stay mandatory; every
    key declared here is optional because of ``total=False``.
    """

    # Basic optional fields
    title: str | None
    description: str | None

    # Content categorization
    tags: list[str]
    category: str
    section: str
    language: str

    # Authorship and source info
    author: str
    domain: str
    site_name: str

    # Document structure
    heading_hierarchy: list[str]
    section_depth: int
    has_code_blocks: bool
    has_images: bool
    has_links: bool

    # Processing metadata
    extraction_method: str
    crawl_depth: int
    last_modified: datetime | None

    # Content quality indicators
    readability_score: float | None
    completeness_score: float | None

    # Repository-specific fields
    file_path: str | None
    repository_name: str | None
    branch_name: str | None
    commit_hash: str | None
    programming_language: str | None

    # Custom business metadata
    importance_score: float | None
    review_status: str | None
    assigned_team: str | None
class Document(BaseModel):
    """Represents a single document.

    ``content``, ``metadata`` and ``source`` are required; the remaining
    fields have defaults.
    """

    # A new random UUID (uuid4) is generated for each instance by default.
    id: UUID = Field(default_factory=uuid4)
    content: str
    # NOTE(review): DocumentMetadata is built on typing.TypedDict; pydantic
    # asks for typing_extensions.TypedDict on Python < 3.12 -- confirm the
    # deployment runs on 3.12 or newer.
    metadata: DocumentMetadata
    # None when not set.
    vector: list[float] | None = Field(default=None)
    # None when not set.
    score: float | None = Field(default=None)
    # One of the IngestionSource members.
    source: IngestionSource
    collection: str = Field(default="documents")
class IngestionJob(BaseModel):
    """Represents an ingestion job.

    ``source_type``, ``source_url`` and ``storage_backend`` are required; a
    new job starts as ``IngestionStatus.PENDING`` with a document count of 0.
    """

    # A new random UUID (uuid4) is generated for each instance by default.
    id: UUID = Field(default_factory=uuid4)
    source_type: IngestionSource
    # Declared as HttpUrl | str, so plain strings are accepted as well as URLs.
    source_url: HttpUrl | str
    status: IngestionStatus = Field(default=IngestionStatus.PENDING)
    # Both timestamps default to the current UTC time. Each factory runs on
    # its own, so the two values may differ slightly on a new instance.
    created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
    updated_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
    # None by default.
    completed_at: datetime | None = Field(default=None)
    error_message: str | None = Field(default=None)
    document_count: int = Field(default=0)
    storage_backend: StorageBackend
class IngestionResult(BaseModel):
    """Result of an ingestion operation.

    Every field except ``error_messages`` is required.
    """

    job_id: UUID
    status: IngestionStatus
    documents_processed: int
    documents_failed: int
    duration_seconds: float
    # A fresh empty list is created for each instance by default.
    error_messages: list[str] = Field(default_factory=list)