198 lines
5.9 KiB
Python
198 lines
5.9 KiB
Python
"""Core data models with strict typing."""
|
|
|
|
from datetime import UTC, datetime
|
|
from enum import Enum
|
|
from typing import Annotated, ClassVar, TypedDict
|
|
from uuid import UUID, uuid4
|
|
|
|
from prefect.blocks.core import Block
|
|
from pydantic import BaseModel, Field, HttpUrl, SecretStr
|
|
|
|
|
|
class IngestionStatus(str, Enum):
    """Lifecycle states that an ingestion job can report.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase value (for example ``IngestionStatus.FAILED == "failed"``).
    """

    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    # Mixed outcome: some documents succeeded, some failed.
    PARTIAL = "partial"
    FAILED = "failed"
    CANCELLED = "cancelled"
|
|
|
|
|
|
class StorageBackend(str, Enum):
    """Identifiers of the storage backends that can be selected.

    Each member doubles as a ``str`` equal to its lowercase value.
    """

    WEAVIATE = "weaviate"
    OPEN_WEBUI = "open_webui"
    R2R = "r2r"
|
|
|
|
|
|
class IngestionSource(str, Enum):
    """Kinds of source that content can be ingested from.

    Each member doubles as a ``str`` equal to its lowercase value.
    """

    WEB = "web"
    REPOSITORY = "repository"
    DOCUMENTATION = "documentation"
|
|
|
|
|
|
class VectorConfig(BaseModel):
    """Vectorization settings; every field has a default."""

    model: str = "ollama/bge-m3:latest"
    embedding_endpoint: HttpUrl = HttpUrl("http://llm.lab")
    dimension: int = 1024
    # Validated to satisfy 0 < batch_size <= 1000.
    batch_size: Annotated[int, Field(gt=0, le=1000)] = 100
|
|
|
|
|
|
class StorageConfig(Block):
    """Prefect block holding the settings for a storage backend.

    ``backend`` and ``endpoint`` must be supplied; the remaining fields
    have defaults.
    """

    # Block type metadata (display name, slug, description).
    _block_type_name: ClassVar[str] = "Storage Configuration"
    _block_type_slug: ClassVar[str] = "storage-config"
    _description: ClassVar[str] = (
        "Configures storage backend connections and settings for document ingestion"
    )

    backend: StorageBackend
    endpoint: HttpUrl
    # Optional credential, wrapped in SecretStr; absent by default.
    api_key: SecretStr | None = None
    collection_name: str = "documents"
    # Validated to satisfy 0 < batch_size <= 1000.
    batch_size: Annotated[int, Field(gt=0, le=1000)] = 100
|
|
|
|
|
|
class FirecrawlConfig(Block):
    """Prefect block with the operational parameters for Firecrawl ingestion.

    Only operational parameters live here; every field has a default.
    """

    # Block type metadata (display name, slug, description).
    _block_type_name: ClassVar[str] = "Firecrawl Configuration"
    _block_type_slug: ClassVar[str] = "firecrawl-config"
    _description: ClassVar[str] = (
        "Configures Firecrawl web scraping and crawling parameters"
    )

    # A factory gives each instance its own list.
    formats: list[str] = Field(
        default_factory=lambda: ["markdown", "html"],
    )
    # Validated to satisfy 1 <= max_depth <= 20.
    max_depth: Annotated[int, Field(ge=1, le=20)] = 5
    # Validated to satisfy 1 <= limit <= 1000.
    limit: Annotated[int, Field(ge=1, le=1000)] = 100
    only_main_content: bool = True
    include_subdomains: bool = False
|
|
|
|
|
|
class RepomixConfig(Block):
    """Prefect block with the settings for Repomix repository ingestion.

    Every field has a default.
    """

    # Block type metadata (display name, slug, description).
    _block_type_name: ClassVar[str] = "Repomix Configuration"
    _block_type_slug: ClassVar[str] = "repomix-config"
    _description: ClassVar[str] = (
        "Configures repository ingestion patterns and file processing settings"
    )

    # Factories give each instance its own pattern lists.
    include_patterns: list[str] = Field(
        default_factory=lambda: [
            "*.py",
            "*.js",
            "*.ts",
            "*.md",
            "*.yaml",
            "*.json",
        ],
    )
    exclude_patterns: list[str] = Field(
        default_factory=lambda: [
            "**/node_modules/**",
            "**/__pycache__/**",
            "**/.git/**",
        ],
    )
    max_file_size: int = 1_000_000  # 1MB
    respect_gitignore: bool = True
|
|
|
|
|
|
class R2RConfig(Block):
    """Prefect block with the settings for R2R ingestion.

    Covers chunking parameters and optional graph enrichment; every field
    has a default.
    """

    # Block type metadata (display name, slug, description).
    _block_type_name: ClassVar[str] = "R2R Configuration"
    _block_type_slug: ClassVar[str] = "r2r-config"
    _description: ClassVar[str] = (
        "Configures R2R-specific ingestion settings including chunking and graph enrichment"
    )

    # Validated to satisfy 100 <= chunk_size <= 8192.
    chunk_size: Annotated[int, Field(ge=100, le=8192)] = 1000
    # Validated to satisfy 0 <= chunk_overlap <= 1000.
    # NOTE(review): the two ranges are checked independently, so nothing here
    # enforces chunk_overlap < chunk_size -- confirm consumers tolerate that.
    chunk_overlap: Annotated[int, Field(ge=0, le=1000)] = 200
    enable_graph_enrichment: bool = False
    # Free-form settings mapping; None when not provided.
    graph_creation_settings: dict[str, object] | None = None
|
|
|
|
|
|
class DocumentMetadataRequired(TypedDict):
    """Metadata keys that every document must carry.

    The TypedDict is total, so all five keys are required.
    """

    source_url: str
    timestamp: datetime
    content_type: str
    # Size counters.
    word_count: int
    char_count: int
|
|
|
|
|
|
class DocumentMetadata(DocumentMetadataRequired, total=False):
    """Document metadata with R2R-compatible fields.

    Keys inherited from ``DocumentMetadataRequired`` stay required. Every key
    declared here is optional because of ``total=False``; keys typed
    ``... | None`` may additionally be present with a ``None`` value.
    """

    # --- Basic descriptive fields ---
    title: str | None
    description: str | None

    # --- Content categorization ---
    tags: list[str]
    category: str
    section: str
    language: str

    # --- Authorship and source information ---
    author: str
    domain: str
    site_name: str

    # --- Document structure ---
    heading_hierarchy: list[str]
    section_depth: int
    has_code_blocks: bool
    has_images: bool
    has_links: bool

    # --- Processing metadata ---
    extraction_method: str
    crawl_depth: int
    last_modified: datetime | None

    # --- Content quality indicators ---
    readability_score: float | None
    completeness_score: float | None

    # --- Repository-specific fields ---
    file_path: str | None
    repository_name: str | None
    branch_name: str | None
    commit_hash: str | None
    programming_language: str | None

    # --- Custom business metadata ---
    importance_score: float | None
    review_status: str | None
    assigned_team: str | None
|
|
|
|
|
|
class Document(BaseModel):
    """A single document: its content, metadata, and source.

    ``content``, ``metadata`` and ``source`` must be supplied; the other
    fields have defaults.
    """

    # A fresh random UUID is generated when no id is given.
    id: UUID = Field(default_factory=uuid4)
    content: str
    # NOTE(review): DocumentMetadata derives from typing.TypedDict; pydantic
    # documents a typing_extensions.TypedDict requirement on Python < 3.12 --
    # confirm the supported runtime.
    metadata: DocumentMetadata
    vector: list[float] | None = None
    score: float | None = None
    source: IngestionSource
    collection: str = "documents"
|
|
|
|
|
|
class IngestionJob(BaseModel):
    """Record of one ingestion job.

    ``source_type``, ``source_url`` and ``storage_backend`` must be supplied;
    the other fields have defaults.
    """

    # A fresh random UUID is generated when no id is given.
    id: UUID = Field(default_factory=uuid4)
    source_type: IngestionSource
    source_url: HttpUrl | str
    status: IngestionStatus = IngestionStatus.PENDING
    # Both timestamps default to the timezone-aware current UTC time; each
    # factory runs separately, so the two values may differ slightly.
    created_at: datetime = Field(default_factory=lambda: datetime.now(tz=UTC))
    updated_at: datetime = Field(default_factory=lambda: datetime.now(tz=UTC))
    completed_at: datetime | None = None
    error_message: str | None = None
    document_count: int = 0
    storage_backend: StorageBackend
|
|
|
|
|
|
class IngestionResult(BaseModel):
    """Outcome summary of an ingestion operation.

    All fields except ``error_messages`` must be supplied.
    """

    job_id: UUID
    status: IngestionStatus
    # Document counters.
    documents_processed: int
    documents_failed: int
    duration_seconds: float
    # Defaults to a new empty list for each instance.
    error_messages: list[str] = Field(default_factory=list)
|