This file is a merged representation of a subset of the codebase, containing specifically included files, combined into a single document by Repomix.
<file_summary>
This section contains a summary of this file.
<purpose>
This file contains a packed representation of a subset of the repository's contents that is considered the most important context.
It is designed to be easily consumable by AI systems for analysis, code review,
or other automated processes.
</purpose>
<file_format>
The content is organized as follows:
1. This summary section
2. Repository information
3. Directory structure
4. Repository files (if enabled)
5. Multiple file entries, each consisting of:
- File path as an attribute
- Full contents of the file
</file_format>
<usage_guidelines>
- This file should be treated as read-only. Any changes should be made to the
original repository files, not this packed version.
- When processing this file, use the file path to distinguish
between different files in the repository.
- Be aware that this file may contain sensitive information. Handle it with
the same level of security as you would the original repository.
</usage_guidelines>
<notes>
- Some files may have been excluded based on .gitignore rules and Repomix's configuration
- Binary files are not included in this packed representation. Please refer to the Repository Structure section for a complete list of file paths, including binary files
- Only files matching these patterns are included: ingest_pipeline/
- Files matching patterns in .gitignore are excluded
- Files matching default ignore patterns are excluded
- Files are sorted by Git change count (files with more changes are at the bottom)
</notes>
</file_summary>
<directory_structure>
ingest_pipeline/
automations/
__init__.py
cli/
tui/
screens/
__init__.py
base.py
dashboard.py
dialogs.py
documents.py
help.py
ingestion.py
search.py
utils/
__init__.py
runners.py
storage_manager.py
widgets/
__init__.py
cards.py
firecrawl_config.py
indicators.py
r2r_widgets.py
tables.py
__init__.py
app.py
layouts.py
models.py
styles.py
__init__.py
main.py
config/
__init__.py
settings.py
core/
__init__.py
exceptions.py
models.py
flows/
__init__.py
ingestion.py
scheduler.py
ingestors/
__init__.py
base.py
    firecrawl.py
    repomix.py
storage/
r2r/
__init__.py
collections.py
storage.py
__init__.py
base.py
openwebui.py
types.py
weaviate.py
utils/
__init__.py
async_helpers.py
metadata_tagger.py
vectorizer.py
__main__.py
</directory_structure>
<files>
This section contains the contents of the repository's files.
<file path="ingest_pipeline/utils/async_helpers.py">
"""Async utilities for task management and backpressure control."""
import asyncio
import logging
from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable
from contextlib import asynccontextmanager
from typing import Final, TypeVar
LOGGER: Final[logging.Logger] = logging.getLogger(__name__)
T = TypeVar("T")
class AsyncTaskManager:
"""Manages concurrent tasks with backpressure control."""
def __init__(self, max_concurrent: int = 10):
"""
Initialize task manager.
Args:
max_concurrent: Maximum number of concurrent tasks
"""
self.semaphore = asyncio.Semaphore(max_concurrent)
self.max_concurrent = max_concurrent
@asynccontextmanager
async def acquire(self) -> AsyncGenerator[None, None]:
"""Acquire a slot for task execution."""
async with self.semaphore:
yield
async def run_tasks(
self, tasks: Iterable[Awaitable[T]], return_exceptions: bool = False
) -> list[T | BaseException]:
"""
Run multiple tasks with backpressure control.
Args:
tasks: Iterable of awaitable tasks
return_exceptions: Whether to return exceptions or raise them
Returns:
List of task results or exceptions
"""
async def _controlled_task(task: Awaitable[T]) -> T:
async with self.acquire():
return await task
controlled_tasks = [_controlled_task(task) for task in tasks]
if return_exceptions:
results = await asyncio.gather(*controlled_tasks, return_exceptions=True)
return list(results)
else:
results = await asyncio.gather(*controlled_tasks)
return list(results)
async def map_async(
self, func: Callable[[T], Awaitable[T]], items: Iterable[T], return_exceptions: bool = False
) -> list[T | BaseException]:
"""
Apply async function to items with backpressure control.
Args:
func: Async function to apply
items: Items to process
return_exceptions: Whether to return exceptions or raise them
Returns:
List of processed results or exceptions
"""
tasks = [func(item) for item in items]
return await self.run_tasks(tasks, return_exceptions=return_exceptions)
async def run_with_semaphore(semaphore: asyncio.Semaphore, coro: Awaitable[T]) -> T:
"""Run coroutine with semaphore-controlled concurrency."""
async with semaphore:
return await coro
async def batch_process(
items: list[T],
processor: Callable[[T], Awaitable[T]],
batch_size: int = 50,
max_concurrent: int = 5,
) -> list[T]:
"""
Process items in batches with controlled concurrency.
Args:
items: Items to process
processor: Async function to process each item
batch_size: Number of items per batch
max_concurrent: Maximum concurrent tasks per batch
Returns:
List of processed results
"""
task_manager = AsyncTaskManager(max_concurrent)
results: list[T] = []
for i in range(0, len(items), batch_size):
batch = items[i : i + batch_size]
LOGGER.debug(
"Processing batch %d-%d of %d items", i, min(i + batch_size, len(items)), len(items)
)
batch_results = await task_manager.map_async(processor, batch, return_exceptions=False)
        # With return_exceptions=False, any exception has already propagated out of
        # map_async, so every result here is a successful T; the isinstance filter
        # below only narrows the type for the checker.
successful_results: list[T] = [r for r in batch_results if not isinstance(r, BaseException)]
results.extend(successful_results)
return results
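
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example, assuming an async `fetch` coroutine that does real I/O;
# it is stubbed here so the sketch stays self-contained.
async def _demo_batch_fetch(urls: list[str]) -> list[str]:
    async def fetch(url: str) -> str:
        await asyncio.sleep(0)  # placeholder for real network I/O
        return url

    # Process 50 URLs per batch, with at most 5 coroutines in flight at once.
    return await batch_process(urls, fetch, batch_size=50, max_concurrent=5)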
</file>
<file path="ingest_pipeline/automations/__init__.py">
"""Prefect Automations for ingestion pipeline monitoring and management."""
# Automation configurations as YAML-ready dictionaries
AUTOMATION_TEMPLATES = {
"cancel_long_running": """
name: Cancel Long Running Ingestion Flows
description: Cancels ingestion flows running longer than 30 minutes
trigger:
type: event
posture: Proactive
expect: [prefect.flow-run.Running]
match_related:
prefect.resource.role: flow
prefect.resource.name: ingestion_pipeline
threshold: 1
within: 1800
actions:
- type: cancel-flow-run
source: inferred
enabled: true
""",
"retry_failed": """
name: Retry Failed Ingestion Flows
description: Retries failed ingestion flows with original parameters
trigger:
type: event
posture: Reactive
expect: [prefect.flow-run.Failed]
match_related:
prefect.resource.role: flow
prefect.resource.name: ingestion_pipeline
threshold: 1
within: 0
actions:
- type: run-deployment
source: inferred
parameters:
validate_first: false
enabled: true
""",
"resource_monitoring": """
name: Manage Work Pool Based on Resources
description: Pauses work pool when system resources are constrained
trigger:
type: event
posture: Reactive
expect: [system.resource.high_usage]
threshold: 1
within: 120
actions:
- type: pause-work-pool
work_pool_name: default
enabled: true
""",
}
def get_automation_yaml_templates() -> dict[str, str]:
"""Get automation templates as YAML strings."""
return AUTOMATION_TEMPLATES.copy()
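
# Illustrative sketch (not part of the original module): each template is a
# plain YAML string, so it can be parsed into a dict before being submitted to
# the Prefect automations API. Assumes PyYAML is installed.
def _demo_parse_template(name: str = "retry_failed") -> dict[str, object]:
    import yaml

    parsed: dict[str, object] = yaml.safe_load(get_automation_yaml_templates()[name])
    return parsed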
</file>
<file path="ingest_pipeline/cli/tui/screens/help.py">
"""Help screen with keyboard shortcuts and usage information."""
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, ScrollableContainer
from textual.screen import ModalScreen
from textual.widgets import Button, Markdown, Rule, Static
from typing_extensions import override
class HelpScreen(ModalScreen[None]):
"""Modern help screen with comprehensive keyboard shortcuts."""
help_content: str
BINDINGS = [
Binding("escape", "app.pop_screen", "Close"),
Binding("q", "app.pop_screen", "Close"),
Binding("enter", "app.pop_screen", "Close"),
Binding("f1", "app.pop_screen", "Close"),
]
def __init__(self, help_content: str):
super().__init__()
self.help_content = help_content
@override
def compose(self) -> ComposeResult:
with Container(classes="modal-container"):
yield Static("📚 Help & Keyboard Shortcuts", classes="title")
yield Static("Enhanced navigation and productivity features", classes="subtitle")
yield Rule(line_style="heavy")
with ScrollableContainer():
yield Markdown(self.help_content)
yield Container(
Button("✅ Got it! (Press Escape or Enter)", id="close_btn", variant="primary"),
classes="action_buttons center",
)
def on_mount(self) -> None:
"""Initialize the help screen."""
# Focus the close button
self.query_one("#close_btn").focus()
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Close help screen."""
if event.button.id == "close_btn":
self.app.pop_screen()
</file>
<file path="ingest_pipeline/cli/tui/utils/__init__.py">
"""Utility functions for the TUI."""
from .runners import dashboard, run_textual_tui
__all__ = ["dashboard", "run_textual_tui"]
</file>
<file path="ingest_pipeline/cli/tui/widgets/__init__.py">
"""Enhanced widgets with keyboard navigation support."""
from .cards import MetricsCard
from .indicators import EnhancedProgressBar, StatusIndicator
from .tables import EnhancedDataTable
__all__ = [
"MetricsCard",
"StatusIndicator",
"EnhancedProgressBar",
"EnhancedDataTable",
]
</file>
<file path="ingest_pipeline/cli/tui/widgets/cards.py">
"""Metrics card widget."""
from typing import Any
from textual.app import ComposeResult
from textual.widgets import Static
from typing_extensions import override
class MetricsCard(Static):
"""A modern metrics display card."""
title: str
value: str
description: str
def __init__(self, title: str, value: str, description: str = "", **kwargs: Any) -> None:
super().__init__(**kwargs)
self.title = title
self.value = value
self.description = description
@override
def compose(self) -> ComposeResult:
yield Static(self.value, classes="metrics-value")
yield Static(self.title, classes="metrics-label")
if self.description:
yield Static(self.description, classes="metrics-description")
</file>
<file path="ingest_pipeline/cli/tui/widgets/tables.py">
"""Enhanced DataTable with improved keyboard navigation."""
from typing import Any
from textual import events
from textual.binding import Binding
from textual.message import Message
from textual.widgets import DataTable
class EnhancedDataTable(DataTable[Any]):
"""DataTable with enhanced keyboard navigation and visual feedback."""
BINDINGS = [
Binding("up,k", "cursor_up", "Cursor Up", show=False),
Binding("down,j", "cursor_down", "Cursor Down", show=False),
Binding("left,h", "cursor_left", "Cursor Left", show=False),
Binding("right,l", "cursor_right", "Cursor Right", show=False),
Binding("home", "cursor_home", "First Row", show=False),
Binding("end", "cursor_end", "Last Row", show=False),
Binding("pageup", "page_up", "Page Up", show=False),
Binding("pagedown", "page_down", "Page Down", show=False),
Binding("enter", "select_cursor", "Select", show=False),
Binding("space", "toggle_selection", "Toggle Selection", show=False),
Binding("ctrl+a", "select_all", "Select All", show=False),
Binding("ctrl+shift+a", "clear_selection", "Clear Selection", show=False),
]
def __init__(self, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.cursor_type = "row" # Default to row selection
self.zebra_stripes = True # Enable zebra striping for better visibility
self.show_cursor = True
def on_key(self, event: events.Key) -> None:
"""Handle additional keyboard shortcuts."""
if event.key == "ctrl+1":
# Jump to first column
self.move_cursor(column=0)
event.prevent_default()
elif event.key == "ctrl+9":
# Jump to last column
if self.columns:
self.move_cursor(column=len(self.columns) - 1)
event.prevent_default()
elif event.key == "/":
# Start quick search (to be implemented by parent)
self.post_message(self.QuickSearch(self))
event.prevent_default()
elif event.key == "escape":
# Clear selection or exit search
# Clear selection by calling action
self.action_clear_selection()
event.prevent_default()
# No else clause needed - just handle our events
def action_cursor_home(self) -> None:
"""Move cursor to first row."""
if self.row_count > 0:
self.move_cursor(row=0)
def action_cursor_end(self) -> None:
"""Move cursor to last row."""
if self.row_count > 0:
self.move_cursor(row=self.row_count - 1)
def action_page_up(self) -> None:
"""Move cursor up by visible page size."""
if self.row_count > 0:
page_size = max(1, self.size.height // 2) # Approximate visible rows
new_row = max(0, self.cursor_coordinate.row - page_size)
self.move_cursor(row=new_row)
def action_page_down(self) -> None:
"""Move cursor down by visible page size."""
if self.row_count > 0:
page_size = max(1, self.size.height // 2) # Approximate visible rows
new_row = min(self.row_count - 1, self.cursor_coordinate.row + page_size)
self.move_cursor(row=new_row)
def action_toggle_selection(self) -> None:
"""Toggle selection of current row."""
if self.row_count > 0:
current_row = self.cursor_coordinate.row
# This will be handled by the parent screen
self.post_message(self.RowToggled(self, current_row))
def action_select_all(self) -> None:
"""Select all rows."""
# This will be handled by the parent screen
self.post_message(self.SelectAll(self))
def action_clear_selection(self) -> None:
"""Clear all selections."""
# This will be handled by the parent screen
self.post_message(self.ClearSelection(self))
# Custom messages for enhanced functionality
class QuickSearch(Message):
"""Posted when user wants to start a quick search."""
def __init__(self, table: "EnhancedDataTable") -> None:
super().__init__()
self.table = table
class RowToggled(Message):
"""Posted when a row selection is toggled."""
def __init__(self, table: "EnhancedDataTable", row_index: int) -> None:
super().__init__()
self.table = table
self.row_index = row_index
class SelectAll(Message):
"""Posted when user wants to select all rows."""
def __init__(self, table: "EnhancedDataTable") -> None:
super().__init__()
self.table = table
class ClearSelection(Message):
"""Posted when user wants to clear selection."""
def __init__(self, table: "EnhancedDataTable") -> None:
super().__init__()
self.table = table
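
# Illustrative sketch (not part of the original module): a parent screen
# receives these messages through Textual's `on_<widget>_<message>` handler
# naming convention; `selected_rows` here is a hypothetical attribute.
class _DemoTableHost:
    def __init__(self) -> None:
        self.selected_rows: set[int] = set()

    def on_enhanced_data_table_row_toggled(self, event: EnhancedDataTable.RowToggled) -> None:
        # Toggle membership of the reported row index.
        self.selected_rows.symmetric_difference_update({event.row_index})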
</file>
<file path="ingest_pipeline/cli/tui/__init__.py">
"""Enhanced TUI package with keyboard navigation and modular architecture."""
from .app import CollectionManagementApp
from .models import CollectionInfo, DocumentInfo
from .utils import dashboard, run_textual_tui
__all__ = [
"CollectionManagementApp",
"CollectionInfo",
"DocumentInfo",
"dashboard",
"run_textual_tui",
]
</file>
<file path="ingest_pipeline/cli/__init__.py">
"""CLI module for the ingestion pipeline."""
from .main import app
__all__ = ["app"]
</file>
<file path="ingest_pipeline/core/__init__.py">
"""Core module for ingestion pipeline."""
from .exceptions import (
IngestionError,
StorageError,
VectorizationError,
)
from .models import (
Document,
IngestionJob,
IngestionResult,
IngestionSource,
IngestionStatus,
StorageBackend,
)
__all__ = [
"Document",
"IngestionJob",
"IngestionResult",
"IngestionSource",
"IngestionStatus",
"StorageBackend",
"IngestionError",
"StorageError",
"VectorizationError",
]
</file>
<file path="ingest_pipeline/core/exceptions.py">
"""Custom exceptions for the ingestion pipeline."""
class IngestionError(Exception):
"""Base exception for ingestion errors."""
pass
class StorageError(IngestionError):
"""Exception for storage-related errors."""
pass
class VectorizationError(IngestionError):
"""Exception for vectorization errors."""
pass
class ConfigurationError(IngestionError):
"""Exception for configuration errors."""
pass
class SourceNotFoundError(IngestionError):
"""Exception when source cannot be found or accessed."""
pass
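
# Illustrative usage (not part of the original module): adapters wrap backend
# failures in a pipeline-specific error while chaining the original cause.
#
#     try:
#         client.upsert(batch)  # hypothetical backend call
#     except Exception as exc:
#         raise StorageError(f"Failed to upsert batch: {exc}") from exc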
</file>
<file path="ingest_pipeline/flows/__init__.py">
"""Prefect flows for orchestration."""
from .ingestion import create_ingestion_flow
from .scheduler import create_scheduled_deployment
__all__ = [
"create_ingestion_flow",
"create_scheduled_deployment",
]
</file>
<file path="ingest_pipeline/storage/r2r/__init__.py">
"""R2R storage package providing comprehensive R2R integration."""
from .storage import R2RStorage
__all__ = ["R2RStorage"]
</file>
<file path="ingest_pipeline/storage/types.py">
"""Shared types for storage adapters."""
from typing import TypedDict
class CollectionSummary(TypedDict):
"""Collection metadata for describe_collections."""
name: str
count: int
size_mb: float
class DocumentInfo(TypedDict):
"""Document information for list_documents."""
id: str
title: str
source_url: str
description: str
content_type: str
content_preview: str
word_count: int
timestamp: str
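
# Illustrative example (not part of the original module): values below are
# made up to show the expected shape.
_EXAMPLE_SUMMARY: CollectionSummary = {"name": "docs", "count": 128, "size_mb": 4.2}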
</file>
<file path="ingest_pipeline/utils/__init__.py">
"""Utility modules."""
from .metadata_tagger import MetadataTagger
from .vectorizer import Vectorizer
__all__ = ["MetadataTagger", "Vectorizer"]
</file>
<file path="ingest_pipeline/__main__.py">
"""Main entry point for the ingestion pipeline."""
from .cli.main import app
if __name__ == "__main__":
app()
</file>
<file path="ingest_pipeline/cli/tui/screens/__init__.py">
"""Screen components for the TUI application."""
from __future__ import annotations
from .dashboard import CollectionOverviewScreen
from .dialogs import ConfirmDeleteScreen, ConfirmDocumentDeleteScreen
from .documents import DocumentManagementScreen
from .help import HelpScreen
from .ingestion import IngestionScreen
from .search import SearchScreen
__all__ = [
"CollectionOverviewScreen",
"IngestionScreen",
"SearchScreen",
"DocumentManagementScreen",
"ConfirmDeleteScreen",
"ConfirmDocumentDeleteScreen",
"HelpScreen",
]
</file>
<file path="ingest_pipeline/cli/tui/screens/base.py">
"""Base screen classes for common CRUD patterns."""
from __future__ import annotations
from typing import TYPE_CHECKING, Generic, TypeVar
from textual import work
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container
from textual.screen import ModalScreen, Screen
from textual.widget import Widget
from textual.widgets import Button, DataTable, LoadingIndicator, Static
from typing_extensions import override
if TYPE_CHECKING:
from ..utils.storage_manager import StorageManager
T = TypeVar("T")
class BaseScreen(Screen[object]):
"""Base screen with common functionality."""
def __init__(
self,
storage_manager: StorageManager,
*,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
**kwargs: object,
) -> None:
"""Initialize base screen."""
super().__init__(name=name, id=id, classes=classes)
# Ignore any additional kwargs to avoid type issues
self.storage_manager = storage_manager
class CRUDScreen(BaseScreen, Generic[T]):
"""Base class for Create/Read/Update/Delete operations."""
BINDINGS = [
Binding("ctrl+n", "create_item", "New"),
Binding("ctrl+e", "edit_item", "Edit"),
Binding("ctrl+d", "delete_item", "Delete"),
Binding("f5", "refresh", "Refresh"),
Binding("escape", "app.pop_screen", "Back"),
]
def __init__(
self,
storage_manager: StorageManager,
*,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
**kwargs: object,
) -> None:
"""Initialize CRUD screen."""
super().__init__(storage_manager, name=name, id=id, classes=classes)
self.items: list[T] = []
self.selected_item: T | None = None
self.loading = False
@override
def compose(self) -> ComposeResult:
"""Compose CRUD screen layout."""
yield Container(
Static(self.get_title(), classes="screen-title"),
self.create_toolbar(),
self.create_list_view(),
LoadingIndicator(id="loading"),
classes="crud-container",
)
def get_title(self) -> str:
"""Get screen title."""
return "CRUD Operations"
def create_toolbar(self) -> Container:
"""Create action toolbar."""
return Container(
Button("📝 New", id="new_btn", variant="primary"),
Button("✏️ Edit", id="edit_btn", variant="default"),
Button("🗑️ Delete", id="delete_btn", variant="error"),
Button("🔄 Refresh", id="refresh_btn", variant="default"),
classes="toolbar",
)
def create_list_view(self) -> DataTable[str]:
"""Create list view widget."""
table = DataTable[str](id="items_table")
table.add_columns(*self.get_table_columns())
return table
def get_table_columns(self) -> list[str]:
"""Get table column headers."""
raise NotImplementedError("Subclasses must implement get_table_columns")
async def load_items(self) -> list[T]:
"""Load items from storage."""
raise NotImplementedError("Subclasses must implement load_items")
def item_to_row(self, item: T) -> list[str]:
"""Convert item to table row."""
raise NotImplementedError("Subclasses must implement item_to_row")
async def create_item_dialog(self) -> T | None:
"""Show create item dialog."""
raise NotImplementedError("Subclasses must implement create_item_dialog")
async def edit_item_dialog(self, item: T) -> T | None:
"""Show edit item dialog."""
raise NotImplementedError("Subclasses must implement edit_item_dialog")
async def delete_item(self, item: T) -> bool:
"""Delete item."""
raise NotImplementedError("Subclasses must implement delete_item")
def on_mount(self) -> None:
"""Initialize screen."""
self.query_one("#loading").display = False
self.refresh_items()
@work(exclusive=True)
async def refresh_items(self) -> None:
"""Refresh items list."""
self.set_loading(True)
try:
self.items = await self.load_items()
await self.update_table()
finally:
self.set_loading(False)
async def update_table(self) -> None:
"""Update table with current items."""
table = self.query_one("#items_table", DataTable)
table.clear()
for item in self.items:
row_data = self.item_to_row(item)
table.add_row(*row_data)
def set_loading(self, loading: bool) -> None:
"""Set loading state."""
self.loading = loading
loading_widget = self.query_one("#loading")
loading_widget.display = loading
def action_create_item(self) -> None:
"""Create new item."""
self.run_worker(self._create_item_worker())
def action_edit_item(self) -> None:
"""Edit selected item."""
if self.selected_item:
self.run_worker(self._edit_item_worker())
def action_delete_item(self) -> None:
"""Delete selected item."""
if self.selected_item:
self.run_worker(self._delete_item_worker())
def action_refresh(self) -> None:
"""Refresh items."""
self.refresh_items()
async def _create_item_worker(self) -> None:
"""Worker for creating items."""
item = await self.create_item_dialog()
if item:
self.refresh_items()
async def _edit_item_worker(self) -> None:
"""Worker for editing items."""
if self.selected_item:
item = await self.edit_item_dialog(self.selected_item)
if item:
self.refresh_items()
async def _delete_item_worker(self) -> None:
"""Worker for deleting items."""
if self.selected_item:
success = await self.delete_item(self.selected_item)
if success:
self.refresh_items()
class ListScreen(BaseScreen, Generic[T]):
"""Base for paginated list views."""
def __init__(
self,
storage_manager: StorageManager,
page_size: int = 20,
*,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
**kwargs: object,
) -> None:
"""Initialize list screen."""
super().__init__(storage_manager, name=name, id=id, classes=classes)
self.page_size = page_size
self.current_page = 0
self.total_items = 0
self.items: list[T] = []
@override
def compose(self) -> ComposeResult:
"""Compose list screen layout."""
yield Container(
Static(self.get_title(), classes="screen-title"),
self.create_filters(),
self.create_list_view(),
self.create_pagination(),
LoadingIndicator(id="loading"),
classes="list-container",
)
def get_title(self) -> str:
"""Get screen title."""
raise NotImplementedError("Subclasses must implement get_title")
def create_filters(self) -> Container:
"""Create filter widgets."""
raise NotImplementedError("Subclasses must implement create_filters")
def create_list_view(self) -> Widget:
"""Create list view widget."""
raise NotImplementedError("Subclasses must implement create_list_view")
async def load_page(self, page: int, page_size: int) -> tuple[list[T], int]:
"""Load page of items."""
raise NotImplementedError("Subclasses must implement load_page")
def create_pagination(self) -> Container:
"""Create pagination controls."""
return Container(
Button("◀ Previous", id="prev_btn", variant="default"),
Static("Page 1 of 1", id="page_info"),
Button("Next ▶", id="next_btn", variant="default"),
classes="pagination",
)
def on_mount(self) -> None:
"""Initialize screen."""
self.query_one("#loading").display = False
self.load_current_page()
@work(exclusive=True)
async def load_current_page(self) -> None:
"""Load current page."""
self.set_loading(True)
try:
self.items, self.total_items = await self.load_page(self.current_page, self.page_size)
await self.update_list_view()
self.update_pagination_info()
finally:
self.set_loading(False)
async def update_list_view(self) -> None:
"""Update list view with current items."""
raise NotImplementedError("Subclasses must implement update_list_view")
def update_pagination_info(self) -> None:
"""Update pagination information."""
total_pages = max(1, (self.total_items + self.page_size - 1) // self.page_size)
current_page_display = self.current_page + 1
page_info = self.query_one("#page_info", Static)
page_info.update(f"Page {current_page_display} of {total_pages}")
prev_btn = self.query_one("#prev_btn", Button)
next_btn = self.query_one("#next_btn", Button)
prev_btn.disabled = self.current_page == 0
next_btn.disabled = self.current_page >= total_pages - 1
def set_loading(self, loading: bool) -> None:
"""Set loading state."""
loading_widget = self.query_one("#loading")
loading_widget.display = loading
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "prev_btn" and self.current_page > 0:
self.current_page -= 1
self.load_current_page()
elif event.button.id == "next_btn":
total_pages = (self.total_items + self.page_size - 1) // self.page_size
if self.current_page < total_pages - 1:
self.current_page += 1
self.load_current_page()
class FormScreen(ModalScreen[T], Generic[T]):
"""Base for input forms with validation."""
BINDINGS = [
Binding("escape", "app.pop_screen", "Cancel"),
Binding("ctrl+s", "save", "Save"),
Binding("enter", "save", "Save"),
]
def __init__(
self,
item: T | None = None,
*,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
**kwargs: object,
) -> None:
"""Initialize form screen."""
super().__init__(name=name, id=id, classes=classes)
# Ignore any additional kwargs to avoid type issues
self.item = item
self.is_edit_mode = item is not None
@override
def compose(self) -> ComposeResult:
"""Compose form layout."""
title = "Edit" if self.is_edit_mode else "Create"
yield Container(
Static(f"{title} {self.get_item_type()}", classes="form-title"),
self.create_form_fields(),
Container(
Button("💾 Save", id="save_btn", variant="success"),
Button("❌ Cancel", id="cancel_btn", variant="default"),
classes="form-actions",
),
classes="form-container",
)
def get_item_type(self) -> str:
"""Get item type name for title."""
raise NotImplementedError("Subclasses must implement get_item_type")
def create_form_fields(self) -> Container:
"""Create form input fields."""
raise NotImplementedError("Subclasses must implement create_form_fields")
def validate_form(self) -> tuple[bool, list[str]]:
"""Validate form data."""
raise NotImplementedError("Subclasses must implement validate_form")
def get_form_data(self) -> T:
"""Get item from form data."""
raise NotImplementedError("Subclasses must implement get_form_data")
def on_mount(self) -> None:
"""Initialize form."""
if self.is_edit_mode and self.item:
self.populate_form(self.item)
def populate_form(self, item: T) -> None:
"""Populate form with item data."""
raise NotImplementedError("Subclasses must implement populate_form")
def action_save(self) -> None:
"""Save form data."""
is_valid, errors = self.validate_form()
if is_valid:
try:
item = self.get_form_data()
self.dismiss(item)
except Exception as e:
self.show_validation_errors([str(e)])
else:
self.show_validation_errors(errors)
def show_validation_errors(self, errors: list[str]) -> None:
"""Show validation errors to user."""
# This would typically show a notification or update error display
pass
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "save_btn":
self.action_save()
elif event.button.id == "cancel_btn":
self.dismiss(None)
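
# Illustrative sketch (not part of the original module): a minimal CRUDScreen
# subclass wiring the abstract hooks together; the canned data stands in for a
# real storage_manager query.
class _DemoCollectionScreen(CRUDScreen[dict[str, str]]):
    def get_title(self) -> str:
        return "Collections"

    def get_table_columns(self) -> list[str]:
        return ["Name", "Backend"]

    def item_to_row(self, item: dict[str, str]) -> list[str]:
        return [item["name"], item["backend"]]

    async def load_items(self) -> list[dict[str, str]]:
        # A real subclass would query self.storage_manager here.
        return [{"name": "docs", "backend": "weaviate"}]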
</file>
<file path="ingest_pipeline/cli/tui/widgets/indicators.py">
"""Status indicators and progress bars with enhanced visual feedback."""
from typing import Any
from textual.app import ComposeResult
from textual.widgets import ProgressBar, Static
from typing_extensions import override
class StatusIndicator(Static):
"""Modern status indicator with color coding and animations."""
status: str
def __init__(self, status: str, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.status = status
self.update_status(status)
def update_status(self, status: str) -> None:
"""Update the status display with enhanced visual feedback."""
self.status = status
# Remove previous status classes
self.remove_class("status-active", "status-error", "status-warning", "pulse", "glow")
status_lower = status.lower()
        if (
            status_lower in {"active", "online", "connected", "✓ active"}
            or status_lower.endswith("active")
            or ("✓" in status_lower and "active" in status_lower)
        ):
self.add_class("status-active")
self.add_class("glow")
self.update(f"🟢 {status}")
elif status_lower in {"error", "failed", "offline", "disconnected"}:
self.add_class("status-error")
self.add_class("pulse")
self.update(f"🔴 {status}")
elif status_lower in {"warning", "pending", "in_progress"}:
self.add_class("status-warning")
self.add_class("pulse")
self.update(f"🟡 {status}")
elif status_lower in {"loading", "connecting"}:
self.add_class("shimmer")
self.update(f"🔄 {status}")
else:
self.update(f"⚪ {status}")
class EnhancedProgressBar(Static):
"""Enhanced progress bar with better visual feedback."""
total: int
progress: int
status_text: str
def __init__(self, total: int = 100, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.total = total
self.progress = 0
self.status_text = "Ready"
@override
def compose(self) -> ComposeResult:
yield Static("", id="progress_status", classes="progress-label")
yield ProgressBar(total=self.total, id="progress_bar", show_eta=True, classes="shimmer")
def update_progress(self, progress: int, status: str = "") -> None:
"""Update progress with enhanced feedback."""
self.progress = progress
if status:
self.status_text = status
# Update the progress bar
progress_bar = self.query_one("#progress_bar", ProgressBar)
progress_bar.update(progress=progress)
# Update status text with icons
status_display = self.query_one("#progress_status", Static)
if progress >= 100:
status_display.update(f"✅ {self.status_text}")
progress_bar.add_class("glow")
elif progress >= 75:
status_display.update(f"🔥 {self.status_text}")
elif progress >= 50:
status_display.update(f"⚡ {self.status_text}")
elif progress >= 25:
status_display.update(f"🔄 {self.status_text}")
else:
status_display.update(f"🚀 {self.status_text}")
</file>
<file path="ingest_pipeline/cli/tui/models.py">
"""Data models and TypedDict definitions for the TUI."""
from enum import IntEnum
from typing import TypedDict
class StorageCapabilities(IntEnum):
"""Storage backend capabilities (ordered by feature completeness)."""
NONE = 0
BASIC = 1 # Basic CRUD operations
VECTOR_SEARCH = 2 # Vector search capabilities
KNOWLEDGE_BASE = 3 # Knowledge base features
FULL_FEATURED = 4 # All features including chunks and entities
class CollectionInfo(TypedDict):
"""Information about a collection."""
name: str
type: str
count: int
backend: str | list[str] # Support both single backend and multi-backend
status: str
last_updated: str
size_mb: float
class DocumentInfo(TypedDict):
"""Information about a document."""
id: str
title: str
source_url: str
description: str
content_type: str
content_preview: str
word_count: int
timestamp: str
class ChunkInfo(TypedDict):
"""Information about a document chunk (R2R specific)."""
id: str
document_id: str
content: str
start_index: int
end_index: int
metadata: dict[str, object]
class EntityInfo(TypedDict):
"""Information about an extracted entity (R2R specific)."""
id: str
name: str
type: str
confidence: float
metadata: dict[str, object]
class FirecrawlOptions(TypedDict, total=False):
"""Advanced Firecrawl scraping options."""
# Scraping options
formats: list[str] # ["markdown", "html", "screenshot"]
only_main_content: bool
include_tags: list[str]
exclude_tags: list[str]
wait_for: int # milliseconds
# Mapping options
search: str | None
include_subdomains: bool
limit: int
max_depth: int
# Extraction options
extract_schema: dict[str, object] | None
extract_prompt: str | None
class IngestionConfig(TypedDict):
"""Configuration for ingestion operations."""
source_url: str
source_type: str # "web", "repository", "documentation"
target_collection: str
storage_backend: str
firecrawl_options: FirecrawlOptions
batch_size: int
max_concurrent: int
class SearchFilter(TypedDict, total=False):
"""Search filtering options."""
backends: list[str]
collections: list[str]
content_types: list[str]
date_range: tuple[str, str] | None
word_count_range: tuple[int, int] | None
similarity_threshold: float
class IngestionProgress(TypedDict):
"""Real-time ingestion progress information."""
total_urls: int
processed_urls: int
successful_ingestions: int
failed_ingestions: int
current_url: str
elapsed_time: float
estimated_remaining: float
errors: list[str]
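
# Illustrative example (not part of the original module): a typical option set
# for a shallow documentation crawl; values are made up.
_EXAMPLE_OPTIONS: FirecrawlOptions = {
    "formats": ["markdown"],
    "only_main_content": True,
    "limit": 50,
    "max_depth": 2,
}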
</file>
<file path="ingest_pipeline/ingestors/__init__.py">
"""Ingestors module for different data sources."""
from .base import BaseIngestor
from .firecrawl import FirecrawlIngestor, FirecrawlPage
from .repomix import RepomixIngestor
__all__ = [
"BaseIngestor",
"FirecrawlIngestor",
"FirecrawlPage",
"RepomixIngestor",
]
</file>
<file path="ingest_pipeline/storage/r2r/collections.py">
"""Comprehensive collection CRUD operations for R2R."""
from typing import TypedDict, cast
from uuid import UUID
from r2r import R2RAsyncClient
from ...core.exceptions import StorageError
# JSON serializable type for API responses
JsonData = dict[str, str | int | bool | None]
class DocumentAddResult(TypedDict, total=False):
"""Result of adding a document to a collection."""
document_id: str
added: bool
result: JsonData
error: str
class DocumentRemoveResult(TypedDict, total=False):
"""Result of removing a document from a collection."""
document_id: str
removed: bool
error: str
class ExportResult(TypedDict):
"""Result of a CSV export operation."""
exported: int
path: str
class R2RCollections:
"""Comprehensive collection management for R2R."""
client: R2RAsyncClient
def __init__(self, client: R2RAsyncClient) -> None:
"""Initialize collections manager with R2R client."""
self.client = client
async def create(self, name: str, description: str | None = None) -> JsonData:
"""Create a new collection in R2R.
Args:
name: Collection name
description: Optional collection description
Returns:
Created collection information
Raises:
StorageError: If collection creation fails
"""
try:
response = await self.client.collections.create(
name=name,
description=description,
)
# response.results is a list, not a model with model_dump()
return cast(JsonData, response.results if isinstance(response.results, list) else [])
except Exception as e:
raise StorageError(f"Failed to create collection '{name}': {e}") from e
async def retrieve(self, collection_id: str | UUID) -> JsonData:
"""Retrieve a collection by ID.
Args:
collection_id: Collection ID to retrieve
Returns:
Collection information
Raises:
StorageError: If collection retrieval fails
"""
try:
response = await self.client.collections.retrieve(str(collection_id))
# response.results is a list, not a model with model_dump()
return cast(JsonData, response.results if isinstance(response.results, list) else [])
except Exception as e:
raise StorageError(f"Failed to retrieve collection {collection_id}: {e}") from e
async def update(
self,
collection_id: str | UUID,
name: str | None = None,
description: str | None = None,
) -> JsonData:
"""Update collection metadata.
Args:
collection_id: Collection ID to update
name: New name (optional)
description: New description (optional)
Returns:
Updated collection information
Raises:
StorageError: If collection update fails
"""
try:
response = await self.client.collections.update(
id=str(collection_id),
name=name,
description=description,
)
# response.results is a list, not a model with model_dump()
return cast(JsonData, response.results if isinstance(response.results, list) else [])
except Exception as e:
raise StorageError(f"Failed to update collection {collection_id}: {e}") from e
async def delete(self, collection_id: str | UUID) -> bool:
"""Delete a collection by ID.
Args:
collection_id: Collection ID to delete
Returns:
True if deletion was successful
Raises:
StorageError: If collection deletion fails
"""
try:
_ = await self.client.collections.delete(str(collection_id))
return True
except Exception as e:
raise StorageError(f"Failed to delete collection {collection_id}: {e}") from e
async def list_all(
self, offset: int = 0, limit: int = 100, owner_only: bool = False
) -> JsonData:
"""List collections with pagination support.
Args:
offset: Starting offset for pagination
limit: Maximum number of collections to return
owner_only: Only return collections owned by current user
Returns:
Paginated list of collections
Raises:
StorageError: If collection listing fails
"""
try:
response = await self.client.collections.list(
offset=offset,
limit=limit,
owner_only=owner_only,
)
# response.results is a list, not a model with model_dump()
return cast(JsonData, response.results if isinstance(response.results, list) else [])
except Exception as e:
raise StorageError(f"Failed to list collections: {e}") from e
async def get_by_name(
self, collection_name: str, owner_id: str | UUID | None = None
) -> JsonData:
"""Get collection by name with optional owner filter.
Args:
collection_name: Name of the collection
owner_id: Optional owner ID filter
Returns:
Collection information
Raises:
StorageError: If collection retrieval fails
"""
        try:
            # List all collections and find by name
            collections_response = await self.client.collections.list()
        except Exception as e:
            raise StorageError(
                f"Failed to get collection by name '{collection_name}': {e}"
            ) from e
        for collection in collections_response.results:
            if (
                owner_id is None or str(collection.owner_id) == str(owner_id)
            ) and collection.name == collection_name:
                return cast(JsonData, collection.model_dump())
        raise StorageError(f"Collection '{collection_name}' not found")
async def add_document(self, collection_id: str | UUID, document_id: str | UUID) -> JsonData:
"""Associate a document with a collection.
Args:
collection_id: Collection ID
document_id: Document ID to add
Returns:
Association result
Raises:
StorageError: If document association fails
"""
try:
response = await self.client.collections.add_document(
id=str(collection_id),
document_id=str(document_id),
)
# response.results is a list, not a model with model_dump()
return cast(JsonData, response.results if isinstance(response.results, list) else [])
except Exception as e:
raise StorageError(
f"Failed to add document {document_id} to collection {collection_id}: {e}"
) from e
async def remove_document(self, collection_id: str | UUID, document_id: str | UUID) -> bool:
"""Remove document association from collection.
Args:
collection_id: Collection ID
document_id: Document ID to remove
Returns:
True if removal was successful
Raises:
StorageError: If document removal fails
"""
try:
await self.client.collections.remove_document(
id=str(collection_id),
document_id=str(document_id),
)
return True
except Exception as e:
raise StorageError(
f"Failed to remove document {document_id} from collection {collection_id}: {e}"
) from e
async def list_documents(
self, collection_id: str | UUID, offset: int = 0, limit: int = 100
) -> JsonData:
"""List all documents in a collection with pagination.
Args:
collection_id: Collection ID
offset: Starting offset for pagination
limit: Maximum number of documents to return
Returns:
Paginated list of documents in collection
Raises:
StorageError: If document listing fails
"""
try:
response = await self.client.collections.list_documents(
id=str(collection_id),
offset=offset,
limit=limit,
)
# response.results is a list, not a model with model_dump()
return cast(JsonData, response.results if isinstance(response.results, list) else [])
except Exception as e:
raise StorageError(
f"Failed to list documents in collection {collection_id}: {e}"
) from e
async def add_user(self, collection_id: str | UUID, user_id: str | UUID) -> JsonData:
"""Grant user access to a collection.
Args:
collection_id: Collection ID
user_id: User ID to grant access
Returns:
Access grant result
Raises:
StorageError: If user access grant fails
"""
try:
response = await self.client.collections.add_user(
id=str(collection_id),
user_id=str(user_id),
)
# response.results is a list, not a model with model_dump()
return cast(JsonData, response.results if isinstance(response.results, list) else [])
except Exception as e:
raise StorageError(
f"Failed to add user {user_id} to collection {collection_id}: {e}"
) from e
async def remove_user(self, collection_id: str | UUID, user_id: str | UUID) -> bool:
"""Revoke user access from a collection.
Args:
collection_id: Collection ID
user_id: User ID to revoke access
Returns:
True if revocation was successful
Raises:
StorageError: If user access revocation fails
"""
try:
await self.client.collections.remove_user(
id=str(collection_id),
user_id=str(user_id),
)
return True
except Exception as e:
raise StorageError(
f"Failed to remove user {user_id} from collection {collection_id}: {e}"
) from e
async def list_users(
self, collection_id: str | UUID, offset: int = 0, limit: int = 100
) -> JsonData:
"""List all users with access to a collection.
Args:
collection_id: Collection ID
offset: Starting offset for pagination
limit: Maximum number of users to return
Returns:
Paginated list of users with collection access
Raises:
StorageError: If user listing fails
"""
try:
response = await self.client.collections.list_users(
id=str(collection_id),
offset=offset,
limit=limit,
)
# response.results is a list, not a model with model_dump()
return cast(JsonData, response.results if isinstance(response.results, list) else [])
except Exception as e:
raise StorageError(f"Failed to list users for collection {collection_id}: {e}") from e
async def extract_entities(
self,
collection_id: str | UUID,
run_with_orchestration: bool = True,
settings: JsonData | None = None,
) -> JsonData:
"""Extract entities and relationships from collection documents.
Args:
collection_id: Collection ID
run_with_orchestration: Whether to run with orchestration
settings: Extraction configuration settings
Returns:
Extraction results
Raises:
StorageError: If entity extraction fails
"""
try:
response = await self.client.collections.extract(
id=str(collection_id),
run_with_orchestration=run_with_orchestration,
settings=cast(dict[str, object], settings or {}),
)
# response.results is a list, not a model with model_dump()
return cast(JsonData, response.results if isinstance(response.results, list) else [])
except Exception as e:
raise StorageError(
f"Failed to extract entities from collection {collection_id}: {e}"
) from e
async def export_to_csv(
self, output_path: str, columns: list[str] | None = None, include_header: bool = True
) -> ExportResult:
"""Export collections to CSV format.
Args:
output_path: Path for the exported CSV file
columns: Specific columns to export (optional)
include_header: Whether to include header row
Returns:
Export result information
Raises:
StorageError: If export fails
"""
# R2R SDK doesn't currently support collection export
# Implement a basic CSV export using list()
try:
import csv
from pathlib import Path
collections_response = await self.client.collections.list()
collections_data = [
{
"id": str(c.id),
"name": c.name,
"description": c.description or "",
"owner_id": str(c.owner_id) if hasattr(c, "owner_id") else "",
}
for c in collections_response.results
]
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", newline="", encoding="utf-8") as csvfile:
if not collections_data:
return {"exported": 0, "path": output_path}
fieldnames = columns or list(collections_data[0].keys())
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
if include_header:
writer.writeheader()
for collection in collections_data:
filtered_collection = {k: v for k, v in collection.items() if k in fieldnames}
writer.writerow(filtered_collection)
return {"exported": len(collections_data), "path": output_path}
except Exception as e:
raise StorageError(f"Failed to export collections: {e}") from e
async def batch_add_documents(
self, collection_id: str | UUID, document_ids: list[str | UUID]
) -> list[DocumentAddResult]:
"""Add multiple documents to a collection efficiently.
Args:
collection_id: Collection ID
document_ids: List of document IDs to add
Returns:
List of addition results
"""
results: list[DocumentAddResult] = []
for doc_id in document_ids:
try:
result = await self.add_document(collection_id, doc_id)
results.append({"document_id": str(doc_id), "added": True, "result": result})
except StorageError as e:
results.append({"document_id": str(doc_id), "added": False, "error": str(e)})
return results
async def batch_remove_documents(
self, collection_id: str | UUID, document_ids: list[str | UUID]
) -> list[DocumentRemoveResult]:
"""Remove multiple documents from a collection efficiently.
Args:
collection_id: Collection ID
document_ids: List of document IDs to remove
Returns:
List of removal results
"""
results: list[DocumentRemoveResult] = []
for doc_id in document_ids:
try:
success = await self.remove_document(collection_id, doc_id)
results.append({"document_id": str(doc_id), "removed": success})
except StorageError as e:
results.append({"document_id": str(doc_id), "removed": False, "error": str(e)})
return results
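
# Illustrative usage sketch (not part of the original module); the base URL is
# an assumption for a local R2R deployment.
async def _demo_create_collection() -> JsonData:
    client = R2RAsyncClient("http://localhost:7272")
    collections = R2RCollections(client)
    return await collections.create("docs", description="Documentation corpus")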
</file>
<file path="ingest_pipeline/storage/__init__.py">
"""Storage adapters for different backends."""
from typing import TYPE_CHECKING
from .base import BaseStorage
from .openwebui import OpenWebUIStorage
from .weaviate import WeaviateStorage
if TYPE_CHECKING:
from .r2r import R2RStorage as _R2RStorage
try:
from .r2r.storage import R2RStorage as _RuntimeR2RStorage
R2RStorage: type[BaseStorage] | None = _RuntimeR2RStorage
except ImportError:
R2RStorage = None
__all__ = [
"BaseStorage",
"WeaviateStorage",
"OpenWebUIStorage",
"R2RStorage",
"_R2RStorage",
]
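
# Illustrative guard (not part of the original module): because the R2R
# dependency is optional, check for None before instantiating the backend.
#
#     if R2RStorage is not None:
#         storage = R2RStorage(...)  # constructor arguments omitted here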
</file>
<file path="ingest_pipeline/cli/tui/screens/ingestion.py">
"""Enhanced ingestion screen with multi-storage support."""
from __future__ import annotations
from typing import TYPE_CHECKING, cast
from textual import work
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Horizontal
from textual.screen import ModalScreen
from textual.widgets import Button, Checkbox, Input, Label, LoadingIndicator, Rule, Static
from typing_extensions import override
from ....core.models import IngestionResult, IngestionSource, StorageBackend
from ....flows.ingestion import create_ingestion_flow
from ..models import CollectionInfo
from ..utils.storage_manager import StorageManager
from ..widgets import EnhancedProgressBar
if TYPE_CHECKING:
from ..app import CollectionManagementApp
BACKEND_ORDER: tuple[StorageBackend, ...] = (
StorageBackend.WEAVIATE,
StorageBackend.OPEN_WEBUI,
StorageBackend.R2R,
)
BACKEND_LABELS: dict[StorageBackend, str] = {
StorageBackend.WEAVIATE: "🗄️ Weaviate",
StorageBackend.OPEN_WEBUI: "🌐 OpenWebUI",
StorageBackend.R2R: "🧠 R2R",
}
class IngestionScreen(ModalScreen[None]):
"""Modern ingestion screen with multi-backend fan-out."""
collection: CollectionInfo
storage_manager: StorageManager
selected_type: IngestionSource
progress_value: int
available_backends: list[StorageBackend]
selected_backends: list[StorageBackend]
BINDINGS = [
Binding("escape", "app.pop_screen", "Cancel"),
Binding("ctrl+i", "start_ingestion", "Start"),
Binding("1", "select_web", "Web", show=False),
Binding("2", "select_repo", "Repository", show=False),
Binding("3", "select_docs", "Documentation", show=False),
Binding("enter", "start_ingestion", "Start Ingestion"),
Binding("tab", "focus_next", "Next Field"),
Binding("shift+tab", "focus_previous", "Previous Field"),
]
def __init__(self, collection: CollectionInfo, storage_manager: StorageManager) -> None:
super().__init__()
self.collection = collection
self.storage_manager = storage_manager
self.selected_type = IngestionSource.WEB
self.progress_value = 0
self.available_backends = list(storage_manager.get_available_backends())
if not self.available_backends:
raise ValueError("No storage backends are available for ingestion")
self.selected_backends = self._derive_initial_backends()
@override
def compose(self) -> ComposeResult:
target_name = self.collection["name"]
backend_info = self.collection["backend"]
# Format backend label for display
if isinstance(backend_info, list):
# Ensure all elements are strings for safe joining
backend_strings = [str(b) for b in backend_info if b is not None]
target_backend_label = " + ".join(backend_strings) if backend_strings else "unknown"
else:
target_backend_label = str(backend_info) if backend_info is not None else "unknown"
with Container(classes="modal-container"):
yield Static("📥 Modern Ingestion Interface", classes="title")
yield Static(
f"Target: {target_name} ({target_backend_label})",
classes="subtitle",
)
yield Rule()
yield Container(
Label("🌐 Source URL:", classes="input-label"),
Input(
placeholder="https://docs.example.com or file:///path/to/repo",
id="url_input",
classes="modern-input",
),
Label("📝 Collection Name:", classes="input-label"),
Input(
placeholder="Enter collection name (or leave empty to auto-generate)",
id="collection_input",
classes="modern-input",
value=self.collection.get("name", ""),
),
Label("📋 Source Type (Press 1/2/3):", classes="input-label"),
Horizontal(
Button("🌐 Web (1)", id="web_btn", variant="primary", classes="type-button"),
Button(
"📦 Repository (2)", id="repo_btn", variant="default", classes="type-button"
),
Button(
"📖 Documentation (3)",
id="docs_btn",
variant="default",
classes="type-button",
),
classes="type_buttons",
),
Rule(line_style="dashed"),
Label(
f"🗄️ Target Storages ({len(self.available_backends)} available):",
classes="input-label",
id="backend_label",
),
Container(
*self._create_backend_checkbox_widgets(),
classes="backend-selection",
),
Container(
Button("Select All Storages", id="select_all_backends", variant="default"),
Button("Clear Selection", id="clear_backends", variant="default"),
classes="backend-actions",
),
Static("📋 Selected: None", id="selection_status", classes="selection-status"),
classes="input-section card",
)
yield Container(
Label("🔄 Progress:", classes="progress-label"),
EnhancedProgressBar(id="enhanced_progress", total=100),
Static("Ready to start", id="progress_text", classes="status-text"),
classes="progress-section card",
)
yield Horizontal(
Button("🚀 Start Ingestion", id="start_btn", variant="success"),
Button("❌ Cancel", id="cancel_btn", variant="error"),
classes="action_buttons",
)
yield LoadingIndicator(id="loading", classes="pulse")
def _create_backend_checkbox_widgets(self) -> list[Checkbox]:
"""Create checkbox widgets for each available backend."""
checkboxes: list[Checkbox] = [
Checkbox(
BACKEND_LABELS.get(backend, backend.value),
value=backend in self.selected_backends,
id=f"backend_{backend.value}",
)
for backend in BACKEND_ORDER
if backend in self.available_backends
]
return checkboxes
def on_mount(self) -> None:
"""Initialize the screen state once widgets exist."""
self.query_one("#loading").display = False
self.query_one("#url_input", Input).focus()
self._set_backend_selection(self.selected_backends)
self._update_selection_status()
def action_select_web(self) -> None:
self.selected_type = IngestionSource.WEB
self._update_type_buttons("web")
def action_select_repo(self) -> None:
self.selected_type = IngestionSource.REPOSITORY
self._update_type_buttons("repo")
def action_select_docs(self) -> None:
self.selected_type = IngestionSource.DOCUMENTATION
self._update_type_buttons("docs")
def _update_type_buttons(self, selected: str) -> None:
buttons = {
"web": self.query_one("#web_btn", Button),
"repo": self.query_one("#repo_btn", Button),
"docs": self.query_one("#docs_btn", Button),
}
for kind, button in buttons.items():
button.variant = "primary" if kind == selected else "default"
def on_button_pressed(self, event: Button.Pressed) -> None:
button_id = event.button.id
if button_id == "web_btn":
self.action_select_web()
elif button_id == "repo_btn":
self.action_select_repo()
elif button_id == "docs_btn":
self.action_select_docs()
elif button_id == "select_all_backends":
self._set_backend_selection(self.available_backends)
self._update_selection_status()
elif button_id == "clear_backends":
self._set_backend_selection([])
self._update_selection_status()
elif button_id == "start_btn":
self.action_start_ingestion()
elif button_id == "cancel_btn":
self.app.pop_screen()
def on_checkbox_changed(self, event: Checkbox.Changed) -> None:
"""Handle checkbox state changes for backend selection."""
if event.checkbox.id and event.checkbox.id.startswith("backend_"):
# Update the selected backends list based on current checkbox states
self.selected_backends = self._resolve_selected_backends()
self._update_selection_status()
def on_input_submitted(self, event: Input.Submitted) -> None:
if event.input.id in ("url_input", "collection_input"):
self.action_start_ingestion()
def action_start_ingestion(self) -> None:
url_input = self.query_one("#url_input", Input)
collection_input = self.query_one("#collection_input", Input)
source_url = url_input.value.strip()
collection_name = collection_input.value.strip()
if not source_url:
cast("CollectionManagementApp", self.app).safe_notify(
"🔍 Please enter a source URL", severity="error"
)
url_input.focus()
return
# Validate URL format
if not self._validate_url(source_url):
cast("CollectionManagementApp", self.app).safe_notify(
"❌ Invalid URL format. Please enter a valid HTTP/HTTPS URL or file:// path",
severity="error",
)
url_input.focus()
return
resolved_backends = self._resolve_selected_backends()
if not resolved_backends:
cast("CollectionManagementApp", self.app).safe_notify(
"⚠️ Select at least one storage backend", severity="warning"
)
return
self.selected_backends = resolved_backends
self.perform_ingestion(source_url, collection_name)
def _validate_url(self, url: str) -> bool:
"""Validate URL format for security."""
if not url:
return False
# Basic URL validation
url_lower = url.lower()
# Allow HTTP/HTTPS URLs
if url_lower.startswith(("http://", "https://")):
# Additional validation could be added here
return True
# Allow file:// URLs for repository paths
if url_lower.startswith("file://"):
return True
# Allow local file paths that look like repositories
return "/" in url and not url_lower.startswith(("javascript:", "data:", "vbscript:"))
def _resolve_selected_backends(self) -> list[StorageBackend]:
selected: list[StorageBackend] = []
for backend in BACKEND_ORDER:
if backend not in self.available_backends:
continue
checkbox_id = f"#backend_{backend.value}"
checkbox = self.query_one(checkbox_id, Checkbox)
if checkbox.value:
selected.append(backend)
return selected
def _set_backend_selection(self, backends: list[StorageBackend]) -> None:
normalized = [backend for backend in BACKEND_ORDER if backend in backends]
for backend in BACKEND_ORDER:
if backend not in self.available_backends:
continue
checkbox_id = f"#backend_{backend.value}"
checkbox = self.query_one(checkbox_id, Checkbox)
checkbox.value = backend in normalized
self.selected_backends = normalized
def _update_selection_status(self) -> None:
"""Update the visual indicator showing current storage selection."""
try:
status_widget = self.query_one("#selection_status", Static)
if not self.selected_backends:
status_widget.update("📋 Selected: None")
elif len(self.selected_backends) == 1:
backend_name = BACKEND_LABELS.get(
self.selected_backends[0], self.selected_backends[0].value
)
status_widget.update(f"📋 Selected: {backend_name}")
else:
# Multiple backends selected
backend_names = [
BACKEND_LABELS.get(backend, backend.value) for backend in self.selected_backends
]
if len(backend_names) <= 3:
# Show all names if 3 or fewer
names_str = ", ".join(backend_names)
status_widget.update(f"📋 Selected: {names_str}")
else:
# Show count if more than 3
status_widget.update(f"📋 Selected: {len(self.selected_backends)} backends")
except Exception:
# Widget might not exist yet during initialization
pass
def _derive_initial_backends(self) -> list[StorageBackend]:
backend_info = self.collection.get("backend", "")
# Handle both single backend (str) and multi-backend (list[str])
if isinstance(backend_info, list):
# Multi-backend: try to match all backends
matched_backends = []
for backend_name in backend_info:
backend_name_lower = backend_name.lower()
for backend in BACKEND_ORDER:
if backend not in self.available_backends:
continue
if (
backend.value.lower() == backend_name_lower
or backend.name.lower() == backend_name_lower
):
matched_backends.append(backend)
break
return matched_backends or [self.available_backends[0]]
else:
# Single backend: original logic
backend_label = str(backend_info).lower()
for backend in BACKEND_ORDER:
if backend not in self.available_backends:
continue
if backend.value in backend_label or backend.name.lower() in backend_label:
return [backend]
return [self.available_backends[0]]
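    # The collection mapping may carry a single backend label or a list of them;
    # e.g. (hypothetical values):
    #
    #   {"backend": "weaviate"}          -> [StorageBackend.WEAVIATE]
    #   {"backend": ["weaviate", "r2r"]} -> [StorageBackend.WEAVIATE, StorageBackend.R2R]
    #
    # assuming those backends are in self.available_backends; otherwise the first
    # available backend is used as a fallback.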
@work(exclusive=True, thread=True)
def perform_ingestion(self, source_url: str, collection_name: str = "") -> None:
        import asyncio
backends = self._resolve_selected_backends()
self.selected_backends = backends
def update_ui(action: str) -> None:
def _update() -> None:
try:
loading = self.query_one("#loading")
if action == "show_loading":
loading.display = True
elif action == "hide_loading":
loading.display = False
except Exception:
pass
cast("CollectionManagementApp", self.app).call_from_thread(_update)
def progress_reporter(percent: int, message: str) -> None:
def _update_progress() -> None:
try:
progress = self.query_one("#enhanced_progress", EnhancedProgressBar)
progress_text = self.query_one("#progress_text", Static)
progress.update_progress(percent, message)
progress_text.update(message)
except Exception:
pass
cast("CollectionManagementApp", self.app).call_from_thread(_update_progress)
try:
update_ui("show_loading")
progress_reporter(5, "🚀 Starting Prefect flows...")
# Use user-provided collection name or fall back to default
final_collection_name = collection_name or self.collection.get("name")
total_successful = 0
total_failed = 0
flow_errors: list[str] = []
for i, backend in enumerate(backends):
progress_percent = 20 + (60 * i) // len(backends)
progress_reporter(
progress_percent,
f"🔗 Processing {backend.value} backend ({i + 1}/{len(backends)})...",
)
try:
                    # Run the Prefect flow for this backend via asyncio.run with a timeout
                    # (asyncio is already imported at the top of this method)
async def run_flow_with_timeout(
current_backend: StorageBackend = backend,
) -> IngestionResult:
return await asyncio.wait_for(
create_ingestion_flow(
source_url=source_url,
source_type=self.selected_type,
storage_backend=current_backend,
collection_name=final_collection_name,
progress_callback=progress_reporter,
),
timeout=600.0, # 10 minute timeout
)
result = asyncio.run(run_flow_with_timeout())
total_successful += result.documents_processed
total_failed += result.documents_failed
if result.error_messages:
flow_errors.extend(
[f"{backend.value}: {err}" for err in result.error_messages]
)
except TimeoutError:
error_msg = f"{backend.value}: Timeout after 10 minutes"
flow_errors.append(error_msg)
progress_reporter(0, f"❌ {backend.value} timed out")
def notify_timeout(
msg: str = f"⏰ {backend.value} flow timed out after 10 minutes",
) -> None:
try:
self.notify(msg, severity="error", markup=False)
except Exception:
pass
cast("CollectionManagementApp", self.app).call_from_thread(notify_timeout)
except Exception as exc:
flow_errors.append(f"{backend.value}: {exc}")
def notify_error(msg: str = f"❌ {backend.value} flow failed: {exc}") -> None:
try:
self.notify(msg, severity="error", markup=False)
except Exception:
pass
cast("CollectionManagementApp", self.app).call_from_thread(notify_error)
            successful = total_successful
            failed = total_failed
            if flow_errors and successful == 0:
                progress_reporter(100, "⚠️ Completed with errors")
            else:
                progress_reporter(100, "🎉 Completed successfully!")
def notify_results() -> None:
try:
if successful > 0:
self.notify(
f"🎉 Successfully ingested {successful} documents across {len(backends)} backend(s) via Prefect!",
severity="information",
)
if failed > 0:
self.notify(f"⚠️ {failed} documents failed to process", severity="warning")
if flow_errors:
for error in flow_errors:
self.notify(f"⚠️ {error}", severity="warning", markup=False)
except Exception:
pass
cast("CollectionManagementApp", self.app).call_from_thread(notify_results)
def _pop() -> None:
try:
self.app.pop_screen()
except Exception:
pass
# Schedule screen pop via timer instead of blocking
cast("CollectionManagementApp", self.app).call_from_thread(
lambda: self.app.set_timer(2.0, _pop)
)
except Exception as exc: # pragma: no cover - defensive
progress_reporter(0, f"❌ Prefect flows error: {exc}")
def notify_error(msg: str = f"❌ Prefect flows failed: {exc}") -> None:
try:
self.notify(msg, severity="error")
except Exception:
pass
cast("CollectionManagementApp", self.app).call_from_thread(notify_error)
def _pop_on_error() -> None:
try:
self.app.pop_screen()
except Exception:
pass
# Schedule screen pop via timer for error case too
cast("CollectionManagementApp", self.app).call_from_thread(
lambda: self.app.set_timer(2.0, _pop_on_error)
)
finally:
update_ui("hide_loading")
</file>
<file path="ingest_pipeline/cli/tui/screens/search.py">
"""Search screen for finding documents within collections."""
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container
from textual.screen import Screen
from textual.widgets import Button, Footer, Header, Input, LoadingIndicator, Static
from typing_extensions import override
from ....storage.openwebui import OpenWebUIStorage
from ....storage.weaviate import WeaviateStorage
from ..models import CollectionInfo
from ..widgets import EnhancedDataTable
class SearchScreen(Screen[None]):
"""Screen for searching within a collection with enhanced keyboard navigation."""
collection: CollectionInfo
weaviate: WeaviateStorage | None
openwebui: OpenWebUIStorage | None
BINDINGS = [
Binding("escape", "app.pop_screen", "Back"),
Binding("enter", "perform_search", "Search"),
Binding("ctrl+f", "focus_search", "Focus Search"),
Binding("f3", "perform_search", "Search Again"),
Binding("ctrl+r", "clear_results", "Clear Results"),
Binding("/", "focus_search", "Quick Search"),
]
def __init__(
self,
collection: CollectionInfo,
weaviate: WeaviateStorage | None,
openwebui: OpenWebUIStorage | None,
):
super().__init__()
self.collection = collection
self.weaviate = weaviate
self.openwebui = openwebui
@override
def compose(self) -> ComposeResult:
yield Header()
# Check if search is supported for this backend
backends = self.collection["backend"]
if isinstance(backends, str):
backends = [backends]
search_supported = "weaviate" in backends
search_indicator = "✅ Search supported" if search_supported else "❌ Search not supported"
yield Container(
Static(
f"🔍 Search in: {self.collection['name']} ({', '.join(backends)}) - {search_indicator}",
classes="title",
),
Static(
"Press / or Ctrl+F to focus search, Enter to search"
if search_supported
else "Search functionality not available for this backend",
classes="subtitle",
),
Input(placeholder="Enter search query... (press Enter to search)", id="search_input"),
Button("🔍 Search", id="search_btn", variant="primary"),
Button("🗑️ Clear Results", id="clear_btn", variant="default"),
EnhancedDataTable(id="results_table"),
Static(
"Enter your search query to find relevant documents.",
id="search_status",
classes="status-text",
),
LoadingIndicator(id="loading"),
classes="main_container",
)
yield Footer()
def on_mount(self) -> None:
"""Initialize the screen."""
self.query_one("#loading").display = False
# Setup results table with enhanced metadata
table = self.query_one("#results_table", EnhancedDataTable)
table.add_columns("Title", "Source URL", "Type", "Content Preview", "Words", "Score")
# Focus search input
self.query_one("#search_input").focus()
def action_focus_search(self) -> None:
"""Focus the search input field."""
search_input = self.query_one("#search_input", Input)
search_input.focus()
def action_clear_results(self) -> None:
"""Clear search results."""
table = self.query_one("#results_table", EnhancedDataTable)
        table.clear(columns=True)
        table.add_columns("Title", "Source URL", "Type", "Content Preview", "Words", "Score")
status = self.query_one("#search_status", Static)
status.update("Search results cleared. Enter a new query to search.")
def on_input_submitted(self, event: Input.Submitted) -> None:
"""Handle search input submission."""
if event.input.id == "search_input":
self.action_perform_search()
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "search_btn":
self.action_perform_search()
elif event.button.id == "clear_btn":
self.action_clear_results()
def action_perform_search(self) -> None:
"""Perform search."""
search_input = self.query_one("#search_input", Input)
if not search_input.value.strip():
self.notify("Please enter a search query", severity="warning")
search_input.focus()
return
self.run_worker(self.search_collection(search_input.value.strip()))
async def search_collection(self, query: str) -> None:
"""Search the collection."""
loading = self.query_one("#loading", LoadingIndicator)
table = self.query_one("#results_table", EnhancedDataTable)
status = self.query_one("#search_status", Static)
try:
self._setup_search_ui(loading, table, status, query)
results = await self._execute_search(query)
self._populate_results_table(table, results)
self._update_search_status(status, query, results, table)
except Exception as e:
status.update(f"Search error: {e}")
self.notify(f"Search error: {e}", severity="error", markup=False)
finally:
loading.display = False
def _setup_search_ui(
self, loading: LoadingIndicator, table: EnhancedDataTable, status: Static, query: str
) -> None:
"""Setup the search UI elements."""
loading.display = True
status.update(f"🔍 Searching for '{query}'...")
        table.clear(columns=True)
        table.add_columns("Title", "Source URL", "Type", "Content Preview", "Words", "Score")
async def _execute_search(self, query: str) -> list[dict[str, str | float]]:
"""Execute the search based on collection type."""
if self.collection["type"] == "weaviate" and self.weaviate:
return await self.search_weaviate(query)
elif self.collection["type"] == "openwebui" and self.openwebui:
# OpenWebUI search is not yet implemented
self.notify("Search not supported for OpenWebUI collections", severity="warning")
return []
elif self.collection["type"] == "r2r":
# R2R search would go here when implemented
self.notify("Search not supported for R2R collections", severity="warning")
return []
return []
def _populate_results_table(
self, table: EnhancedDataTable, results: list[dict[str, str | float]]
) -> None:
"""Populate the results table with search results."""
for result in results:
row_data = self._format_result_row(result)
table.add_row(*row_data)
def _format_result_row(self, result: dict[str, str | float]) -> tuple[str, ...]:
"""Format a single result row for the table."""
title = self._truncate_text(result.get("title", "Untitled"), 30)
source_url = self._truncate_text(result.get("source_url", ""), 40)
type_display = self._format_content_type(result.get("content_type", "text/plain"))
content_preview = self._format_content_preview(result.get("content", ""))
word_count = str(result.get("word_count", 0))
score_display = self._format_score(result.get("score"))
return (title, source_url, type_display, content_preview, word_count, score_display)
def _truncate_text(self, text: str | float | None, max_length: int) -> str:
"""Truncate text to specified length."""
if not isinstance(text, str):
text = str(text) if text is not None else ""
return text[:max_length]
def _format_content_type(self, content_type: str | float) -> str:
"""Format content type with appropriate icon."""
content_type = str(content_type).lower()
if "markdown" in content_type:
return "📝 md"
elif "html" in content_type:
return "🌐 html"
elif "text" in content_type:
return "📄 txt"
else:
return f"📄 {content_type.split('/')[-1][:5]}"
def _format_content_preview(self, content: str | float) -> str:
"""Format content preview with truncation."""
if not isinstance(content, str):
content = str(content) if content is not None else ""
return f"{content[:60]}..." if len(content) > 60 else content
def _format_score(self, score: object) -> str:
"""Format search score for display."""
if isinstance(score, (int, float)):
return f"{score:.3f}"
elif score is None:
return "-"
else:
return str(score)
def _update_search_status(
self,
status: Static,
query: str,
results: list[dict[str, str | float]],
table: EnhancedDataTable,
) -> None:
"""Update search status and notifications based on results."""
if not results:
status.update(f"No results found for '{query}'. Try different keywords.")
self.notify("No results found", severity="information")
else:
status.update(
f"Found {len(results)} results for '{query}'. Use arrow keys to navigate."
)
self.notify(f"Found {len(results)} results", severity="information")
table.focus()
async def search_weaviate(self, query: str) -> list[dict[str, str | float]]:
"""Search Weaviate collection."""
if not self.weaviate:
return []
try:
await self.weaviate.initialize()
# Use the search_documents method which returns more metadata
results = await self.weaviate.search_documents(
query,
limit=20,
collection_name=self.collection["name"],
)
# Convert Document objects to dict format expected by the UI
formatted_results = []
for doc in results:
metadata = getattr(doc, "metadata", {})
score_value: float | None = None
raw_score = getattr(doc, "score", None)
if raw_score is not None:
try:
score_value = float(raw_score)
except (TypeError, ValueError):
score_value = None
formatted_results.append(
{
"title": metadata.get("title", "Untitled"),
"source_url": metadata.get("source_url", ""),
"content_type": metadata.get("content_type", "text/plain"),
"content": getattr(doc, "content", ""),
"word_count": metadata.get("word_count", 0),
"score": score_value if score_value is not None else 0.0,
}
)
return formatted_results
except Exception as e:
self.notify(f"Weaviate search error: {e}", severity="error", markup=False)
return []
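    # Each formatted result is a plain dict consumed by _format_result_row,
    # e.g. (hypothetical values):
    #
    #   {
    #       "title": "Getting Started",
    #       "source_url": "https://docs.example.com/start",
    #       "content_type": "text/markdown",
    #       "content": "Welcome to ...",
    #       "word_count": 812,
    #       "score": 0.734,
    #   }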
async def search_openwebui(self, query: str) -> list[dict[str, str | float]]:
"""Search OpenWebUI collection."""
if not self.openwebui:
return []
try:
# OpenWebUI does not have a direct search API, so return empty
# In a real implementation, you would need to implement search via their API
self.notify("OpenWebUI search not yet implemented", severity="warning")
return []
except Exception as e:
self.notify(f"OpenWebUI search error: {e}", severity="error", markup=False)
return []
</file>
<file path="ingest_pipeline/cli/tui/utils/storage_manager.py">
"""Storage management utilities for TUI applications."""
from __future__ import annotations
import asyncio
import logging
from collections.abc import AsyncGenerator, Coroutine, Sequence
from typing import TYPE_CHECKING, Protocol
from pydantic import SecretStr
from ....core.exceptions import StorageError
from ....core.models import Document, StorageBackend, StorageConfig
from ....storage.base import BaseStorage
from ....storage.openwebui import OpenWebUIStorage
from ....storage.r2r.storage import R2RStorage
from ....storage.weaviate import WeaviateStorage
from ..models import CollectionInfo, StorageCapabilities
if TYPE_CHECKING:
from ....config.settings import Settings
class StorageBackendProtocol(Protocol):
"""Protocol defining storage backend interface."""
async def initialize(self) -> None: ...
async def count(self, *, collection_name: str | None = None) -> int: ...
async def list_collections(self) -> list[str]: ...
async def search(
self,
query: str,
limit: int = 10,
threshold: float = 0.7,
*,
collection_name: str | None = None,
) -> AsyncGenerator[Document, None]: ...
async def close(self) -> None: ...
class MultiStorageAdapter(BaseStorage):
"""Mirror writes to multiple storage backends."""
def __init__(self, storages: Sequence[BaseStorage]) -> None:
if not storages:
raise ValueError("MultiStorageAdapter requires at least one storage backend")
unique: list[BaseStorage] = []
seen_ids: set[int] = set()
for storage in storages:
storage_id = id(storage)
if storage_id in seen_ids:
continue
seen_ids.add(storage_id)
unique.append(storage)
self._storages: list[BaseStorage] = unique
self._primary: BaseStorage = unique[0]
super().__init__(self._primary.config)
async def initialize(self) -> None:
for storage in self._storages:
await storage.initialize()
async def store(self, document: Document, *, collection_name: str | None = None) -> str:
# Store in primary backend first
primary_id: str = await self._primary.store(document, collection_name=collection_name)
# Replicate to secondary backends concurrently
if len(self._storages) > 1:
async def replicate_to_backend(
storage: BaseStorage,
) -> tuple[BaseStorage, bool, Exception | None]:
try:
await storage.store(document, collection_name=collection_name)
return storage, True, None
except Exception as exc:
return storage, False, exc
tasks = [replicate_to_backend(storage) for storage in self._storages[1:]]
results = await asyncio.gather(*tasks, return_exceptions=True)
failures: list[str] = []
errors: list[Exception] = []
for result in results:
if isinstance(result, tuple):
storage, success, error = result
if not success and error is not None:
failures.append(self._format_backend_label(storage))
errors.append(error)
elif isinstance(result, Exception):
failures.append("unknown")
errors.append(result)
if failures:
backends = ", ".join(failures)
primary_error = errors[0] if errors else Exception("Unknown replication error")
raise StorageError(
f"Document stored in primary backend but replication failed for: {backends}"
) from primary_error
return primary_id
async def store_batch(
self, documents: list[Document], *, collection_name: str | None = None
) -> list[str]:
# Store in primary backend first
primary_ids: list[str] = await self._primary.store_batch(
documents, collection_name=collection_name
)
# Replicate to secondary backends concurrently
if len(self._storages) > 1:
async def replicate_batch_to_backend(
storage: BaseStorage,
) -> tuple[BaseStorage, bool, Exception | None]:
try:
await storage.store_batch(documents, collection_name=collection_name)
return storage, True, None
except Exception as exc:
return storage, False, exc
tasks = [replicate_batch_to_backend(storage) for storage in self._storages[1:]]
results = await asyncio.gather(*tasks, return_exceptions=True)
failures: list[str] = []
errors: list[Exception] = []
for result in results:
if isinstance(result, tuple):
storage, success, error = result
if not success and error is not None:
failures.append(self._format_backend_label(storage))
errors.append(error)
elif isinstance(result, Exception):
failures.append("unknown")
errors.append(result)
if failures:
backends = ", ".join(failures)
primary_error = (
errors[0] if errors else Exception("Unknown batch replication error")
)
raise StorageError(
f"Batch stored in primary backend but replication failed for: {backends}"
) from primary_error
return primary_ids
async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool:
# Delete from primary backend first
primary_deleted: bool = await self._primary.delete(
document_id, collection_name=collection_name
)
# Delete from secondary backends concurrently
if len(self._storages) > 1:
async def delete_from_backend(
storage: BaseStorage,
) -> tuple[BaseStorage, bool, Exception | None]:
try:
await storage.delete(document_id, collection_name=collection_name)
return storage, True, None
except Exception as exc:
return storage, False, exc
tasks = [delete_from_backend(storage) for storage in self._storages[1:]]
results = await asyncio.gather(*tasks, return_exceptions=True)
failures: list[str] = []
errors: list[Exception] = []
for result in results:
if isinstance(result, tuple):
storage, success, error = result
if not success and error is not None:
failures.append(self._format_backend_label(storage))
errors.append(error)
elif isinstance(result, Exception):
failures.append("unknown")
errors.append(result)
if failures:
backends = ", ".join(failures)
primary_error = errors[0] if errors else Exception("Unknown deletion error")
raise StorageError(
f"Document deleted from primary backend but failed for: {backends}"
) from primary_error
return primary_deleted
async def count(self, *, collection_name: str | None = None) -> int:
count_result: int = await self._primary.count(collection_name=collection_name)
return count_result
async def list_collections(self) -> list[str]:
list_fn = getattr(self._primary, "list_collections", None)
if list_fn is None:
return []
collections_result: list[str] = await list_fn()
return collections_result
async def search(
self,
query: str,
limit: int = 10,
threshold: float = 0.7,
*,
collection_name: str | None = None,
) -> AsyncGenerator[Document, None]:
async for item in self._primary.search(
query,
limit=limit,
threshold=threshold,
collection_name=collection_name,
):
yield item
async def close(self) -> None:
for storage in self._storages:
close_fn = getattr(storage, "close", None)
if close_fn is not None:
await close_fn()
def _format_backend_label(self, storage: BaseStorage) -> str:
backend = getattr(storage.config, "backend", None)
if isinstance(backend, StorageBackend):
backend_value: str = backend.value
return backend_value
class_name: str = storage.__class__.__name__
return class_name
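# A minimal usage sketch of MultiStorageAdapter, assuming two already-initialized
# backends (the variable names are illustrative):
#
#   adapter = MultiStorageAdapter([weaviate_storage, r2r_storage])
#   await adapter.initialize()
#   doc_id = await adapter.store(document, collection_name="docs")
#
# Writes go to the first (primary) backend and are then replicated to the rest;
# reads (count, list_collections, search) are served by the primary only.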
class StorageManager:
"""Centralized manager for all storage backend operations."""
def __init__(self, settings: Settings) -> None:
"""Initialize storage manager with application settings."""
self.settings: Settings = settings
self.backends: dict[StorageBackend, BaseStorage] = {}
self.capabilities: dict[StorageBackend, StorageCapabilities] = {}
self._initialized: bool = False
async def initialize_all_backends(self) -> dict[StorageBackend, bool]:
"""Initialize all available storage backends with timeout protection."""
results: dict[StorageBackend, bool] = {}
async def init_backend(
backend_type: StorageBackend, config: StorageConfig, storage_class: type[BaseStorage]
) -> bool:
"""Initialize a single backend with timeout."""
try:
storage = storage_class(config)
await asyncio.wait_for(storage.initialize(), timeout=30.0)
self.backends[backend_type] = storage
if backend_type == StorageBackend.WEAVIATE:
self.capabilities[backend_type] = StorageCapabilities.VECTOR_SEARCH
elif backend_type == StorageBackend.OPEN_WEBUI:
self.capabilities[backend_type] = StorageCapabilities.KNOWLEDGE_BASE
elif backend_type == StorageBackend.R2R:
self.capabilities[backend_type] = StorageCapabilities.FULL_FEATURED
return True
            except Exception:  # Exception already covers TimeoutError from wait_for
                return False
# Initialize backends concurrently with timeout protection
tasks: list[tuple[StorageBackend, Coroutine[None, None, bool]]] = []
# Try Weaviate
if self.settings.weaviate_endpoint:
config = StorageConfig(
backend=StorageBackend.WEAVIATE,
endpoint=self.settings.weaviate_endpoint,
api_key=SecretStr(self.settings.weaviate_api_key)
if self.settings.weaviate_api_key
else None,
collection_name="default",
)
tasks.append(
(
StorageBackend.WEAVIATE,
init_backend(StorageBackend.WEAVIATE, config, WeaviateStorage),
)
)
else:
results[StorageBackend.WEAVIATE] = False
# Try OpenWebUI
if self.settings.openwebui_endpoint and self.settings.openwebui_api_key:
config = StorageConfig(
backend=StorageBackend.OPEN_WEBUI,
endpoint=self.settings.openwebui_endpoint,
api_key=SecretStr(self.settings.openwebui_api_key)
if self.settings.openwebui_api_key
else None,
collection_name="default",
)
tasks.append(
(
StorageBackend.OPEN_WEBUI,
init_backend(StorageBackend.OPEN_WEBUI, config, OpenWebUIStorage),
)
)
else:
results[StorageBackend.OPEN_WEBUI] = False
# Try R2R
if self.settings.r2r_endpoint:
config = StorageConfig(
backend=StorageBackend.R2R,
endpoint=self.settings.r2r_endpoint,
api_key=SecretStr(self.settings.r2r_api_key) if self.settings.r2r_api_key else None,
collection_name="default",
)
tasks.append((StorageBackend.R2R, init_backend(StorageBackend.R2R, config, R2RStorage)))
else:
results[StorageBackend.R2R] = False
# Execute initialization tasks concurrently
if tasks:
backend_types, task_coroutines = zip(*tasks, strict=False)
task_results: Sequence[bool | BaseException] = await asyncio.gather(
*task_coroutines, return_exceptions=True
)
for backend_type, task_result in zip(backend_types, task_results, strict=False):
results[backend_type] = task_result if isinstance(task_result, bool) else False
self._initialized = True
return results
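    # Sketch of the expected call pattern (assuming a populated Settings object):
    #
    #   manager = StorageManager(settings)
    #   results = await manager.initialize_all_backends()
    #   # e.g. {StorageBackend.WEAVIATE: True, StorageBackend.OPEN_WEBUI: False, ...}
    #   weaviate = manager.get_backend(StorageBackend.WEAVIATE)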
def get_backend(self, backend_type: StorageBackend) -> BaseStorage | None:
"""Get storage backend by type."""
return self.backends.get(backend_type)
def build_multi_storage_adapter(
self, backends: Sequence[StorageBackend]
) -> MultiStorageAdapter:
storages: list[BaseStorage] = []
seen: set[StorageBackend] = set()
for backend in backends:
backend_enum = (
backend if isinstance(backend, StorageBackend) else StorageBackend(backend)
)
if backend_enum in seen:
continue
seen.add(backend_enum)
storage = self.backends.get(backend_enum)
if storage is None:
raise ValueError(f"Storage backend {backend_enum.value} is not initialized")
storages.append(storage)
return MultiStorageAdapter(storages)
def get_available_backends(self) -> list[StorageBackend]:
"""Get list of successfully initialized backends."""
return list(self.backends.keys())
def has_capability(self, backend: StorageBackend, capability: StorageCapabilities) -> bool:
"""Check if backend has specific capability."""
backend_caps = self.capabilities.get(backend, StorageCapabilities.BASIC)
return capability.value <= backend_caps.value
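    # This comparison assumes StorageCapabilities values are ordered so that a
    # higher value implies a superset of features, e.g. (illustrative):
    #
    #   manager.has_capability(StorageBackend.R2R, StorageCapabilities.VECTOR_SEARCH)
    #   # True when R2R is registered as FULL_FEATURED, since
    #   # VECTOR_SEARCH.value <= FULL_FEATURED.value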
async def get_all_collections(self) -> list[CollectionInfo]:
"""Get collections from all available backends, merging collections with same name."""
collection_map: dict[str, CollectionInfo] = {}
for backend_type, storage in self.backends.items():
try:
backend_collections = await storage.list_collections()
for collection_name in backend_collections:
# Validate collection name
if not collection_name or not isinstance(collection_name, str):
continue
try:
count = await storage.count(collection_name=collection_name)
# Validate count is non-negative
count = max(count, 0)
except StorageError as e:
                        # Storage-specific errors - log and use 0 count
logging.warning(
f"Failed to get count for {collection_name} on {backend_type.value}: {e}"
)
count = 0
except Exception as e:
                        # Unexpected errors - log and skip this collection from this backend
logging.warning(
f"Unexpected error counting {collection_name} on {backend_type.value}: {e}"
)
continue
size_mb = count * 0.01 # Rough estimate: 10KB per document
if collection_name in collection_map:
# Merge with existing collection
existing = collection_map[collection_name]
existing_backends = existing["backend"]
backend_value = backend_type.value
if isinstance(existing_backends, str):
existing["backend"] = [existing_backends, backend_value]
elif isinstance(existing_backends, list):
# Prevent duplicates
if backend_value not in existing_backends:
existing_backends.append(backend_value)
# Aggregate counts and sizes
existing["count"] += count
existing["size_mb"] += size_mb
else:
# Create new collection entry
collection_info: CollectionInfo = {
"name": collection_name,
"type": self._get_collection_type(collection_name, backend_type),
"count": count,
"backend": backend_type.value,
"status": "active",
"last_updated": "2024-01-01T00:00:00Z",
"size_mb": size_mb,
}
collection_map[collection_name] = collection_info
except Exception:
continue
return list(collection_map.values())
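    # When the same collection name exists on several backends, the merged entry
    # aggregates counts/sizes and carries a list under "backend", e.g. (hypothetical):
    #
    #   {
    #       "name": "docs",
    #       "type": "weaviate",              # derived from the first backend seen
    #       "count": 420,                    # summed across backends
    #       "backend": ["weaviate", "r2r"],
    #       "status": "active",
    #       "last_updated": "2024-01-01T00:00:00Z",
    #       "size_mb": 4.2,
    #   }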
def _get_collection_type(self, collection_name: str, backend: StorageBackend) -> str:
"""Determine collection type based on name and backend."""
# Prioritize definitive backend type first
if backend == StorageBackend.R2R:
return "r2r"
elif backend == StorageBackend.WEAVIATE:
return "weaviate"
elif backend == StorageBackend.OPEN_WEBUI:
return "openwebui"
# Fallback to name-based guessing if backend is not specific
name_lower = collection_name.lower()
if "web" in name_lower or "doc" in name_lower:
return "documentation"
elif "repo" in name_lower or "code" in name_lower:
return "repository"
else:
return "general"
async def search_across_backends(
self,
query: str,
limit: int = 10,
backends: list[StorageBackend] | None = None,
) -> dict[StorageBackend, list[Document]]:
"""Search across multiple backends and return grouped results."""
if backends is None:
backends = self.get_available_backends()
results: dict[StorageBackend, list[Document]] = {}
async def search_backend(backend_type: StorageBackend) -> None:
storage = self.backends.get(backend_type)
if storage:
try:
documents: list[Document] = []
async for doc in storage.search(query, limit=limit):
documents.append(doc)
results[backend_type] = documents
except Exception:
results[backend_type] = []
# Run searches in parallel
tasks = [search_backend(backend) for backend in backends]
await asyncio.gather(*tasks, return_exceptions=True)
return results
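    # Example (hypothetical): fan a query out to every initialized backend and
    # inspect the grouped hits.
    #
    #   grouped = await manager.search_across_backends("vector index", limit=5)
    #   for backend, docs in grouped.items():
    #       print(backend.value, len(docs))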
def get_r2r_storage(self) -> R2RStorage | None:
"""Get R2R storage instance if available."""
storage = self.backends.get(StorageBackend.R2R)
return storage if isinstance(storage, R2RStorage) else None
async def get_backend_status(
self,
) -> dict[StorageBackend, dict[str, str | int | bool | StorageCapabilities]]:
"""Get comprehensive status for all backends."""
status: dict[StorageBackend, dict[str, str | int | bool | StorageCapabilities]] = {}
for backend_type, storage in self.backends.items():
try:
collections = await storage.list_collections()
total_docs = 0
for collection in collections:
total_docs += await storage.count(collection_name=collection)
backend_status: dict[str, str | int | bool | StorageCapabilities] = {
"available": True,
"collections": len(collections),
"total_documents": total_docs,
"capabilities": self.capabilities.get(backend_type, StorageCapabilities.BASIC),
"endpoint": getattr(storage.config, "endpoint", "unknown"),
}
status[backend_type] = backend_status
except Exception as e:
status[backend_type] = {
"available": False,
"error": str(e),
"capabilities": StorageCapabilities.NONE,
}
return status
async def close_all(self) -> None:
"""Close all storage connections."""
for storage in self.backends.values():
try:
await storage.close()
except Exception:
pass
self.backends.clear()
self.capabilities.clear()
self._initialized = False
@property
def is_initialized(self) -> bool:
"""Check if storage manager is initialized."""
return self._initialized
def supports_advanced_features(self, backend: StorageBackend) -> bool:
"""Check if backend supports advanced features like chunks and entities."""
return self.has_capability(backend, StorageCapabilities.FULL_FEATURED)
</file>
<file path="ingest_pipeline/cli/tui/widgets/firecrawl_config.py">
"""Firecrawl configuration widgets for advanced scraping options."""
from __future__ import annotations
import json
from typing import cast
from textual.app import ComposeResult
from textual.containers import Container, Horizontal
from textual.validation import Integer
from textual.widget import Widget
from textual.widgets import Button, Checkbox, Input, Label, Switch, TextArea
from typing_extensions import override
from ..models import FirecrawlOptions
class ScrapeOptionsForm(Widget):
"""Form for configuring Firecrawl scraping options."""
DEFAULT_CSS = """
ScrapeOptionsForm {
border: solid $border;
background: $surface;
padding: 1;
height: auto;
}
ScrapeOptionsForm .form-section {
margin-bottom: 2;
padding: 1;
border: solid $border-lighten-1;
background: $surface-lighten-1;
}
ScrapeOptionsForm .form-row {
layout: horizontal;
align-items: center;
height: auto;
margin-bottom: 1;
}
ScrapeOptionsForm .form-label {
width: 30%;
min-width: 15;
text-align: right;
padding-right: 2;
}
ScrapeOptionsForm .form-input {
width: 70%;
}
ScrapeOptionsForm .checkbox-row {
layout: horizontal;
align-items: center;
height: 3;
margin-bottom: 1;
}
ScrapeOptionsForm .checkbox-label {
margin-left: 2;
}
"""
def __init__(
self,
*,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
disabled: bool = False,
markup: bool = True,
) -> None:
"""Initialize scrape options form."""
super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)
@override
def compose(self) -> ComposeResult:
"""Compose scrape options form."""
yield Label("🔧 Scraping Configuration", classes="form-title")
# Output formats section
yield Container(
Label("Output Formats", classes="section-title"),
Horizontal(
Checkbox("Markdown", id="format_markdown", value=True, classes="checkbox"),
Label("Markdown", classes="checkbox-label"),
classes="checkbox-row",
),
Horizontal(
Checkbox("HTML", id="format_html", value=False, classes="checkbox"),
Label("HTML", classes="checkbox-label"),
classes="checkbox-row",
),
Horizontal(
Checkbox("Screenshot", id="format_screenshot", value=False, classes="checkbox"),
Label("Screenshot", classes="checkbox-label"),
classes="checkbox-row",
),
classes="form-section",
)
# Content filtering section
yield Container(
Label("Content Filtering", classes="section-title"),
Horizontal(
Label("Only Main Content:", classes="form-label"),
Switch(id="only_main_content", value=True, classes="form-input"),
classes="form-row",
),
Horizontal(
Label("Include Tags:", classes="form-label"),
Input(
placeholder="p, div, article (comma-separated)",
id="include_tags",
classes="form-input",
),
classes="form-row",
),
Horizontal(
Label("Exclude Tags:", classes="form-label"),
Input(
placeholder="nav, footer, script (comma-separated)",
id="exclude_tags",
classes="form-input",
),
classes="form-row",
),
classes="form-section",
)
# Performance settings section
yield Container(
Label("Performance Settings", classes="section-title"),
Horizontal(
Label("Wait Time (ms):", classes="form-label"),
Input(
placeholder="0",
id="wait_for",
validators=[Integer(minimum=0, maximum=30000)],
classes="form-input",
),
classes="form-row",
),
classes="form-section",
)
def get_scrape_options(self) -> dict[str, object]:
"""Get scraping options from form."""
# Collect formats
formats = []
if self.query_one("#format_markdown", Checkbox).value:
formats.append("markdown")
if self.query_one("#format_html", Checkbox).value:
formats.append("html")
if self.query_one("#format_screenshot", Checkbox).value:
formats.append("screenshot")
options: dict[str, object] = {
"formats": formats,
"only_main_content": self.query_one("#only_main_content", Switch).value,
}
include_tags_input = self.query_one("#include_tags", Input).value
if include_tags_input.strip():
options["include_tags"] = [tag.strip() for tag in include_tags_input.split(",")]
exclude_tags_input = self.query_one("#exclude_tags", Input).value
if exclude_tags_input.strip():
options["exclude_tags"] = [tag.strip() for tag in exclude_tags_input.split(",")]
# Performance
wait_for_input = self.query_one("#wait_for", Input).value
if wait_for_input.strip():
try:
options["wait_for"] = int(wait_for_input)
except ValueError:
pass
return options
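    # With the defaults plus "nav, footer" typed into the exclude field, the
    # returned mapping would look like (illustrative):
    #
    #   {
    #       "formats": ["markdown"],
    #       "only_main_content": True,
    #       "exclude_tags": ["nav", "footer"],
    #   }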
def set_scrape_options(self, options: dict[str, object]) -> None:
"""Set form values from options."""
# Set formats
formats = options.get("formats", ["markdown"])
formats_list = formats if isinstance(formats, list) else []
self.query_one("#format_markdown", Checkbox).value = "markdown" in formats_list
self.query_one("#format_html", Checkbox).value = "html" in formats_list
self.query_one("#format_screenshot", Checkbox).value = "screenshot" in formats_list
# Set content filtering
main_content_val = options.get("only_main_content", True)
self.query_one("#only_main_content", Switch).value = bool(main_content_val)
if include_tags := options.get("include_tags", []):
include_list = include_tags if isinstance(include_tags, list) else []
self.query_one("#include_tags", Input).value = ", ".join(
str(tag) for tag in include_list
)
if exclude_tags := options.get("exclude_tags", []):
exclude_list = exclude_tags if isinstance(exclude_tags, list) else []
self.query_one("#exclude_tags", Input).value = ", ".join(
str(tag) for tag in exclude_list
)
# Set performance
wait_for = options.get("wait_for")
if wait_for is not None:
self.query_one("#wait_for", Input).value = str(wait_for)
class MapOptionsForm(Widget):
"""Form for configuring site mapping options."""
DEFAULT_CSS = """
MapOptionsForm {
border: solid $border;
background: $surface;
padding: 1;
height: auto;
}
MapOptionsForm .form-section {
margin-bottom: 2;
padding: 1;
border: solid $border-lighten-1;
background: $surface-lighten-1;
}
MapOptionsForm .form-row {
layout: horizontal;
align-items: center;
height: auto;
margin-bottom: 1;
}
MapOptionsForm .form-label {
width: 30%;
min-width: 15;
text-align: right;
padding-right: 2;
}
MapOptionsForm .form-input {
width: 70%;
}
"""
def __init__(
self,
*,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
disabled: bool = False,
markup: bool = True,
) -> None:
"""Initialize map options form."""
super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)
@override
def compose(self) -> ComposeResult:
"""Compose map options form."""
yield Label("🗺️ Site Mapping Configuration", classes="form-title")
# Discovery settings section
yield Container(
Label("Discovery Settings", classes="section-title"),
Horizontal(
Label("Search Pattern:", classes="form-label"),
Input(
placeholder="docs, api, guide (optional)",
id="search_pattern",
classes="form-input",
),
classes="form-row",
),
Horizontal(
Label("Include Subdomains:", classes="form-label"),
Switch(id="include_subdomains", value=False, classes="form-input"),
classes="form-row",
),
classes="form-section",
)
# Limits section
yield Container(
Label("Crawling Limits", classes="section-title"),
Horizontal(
Label("Max Pages:", classes="form-label"),
Input(
placeholder="100",
id="max_pages",
validators=[Integer(minimum=1, maximum=1000)],
classes="form-input",
),
classes="form-row",
),
Horizontal(
Label("Max Depth:", classes="form-label"),
Input(
placeholder="5",
id="max_depth",
validators=[Integer(minimum=1, maximum=20)],
classes="form-input",
),
classes="form-row",
),
classes="form-section",
)
def get_map_options(self) -> dict[str, object]:
"""Get mapping options from form."""
options: dict[str, object] = {}
# Discovery settings
search_pattern = self.query_one("#search_pattern", Input).value
if search_pattern.strip():
options["search"] = search_pattern.strip()
options["include_subdomains"] = self.query_one("#include_subdomains", Switch).value
# Limits
max_pages_input = self.query_one("#max_pages", Input).value
if max_pages_input.strip():
try:
options["limit"] = int(max_pages_input)
except ValueError:
pass
max_depth_input = self.query_one("#max_depth", Input).value
if max_depth_input.strip():
try:
options["max_depth"] = int(max_depth_input)
except ValueError:
pass
return options
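    # Example of a fully populated result (illustrative values):
    #
    #   {"search": "docs", "include_subdomains": False, "limit": 100, "max_depth": 5}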
def set_map_options(self, options: dict[str, object]) -> None:
"""Set form values from options."""
if search := options.get("search"):
self.query_one("#search_pattern", Input).value = str(search)
subdomains_val = options.get("include_subdomains", False)
self.query_one("#include_subdomains", Switch).value = bool(subdomains_val)
# Set limits
limit = options.get("limit")
if limit is not None:
self.query_one("#max_pages", Input).value = str(limit)
max_depth = options.get("max_depth")
if max_depth is not None:
self.query_one("#max_depth", Input).value = str(max_depth)
class ExtractOptionsForm(Widget):
"""Form for configuring data extraction options."""
DEFAULT_CSS = """
ExtractOptionsForm {
border: solid $border;
background: $surface;
padding: 1;
height: auto;
}
ExtractOptionsForm .form-section {
margin-bottom: 2;
padding: 1;
border: solid $border-lighten-1;
background: $surface-lighten-1;
}
ExtractOptionsForm .form-row {
layout: horizontal;
align-items: start;
height: auto;
margin-bottom: 1;
}
ExtractOptionsForm .form-label {
width: 30%;
min-width: 15;
text-align: right;
padding-right: 2;
padding-top: 1;
}
ExtractOptionsForm .form-input {
width: 70%;
}
ExtractOptionsForm .text-area {
height: 6;
}
"""
def __init__(
self,
*,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
disabled: bool = False,
markup: bool = True,
) -> None:
"""Initialize extract options form."""
super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)
@override
def compose(self) -> ComposeResult:
"""Compose extract options form."""
yield Label("🎯 Data Extraction Configuration", classes="form-title")
# Extraction prompt section
yield Container(
Label("AI-Powered Extraction", classes="section-title"),
Horizontal(
Label("Custom Prompt:", classes="form-label"),
TextArea(
placeholder="Extract product names, prices, and descriptions...",
id="extract_prompt",
classes="form-input text-area",
),
classes="form-row",
),
classes="form-section",
)
# Schema definition section
yield Container(
Label("Structured Schema (JSON)", classes="section-title"),
Horizontal(
Label("Schema Definition:", classes="form-label"),
TextArea(
placeholder='{"product_name": "string", "price": "number", "description": "string"}',
id="extract_schema",
classes="form-input text-area",
),
classes="form-row",
),
Container(
Label("💡 Tip: Define the structure of data you want to extract"),
classes="help-text",
),
classes="form-section",
)
# Schema presets
yield Container(
Label("Quick Presets", classes="section-title"),
Horizontal(
Button("📄 Article", id="preset_article", variant="default"),
Button("🛍️ Product", id="preset_product", variant="default"),
Button("👤 Contact", id="preset_contact", variant="default"),
Button("📊 Data", id="preset_data", variant="default"),
classes="preset-buttons",
),
classes="form-section",
)
def get_extract_options(self) -> dict[str, object]:
"""Get extraction options from form."""
options: dict[str, object] = {}
# Extract prompt
prompt = self.query_one("#extract_prompt", TextArea).text
if prompt.strip():
options["extract_prompt"] = prompt.strip()
# Extract schema
schema_text = self.query_one("#extract_schema", TextArea).text
if schema_text.strip():
try:
schema = json.loads(schema_text)
options["extract_schema"] = schema
except json.JSONDecodeError:
# Invalid JSON, skip schema
pass
return options
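    # Only syntactically valid JSON reaches "extract_schema". For example
    # (illustrative values):
    #
    #   prompt text: "Extract product names and prices"
    #   schema text: '{"name": "string", "price": "number"}'
    #   -> {"extract_prompt": "Extract product names and prices",
    #       "extract_schema": {"name": "string", "price": "number"}}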
def set_extract_options(self, options: dict[str, object]) -> None:
"""Set form values from options."""
if prompt := options.get("extract_prompt"):
self.query_one("#extract_prompt", TextArea).text = str(prompt)
        if schema := options.get("extract_schema"):
            self.query_one("#extract_schema", TextArea).text = json.dumps(schema, indent=2)
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle preset button presses."""
schema_widget = self.query_one("#extract_schema", TextArea)
prompt_widget = self.query_one("#extract_prompt", TextArea)
if event.button.id == "preset_article":
schema_widget.text = """{
"title": "string",
"author": "string",
"date": "string",
"content": "string",
"tags": ["string"]
}"""
prompt_widget.text = (
"Extract article title, author, publication date, main content, and associated tags"
)
elif event.button.id == "preset_product":
schema_widget.text = """{
"name": "string",
"price": "number",
"description": "string",
"category": "string",
"availability": "string"
}"""
prompt_widget.text = (
"Extract product name, price, description, category, and availability status"
)
elif event.button.id == "preset_contact":
schema_widget.text = """{
"name": "string",
"email": "string",
"phone": "string",
"company": "string",
"position": "string"
}"""
prompt_widget.text = (
"Extract contact information including name, email, phone, company, and position"
)
elif event.button.id == "preset_data":
schema_widget.text = """{
"metrics": [{"name": "string", "value": "number", "unit": "string"}],
"tables": [{"headers": ["string"], "rows": [["string"]]}]
}"""
prompt_widget.text = "Extract numerical data, metrics, and tabular information"
class FirecrawlConfigWidget(Widget):
"""Complete Firecrawl configuration widget with tabbed interface."""
DEFAULT_CSS = """
FirecrawlConfigWidget {
border: solid $border;
background: $surface;
height: 100%;
padding: 1;
}
FirecrawlConfigWidget .config-header {
dock: top;
height: 3;
background: $primary;
color: $text;
padding: 1;
margin: -1 -1 1 -1;
}
FirecrawlConfigWidget .tab-buttons {
dock: top;
height: 3;
layout: horizontal;
margin-bottom: 1;
}
FirecrawlConfigWidget .tab-button {
width: 1fr;
margin-right: 1;
}
FirecrawlConfigWidget .tab-content {
height: 1fr;
overflow: auto;
}
FirecrawlConfigWidget .actions {
dock: bottom;
height: 3;
layout: horizontal;
align: center;
margin-top: 1;
}
"""
def __init__(
self,
*,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
disabled: bool = False,
markup: bool = True,
) -> None:
"""Initialize Firecrawl config widget."""
super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)
self.current_tab = "scrape"
@override
def compose(self) -> ComposeResult:
"""Compose config widget layout."""
yield Container(
Label("🔥 Firecrawl Configuration", classes="config-header"),
Horizontal(
Button("🔧 Scraping", id="tab_scrape", variant="primary", classes="tab-button"),
Button("🗺️ Mapping", id="tab_map", variant="default", classes="tab-button"),
Button("🎯 Extraction", id="tab_extract", variant="default", classes="tab-button"),
classes="tab-buttons",
),
Container(
ScrapeOptionsForm(id="scrape_form"),
classes="tab-content",
),
Horizontal(
Button("📋 Load Preset", id="load_preset", variant="default"),
Button("💾 Save Preset", id="save_preset", variant="default"),
Button("🔄 Reset", id="reset_config", variant="default"),
classes="actions",
),
)
def on_mount(self) -> None:
"""Initialize widget."""
self.show_tab("scrape")
def show_tab(self, tab_name: str) -> None:
"""Show specific configuration tab."""
self.current_tab = tab_name
# Update button states
for tab in ["scrape", "map", "extract"]:
button = self.query_one(f"#tab_{tab}", Button)
button.variant = "primary" if tab == tab_name else "default"
# Update tab content
content_container = self.query_one(".tab-content", Container)
content_container.remove_children()
if tab_name == "extract":
content_container.mount(ExtractOptionsForm(id="extract_form"))
elif tab_name == "map":
content_container.mount(MapOptionsForm(id="map_form"))
elif tab_name == "scrape":
content_container.mount(ScrapeOptionsForm(id="scrape_form"))
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id and event.button.id.startswith("tab_"):
tab_name = event.button.id[4:] # Remove "tab_" prefix
self.show_tab(tab_name)
def get_all_options(self) -> FirecrawlOptions:
"""Get all configuration options."""
options: FirecrawlOptions = {}
# Try to get options from currently mounted form
if self.current_tab == "scrape":
try:
form = self.query_one("#scrape_form", ScrapeOptionsForm)
scrape_opts = form.get_scrape_options()
options.update(cast(FirecrawlOptions, scrape_opts))
except Exception:
pass
elif self.current_tab == "map":
try:
map_form = self.query_one("#map_form", MapOptionsForm)
map_opts = map_form.get_map_options()
options.update(cast(FirecrawlOptions, map_opts))
except Exception:
pass
elif self.current_tab == "extract":
try:
extract_form = self.query_one("#extract_form", ExtractOptionsForm)
extract_opts = extract_form.get_extract_options()
options.update(cast(FirecrawlOptions, extract_opts))
except Exception:
pass
return options
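    # Note: only the form for the currently visible tab is mounted, so this
    # returns just that tab's options. A caller that needs scrape *and* map
    # settings would switch tabs before reading each set, e.g. (hypothetical):
    #
    #   widget.show_tab("scrape")
    #   scrape_opts = widget.get_all_options()
    #   widget.show_tab("map")
    #   map_opts = widget.get_all_options()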
</file>
<file path="ingest_pipeline/cli/tui/widgets/r2r_widgets.py">
"""R2R-specific widgets for chunk viewing and entity visualization."""
from __future__ import annotations
from typing import Any
from textual import work
from textual.app import ComposeResult
from textual.containers import Container, Horizontal, Vertical, VerticalScroll
from textual.widget import Widget
from textual.widgets import Button, DataTable, Label, Markdown, ProgressBar, Static, Tree
from typing_extensions import override
from ....storage.r2r.storage import R2RStorage
from ..models import ChunkInfo, EntityInfo
class ChunkViewer(Widget):
"""Widget for viewing document chunks with navigation."""
DEFAULT_CSS = """
ChunkViewer {
border: solid $border;
background: $surface;
height: 100%;
}
ChunkViewer .chunk-header {
dock: top;
height: 3;
background: $primary;
color: $text;
padding: 1;
}
ChunkViewer .chunk-navigation {
dock: top;
height: 3;
background: $surface-lighten-1;
padding: 1;
}
ChunkViewer .chunk-content {
height: 1fr;
padding: 1;
overflow: auto;
}
ChunkViewer .chunk-footer {
dock: bottom;
height: 3;
background: $surface-darken-1;
padding: 1;
}
"""
def __init__(self, r2r_storage: R2RStorage, document_id: str, **kwargs: Any) -> None:
"""Initialize chunk viewer."""
super().__init__(**kwargs)
self.r2r_storage: R2RStorage = r2r_storage
self.document_id: str = document_id
self.chunks: list[ChunkInfo] = []
self.current_chunk_index: int = 0
@override
def compose(self) -> ComposeResult:
"""Compose chunk viewer layout."""
yield Container(
Static("📄 Document Chunks", classes="chunk-header"),
Horizontal(
Button("◀ Previous", id="prev_chunk", variant="default"),
Static("Chunk 1 of 1", id="chunk_info"),
Button("Next ▶", id="next_chunk", variant="default"),
classes="chunk-navigation",
),
VerticalScroll(
Markdown("", id="chunk_content"),
classes="chunk-content",
),
Container(
Static("Loading chunks...", id="chunk_status"),
classes="chunk-footer",
),
)
def on_mount(self) -> None:
"""Initialize chunk viewer."""
self.load_chunks()
@work(exclusive=True)
async def load_chunks(self) -> None:
"""Load document chunks."""
try:
chunks_data = await self.r2r_storage.get_document_chunks(self.document_id)
self.chunks = []
            for chunk_data in chunks_data:
                start_index = chunk_data.get("start_index", 0)
                end_index = chunk_data.get("end_index", 0)
                metadata_val = chunk_data.get("metadata")
                chunk_info: ChunkInfo = {
                    "id": str(chunk_data.get("id", "")),
                    "document_id": self.document_id,
                    "content": str(chunk_data.get("text", "")),
                    "start_index": int(start_index) if isinstance(start_index, (int, str)) else 0,
                    "end_index": int(end_index) if isinstance(end_index, (int, str)) else 0,
                    "metadata": dict(metadata_val) if isinstance(metadata_val, dict) else {},
                }
self.chunks.append(chunk_info)
if self.chunks:
self.current_chunk_index = 0
self.update_chunk_display()
else:
self.query_one("#chunk_status", Static).update("No chunks found")
except Exception as e:
self.query_one("#chunk_status", Static).update(f"Error loading chunks: {e}")
def update_chunk_display(self) -> None:
"""Update chunk display with current chunk."""
if not self.chunks:
return
chunk = self.chunks[self.current_chunk_index]
# Update content
content_widget = self.query_one("#chunk_content", Markdown)
content_widget.update(chunk["content"])
# Update navigation info
chunk_info = self.query_one("#chunk_info", Static)
chunk_info.update(f"Chunk {self.current_chunk_index + 1} of {len(self.chunks)}")
# Update status
status_widget = self.query_one("#chunk_status", Static)
status_widget.update(
f"Chunk {chunk['id']} | "
f"Range: {chunk['start_index']}-{chunk['end_index']} | "
f"Length: {len(chunk['content'])} chars"
)
# Update button states
prev_btn = self.query_one("#prev_chunk", Button)
next_btn = self.query_one("#next_chunk", Button)
prev_btn.disabled = self.current_chunk_index == 0
next_btn.disabled = self.current_chunk_index >= len(self.chunks) - 1
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "prev_chunk" and self.current_chunk_index > 0:
self.current_chunk_index -= 1
self.update_chunk_display()
elif event.button.id == "next_chunk" and self.current_chunk_index < len(self.chunks) - 1:
self.current_chunk_index += 1
self.update_chunk_display()
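# A minimal mounting sketch for ChunkViewer inside a screen's compose(), assuming
# an initialized R2RStorage and a known document id (illustrative values):
#
#   yield ChunkViewer(r2r_storage, document_id="doc-123", id="chunk_viewer")
#
# Chunks are fetched lazily in on_mount via the load_chunks worker.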
class EntityGraph(Widget):
"""Widget for visualizing extracted entities and relationships."""
DEFAULT_CSS = """
EntityGraph {
border: solid $border;
background: $surface;
height: 100%;
}
EntityGraph .entity-header {
dock: top;
height: 3;
background: $primary;
color: $text;
padding: 1;
}
EntityGraph .entity-tree {
height: 1fr;
overflow: auto;
}
EntityGraph .entity-details {
dock: bottom;
height: 8;
background: $surface-lighten-1;
padding: 1;
border-top: solid $border;
}
"""
def __init__(self, r2r_storage: R2RStorage, document_id: str, **kwargs: Any) -> None:
"""Initialize entity graph."""
super().__init__(**kwargs)
self.r2r_storage: R2RStorage = r2r_storage
self.document_id: str = document_id
self.entities: list[EntityInfo] = []
@override
def compose(self) -> ComposeResult:
"""Compose entity graph layout."""
yield Container(
Static("🕸️ Entity Graph", classes="entity-header"),
Tree("Entities", id="entity_tree", classes="entity-tree"),
VerticalScroll(
Label("Entity Details"),
Static("Select an entity to view details", id="entity_details"),
classes="entity-details",
),
)
def on_mount(self) -> None:
"""Initialize entity graph."""
self.load_entities()
@work(exclusive=True)
async def load_entities(self) -> None:
"""Load entities from document."""
try:
entities_data = await self.r2r_storage.extract_entities(self.document_id)
self.entities = []
# Parse entities from R2R response
entities_list = entities_data.get("entities", [])
if not isinstance(entities_list, list):
entities_list = []
for entity_data in entities_list:
entity_info: EntityInfo = {
"id": str(entity_data.get("id", "")),
"name": str(entity_data.get("name", "")),
"type": str(entity_data.get("type", "unknown")),
"confidence": float(entity_data.get("confidence", 0.0)),
"metadata": dict(entity_data.get("metadata", {})),
}
self.entities.append(entity_info)
self.populate_entity_tree()
except Exception as e:
details_widget = self.query_one("#entity_details", Static)
details_widget.update(f"Error loading entities: {e}")
def populate_entity_tree(self) -> None:
"""Populate the entity tree."""
tree = self.query_one("#entity_tree", Tree)
tree.clear()
if not self.entities:
tree.root.add_leaf("No entities found")
return
# Group entities by type
entities_by_type: dict[str, list[EntityInfo]] = {}
for entity in self.entities:
entity_type = entity["type"]
if entity_type not in entities_by_type:
entities_by_type[entity_type] = []
entities_by_type[entity_type].append(entity)
# Add entities to tree grouped by type
for entity_type, type_entities in entities_by_type.items():
type_node = tree.root.add(f"{entity_type.title()} ({len(type_entities)})")
for entity in type_entities:
confidence_pct = int(entity["confidence"] * 100)
entity_node = type_node.add_leaf(f"{entity['name']} ({confidence_pct}%)")
entity_node.data = entity
tree.root.expand()
def on_tree_node_selected(self, event: Tree.NodeSelected[EntityInfo]) -> None:
"""Handle entity selection."""
if hasattr(event.node, "data") and event.node.data:
entity = event.node.data
self.show_entity_details(entity)
def show_entity_details(self, entity: EntityInfo) -> None:
"""Show detailed information about an entity."""
details_widget = self.query_one("#entity_details", Static)
details_text = f"""**Entity:** {entity["name"]}
**Type:** {entity["type"]}
**Confidence:** {entity["confidence"]:.2%}
**ID:** {entity["id"]}
**Metadata:**
"""
for key, value in entity["metadata"].items():
details_text += f"- **{key}:** {value}\n"
details_widget.update(details_text)
class CollectionStats(Widget):
"""Widget for showing R2R-specific collection statistics."""
DEFAULT_CSS = """
CollectionStats {
border: solid $border;
background: $surface;
height: 100%;
padding: 1;
}
CollectionStats .stats-header {
dock: top;
height: 3;
background: $primary;
color: $text;
padding: 1;
margin: -1 -1 1 -1;
}
CollectionStats .stats-grid {
layout: grid;
grid-size: 2;
grid-columns: 1fr 1fr;
grid-gutter: 1;
height: auto;
}
CollectionStats .stat-card {
background: $surface-lighten-1;
border: solid $border;
padding: 1;
height: auto;
}
CollectionStats .stat-value {
color: $primary;
text-style: bold;
text-align: center;
}
CollectionStats .stat-label {
color: $text-muted;
text-align: center;
margin-top: 1;
}
CollectionStats .progress-section {
margin-top: 2;
}
"""
def __init__(self, r2r_storage: R2RStorage, collection_name: str, **kwargs: Any) -> None:
"""Initialize collection stats."""
super().__init__(**kwargs)
self.r2r_storage: R2RStorage = r2r_storage
self.collection_name: str = collection_name
@override
def compose(self) -> ComposeResult:
"""Compose stats layout."""
yield Container(
Static(f"📊 {self.collection_name} Statistics", classes="stats-header"),
Container(
Container(
Static("0", id="document_count", classes="stat-value"),
Static("Documents", classes="stat-label"),
classes="stat-card",
),
Container(
Static("0", id="chunk_count", classes="stat-value"),
Static("Chunks", classes="stat-label"),
classes="stat-card",
),
Container(
Static("0", id="entity_count", classes="stat-value"),
Static("Entities", classes="stat-label"),
classes="stat-card",
),
Container(
Static("0 MB", id="storage_size", classes="stat-value"),
Static("Storage Used", classes="stat-label"),
classes="stat-card",
),
classes="stats-grid",
),
Container(
Label("Processing Progress"),
ProgressBar(id="processing_progress", total=100, show_eta=False),
Static("Idle", id="processing_status"),
classes="progress-section",
),
)
def on_mount(self) -> None:
"""Initialize stats display."""
self.refresh_stats()
@work(exclusive=True)
async def refresh_stats(self) -> None:
"""Refresh collection statistics."""
try:
# Get basic document count
doc_count = await self.r2r_storage.count(collection_name=self.collection_name)
self.query_one("#document_count", Static).update(str(doc_count))
# Estimate other stats (these would need real implementation)
estimated_chunks = doc_count * 5 # Rough estimate
estimated_entities = doc_count * 10 # Rough estimate
estimated_size_mb = doc_count * 0.05 # Rough estimate
self.query_one("#chunk_count", Static).update(str(estimated_chunks))
self.query_one("#entity_count", Static).update(str(estimated_entities))
self.query_one("#storage_size", Static).update(f"{estimated_size_mb:.1f} MB")
# Update progress (would be real-time in actual implementation)
progress_bar = self.query_one("#processing_progress", ProgressBar)
progress_bar.progress = 100 # Assume complete for now
status_widget = self.query_one("#processing_status", Static)
status_widget.update("All documents processed")
except Exception as e:
self.query_one("#processing_status", Static).update(f"Error: {e}")
class DocumentOverview(Widget):
"""Widget for comprehensive document overview and statistics."""
DEFAULT_CSS = """
DocumentOverview {
layout: vertical;
height: 100%;
}
DocumentOverview .overview-header {
dock: top;
height: 3;
background: $primary;
color: $text;
padding: 1;
}
DocumentOverview .overview-content {
height: 1fr;
layout: horizontal;
}
DocumentOverview .overview-left {
width: 50%;
padding: 1;
}
DocumentOverview .overview-right {
width: 50%;
padding: 1;
}
DocumentOverview .info-table {
height: auto;
margin-bottom: 2;
}
"""
def __init__(self, r2r_storage: R2RStorage, document_id: str, **kwargs: Any) -> None:
"""Initialize document overview."""
super().__init__(**kwargs)
self.r2r_storage: R2RStorage = r2r_storage
self.document_id: str = document_id
@override
def compose(self) -> ComposeResult:
"""Compose overview layout."""
yield Container(
Static("📋 Document Overview", classes="overview-header"),
Horizontal(
Vertical(
Label("Document Information"),
DataTable[str](id="doc_info_table", classes="info-table"),
Label("Processing Statistics"),
DataTable[str](id="stats_table", classes="info-table"),
classes="overview-left",
),
Vertical(
ChunkViewer(self.r2r_storage, self.document_id),
classes="overview-right",
),
classes="overview-content",
),
)
def on_mount(self) -> None:
"""Initialize overview."""
self.load_overview()
@work(exclusive=True)
async def load_overview(self) -> None:
"""Load comprehensive document overview."""
try:
overview_data = await self.r2r_storage.get_document_overview(self.document_id)
# Populate document info table
doc_table = self.query_one("#doc_info_table", DataTable)
doc_table.add_columns("Property", "Value")
document_info_raw = overview_data.get("document", {})
document_info = document_info_raw if isinstance(document_info_raw, dict) else {}
doc_table.add_row("ID", str(document_info.get("id", "N/A")))
doc_table.add_row("Title", str(document_info.get("title", "N/A")))
doc_table.add_row("Created", str(document_info.get("created_at", "N/A")))
doc_table.add_row("Modified", str(document_info.get("updated_at", "N/A")))
# Populate stats table
stats_table = self.query_one("#stats_table", DataTable)
stats_table.add_columns("Metric", "Count")
chunk_count = overview_data.get("chunk_count", 0)
stats_table.add_row("Chunks", str(chunk_count))
stats_table.add_row("Characters", str(len(str(document_info.get("content", "")))))
        except Exception as e:
            # Handle error by showing minimal info (avoid duplicating columns
            # if the happy path already added them before failing)
            doc_table = self.query_one("#doc_info_table", DataTable)
            if not doc_table.columns:
                doc_table.add_columns("Property", "Value")
            doc_table.add_row("Error", str(e))
</file>
<file path="ingest_pipeline/cli/tui/app.py">
"""Main TUI application with enhanced keyboard navigation."""
from __future__ import annotations
import logging
import os
from collections import deque
from pathlib import Path
from queue import Empty, Queue
from typing import TYPE_CHECKING, ClassVar, Literal
from textual import events
from textual.app import App
from textual.binding import Binding, BindingType
from textual.timer import Timer
from ...storage.base import BaseStorage
from ...storage.openwebui import OpenWebUIStorage
from ...storage.weaviate import WeaviateStorage
from .screens.dashboard import CollectionOverviewScreen
from .screens.help import HelpScreen
from .styles import TUI_CSS
from .utils.storage_manager import StorageManager
if TYPE_CHECKING:
from logging import Formatter, LogRecord
from ...storage.r2r.storage import R2RStorage
from .screens.dialogs import LogViewerScreen
else: # pragma: no cover - optional dependency fallback
R2RStorage = BaseStorage
class CollectionManagementApp(App[None]):
"""Enhanced modern Textual application with comprehensive keyboard navigation."""
CSS: ClassVar[str] = TUI_CSS
TITLE = "Collection Management"
SUB_TITLE = "Document Ingestion Pipeline"
def safe_notify(
self,
message: str,
*,
severity: Literal["information", "warning", "error"] = "information",
) -> None:
"""Safely notify with markup disabled to prevent parsing errors."""
self.notify(message, severity=severity, markup=False)
BINDINGS: ClassVar[list[BindingType]] = [
Binding("q", "quit", "Quit"),
Binding("ctrl+c", "quit", "Quit"),
Binding("ctrl+q", "quit", "Quit"),
Binding("f1", "help", "Help"),
Binding("ctrl+h", "help", "Help"),
Binding("?", "help", "Quick Help"),
# Global navigation shortcuts
Binding("ctrl+r", "refresh_current", "Refresh Current Screen"),
Binding("ctrl+w", "close_current", "Close Current Screen"),
Binding("ctrl+l", "toggle_logs", "Logs"),
# Tab navigation shortcuts
Binding("ctrl+1", "dashboard_tab", "Dashboard", show=False),
Binding("ctrl+2", "collections_tab", "Collections", show=False),
Binding("ctrl+3", "analytics_tab", "Analytics", show=False),
]
storage_manager: StorageManager
weaviate: WeaviateStorage | None
openwebui: OpenWebUIStorage | None
r2r: R2RStorage | BaseStorage | None
log_queue: Queue[LogRecord] | None
_log_formatter: Formatter
_log_buffer: deque[str]
_log_viewer: LogViewerScreen | None
_log_file: Path | None
_log_timer: Timer | None
def __init__(
self,
storage_manager: StorageManager,
weaviate: WeaviateStorage | None = None,
openwebui: OpenWebUIStorage | None = None,
r2r: R2RStorage | BaseStorage | None = None,
*,
log_queue: Queue[LogRecord] | None = None,
log_formatter: Formatter | None = None,
log_file: Path | None = None,
) -> None:
super().__init__()
self.storage_manager = storage_manager
self.weaviate = weaviate
self.openwebui = openwebui
self.r2r = r2r
        # Title and sub-title are assigned in on_mount(), once the app is running.
self.log_queue = log_queue
self._log_formatter = log_formatter or logging.Formatter(
fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%H:%M:%S",
)
self._log_buffer = deque(maxlen=500)
self._log_viewer = None
self._log_file = log_file
self._log_timer = None
def on_mount(self) -> None:
"""Initialize the enhanced app with better branding."""
self.title = "🚀 Enhanced Collection Management System"
self.sub_title = (
"Advanced Document Ingestion & Management Platform with Keyboard Navigation"
)
reduced_motion_env = os.getenv("TEXTUAL_REDUCED_MOTION") or os.getenv(
"PREFER_REDUCED_MOTION"
)
if reduced_motion_env is not None:
normalized = reduced_motion_env.strip().lower()
reduced_motion_enabled = normalized in {"1", "true", "yes", "on"}
else:
reduced_motion_enabled = False
_ = self.set_class(reduced_motion_enabled, "reduced-motion")
_ = self.push_screen(
CollectionOverviewScreen(
self.storage_manager,
self.weaviate,
self.openwebui,
self.r2r,
)
)
if self.log_queue is not None and self._log_timer is None:
# Poll the queue so log output is captured without blocking the UI loop
self._log_timer = self.set_interval(0.25, self._drain_log_queue)
def _drain_log_queue(self) -> None:
"""Drain queued log records and route them to the active log viewer."""
if self.log_queue is None:
return
drained: list[str] = []
while True:
try:
record = self.log_queue.get_nowait()
except Empty:
break
message = self._log_formatter.format(record)
self._log_buffer.append(message)
drained.append(message)
if drained and self._log_viewer is not None:
self._log_viewer.append_logs(drained)
def attach_log_viewer(self, viewer: LogViewerScreen) -> None:
"""Register an active log viewer and hydrate it with existing entries."""
self._log_viewer = viewer
viewer.replace_logs(list(self._log_buffer))
viewer.update_log_file(self._log_file)
# Drain once more to deliver any entries gathered between instantiation and mount
self._drain_log_queue()
def detach_log_viewer(self, viewer: LogViewerScreen) -> None:
"""Remove the current log viewer when it is dismissed."""
if self._log_viewer is viewer:
self._log_viewer = None
def get_log_file_path(self) -> Path | None:
"""Return the active log file path if configured."""
return self._log_file
def action_toggle_logs(self) -> None:
"""Toggle the log viewer modal screen."""
if self._log_viewer is not None:
_ = self.pop_screen()
return
from .screens.dialogs import LogViewerScreen # Local import to avoid cycle
_ = self.push_screen(LogViewerScreen())
def action_help(self) -> None:
"""Show comprehensive help information with all keyboard shortcuts."""
help_md = """
# 🚀 Enhanced Collection Management System
## 🎯 Global Navigation
- **F1** / **Ctrl+H** / **?**: Show this help
- **Q** / **Ctrl+C** / **Ctrl+Q**: Quit application
- **Ctrl+R**: Refresh current screen
- **Ctrl+W**: Close current screen/dialog
- **Escape**: Go back/cancel current action
## 📑 Tab Navigation
- **Tab** / **Shift+Tab**: Switch between tabs
- **Ctrl+1**: Jump to Dashboard tab
- **Ctrl+2**: Jump to Collections tab
- **Ctrl+3**: Jump to Analytics tab
## 📚 Collections Management
- **R**: Refresh collections list
- **I**: Start new ingestion
- **M**: Manage documents in selected collection
- **S**: Search within selected collection
- **Ctrl+D**: Delete selected collection
## 🗂️ Table Navigation
- **Arrow Keys** / **J/K/H/L**: Navigate table cells (Vi-style)
- **Home** / **End**: Jump to first/last row
- **Page Up** / **Page Down**: Scroll by page
- **Enter**: Select/activate current row
- **Space**: Toggle row selection
- **Ctrl+A**: Select all items
- **Ctrl+Shift+A**: Clear all selections
## 📄 Document Management
- **Space**: Toggle document selection
- **Delete** / **Ctrl+D**: Delete selected documents
- **A**: Select all documents on page
- **N**: Clear selection
- **Page Up/Down**: Navigate between pages
- **Home/End**: Go to first/last page
## 🔍 Search Features
- **/** : Quick search (focus search field)
- **Ctrl+F**: Focus search input
- **Enter**: Perform search
- **F3**: Repeat last search
- **Ctrl+R**: Clear search results
- **Escape**: Clear search/exit search mode
## 📥 Ingestion Interface
- **1/2/3**: Select ingestion type (Web/Repository/Documentation)
- **Tab/Shift+Tab**: Navigate between fields
- **Enter**: Start ingestion process
- **Ctrl+I**: Quick start ingestion
- **Escape**: Cancel ingestion
## 🎨 Visual Features
- Enhanced focus indicators with colored borders
- Smooth keyboard navigation with visual feedback
- Status indicators with real-time updates
- Progress bars with detailed status messages
- Responsive design with accessibility features
## 💡 Pro Tips
- Use **Vi-style** navigation (J/K/H/L) for efficient movement
- **Tab** through interactive elements for keyboard-only operation
- Hold **Shift** with arrow keys for range selection (where supported)
- Use **Ctrl+** shortcuts for power user efficiency
- **Escape** is your friend - it cancels most operations safely
## 🚀 Performance Features
- Lazy loading for large collections
- Paginated document views
- Background refresh operations
- Efficient memory management
- Responsive UI updates
---
**Enjoy the enhanced keyboard-driven interface!** 🎉
*Press Escape, Enter, or Q to close this help.*
"""
_ = self.push_screen(HelpScreen(help_md))
def action_refresh_current(self) -> None:
"""Refresh the current screen if it supports it."""
current_screen = self.screen
handler = getattr(current_screen, "action_refresh", None)
if callable(handler):
_ = handler()
return
self.notify("Current screen doesn't support refresh", severity="information")
def action_close_current(self) -> None:
"""Close current screen/dialog."""
if len(self.screen_stack) > 1: # Don't close the main screen
_ = self.pop_screen()
else:
self.notify("Cannot close main screen. Use Q to quit.", severity="warning")
def action_dashboard_tab(self) -> None:
"""Switch to dashboard tab in current screen."""
current_screen = self.screen
handler = getattr(current_screen, "action_tab_dashboard", None)
if callable(handler):
_ = handler()
def action_collections_tab(self) -> None:
"""Switch to collections tab in current screen."""
current_screen = self.screen
handler = getattr(current_screen, "action_tab_collections", None)
if callable(handler):
_ = handler()
def action_analytics_tab(self) -> None:
"""Switch to analytics tab in current screen."""
current_screen = self.screen
handler = getattr(current_screen, "action_tab_analytics", None)
if callable(handler):
_ = handler()
def on_key(self, event: events.Key) -> None:
"""Handle global keyboard shortcuts."""
# Handle global shortcuts that might not be bound to specific actions
if event.key == "ctrl+shift+?":
# Alternative help shortcut
self.action_help()
_ = event.prevent_default()
elif event.key == "ctrl+alt+r":
# Force refresh all connections
self.notify("🔄 Refreshing all connections...", severity="information")
# This could trigger a full reinit if needed
_ = event.prevent_default()
# No else clause needed - just handle our events
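# Launch sketch (editor's illustration): runners.py performs the real wiring,
# and the StorageManager construction below is an assumption, not its actual
# signature.
#
#     manager = StorageManager()
#     CollectionManagementApp(manager, weaviate=None, openwebui=None, r2r=None).run()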
</file>
<file path="ingest_pipeline/cli/tui/layouts.py">
"""Responsive layout system for TUI applications."""
from __future__ import annotations
from typing import cast
from textual.app import ComposeResult
from textual.containers import Container, VerticalScroll
from textual.widget import Widget
from textual.widgets import Static
from typing_extensions import override
class ResponsiveGrid(Container):
"""Grid that auto-adjusts based on terminal size."""
DEFAULT_CSS: str = """
ResponsiveGrid {
layout: grid;
grid-size: 1;
grid-columns: 1fr;
grid-rows: auto;
grid-gutter: 1;
padding: 1;
}
ResponsiveGrid.two-column {
grid-size: 2;
grid-columns: 1fr 1fr;
}
ResponsiveGrid.three-column {
grid-size: 3;
grid-columns: 1fr 1fr 1fr;
}
ResponsiveGrid.auto-fit {
grid-columns: repeat(auto-fit, minmax(20, 1fr));
}
ResponsiveGrid.compact {
grid-gutter: 0;
padding: 0;
}
"""
def __init__(
self,
*children: Widget,
columns: int = 1,
auto_fit: bool = False,
compact: bool = False,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
disabled: bool = False,
markup: bool = True,
) -> None:
"""Initialize responsive grid."""
super().__init__(
*children, name=name, id=id, classes=classes, disabled=disabled, markup=markup
)
self._columns: int = columns
self._auto_fit: bool = auto_fit
self._compact: bool = compact
def on_mount(self) -> None:
"""Apply responsive classes based on configuration."""
widget = cast(Widget, self)
if self._auto_fit:
widget.add_class("auto-fit")
elif self._columns == 2:
widget.add_class("two-column")
elif self._columns == 3:
widget.add_class("three-column")
if self._compact:
widget.add_class("compact")
def on_resize(self) -> None:
"""Adjust layout based on terminal size."""
if self._auto_fit:
# Let CSS handle auto-fit
return
widget = cast(Widget, self)
terminal_width = widget.size.width
if terminal_width < 60:
# Force single column on narrow terminals
widget.remove_class("two-column", "three-column")
widget.styles.grid_size_columns = 1
widget.styles.grid_columns = "1fr"
        elif terminal_width < 100 and self._columns > 2:
            # Force two columns on medium terminals
            widget.remove_class("three-column")
            widget.add_class("two-column")
            widget.styles.grid_size_columns = 2
            widget.styles.grid_columns = "1fr 1fr"
        elif self._columns == 2:
            # Wide again: restore the configured layout, clearing any inline
            # styles applied while the terminal was narrow
            widget.add_class("two-column")
            widget.styles.grid_size_columns = 2
            widget.styles.grid_columns = "1fr 1fr"
        elif self._columns == 3:
            widget.remove_class("two-column")
            widget.add_class("three-column")
            widget.styles.grid_size_columns = 3
            widget.styles.grid_columns = "1fr 1fr 1fr"
class CollapsibleSidebar(Container):
"""Sidebar that can be collapsed to save space."""
DEFAULT_CSS: str = """
CollapsibleSidebar {
dock: left;
width: 25%;
min-width: 20;
max-width: 40;
background: $surface;
border-right: solid $border;
padding: 1;
transition: width 300ms;
}
CollapsibleSidebar.collapsed {
width: 3;
min-width: 3;
overflow: hidden;
}
CollapsibleSidebar.collapsed > * {
display: none;
}
CollapsibleSidebar .sidebar-toggle {
dock: top;
height: 1;
background: $primary;
color: $text;
text-align: center;
margin-bottom: 1;
}
CollapsibleSidebar .sidebar-content {
height: 1fr;
overflow-y: auto;
}
"""
def __init__(
self,
*children: Widget,
collapsed: bool = False,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
disabled: bool = False,
markup: bool = True,
) -> None:
"""Initialize collapsible sidebar."""
super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)
self._collapsed: bool = collapsed
self._children: tuple[Widget, ...] = children
@override
def compose(self) -> ComposeResult:
"""Compose sidebar with toggle and content."""
yield Static("☰", classes="sidebar-toggle")
with VerticalScroll(classes="sidebar-content"):
yield from self._children
def on_mount(self) -> None:
"""Apply initial collapsed state."""
if self._collapsed:
cast(Widget, self).add_class("collapsed")
def on_click(self) -> None:
"""Toggle sidebar when clicked."""
self.toggle()
def toggle(self) -> None:
"""Toggle sidebar collapsed state."""
self._collapsed = not self._collapsed
widget = cast(Widget, self)
if self._collapsed:
widget.add_class("collapsed")
else:
widget.remove_class("collapsed")
def expand_sidebar(self) -> None:
"""Expand sidebar."""
if self._collapsed:
self.toggle()
def collapse_sidebar(self) -> None:
"""Collapse sidebar."""
if not self._collapsed:
self.toggle()
class TabularLayout(Container):
"""Optimized layout for data tables with optional sidebar."""
DEFAULT_CSS: str = """
TabularLayout {
layout: horizontal;
height: 100%;
}
TabularLayout .main-content {
width: 1fr;
height: 100%;
layout: vertical;
}
TabularLayout .table-container {
height: 1fr;
overflow: auto;
border: solid $border;
background: $surface;
}
TabularLayout .table-header {
dock: top;
height: 3;
background: $primary;
color: $text;
padding: 1;
}
TabularLayout .table-footer {
dock: bottom;
height: 3;
background: $surface-lighten-1;
padding: 1;
border-top: solid $border;
}
"""
def __init__(
self,
table_widget: Widget,
header_content: Widget | None = None,
footer_content: Widget | None = None,
sidebar_content: Widget | None = None,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
disabled: bool = False,
markup: bool = True,
) -> None:
"""Initialize tabular layout."""
super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)
self.table_widget: Widget = table_widget
self.header_content: Widget | None = header_content
self.footer_content: Widget | None = footer_content
self.sidebar_content: Widget | None = sidebar_content
@override
def compose(self) -> ComposeResult:
"""Compose layout with optional sidebar."""
if self.sidebar_content:
yield CollapsibleSidebar(self.sidebar_content)
with Container(classes="main-content"):
if self.header_content:
yield Container(self.header_content, classes="table-header")
yield Container(self.table_widget, classes="table-container")
if self.footer_content:
yield Container(self.footer_content, classes="table-footer")
class CardLayout(ResponsiveGrid):
"""Grid layout optimized for card-based content."""
DEFAULT_CSS: str = """
CardLayout {
grid-gutter: 2;
padding: 2;
}
CardLayout .card {
background: $surface;
border: solid $border;
border-radius: 1;
padding: 2;
height: auto;
min-height: 10;
}
CardLayout .card:hover {
border: solid $accent;
background: $surface-lighten-1;
}
CardLayout .card:focus {
border: solid $primary;
}
CardLayout .card-header {
dock: top;
height: 3;
background: $primary-lighten-1;
color: $text;
padding: 1;
margin: -2 -2 1 -2;
border-radius: 1 1 0 0;
}
CardLayout .card-content {
height: 1fr;
overflow: auto;
}
CardLayout .card-footer {
dock: bottom;
height: 3;
background: $surface-darken-1;
padding: 1;
margin: 1 -2 -2 -2;
border-radius: 0 0 1 1;
}
"""
def __init__(
self,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
disabled: bool = False,
markup: bool = True,
) -> None:
"""Initialize card layout with default settings for cards."""
# Default to auto-fit cards with minimum width
super().__init__(
auto_fit=True, name=name, id=id, classes=classes, disabled=disabled, markup=markup
)
class SplitPane(Container):
"""Resizable split pane layout."""
DEFAULT_CSS: str = """
SplitPane {
layout: horizontal;
height: 100%;
}
SplitPane.vertical {
layout: vertical;
}
SplitPane .left-pane,
SplitPane .top-pane {
width: 50%;
height: 50%;
background: $surface;
border-right: solid $border;
border-bottom: solid $border;
}
SplitPane .right-pane,
SplitPane .bottom-pane {
width: 50%;
height: 50%;
background: $surface;
}
SplitPane .splitter {
width: 1;
height: 1;
background: $border;
}
SplitPane.vertical .splitter {
width: 100%;
height: 1;
}
"""
def __init__(
self,
left_content: Widget,
right_content: Widget,
vertical: bool = False,
split_ratio: float = 0.5,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
disabled: bool = False,
markup: bool = True,
) -> None:
"""Initialize split pane."""
super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)
self._left_content: Widget = left_content
self._right_content: Widget = right_content
self._vertical: bool = vertical
self._split_ratio: float = split_ratio
@override
def compose(self) -> ComposeResult:
"""Compose split pane layout."""
if self._vertical:
cast(Widget, self).add_class("vertical")
pane_classes = (
("top-pane", "bottom-pane") if self._vertical else ("left-pane", "right-pane")
)
yield Container(self._left_content, classes=pane_classes[0])
yield Static("", classes="splitter")
yield Container(self._right_content, classes=pane_classes[1])
def on_mount(self) -> None:
"""Apply split ratio."""
widget = cast(Widget, self)
if self._vertical:
widget.query_one(".top-pane").styles.height = f"{self._split_ratio * 100}%"
widget.query_one(".bottom-pane").styles.height = f"{(1 - self._split_ratio) * 100}%"
else:
widget.query_one(".left-pane").styles.width = f"{self._split_ratio * 100}%"
widget.query_one(".right-pane").styles.width = f"{(1 - self._split_ratio) * 100}%"
</file>
<file path="ingest_pipeline/cli/main.py">
"""CLI interface for ingestion pipeline."""
import asyncio
from typing import Annotated
import typer
from pydantic import SecretStr
from rich.console import Console
from rich.panel import Panel
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
from rich.table import Table
from ..config import configure_prefect, get_settings
from ..core.models import (
IngestionResult,
IngestionSource,
StorageBackend,
StorageConfig,
)
from ..flows.ingestion import create_ingestion_flow
from ..flows.scheduler import create_scheduled_deployment, serve_deployments
app = typer.Typer(
name="ingest",
help="🚀 Modern Document Ingestion Pipeline - Advanced web and repository processing",
rich_markup_mode="rich",
add_completion=False,
)
console = Console()
@app.callback()
def main(
version: Annotated[
bool, typer.Option("--version", "-v", help="Show version information")
] = False,
) -> None:
"""
🚀 Modern Document Ingestion Pipeline
[bold cyan]Advanced document processing and management platform[/bold cyan]
Features:
• 🌐 Web scraping and crawling with Firecrawl
• 📦 Repository ingestion with Repomix
• 🗄️ Multiple storage backends (Weaviate, OpenWebUI, R2R)
• 📊 Modern TUI for collection management
• ⚡ Async processing with Prefect orchestration
• 🎨 Rich CLI with enhanced visuals
"""
settings = get_settings()
configure_prefect(settings)
if version:
console.print(
Panel(
(
"[bold magenta]Ingest Pipeline v0.1.0[/bold magenta]\n"
"[dim]Modern Document Ingestion & Management System[/dim]"
),
title="🚀 Version Info",
border_style="magenta",
)
)
raise typer.Exit()
@app.command()
def ingest(
source_url: Annotated[str, typer.Argument(help="URL or path to ingest from")],
source_type: Annotated[
IngestionSource, typer.Option("--type", "-t", help="Type of source")
] = IngestionSource.WEB,
storage: Annotated[
StorageBackend, typer.Option("--storage", "-s", help="Storage backend")
] = StorageBackend.WEAVIATE,
collection: Annotated[
str | None,
typer.Option(
"--collection", "-c", help="Target collection name (auto-generated if not specified)"
),
] = None,
validate: Annotated[
bool, typer.Option("--validate/--no-validate", help="Validate source before ingesting")
] = True,
) -> None:
"""
🚀 Run a one-time ingestion job with enhanced progress tracking.
This command processes documents from various sources and stores them in
your chosen backend with full progress visualization.
"""
# Enhanced startup message
console.print(
Panel(
(
f"[bold cyan]🚀 Starting Modern Ingestion[/bold cyan]\n\n"
f"[yellow]Source:[/yellow] {source_url}\n"
f"[yellow]Type:[/yellow] {source_type.value.title()}\n"
f"[yellow]Storage:[/yellow] {storage.value.replace('_', ' ').title()}\n"
f"[yellow]Collection:[/yellow] {collection or '[dim]Auto-generated[/dim]'}"
),
title="🔥 Ingestion Configuration",
border_style="cyan",
)
)
async def run_with_progress() -> IngestionResult:
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
console=console,
) as progress:
task = progress.add_task("🔄 Processing documents...", total=100)
# Simulate progress updates during ingestion
progress.update(task, advance=20, description="🔗 Connecting to services...")
await asyncio.sleep(0.5)
progress.update(task, advance=30, description="📄 Fetching documents...")
result = await run_ingestion(
url=source_url,
source_type=source_type,
storage_backend=storage,
collection_name=collection,
validate_first=validate,
)
progress.update(task, advance=50, description="✅ Ingestion complete!")
return result
# Use asyncio.run() with proper event loop handling
try:
result = asyncio.run(run_with_progress())
except RuntimeError as e:
if "asyncio.run() cannot be called from a running event loop" in str(e):
# If we're already in an event loop (e.g., in Jupyter), use nest_asyncio
try:
import nest_asyncio
nest_asyncio.apply()
result = asyncio.run(run_with_progress())
            except ImportError:
                # Fallback: nest_asyncio is unavailable, and run_until_complete()
                # on the already-running loop would raise as well, so run the
                # coroutine on a fresh event loop in a worker thread.
                import concurrent.futures

                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                    result = pool.submit(asyncio.run, run_with_progress()).result()
else:
raise
# Enhanced results display
status_color = "green" if result.status.value == "completed" else "red"
# Create results table with enhanced styling
table = Table(
title="📊 Ingestion Results",
title_style="bold magenta",
border_style="cyan",
header_style="bold blue",
)
table.add_column("📋 Metric", style="cyan", no_wrap=True)
table.add_column("📈 Value", style=status_color, justify="right")
# Add enhanced status icon
status_icon = "✅" if result.status.value == "completed" else "❌"
table.add_row("Status", f"{status_icon} {result.status.value.title()}")
table.add_row("Documents Processed", f"📄 {result.documents_processed:,}")
table.add_row("Documents Failed", f"⚠️ {result.documents_failed:,}")
table.add_row("Duration", f"⏱️ {result.duration_seconds:.2f}s")
if result.error_messages:
error_text = "\n".join(f"❌ {error}" for error in result.error_messages[:3])
if len(result.error_messages) > 3:
error_text += f"\n... and {len(result.error_messages) - 3} more errors"
table.add_row("Errors", error_text)
console.print(table)
# Success celebration or error guidance
if result.status.value == "completed" and result.documents_processed > 0:
console.print(
Panel(
(
f"🎉 [bold green]Success![/bold green] {result.documents_processed} documents ingested\n\n"
f"💡 [dim]Try '[bold cyan]ingest modern[/bold cyan]' to explore your collections![/dim]"
),
title="✨ Ingestion Complete",
border_style="green",
)
)
elif result.error_messages:
console.print(
Panel(
(
"❌ [bold red]Ingestion encountered errors[/bold red]\n\n"
"💡 [dim]Check your configuration and try again[/dim]"
),
title="⚠️ Issues Detected",
border_style="red",
)
)
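# Invocation sketch (assumes the package's __main__ exposes this Typer app and
# that IngestionSource.WEB serializes to "web"):
#
#     python -m ingest_pipeline ingest https://docs.example.com \
#         --type web --storage weaviate --collection docs_example_com_web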
@app.command()
def schedule(
name: Annotated[str, typer.Argument(help="Deployment name")],
source_url: Annotated[str, typer.Argument(help="URL or path to ingest from")],
source_type: Annotated[
IngestionSource, typer.Option("--type", "-t", help="Type of source")
] = IngestionSource.WEB,
storage: Annotated[
StorageBackend, typer.Option("--storage", "-s", help="Storage backend")
] = StorageBackend.WEAVIATE,
cron: Annotated[
str | None, typer.Option("--cron", "-c", help="Cron expression for scheduling")
] = None,
interval: Annotated[int, typer.Option("--interval", "-i", help="Interval in minutes")] = 60,
serve_now: Annotated[
bool, typer.Option("--serve/--no-serve", help="Start serving immediately")
] = False,
) -> None:
"""
Create a scheduled deployment for recurring ingestion.
"""
console.print(f"[bold blue]Creating deployment: {name}[/bold blue]")
deployment = create_scheduled_deployment(
name=name,
source_url=source_url,
source_type=source_type,
storage_backend=storage,
schedule_type="cron" if cron else "interval",
cron_expression=cron,
interval_minutes=interval,
)
console.print(f"[green]✓ Deployment '{name}' created[/green]")
if serve_now:
console.print("[yellow]Starting deployment server...[/yellow]")
serve_deployments([deployment])
@app.command()
def serve(
config_file: Annotated[
str | None, typer.Option("--config", "-c", help="Path to deployments config file")
] = None,
ui: Annotated[
str | None, typer.Option("--ui", help="Launch user interface (options: tui, web)")
] = None,
) -> None:
"""
🚀 Serve configured deployments with optional UI interface.
Launch the deployment server to run scheduled ingestion jobs,
optionally with a modern Terminal User Interface (TUI) or web interface.
"""
# Handle UI mode first
if ui == "tui":
console.print(
Panel(
(
"[bold cyan]🚀 Launching Enhanced TUI[/bold cyan]\n\n"
"[yellow]Features:[/yellow]\n"
"• 📊 Interactive collection management\n"
"• ⌨️ Enhanced keyboard navigation\n"
"• 🎨 Modern design with focus indicators\n"
"• 📄 Document browsing and search\n"
"• 🔄 Real-time status updates"
),
title="🎉 TUI Mode",
border_style="cyan",
)
)
from .tui import dashboard
dashboard()
return
elif ui == "web":
console.print("[red]Web UI not yet implemented. Use --ui tui for Terminal UI.[/red]")
return
elif ui:
console.print(f"[red]Unknown UI option: {ui}[/red]")
console.print("[yellow]Available options: tui, web[/yellow]")
return
# Normal deployment server mode
if config_file:
# Load deployments from config
console.print(f"[yellow]Loading deployments from {config_file}[/yellow]")
# Implementation would load YAML/JSON config
else:
# Create example deployments
deployments = [
create_scheduled_deployment(
name="docs-daily",
source_url="https://docs.example.com",
source_type="documentation",
storage_backend="weaviate",
schedule_type="cron",
cron_expression="0 2 * * *", # Daily at 2 AM
),
create_scheduled_deployment(
name="repo-hourly",
source_url="https://github.com/example/repo",
source_type="repository",
storage_backend="open_webui",
schedule_type="interval",
interval_minutes=60,
),
]
console.print(
"[bold green]Starting deployment server with example deployments[/bold green]"
)
serve_deployments(deployments)
@app.command()
def tui() -> None:
"""
🚀 Launch the enhanced Terminal User Interface.
Quick shortcut for 'serve --ui tui' with modern keyboard navigation,
interactive collection management, and real-time status updates.
"""
console.print(
Panel(
(
"[bold cyan]🚀 Launching Enhanced TUI[/bold cyan]\n\n"
"[yellow]Features:[/yellow]\n"
"• 📊 Interactive collection management\n"
"• ⌨️ Enhanced keyboard navigation\n"
"• 🎨 Modern design with focus indicators\n"
"• 📄 Document browsing and search\n"
"• 🔄 Real-time status updates"
),
title="🎉 TUI Mode",
border_style="cyan",
)
)
from .tui import dashboard
dashboard()
@app.command()
def config() -> None:
"""
📋 Display current configuration with enhanced formatting.
Shows all configured endpoints, models, and settings in a beautiful
table format with status indicators.
"""
settings = get_settings()
console.print(
Panel(
(
"[bold cyan]⚙️ System Configuration[/bold cyan]\n"
"[dim]Current pipeline settings and endpoints[/dim]"
),
title="🔧 Configuration",
border_style="cyan",
)
)
# Enhanced configuration table
table = Table(
title="📊 Configuration Details",
title_style="bold magenta",
border_style="blue",
header_style="bold cyan",
show_lines=True,
)
table.add_column("🏷️ Setting", style="cyan", no_wrap=True, width=25)
table.add_column("🎯 Value", style="yellow", overflow="fold")
table.add_column("📊 Status", style="green", width=12, justify="center")
# Add configuration rows with status indicators
def get_status_indicator(value: str | None) -> str:
return "✅ Set" if value else "❌ Missing"
table.add_row("🤖 LLM Endpoint", str(settings.llm_endpoint), "✅ Active")
table.add_row("🔥 Firecrawl Endpoint", str(settings.firecrawl_endpoint), "✅ Active")
table.add_row(
"🗄️ Weaviate Endpoint",
str(settings.weaviate_endpoint),
get_status_indicator(str(settings.weaviate_api_key) if settings.weaviate_api_key else None),
)
table.add_row(
"🌐 OpenWebUI Endpoint",
str(settings.openwebui_endpoint),
get_status_indicator(settings.openwebui_api_key),
)
table.add_row("🧠 Embedding Model", settings.embedding_model, "✅ Set")
table.add_row("💾 Default Storage", settings.default_storage_backend.title(), "✅ Set")
table.add_row("📦 Default Batch Size", f"{settings.default_batch_size:,}", "✅ Set")
table.add_row("⚡ Max Concurrent Tasks", f"{settings.max_concurrent_tasks}", "✅ Set")
console.print(table)
# Additional helpful information
console.print(
Panel(
(
"💡 [bold cyan]Quick Tips[/bold cyan]\n\n"
"• Use '[bold]ingest list-collections[/bold]' to view all collections\n"
"• Use '[bold]ingest search[/bold]' to search content\n"
"• Configure API keys in your [yellow].env[/yellow] file\n"
"• Default collection names are auto-generated from URLs"
),
title="🚀 Usage Tips",
border_style="green",
)
)
@app.command()
def list_collections() -> None:
"""
📋 List all collections across storage backends.
"""
console.print("[bold cyan]📚 Collection Overview[/bold cyan]")
asyncio.run(run_list_collections())
@app.command()
def search(
query: Annotated[str, typer.Argument(help="Search query")],
collection: Annotated[
str | None, typer.Option("--collection", "-c", help="Target collection")
] = None,
backend: Annotated[
StorageBackend, typer.Option("--backend", "-b", help="Storage backend")
] = StorageBackend.WEAVIATE,
limit: Annotated[int, typer.Option("--limit", "-l", help="Result limit")] = 10,
) -> None:
"""
🔍 Search across collections.
"""
console.print(f"[bold cyan]🔍 Searching for: {query}[/bold cyan]")
asyncio.run(run_search(query, collection, backend.value, limit))
@app.command(name="blocks")
def blocks_command() -> None:
"""🧩 List and manage Prefect Blocks."""
console.print("[bold cyan]📦 Prefect Blocks Management[/bold cyan]")
console.print(
"Use 'prefect block register --module ingest_pipeline.core.models' to register custom blocks"
)
console.print("Use 'prefect block ls' to list available blocks")
@app.command(name="variables")
def variables_command() -> None:
"""📊 Manage Prefect Variables."""
console.print("[bold cyan]📊 Prefect Variables Management[/bold cyan]")
console.print("Use 'prefect variable set VARIABLE_NAME value' to set variables")
console.print("Use 'prefect variable ls' to list variables")
async def run_ingestion(
url: str,
source_type: IngestionSource,
storage_backend: StorageBackend,
collection_name: str | None = None,
validate_first: bool = True,
) -> IngestionResult:
"""
Run ingestion with support for targeted collections.
"""
# Auto-generate collection name if not provided
if not collection_name:
from urllib.parse import urlparse
parsed = urlparse(url)
domain = parsed.netloc.replace(".", "_").replace("-", "_")
collection_name = f"{domain}_{source_type.value}"
return await create_ingestion_flow(
source_url=url,
source_type=source_type,
storage_backend=storage_backend,
collection_name=collection_name,
validate_first=validate_first,
)
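# Worked example of the auto-generated name above (assuming IngestionSource.WEB
# has the value "web"): "https://docs.example.com" has netloc
# "docs.example.com", which becomes "docs_example_com_web".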
async def run_list_collections() -> None:
"""
List collections across storage backends.
"""
from ..config import get_settings
from ..core.models import StorageBackend
from ..storage.openwebui import OpenWebUIStorage
from ..storage.weaviate import WeaviateStorage
settings = get_settings()
console.print("🔍 [bold cyan]Scanning storage backends...[/bold cyan]")
# Try to connect to Weaviate
weaviate_collections: list[tuple[str, int]] = []
try:
weaviate_config = StorageConfig(
backend=StorageBackend.WEAVIATE,
endpoint=settings.weaviate_endpoint,
api_key=SecretStr(settings.weaviate_api_key)
if settings.weaviate_api_key is not None
else None,
collection_name="default",
)
weaviate = WeaviateStorage(weaviate_config)
await weaviate.initialize()
overview = await weaviate.describe_collections()
for item in overview:
name = str(item.get("name", "Unknown"))
count_val = item.get("count", 0)
count = int(count_val) if isinstance(count_val, (int, str)) else 0
weaviate_collections.append((name, count))
except Exception as e:
console.print(f"❌ [red]Weaviate connection failed: {e}[/red]")
# Try to connect to OpenWebUI
openwebui_collections: list[tuple[str, int]] = []
try:
openwebui_config = StorageConfig(
backend=StorageBackend.OPEN_WEBUI,
endpoint=settings.openwebui_endpoint,
api_key=SecretStr(settings.openwebui_api_key)
if settings.openwebui_api_key is not None
else None,
collection_name="default",
)
openwebui = OpenWebUIStorage(openwebui_config)
await openwebui.initialize()
overview = await openwebui.describe_collections()
for item in overview:
name = str(item.get("name", "Unknown"))
count_val = item.get("count", 0)
count = int(count_val) if isinstance(count_val, (int, str)) else 0
openwebui_collections.append((name, count))
except Exception as e:
console.print(f"❌ [red]OpenWebUI connection failed: {e}[/red]")
# Display results
if weaviate_collections or openwebui_collections:
# Create results table
from rich.table import Table
table = Table(
title="📚 Collection Overview",
title_style="bold magenta",
border_style="cyan",
header_style="bold blue",
)
table.add_column("🏷️ Collection", style="cyan", no_wrap=True)
table.add_column("📊 Backend", style="yellow")
table.add_column("📄 Documents", style="green", justify="right")
# Add Weaviate collections
for name, count in weaviate_collections:
table.add_row(name, "🗄️ Weaviate", f"{count:,}")
# Add OpenWebUI collections
for name, count in openwebui_collections:
table.add_row(name, "🌐 OpenWebUI", f"{count:,}")
console.print(table)
else:
console.print("❌ [yellow]No collections found in any backend[/yellow]")
async def run_search(query: str, collection: str | None, backend: str, limit: int) -> None:
"""
Search across collections.
"""
from ..config import get_settings
from ..core.models import StorageBackend
from ..storage.weaviate import WeaviateStorage
settings = get_settings()
console.print(f"🔍 Searching for: '[bold cyan]{query}[/bold cyan]'")
if collection:
console.print(f"📚 Target collection: [yellow]{collection}[/yellow]")
console.print(f"💾 Backend: [blue]{backend}[/blue]")
results = []
try:
if backend == "weaviate":
weaviate_config = StorageConfig(
backend=StorageBackend.WEAVIATE,
endpoint=settings.weaviate_endpoint,
api_key=SecretStr(settings.weaviate_api_key)
if settings.weaviate_api_key is not None
else None,
collection_name=collection or "default",
)
weaviate = WeaviateStorage(weaviate_config)
await weaviate.initialize()
results_generator = weaviate.search(query, limit=limit)
async for doc in results_generator:
results.append(
{
"title": getattr(doc, "title", "Untitled"),
"content": getattr(doc, "content", ""),
"score": getattr(doc, "score", 0.0),
"backend": "🗄️ Weaviate",
}
)
elif backend == "open_webui":
console.print("❌ [red]OpenWebUI search not yet implemented[/red]")
return
except Exception as e:
console.print(f"❌ [red]Search failed: {e}[/red]")
return
# Display results
if results:
from rich.table import Table
table = Table(
title=f"🔍 Search Results for '{query}'",
title_style="bold magenta",
border_style="green",
header_style="bold blue",
)
table.add_column("📄 Title", style="cyan", max_width=40)
table.add_column("📝 Preview", style="white", max_width=60)
table.add_column("📊 Score", style="yellow", justify="right")
for result in results[:limit]:
title = str(result["title"])
title_display = title[:40] + "..." if len(title) > 40 else title
content = str(result["content"])
content_display = content[:60] + "..." if len(content) > 60 else content
score = f"{result['score']:.3f}"
table.add_row(title_display, content_display, score)
console.print(table)
console.print(f"\n✅ [green]Found {len(results)} results[/green]")
else:
console.print("❌ [yellow]No results found[/yellow]")
if __name__ == "__main__":
app()
</file>
<file path="ingest_pipeline/config/__init__.py">
"""Configuration management utilities."""
from __future__ import annotations
from contextlib import ExitStack
from prefect.settings import Setting, temporary_settings
# Import Prefect settings with version compatibility - avoid static analysis issues
def _setup_prefect_settings() -> tuple[object, object, object]:
"""Setup Prefect settings with proper fallbacks."""
try:
import prefect.settings as ps
# Try to get the settings directly
api_key = getattr(ps, "PREFECT_API_KEY", None)
api_url = getattr(ps, "PREFECT_API_URL", None)
work_pool = getattr(ps, "PREFECT_DEFAULT_WORK_POOL_NAME", None)
if api_key is not None:
return api_key, api_url, work_pool
# Fallback to registry-based approach
registry = getattr(ps, "PREFECT_SETTING_REGISTRY", None)
if registry is not None:
Setting = getattr(ps, "Setting", None)
if Setting is not None:
api_key = registry.get("PREFECT_API_KEY") or Setting(
"PREFECT_API_KEY", type_=str, default=None
)
api_url = registry.get("PREFECT_API_URL") or Setting(
"PREFECT_API_URL", type_=str, default=None
)
work_pool = registry.get("PREFECT_DEFAULT_WORK_POOL_NAME") or Setting(
"PREFECT_DEFAULT_WORK_POOL_NAME", type_=str, default=None
)
return api_key, api_url, work_pool
except ImportError:
pass
# Ultimate fallback
return None, None, None
PREFECT_API_KEY, PREFECT_API_URL, PREFECT_DEFAULT_WORK_POOL_NAME = _setup_prefect_settings()
# Import after Prefect settings setup to avoid circular dependencies
from .settings import Settings, get_settings # noqa: E402
__all__ = ["Settings", "get_settings", "configure_prefect"]
_prefect_settings_stack: ExitStack | None = None
def configure_prefect(settings: Settings) -> None:
"""Apply Prefect settings from the application configuration."""
global _prefect_settings_stack
overrides: dict[Setting, str] = {}
if (
settings.prefect_api_url is not None
and PREFECT_API_URL is not None
and isinstance(PREFECT_API_URL, Setting)
):
overrides[PREFECT_API_URL] = str(settings.prefect_api_url)
if (
settings.prefect_api_key
and PREFECT_API_KEY is not None
and isinstance(PREFECT_API_KEY, Setting)
):
overrides[PREFECT_API_KEY] = settings.prefect_api_key
if (
settings.prefect_work_pool
and PREFECT_DEFAULT_WORK_POOL_NAME is not None
and isinstance(PREFECT_DEFAULT_WORK_POOL_NAME, Setting)
):
overrides[PREFECT_DEFAULT_WORK_POOL_NAME] = settings.prefect_work_pool
if not overrides:
return
filtered_overrides = {
setting: value for setting, value in overrides.items() if setting.value() != value
}
if not filtered_overrides:
return
new_stack = ExitStack()
new_stack.enter_context(temporary_settings(updates=filtered_overrides))
if _prefect_settings_stack is not None:
_prefect_settings_stack.close()
_prefect_settings_stack = new_stack
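# Usage sketch: the CLI applies this once at startup (see cli/main.py):
#
#     settings = get_settings()
#     configure_prefect(settings)
#
# Calling it again with changed settings swaps in a fresh temporary_settings
# context and closes the previous one via the module-level ExitStack.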
</file>
<file path="ingest_pipeline/ingestors/base.py">
"""Base ingestor interface."""
from __future__ import annotations
from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator
from typing import TYPE_CHECKING
from ..core.models import Document, IngestionJob
if TYPE_CHECKING:
from ..storage.base import BaseStorage
class BaseIngestor(ABC):
"""Abstract base class for all ingestors."""
@abstractmethod
def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
"""
Ingest data from a source.
Args:
job: The ingestion job configuration
Yields:
Documents from the source
"""
... # pragma: no cover
@abstractmethod
async def validate_source(self, source_url: str) -> bool:
"""
Validate if the source is accessible.
Args:
source_url: URL or path to the source
Returns:
True if source is valid and accessible
"""
pass # pragma: no cover
@abstractmethod
async def estimate_size(self, source_url: str) -> int:
"""
Estimate the number of documents in the source.
Args:
source_url: URL or path to the source
Returns:
Estimated number of documents
"""
pass # pragma: no cover
async def ingest_with_dedup(
self,
job: IngestionJob,
storage_client: BaseStorage,
*,
collection_name: str | None = None,
stale_after_days: int = 30,
) -> AsyncGenerator[Document, None]:
"""
Ingest documents with duplicate detection (optional optimization).
Default implementation falls back to regular ingestion.
Subclasses can override to provide optimized deduplication.
Args:
job: The ingestion job configuration
storage_client: Storage client to check for existing documents
collection_name: Collection to check for duplicates
stale_after_days: Consider documents stale after this many days
Yields:
Documents from the source (with deduplication if implemented)
"""
# Default implementation: fall back to regular ingestion
async for document in self.ingest(job):
yield document
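# --- Editor's sketch below: a minimal concrete ingestor (illustrative only,
# not one of the pipeline's real ingestors such as the Firecrawl one). ---
class _NullIngestor(BaseIngestor):
    """Smallest possible BaseIngestor: validates and estimates, yields nothing."""

    async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
        # A real implementation would read the job's source and yield Documents.
        return
        yield  # unreachable; marks this function as an async generator

    async def validate_source(self, source_url: str) -> bool:
        return source_url.startswith(("http://", "https://"))

    async def estimate_size(self, source_url: str) -> int:
        return 0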
</file>
<file path="ingest_pipeline/cli/tui/screens/documents.py">
"""Document management screen with enhanced navigation."""
from datetime import datetime
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Horizontal, ScrollableContainer
from textual.screen import ModalScreen, Screen
from textual.widgets import Button, Footer, Header, Label, LoadingIndicator, Markdown, Static
from typing_extensions import override
from ....storage.base import BaseStorage
from ..models import CollectionInfo, DocumentInfo
from ..widgets import EnhancedDataTable
class DocumentManagementScreen(Screen[None]):
"""Screen for managing documents within a collection with enhanced keyboard navigation."""
collection: CollectionInfo
storage: BaseStorage | None
documents: list[DocumentInfo]
selected_docs: set[str]
current_offset: int
page_size: int
BINDINGS = [
Binding("escape", "app.pop_screen", "Back"),
Binding("r", "refresh", "Refresh"),
Binding("v", "view_document", "View"),
Binding("delete", "delete_selected", "Delete Selected"),
Binding("a", "select_all", "Select All"),
Binding("ctrl+a", "select_all", "Select All"),
Binding("n", "select_none", "Clear Selection"),
Binding("ctrl+shift+a", "select_none", "Clear Selection"),
Binding("space", "toggle_selection", "Toggle Selection"),
Binding("ctrl+d", "delete_selected", "Delete Selected"),
Binding("pageup", "prev_page", "Previous Page"),
Binding("pagedown", "next_page", "Next Page"),
Binding("home", "first_page", "First Page"),
Binding("end", "last_page", "Last Page"),
]
def __init__(self, collection: CollectionInfo, storage: BaseStorage | None):
super().__init__()
self.collection = collection
self.storage = storage
self.documents: list[DocumentInfo] = []
self.selected_docs: set[str] = set()
self.current_offset = 0
self.page_size = 50
@override
def compose(self) -> ComposeResult:
yield Header()
yield Container(
Static(f"📄 Document Management: {self.collection['name']}", classes="title"),
Static(
f"Total Documents: {self.collection['count']:,} | Use Space to select, Delete to remove",
classes="subtitle",
),
Label(f"Page size: {self.page_size} documents"),
EnhancedDataTable(id="documents_table", classes="enhanced-table"),
Horizontal(
Button("🔄 Refresh", id="refresh_docs_btn", variant="primary"),
Button("🗑️ Delete Selected", id="delete_selected_btn", variant="error"),
Button("✅ Select All", id="select_all_btn", variant="default"),
Button("❌ Clear Selection", id="clear_selection_btn", variant="default"),
Button("⬅️ Previous Page", id="prev_page_btn", variant="default"),
Button("➡️ Next Page", id="next_page_btn", variant="default"),
classes="button_bar",
),
Label("", id="selection_status"),
Static("", id="page_info", classes="status-text"),
LoadingIndicator(id="loading"),
classes="main_container",
)
yield Footer()
async def on_mount(self) -> None:
"""Initialize the screen."""
self.query_one("#loading").display = False
# Setup documents table with enhanced columns
table = self.query_one("#documents_table", EnhancedDataTable)
table.add_columns(
"✓", "Title", "Source URL", "Description", "Type", "Words", "Timestamp", "ID"
)
# Set up message handling for table events
table.can_focus = True
await self.load_documents()
async def load_documents(self) -> None:
"""Load documents from the collection."""
loading = self.query_one("#loading")
loading.display = True
try:
if self.storage:
# Try to load documents using the storage backend
try:
raw_docs = await self.storage.list_documents(
limit=self.page_size,
offset=self.current_offset,
collection_name=self.collection["name"],
)
# Cast to proper type with type checking
self.documents = [
DocumentInfo(
id=str(doc.get("id", f"doc_{i}")),
title=str(doc.get("title", "Untitled Document")),
source_url=str(doc.get("source_url", "")),
description=str(doc.get("description", "")),
content_type=str(doc.get("content_type", "text/plain")),
content_preview=str(doc.get("content_preview", "")),
                            word_count=(
                                int(doc.get("word_count", 0))
                                if str(doc.get("word_count", 0)).isdigit()
                                else 0
                            ),
timestamp=str(doc.get("timestamp", "")),
)
for i, doc in enumerate(raw_docs)
]
except NotImplementedError:
# For storage backends that don't support document listing, show a message
self.notify(
f"Document listing not supported for {self.storage.__class__.__name__}",
severity="information",
)
self.documents = []
await self.update_table()
self.update_selection_status()
self.update_page_info()
except Exception as e:
self.notify(f"Error loading documents: {e}", severity="error", markup=False)
finally:
loading.display = False
async def update_table(self) -> None:
"""Update the documents table with enhanced metadata display."""
table = self.query_one("#documents_table", EnhancedDataTable)
table.clear(columns=True)
# Add enhanced columns with more metadata
table.add_columns(
"✓", "Title", "Source URL", "Description", "Type", "Words", "Timestamp", "ID"
)
# Add rows with enhanced metadata
for doc in self.documents:
selected = "✓" if doc["id"] in self.selected_docs else ""
# Get additional metadata from the raw docs
description = str(doc.get("description") or "").strip()[:40]
if not description:
description = "[dim]No description[/dim]"
elif len(str(doc.get("description") or "")) > 40:
description += "..."
# Format content type with appropriate icon
content_type = doc.get("content_type", "text/plain")
if "markdown" in content_type.lower():
type_display = "📝 md"
elif "html" in content_type.lower():
type_display = "🌐 html"
elif "text" in content_type.lower():
type_display = "📄 txt"
else:
type_display = f"📄 {content_type.split('/')[-1][:5]}"
# Format timestamp to be more readable
timestamp = doc.get("timestamp", "")
if timestamp:
try:
# Parse ISO format timestamp
dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
timestamp = dt.strftime("%m/%d %H:%M")
except Exception:
timestamp = str(timestamp)[:16] # Fallback
table.add_row(
selected,
doc.get("title", "Untitled")[:40],
doc.get("source_url", "")[:35],
description,
type_display,
str(doc.get("word_count", 0)),
timestamp,
doc["id"][:8] + "...", # Show truncated ID
)
def update_selection_status(self) -> None:
"""Update the selection status label."""
status_label = self.query_one("#selection_status", Label)
total_selected = len(self.selected_docs)
status_label.update(f"Selected: {total_selected} documents")
def update_page_info(self) -> None:
"""Update the page information."""
page_info = self.query_one("#page_info", Static)
total_docs = self.collection["count"]
start = self.current_offset + 1
end = min(self.current_offset + len(self.documents), total_docs)
page_num = (self.current_offset // self.page_size) + 1
total_pages = (total_docs + self.page_size - 1) // self.page_size
page_info.update(
f"Showing {start:,}-{end:,} of {total_docs:,} documents (Page {page_num} of {total_pages})"
)
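    # Worked example: with count=1,203, page_size=50, and offset=100, the call
    # above renders "Showing 101-150 of 1,203 documents (Page 3 of 25)".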
def get_current_document(self) -> DocumentInfo | None:
"""Get the currently selected document."""
table = self.query_one("#documents_table", EnhancedDataTable)
try:
if 0 <= table.cursor_coordinate.row < len(self.documents):
return self.documents[table.cursor_coordinate.row]
except (AttributeError, IndexError):
pass
return None
# Action methods
def action_refresh(self) -> None:
"""Refresh the document list."""
self.run_worker(self.load_documents())
def action_toggle_selection(self) -> None:
"""Toggle selection of current row."""
if doc := self.get_current_document():
doc_id = doc["id"]
if doc_id in self.selected_docs:
self.selected_docs.remove(doc_id)
else:
self.selected_docs.add(doc_id)
self.run_worker(self.update_table())
self.update_selection_status()
def action_select_all(self) -> None:
"""Select all documents on current page."""
for doc in self.documents:
self.selected_docs.add(doc["id"])
self.run_worker(self.update_table())
self.update_selection_status()
def action_select_none(self) -> None:
"""Clear all selections."""
self.selected_docs.clear()
self.run_worker(self.update_table())
self.update_selection_status()
def action_delete_selected(self) -> None:
"""Delete selected documents."""
if self.selected_docs:
from .dialogs import ConfirmDocumentDeleteScreen
self.app.push_screen(
ConfirmDocumentDeleteScreen(list(self.selected_docs), self.collection, self)
)
else:
self.notify("No documents selected", severity="warning")
def action_next_page(self) -> None:
"""Go to next page."""
if self.current_offset + self.page_size < self.collection["count"]:
self.current_offset += self.page_size
self.run_worker(self.load_documents())
def action_prev_page(self) -> None:
"""Go to previous page."""
if self.current_offset >= self.page_size:
self.current_offset -= self.page_size
self.run_worker(self.load_documents())
def action_first_page(self) -> None:
"""Go to first page."""
if self.current_offset > 0:
self.current_offset = 0
self.run_worker(self.load_documents())
def action_last_page(self) -> None:
"""Go to last page."""
total_docs = self.collection["count"]
        last_offset = max(0, ((total_docs - 1) // self.page_size) * self.page_size)
if self.current_offset != last_offset:
self.current_offset = last_offset
self.run_worker(self.load_documents())
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "refresh_docs_btn":
self.action_refresh()
elif event.button.id == "delete_selected_btn":
self.action_delete_selected()
elif event.button.id == "select_all_btn":
self.action_select_all()
elif event.button.id == "clear_selection_btn":
self.action_select_none()
elif event.button.id == "next_page_btn":
self.action_next_page()
elif event.button.id == "prev_page_btn":
self.action_prev_page()
def on_enhanced_data_table_row_toggled(self, event: EnhancedDataTable.RowToggled) -> None:
"""Handle row toggle from enhanced table."""
if 0 <= event.row_index < len(self.documents):
doc = self.documents[event.row_index]
doc_id = doc["id"]
if doc_id in self.selected_docs:
self.selected_docs.remove(doc_id)
else:
self.selected_docs.add(doc_id)
self.run_worker(self.update_table())
self.update_selection_status()
def on_enhanced_data_table_select_all(self, event: EnhancedDataTable.SelectAll) -> None:
"""Handle select all from enhanced table."""
self.action_select_all()
def on_enhanced_data_table_clear_selection(
self, event: EnhancedDataTable.ClearSelection
) -> None:
"""Handle clear selection from enhanced table."""
self.action_select_none()
def action_view_document(self) -> None:
"""View the content of the currently selected document."""
if doc := self.get_current_document():
if self.storage:
self.app.push_screen(
DocumentContentModal(doc, self.storage, self.collection["name"])
)
else:
self.notify("No storage backend available", severity="error")
else:
self.notify("No document selected", severity="warning")
class DocumentContentModal(ModalScreen[None]):
"""Modal screen for viewing document content."""
DEFAULT_CSS = """
DocumentContentModal {
align: center middle;
}
DocumentContentModal > Container {
width: 90%;
height: 85%;
background: $surface;
border: thick $primary;
}
DocumentContentModal .modal-header {
background: $primary;
color: $text;
padding: 1;
dock: top;
height: 3;
}
DocumentContentModal .modal-content {
padding: 1;
height: 1fr;
}
"""
BINDINGS = [
Binding("escape", "app.pop_screen", "Close"),
Binding("q", "app.pop_screen", "Close"),
]
def __init__(self, document: DocumentInfo, storage: BaseStorage, collection_name: str):
super().__init__()
self.document = document
self.storage = storage
self.collection_name = collection_name
def compose(self) -> ComposeResult:
yield Container(
Static(
f"📄 Document: {self.document['title'][:60]}{'...' if len(self.document['title']) > 60 else ''}",
classes="modal-header",
),
ScrollableContainer(
Markdown("Loading document content...", id="document_content"),
LoadingIndicator(id="content_loading"),
classes="modal-content",
),
)
async def on_mount(self) -> None:
"""Load and display the document content."""
content_widget = self.query_one("#document_content", Markdown)
loading = self.query_one("#content_loading")
try:
# Get full document content
doc_content = await self.storage.retrieve(
self.document["id"], collection_name=self.collection_name
)
# Format content for display
if isinstance(doc_content, str):
formatted_content = f"""# {self.document["title"]}
**Source:** {self.document.get("source_url", "N/A")}
**Type:** {self.document.get("content_type", "text/plain")}
**Words:** {self.document.get("word_count", 0):,}
**Timestamp:** {self.document.get("timestamp", "N/A")}
---
{doc_content}
"""
else:
formatted_content = f"""# {self.document["title"]}
**Source:** {self.document.get("source_url", "N/A")}
**Type:** {self.document.get("content_type", "text/plain")}
**Words:** {self.document.get("word_count", 0):,}
**Timestamp:** {self.document.get("timestamp", "N/A")}
---
*Content format not supported for display*
"""
content_widget.update(formatted_content)
except Exception as e:
content_widget.update(
f"# Error Loading Document\n\nFailed to load document content: {e}"
)
finally:
loading.display = False
</file>
<file path="ingest_pipeline/cli/tui/utils/runners.py">
"""TUI runner functions and initialization."""
from __future__ import annotations
import asyncio
import logging
from logging import Logger
from logging.handlers import QueueHandler, RotatingFileHandler
from pathlib import Path
from queue import Queue
from typing import NamedTuple
import platformdirs
from ....config import configure_prefect, get_settings
from .storage_manager import StorageManager
class _TuiLoggingContext(NamedTuple):
"""Container describing configured logging outputs for the TUI."""
queue: Queue[logging.LogRecord]
formatter: logging.Formatter
log_file: Path | None
_logging_context: _TuiLoggingContext | None = None
def _configure_tui_logging(*, log_level: str) -> _TuiLoggingContext:
"""Configure logging so that messages do not break the TUI output."""
global _logging_context
if _logging_context is not None:
return _logging_context
resolved_level = getattr(logging, log_level.upper(), logging.INFO)
log_queue: Queue[logging.LogRecord] = Queue()
formatter = logging.Formatter(
fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
root_logger = logging.getLogger()
root_logger.setLevel(resolved_level)
    # Remove all existing handlers (including stream handlers) so console
    # output cannot bleed into and break the TUI display
for handler in list(root_logger.handlers):
root_logger.removeHandler(handler)
queue_handler = QueueHandler(log_queue)
queue_handler.setLevel(resolved_level)
root_logger.addHandler(queue_handler)
log_file: Path | None = None
try:
# Try current directory first for development
log_dir = Path.cwd() / "logs"
log_dir.mkdir(parents=True, exist_ok=True)
log_file = log_dir / "tui.log"
except OSError:
# Fall back to user log directory
try:
log_dir = Path(platformdirs.user_log_dir("ingest-pipeline", "ingest-pipeline"))
log_dir.mkdir(parents=True, exist_ok=True)
log_file = log_dir / "tui.log"
except OSError as exc:
fallback = logging.getLogger(__name__)
fallback.warning("Failed to create log directory, file logging disabled: %s", exc)
log_file = None
if log_file:
try:
file_handler = RotatingFileHandler(
log_file,
maxBytes=2_000_000,
backupCount=5,
encoding="utf-8",
)
file_handler.setLevel(resolved_level)
file_handler.setFormatter(formatter)
root_logger.addHandler(file_handler)
except OSError as exc:
fallback = logging.getLogger(__name__)
fallback.warning("Failed to configure file logging for TUI: %s", exc)
log_file = None
_logging_context = _TuiLoggingContext(log_queue, formatter, log_file)
return _logging_context
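# Sketch of consuming the returned queue (assumed usage, not wired up here):
# the TUI is expected to drain it, e.g. via a logging.handlers.QueueListener
# that forwards records to a widget-backed handler.
#
#     from logging.handlers import QueueListener
#
#     ctx = _configure_tui_logging(log_level="INFO")
#     listener = QueueListener(ctx.queue, my_widget_handler)  # my_widget_handler is hypothetical
#     listener.start()
#     ...
#     listener.stop()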
LOGGER: Logger = logging.getLogger(__name__)
async def run_textual_tui() -> None:
"""Run the enhanced modern TUI with better error handling and initialization."""
settings = get_settings()
configure_prefect(settings)
logging_context = _configure_tui_logging(log_level=settings.log_level)
LOGGER.info("Initializing collection management TUI")
LOGGER.info("Scanning available storage backends")
# Create storage manager without initialization - let TUI handle it asynchronously
storage_manager = StorageManager(settings)
LOGGER.info("Launching TUI - storage backends will initialize in background")
# Import here to avoid circular import
from ..app import CollectionManagementApp
app = CollectionManagementApp(
storage_manager,
None, # weaviate - will be available after initialization
None, # openwebui - will be available after initialization
None, # r2r_backend - will be available after initialization
log_queue=logging_context.queue,
log_formatter=logging_context.formatter,
log_file=logging_context.log_file,
)
try:
await app.run_async()
finally:
LOGGER.info("Shutting down storage connections")
await storage_manager.close_all()
LOGGER.info("All storage connections closed gracefully")
def dashboard() -> None:
"""Launch the modern collection dashboard."""
asyncio.run(run_textual_tui())
</file>
<file path="ingest_pipeline/cli/tui/styles.py">
"""Comprehensive theming system for TUI applications with WCAG AA accessibility compliance."""
from dataclasses import dataclass
from enum import Enum
from typing import Protocol
from textual.app import App
# Type alias for Textual apps with unknown return type
TextualApp = App[object]
class AppProtocol(Protocol):
"""Protocol for apps that support CSS and refresh."""
CSS: str
def refresh(self) -> None:
"""Refresh the app."""
...
class ThemeType(Enum):
"""Available theme types."""
DARK = "dark"
LIGHT = "light"
HIGH_CONTRAST = "high_contrast"
GITHUB_DARK = "github_dark"
@dataclass
class ColorPalette:
"""Color palette with WCAG AA compliant contrast ratios."""
# Background colors
bg_primary: str
bg_secondary: str
bg_tertiary: str
bg_elevated: str
# Text colors (all tested for WCAG AA compliance)
text_primary: str # 4.5:1+ contrast ratio
text_secondary: str # 4.5:1+ contrast ratio
text_tertiary: str # 4.5:1+ contrast ratio
text_inverse: str
# Semantic colors
primary: str
primary_hover: str
success: str
warning: str
error: str
info: str
# Interactive states
border_default: str
border_focus: str
border_hover: str
# Surface colors
surface_1: str
surface_2: str
surface_3: str
class ThemeRegistry:
"""Registry for managing application themes."""
@staticmethod
def get_enhanced_dark() -> ColorPalette:
"""Enhanced dark theme with superior contrast ratios."""
return ColorPalette(
# Backgrounds - darker for better contrast
bg_primary="#0a0c10",
bg_secondary="#151821",
bg_tertiary="#1f2329",
bg_elevated="#252932",
# Text - brighter for better visibility (WCAG AA compliant)
text_primary="#ffffff", # 21:1 contrast ratio
text_secondary="#e6edf3", # 14.8:1 contrast ratio
text_tertiary="#c9d1d9", # 9.6:1 contrast ratio
text_inverse="#0a0c10",
# Semantic colors - enhanced for visibility
primary="#1f6feb",
primary_hover="#388bfd",
success="#238636",
warning="#d29922",
error="#f85149",
info="#58a6ff",
# Interactive states
border_default="#444c56",
border_focus="#58a6ff",
border_hover="#58a6ff",
# Surface elevation
surface_1="#161b22",
surface_2="#21262d",
surface_3="#30363d",
)
@staticmethod
def get_light() -> ColorPalette:
"""Light theme with excellent readability."""
return ColorPalette(
# Backgrounds
bg_primary="#ffffff",
bg_secondary="#f6f8fa",
bg_tertiary="#f1f3f4",
bg_elevated="#ffffff",
# Text (WCAG AA compliant)
text_primary="#1f2328", # 12.6:1 contrast ratio
text_secondary="#424a53", # 7.1:1 contrast ratio
text_tertiary="#636c76", # 4.7:1 contrast ratio
text_inverse="#ffffff",
# Semantic colors
primary="#0969da",
primary_hover="#0860ca",
success="#1a7f37",
warning="#9a6700",
error="#d1242f",
info="#0969da",
# Interactive states
border_default="#d1d9e0",
border_focus="#fd7e14",
border_hover="#0969da",
# Surface elevation
surface_1="#f6f8fa",
surface_2="#eaeef2",
surface_3="#d1d9e0",
)
@staticmethod
def get_high_contrast() -> ColorPalette:
"""High contrast theme for maximum accessibility."""
return ColorPalette(
# Backgrounds
bg_primary="#000000",
bg_secondary="#1a1a1a",
bg_tertiary="#262626",
bg_elevated="#333333",
# Text (Maximum contrast)
text_primary="#ffffff", # 21:1 contrast ratio
text_secondary="#ffffff", # 21:1 contrast ratio
text_tertiary="#cccccc", # 11.8:1 contrast ratio
text_inverse="#000000",
# Semantic colors - high contrast variants
primary="#00aaff",
primary_hover="#66ccff",
success="#00ff00",
warning="#ffaa00",
error="#ff4444",
info="#00aaff",
# Interactive states
border_default="#666666",
border_focus="#ffff00",
border_hover="#ffffff",
# Surface elevation
surface_1="#1a1a1a",
surface_2="#333333",
surface_3="#4d4d4d",
)
@staticmethod
def get_github_dark() -> ColorPalette:
"""Enhanced GitHub dark theme with improved contrast."""
return ColorPalette(
# Backgrounds
bg_primary="#0d1117",
bg_secondary="#161b22",
bg_tertiary="#21262d",
bg_elevated="#2d333b",
# Text (Enhanced for better visibility)
text_primary="#f0f6fc", # 13.6:1 contrast ratio
text_secondary="#e6edf3", # 11.9:1 contrast ratio
text_tertiary="#c9d1d9", # 8.2:1 contrast ratio
text_inverse="#0d1117",
# Semantic colors
primary="#58a6ff",
primary_hover="#79c0ff",
success="#3fb950",
warning="#d29922",
error="#f85149",
info="#58a6ff",
# Interactive states
border_default="#30363d",
border_focus="#f78166",
border_hover="#58a6ff",
# Surface elevation
surface_1="#161b22",
surface_2="#21262d",
surface_3="#30363d",
)
class ThemeManager:
"""Manages theme selection and CSS generation."""
def __init__(self, default_theme: ThemeType = ThemeType.DARK):
self.current_theme: ThemeType = default_theme
self._themes: dict[ThemeType, ColorPalette] = {
ThemeType.DARK: ThemeRegistry.get_enhanced_dark(),
ThemeType.LIGHT: ThemeRegistry.get_light(),
ThemeType.HIGH_CONTRAST: ThemeRegistry.get_high_contrast(),
ThemeType.GITHUB_DARK: ThemeRegistry.get_github_dark(),
}
def set_theme(self, theme: ThemeType) -> None:
"""Switch to a different theme."""
self.current_theme = theme
def get_current_palette(self) -> ColorPalette:
"""Get the current theme's color palette."""
return self._themes[self.current_theme]
def generate_css(self) -> str:
"""Generate Textual CSS for the current theme."""
palette = self.get_current_palette()
return f"""
/* ===============================================
ENHANCED THEMING SYSTEM - {self.current_theme.value.upper()}
WCAG AA Compliant with Superior Text Visibility
=============================================== */
/* Base Application Styling */
Screen {{
background: {palette.bg_primary};
}}
* {{
color: {palette.text_primary};
}}
/* ===============================================
LAYOUT & CONTAINERS
=============================================== */
/* Enhanced title styling with superior contrast */
.title {{
text-align: center;
margin: 0;
color: {palette.text_primary};
text-style: bold;
background: {palette.bg_secondary};
padding: 0 1;
height: 3;
min-height: 3;
max-height: 3;
border: solid {palette.primary};
}}
.subtitle {{
text-align: center;
margin: 0;
color: {palette.text_secondary};
text-style: italic;
background: {palette.bg_secondary};
padding: 0 1;
height: 2;
min-height: 2;
max-height: 2;
}}
/* Main container with elevated surface */
.main_container {{
margin: 0;
padding: 1 0;
background: {palette.bg_secondary};
}}
/* Enhanced card components with better elevation */
.card {{
background: {palette.surface_2};
padding: 1;
margin: 0 1;
color: {palette.text_primary};
border: solid {palette.border_default};
height: auto;
min-height: 4;
}}
.card:focus-within {{
border: thick {palette.border_focus};
background: {palette.bg_elevated};
}}
/* ===============================================
INTERACTIVE ELEMENTS
=============================================== */
/* Base button with superior contrast */
Button {{
background: {palette.surface_2};
color: {palette.text_primary};
margin: 0 1;
border: solid {palette.border_default};
}}
Button:hover {{
background: {palette.surface_3};
color: {palette.text_primary};
border: solid {palette.border_hover};
}}
Button:focus {{
border: thick {palette.border_focus};
background: {palette.primary};
color: {palette.text_inverse};
}}
/* Semantic button variants with enhanced visibility */
Button.-primary {{
background: {palette.primary};
color: {palette.text_inverse};
border: solid {palette.primary};
}}
Button.-primary:hover {{
background: {palette.primary_hover};
border: solid {palette.primary_hover};
}}
Button.-primary:focus {{
border: thick {palette.border_focus};
background: {palette.primary_hover};
}}
Button.-success {{
background: {palette.success};
color: {palette.text_inverse};
border: solid {palette.success};
}}
Button.-success:hover {{
background: {palette.success};
opacity: 0.9;
}}
Button.-error {{
background: {palette.error};
color: {palette.text_inverse};
border: solid {palette.error};
}}
Button.-error:hover {{
background: {palette.error};
opacity: 0.9;
}}
Button.-warning {{
background: {palette.warning};
color: {palette.text_inverse};
border: solid {palette.warning};
}}
Button.-warning:hover {{
background: {palette.warning};
opacity: 0.9;
}}
/* ===============================================
DATA DISPLAY - ENHANCED READABILITY
=============================================== */
/* DataTable with superior contrast and accessibility */
DataTable {{
background: {palette.surface_2};
color: {palette.text_primary};
border: solid {palette.border_default};
}}
DataTable:focus {{
border: thick {palette.border_focus};
}}
DataTable > .datatable--header {{
background: {palette.bg_secondary};
color: {palette.primary};
text-style: bold;
}}
DataTable > .datatable--cursor {{
background: {palette.primary};
color: {palette.text_inverse};
}}
DataTable > .datatable--cursor-row {{
background: {palette.primary_hover};
color: {palette.text_inverse};
}}
DataTable > .datatable--row-odd {{
background: {palette.surface_2};
color: {palette.text_primary};
}}
DataTable > .datatable--row-even {{
background: {palette.bg_tertiary};
color: {palette.text_primary};
}}
/* ===============================================
FORM ELEMENTS - ACCESSIBLE INPUT DESIGN
=============================================== */
/* Enhanced input with superior visibility */
Input {{
background: {palette.surface_1};
color: {palette.text_primary};
border: solid {palette.border_default};
}}
Input:focus {{
border: thick {palette.border_focus};
background: {palette.bg_elevated};
color: {palette.text_primary};
}}
Input.-invalid {{
border: solid {palette.error};
background: {palette.surface_1};
}}
Input.-invalid:focus {{
border: thick {palette.error};
background: {palette.bg_elevated};
}}
/* ===============================================
NAVIGATION - ENHANCED CLARITY
=============================================== */
/* Header and Footer with improved contrast */
Header, Footer {{
background: {palette.bg_secondary};
color: {palette.text_primary};
border: solid {palette.border_default};
}}
/* Simple Tab styling to ensure text visibility */
Tab {{
color: {palette.text_primary};
background: {palette.surface_2};
}}
Tab:hover {{
color: {palette.text_primary};
background: {palette.surface_3};
}}
Tab:focus {{
color: {palette.text_primary};
background: {palette.bg_elevated};
}}
Tab.-active {{
background: {palette.primary};
color: {palette.text_inverse};
text-style: bold;
}}
/* ===============================================
TYPOGRAPHY - WCAG AA COMPLIANT
=============================================== */
/* Label hierarchy with enhanced readability */
Label {{
color: {palette.text_primary};
}}
.label-secondary {{
color: {palette.text_secondary};
}}
.label-muted {{
color: {palette.text_tertiary};
}}
/* ===============================================
STATUS INDICATORS - ENHANCED VISIBILITY
=============================================== */
/* Semantic status colors with superior contrast */
.status-active, .status-success {{
color: {palette.success};
text-style: bold;
}}
.status-error, .status-failed {{
color: {palette.error};
text-style: bold;
}}
.status-warning, .status-pending {{
color: {palette.warning};
text-style: bold;
}}
.status-info {{
color: {palette.info};
text-style: bold;
}}
.status-inactive, .status-disabled {{
color: {palette.text_tertiary};
}}
/* ===============================================
VISUAL EFFECTS - ACCESSIBLE ANIMATIONS
=============================================== */
/* Animation classes with accessibility considerations */
.pulse {{
text-style: blink;
}}
.glow {{
background: {palette.primary};
color: {palette.text_inverse};
}}
.shimmer {{
text-style: italic;
color: {palette.text_secondary};
}}
.highlight {{
background: {palette.border_focus};
color: {palette.text_inverse};
}}
/* ===============================================
METRICS - ENHANCED DASHBOARD VISIBILITY
=============================================== */
/* Enhanced metrics with superior readability */
.metrics-value {{
text-style: bold;
text-align: center;
color: {palette.primary};
height: 1;
margin: 0;
}}
.metrics-label {{
text-align: center;
color: {palette.text_primary};
text-style: bold;
height: 1;
margin: 0;
}}
.metrics-description {{
text-align: center;
color: {palette.text_secondary};
text-style: italic;
height: 1;
margin: 0;
}}
/* MetricsCard container optimization */
MetricsCard {{
background: {palette.surface_2};
border: solid {palette.border_default};
padding: 0 1;
margin: 0;
height: auto;
min-height: 3;
max-height: 5;
align: center middle;
}}
/* Section organization with enhanced hierarchy */
.section-title {{
text-style: bold;
color: {palette.primary};
margin: 0;
border-left: thick {palette.primary};
padding-left: 1;
height: auto;
min-height: 2;
max-height: 3;
}}
.section-subtitle {{
color: {palette.text_secondary};
text-style: italic;
margin: 0 0 1 0;
}}
/* ===============================================
LAYOUT SYSTEMS - IMPROVED READABILITY
=============================================== */
/* Enhanced text styling with better contrast */
.status-text {{
color: {palette.text_secondary};
text-align: center;
margin: 1 0;
text-style: italic;
}}
.help-text {{
color: {palette.text_tertiary};
text-style: italic;
}}
/* Button organization with enhanced backgrounds */
.button_bar {{
margin: 0;
background: {palette.bg_secondary};
padding: 1;
height: auto;
min-height: 5;
max-height: 6;
}}
.action_buttons {{
margin: 0;
text-align: center;
padding: 1;
height: auto;
background: {palette.surface_2};
border-top: solid {palette.border_default};
}}
/* Enhanced progress indicators */
.progress-label {{
color: {palette.text_primary};
margin: 1 0;
text-style: bold;
text-align: center;
}}
.progress-complete {{
color: {palette.success};
text-style: bold;
}}
.progress-error {{
color: {palette.error};
text-style: bold;
}}
/* ===============================================
RESPONSIVE GRID SYSTEMS
=============================================== */
/* Enhanced grid layouts */
.responsive-grid {{
grid-size: 4;
grid-gutter: 1;
background: {palette.bg_primary};
margin: 0;
padding: 0;
height: auto;
}}
.metrics-grid {{
grid-size: 4;
grid-gutter: 1;
margin: 0;
padding: 0;
background: {palette.bg_primary};
align: center middle;
height: auto;
min-height: 5;
max-height: 7;
}}
.analytics-grid {{
grid-size: 2;
grid-gutter: 1;
background: {palette.bg_primary};
}}
/* ===============================================
MODAL & OVERLAY - ENHANCED ACCESSIBILITY
=============================================== */
/* Accessible modal design */
IngestionScreen {{
align: center middle;
}}
.modal-container {{
background: {palette.surface_2};
border: thick {palette.primary};
padding: 1;
width: 90%;
height: 80%;
max-width: 80;
min-width: 40;
overflow-y: auto;
layout: vertical;
}}
/* Backend selection responsive layout */
.backend-selection {{
layout: horizontal;
padding: 1;
height: auto;
align: center middle;
}}
.backend-actions {{
layout: horizontal;
padding: 1;
height: auto;
align: center middle;
}}
/* Responsive adjustments for horizontal layout */
.backend-actions Button {{
margin: 0 1;
width: auto;
min-width: 12;
text-overflow: ellipsis;
}}
/* Backend selection checkboxes horizontal layout */
.backend-selection Checkbox {{
margin: 0 2;
width: auto;
text-overflow: ellipsis;
}}
/* Input section responsive improvements */
.input-section {{
margin: 1 0;
padding: 1;
background: {palette.surface_2};
border: solid {palette.border_default};
height: auto;
width: 100%;
}}
.modal-header {{
background: {palette.bg_secondary};
color: {palette.primary};
text-style: bold;
padding: 1;
border-bottom: solid {palette.border_default};
}}
.modal-body {{
padding: 1;
color: {palette.text_primary};
}}
.modal-footer {{
background: {palette.bg_secondary};
padding: 1;
border-top: solid {palette.border_default};
}}
/* ===============================================
SPECIALIZED COMPONENTS - ENHANCED VISIBILITY
=============================================== */
/* Enhanced chart and analytics */
.chart-title {{
text-style: bold;
color: {palette.primary};
margin: 1 0;
}}
.chart-placeholder {{
color: {palette.text_tertiary};
text-style: italic;
text-align: center;
padding: 2;
background: {palette.bg_secondary};
border: dashed {palette.border_default};
}}
/* Enhanced table variants with superior contrast */
.enhanced-table {{
background: {palette.surface_2};
color: {palette.text_primary};
border: solid {palette.border_default};
}}
.enhanced-table:focus {{
border: thick {palette.border_focus};
}}
.enhanced-table-header {{
background: {palette.bg_secondary};
color: {palette.primary};
text-style: bold;
}}
/* Enhanced status and info bars */
.status-bar {{
background: {palette.bg_secondary};
color: {palette.text_secondary};
padding: 0 1;
border: solid {palette.border_default};
}}
.info-bar {{
background: {palette.info};
color: {palette.text_inverse};
padding: 0 1;
}}
/* ===============================================
FORM SECTIONS - ACCESSIBLE INPUT DESIGN
=============================================== */
/* Enhanced input organization */
.input-section {{
margin: 0;
padding: 1;
background: {palette.surface_2};
border: solid {palette.border_default};
height: auto;
}}
.input-label {{
color: {palette.text_primary};
margin: 0 0 1 0;
text-style: bold;
}}
.input-help {{
color: {palette.text_secondary};
text-style: italic;
margin: 0 0 1 0;
}}
.modern-input {{
background: {palette.surface_1};
color: {palette.text_primary};
border: solid {palette.border_default};
margin: 1 0;
}}
.modern-input:focus {{
border: thick {palette.border_focus};
background: {palette.bg_elevated};
}}
/* Enhanced type selection buttons */
.type_buttons {{
margin: 0;
height: auto;
}}
.type-button {{
margin: 0 1;
background: {palette.surface_2};
color: {palette.text_primary};
border: solid {palette.border_default};
}}
.type-button:hover {{
background: {palette.surface_3};
border: solid {palette.border_hover};
}}
.type-button.-selected {{
background: {palette.primary};
color: {palette.text_inverse};
border: solid {palette.primary};
}}
/* ===============================================
UTILITY CLASSES - ENHANCED CONSISTENCY
=============================================== */
/* Enhanced progress sections */
.progress-section {{
margin: 1 0;
padding: 1;
background: {palette.surface_2};
border: solid {palette.border_default};
height: auto;
}}
/* Alignment utilities */
.center {{
text-align: center;
margin: 0;
padding: 0;
}}
.text-left {{
text-align: left;
}}
.text-right {{
text-align: right;
}}
/* Dashboard container spacing optimization */
Container.center {{
margin: 0;
padding: 0;
height: auto;
min-height: 0;
}}
/* Grid spacing optimization */
Grid {{
margin: 0;
padding: 0;
height: auto;
}}
/* Rule spacing optimization */
Rule {{
margin: 0;
padding: 0;
height: 1;
min-height: 1;
max-height: 1;
}}
/* Specific spacing elimination for dashboard */
.main_container Rule {{
margin: 0;
height: 0;
display: none;
}}
.main_container Container {{
margin: 0;
padding: 0;
}}
/* Enhanced state utilities */
.warning {{
color: {palette.warning};
text-style: bold;
}}
.error {{
color: {palette.error};
text-style: bold;
}}
.success {{
color: {palette.success};
text-style: bold;
}}
.info {{
color: {palette.info};
text-style: bold;
}}
/* Enhanced interactive state utilities */
.pressed {{
background: {palette.primary_hover};
color: {palette.text_inverse};
}}
.selected {{
background: {palette.primary};
color: {palette.text_inverse};
border: solid {palette.primary};
}}
.disabled {{
color: {palette.text_tertiary};
background: {palette.bg_secondary};
}}
/* ===============================================
ACCESSIBILITY - WCAG AA COMPLIANCE
=============================================== */
/* Enhanced global focus indicator system */
*:focus {{
outline: solid {palette.border_focus};
}}
/* Improved high contrast mode support */
.high-contrast {{
color: #ffffff;
background: #000000;
}}
.high-contrast Button {{
border: thick #ffffff;
color: #ffffff;
background: #000000;
}}
.high-contrast Button:focus {{
background: #ffffff;
color: #000000;
border: thick #000000;
}}
/* Enhanced reduced motion support */
.reduced-motion .pulse {{
text-style: none;
}}
.reduced-motion .shimmer {{
text-style: none;
}}
/* ===============================================
COMPONENT ENHANCEMENTS - IMPROVED VISIBILITY
=============================================== */
/* Enhanced loading states */
.loading {{
color: {palette.text_secondary};
text-style: italic;
}}
.loading-dots {{
color: {palette.primary};
text-style: blink;
}}
/* Enhanced empty states */
.empty-state {{
color: {palette.text_tertiary};
text-style: italic;
text-align: center;
padding: 4;
}}
.empty-state-icon {{
color: {palette.border_default};
text-align: center;
}}
/* Enhanced search and filter components */
.search-highlight {{
background: {palette.warning};
color: {palette.text_inverse};
text-style: bold;
}}
.filter-active {{
color: {palette.primary};
text-style: bold;
}}
/* Enhanced breadcrumb navigation */
.breadcrumb {{
color: {palette.text_secondary};
}}
.breadcrumb-separator {{
color: {palette.text_tertiary};
}}
.breadcrumb-current {{
color: {palette.text_primary};
text-style: bold;
}}
/* ===============================================
THEME-SPECIFIC CUSTOMIZATIONS
=============================================== */
/* Additional theme-specific styling can be added here */
.theme-indicator {{
color: {palette.primary};
text-style: italic;
}}
.accessibility-notice {{
color: {palette.text_primary};
background: {palette.bg_elevated};
padding: 1;
border: solid {palette.border_default};
}}
"""
# Initialize the theme manager with enhanced dark theme as default
theme_manager = ThemeManager(ThemeType.DARK)
# Generate CSS for the current theme
TUI_CSS = theme_manager.generate_css() # pyright: ignore[reportConstantRedefinition]
# Convenience functions for easy theme switching
def set_theme(theme_type: ThemeType) -> str:
"""Switch to a different theme and return the new CSS."""
theme_manager.set_theme(theme_type)
global TUI_CSS
TUI_CSS = theme_manager.generate_css() # pyright: ignore[reportConstantRedefinition]
return TUI_CSS
def get_available_themes() -> list[ThemeType]:
"""Get list of available themes."""
return list(ThemeType)
def get_current_theme() -> ThemeType:
"""Get the currently active theme."""
return theme_manager.current_theme
def get_theme_palette() -> ColorPalette:
"""Get the color palette for the current theme."""
return theme_manager.get_current_palette()
def get_css_for_theme(theme_type: ThemeType) -> str:
"""Get CSS for a specific theme without changing the current theme."""
current = theme_manager.current_theme
theme_manager.set_theme(theme_type)
css = theme_manager.generate_css()
theme_manager.set_theme(current) # Restore original theme
return css
def apply_theme_to_app(app: TextualApp | AppProtocol, theme_type: ThemeType) -> None:
"""Apply a theme to a Textual app instance."""
try:
# Note: CSS class variable cannot be changed at runtime
# This function would need to be called during app initialization
# or implement a different approach for dynamic theming
_ = set_theme(theme_type) # Keep for future implementation
if hasattr(app, "refresh"):
app.refresh()
except Exception as e:
# Graceful fallback - log but don't crash the UI
import logging
logging.debug(f"Failed to apply theme to app: {e}")
class ThemeSwitcher:
"""Helper class for managing theme switching in TUI applications."""
def __init__(self, app: TextualApp | AppProtocol | None = None) -> None:
self.app: TextualApp | AppProtocol | None = app
self.theme_history: list[ThemeType] = [ThemeType.DARK]
def switch_theme(self, theme_type: ThemeType) -> str:
"""Switch to a new theme and apply it to the app if available."""
css = set_theme(theme_type)
self.theme_history.append(theme_type)
if self.app:
apply_theme_to_app(self.app, theme_type)
return css
def toggle_dark_light(self) -> str:
"""Toggle between dark and light themes."""
current = get_current_theme()
if current in [ThemeType.DARK, ThemeType.GITHUB_DARK, ThemeType.HIGH_CONTRAST]:
return self.switch_theme(ThemeType.LIGHT)
else:
return self.switch_theme(ThemeType.DARK)
def cycle_themes(self) -> str:
"""Cycle through all available themes."""
themes = get_available_themes()
current = get_current_theme()
current_index = themes.index(current)
next_theme = themes[(current_index + 1) % len(themes)]
return self.switch_theme(next_theme)
def get_theme_info(self) -> dict[str, str | list[str] | dict[str, str]]:
"""Get information about the current theme."""
palette = get_theme_palette()
return {
"current_theme": get_current_theme().value,
"available_themes": [t.value for t in get_available_themes()],
"palette": {
"bg_primary": palette.bg_primary,
"text_primary": palette.text_primary,
"primary": palette.primary,
"contrast_info": "WCAG AA compliant colors",
},
}
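# Sketch of intended usage (illustrative; `app` is any running Textual app):
#
#     switcher = ThemeSwitcher(app)
#     css = switcher.switch_theme(ThemeType.HIGH_CONTRAST)  # returns regenerated CSS
#     switcher.toggle_dark_light()   # dark family <-> LIGHT
#     info = switcher.get_theme_info()
#
# Because Textual's class-level CSS cannot be swapped at runtime (see the note
# in apply_theme_to_app above), the returned CSS is mainly useful when
# constructing a new App subclass.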
# Responsive breakpoints for dynamic layout adaptation
RESPONSIVE_BREAKPOINTS = {
"xs": 40, # Extra small terminals
"sm": 60, # Small terminals
"md": 100, # Medium terminals
"lg": 140, # Large terminals
"xl": 180, # Extra large terminals
}
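# Illustrative helper (hypothetical, not used elsewhere in this module): map a
# terminal width in columns to the breakpoint names above.
#
#     def resolve_breakpoint(width: int) -> str:
#         for name, min_cols in sorted(
#             RESPONSIVE_BREAKPOINTS.items(), key=lambda kv: kv[1], reverse=True
#         ):
#             if width >= min_cols:
#                 return name
#         return "xs"
#
#     resolve_breakpoint(120)  # -> "md"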
def get_responsive_css() -> str:
"""Generate responsive CSS with breakpoint-based adaptations."""
return """
/* Responsive Grid System */
.responsive-grid {
layout: grid;
grid-gutter: 1;
padding: 1;
}
.responsive-grid.auto-fit {
grid-columns: repeat(auto-fit, minmax(20, 1fr));
}
.responsive-grid.compact {
grid-gutter: 0;
padding: 0;
}
/* Breakpoint-specific styles */
@media (max-width: 60) {
.responsive-grid {
grid-size: 1;
grid-columns: 1fr;
}
.collapsible-sidebar {
width: 100%;
height: auto;
dock: top;
}
.form-row {
layout: vertical;
}
.form-label {
width: 100%;
text-align: left;
padding-bottom: 1;
}
.form-input {
width: 100%;
}
}
@media (min-width: 61) and (max-width: 100) {
.responsive-grid {
grid-size: 2;
grid-columns: 1fr 1fr;
}
}
@media (min-width: 101) {
.responsive-grid {
grid-size: 3;
grid-columns: 1fr 1fr 1fr;
}
}
/* Enhanced Layout Components */
.split-pane {
layout: horizontal;
height: 100%;
}
.split-pane.vertical {
layout: vertical;
}
.split-pane .pane {
background: $surface;
border: solid $border;
}
.split-pane .splitter {
width: 1;
background: $border;
cursor: col-resize;
}
.split-pane.vertical .splitter {
height: 1;
width: 100%;
cursor: row-resize;
}
/* Card Layout System */
.card-layout {
layout: grid;
grid-gutter: 2;
padding: 2;
}
.card {
background: $surface;
border: solid $border;
border-radius: 1;
padding: 2;
height: auto;
min-height: 10;
transition: border 200ms, background 200ms;
}
.card:hover {
border: solid $accent;
background: $surface-lighten-1;
}
.card:focus {
border: solid $primary;
box-shadow: 0 0 0 1 $primary-lighten-1;
}
.card-header {
dock: top;
height: 3;
background: $primary-lighten-1;
color: $text;
padding: 1;
margin: -2 -2 1 -2;
border-radius: 1 1 0 0;
}
.card-content {
height: 1fr;
overflow: auto;
}
.card-footer {
dock: bottom;
height: 3;
background: $surface-darken-1;
padding: 1;
margin: 1 -2 -2 -2;
border-radius: 0 0 1 1;
}
/* Collapsible Sidebar */
.collapsible-sidebar {
dock: left;
width: 25%;
min-width: 20;
max-width: 40;
background: $surface;
border-right: solid $border;
padding: 1;
transition: width 300ms ease-in-out;
}
.collapsible-sidebar.collapsed {
width: 3;
min-width: 3;
overflow: hidden;
}
.collapsible-sidebar.collapsed > * {
display: none;
}
.collapsible-sidebar .sidebar-toggle {
dock: top;
height: 1;
background: $primary;
color: $text;
text-align: center;
margin-bottom: 1;
cursor: pointer;
}
.collapsible-sidebar .sidebar-content {
height: 1fr;
overflow-y: auto;
}
/* Tabular Layout */
.tabular-layout {
layout: horizontal;
height: 100%;
}
.tabular-layout .main-content {
width: 1fr;
height: 100%;
layout: vertical;
}
.tabular-layout .table-container {
height: 1fr;
overflow: auto;
border: solid $border;
background: $surface;
}
.tabular-layout .table-header {
dock: top;
height: 3;
background: $primary;
color: $text;
padding: 1;
}
.tabular-layout .table-footer {
dock: bottom;
height: 3;
background: $surface-lighten-1;
padding: 1;
border-top: solid $border;
}
/* Form Styling Enhancements */
.form-container {
background: $surface;
border: solid $border;
padding: 2;
border-radius: 1;
}
.form-title {
color: $primary;
text-style: bold;
margin-bottom: 2;
text-align: center;
}
.form-section {
margin-bottom: 2;
padding: 1;
border: solid $border-lighten-1;
background: $surface-lighten-1;
border-radius: 1;
}
.section-title {
color: $primary;
text-style: bold;
margin-bottom: 1;
}
.form-row {
layout: horizontal;
align-items: center;
height: auto;
margin-bottom: 1;
}
.form-label {
width: 30%;
min-width: 15;
text-align: right;
padding-right: 2;
color: $text-secondary;
}
.form-input {
width: 70%;
}
.form-actions {
layout: horizontal;
align: center;
margin-top: 2;
padding-top: 2;
border-top: solid $border;
}
.form-actions Button {
margin: 0 1;
min-width: 10;
}
/* Button Enhancements */
Button {
transition: background 200ms, color 200ms;
}
Button:hover {
background: $primary-hover;
}
Button:focus {
border: solid $primary;
box-shadow: 0 0 0 1 $primary-lighten-1;
}
.button-group {
layout: horizontal;
align: center;
}
.button-group Button {
margin-right: 1;
}
.button-group Button:last-child {
margin-right: 0;
}
/* Data Table Enhancements */
DataTable {
border: solid $border;
background: $surface;
}
DataTable .datatable--header {
background: $primary;
color: $text;
text-style: bold;
}
DataTable .datatable--odd-row {
background: $surface-lighten-1;
}
DataTable .datatable--even-row {
background: $surface;
}
DataTable .datatable--cursor {
background: $primary-lighten-2;
color: $text;
}
/* Loading and Progress Indicators */
LoadingIndicator {
color: $primary;
background: transparent;
}
ProgressBar {
border: solid $border;
background: $surface-darken-1;
}
ProgressBar .bar--bar {
color: $primary;
}
ProgressBar .bar--percentage {
color: $text;
text-style: bold;
}
/* Modal and Dialog Styling */
.modal-container {
background: $surface;
border: thick $accent;
border-radius: 1;
padding: 2;
box-shadow: 0 4 8 0 rgba(0, 0, 0, 0.3);
}
.dialog-container {
background: $surface;
border: solid $border;
border-radius: 1;
padding: 2;
min-width: 40;
max-width: 80;
}
/* Animation Classes */
.fade-in {
opacity: 0;
transition: opacity 300ms ease-in;
}
.fade-in.visible {
opacity: 1;
}
.slide-in-left {
transform: translateX(-100%);
transition: transform 300ms ease-in-out;
}
.slide-in-left.visible {
transform: translateX(0);
}
.slide-in-right {
transform: translateX(100%);
transition: transform 300ms ease-in-out;
}
.slide-in-right.visible {
transform: translateX(0);
}
/* Accessibility Enhancements */
.screen-reader-only {
position: absolute;
width: 1px;
height: 1px;
padding: 0;
margin: -1px;
overflow: hidden;
clip: rect(0, 0, 0, 0);
border: 0;
}
.focus-visible {
outline: 2px solid $primary;
outline-offset: 2px;
}
/* Print Styles (for export functionality) */
@media print {
* {
background: white !important;
color: black !important;
}
.no-print {
display: none !important;
}
}
/* High Contrast Mode Support */
@media (prefers-contrast: high) {
* {
border-color: currentColor;
}
Button {
border: 2px solid currentColor;
}
Input, Select, TextArea {
border: 2px solid currentColor;
}
}
/* Dark Mode Detection */
@media (prefers-color-scheme: dark) {
:root {
--primary-color: #1f6feb;
--background-color: #0a0c10;
--text-color: #ffffff;
}
}
/* Light Mode Detection */
@media (prefers-color-scheme: light) {
:root {
--primary-color: #0969da;
--background-color: #ffffff;
--text-color: #1f2328;
}
}
"""
def get_css_custom_properties() -> str:
"""Generate CSS custom properties for dynamic theming."""
palette = get_theme_palette()
return f"""
:root {{
/* Color Palette */
--bg-primary: {palette.bg_primary};
--bg-secondary: {palette.bg_secondary};
--bg-tertiary: {palette.bg_tertiary};
--bg-elevated: {palette.bg_elevated};
--text-primary: {palette.text_primary};
--text-secondary: {palette.text_secondary};
--text-tertiary: {palette.text_tertiary};
--text-inverse: {palette.text_inverse};
--primary: {palette.primary};
--primary-hover: {palette.primary_hover};
--success: {palette.success};
--warning: {palette.warning};
--error: {palette.error};
--info: {palette.info};
--border-default: {palette.border_default};
--border-focus: {palette.border_focus};
--border-hover: {palette.border_hover};
--surface-1: {palette.surface_1};
--surface-2: {palette.surface_2};
--surface-3: {palette.surface_3};
/* Spacing Scale */
--space-xs: 0.25rem;
--space-sm: 0.5rem;
--space-md: 1rem;
--space-lg: 1.5rem;
--space-xl: 2rem;
/* Typography Scale */
--text-xs: 0.75rem;
--text-sm: 0.875rem;
--text-base: 1rem;
--text-lg: 1.125rem;
--text-xl: 1.25rem;
/* Border Radius */
--radius-sm: 0.25rem;
--radius-md: 0.5rem;
--radius-lg: 1rem;
/* Shadows */
--shadow-sm: 0 1px 2px rgba(0, 0, 0, 0.1);
--shadow-md: 0 4px 6px rgba(0, 0, 0, 0.1);
--shadow-lg: 0 10px 15px rgba(0, 0, 0, 0.1);
/* Transitions */
--transition-fast: 150ms ease-in-out;
--transition-normal: 250ms ease-in-out;
--transition-slow: 350ms ease-in-out;
}}
"""
def get_enhanced_dark_theme_css() -> str:
"""Generate CSS for the enhanced dark theme."""
theme_manager = ThemeManager(default_theme=ThemeType.DARK)
return theme_manager.generate_css()
def apply_responsive_theme() -> str:
"""Apply complete responsive theme with custom properties."""
base_css = get_enhanced_dark_theme_css()
responsive_css = get_responsive_css()
custom_properties = get_css_custom_properties()
return f"{custom_properties}\n{base_css}\n{responsive_css}"
</file>
<file path="ingest_pipeline/flows/ingestion.py">
"""Prefect flow for ingestion pipeline."""
from __future__ import annotations
import hashlib
from collections.abc import Callable
from datetime import UTC, datetime
from typing import TYPE_CHECKING, Literal, TypeAlias, assert_never, cast
from prefect import flow, get_run_logger, task
from prefect.blocks.core import Block
from prefect.variables import Variable
from pydantic import SecretStr
from ..config.settings import Settings
from ..core.exceptions import IngestionError
from ..core.models import (
Document,
FirecrawlConfig,
IngestionJob,
IngestionResult,
IngestionSource,
IngestionStatus,
RepomixConfig,
StorageBackend,
StorageConfig,
)
from ..ingestors import BaseIngestor, FirecrawlIngestor, FirecrawlPage, RepomixIngestor
from ..storage import OpenWebUIStorage, WeaviateStorage
from ..storage import R2RStorage as RuntimeR2RStorage
from ..storage.base import BaseStorage
from ..utils.metadata_tagger import MetadataTagger
SourceTypeLiteral = Literal["web", "repository", "documentation"]
StorageBackendLiteral = Literal["weaviate", "open_webui", "r2r"]
SourceTypeLike: TypeAlias = IngestionSource | SourceTypeLiteral
StorageBackendLike: TypeAlias = StorageBackend | StorageBackendLiteral
def _safe_cache_key(prefix: str, params: dict[str, object], key: str) -> str:
    """Create a type-safe cache key from task parameters."""
    value = params.get(key, "")
    # Use a real digest rather than built-in hash(), whose per-process salting
    # would make keys differ between runs and defeat Prefect's persisted cache.
    digest = hashlib.sha256(str(value).encode("utf-8")).hexdigest()[:16]
    return f"{prefix}_{digest}"
if TYPE_CHECKING:
from ..storage.r2r.storage import R2RStorage as R2RStorageType
else:
R2RStorageType = BaseStorage
@task(name="validate_source", retries=2, retry_delay_seconds=10, tags=["validation"])
async def validate_source_task(source_url: str, source_type: IngestionSource) -> bool:
"""
Validate that a source is accessible.
Args:
source_url: URL or path to source
source_type: Type of source
Returns:
True if valid
"""
if source_type == IngestionSource.WEB:
ingestor = FirecrawlIngestor()
elif source_type == IngestionSource.REPOSITORY:
ingestor = RepomixIngestor()
else:
raise ValueError(f"Unsupported source type: {source_type}")
result = await ingestor.validate_source(source_url)
return bool(result)
@task(name="initialize_storage", retries=3, retry_delay_seconds=5, tags=["storage"])
async def initialize_storage_task(config: StorageConfig | str) -> BaseStorage:
"""
Initialize storage backend.
Args:
config: Storage configuration block or block name
Returns:
Initialized storage adapter
"""
# Load block if string provided
if isinstance(config, str):
# Use Block.aload with type slug for better type inference
loaded_block = await Block.aload(f"storage-config/{config}")
config = cast(StorageConfig, loaded_block)
if config.backend == StorageBackend.WEAVIATE:
storage = WeaviateStorage(config)
elif config.backend == StorageBackend.OPEN_WEBUI:
storage = OpenWebUIStorage(config)
elif config.backend == StorageBackend.R2R:
if RuntimeR2RStorage is None:
raise ValueError("R2R storage not available. Check dependencies.")
storage = RuntimeR2RStorage(config)
else:
raise ValueError(f"Unsupported backend: {config.backend}")
await storage.initialize()
return storage
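# Sketch of calling the task directly (illustrative values; normally invoked
# from within a flow):
#
#     config = StorageConfig(
#         backend=StorageBackend.WEAVIATE,
#         endpoint="http://localhost:8080",  # hypothetical endpoint
#         collection_name="docs_web",
#     )
#     storage = await initialize_storage_task(config)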
@task(
name="map_firecrawl_site",
retries=2,
retry_delay_seconds=15,
tags=["firecrawl", "map"],
cache_key_fn=lambda ctx, p: _safe_cache_key("firecrawl_map", p, "source_url"),
)
async def map_firecrawl_site_task(source_url: str, config: FirecrawlConfig | str) -> list[str]:
"""Map a site using Firecrawl and return discovered URLs."""
# Load block if string provided
if isinstance(config, str):
# Use Block.aload with type slug for better type inference
loaded_block = await Block.aload(f"firecrawl-config/{config}")
config = cast(FirecrawlConfig, loaded_block)
ingestor = FirecrawlIngestor(config)
mapped = await ingestor.map_site(source_url)
return mapped or [source_url]
@task(
name="filter_existing_documents",
retries=1,
retry_delay_seconds=5,
tags=["dedup"],
cache_key_fn=lambda ctx, p: _safe_cache_key("filter_docs", p, "urls"),
) # Cache based on URL list
async def filter_existing_documents_task(
urls: list[str],
storage_client: BaseStorage,
stale_after_days: int = 30,
*,
collection_name: str | None = None,
) -> list[str]:
"""Filter URLs to only those that need scraping (missing or stale in storage)."""
import asyncio
logger = get_run_logger()
# Use semaphore to limit concurrent existence checks
semaphore = asyncio.Semaphore(20)
async def check_url_exists(url: str) -> tuple[str, bool]:
async with semaphore:
try:
document_id = str(FirecrawlIngestor.compute_document_id(url))
exists = await storage_client.check_exists(
document_id, collection_name=collection_name, stale_after_days=stale_after_days
)
return url, exists
except Exception as e:
logger.warning("Error checking existence for URL %s: %s", url, e)
# Assume doesn't exist on error to ensure we scrape it
return url, False
# Check all URLs in parallel - use return_exceptions=True for partial failure handling
results = await asyncio.gather(*[check_url_exists(url) for url in urls], return_exceptions=True)
# Collect URLs that need scraping, handling any exceptions
eligible = []
for result in results:
if isinstance(result, Exception):
logger.error("Unexpected error in parallel existence check: %s", result)
continue
# Type narrowing: result is now known to be tuple[str, bool]
if isinstance(result, tuple) and len(result) == 2:
url, exists = result
if not exists:
eligible.append(url)
skipped = len(urls) - len(eligible)
if skipped > 0:
logger.info("Skipping %s up-to-date documents in %s", skipped, storage_client.display_name)
return eligible
@task(
name="scrape_firecrawl_batch", retries=2, retry_delay_seconds=20, tags=["firecrawl", "scrape"]
)
async def scrape_firecrawl_batch_task(
batch_urls: list[str], config: FirecrawlConfig
) -> list[FirecrawlPage]:
"""Scrape a batch of URLs via Firecrawl."""
ingestor = FirecrawlIngestor(config)
result: list[FirecrawlPage] = await ingestor.scrape_pages(batch_urls)
return result
@task(name="annotate_firecrawl_metadata", retries=1, retry_delay_seconds=10, tags=["metadata"])
async def annotate_firecrawl_metadata_task(
pages: list[FirecrawlPage], job: IngestionJob
) -> list[Document]:
"""Annotate scraped pages with standardized metadata."""
if not pages:
return []
ingestor = FirecrawlIngestor()
documents = [ingestor.create_document(page, job) for page in pages]
try:
from ..config import get_settings
settings = get_settings()
async with MetadataTagger(llm_endpoint=str(settings.llm_endpoint)) as tagger:
tagged_documents: list[Document] = await tagger.tag_batch(documents)
return tagged_documents
except IngestionError as exc: # pragma: no cover - logging side effect
logger = get_run_logger()
logger.warning("Metadata tagging failed: %s", exc)
return documents
except Exception as exc: # pragma: no cover - defensive
logger = get_run_logger()
logger.warning("Metadata tagging unavailable, using base metadata: %s", exc)
return documents
@task(name="upsert_r2r_documents", retries=2, retry_delay_seconds=20, tags=["storage", "r2r"])
async def upsert_r2r_documents_task(
storage_client: R2RStorageType,
documents: list[Document],
collection_name: str | None,
) -> tuple[int, int]:
"""Upsert documents into R2R storage."""
if not documents:
return 0, 0
stored_ids: list[str] = await storage_client.store_batch(
documents, collection_name=collection_name
)
processed = len(stored_ids)
failed = len(documents) - processed
if failed:
logger = get_run_logger()
logger.warning("Failed to upsert %s documents to R2R", failed)
return processed, failed
@task(name="ingest_documents", retries=2, retry_delay_seconds=30, tags=["ingestion"])
async def ingest_documents_task(
job: IngestionJob,
collection_name: str | None = None,
batch_size: int | None = None,
storage_client: BaseStorage | None = None,
storage_block_name: str | None = None,
ingestor_config_block_name: str | None = None,
progress_callback: Callable[[int, str], None] | None = None,
) -> tuple[int, int]:
"""
Ingest documents from source with optional pre-initialized storage client.
Args:
job: Ingestion job configuration
collection_name: Target collection name
batch_size: Number of documents per batch (uses Variable if None)
storage_client: Optional pre-initialized storage client
storage_block_name: Optional storage block name to load
ingestor_config_block_name: Optional ingestor config block name to load
progress_callback: Optional callback for progress updates
Returns:
Tuple of (processed_count, failed_count)
"""
if progress_callback:
progress_callback(35, "Creating ingestor and storage clients...")
# Use Variable for batch size if not provided
if batch_size is None:
try:
batch_size_var = await Variable.aget("default_batch_size", default="50")
# Convert Variable result to int, handling various types
if isinstance(batch_size_var, int):
batch_size = batch_size_var
elif isinstance(batch_size_var, (str, float)):
batch_size = int(float(str(batch_size_var)))
else:
batch_size = 50
except Exception:
batch_size = 50
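    # Coercion examples for the Variable above (illustrative): 50 -> 50,
    # "50" -> int(float("50")) == 50, 37.5 -> 37; any other type falls
    # back to the default of 50.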
ingestor = await _create_ingestor(job, ingestor_config_block_name)
storage = storage_client or await _create_storage(job, collection_name, storage_block_name)
if progress_callback:
progress_callback(40, "Starting document processing...")
return await _process_documents(
ingestor, storage, job, batch_size, collection_name, progress_callback
)
async def _create_ingestor(job: IngestionJob, config_block_name: str | None = None) -> BaseIngestor:
"""Create appropriate ingestor based on job source type."""
if job.source_type == IngestionSource.WEB:
if config_block_name:
# Use Block.aload with type slug for better type inference
loaded_block = await Block.aload(f"firecrawl-config/{config_block_name}")
config = cast(FirecrawlConfig, loaded_block)
else:
# Fallback to default configuration
config = FirecrawlConfig()
return FirecrawlIngestor(config)
elif job.source_type == IngestionSource.REPOSITORY:
if config_block_name:
# Use Block.aload with type slug for better type inference
loaded_block = await Block.aload(f"repomix-config/{config_block_name}")
config = cast(RepomixConfig, loaded_block)
else:
# Fallback to default configuration
config = RepomixConfig()
return RepomixIngestor(config)
else:
raise ValueError(f"Unsupported source: {job.source_type}")
async def _create_storage(
job: IngestionJob, collection_name: str | None, storage_block_name: str | None = None
) -> BaseStorage:
"""Create and initialize storage client."""
if collection_name is None:
# Use variable for default collection prefix
prefix = await Variable.aget("default_collection_prefix", default="docs")
collection_name = f"{prefix}_{job.source_type.value}"
if storage_block_name:
# Load storage config from block
loaded_block = await Block.aload(f"storage-config/{storage_block_name}")
storage_config = cast(StorageConfig, loaded_block)
# Override collection name if provided
storage_config.collection_name = collection_name
else:
# Fallback to building config from settings
from ..config import get_settings
settings = get_settings()
storage_config = _build_storage_config(job, settings, collection_name)
storage = _instantiate_storage(job.storage_backend, storage_config)
await storage.initialize()
return storage
def _build_storage_config(
job: IngestionJob, settings: Settings, collection_name: str
) -> StorageConfig:
"""Build storage configuration from job and settings."""
storage_endpoints = {
StorageBackend.WEAVIATE: settings.weaviate_endpoint,
StorageBackend.OPEN_WEBUI: settings.openwebui_endpoint,
StorageBackend.R2R: settings.get_storage_endpoint("r2r"),
}
storage_api_keys: dict[StorageBackend, str | None] = {
StorageBackend.WEAVIATE: settings.get_api_key("weaviate"),
StorageBackend.OPEN_WEBUI: settings.get_api_key("openwebui"),
StorageBackend.R2R: None, # R2R is self-hosted, no API key needed
}
api_key_raw: str | None = storage_api_keys[job.storage_backend]
api_key: SecretStr | None = SecretStr(api_key_raw) if api_key_raw is not None else None
return StorageConfig(
backend=job.storage_backend,
endpoint=storage_endpoints[job.storage_backend],
api_key=api_key,
collection_name=collection_name,
)
def _instantiate_storage(backend: StorageBackend, config: StorageConfig) -> BaseStorage:
"""Instantiate storage based on backend type."""
if backend == StorageBackend.WEAVIATE:
return WeaviateStorage(config)
elif backend == StorageBackend.OPEN_WEBUI:
return OpenWebUIStorage(config)
elif backend == StorageBackend.R2R:
if RuntimeR2RStorage is None:
raise ValueError("R2R storage not available. Check dependencies.")
return RuntimeR2RStorage(config)
assert_never(backend)
def _chunk_urls(urls: list[str], chunk_size: int) -> list[list[str]]:
"""Group URLs into fixed-size chunks for batch processing."""
if chunk_size <= 0:
raise ValueError("chunk_size must be greater than zero")
return [urls[i : i + chunk_size] for i in range(0, len(urls), chunk_size)]
def _deduplicate_urls(urls: list[str]) -> list[str]:
"""Return the URLs with order preserved and duplicates removed."""
seen: set[str] = set()
unique: list[str] = []
for url in urls:
if url not in seen:
seen.add(url)
unique.append(url)
return unique
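# Doctest-style examples for the two helpers above (illustrative):
#
#     _chunk_urls(["a", "b", "c", "d", "e"], 2)
#     # -> [["a", "b"], ["c", "d"], ["e"]]
#     _deduplicate_urls(["a", "b", "a", "c", "b"])
#     # -> ["a", "b", "c"]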
async def _process_documents(
ingestor: BaseIngestor,
storage: BaseStorage,
job: IngestionJob,
batch_size: int,
collection_name: str | None,
progress_callback: Callable[[int, str], None] | None = None,
) -> tuple[int, int]:
"""Process documents in batches."""
processed = 0
failed = 0
batch: list[Document] = []
total_documents = 0
batch_count = 0
if progress_callback:
progress_callback(45, "Ingesting documents from source...")
# Use smart ingestion with deduplication if storage supports it
if hasattr(storage, "check_exists"):
try:
# Try to use the smart ingestion method
document_generator = ingestor.ingest_with_dedup(
job, storage, collection_name=collection_name
)
except Exception:
# Fall back to regular ingestion if smart method fails
document_generator = ingestor.ingest(job)
else:
document_generator = ingestor.ingest(job)
async for document in document_generator:
batch.append(document)
total_documents += 1
if len(batch) >= batch_size:
batch_count += 1
if progress_callback:
progress_callback(
45 + min(35, (batch_count * 10)),
f"Processing batch {batch_count} ({total_documents} documents so far)...",
)
batch_processed, batch_failed = await _store_batch(storage, batch, collection_name)
processed += batch_processed
failed += batch_failed
batch = []
# Process remaining batch
if batch:
batch_count += 1
if progress_callback:
progress_callback(80, f"Processing final batch ({total_documents} total documents)...")
batch_processed, batch_failed = await _store_batch(storage, batch, collection_name)
processed += batch_processed
failed += batch_failed
if progress_callback:
progress_callback(85, f"Completed processing {total_documents} documents")
return processed, failed
async def _store_batch(
storage: BaseStorage,
batch: list[Document],
collection_name: str | None,
) -> tuple[int, int]:
"""Store a batch of documents and return processed/failed counts."""
try:
# Apply metadata tagging for backends that benefit from it
processed_batch = batch
if hasattr(storage, "config") and storage.config.backend in (
StorageBackend.R2R,
StorageBackend.WEAVIATE,
):
try:
from ..config import get_settings
settings = get_settings()
async with MetadataTagger(llm_endpoint=str(settings.llm_endpoint)) as tagger:
processed_batch = await tagger.tag_batch(batch)
except Exception as exc:
print(f"Metadata tagging failed, using original documents: {exc}")
processed_batch = batch
stored_ids = await storage.store_batch(processed_batch, collection_name=collection_name)
processed_count = len(stored_ids)
failed_count = len(processed_batch) - processed_count
        # Assume the standard batch size is 50; a smaller batch is the trailing one
        batch_label = "final batch" if len(processed_batch) < 50 else "batch"
        print(f"Successfully stored {processed_count} documents in {batch_label}")
return processed_count, failed_count
except Exception as e:
batch_type = "Final" if len(batch) < 50 else "Batch"
print(f"{batch_type} storage failed: {e}")
return 0, len(batch)
@flow(
name="firecrawl_to_r2r",
description="Ingest Firecrawl pages into R2R with metadata annotation",
persist_result=False,
log_prints=True,
)
async def firecrawl_to_r2r_flow(
job: IngestionJob,
collection_name: str | None = None,
progress_callback: Callable[[int, str], None] | None = None,
) -> tuple[int, int]:
"""Specialized flow for Firecrawl ingestion into R2R."""
logger = get_run_logger()
from ..config import get_settings
if progress_callback:
progress_callback(35, "Initializing Firecrawl and R2R storage...")
settings = get_settings()
firecrawl_config = FirecrawlConfig()
resolved_collection = collection_name or f"docs_{job.source_type.value}"
storage_config = _build_storage_config(job, settings, resolved_collection)
storage_client = await initialize_storage_task(storage_config)
if RuntimeR2RStorage is None or not isinstance(storage_client, RuntimeR2RStorage):
raise IngestionError("Firecrawl to R2R flow requires an R2R storage backend")
r2r_storage = cast("R2RStorageType", storage_client)
if progress_callback:
progress_callback(45, "Checking for existing content before mapping...")
# Smart mapping: try single URL first to avoid expensive map operation
base_url = str(job.source_url)
single_url_id = str(FirecrawlIngestor.compute_document_id(base_url))
base_exists = await r2r_storage.check_exists(
single_url_id, collection_name=resolved_collection, stale_after_days=30
)
if base_exists:
# Check if this is a recent single-page update
logger.info("Base URL %s exists and is fresh, skipping expensive mapping", base_url)
if progress_callback:
progress_callback(100, "Content is up to date, no processing needed")
return 0, 0
if progress_callback:
progress_callback(50, "Discovering pages with Firecrawl...")
discovered_urls = await map_firecrawl_site_task(base_url, firecrawl_config)
unique_urls = _deduplicate_urls(discovered_urls)
logger.info("Discovered %s unique URLs from Firecrawl map", len(unique_urls))
if progress_callback:
progress_callback(60, f"Found {len(unique_urls)} pages, filtering existing content...")
eligible_urls = await filter_existing_documents_task(
unique_urls, r2r_storage, collection_name=resolved_collection
)
if not eligible_urls:
logger.info("All Firecrawl pages are up to date for %s", job.source_url)
if progress_callback:
progress_callback(100, "All pages are up to date, no processing needed")
return 0, 0
if progress_callback:
progress_callback(70, f"Scraping {len(eligible_urls)} new/updated pages...")
batch_size = min(settings.default_batch_size, firecrawl_config.limit)
url_batches = _chunk_urls(eligible_urls, batch_size)
logger.info("Scraping %s batches of Firecrawl pages", len(url_batches))
# Use asyncio.gather for concurrent scraping
import asyncio
scrape_tasks = [scrape_firecrawl_batch_task(batch, firecrawl_config) for batch in url_batches]
batch_results = await asyncio.gather(*scrape_tasks)
scraped_pages: list[FirecrawlPage] = []
for batch_pages in batch_results:
scraped_pages.extend(batch_pages)
if progress_callback:
progress_callback(80, f"Processing {len(scraped_pages)} scraped pages...")
documents = await annotate_firecrawl_metadata_task(scraped_pages, job)
if not documents:
logger.warning("No documents produced after scraping for %s", job.source_url)
return 0, len(eligible_urls)
if progress_callback:
progress_callback(90, f"Storing {len(documents)} documents in R2R...")
processed, failed = await upsert_r2r_documents_task(r2r_storage, documents, resolved_collection)
logger.info("Upserted %s documents into R2R (%s failed)", processed, failed)
return processed, failed
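# Hedged usage sketch for the flow above; the URL and collection name are
# hypothetical, and a reachable R2R backend is required for a real run.
async def _example_firecrawl_to_r2r() -> tuple[int, int]:
    job = IngestionJob(
        source_url="https://docs.example.com",
        source_type=IngestionSource.WEB,
        storage_backend=StorageBackend.R2R,
        status=IngestionStatus.PENDING,
    )
    processed, failed = await firecrawl_to_r2r_flow(job, collection_name="docs_web")
    print(f"processed={processed} failed={failed}")
    return processed, failed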
@task(name="update_job_status", tags=["tracking"])
async def update_job_status_task(
job: IngestionJob,
status: IngestionStatus,
processed: int = 0,
_failed: int = 0,
error: str | None = None,
) -> IngestionJob:
"""
Update job status.
Args:
job: Ingestion job
status: New status
processed: Documents processed
_failed: Documents failed (currently unused)
error: Error message if any
Returns:
Updated job
"""
job.status = status
job.updated_at = datetime.now(UTC)
job.document_count = processed
if status == IngestionStatus.COMPLETED:
job.completed_at = datetime.now(UTC)
if error:
job.error_message = error
return job
@flow(
name="ingestion_pipeline",
description="Main ingestion pipeline for documents",
retries=1,
retry_delay_seconds=60,
persist_result=True,
log_prints=True,
)
async def create_ingestion_flow(
source_url: str,
source_type: SourceTypeLike,
storage_backend: StorageBackendLike = StorageBackend.WEAVIATE,
collection_name: str | None = None,
validate_first: bool = True,
progress_callback: Callable[[int, str], None] | None = None,
) -> IngestionResult:
"""
Main ingestion flow.
Args:
source_url: URL or path to source
source_type: Type of source
        storage_backend: Storage backend to use
        collection_name: Optional name of the target collection
validate_first: Whether to validate source first
progress_callback: Optional callback for progress updates
Returns:
Ingestion result
"""
print(f"Starting ingestion from {source_url}")
source_enum = IngestionSource(source_type)
backend_enum = StorageBackend(storage_backend)
# Create job
job = IngestionJob(
source_url=source_url,
source_type=source_enum,
storage_backend=backend_enum,
status=IngestionStatus.PENDING,
)
start_time = datetime.now(UTC)
error_messages: list[str] = []
processed = 0
failed = 0
try:
# Validate source if requested
if validate_first:
if progress_callback:
progress_callback(10, "Validating source...")
print("Validating source...")
is_valid = await validate_source_task(source_url, job.source_type)
if not is_valid:
raise IngestionError(f"Source validation failed: {source_url}")
# Update status to in progress
if progress_callback:
progress_callback(20, "Initializing storage...")
job = await update_job_status_task(job, IngestionStatus.IN_PROGRESS)
# Run ingestion
if progress_callback:
progress_callback(30, "Starting document ingestion...")
print("Ingesting documents...")
if job.source_type == IngestionSource.WEB and job.storage_backend == StorageBackend.R2R:
processed, failed = await firecrawl_to_r2r_flow(
job, collection_name, progress_callback=progress_callback
)
else:
processed, failed = await ingest_documents_task(
job, collection_name, progress_callback=progress_callback
)
if progress_callback:
progress_callback(90, "Finalizing ingestion...")
# Update final status
if failed > 0:
error_messages.append(f"{failed} documents failed to process")
# Set status based on results
if processed == 0 and failed > 0:
final_status = IngestionStatus.FAILED
elif failed > 0:
final_status = IngestionStatus.PARTIAL
else:
final_status = IngestionStatus.COMPLETED
job = await update_job_status_task(job, final_status, processed=processed, _failed=failed)
print(f"Ingestion completed: {processed} processed, {failed} failed")
except Exception as e:
print(f"Ingestion failed: {e}")
error_messages.append(str(e))
# Don't reset counts - keep whatever was processed before the error
job = await update_job_status_task(
job, IngestionStatus.FAILED, processed=processed, _failed=failed, error=str(e)
)
# Calculate duration
duration = (datetime.now(UTC) - start_time).total_seconds()
return IngestionResult(
job_id=job.id,
status=job.status,
documents_processed=processed,
documents_failed=failed,
duration_seconds=duration,
error_messages=error_messages,
)
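# Hedged usage sketch (not executed on import): run the main flow against a
# hypothetical documentation site and inspect the result counts.
async def _example_run_ingestion() -> IngestionResult:
    result = await create_ingestion_flow(
        source_url="https://docs.example.com",
        source_type=IngestionSource.WEB,
        storage_backend=StorageBackend.WEAVIATE,
        collection_name="docs_example",
    )
    print(result.status, result.documents_processed, result.documents_failed)
    return result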
</file>
<file path="ingest_pipeline/flows/scheduler.py">
"""Scheduler for Prefect deployments."""
from datetime import timedelta
from typing import Literal, Protocol, cast
from prefect.deployments.runner import RunnerDeployment
from prefect.flows import serve as prefect_serve
from prefect.schedules import Cron, Interval
from prefect.variables import Variable
from ..core.models import IngestionSource, StorageBackend
from .ingestion import SourceTypeLike, StorageBackendLike, create_ingestion_flow
class FlowWithDeployment(Protocol):
"""Protocol for flows that have deployment methods."""
def to_deployment(
self,
name: str,
**kwargs: object,
) -> RunnerDeployment:
"""Create a deployment from this flow."""
...
def create_scheduled_deployment(
name: str,
source_url: str,
source_type: SourceTypeLike,
storage_backend: StorageBackendLike = StorageBackend.WEAVIATE,
schedule_type: Literal["cron", "interval"] = "interval",
cron_expression: str | None = None,
interval_minutes: int | None = None,
tags: list[str] | None = None,
storage_block_name: str | None = None,
ingestor_config_block_name: str | None = None,
) -> RunnerDeployment:
"""
Create a scheduled deployment for ingestion with block support.
Args:
name: Deployment name
source_url: Source to ingest from
source_type: Type of source
storage_backend: Storage backend
schedule_type: Type of schedule
cron_expression: Cron expression if using cron
interval_minutes: Interval in minutes (uses Variable if None)
tags: Optional tags for deployment
storage_block_name: Optional storage block name
ingestor_config_block_name: Optional ingestor config block name
Returns:
Deployment configuration
"""
# Use Variable for interval if not provided
if interval_minutes is None:
try:
interval_var = Variable.get("default_schedule_interval", default="60")
# Convert Variable result to int, handling various types
if isinstance(interval_var, int):
interval_minutes = interval_var
elif isinstance(interval_var, (str, float)):
interval_minutes = int(float(str(interval_var)))
else:
interval_minutes = 60
except Exception:
interval_minutes = 60
# Create schedule
if schedule_type == "cron" and cron_expression:
schedule = Cron(cron_expression, timezone="UTC")
else:
schedule = Interval(timedelta(minutes=interval_minutes), timezone="UTC")
# Default tags
source_enum = IngestionSource(source_type)
backend_enum = StorageBackend(storage_backend)
if tags is None:
tags = [source_enum.value, backend_enum.value]
# Create deployment parameters with block support
parameters: dict[str, str | bool] = {
"source_url": source_url,
"source_type": source_enum.value,
"storage_backend": backend_enum.value,
"validate_first": True,
}
# Add block names if provided
if storage_block_name:
parameters["storage_block_name"] = storage_block_name
if ingestor_config_block_name:
parameters["ingestor_config_block_name"] = ingestor_config_block_name
# Create deployment
# The flow decorator adds the to_deployment method at runtime
flow_with_deployment = cast(FlowWithDeployment, create_ingestion_flow)
return flow_with_deployment.to_deployment(
name=name,
schedule=schedule,
parameters=parameters,
tags=tags,
description=f"Scheduled ingestion from {source_url}",
)
def serve_deployments(deployments: list[RunnerDeployment]) -> None:
"""
Serve multiple deployments.
Args:
deployments: List of deployment configurations
"""
prefect_serve(*deployments, limit=10)
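# Hedged usage sketch: build one interval-scheduled deployment and serve it.
# The deployment name, URL, and 30-minute interval are hypothetical.
if __name__ == "__main__":  # pragma: no cover - illustrative only
    example_deployment = create_scheduled_deployment(
        name="example-docs-sync",
        source_url="https://docs.example.com",
        source_type=IngestionSource.WEB,
        storage_backend=StorageBackend.WEAVIATE,
        schedule_type="interval",
        interval_minutes=30,
    )
    serve_deployments([example_deployment])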
</file>
<file path="ingest_pipeline/ingestors/firecrawl.py">
"""Firecrawl ingestor for web and documentation sites."""
import asyncio
import logging
import re
from collections.abc import AsyncGenerator, Awaitable, Callable
from dataclasses import dataclass
from datetime import UTC, datetime
from typing import TYPE_CHECKING, Protocol, cast
from urllib.parse import urlparse
from uuid import NAMESPACE_URL, UUID, uuid5
from firecrawl import AsyncFirecrawl
from typing_extensions import override
from ..config import get_settings
from ..core.exceptions import IngestionError
from ..core.models import (
Document,
DocumentMetadata,
FirecrawlConfig,
IngestionJob,
IngestionSource,
)
from .base import BaseIngestor
if TYPE_CHECKING:
from ..storage.base import BaseStorage
class FirecrawlMetadata(Protocol):
"""Protocol for Firecrawl metadata objects."""
title: str | None
description: str | None
author: str | None
language: str | None
sitemap_last_modified: str | None
sourceURL: str | None
keywords: str | list[str] | None
robots: str | None
ogTitle: str | None
ogDescription: str | None
ogUrl: str | None
ogImage: str | None
twitterCard: str | None
twitterSite: str | None
twitterCreator: str | None
favicon: str | None
statusCode: int | None
class FirecrawlResult(Protocol):
"""Protocol for Firecrawl scrape result objects."""
metadata: FirecrawlMetadata | None
markdown: str | None
class FirecrawlMapLink(Protocol):
"""Protocol for Firecrawl map link objects."""
url: str
class FirecrawlMapResult(Protocol):
"""Protocol for Firecrawl map result objects."""
links: list[FirecrawlMapLink] | None
class AsyncFirecrawlSession(Protocol):
"""Protocol for AsyncFirecrawl session objects."""
async def close(self) -> None: ...
class AsyncFirecrawlClient(Protocol):
"""Protocol for AsyncFirecrawl client objects."""
_session: AsyncFirecrawlSession | None
async def close(self) -> None: ...
async def scrape(self, url: str, formats: list[str]) -> FirecrawlResult: ...
async def map(self, url: str, limit: int | None = None) -> "FirecrawlMapResult": ...
class FirecrawlError(IngestionError):
"""Base exception for Firecrawl-related errors."""
status_code: int | None
def __init__(self, message: str, status_code: int | None = None) -> None:
super().__init__(message)
self.status_code = status_code
class FirecrawlConnectionError(FirecrawlError):
"""Connection error with Firecrawl service."""
pass
class FirecrawlRateLimitError(FirecrawlError):
"""Rate limit exceeded error."""
pass
class FirecrawlUnauthorizedError(FirecrawlError):
"""Unauthorized access error."""
pass
async def retry_with_backoff(
operation: Callable[[], Awaitable[object]], max_retries: int = 3
) -> object:
"""Retry operation with exponential backoff following Firecrawl best practices."""
for attempt in range(max_retries):
try:
return await operation()
except Exception as e:
if attempt == max_retries - 1:
raise e
delay: float = 1.0 * (2**attempt)
logging.warning(
f"Firecrawl operation failed (attempt {attempt + 1}/{max_retries}): {e}. Retrying in {delay:.1f}s..."
)
await asyncio.sleep(delay)
# This should never be reached due to the exception handling above,
# but mypy requires a return statement for all code paths
raise RuntimeError("Retry loop completed without return or exception")
@dataclass(slots=True)
class FirecrawlPage:
"""Structured representation of a scraped Firecrawl page."""
url: str
content: str
title: str | None
description: str | None
author: str | None = None
language: str | None = None
sitemap_last_modified: str | None = None
source_url: str | None = None
keywords: list[str] | None = None
robots: str | None = None
og_title: str | None = None
og_description: str | None = None
og_url: str | None = None
og_image: str | None = None
twitter_card: str | None = None
twitter_site: str | None = None
twitter_creator: str | None = None
favicon: str | None = None
status_code: int | None = None
class FirecrawlIngestor(BaseIngestor):
"""Ingestor for web and documentation sites using Firecrawl."""
config: FirecrawlConfig
client: AsyncFirecrawlClient
def __init__(self, config: FirecrawlConfig | None = None):
"""
Initialize Firecrawl ingestor.
Args:
config: Firecrawl configuration (for operational params only)
"""
self.config = config or FirecrawlConfig()
settings = get_settings()
# All connection details come from settings/.env
# For self-hosted instances, use a dummy API key if none is provided
# The SDK requires an API key even for self-hosted instances
api_key = settings.firecrawl_api_key or "no-key-required"
# Initialize AsyncFirecrawl following official pattern
# Note: api_url parameter may not be supported in all versions
# Default to standard initialization for cloud instances
try:
endpoint_str = str(settings.firecrawl_endpoint).rstrip("/")
if endpoint_str.startswith("http://crawl.lab") or endpoint_str.startswith(
"http://localhost"
):
# Self-hosted instance - try with api_url if supported
self.client = cast(
AsyncFirecrawlClient,
AsyncFirecrawl(api_key=api_key, api_url=str(settings.firecrawl_endpoint)),
)
else:
# Cloud instance - use standard initialization
self.client = cast(AsyncFirecrawlClient, AsyncFirecrawl(api_key=api_key))
except Exception:
# Fallback to standard initialization
self.client = cast(AsyncFirecrawlClient, AsyncFirecrawl(api_key=api_key))
@override
async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
"""
Ingest documents from a web source.
Args:
job: The ingestion job configuration
Yields:
Documents from the web source
"""
url = str(job.source_url)
# First, map the site to understand its structure
site_map = await self.map_site(url) or [url]
# Process pages in batches
batch_size = 10
for i in range(0, len(site_map), batch_size):
batch_urls = site_map[i : i + batch_size]
pages = await self.scrape_pages(batch_urls)
for page in pages:
yield self.create_document(page, job)
async def ingest_with_dedup(
self,
job: IngestionJob,
storage_client: "BaseStorage",
*,
collection_name: str | None = None,
stale_after_days: int = 30,
) -> AsyncGenerator[Document, None]:
"""
Ingest documents with duplicate detection to avoid unnecessary scraping.
Args:
job: The ingestion job configuration
storage_client: Storage client to check for existing documents
collection_name: Collection to check for duplicates
stale_after_days: Consider documents stale after this many days
Yields:
Documents from the web source (only new/stale ones)
"""
url = str(job.source_url)
# First, map the site to understand its structure
site_map = await self.map_site(url) or [url]
# Filter out URLs that already exist in storage and are fresh
eligible_urls: list[str] = []
for check_url in site_map:
document_id = str(self.compute_document_id(check_url))
exists = await storage_client.check_exists(
document_id, collection_name=collection_name, stale_after_days=stale_after_days
)
if not exists:
eligible_urls.append(check_url)
if not eligible_urls:
return # No new documents to scrape
# Process eligible pages in batches
batch_size = 10
for i in range(0, len(eligible_urls), batch_size):
batch_urls = eligible_urls[i : i + batch_size]
pages = await self.scrape_pages(batch_urls)
for page in pages:
yield self.create_document(page, job)
async def map_site(self, url: str) -> list[str]:
"""Public wrapper for mapping a site."""
return await self._map_site(url)
async def scrape_pages(self, urls: list[str]) -> list[FirecrawlPage]:
"""Scrape a batch of URLs and return structured page data."""
return await self._scrape_batch(urls)
@override
async def validate_source(self, source_url: str) -> bool:
"""
Validate if the web source is accessible.
Args:
source_url: URL to validate
Returns:
True if source is accessible
"""
try:
# Use SDK v2 endpoints following official pattern with retry
async def validate_operation() -> bool:
result = await self.client.scrape(source_url, formats=["markdown"])
return result is not None and getattr(result, "markdown", None) is not None
result = await retry_with_backoff(validate_operation)
return bool(result)
except Exception as e:
logging.warning(f"Failed to validate source {source_url}: {e}")
return False
@override
async def estimate_size(self, source_url: str) -> int:
"""
Estimate the number of pages in the website.
Args:
source_url: URL of the website
Returns:
Estimated number of pages
"""
try:
site_map = await self._map_site(source_url)
return len(site_map) if site_map else 0
except Exception as e:
logging.warning(f"Failed to estimate size for {source_url}: {e}")
return 0
async def _map_site(self, url: str) -> list[str]:
"""
Map a website to get all URLs.
Args:
url: Base URL to map
Returns:
List of URLs found
"""
try:
# Use SDK v2 map endpoint following official pattern
result: FirecrawlMapResult = await self.client.map(url=url, limit=self.config.limit)
if result and result.links:
# Extract URLs from the result following official pattern
return [link.url for link in result.links]
return []
except Exception as e:
# If map fails (might not be available in all versions), fall back to single URL
logging.warning(f"Map endpoint not available or failed: {e}. Using single URL.")
return [url]
async def _scrape_batch(self, urls: list[str]) -> list[FirecrawlPage]:
"""
Scrape a batch of URLs.
Args:
urls: List of URLs to scrape
Returns:
List of scraped documents
"""
tasks = [self._scrape_single(url) for url in urls]
results = await asyncio.gather(*tasks, return_exceptions=True)
pages: list[FirecrawlPage] = []
for result in results:
if isinstance(result, FirecrawlPage):
pages.append(result)
elif isinstance(result, BaseException):
continue
return pages
async def _scrape_single(self, url: str) -> FirecrawlPage | None:
"""
Scrape a single URL and extract rich metadata.
Args:
url: URL to scrape
Returns:
Scraped document data with enhanced metadata
"""
try:
# Use SDK v2 scrape endpoint following official pattern with retry
async def scrape_operation() -> FirecrawlPage | None:
result: FirecrawlResult = await self.client.scrape(url, formats=self.config.formats)
# Extract data from the result following official response handling
if result:
# The SDK returns a ScrapeData object with typed metadata
metadata: FirecrawlMetadata | None = getattr(result, "metadata", None)
# Extract basic metadata
title: str | None = getattr(metadata, "title", None) if metadata else None
description: str | None = (
getattr(metadata, "description", None) if metadata else None
)
# Extract enhanced metadata if available
author: str | None = getattr(metadata, "author", None) if metadata else None
language: str | None = getattr(metadata, "language", None) if metadata else None
sitemap_last_modified: str | None = (
getattr(metadata, "sitemap_last_modified", None) if metadata else None
)
source_url: str | None = (
getattr(metadata, "sourceURL", None) if metadata else None
)
keywords: str | list[str] | None = (
getattr(metadata, "keywords", None) if metadata else None
)
robots: str | None = getattr(metadata, "robots", None) if metadata else None
# Open Graph metadata
og_title: str | None = getattr(metadata, "ogTitle", None) if metadata else None
og_description: str | None = (
getattr(metadata, "ogDescription", None) if metadata else None
)
og_url: str | None = getattr(metadata, "ogUrl", None) if metadata else None
og_image: str | None = getattr(metadata, "ogImage", None) if metadata else None
# Twitter metadata
twitter_card: str | None = (
getattr(metadata, "twitterCard", None) if metadata else None
)
twitter_site: str | None = (
getattr(metadata, "twitterSite", None) if metadata else None
)
twitter_creator: str | None = (
getattr(metadata, "twitterCreator", None) if metadata else None
)
# Additional metadata
favicon: str | None = getattr(metadata, "favicon", None) if metadata else None
status_code: int | None = (
getattr(metadata, "statusCode", None) if metadata else None
)
return FirecrawlPage(
url=url,
content=getattr(result, "markdown", "") or "",
title=title,
description=description,
author=author,
language=language,
sitemap_last_modified=sitemap_last_modified,
source_url=source_url,
                        keywords=[k.strip() for k in keywords.split(",")]
                        if keywords and isinstance(keywords, str)
                        else (keywords if isinstance(keywords, list) else None),
robots=robots,
og_title=og_title,
og_description=og_description,
og_url=og_url,
og_image=og_image,
twitter_card=twitter_card,
twitter_site=twitter_site,
twitter_creator=twitter_creator,
favicon=favicon,
status_code=status_code,
)
return None
result = await retry_with_backoff(scrape_operation)
return result if isinstance(result, FirecrawlPage) else None
except Exception as e:
logging.debug(f"Failed to scrape {url}: {e}")
return None
@staticmethod
def compute_document_id(source_url: str) -> UUID:
"""Derive a deterministic UUID for a document based on its source URL."""
return uuid5(NAMESPACE_URL, source_url)
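    # Determinism sketch: uuid5(NAMESPACE_URL, url) is stable across runs and
    # hosts, so compute_document_id("https://docs.example.com/a") (hypothetical
    # URL) yields the same UUID every time, letting re-ingestion overwrite a
    # page rather than duplicate it.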
@staticmethod
def _analyze_content_structure(content: str) -> dict[str, str | int | bool | list[str]]:
"""Analyze markdown content to extract structural information."""
# Extract heading hierarchy
heading_pattern = r"^(#{1,6})\s+(.+)$"
headings: list[str] = []
for match in re.finditer(heading_pattern, content, re.MULTILINE):
level = len(match.group(1))
text = match.group(2).strip()
headings.append(f"{' ' * (level - 1)}{text}")
# Check for various content types
has_code_blocks = bool(re.search(r"```[\s\S]*?```", content))
has_images = bool(re.search(r"!\[.*?\]\(.*?\)", content))
has_links = bool(re.search(r"\[.*?\]\(.*?\)", content))
# Calculate section depth
max_depth = 0
if headings:
for heading in headings:
heading_str: str = str(heading)
                # Headings above are indented one space per level, so depth is
                # the leading-space count plus one
                depth = (len(heading_str) - len(heading_str.lstrip())) + 1
max_depth = max(max_depth, depth)
return {
"heading_hierarchy": headings,
"section_depth": max_depth,
"has_code_blocks": has_code_blocks,
"has_images": has_images,
"has_links": has_links,
}
@staticmethod
def _calculate_content_quality(content: str, title: str | None) -> dict[str, float | None]:
"""Calculate basic content quality metrics."""
if not content:
return {"readability_score": None, "completeness_score": None}
# Simple readability approximation (Flesch-like)
sentences = len(re.findall(r"[.!?]+", content))
words = len(content.split())
if sentences == 0 or words == 0:
readability_score = None
else:
avg_sentence_length = words / sentences
# Simplified readability score (0-100, higher is more readable)
readability_score = max(0.0, min(100.0, 100.0 - (avg_sentence_length - 15.0) * 2.0))
# Completeness score based on structure
completeness_factors = 0
total_factors = 5
if title:
completeness_factors += 1
if len(content) > 500:
completeness_factors += 1
if re.search(r"^#{1,6}\s+", content, re.MULTILINE):
completeness_factors += 1
if len(content.split()) > 100:
completeness_factors += 1
if not re.search(r"(error|404|not found|page not found)", content, re.IGNORECASE):
completeness_factors += 1
completeness_score = (completeness_factors / total_factors) * 100
return {
"readability_score": readability_score,
"completeness_score": completeness_score,
}
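    # Worked example (hypothetical numbers): 300 words over 20 sentences gives
    # avg_sentence_length = 15, so readability = 100 - (15 - 15) * 2 = 100;
    # 300 words over 6 sentences gives avg = 50 and readability =
    # 100 - (50 - 15) * 2 = 30. A titled page over 500 chars with headings,
    # 100+ words, and no error text scores completeness 5/5 = 100.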
@staticmethod
def _extract_domain_info(url: str) -> dict[str, str]:
"""Extract domain and site information from URL."""
parsed = urlparse(url)
domain = parsed.netloc.lower()
# Remove www. prefix
if domain.startswith("www."):
domain = domain[4:]
# Extract site name from domain
domain_parts = domain.split(".")
site_name = domain_parts[0].replace("-", " ").replace("_", " ").title()
return {
"domain": domain,
"site_name": site_name,
}
def create_document(self, page: FirecrawlPage, job: IngestionJob) -> Document:
"""
Create a Document from scraped data with enriched metadata.
Args:
page: Scraped document data
job: The ingestion job
Returns:
Document instance with rich metadata
"""
content = page.content
source_url = page.url
# Analyze content structure
structure_info = self._analyze_content_structure(content)
# Calculate quality metrics
quality_info = self._calculate_content_quality(content, page.title)
# Extract domain information
domain_info = self._extract_domain_info(source_url)
# Build rich metadata
metadata: DocumentMetadata = {
# Core required fields
"source_url": source_url,
"timestamp": datetime.now(UTC),
"content_type": "text/markdown",
"word_count": len(content.split()),
"char_count": len(content),
# Basic optional fields
"title": page.title or f"Page from {source_url}",
"description": page.description
or page.og_description
or f"Content scraped from {source_url}",
# Content categorization
"tags": page.keywords or [],
"language": page.language or "en",
# Authorship and source info
"author": page.author or page.twitter_creator or "Unknown",
"domain": domain_info["domain"],
"site_name": domain_info["site_name"],
# Document structure
"heading_hierarchy": (
list(hierarchy_val)
if (hierarchy_val := structure_info.get("heading_hierarchy"))
and isinstance(hierarchy_val, (list, tuple))
else []
),
"section_depth": (
int(depth_val)
if (depth_val := structure_info.get("section_depth"))
and isinstance(depth_val, (int, str))
else 0
),
"has_code_blocks": bool(structure_info.get("has_code_blocks", False)),
"has_images": bool(structure_info.get("has_images", False)),
"has_links": bool(structure_info.get("has_links", False)),
# Processing metadata
"extraction_method": "firecrawl",
"last_modified": datetime.fromisoformat(page.sitemap_last_modified)
if page.sitemap_last_modified
else None,
# Content quality indicators
"readability_score": quality_info["readability_score"],
"completeness_score": quality_info["completeness_score"],
}
# Note: Additional web-specific metadata like og_title, twitter_card etc.
# would need to be added to DocumentMetadata TypedDict if needed
return Document(
id=self.compute_document_id(source_url),
content=content,
metadata=metadata,
source=IngestionSource.WEB,
collection=job.storage_backend.value,
)
async def close(self) -> None:
"""Close the Firecrawl client and cleanup resources."""
# AsyncFirecrawl may not have explicit close method in all versions
# This is defensive cleanup following best practices
if hasattr(self.client, "close"):
try:
await self.client.close()
except Exception as e:
logging.debug(f"Error closing Firecrawl client: {e}")
elif (
hasattr(self.client, "_session")
and self.client._session
and hasattr(self.client._session, "close")
):
try:
await self.client._session.close()
except Exception as e:
logging.debug(f"Error closing Firecrawl session: {e}")
async def __aenter__(self) -> "FirecrawlIngestor":
"""Async context manager entry."""
return self
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: object | None,
) -> None:
"""Async context manager exit with cleanup."""
await self.close()
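# Hedged usage sketch: map a site then scrape the first few pages, with client
# cleanup handled by the context manager. The URL is hypothetical; connection
# details come from settings/.env as described above.
async def _example_firecrawl_ingest() -> list[FirecrawlPage]:
    async with FirecrawlIngestor() as ingestor:
        urls = await ingestor.map_site("https://docs.example.com")
        return await ingestor.scrape_pages(urls[:10])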
</file>
<file path="ingest_pipeline/utils/metadata_tagger.py">
"""Metadata tagger for enriching documents with AI-generated tags and metadata."""
import json
from datetime import UTC, datetime
from typing import Final, Protocol, TypedDict, cast
import httpx
from ..config import get_settings
from ..core.exceptions import IngestionError
from ..core.models import Document
JSON_CONTENT_TYPE: Final[str] = "application/json"
AUTHORIZATION_HEADER: Final[str] = "Authorization"
class HttpResponse(Protocol):
"""Protocol for HTTP response."""
def raise_for_status(self) -> None: ...
def json(self) -> dict[str, object]: ...
class AsyncHttpClient(Protocol):
"""Protocol for async HTTP client."""
async def post(self, url: str, *, json: dict[str, object] | None = None) -> HttpResponse: ...
async def aclose(self) -> None: ...
async def __aenter__(self) -> "AsyncHttpClient": ...
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: object | None,
) -> None: ...
class LlmResponse(TypedDict):
"""Type for LLM API response structure."""
choices: list[dict[str, object]]
class LlmChoice(TypedDict):
"""Type for individual choice in LLM response."""
message: dict[str, object]
class LlmMessage(TypedDict):
"""Type for message in LLM choice."""
content: str
class DocumentMetadata(TypedDict, total=False):
"""Structured metadata for documents."""
tags: list[str]
category: str
summary: str
key_topics: list[str]
document_type: str
language: str
technical_level: str
class MetadataTagger:
"""Generates metadata tags for documents using language models."""
endpoint: str
model: str
client: AsyncHttpClient
def __init__(
self,
llm_endpoint: str | None = None,
model: str | None = None,
api_key: str | None = None,
*,
timeout: float | None = None,
):
"""
Initialize metadata tagger.
Args:
llm_endpoint: LLM API endpoint
model: Model to use for tagging
api_key: Explicit API key override
timeout: Optional request timeout override in seconds
"""
settings = get_settings()
endpoint_value = llm_endpoint or str(settings.llm_endpoint)
self.endpoint = endpoint_value.rstrip("/")
self.model = model or settings.metadata_model
resolved_timeout = timeout if timeout is not None else float(settings.request_timeout)
resolved_api_key = api_key or settings.get_llm_api_key() or ""
headers: dict[str, str] = {"Content-Type": JSON_CONTENT_TYPE}
if resolved_api_key:
headers[AUTHORIZATION_HEADER] = f"Bearer {resolved_api_key}"
# Create client with proper typing - httpx.AsyncClient implements AsyncHttpClient protocol
self.client = cast(
AsyncHttpClient,
httpx.AsyncClient(timeout=resolved_timeout, headers=headers),
)
async def tag_document(
self, document: Document, custom_instructions: str | None = None
) -> Document:
"""
Analyze document and generate metadata tags.
Args:
document: Document to tag
custom_instructions: Optional custom instructions for tagging
Returns:
Document with enriched metadata
"""
if not document.content:
return document
try:
# Generate metadata using LLM
metadata = await self._generate_metadata(
document.content,
document.metadata.get("title") if document.metadata else None,
custom_instructions,
)
# Merge with existing metadata - preserve ALL existing fields and add LLM-generated ones
from ..core.models import DocumentMetadata as CoreDocumentMetadata
# Start with a copy of existing metadata to preserve all fields
updated_metadata = dict(document.metadata)
            # Update/enhance with LLM-generated metadata, preserving existing values when new ones are empty
            if summary_val := metadata.get("summary"):
                if not updated_metadata.get("description"):
                    updated_metadata["description"] = str(summary_val)
            if tags_val := metadata.get("tags"):
                if not updated_metadata.get("tags"):
                    updated_metadata["tags"] = [str(tag) for tag in tags_val]
            if category_val := metadata.get("category"):
                if not updated_metadata.get("category"):
                    updated_metadata["category"] = str(category_val)
            if language_val := metadata.get("language"):
                if not updated_metadata.get("language"):
                    updated_metadata["language"] = str(language_val)
# Ensure required fields have values
_ = updated_metadata.setdefault("source_url", "")
_ = updated_metadata.setdefault("timestamp", datetime.now(UTC))
_ = updated_metadata.setdefault("content_type", "text/plain")
_ = updated_metadata.setdefault("word_count", len(document.content.split()))
_ = updated_metadata.setdefault("char_count", len(document.content))
# Build a proper DocumentMetadata instance with only valid keys
            timestamp_val = updated_metadata.get("timestamp", datetime.now(UTC))
            word_count_val = updated_metadata.get("word_count", 0)
            char_count_val = updated_metadata.get("char_count", 0)
            new_metadata: CoreDocumentMetadata = {
                "source_url": str(updated_metadata.get("source_url", "")),
                "timestamp": timestamp_val
                if isinstance(timestamp_val, datetime)
                else datetime.now(UTC),
                "content_type": str(updated_metadata.get("content_type", "text/plain")),
                "word_count": int(word_count_val) if isinstance(word_count_val, (int, str)) else 0,
                "char_count": int(char_count_val) if isinstance(char_count_val, (int, str)) else 0,
            }
# Add optional fields if they exist
if "title" in updated_metadata and updated_metadata["title"]:
new_metadata["title"] = str(updated_metadata["title"])
if "description" in updated_metadata and updated_metadata["description"]:
new_metadata["description"] = str(updated_metadata["description"])
if "tags" in updated_metadata and isinstance(updated_metadata["tags"], list):
tags_list = cast(list[object], updated_metadata["tags"])
new_metadata["tags"] = [str(tag) for tag in tags_list if tag is not None]
if "category" in updated_metadata and updated_metadata["category"]:
new_metadata["category"] = str(updated_metadata["category"])
if "language" in updated_metadata and updated_metadata["language"]:
new_metadata["language"] = str(updated_metadata["language"])
document.metadata = new_metadata
return document
except Exception as e:
raise IngestionError(f"Failed to tag document: {e}") from e
async def tag_batch(
self,
documents: list[Document],
custom_instructions: str | None = None,
) -> list[Document]:
"""
Tag multiple documents with metadata.
Args:
documents: Documents to tag
custom_instructions: Optional custom instructions
Returns:
Documents with enriched metadata
"""
tagged_docs: list[Document] = []
for doc in documents:
tagged_doc = await self.tag_document(doc, custom_instructions)
tagged_docs.append(tagged_doc)
return tagged_docs
async def _generate_metadata(
self,
content: str,
title: str | None = None,
custom_instructions: str | None = None,
) -> DocumentMetadata:
"""
Generate metadata using LLM.
Args:
content: Document content
title: Document title
custom_instructions: Optional custom instructions
Returns:
Generated metadata dictionary
"""
# Prepare the prompt
system_prompt = """You are a document metadata tagger. Analyze the given content and generate relevant metadata.
Return a JSON object with the following structure:
{
"tags": ["tag1", "tag2", ...], # 3-7 relevant topic tags
"category": "string", # Main category
"summary": "string", # 1-2 sentence summary
"key_topics": ["topic1", "topic2", ...], # Main topics discussed
"document_type": "string", # Type of document (e.g., "technical", "tutorial", "reference")
"language": "string", # Primary language (e.g., "en", "es")
"technical_level": "string" # One of: "beginner", "intermediate", "advanced"
}"""
if custom_instructions:
system_prompt += f"\n\nAdditional instructions: {custom_instructions}"
# Prepare user prompt
user_prompt = "Document to analyze:\n"
if title:
user_prompt += f"Title: {title}\n"
user_prompt += f"Content:\n{content[:3000]}" # Limit content length
# Call LLM
response = await self.client.post(
f"{self.endpoint}/v1/chat/completions",
json={
"model": self.model,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
],
"temperature": 0.3,
"max_tokens": 500,
"response_format": {"type": "json_object"},
},
)
response.raise_for_status()
result_raw = response.json()
if not isinstance(result_raw, dict):
raise IngestionError("Invalid response format from LLM")
result = cast(LlmResponse, result_raw)
# Extract content from response
choices = result.get("choices", [])
if not choices or not isinstance(choices, list):
raise IngestionError("No response from LLM")
first_choice_raw = choices[0]
if not isinstance(first_choice_raw, dict):
raise IngestionError("Invalid choice format")
first_choice = cast(LlmChoice, first_choice_raw)
message_raw = first_choice.get("message", {})
if not isinstance(message_raw, dict):
raise IngestionError("Invalid message format")
message = cast(LlmMessage, message_raw)
content_str = str(message.get("content", "{}"))
try:
raw_metadata = cast(dict[str, object], json.loads(content_str))
except json.JSONDecodeError as e:
raise IngestionError(f"Failed to parse LLM response: {e}") from e
# Ensure it's a dict before processing
if not isinstance(raw_metadata, dict):
raise IngestionError("LLM response is not a valid JSON object")
# Validate and sanitize metadata
return self._sanitize_metadata(raw_metadata)
def _sanitize_metadata(self, metadata: dict[str, object]) -> DocumentMetadata:
"""
Sanitize and validate metadata.
Args:
metadata: Raw metadata from LLM
Returns:
Sanitized metadata
"""
sanitized: DocumentMetadata = {}
# Tags
if "tags" in metadata and isinstance(metadata["tags"], list):
tags_list = cast(list[object], metadata["tags"])
            tags_raw = tags_list[:10]
tags = [str(tag).lower().strip() for tag in tags_raw]
sanitized["tags"] = [tag for tag in tags if tag]
# Category
if "category" in metadata:
sanitized["category"] = str(metadata["category"]).strip()
# Summary
if "summary" in metadata:
if summary := str(metadata["summary"]).strip():
sanitized["summary"] = summary[:500] # Limit length
# Key topics
if "key_topics" in metadata and isinstance(metadata["key_topics"], list):
topics_list = cast(list[object], metadata["key_topics"])
            topics_raw = topics_list[:10]
topics = [str(topic).strip() for topic in topics_raw]
sanitized["key_topics"] = [topic for topic in topics if topic]
# Document type
if "document_type" in metadata:
sanitized["document_type"] = str(metadata["document_type"]).strip()
# Language
if "language" in metadata:
lang = str(metadata["language"]).strip().lower()
if len(lang) == 2: # Basic validation for ISO 639-1
sanitized["language"] = lang
# Technical level
if "technical_level" in metadata:
level = str(metadata["technical_level"]).strip().lower()
if level in {"beginner", "intermediate", "advanced"}:
sanitized["technical_level"] = level
return sanitized
async def __aenter__(self) -> "MetadataTagger":
"""Async context manager entry."""
return self
async def __aexit__(self, *args: object) -> None:
"""Async context manager exit."""
await self.client.aclose()
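# Hedged usage sketch: tag a batch inside the async context manager so the
# underlying HTTP client is closed afterwards; endpoint and model fall back to
# settings as documented above.
async def _example_tag_documents(documents: list[Document]) -> list[Document]:
    async with MetadataTagger() as tagger:
        return await tagger.tag_batch(documents)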
</file>
<file path="ingest_pipeline/utils/vectorizer.py">
"""Vectorizer utility for generating embeddings."""
import asyncio
from types import TracebackType
from typing import Final, NotRequired, Self, TypedDict
import httpx
from ..config import get_settings
from ..core.exceptions import VectorizationError
from ..core.models import StorageConfig, VectorConfig
JSON_CONTENT_TYPE: Final[str] = "application/json"
AUTHORIZATION_HEADER: Final[str] = "Authorization"
class EmbeddingData(TypedDict):
"""Structure for embedding data from providers."""
embedding: list[float]
index: NotRequired[int]
object: NotRequired[str]
class EmbeddingResponse(TypedDict):
"""Embedding response format for multiple providers."""
data: list[EmbeddingData]
model: NotRequired[str]
object: NotRequired[str]
usage: NotRequired[dict[str, int]]
# Alternative formats
embedding: NotRequired[list[float]]
vector: NotRequired[list[float]]
embeddings: NotRequired[list[list[float]]]
def _extract_embedding_from_response(response_data: dict[str, object]) -> list[float]:
"""Extract embedding vector from provider response."""
# OpenAI/Ollama format: {"data": [{"embedding": [...]}]}
if "data" in response_data:
data_list = response_data["data"]
if isinstance(data_list, list) and data_list:
first_item = data_list[0]
if isinstance(first_item, dict) and "embedding" in first_item:
embedding = first_item["embedding"]
if isinstance(embedding, list) and all(
isinstance(x, (int, float)) for x in embedding
):
return [float(x) for x in embedding]
# Direct embedding format: {"embedding": [...]}
if "embedding" in response_data:
embedding = response_data["embedding"]
if isinstance(embedding, list) and all(isinstance(x, (int, float)) for x in embedding):
return [float(x) for x in embedding]
# Vector format: {"vector": [...]}
if "vector" in response_data:
vector = response_data["vector"]
if isinstance(vector, list) and all(isinstance(x, (int, float)) for x in vector):
return [float(x) for x in vector]
# Embeddings array format: {"embeddings": [[...]]}
if "embeddings" in response_data:
embeddings = response_data["embeddings"]
if isinstance(embeddings, list) and embeddings:
first_embedding = embeddings[0]
if isinstance(first_embedding, list) and all(
isinstance(x, (int, float)) for x in first_embedding
):
return [float(x) for x in first_embedding]
raise VectorizationError("Unrecognized embedding response format")
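# Illustrative inputs (hypothetical values) for _extract_embedding_from_response:
#   {"data": [{"embedding": [0.1, 0.2]}]}  -> [0.1, 0.2]   (OpenAI/Ollama)
#   {"embedding": [0.1, 0.2]}              -> [0.1, 0.2]   (direct)
#   {"vector": [0.1, 0.2]}                 -> [0.1, 0.2]
#   {"embeddings": [[0.1, 0.2]]}           -> [0.1, 0.2]
# Any other shape raises VectorizationError.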
class Vectorizer:
"""Handles text vectorization using LLM endpoints."""
endpoint: str
model: str
dimension: int
def __init__(self, config: StorageConfig | VectorConfig):
"""
Initialize vectorizer.
Args:
config: Configuration with embedding details
"""
settings = get_settings()
if isinstance(config, StorageConfig):
# Extract vector config from global settings when storage config is provided
self.endpoint = str(settings.llm_endpoint).rstrip("/")
self.model = settings.embedding_model
self.dimension = settings.embedding_dimension
else:
self.endpoint = str(config.embedding_endpoint).rstrip("/")
self.model = config.model
self.dimension = config.dimension
resolved_api_key = settings.get_llm_api_key() or ""
headers: dict[str, str] = {"Content-Type": JSON_CONTENT_TYPE}
if resolved_api_key:
headers[AUTHORIZATION_HEADER] = f"Bearer {resolved_api_key}"
timeout_seconds = float(settings.request_timeout)
self.client = httpx.AsyncClient(timeout=timeout_seconds, headers=headers)
async def vectorize(self, text: str) -> list[float]:
"""
Generate embedding vector for text.
Args:
text: Text to vectorize
Returns:
Embedding vector
"""
if not text:
raise VectorizationError("Cannot vectorize empty text")
try:
return (
await self._ollama_embed(text)
if "ollama" in self.model
else await self._openai_embed(text)
)
except Exception as e:
raise VectorizationError(f"Vectorization failed: {e}") from e
async def vectorize_batch(self, texts: list[str]) -> list[list[float]]:
"""
Generate embeddings for multiple texts in parallel.
Args:
texts: List of texts to vectorize
Returns:
List of embedding vectors
Raises:
VectorizationError: If any vectorization fails
"""
if not texts:
return []
# Use semaphore to limit concurrent requests and prevent overwhelming the endpoint
semaphore = asyncio.Semaphore(20)
async def vectorize_with_semaphore(text: str) -> list[float]:
async with semaphore:
return await self.vectorize(text)
try:
# Execute all vectorization requests concurrently
vectors = await asyncio.gather(*[vectorize_with_semaphore(text) for text in texts])
return list(vectors)
except Exception as e:
raise VectorizationError(f"Batch vectorization failed: {e}") from e
async def _ollama_embed(self, text: str) -> list[float]:
"""
Generate embedding using Ollama via OpenAI-compatible endpoint.
Args:
text: Text to embed
Returns:
Embedding vector
"""
# Use the full model name as it appears in the API
model_name = self.model
# Use OpenAI-compatible endpoint for ollama models
response = await self.client.post(
f"{self.endpoint}/v1/embeddings",
json={
"model": model_name,
"input": text,
},
)
_ = response.raise_for_status()
response_json = response.json()
if not isinstance(response_json, dict):
raise VectorizationError("Invalid JSON response format")
# Extract embedding using type-safe helper
embedding = _extract_embedding_from_response(response_json)
# Ensure correct dimension
if len(embedding) != self.dimension:
raise VectorizationError(
f"Embedding dimension mismatch: expected {self.dimension}, received {len(embedding)}"
)
return embedding
async def _openai_embed(self, text: str) -> list[float]:
"""
Generate embedding using OpenAI-compatible API.
Args:
text: Text to embed
Returns:
Embedding vector
"""
response = await self.client.post(
f"{self.endpoint}/v1/embeddings",
json={
"model": self.model,
"input": text,
},
)
_ = response.raise_for_status()
response_json = response.json()
if not isinstance(response_json, dict):
raise VectorizationError("Invalid JSON response format")
# Extract embedding using type-safe helper
embedding = _extract_embedding_from_response(response_json)
# Ensure correct dimension
if len(embedding) != self.dimension:
raise VectorizationError(
f"Embedding dimension mismatch: expected {self.dimension}, received {len(embedding)}"
)
return embedding
async def __aenter__(self) -> Self:
"""Async context manager entry."""
return self
async def close(self) -> None:
"""Close the HTTP client connection."""
try:
await self.client.aclose()
except Exception:
# Already closed or connection lost
pass
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: TracebackType | None,
) -> None:
"""Async context manager exit."""
await self.close()
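# Hedged usage sketch: the VectorConfig field names below are assumed from how
# Vectorizer reads them (embedding_endpoint, model, dimension); the endpoint
# and model values are hypothetical placeholders.
async def _example_vectorize(texts: list[str]) -> list[list[float]]:
    config = VectorConfig(
        embedding_endpoint="http://localhost:11434",
        model="nomic-embed-text",
        dimension=768,
    )
    async with Vectorizer(config) as vectorizer:
        return await vectorizer.vectorize_batch(texts)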
</file>
<file path="ingest_pipeline/cli/tui/screens/dashboard.py">
"""Main dashboard screen with collections overview."""
import logging
from typing import TYPE_CHECKING, Final
from textual import work
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Grid, Horizontal
from textual.css.query import NoMatches
from textual.reactive import reactive, var
from textual.screen import Screen
from textual.widgets import (
Button,
Footer,
Header,
LoadingIndicator,
Rule,
Static,
TabbedContent,
TabPane,
)
from typing_extensions import override
from ....core.models import StorageBackend
from ....storage.base import BaseStorage
from ....storage.openwebui import OpenWebUIStorage
from ....storage.weaviate import WeaviateStorage
from ..models import CollectionInfo
from ..utils.storage_manager import StorageManager
from ..widgets import EnhancedDataTable, MetricsCard, StatusIndicator
if TYPE_CHECKING:
from ....storage.r2r.storage import R2RStorage
else: # pragma: no cover - optional dependency fallback
R2RStorage = BaseStorage
LOGGER: Final[logging.Logger] = logging.getLogger(__name__)
class CollectionOverviewScreen(Screen[None]):
"""Enhanced dashboard with modern design and metrics."""
total_documents: int = 0
total_collections: int = 0
active_backends: int = 0
BINDINGS = [
Binding("q", "quit", "Quit"),
Binding("r", "refresh", "Refresh"),
Binding("i", "ingest", "Ingest"),
Binding("m", "manage", "Manage"),
Binding("s", "search", "Search"),
Binding("ctrl+d", "delete", "Delete"),
Binding("ctrl+1", "tab_dashboard", "Dashboard"),
Binding("ctrl+2", "tab_collections", "Collections"),
Binding("ctrl+3", "tab_analytics", "Analytics"),
Binding("tab", "next_tab", "Next Tab"),
Binding("shift+tab", "prev_tab", "Prev Tab"),
Binding("f1", "help", "Help"),
]
collections: var[list[CollectionInfo]] = var([])
is_loading: var[bool] = var(False)
selected_collection: reactive[CollectionInfo | None] = reactive(None)
storage_manager: StorageManager
weaviate: WeaviateStorage | None
openwebui: OpenWebUIStorage | None
r2r: R2RStorage | BaseStorage | None
def __init__(
self,
storage_manager: StorageManager,
weaviate: WeaviateStorage | None,
openwebui: OpenWebUIStorage | None,
r2r: R2RStorage | BaseStorage | None,
) -> None:
super().__init__()
self.storage_manager = storage_manager
self.weaviate = weaviate
self.openwebui = openwebui
self.r2r = r2r
self.total_documents = 0
self.total_collections = 0
self.active_backends = 0
@override
def compose(self) -> ComposeResult:
yield Header(show_clock=True)
with TabbedContent():
# Dashboard Tab
with TabPane("Dashboard", id="dashboard"):
yield Container(
Static("🚀 Collection Management System", classes="title"),
Static("Modern document ingestion and management platform", classes="subtitle"),
Rule(line_style="heavy"),
# Metrics Grid
Container(
Grid(
MetricsCard(
"Collections", str(self.total_collections), "Active collections"
),
MetricsCard("Documents", str(self.total_documents), "Total indexed"),
MetricsCard(
"Backends", str(self.active_backends), "Connected services"
),
MetricsCard("Status", "Online", "System health"),
classes="responsive-grid metrics-grid",
),
classes="center",
),
Rule(line_style="dashed"),
# Quick Actions
Container(
Static("⚡ Quick Actions", classes="section-title"),
Horizontal(
Button("🔄 Refresh Data", id="quick_refresh", variant="primary"),
Button("📥 New Ingestion", id="quick_ingest", variant="success"),
Button("🔍 Search All", id="quick_search", variant="default"),
Button("⚙️ Settings", id="quick_settings", variant="default"),
classes="action_buttons",
),
classes="card",
),
# Recent Activity
Container(
Static("📊 Recent Activity", classes="section-title"),
Static(
"Loading recent activity...", id="activity_feed", classes="status-text"
),
classes="card",
),
classes="main_container",
)
# Collections Tab
with TabPane("Collections", id="collections"):
yield Container(
Static("📚 Collection Overview", classes="title"),
# Collection controls
Horizontal(
Button("🔄 Refresh", id="refresh_btn", variant="primary"),
Button("📥 Ingest", id="ingest_btn", variant="success"),
Button("🔧 Manage", id="manage_btn", variant="warning"),
Button("🗑️ Delete", id="delete_btn", variant="error"),
Button("🔍 Search", id="search_btn", variant="default"),
classes="button_bar",
),
# Collection table with enhanced navigation
EnhancedDataTable(id="collections_table", classes="enhanced-table"),
# Status bar
Container(
Static("Ready", id="status_text", classes="status-text"),
StatusIndicator("Ready", id="connection_status"),
classes="status-bar",
),
LoadingIndicator(id="loading", classes="pulse"),
classes="main_container",
)
# Analytics Tab
with TabPane("Analytics", id="analytics"):
yield Container(
Static("📈 Analytics & Insights", classes="title"),
# Analytics content
Container(
Static("🚧 Analytics Dashboard", classes="section-title"),
Static("Advanced analytics and insights coming soon!", classes="subtitle"),
# Placeholder charts area
Container(
Static("📊 Document Distribution", classes="chart-title"),
Static(
"Chart placeholder - integrate with visualization library",
classes="chart-placeholder",
),
classes="card",
),
Container(
Static("⏱️ Ingestion Timeline", classes="chart-title"),
Static("Timeline chart placeholder", classes="chart-placeholder"),
classes="card",
),
classes="analytics-grid",
),
classes="main_container",
)
yield Footer()
async def on_mount(self) -> None:
"""Initialize the screen with enhanced loading."""
self.query_one("#loading").display = False
self.update_metrics()
self.refresh_collections() # Don't await, let it run as a worker
def update_metrics(self) -> None:
"""Update dashboard metrics with enhanced calculations."""
self._calculate_metrics()
self._update_metrics_cards()
self._update_activity_feed()
def _calculate_metrics(self) -> None:
"""Calculate basic metrics from collections."""
self.total_collections = len(self.collections)
self.total_documents = sum(col["count"] for col in self.collections)
# Calculate active backends from storage manager if individual storages are None
if self.weaviate is None and self.openwebui is None and self.r2r is None:
self.active_backends = len(self.storage_manager.get_available_backends())
else:
self.active_backends = sum([bool(self.weaviate), bool(self.openwebui), bool(self.r2r)])
def _update_metrics_cards(self) -> None:
"""Update the metrics cards display."""
try:
dashboard_tab = self.query_one("#dashboard")
metrics_cards_query = dashboard_tab.query(MetricsCard)
if len(metrics_cards_query) >= 4:
metrics_cards = list(metrics_cards_query)
self._update_card_values(metrics_cards)
self._update_status_card(metrics_cards[3])
except NoMatches:
return
except Exception as exc:
LOGGER.exception("Failed to update dashboard metrics", exc_info=exc)
def _update_card_values(self, metrics_cards: list[MetricsCard]) -> None:
"""Update individual metric card values."""
metrics_cards[0].query_one(".metrics-value", Static).update(f"{self.total_collections:,}")
metrics_cards[1].query_one(".metrics-value", Static).update(f"{self.total_documents:,}")
metrics_cards[2].query_one(".metrics-value", Static).update(str(self.active_backends))
def _update_status_card(self, status_card: MetricsCard) -> None:
"""Update the system status card."""
if self.active_backends > 0 and self.total_collections > 0:
status_text, status_class = "🟢 Healthy", "status-active"
elif self.active_backends > 0:
status_text, status_class = "🟡 Ready", "status-warning"
else:
status_text, status_class = "🔴 Offline", "status-error"
status_card.query_one(".metrics-value", Static).update(status_text)
status_card.add_class(status_class)
def _update_activity_feed(self) -> None:
"""Update the activity feed with collection data."""
try:
dashboard_tab = self.query_one("#dashboard")
activity_feed = dashboard_tab.query_one("#activity_feed", Static)
activity_text = self._generate_activity_text()
activity_feed.update(activity_text)
except NoMatches:
return
except Exception as exc:
LOGGER.exception("Failed to update dashboard activity feed", exc_info=exc)
def _generate_activity_text(self) -> str:
"""Generate activity feed text from collections."""
if not self.collections:
return "🚀 No collections found. Start by creating your first ingestion!\n💡 Press 'I' to begin or use the Quick Actions above."
recent_activity = [self._format_collection_item(col) for col in self.collections[:3]]
activity_text = "\n".join(recent_activity)
if len(self.collections) > 3:
total_docs = sum(c["count"] for c in self.collections)
activity_text += (
f"\n📊 Total: {len(self.collections)} collections with {total_docs:,} documents"
)
return activity_text
def _format_collection_item(self, col: CollectionInfo) -> str:
"""Format a single collection item for the activity feed."""
content_type = self._get_content_type_icon(col["name"])
size_mb = col["size_mb"]
backend_info = col["backend"]
# Check if this represents a multi-backend ingestion result
if isinstance(backend_info, list):
if len(backend_info) > 1:
# Ensure all elements are strings for safe joining
backend_strings = [str(b) for b in backend_info if b is not None]
backend_list = " + ".join(backend_strings) if backend_strings else "unknown"
return f"{content_type} {col['name']}: {col['count']:,} docs ({size_mb:.1f} MB) → {backend_list}"
elif len(backend_info) == 1:
backend_name = str(backend_info[0]) if backend_info[0] is not None else "unknown"
return f"{content_type} {col['name']}: {col['count']:,} docs ({size_mb:.1f} MB) - {backend_name}"
else:
return f"{content_type} {col['name']}: {col['count']:,} docs ({size_mb:.1f} MB) - unknown"
else:
backend_display = str(backend_info) if backend_info is not None else "unknown"
return f"{content_type} {col['name']}: {col['count']:,} docs ({size_mb:.1f} MB) - {backend_display}"
def _get_content_type_icon(self, name: str) -> str:
"""Get appropriate icon for collection content type."""
name_lower = name.lower()
if "web" in name_lower:
return "🌐"
elif "doc" in name_lower:
return "📖"
elif "repo" in name_lower:
return "📦"
return "📄"
@work(exclusive=True)
async def refresh_collections(self) -> None:
"""Refresh collection data with enhanced multi-backend loading feedback."""
self.is_loading = True
loading_indicator = self.query_one("#loading")
status_text = self.query_one("#status_text", Static)
loading_indicator.display = True
status_text.update("🔄 Refreshing collections...")
try:
# Use storage manager for unified backend handling
if not self.storage_manager.is_initialized:
status_text.update("🔗 Initializing storage backends...")
backend_results = await self.storage_manager.initialize_all_backends()
# Report per-backend initialization status
success_count = sum(backend_results.values())
total_count = len(backend_results)
status_text.update(f"✅ Initialized {success_count}/{total_count} backends")
# Get collections from all backends via storage manager
status_text.update("📚 Loading collections from all backends...")
collections = await self.storage_manager.get_all_collections()
# Update metrics calculation for multi-backend support
self.active_backends = len(self.storage_manager.get_available_backends())
self.collections = collections
await self.update_collections_table()
self.update_metrics()
# Enhanced status reporting for multi-backend
backend_names = ", ".join(
backend.value for backend in self.storage_manager.get_available_backends()
)
status_text.update(f"✨ Ready - {len(collections)} collections from {backend_names}")
# Update connection status with multi-backend awareness
connection_status = self.query_one("#connection_status", StatusIndicator)
if collections and self.active_backends > 0:
connection_status.update_status(f"✓ {self.active_backends} Active")
else:
connection_status.update_status("No Data")
except Exception as e:
status_text.update(f"❌ Error: {e}")
self.notify(f"Failed to refresh: {e}", severity="error", markup=False)
finally:
self.is_loading = False
loading_indicator.display = False
async def update_collections_table(self) -> None:
"""Update the collections table with enhanced formatting."""
table = self.query_one("#collections_table", EnhancedDataTable)
table.clear(columns=True)
# Add enhanced columns with more metadata
table.add_columns("Collection", "Backend", "Documents", "Size", "Type", "Status", "Updated")
# Add rows with enhanced formatting
for collection in self.collections:
# Format size
size_str = f"{collection['size_mb']:.1f} MB"
if collection["size_mb"] > 1000:
size_str = f"{collection['size_mb'] / 1000:.1f} GB"
# Format document count
doc_count = f"{collection['count']:,}"
# Determine content type based on collection name or other metadata
content_type = "📄 Mixed"
if "web" in collection["name"].lower():
content_type = "🌐 Web"
elif "doc" in collection["name"].lower():
content_type = "📖 Docs"
elif "repo" in collection["name"].lower():
content_type = "📦 Code"
table.add_row(
collection["name"],
collection["backend"],
doc_count,
size_str,
content_type,
collection["status"],
collection["last_updated"],
)
if self.collections:
table.move_cursor(row=0)
self.get_selected_collection()
def update_search_controls(self, collection: CollectionInfo | None) -> None:
"""Enable or disable search controls based on backend support."""
try:
search_button = self.query_one("#search_btn", Button)
quick_search_button = self.query_one("#quick_search", Button)
except Exception:
return
is_weaviate = bool(collection and collection.get("type") == "weaviate")
search_button.disabled = not is_weaviate
quick_search_button.disabled = not is_weaviate
def get_selected_collection(self) -> CollectionInfo | None:
"""Get the currently selected collection."""
table = self.query_one("#collections_table", EnhancedDataTable)
try:
row_index = table.cursor_coordinate.row
except (AttributeError, IndexError):
self.selected_collection = None
self.update_search_controls(None)
return None
if 0 <= row_index < len(self.collections):
collection = self.collections[row_index]
self.selected_collection = collection
self.update_search_controls(collection)
return collection
self.selected_collection = None
self.update_search_controls(None)
return None
# Action methods
def action_refresh(self) -> None:
"""Refresh collections."""
self.refresh_collections()
def action_ingest(self) -> None:
"""Show enhanced ingestion dialog."""
if selected := self.get_selected_collection():
from .ingestion import IngestionScreen
self.app.push_screen(IngestionScreen(selected, self.storage_manager))
else:
self.notify("🔍 Please select a collection first", severity="warning")
def action_manage(self) -> None:
"""Manage documents in selected collection."""
if selected := self.get_selected_collection():
if storage_backend := self._get_storage_for_collection(selected):
from .documents import DocumentManagementScreen
self.app.push_screen(DocumentManagementScreen(selected, storage_backend))
else:
self.notify(
"🚧 No storage backend available for this collection", severity="warning"
)
else:
self.notify("🔍 Please select a collection first", severity="warning")
def _get_storage_for_collection(self, collection: CollectionInfo) -> BaseStorage | None:
"""Get the appropriate storage backend for a collection."""
collection_type = collection.get("type", "")
# Map collection types to storage backends (try direct instances first)
if collection_type == "weaviate" and self.weaviate:
return self.weaviate
elif collection_type == "openwebui" and self.openwebui:
return self.openwebui
elif collection_type == "r2r" and self.r2r:
return self.r2r
# Fall back to storage manager if direct instances not available
if collection_type == "weaviate":
return self.storage_manager.get_backend(StorageBackend.WEAVIATE)
elif collection_type == "openwebui":
return self.storage_manager.get_backend(StorageBackend.OPEN_WEBUI)
elif collection_type == "r2r":
return self.storage_manager.get_backend(StorageBackend.R2R)
# Fall back to checking available backends by backend name
backend_name = collection.get("backend", "")
if isinstance(backend_name, str):
if "weaviate" in backend_name.lower():
return self.weaviate or self.storage_manager.get_backend(StorageBackend.WEAVIATE)
elif "openwebui" in backend_name.lower():
return self.openwebui or self.storage_manager.get_backend(StorageBackend.OPEN_WEBUI)
elif "r2r" in backend_name.lower():
return self.r2r or self.storage_manager.get_backend(StorageBackend.R2R)
return None
def action_search(self) -> None:
"""Search in selected collection."""
if selected := self.get_selected_collection():
if selected["type"] != "weaviate":
self.notify(
"🔐 Search is currently available only for Weaviate collections",
severity="warning",
)
return
from .search import SearchScreen
self.app.push_screen(SearchScreen(selected, self.weaviate, self.openwebui))
else:
self.notify("🔍 Please select a collection first", severity="warning")
def action_delete(self) -> None:
"""Delete selected collection."""
if selected := self.get_selected_collection():
from .dialogs import ConfirmDeleteScreen
self.app.push_screen(ConfirmDeleteScreen(selected, self))
else:
self.notify("🔍 Please select a collection first", severity="warning")
def action_tab_dashboard(self) -> None:
"""Switch to dashboard tab."""
tabbed_content: TabbedContent = self.query_one(TabbedContent)
tabbed_content.active = "dashboard"
def action_tab_collections(self) -> None:
"""Switch to collections tab."""
tabbed_content: TabbedContent = self.query_one(TabbedContent)
tabbed_content.active = "collections"
def action_tab_analytics(self) -> None:
"""Switch to analytics tab."""
tabbed_content: TabbedContent = self.query_one(TabbedContent)
tabbed_content.active = "analytics"
def action_next_tab(self) -> None:
"""Switch to next tab."""
tabbed_content: TabbedContent = self.query_one(TabbedContent)
tab_ids = ["dashboard", "collections", "analytics"]
current = tabbed_content.active
try:
current_index = tab_ids.index(current)
next_index = (current_index + 1) % len(tab_ids)
tabbed_content.active = tab_ids[next_index]
except (ValueError, AttributeError):
tabbed_content.active = tab_ids[0]
def action_prev_tab(self) -> None:
"""Switch to previous tab."""
tabbed_content: TabbedContent = self.query_one(TabbedContent)
tab_ids = ["dashboard", "collections", "analytics"]
current = tabbed_content.active
try:
current_index = tab_ids.index(current)
prev_index = (current_index - 1) % len(tab_ids)
tabbed_content.active = tab_ids[prev_index]
except (ValueError, AttributeError):
tabbed_content.active = tab_ids[0]
def action_help(self) -> None:
"""Show help screen."""
from .help import HelpScreen
help_md = """
# 🚀 Modern Collection Management System
## Navigation
- **Tab** / **Shift+Tab**: Switch between tabs
- **Ctrl+1/2/3**: Direct tab access
- **Enter**: Activate selected item
- **Escape**: Go back/cancel
- **Arrow Keys**: Navigate within tables
- **Home/End**: Jump to first/last row
- **Page Up/Down**: Scroll by page
## Collections
- **R**: Refresh collections
- **I**: Start ingestion
- **M**: Manage documents
- **S**: Search collection
- **Ctrl+D**: Delete collection
## Table Navigation
- **Up/Down** or **J/K**: Navigate rows
- **Space**: Toggle selection
- **Ctrl+A**: Select all
- **Ctrl+Shift+A**: Clear selection
## General
- **Q** / **Ctrl+C**: Quit application
- **F1**: Show this help
Enjoy the enhanced interface! 🎉
"""
self.app.push_screen(HelpScreen(help_md))
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses with enhanced feedback."""
button_id = event.button.id
# Add visual feedback
event.button.add_class("pressed")
self.call_later(self.remove_pressed_class, event.button)
        if getattr(event.button, "disabled", False):
            # Only the search buttons are ever disabled here; scope the message to them
            if button_id in ("search_btn", "quick_search"):
                self.notify(
                    "🔐 Search is currently limited to Weaviate collections",
                    severity="warning",
                )
            return
if button_id in ["refresh_btn", "quick_refresh"]:
self.action_refresh()
elif button_id in ["ingest_btn", "quick_ingest"]:
self.action_ingest()
elif button_id == "manage_btn":
self.action_manage()
elif button_id == "delete_btn":
self.action_delete()
elif button_id in ["search_btn", "quick_search"]:
self.action_search()
elif button_id == "quick_settings":
self.notify("⚙️ Settings panel coming soon!", severity="information")
def remove_pressed_class(self, button: Button) -> None:
"""Remove pressed visual feedback class."""
button.remove_class("pressed")
</file>
<file path="ingest_pipeline/cli/tui/screens/dialogs.py">
"""Dialog screens for confirmations and user interactions."""
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Horizontal
from textual.screen import ModalScreen, Screen
from textual.widgets import Button, Footer, Header, LoadingIndicator, RichLog, Static
from typing_extensions import override
from ..models import CollectionInfo
if TYPE_CHECKING:
from ..app import CollectionManagementApp
from .dashboard import CollectionOverviewScreen
from .documents import DocumentManagementScreen
class ConfirmDeleteScreen(Screen[None]):
"""Screen for confirming collection deletion."""
collection: CollectionInfo
parent_screen: CollectionOverviewScreen
@property
def app(self) -> CollectionManagementApp: # type: ignore[override]
"""Return the typed app instance."""
return super().app # type: ignore[return-value]
BINDINGS = [
Binding("escape", "app.pop_screen", "Cancel"),
Binding("y", "confirm_delete", "Yes"),
Binding("n", "app.pop_screen", "No"),
Binding("enter", "confirm_delete", "Confirm"),
]
def __init__(self, collection: CollectionInfo, parent_screen: CollectionOverviewScreen):
super().__init__()
self.collection = collection
self.parent_screen = parent_screen
@override
def compose(self) -> ComposeResult:
yield Header()
yield Container(
Static("⚠️ Confirm Deletion", classes="title warning"),
Static(f"Are you sure you want to delete collection '{self.collection['name']}'?"),
Static(f"Backend: {self.collection['backend']}"),
Static(f"Documents: {self.collection['count']:,}"),
Static("This action cannot be undone!", classes="warning"),
Static("Press Y to confirm, N or Escape to cancel", classes="subtitle"),
Horizontal(
Button("✅ Yes, Delete (Y)", id="yes_btn", variant="error"),
Button("❌ Cancel (N)", id="no_btn", variant="default"),
classes="action_buttons",
),
classes="main_container center",
)
yield Footer()
def on_mount(self) -> None:
"""Initialize the screen with focus on cancel button for safety."""
self.query_one("#no_btn").focus()
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "yes_btn":
self.action_confirm_delete()
elif event.button.id == "no_btn":
self.app.pop_screen()
def action_confirm_delete(self) -> None:
"""Confirm deletion."""
self.run_worker(self.delete_collection())
async def delete_collection(self) -> None:
"""Delete the collection."""
try:
if self.collection["type"] == "weaviate" and self.parent_screen.weaviate:
# Delete Weaviate collection
if (
self.parent_screen.weaviate.client
and self.parent_screen.weaviate.client.collections
):
self.parent_screen.weaviate.client.collections.delete(self.collection["name"])
self.notify(
f"Deleted Weaviate collection: {self.collection['name']}",
severity="information",
)
else:
# Use the dashboard's method to get the appropriate storage backend
storage_backend = self.parent_screen._get_storage_for_collection(self.collection)
if not storage_backend:
self.notify(
f"❌ No storage backend available for {self.collection['type']} collection: {self.collection['name']}",
severity="error",
)
self.app.pop_screen()
return
# Check if the storage backend supports collection deletion
if not hasattr(storage_backend, "delete_collection"):
self.notify(
f"❌ Collection deletion not supported for {self.collection['type']} backend",
severity="error",
)
self.app.pop_screen()
return
# Delete the collection using the appropriate backend
# Ensure we use the exact collection name, not any default from storage config
collection_name = str(self.collection["name"])
collection_type = str(self.collection["type"])
self.notify(
f"Deleting {collection_type} collection: {collection_name}...",
severity="information",
)
                # Use the standard delete_collection method for all backends
                # (support was already verified above)
                success = await storage_backend.delete_collection(collection_name)
if success:
self.notify(
f"✅ Successfully deleted {self.collection['type']} collection: {self.collection['name']}",
severity="information",
timeout=3.0,
)
else:
self.notify(
f"❌ Failed to delete {self.collection['type']} collection: {self.collection['name']}",
severity="error",
)
# Don't refresh if deletion failed
self.app.pop_screen()
return
# Refresh parent screen after a short delay to ensure deletion is processed
            self.set_timer(0.5, self._refresh_parent_collections)  # 500ms delay
self.app.pop_screen()
except Exception as e:
self.notify(f"Failed to delete collection: {e}", severity="error", markup=False)
def _refresh_parent_collections(self) -> None:
"""Helper method to refresh parent collections."""
self.parent_screen.refresh_collections()
class ConfirmDocumentDeleteScreen(Screen[None]):
"""Screen for confirming document deletion."""
doc_ids: list[str]
collection: CollectionInfo
parent_screen: DocumentManagementScreen
@property
def app(self) -> CollectionManagementApp: # type: ignore[override]
"""Return the typed app instance."""
return super().app # type: ignore[return-value]
BINDINGS = [
Binding("escape", "app.pop_screen", "Cancel"),
Binding("y", "confirm_delete", "Yes"),
Binding("n", "app.pop_screen", "No"),
Binding("enter", "confirm_delete", "Confirm"),
]
def __init__(
self,
doc_ids: list[str],
collection: CollectionInfo,
parent_screen: DocumentManagementScreen,
):
super().__init__()
self.doc_ids = doc_ids
self.collection = collection
self.parent_screen = parent_screen
@override
def compose(self) -> ComposeResult:
yield Header()
yield Container(
Static("⚠️ Confirm Document Deletion", classes="title warning"),
Static(
f"Are you sure you want to delete {len(self.doc_ids)} documents from '{self.collection['name']}'?"
),
Static("This action cannot be undone!", classes="warning"),
Static("Press Y to confirm, N or Escape to cancel", classes="subtitle"),
Horizontal(
Button("✅ Yes, Delete (Y)", id="yes_btn", variant="error"),
Button("❌ Cancel (N)", id="no_btn", variant="default"),
classes="action_buttons",
),
LoadingIndicator(id="loading"),
classes="main_container center",
)
yield Footer()
def on_mount(self) -> None:
"""Initialize the screen with focus on cancel button for safety."""
self.query_one("#loading").display = False
self.query_one("#no_btn").focus()
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "yes_btn":
self.action_confirm_delete()
elif event.button.id == "no_btn":
self.app.pop_screen()
def action_confirm_delete(self) -> None:
"""Confirm deletion."""
self.run_worker(self.delete_documents())
async def delete_documents(self) -> None:
"""Delete the selected documents."""
loading = self.query_one("#loading")
loading.display = True
try:
results: dict[str, bool] = {}
            if hasattr(self.parent_screen, "storage") and self.parent_screen.storage:
                # Delete documents via storage; the backend must expose
                # delete_documents (e.g. the Weaviate storage does)
                storage = self.parent_screen.storage
                if hasattr(storage, "delete_documents"):
                    results = await storage.delete_documents(
                        self.doc_ids,
                        collection_name=self.collection["name"],
                    )
                else:
                    self.notify(
                        "❌ Backend does not support document deletion", severity="error"
                    )
# Count successful deletions
successful = sum(bool(success) for success in results.values())
failed = len(results) - successful
if successful > 0:
self.notify(f"Deleted {successful} documents", severity="information")
if failed > 0:
self.notify(f"Failed to delete {failed} documents", severity="error")
# Clear selection and refresh parent screen
self.parent_screen.selected_docs.clear()
await self.parent_screen.load_documents()
self.app.pop_screen()
except Exception as e:
self.notify(f"Failed to delete documents: {e}", severity="error", markup=False)
finally:
loading.display = False
class LogViewerScreen(ModalScreen[None]):
"""Display live log output without disrupting the TUI."""
_log_widget: RichLog | None
_log_file: Path | None
@property
def app(self) -> CollectionManagementApp: # type: ignore[override]
"""Return the typed app instance."""
return super().app # type: ignore[return-value]
BINDINGS = [
Binding("escape", "close", "Close"),
Binding("ctrl+l", "close", "Close"),
Binding("s", "show_path", "Log File"),
]
def __init__(self) -> None:
super().__init__()
self._log_widget = None
self._log_file = None
@override
def compose(self) -> ComposeResult:
yield Header(show_clock=True)
yield Container(
Static("📜 Live Application Logs", classes="title"),
Static(
"Logs update in real time. Press S to reveal the log file path.", classes="subtitle"
),
RichLog(id="log_stream", classes="log-stream", wrap=True, highlight=False),
Static("", id="log_file_path", classes="subtitle"),
classes="main_container log-viewer-container",
)
yield Footer()
def on_mount(self) -> None:
"""Attach this viewer to the parent application once mounted."""
self._log_widget = self.query_one(RichLog)
if hasattr(self.app, "attach_log_viewer"):
self.app.attach_log_viewer(self) # type: ignore[arg-type]
def on_unmount(self) -> None:
"""Detach from the parent application when closed."""
if hasattr(self.app, "detach_log_viewer"):
self.app.detach_log_viewer(self) # type: ignore[arg-type]
def _get_log_widget(self) -> RichLog:
if self._log_widget is None:
self._log_widget = self.query_one(RichLog)
if self._log_widget is None:
raise RuntimeError("RichLog widget not found")
return self._log_widget
def replace_logs(self, lines: list[str]) -> None:
"""Replace rendered logs with the provided history."""
log_widget = self._get_log_widget()
log_widget.clear()
for line in lines:
log_widget.write(line)
log_widget.scroll_end(animate=False)
def append_logs(self, lines: list[str]) -> None:
"""Append new log lines to the viewer."""
log_widget = self._get_log_widget()
for line in lines:
log_widget.write(line)
log_widget.scroll_end(animate=False)
def update_log_file(self, log_file: Path | None) -> None:
"""Update the displayed log file path."""
self._log_file = log_file
label = self.query_one("#log_file_path", Static)
if log_file is None:
label.update("Logs are not currently being persisted to disk.")
else:
label.update(f"Log file: {log_file}")
def action_close(self) -> None:
"""Close the log viewer."""
self.app.pop_screen()
def action_show_path(self) -> None:
"""Reveal the log file location in a notification."""
if self._log_file is None:
self.notify("File logging is disabled for this session.", severity="warning")
else:
self.notify(
f"Log file available at: {self._log_file}", severity="information", markup=False
)
</file>
<file path="ingest_pipeline/config/settings.py">
"""Application settings and configuration."""
from functools import lru_cache
from typing import Annotated, ClassVar, Final, Literal
from prefect.variables import Variable
from pydantic import Field, HttpUrl, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
"""Application settings."""
model_config: ClassVar[SettingsConfigDict] = SettingsConfigDict(
env_file=".env",
env_file_encoding="utf-8",
case_sensitive=False,
extra="ignore", # Ignore extra environment variables
)
# API Keys
firecrawl_api_key: str | None = None
llm_api_key: str | None = None
openai_api_key: str | None = None
openwebui_api_key: str | None = None
weaviate_api_key: str | None = None
r2r_api_key: str | None = None
# Endpoints
llm_endpoint: HttpUrl = HttpUrl("http://llm.lab")
weaviate_endpoint: HttpUrl = HttpUrl("http://weaviate.yo")
openwebui_endpoint: HttpUrl = HttpUrl("http://chat.lab") # This will be the API URL
firecrawl_endpoint: HttpUrl = HttpUrl("http://crawl.lab:30002")
r2r_endpoint: HttpUrl | None = Field(default=None, alias="r2r_api_url")
# Model Configuration
embedding_model: str = "ollama/bge-m3:latest"
metadata_model: str = "fireworks/glm-4p5-air"
embedding_dimension: int = 1024
# Ingestion Settings
default_batch_size: Annotated[int, Field(gt=0, le=500)] = 50
max_file_size: int = 1_000_000
max_crawl_depth: Annotated[int, Field(ge=1, le=20)] = 5
max_crawl_pages: Annotated[int, Field(ge=1, le=1000)] = 100
# Storage Settings
default_storage_backend: Literal["weaviate", "open_webui", "r2r"] = "weaviate"
default_collection_prefix: str = "docs"
# Prefect Settings
prefect_api_url: HttpUrl | None = None
prefect_api_key: str | None = None
prefect_work_pool: str = "default"
# Scheduling Defaults
default_schedule_interval: Annotated[int, Field(ge=1, le=10080)] = 60 # Max 1 week
# Performance Settings
max_concurrent_tasks: Annotated[int, Field(ge=1, le=20)] = 5
request_timeout: Annotated[int, Field(ge=10, le=300)] = 60
# Logging
log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"] = "INFO"
def get_storage_endpoint(self, backend: str) -> HttpUrl:
"""
Get endpoint for storage backend.
Args:
backend: Storage backend name
Returns:
Endpoint URL
Raises:
ValueError: If backend is unknown or R2R endpoint not configured
"""
endpoints = {
"weaviate": self.weaviate_endpoint,
"open_webui": self.openwebui_endpoint,
}
if backend in endpoints:
return endpoints[backend]
elif backend == "r2r":
if not self.r2r_endpoint:
raise ValueError(
"R2R_API_URL must be set in environment variables. "
"This should have been caught during settings validation."
)
return self.r2r_endpoint
else:
raise ValueError(f"Unknown backend: {backend}. Supported: weaviate, open_webui, r2r")
def get_api_key(self, service: str) -> str | None:
"""
Get API key for service.
Args:
service: Service name
Returns:
API key or None
"""
service_map: Final[dict[str, str | None]] = {
"firecrawl": self.firecrawl_api_key,
"openwebui": self.openwebui_api_key,
"weaviate": self.weaviate_api_key,
"r2r": self.r2r_api_key,
"llm": self.get_llm_api_key(),
"openai": self.openai_api_key,
}
return service_map.get(service)
def get_llm_api_key(self) -> str | None:
"""Get API key for LLM services with OpenAI fallback."""
        return self.llm_api_key or self.openai_api_key
@model_validator(mode="after")
def validate_backend_configuration(self) -> "Settings":
"""Validate that required configuration is present for the default backend."""
backend = self.default_storage_backend
# Validate R2R backend configuration
if backend == "r2r" and not self.r2r_endpoint:
raise ValueError(
"R2R_API_URL must be set in environment variables when using R2R as default backend"
)
# Validate API key requirements (optional warning for missing keys)
required_keys = {
"weaviate": ("WEAVIATE_API_KEY", self.weaviate_api_key),
"open_webui": ("OPENWEBUI_API_KEY", self.openwebui_api_key),
"r2r": ("R2R_API_KEY", self.r2r_api_key),
}
if backend in required_keys:
key_name, key_value = required_keys[backend]
if not key_value:
import warnings
warnings.warn(
f"{key_name} not set - authentication may fail for {backend} backend",
UserWarning,
stacklevel=2,
)
return self
@lru_cache
def get_settings() -> Settings:
"""
Get cached settings instance.
Returns:
Settings instance
"""
return Settings()
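# Illustrative .env sketch (assumption: example values only; field names above map
# case-insensitively to environment variables, and r2r_endpoint is aliased to
# R2R_API_URL):
#
#     FIRECRAWL_API_KEY=fc-...
#     WEAVIATE_ENDPOINT=http://weaviate.yo
#     R2R_API_URL=http://r2r.example:7272
#     DEFAULT_STORAGE_BACKEND=weaviate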
class PrefectVariableConfig:
"""Helper class for managing Prefect variables with fallbacks to settings."""
def __init__(self) -> None:
self._settings: Settings = get_settings()
self._variable_names: list[str] = [
"default_batch_size",
"max_file_size",
"max_crawl_depth",
"max_crawl_pages",
"default_storage_backend",
"default_collection_prefix",
"max_concurrent_tasks",
"request_timeout",
"default_schedule_interval",
]
    def _get_fallback_value(self, name: str, default_value: object = None) -> object:
        """Return the explicit default if provided, otherwise the settings value."""
        if default_value is not None:
            return default_value
        return getattr(self._settings, name, None)
def get_with_fallback(
self, name: str, default_value: str | int | float | None = None
) -> str | int | float | None:
"""Get variable value with fallback synchronously."""
fallback = self._get_fallback_value(name, default_value)
# Ensure fallback is a type that Variable expects
variable_fallback = str(fallback) if fallback is not None else None
try:
result = Variable.get(name, default=variable_fallback)
# Variable can return various types, convert to our expected types
if isinstance(result, (str, int, float)):
return result
elif result is None:
return None
else:
# Convert other types to string
return str(result)
except Exception:
# Return fallback with proper type
if isinstance(fallback, (str, int, float)) or fallback is None:
return fallback
return str(fallback) if fallback is not None else None
async def get_with_fallback_async(
self, name: str, default_value: str | int | float | None = None
) -> str | int | float | None:
"""Get variable value with fallback asynchronously."""
fallback = self._get_fallback_value(name, default_value)
variable_fallback = str(fallback) if fallback is not None else None
try:
result = await Variable.aget(name, default=variable_fallback)
# Variable can return various types, convert to our expected types
if isinstance(result, (str, int, float)):
return result
elif result is None:
return None
else:
# Convert other types to string
return str(result)
except Exception:
# Return fallback with proper type
if isinstance(fallback, (str, int, float)) or fallback is None:
return fallback
return str(fallback) if fallback is not None else None
def get_ingestion_config(self) -> dict[str, str | int | float | None]:
"""Get all ingestion-related configuration variables synchronously."""
return {name: self.get_with_fallback(name) for name in self._variable_names}
async def get_ingestion_config_async(self) -> dict[str, str | int | float | None]:
"""Get all ingestion-related configuration variables asynchronously."""
result: dict[str, str | int | float | None] = {}
for name in self._variable_names:
result[name] = await self.get_with_fallback_async(name)
return result
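# Illustrative usage sketch: resolution order is Prefect Variable first, then the
# explicit default, then the Settings field of the same name.
#
#     cfg = PrefectVariableConfig()
#     batch = cfg.get_with_fallback("default_batch_size")    # Variable or Settings
#     limit = cfg.get_with_fallback("max_crawl_pages", 250)  # Variable or 250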
@lru_cache
def get_prefect_config() -> PrefectVariableConfig:
"""Get cached Prefect variable configuration helper."""
return PrefectVariableConfig()
</file>
<file path="ingest_pipeline/core/models.py">
"""Core data models with strict typing."""
from datetime import UTC, datetime
from enum import Enum
from typing import Annotated, ClassVar, TypedDict
from uuid import UUID, uuid4
from prefect.blocks.core import Block
from pydantic import BaseModel, Field, HttpUrl, SecretStr
from ..config import get_settings
def _default_embedding_model() -> str:
return str(get_settings().embedding_model)
def _default_embedding_endpoint() -> HttpUrl:
endpoint = get_settings().llm_endpoint
return endpoint if isinstance(endpoint, HttpUrl) else HttpUrl(str(endpoint))
def _default_embedding_dimension() -> int:
return int(get_settings().embedding_dimension)
def _default_batch_size() -> int:
return int(get_settings().default_batch_size)
def _default_collection_name() -> str:
return str(get_settings().default_collection_prefix)
def _default_max_crawl_depth() -> int:
return int(get_settings().max_crawl_depth)
def _default_max_crawl_pages() -> int:
return int(get_settings().max_crawl_pages)
def _default_max_file_size() -> int:
return int(get_settings().max_file_size)
class IngestionStatus(str, Enum):
"""Status of an ingestion job."""
PENDING = "pending"
IN_PROGRESS = "in_progress"
COMPLETED = "completed"
PARTIAL = "partial" # Some documents succeeded, some failed
FAILED = "failed"
CANCELLED = "cancelled"
class StorageBackend(str, Enum):
"""Available storage backends."""
WEAVIATE = "weaviate"
OPEN_WEBUI = "open_webui"
R2R = "r2r"
class IngestionSource(str, Enum):
"""Types of ingestion sources."""
WEB = "web"
REPOSITORY = "repository"
DOCUMENTATION = "documentation"
class VectorConfig(BaseModel):
"""Configuration for vectorization."""
model: str = Field(default_factory=_default_embedding_model)
embedding_endpoint: HttpUrl = Field(default_factory=_default_embedding_endpoint)
dimension: int = Field(default_factory=_default_embedding_dimension)
batch_size: Annotated[int, Field(gt=0, le=1000)] = Field(default_factory=_default_batch_size)
class StorageConfig(Block):
"""Configuration for storage backend."""
_block_type_name: ClassVar[str | None] = "Storage Configuration"
_block_type_slug: ClassVar[str | None] = "storage-config"
_description: ClassVar[str | None] = (
"Configures storage backend connections and settings for document ingestion"
)
backend: StorageBackend
endpoint: HttpUrl
api_key: SecretStr | None = Field(default=None)
collection_name: str = Field(default_factory=_default_collection_name)
batch_size: Annotated[int, Field(gt=0, le=1000)] = Field(default_factory=_default_batch_size)
grpc_port: int | None = Field(default=None, description="gRPC port for Weaviate connections")
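# Illustrative construction sketch (assumption: endpoint and key are placeholders;
# as a Prefect Block this can also be persisted and loaded via Block storage):
#
#     config = StorageConfig(
#         backend=StorageBackend.WEAVIATE,
#         endpoint=HttpUrl("http://weaviate.yo"),
#         api_key=SecretStr("..."),
#         collection_name="docs",
#     )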
class FirecrawlConfig(Block):
"""Configuration for Firecrawl ingestion (operational parameters only)."""
_block_type_name: ClassVar[str | None] = "Firecrawl Configuration"
_block_type_slug: ClassVar[str | None] = "firecrawl-config"
_description: ClassVar[str | None] = "Configures Firecrawl web scraping and crawling parameters"
formats: list[str] = Field(default_factory=lambda: ["markdown", "html"])
max_depth: Annotated[int, Field(ge=1, le=20)] = Field(default_factory=_default_max_crawl_depth)
limit: Annotated[int, Field(ge=1, le=1000)] = Field(default_factory=_default_max_crawl_pages)
only_main_content: bool = Field(default=True)
include_subdomains: bool = Field(default=False)
class RepomixConfig(Block):
"""Configuration for Repomix ingestion."""
_block_type_name: ClassVar[str | None] = "Repomix Configuration"
_block_type_slug: ClassVar[str | None] = "repomix-config"
_description: ClassVar[str | None] = (
"Configures repository ingestion patterns and file processing settings"
)
include_patterns: list[str] = Field(
default_factory=lambda: ["*.py", "*.js", "*.ts", "*.md", "*.yaml", "*.json"]
)
exclude_patterns: list[str] = Field(
default_factory=lambda: ["**/node_modules/**", "**/__pycache__/**", "**/.git/**"]
)
max_file_size: int = Field(default_factory=_default_max_file_size) # 1MB
respect_gitignore: bool = Field(default=True)
class R2RConfig(Block):
"""Configuration for R2R ingestion."""
_block_type_name: ClassVar[str | None] = "R2R Configuration"
_block_type_slug: ClassVar[str | None] = "r2r-config"
_description: ClassVar[str | None] = (
"Configures R2R-specific ingestion settings including chunking and graph enrichment"
)
chunk_size: Annotated[int, Field(ge=100, le=8192)] = 1000
chunk_overlap: Annotated[int, Field(ge=0, le=1000)] = 200
enable_graph_enrichment: bool = Field(default=False)
graph_creation_settings: dict[str, object] | None = Field(default=None)
class DocumentMetadataRequired(TypedDict):
"""Required metadata fields for a document."""
source_url: str
timestamp: datetime
content_type: str
word_count: int
char_count: int
class DocumentMetadata(DocumentMetadataRequired, total=False):
"""Rich metadata for a document with R2R-compatible fields."""
# Basic optional fields
title: str | None
description: str | None
# Content categorization
tags: list[str]
category: str
section: str
language: str
# Authorship and source info
author: str
domain: str
site_name: str
# Document structure
heading_hierarchy: list[str]
section_depth: int
has_code_blocks: bool
has_images: bool
has_links: bool
# Processing metadata
extraction_method: str
crawl_depth: int
last_modified: datetime | None
# Content quality indicators
readability_score: float | None
completeness_score: float | None
# Repository-specific fields
file_path: str | None
repository_name: str | None
branch_name: str | None
commit_hash: str | None
programming_language: str | None
# Custom business metadata
importance_score: float | None
review_status: str | None
assigned_team: str | None
class Document(BaseModel):
"""Represents a single document."""
id: UUID = Field(default_factory=uuid4)
content: str
metadata: DocumentMetadata
vector: list[float] | None = Field(default=None)
score: float | None = Field(default=None)
source: IngestionSource
collection: str = Field(default_factory=_default_collection_name)
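# Illustrative construction sketch: a minimal Document carrying only the required
# DocumentMetadata keys (values are placeholders):
#
#     doc = Document(
#         content="# Title\n\nBody text...",
#         metadata={
#             "source_url": "https://example.com/page",
#             "timestamp": datetime.now(UTC),
#             "content_type": "text/markdown",
#             "word_count": 3,
#             "char_count": 22,
#         },
#         source=IngestionSource.WEB,
#     )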
class IngestionJob(BaseModel):
"""Represents an ingestion job."""
id: UUID = Field(default_factory=uuid4)
source_type: IngestionSource
source_url: HttpUrl | str
status: IngestionStatus = Field(default=IngestionStatus.PENDING)
created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
updated_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
completed_at: datetime | None = Field(default=None)
error_message: str | None = Field(default=None)
document_count: int = Field(default=0)
storage_backend: StorageBackend
class IngestionResult(BaseModel):
"""Result of an ingestion operation."""
job_id: UUID
status: IngestionStatus
documents_processed: int
documents_failed: int
duration_seconds: float
error_messages: list[str] = Field(default_factory=list)
</file>
<file path="ingest_pipeline/storage/r2r/storage.py">
"""R2R storage implementation using the official R2R SDK."""
from __future__ import annotations
import asyncio
import contextlib
import logging
from collections.abc import AsyncGenerator, Iterable, Mapping, Sequence
from datetime import UTC, datetime
from typing import Final, Self, TypeVar, cast
from uuid import UUID, uuid4
# Direct imports for runtime and type checking
from httpx import AsyncClient, HTTPStatusError # type: ignore
from r2r import R2RAsyncClient, R2RException # type: ignore
from typing_extensions import override
from ...core.exceptions import StorageError
from ...core.models import Document, DocumentMetadata, IngestionSource, StorageConfig
from ..base import BaseStorage
from ..types import DocumentInfo
LOGGER: Final[logging.Logger] = logging.getLogger(__name__)
T = TypeVar("T")
def _as_mapping(value: object) -> dict[str, object]:
if isinstance(value, Mapping):
return dict(cast(Mapping[str, object], value))
if hasattr(value, "__dict__"):
return dict(cast(Mapping[str, object], value.__dict__))
return {}
def _as_sequence(value: object) -> tuple[object, ...]:
    """Convert value to a tuple of objects, treating str/bytes as non-sequences."""
    if isinstance(value, (str, bytes)):
        return ()
    if isinstance(value, Sequence):
        return tuple(value)
    return tuple(value) if isinstance(value, Iterable) else ()
def _extract_id(source: object, fallback: str) -> str:
mapping = _as_mapping(source)
identifier = mapping.get("id") if mapping else None
if identifier is None and hasattr(source, "id"):
identifier = getattr(source, "id", None)
return fallback if identifier is None else str(identifier)
def _as_datetime(value: object) -> datetime:
if isinstance(value, datetime):
return value
if isinstance(value, str):
with contextlib.suppress(ValueError):
return datetime.fromisoformat(value)
return datetime.now(UTC)
def _as_int(value: object, default: int = 0) -> int:
if isinstance(value, bool):
return int(value)
if isinstance(value, int):
return value
if isinstance(value, float):
return int(value)
if isinstance(value, str):
try:
return int(float(value)) if "." in value else int(value)
except ValueError:
return default
return default
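# Behavior examples for the coercion helpers above (doctest-style sketches, not
# executed):
#     _as_datetime("2024-01-02T03:04:05")  -> datetime(2024, 1, 2, 3, 4, 5)
#     _as_datetime(object())               -> datetime.now(UTC) fallback
#     _as_int(True)                        -> 1
#     _as_int("3.5")                       -> 3
#     _as_int("n/a", default=7)            -> 7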
class R2RStorage(BaseStorage):
"""R2R storage implementation using the official R2R SDK."""
def __init__(self, config: StorageConfig) -> None:
"""Initialize R2R storage with SDK client."""
super().__init__(config)
self.endpoint: str = str(config.endpoint).rstrip("/")
self.client: R2RAsyncClient = R2RAsyncClient(self.endpoint)
self.default_collection_id: str | None = None
def _get_http_client_headers(self) -> dict[str, str]:
"""Get consistent HTTP headers for direct API calls."""
headers = {"Content-Type": "application/json"}
# Add authentication headers if available
# Note: R2R SDK may handle auth internally, so we extract it if possible
if hasattr(self.client, "_get_headers"):
with contextlib.suppress(Exception):
sdk_headers = self.client._get_headers() # type: ignore[attr-defined]
if isinstance(sdk_headers, dict):
headers |= sdk_headers
return headers
def _create_http_client(self) -> AsyncClient:
"""Create a properly configured HTTP client for direct API calls."""
headers = self._get_http_client_headers()
return AsyncClient(headers=headers, timeout=30.0)
@override
async def initialize(self) -> None:
"""Initialize R2R connection and ensure default collection exists."""
try:
# Ensure we have an event loop
try:
_ = asyncio.get_running_loop()
except RuntimeError:
                # No event loop running; this should not happen in an async
                # context, but be defensive
                LOGGER.warning("No event loop found during R2R initialization")
# Test connection using direct HTTP call to v3 API
endpoint = self.endpoint
client = self._create_http_client()
try:
response = await client.get(f"{endpoint}/v3/collections")
response.raise_for_status()
finally:
await client.aclose()
_ = await self._ensure_collection(self.config.collection_name)
except Exception as e:
raise StorageError(f"Failed to initialize R2R: {e}") from e
async def _ensure_collection(self, collection_name: str) -> str:
"""Get or create collection by name."""
endpoint = self.endpoint
client = self._create_http_client()
try:
# List collections and find by name
response = await client.get(f"{endpoint}/v3/collections")
response.raise_for_status()
data: dict[str, object] = response.json()
results = cast(list[dict[str, object]], data.get("results", []))
for collection in results:
if collection.get("name") == collection_name:
collection_id_raw = collection.get("id")
if collection_id_raw is None:
raise StorageError(f"Collection '{collection_name}' exists but has no ID")
collection_id = str(collection_id_raw)
if collection_name == self.config.collection_name:
self.default_collection_id = collection_id
return collection_id
# Create if not found
create_response = await client.post(
f"{endpoint}/v3/collections",
json={
"name": collection_name,
"description": f"Auto-created collection: {collection_name}",
},
)
create_response.raise_for_status()
created: dict[str, object] = create_response.json()
created_results = cast(dict[str, object], created.get("results", {}))
collection_id_raw = created_results.get("id")
if collection_id_raw is None:
raise StorageError("Failed to get collection ID from creation response")
collection_id = str(collection_id_raw)
if collection_name == self.config.collection_name:
self.default_collection_id = collection_id
return collection_id
except Exception as e:
raise StorageError(f"Failed to ensure collection '{collection_name}': {e}") from e
finally:
await client.aclose()
# This should never be reached, but satisfies static analyzer
raise StorageError(f"Unexpected code path in _ensure_collection for '{collection_name}'")
@override
async def store(self, document: Document, *, collection_name: str | None = None) -> str:
"""Store a single document."""
return (await self.store_batch([document], collection_name=collection_name))[0]
@override
async def store_batch(
self, documents: list[Document], *, collection_name: str | None = None
) -> list[str]:
"""Store multiple documents efficiently with connection reuse."""
collection_id = await self._resolve_collection_id(collection_name)
LOGGER.info(
"Using collection ID: %s for collection: %s",
collection_id,
collection_name or self.config.collection_name,
)
# Filter valid documents upfront
valid_documents = [doc for doc in documents if self._is_document_valid(doc)]
if not valid_documents:
return []
stored_ids: list[str] = []
# Use a single HTTP client for all requests
http_client = AsyncClient()
async with http_client: # type: ignore
            # Process documents with controlled concurrency (asyncio is imported
            # at module top)
            semaphore = asyncio.Semaphore(5)  # Limit concurrent uploads
async def store_single_with_client(document: Document) -> str | None:
async with semaphore:
return await self._store_single_document_with_client(
document, collection_id, http_client
)
# Execute all uploads concurrently
results = await asyncio.gather(
*[store_single_with_client(doc) for doc in valid_documents], return_exceptions=True
)
# Collect successful IDs
for result in results:
if isinstance(result, str):
stored_ids.append(result)
elif isinstance(result, Exception):
LOGGER.error("Document upload failed: %s", result)
return stored_ids
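    # Illustrative usage sketch (async context assumed):
    #     ids = await storage.store_batch(documents, collection_name="docs")
    #     # uploads run concurrently, capped at 5 in flight by the semaphore above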
async def _resolve_collection_id(self, collection_name: str | None) -> str:
"""Resolve collection ID from name or use default."""
if collection_name:
return await self._ensure_collection(collection_name)
if self.default_collection_id:
return self.default_collection_id
collection_id = await self._ensure_collection(self.config.collection_name)
self.default_collection_id = collection_id
return collection_id
def _is_document_valid(self, document: Document) -> bool:
"""Validate document content and size."""
requested_id = str(document.id)
if not document.content or not document.content.strip():
LOGGER.warning("Skipping document %s: empty content", requested_id)
return False
if len(document.content) > 1_000_000: # 1MB limit
LOGGER.warning(
"Skipping document %s: content too large (%d chars)",
requested_id,
len(document.content),
)
return False
return True
async def _store_single_document(self, document: Document, collection_id: str) -> str | None:
"""Store a single document with retry logic."""
http_client = AsyncClient()
async with http_client: # type: ignore
return await self._store_single_document_with_client(
document, collection_id, http_client
)
async def _store_single_document_with_client(
self, document: Document, collection_id: str, http_client: AsyncClient
) -> str | None:
"""Store a single document with retry logic using provided HTTP client."""
requested_id = str(document.id)
LOGGER.debug("Creating document with ID: %s", requested_id)
max_retries = 3
retry_delay = 1.0
for attempt in range(max_retries):
try:
doc_response = await self._attempt_document_creation_with_client(
document, collection_id, http_client
)
if doc_response:
return self._process_document_response(
doc_response, requested_id, collection_id
)
except (TimeoutError, OSError) as e:
if not await self._should_retry_timeout(
e, attempt, max_retries, requested_id, retry_delay
):
break
retry_delay *= 2
except HTTPStatusError as e:
if not await self._should_retry_http_error(
e, attempt, max_retries, requested_id, retry_delay
):
break
retry_delay *= 2
except Exception as exc:
self._log_document_error(document.id, exc)
break
return None
async def _attempt_document_creation(
self, document: Document, collection_id: str
) -> dict[str, object] | None:
"""Attempt to create a document via HTTP API."""
http_client = AsyncClient()
async with http_client: # type: ignore
return await self._attempt_document_creation_with_client(
document, collection_id, http_client
)
async def _attempt_document_creation_with_client(
self, document: Document, collection_id: str, http_client: AsyncClient
) -> dict[str, object] | None:
"""Attempt to create a document via HTTP API using provided client."""
import json
requested_id = str(document.id)
metadata = self._build_metadata(document)
LOGGER.debug("Built metadata for document %s: %s", requested_id, metadata)
files = {
"raw_text": (None, document.content),
"metadata": (None, json.dumps(metadata)),
"id": (None, requested_id),
"ingestion_mode": (None, "hi-res"),
}
if collection_id:
files["collection_ids"] = (None, json.dumps([collection_id]))
LOGGER.debug(
"Creating document %s with collection_ids: [%s]", requested_id, collection_id
)
LOGGER.debug("Sending to R2R - files keys: %s", list(files.keys()))
LOGGER.debug("Metadata JSON: %s", files["metadata"][1])
response = await http_client.post(f"{self.endpoint}/v3/documents", files=files) # type: ignore[call-arg]
if response.status_code == 422:
self._handle_validation_error(response, requested_id, metadata)
return None
response.raise_for_status()
return response.json()
def _handle_validation_error(
self, response: object, requested_id: str, metadata: dict[str, object]
) -> None:
"""Handle validation errors from R2R API."""
try:
            json_method = getattr(response, "json", None)
            error_detail = json_method() if callable(json_method) else {}
LOGGER.error("R2R validation error for document %s: %s", requested_id, error_detail)
LOGGER.error("Document metadata sent: %s", metadata)
LOGGER.error("Response status: %s", getattr(response, "status_code", "unknown"))
LOGGER.error("Response headers: %s", dict(getattr(response, "headers", {})))
except Exception:
LOGGER.error(
"R2R validation error for document %s: %s",
requested_id,
getattr(response, "text", "unknown error"),
)
LOGGER.error("Document metadata sent: %s", metadata)
def _process_document_response(
self, doc_response: dict[str, object], requested_id: str, collection_id: str
) -> str:
"""Process successful document creation response."""
response_payload = doc_response.get("results", doc_response)
doc_id = _extract_id(response_payload, requested_id)
LOGGER.info("R2R returned document ID: %s", doc_id)
if doc_id != requested_id:
LOGGER.warning("Requested ID %s but got %s", requested_id, doc_id)
if collection_id:
LOGGER.info(
"Document %s should be assigned to collection %s via creation API",
doc_id,
collection_id,
)
return doc_id
async def _should_retry_timeout(
self,
error: Exception,
attempt: int,
max_retries: int,
requested_id: str,
retry_delay: float,
) -> bool:
"""Determine if timeout error should be retried."""
if attempt >= max_retries - 1:
return False
LOGGER.warning("Timeout for document %s, retrying in %ss...", requested_id, retry_delay)
await asyncio.sleep(retry_delay)
return True
async def _should_retry_http_error(
self,
error: HTTPStatusError,
attempt: int,
max_retries: int,
requested_id: str,
retry_delay: float,
) -> bool:
"""Determine if HTTP error should be retried."""
status_code = error.response.status_code
if status_code < 500 or attempt >= max_retries - 1:
return False
LOGGER.warning(
"Server error %s for document %s, retrying in %ss...",
status_code,
requested_id,
retry_delay,
)
await asyncio.sleep(retry_delay)
return True
def _log_document_error(self, document_id: object, exc: Exception) -> None:
"""Log document storage errors with specific categorization."""
LOGGER.error("Failed to store document %s: %s", document_id, exc)
exc_str = str(exc)
if "422" in exc_str:
LOGGER.error(" → Data validation issue - check document content and metadata format")
elif "timeout" in exc_str.lower():
LOGGER.error(" → Network timeout - R2R may be overloaded")
elif "500" in exc_str:
LOGGER.error(" → Server error - R2R internal issue")
else:
import traceback
traceback.print_exc()
def _build_metadata(self, document: Document) -> dict[str, object]:
"""Convert document metadata to enriched R2R format."""
metadata = document.metadata
# Core required fields
result: dict[str, object] = {
"source_url": metadata["source_url"],
"content_type": metadata["content_type"],
"word_count": metadata["word_count"],
"char_count": metadata["char_count"],
"timestamp": metadata["timestamp"].isoformat(),
"ingestion_source": document.source.value,
}
# Basic optional fields
if title := metadata.get("title"):
result["title"] = title
if description := metadata.get("description"):
result["description"] = description
# Content categorization
if tags := metadata.get("tags"):
result["tags"] = tags
if category := metadata.get("category"):
result["category"] = category
if section := metadata.get("section"):
result["section"] = section
if language := metadata.get("language"):
result["language"] = language
# Authorship and source info
if author := metadata.get("author"):
result["author"] = author
if domain := metadata.get("domain"):
result["domain"] = domain
if site_name := metadata.get("site_name"):
result["site_name"] = site_name
# Document structure
if heading_hierarchy := metadata.get("heading_hierarchy"):
result["heading_hierarchy"] = heading_hierarchy
if section_depth := metadata.get("section_depth"):
result["section_depth"] = section_depth
if has_code_blocks := metadata.get("has_code_blocks"):
result["has_code_blocks"] = has_code_blocks
if has_images := metadata.get("has_images"):
result["has_images"] = has_images
if has_links := metadata.get("has_links"):
result["has_links"] = has_links
# Processing metadata
if extraction_method := metadata.get("extraction_method"):
result["extraction_method"] = extraction_method
if crawl_depth := metadata.get("crawl_depth"):
result["crawl_depth"] = crawl_depth
        if last_modified := metadata.get("last_modified"):
            result["last_modified"] = last_modified.isoformat()
# Content quality indicators
if readability_score := metadata.get("readability_score"):
result["readability_score"] = readability_score
if completeness_score := metadata.get("completeness_score"):
result["completeness_score"] = completeness_score
# Repository-specific fields
if file_path := metadata.get("file_path"):
result["file_path"] = file_path
if repository_name := metadata.get("repository_name"):
result["repository_name"] = repository_name
if branch_name := metadata.get("branch_name"):
result["branch_name"] = branch_name
if commit_hash := metadata.get("commit_hash"):
result["commit_hash"] = commit_hash
if programming_language := metadata.get("programming_language"):
result["programming_language"] = programming_language
# Custom business metadata
if importance_score := metadata.get("importance_score"):
result["importance_score"] = importance_score
if review_status := metadata.get("review_status"):
result["review_status"] = review_status
if assigned_team := metadata.get("assigned_team"):
result["assigned_team"] = assigned_team
return result
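    # Illustrative output sketch of _build_metadata for a web page (placeholder
    # values; optional keys appear only when truthy in the source metadata):
    #     {
    #         "source_url": "https://example.com/page",
    #         "content_type": "text/markdown",
    #         "word_count": 120,
    #         "char_count": 800,
    #         "timestamp": "2024-01-02T03:04:05+00:00",
    #         "ingestion_source": "web",
    #         "title": "Example Page",
    #         "tags": ["docs"],
    #     }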
@override
async def retrieve(
self, document_id: str, *, collection_name: str | None = None
) -> Document | None:
"""Retrieve a document by ID."""
try:
response = await self.client.documents.retrieve(document_id)
except R2RException as exc:
status_code = getattr(exc, "status_code", None)
            if status_code == 404:
                return None
            LOGGER.warning("Unexpected error retrieving document %s: %s", document_id, exc)
            return None
        except Exception as error:
            LOGGER.warning("Unexpected error retrieving document %s: %s", document_id, error)
            return None
payload = getattr(response, "results", response)
return self._convert_to_document(payload, collection_name)
def _convert_to_document(self, r2r_doc: object, collection_name: str | None = None) -> Document:
"""Convert R2R document payload to our Document model."""
doc_map = _as_mapping(r2r_doc)
metadata_map = _as_mapping(doc_map.get("metadata", {}))
doc_uuid = self._extract_document_uuid(r2r_doc)
timestamp = _as_datetime(doc_map.get("created_at", metadata_map.get("timestamp")))
metadata = self._build_core_metadata(metadata_map, timestamp)
self._add_optional_metadata_fields(metadata, doc_map, metadata_map)
source_enum = self._extract_ingestion_source(metadata_map)
content_value = doc_map.get("content", getattr(r2r_doc, "content", ""))
return Document(
id=doc_uuid,
content=str(content_value),
metadata=metadata,
source=source_enum,
collection=collection_name or self.config.collection_name,
)
def _extract_document_uuid(self, r2r_doc: object) -> UUID:
"""Extract and validate document UUID."""
doc_id_str = _extract_id(r2r_doc, str(uuid4()))
try:
return UUID(doc_id_str)
except ValueError:
return uuid4()
def _build_core_metadata(
self, metadata_map: dict[str, object], timestamp: datetime
) -> DocumentMetadata:
"""Build core required metadata fields."""
return {
"source_url": str(metadata_map.get("source_url", "")),
"timestamp": timestamp,
"content_type": str(metadata_map.get("content_type", "text/plain")),
"word_count": _as_int(metadata_map.get("word_count")),
"char_count": _as_int(metadata_map.get("char_count")),
}
def _add_optional_metadata_fields(
self,
metadata: DocumentMetadata,
doc_map: dict[str, object],
metadata_map: dict[str, object],
) -> None:
"""Add optional metadata fields if present."""
self._add_title_and_description(metadata, doc_map, metadata_map)
self._add_content_categorization(metadata, metadata_map)
self._add_authorship_fields(metadata, metadata_map)
self._add_structure_fields(metadata, metadata_map)
self._add_processing_fields(metadata, metadata_map)
self._add_quality_scores(metadata, metadata_map)
def _add_title_and_description(
self,
metadata: DocumentMetadata,
doc_map: dict[str, object],
metadata_map: dict[str, object],
) -> None:
"""Add title and description fields."""
if title := (doc_map.get("title") or metadata_map.get("title")):
metadata["title"] = cast(str | None, title)
if summary := (doc_map.get("summary") or metadata_map.get("summary")):
metadata["description"] = cast(str | None, summary)
elif description := metadata_map.get("description"):
metadata["description"] = cast(str | None, description)
def _add_content_categorization(
self, metadata: DocumentMetadata, metadata_map: dict[str, object]
) -> None:
"""Add content categorization fields."""
if tags := metadata_map.get("tags"):
metadata["tags"] = [str(tag) for tag in tags] if isinstance(tags, list) else []
if category := metadata_map.get("category"):
metadata["category"] = str(category)
if section := metadata_map.get("section"):
metadata["section"] = str(section)
if language := metadata_map.get("language"):
metadata["language"] = str(language)
def _add_authorship_fields(
self, metadata: DocumentMetadata, metadata_map: dict[str, object]
) -> None:
"""Add authorship and source information fields."""
if author := metadata_map.get("author"):
metadata["author"] = str(author)
if domain := metadata_map.get("domain"):
metadata["domain"] = str(domain)
if site_name := metadata_map.get("site_name"):
metadata["site_name"] = str(site_name)
def _add_structure_fields(
self, metadata: DocumentMetadata, metadata_map: dict[str, object]
) -> None:
"""Add document structure fields."""
        if heading_hierarchy := metadata_map.get("heading_hierarchy"):
            metadata["heading_hierarchy"] = (
                [str(item) for item in heading_hierarchy]
                if isinstance(heading_hierarchy, list)
                else []
            )
if section_depth := metadata_map.get("section_depth"):
metadata["section_depth"] = _as_int(section_depth)
if has_code_blocks := metadata_map.get("has_code_blocks"):
metadata["has_code_blocks"] = bool(has_code_blocks)
if has_images := metadata_map.get("has_images"):
metadata["has_images"] = bool(has_images)
if has_links := metadata_map.get("has_links"):
metadata["has_links"] = bool(has_links)
def _add_processing_fields(
self, metadata: DocumentMetadata, metadata_map: dict[str, object]
) -> None:
"""Add processing-related metadata fields."""
if extraction_method := metadata_map.get("extraction_method"):
metadata["extraction_method"] = str(extraction_method)
if crawl_depth := metadata_map.get("crawl_depth"):
metadata["crawl_depth"] = _as_int(crawl_depth)
if last_modified := metadata_map.get("last_modified"):
metadata["last_modified"] = _as_datetime(last_modified)
def _add_quality_scores(
self, metadata: DocumentMetadata, metadata_map: dict[str, object]
) -> None:
"""Add quality score fields with safe float conversion."""
if readability_score := metadata_map.get("readability_score"):
try:
metadata["readability_score"] = float(str(readability_score))
except (ValueError, TypeError):
metadata["readability_score"] = None
if completeness_score := metadata_map.get("completeness_score"):
try:
metadata["completeness_score"] = float(str(completeness_score))
except (ValueError, TypeError):
metadata["completeness_score"] = None
def _extract_ingestion_source(self, metadata_map: dict[str, object]) -> IngestionSource:
"""Extract and validate ingestion source."""
source_value = str(metadata_map.get("ingestion_source", IngestionSource.WEB.value))
try:
return IngestionSource(source_value)
except ValueError:
return IngestionSource.WEB
@override
async def search(
self,
query: str,
limit: int = 10,
threshold: float = 0.7,
*,
collection_name: str | None = None,
) -> AsyncGenerator[Document, None]:
"""Search documents using R2R."""
try:
search_settings: dict[str, object] = {
"limit": limit,
"similarity_threshold": threshold,
}
if collection_name:
collection_id = await self._ensure_collection(collection_name)
search_settings["collection_ids"] = [collection_id]
search_response = await self.client.retrieval.search(
query=query,
search_settings=search_settings,
)
for result in _as_sequence(getattr(search_response, "results", ())):
result_map = _as_mapping(result)
document_id_value = result_map.get(
"document_id", getattr(result, "document_id", None)
)
if document_id_value is None:
continue
document_id = str(document_id_value)
try:
doc_response = await self.client.documents.retrieve(document_id)
                except R2RException as exc:
                    LOGGER.warning(
                        "Failed to retrieve document %s during search: %s", document_id, exc
                    )
                    continue
document_payload = getattr(doc_response, "results", doc_response)
document = self._convert_to_document(document_payload, collection_name)
score_value = result_map.get("score", getattr(result, "score", None))
if score_value is not None:
try:
# Handle various score value types safely
if isinstance(score_value, (int, float, str)):
document.score = float(score_value)
else:
# For unknown types, try string conversion first
document.score = float(str(score_value))
                    except (TypeError, ValueError) as e:
                        LOGGER.debug(
                            "Invalid score value %s for document %s: %s",
                            score_value,
                            document_id,
                            e,
                        )
                        document.score = None
yield document
except R2RException as exc:
raise StorageError(f"Search failed: {exc}") from exc
@override
async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool:
"""Delete a document."""
try:
_ = await self.client.documents.delete(document_id)
return True
except R2RException:
return False
@override
async def count(self, *, collection_name: str | None = None) -> int:
"""Get document count in collection."""
endpoint = self.endpoint
client = self._create_http_client()
try:
# Get collections and find the count for the specific collection
response = await client.get(f"{endpoint}/v3/collections")
response.raise_for_status()
data: dict[str, object] = response.json()
target_collection = collection_name or self.config.collection_name
results = cast(list[dict[str, object]], data.get("results", []))
for collection in results:
if collection.get("name") == target_collection:
doc_count = collection.get("document_count", 0)
return _as_int(doc_count)
# Collection not found
return 0
except Exception:
return 0
finally:
await client.aclose()
# This should never be reached, but satisfies static analyzer
return 0
@override
async def close(self) -> None:
"""Close R2R client."""
try:
await self.client.close()
except Exception as e:
import logging
logging.warning(f"Error closing R2R client: {e}")
async def __aenter__(self) -> Self:
"""Async context manager entry."""
return self
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: object | None,
) -> None:
"""Async context manager exit with proper cleanup."""
await self.close()
# Additional R2R-specific comprehensive management methods
async def create_collection(self, name: str, description: str | None = None) -> str:
"""Create a new collection."""
try:
response = await self.client.collections.create(name=name, description=description)
created = _as_mapping(getattr(response, "results", {}))
return str(created.get("id", name))
except R2RException as exc:
raise StorageError(f"Failed to create collection {name}: {exc}") from exc
async def delete_collection(self, collection_name: str) -> bool:
"""Delete a collection."""
try:
collection_id = await self._ensure_collection(collection_name)
_ = await self.client.collections.delete(collection_id)
return True
except R2RException:
return False
@override
async def list_collections(self) -> list[str]:
"""List all available collections."""
endpoint = self.endpoint
client = self._create_http_client()
try:
response = await client.get(f"{endpoint}/v3/collections")
response.raise_for_status()
data: dict[str, object] = response.json()
collection_names: list[str] = []
results = cast(list[dict[str, object]], data.get("results", []))
for entry in results:
if name := entry.get("name"):
collection_names.append(str(name))
return collection_names
except Exception as e:
raise StorageError(f"Failed to list collections: {e}") from e
finally:
await client.aclose()
# This should never be reached, but satisfies static analyzer
return []
async def list_collections_detailed(self) -> list[dict[str, object]]:
"""List all available collections with detailed information."""
try:
response = await self.client.collections.list()
collections: list[dict[str, object]] = []
for entry in _as_sequence(getattr(response, "results", ())):
entry_map = _as_mapping(entry)
collections.append(
{
"id": str(entry_map.get("id", "")),
"name": str(entry_map.get("name", "")),
"description": entry_map.get("description"),
}
)
return collections
except R2RException as exc:
raise StorageError(f"Failed to list collections: {exc}") from exc
async def get_document_chunks(self, document_id: str) -> list[dict[str, object]]:
"""Get all chunks for a specific document."""
try:
response = await self.client.chunks.list(filters={"document_id": document_id})
return [
dict(_as_mapping(chunk)) for chunk in _as_sequence(getattr(response, "results", ()))
]
except R2RException as exc:
raise StorageError(f"Failed to get chunks for document {document_id}: {exc}") from exc
async def extract_entities(self, document_id: str) -> dict[str, object]:
"""Extract entities and relationships from a document."""
try:
response = await self.client.documents.extract(id=document_id)
return dict(_as_mapping(getattr(response, "results", {})))
except R2RException as exc:
raise StorageError(
f"Failed to extract entities from document {document_id}: {exc}"
) from exc
async def get_document_overview(self, document_id: str) -> dict[str, object]:
"""Get comprehensive document overview and statistics."""
try:
doc_response = await self.client.documents.retrieve(document_id)
chunks_response = await self.client.chunks.list(filters={"document_id": document_id})
document_payload = dict(_as_mapping(getattr(doc_response, "results", {})))
chunk_payload = [
dict(_as_mapping(chunk))
for chunk in _as_sequence(getattr(chunks_response, "results", ()))
]
return {
"document": document_payload,
"chunk_count": len(chunk_payload),
"chunks": chunk_payload,
}
except R2RException as exc:
raise StorageError(f"Failed to get overview for document {document_id}: {exc}") from exc
@override
async def list_documents(
self,
limit: int = 100,
offset: int = 0,
*,
collection_name: str | None = None,
) -> list[DocumentInfo]:
"""
List documents in R2R with pagination.
Args:
limit: Maximum number of documents to return
offset: Number of documents to skip
collection_name: Collection name (optional)
Returns:
List of document dictionaries with metadata
"""
try:
documents: list[DocumentInfo] = []
if collection_name:
# Get collection ID first
collection_id = await self._ensure_collection(collection_name)
# Use the collections API to list documents in a specific collection
endpoint = self.endpoint
client = self._create_http_client()
try:
params = {"offset": offset, "limit": limit}
response = await client.get(
f"{endpoint}/v3/collections/{collection_id}/documents", params=params
)
response.raise_for_status()
data: dict[str, object] = response.json()
finally:
await client.aclose()
doc_sequence = _as_sequence(data.get("results", []))
else:
# List all documents
r2r_response = await self.client.documents.list(offset=offset, limit=limit)
documents_data: list[object] | dict[str, object] = getattr(
r2r_response, "results", []
)
doc_sequence = _as_sequence(
documents_data.get("results", [])
if isinstance(documents_data, dict)
else documents_data
)
for doc_data in doc_sequence:
doc_map = _as_mapping(doc_data)
# Extract standard document fields
doc_id = str(doc_map.get("id", ""))
title = str(doc_map.get("title", "Untitled"))
metadata = _as_mapping(doc_map.get("metadata", {}))
document_info: DocumentInfo = {
"id": doc_id,
"title": title,
"source_url": str(metadata.get("source_url", "")),
"description": str(metadata.get("description", "")),
"content_type": str(metadata.get("content_type", "text/plain")),
"content_preview": str(doc_map.get("content", ""))[:200] + "..."
if doc_map.get("content")
else "",
"word_count": _as_int(metadata.get("word_count", 0)),
"timestamp": str(doc_map.get("created_at", "")),
}
documents.append(document_info)
return documents
except Exception as e:
raise StorageError(f"Failed to list documents: {e}") from e
</file>
<file path="ingest_pipeline/storage/base.py">
"""Base storage interface."""
import asyncio
import logging
import random
from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator
from types import TracebackType
from typing import Final
import httpx
from pydantic import SecretStr
from ..core.exceptions import StorageError
from ..core.models import Document, StorageConfig
from .types import CollectionSummary, DocumentInfo
LOGGER: Final[logging.Logger] = logging.getLogger(__name__)
class TypedHttpClient:
"""
A properly typed HTTP client wrapper for HTTPX.
Provides consistent exception handling and type annotations
for storage adapters that use HTTP APIs.
Note: Some type checkers (Pylance) may report warnings about HTTPX types
due to library compatibility issues. The code functions correctly at runtime.
"""
client: httpx.AsyncClient
_base_url: str
def __init__(
self,
base_url: str,
*,
api_key: SecretStr | None = None,
timeout: float = 30.0,
headers: dict[str, str] | None = None,
max_connections: int = 100,
max_keepalive_connections: int = 20,
):
"""
Initialize the typed HTTP client.
Args:
base_url: Base URL for all requests
api_key: Optional API key for authentication
timeout: Request timeout in seconds
headers: Additional headers to include with requests
max_connections: Maximum total connections in pool
max_keepalive_connections: Maximum keepalive connections
"""
self._base_url = base_url
# Build headers with optional authentication
client_headers: dict[str, str] = headers or {}
if api_key:
client_headers["Authorization"] = f"Bearer {api_key.get_secret_value()}"
# Create typed client configuration with connection pooling
limits = httpx.Limits(
max_connections=max_connections, max_keepalive_connections=max_keepalive_connections
)
timeout_config = httpx.Timeout(connect=5.0, read=timeout, write=30.0, pool=10.0)
self.client = httpx.AsyncClient(
base_url=base_url, headers=client_headers, timeout=timeout_config, limits=limits
)
async def request(
self,
method: str,
path: str,
*,
allow_404: bool = False,
json: dict[str, object] | None = None,
data: dict[str, object] | None = None,
files: dict[str, tuple[str, bytes, str]] | None = None,
params: dict[str, str | bool] | None = None,
max_retries: int = 3,
retry_delay: float = 1.0,
) -> httpx.Response | None:
"""
Perform an HTTP request with consistent error handling and retries.
Args:
method: HTTP method (GET, POST, DELETE, etc.)
path: URL path relative to base_url
allow_404: If True, return None for 404 responses instead of raising
json: JSON data to send
data: Form data to send
files: Files to upload
params: Query parameters
max_retries: Maximum number of retry attempts
retry_delay: Base delay between retries in seconds
Returns:
HTTP response object, or None if allow_404=True and status is 404
Raises:
StorageError: If request fails after retries
"""
last_exception: Exception | None = None
for attempt in range(max_retries + 1):
try:
response = await self.client.request(
method, path, json=json, data=data, files=files, params=params
)
response.raise_for_status()
return response
except httpx.HTTPStatusError as e:
# Handle 404 as special case if requested
if allow_404 and e.response.status_code == 404:
LOGGER.debug("Resource not found (404): %s %s", method, path)
return None
# Don't retry client errors (4xx except for specific cases)
if 400 <= e.response.status_code < 500 and e.response.status_code not in [429, 408]:
raise StorageError(
f"HTTP {e.response.status_code} error from {self._base_url}: {e}"
) from e
last_exception = e
if attempt < max_retries:
# Exponential backoff with jitter for retryable errors
delay = retry_delay * (2**attempt) + random.uniform(0, 1)
LOGGER.warning(
"HTTP %d error on attempt %d/%d, retrying in %.2fs: %s",
e.response.status_code,
attempt + 1,
max_retries + 1,
delay,
e,
)
await asyncio.sleep(delay)
except httpx.HTTPError as e:
last_exception = e
if attempt < max_retries:
# Retry transport errors with backoff
delay = retry_delay * (2**attempt) + random.uniform(0, 1)
LOGGER.warning(
"HTTP transport error on attempt %d/%d, retrying in %.2fs: %s",
attempt + 1,
max_retries + 1,
delay,
e,
)
await asyncio.sleep(delay)
# All retries exhausted - last_exception should always be set if we reach here
if last_exception is None:
raise StorageError(
f"Request to {self._base_url} failed after {max_retries + 1} attempts with unknown error"
)
if isinstance(last_exception, httpx.HTTPStatusError):
raise StorageError(
f"HTTP {last_exception.response.status_code} error from {self._base_url} after {max_retries + 1} attempts: {last_exception}"
) from last_exception
else:
raise StorageError(
f"HTTP transport error to {self._base_url} after {max_retries + 1} attempts: {last_exception}"
) from last_exception
async def close(self) -> None:
"""Close the HTTP client and cleanup resources."""
try:
await self.client.aclose()
except Exception as e:
LOGGER.warning("Error closing HTTP client: %s", e)
async def __aenter__(self) -> "TypedHttpClient":
"""Async context manager entry."""
return self
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: TracebackType | None,
) -> None:
"""Async context manager exit."""
await self.close()
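# --- Usage sketch (illustrative only, not part of the adapter API) ---
# A minimal example of driving TypedHttpClient as an async context manager.
# The base URL and path below are hypothetical placeholders. Retries follow
# the schedule in request(): with retry_delay=1.0 the backoff delays are
# roughly 1s, 2s, 4s (each plus up to 1s of jitter) before giving up.
async def _example_typed_http_client_usage() -> None:
    async with TypedHttpClient("https://storage.example.com", timeout=10.0) as http:
        # allow_404=True turns a 404 into None instead of a StorageError.
        response = await http.request("GET", "/health", allow_404=True)
        if response is None:
            LOGGER.info("health endpoint not available")
        else:
            LOGGER.info("health check status: %s", response.status_code)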
class BaseStorage(ABC):
"""Abstract base class for storage adapters."""
config: StorageConfig
def __init__(self, config: StorageConfig):
"""
Initialize storage adapter.
Args:
config: Storage configuration
"""
self.config = config
@property
def display_name(self) -> str:
"""Human-readable name for UI display."""
return self.__class__.__name__.replace("Storage", "")
@abstractmethod
async def initialize(self) -> None:
"""Initialize the storage backend and create collections if needed."""
pass # pragma: no cover
@abstractmethod
async def store(self, document: Document, *, collection_name: str | None = None) -> str:
"""
Store a single document.
Args:
document: Document to store
Returns:
Document ID
"""
pass # pragma: no cover
@abstractmethod
async def store_batch(
self, documents: list[Document], *, collection_name: str | None = None
) -> list[str]:
"""
Store multiple documents in batch.
Args:
documents: List of documents to store
Returns:
List of document IDs
"""
pass # pragma: no cover
async def retrieve(
self, document_id: str, *, collection_name: str | None = None
) -> Document | None:
"""
Retrieve a document by ID (if supported by backend).
Args:
document_id: Document ID
Returns:
Document or None if not found
Raises:
NotImplementedError: If backend doesn't support retrieval
"""
raise NotImplementedError(f"{self.__class__.__name__} doesn't support document retrieval")
async def check_exists(
self, document_id: str, *, collection_name: str | None = None, stale_after_days: int = 30
) -> bool:
"""
Check if a document exists and is not stale.
Args:
document_id: Document ID to check
collection_name: Collection to check in
stale_after_days: Consider document stale after this many days
Returns:
True if document exists and is not stale, False otherwise
"""
try:
document = await self.retrieve(document_id, collection_name=collection_name)
if document is None:
return False
# Check staleness if timestamp is available
if "timestamp" in document.metadata:
from datetime import UTC, datetime, timedelta
timestamp_obj = document.metadata["timestamp"]
# Handle both datetime objects and ISO strings
if isinstance(timestamp_obj, datetime):
timestamp = timestamp_obj
# Ensure timezone awareness
if timestamp.tzinfo is None:
timestamp = timestamp.replace(tzinfo=UTC)
elif isinstance(timestamp_obj, str):
try:
timestamp = datetime.fromisoformat(timestamp_obj)
# Ensure timezone awareness
if timestamp.tzinfo is None:
timestamp = timestamp.replace(tzinfo=UTC)
except ValueError:
# If parsing fails, assume document is stale
return False
else:
# Unknown timestamp format, assume stale
return False
cutoff = datetime.now(UTC) - timedelta(days=stale_after_days)
return timestamp >= cutoff
# If no timestamp, assume it exists and is valid
return True
except Exception:
# Backend doesn't support retrieval, assume doesn't exist
return False
def search(
self,
query: str,
limit: int = 10,
threshold: float = 0.7,
*,
collection_name: str | None = None,
) -> AsyncGenerator[Document, None]:
"""
Search for documents (if supported by backend).
Args:
query: Search query
limit: Maximum number of results
threshold: Similarity threshold
Yields:
Matching documents
Raises:
NotImplementedError: If backend doesn't support search
"""
raise NotImplementedError(f"{self.__class__.__name__} doesn't support search")
@abstractmethod
async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool:
"""
Delete a document.
Args:
document_id: Document ID
Returns:
True if deleted successfully
"""
pass # pragma: no cover
async def count(self, *, collection_name: str | None = None) -> int:
"""
Get total document count (if supported by backend).
Returns:
Number of documents, 0 if not supported
"""
return 0
async def list_collections(self) -> list[str]:
"""
List available collections (if supported by backend).
Returns:
List of collection names, empty list if not supported
"""
return []
async def describe_collections(self) -> list[CollectionSummary]:
"""
Describe available collections with metadata (if supported by backend).
Returns:
List of collection metadata, empty list if not supported
"""
return []
async def delete_collection(self, collection_name: str) -> bool:
"""
Delete a collection (if supported by backend).
Args:
collection_name: Name of collection to delete
Returns:
True if deleted successfully, False if not supported
"""
return False
async def delete_documents(
self, document_ids: list[str], *, collection_name: str | None = None
) -> dict[str, bool]:
"""
Delete documents by IDs (if supported by backend).
Args:
document_ids: List of document IDs to delete
collection_name: Collection to delete from
Returns:
Dict mapping document IDs to success status, empty if not supported
"""
return {}
async def list_documents(
self,
limit: int = 100,
offset: int = 0,
*,
collection_name: str | None = None,
) -> list[DocumentInfo]:
"""
List documents in the storage backend (if supported).
Args:
limit: Maximum number of documents to return
offset: Number of documents to skip
collection_name: Collection to list documents from
Returns:
List of document information with metadata
Raises:
NotImplementedError: If backend doesn't support document listing
"""
raise NotImplementedError(f"{self.__class__.__name__} doesn't support document listing")
async def close(self) -> None:
"""
Close storage connections and cleanup resources.
Default implementation does nothing.
"""
# Default implementation - storage backends can override to cleanup connections
return None
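# --- Subclassing sketch (illustrative only) ---
# A toy in-memory adapter showing the minimum surface a concrete backend must
# implement: initialize, store, store_batch, and delete. The class name and
# dict-backed storage are invented for illustration, not a shipped backend.
class _InMemoryStorageSketch(BaseStorage):
    """Toy adapter that keeps documents in a dict keyed by document ID."""
    _docs: dict[str, Document]
    def __init__(self, config: StorageConfig):
        super().__init__(config)
        self._docs = {}
    async def initialize(self) -> None:
        self._docs = {}
    async def store(self, document: Document, *, collection_name: str | None = None) -> str:
        self._docs[str(document.id)] = document
        return str(document.id)
    async def store_batch(
        self, documents: list[Document], *, collection_name: str | None = None
    ) -> list[str]:
        return [await self.store(doc, collection_name=collection_name) for doc in documents]
    async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool:
        return self._docs.pop(document_id, None) is not None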
</file>
<file path="ingest_pipeline/storage/openwebui.py">
"""Open WebUI storage adapter."""
import asyncio
import contextlib
import logging
import time
from typing import Final, NamedTuple, TypedDict
from typing_extensions import override
from ..core.exceptions import StorageError
from ..core.models import Document, StorageConfig
from .base import BaseStorage, TypedHttpClient
from .types import CollectionSummary, DocumentInfo
LOGGER: Final[logging.Logger] = logging.getLogger(__name__)
class OpenWebUIFileResponse(TypedDict, total=False):
"""OpenWebUI API file response structure."""
id: str
filename: str
name: str
content_type: str
size: int
created_at: str
meta: dict[str, str | int]
class OpenWebUIKnowledgeBase(TypedDict, total=False):
"""OpenWebUI knowledge base response structure."""
id: str
name: str
description: str
files: list[OpenWebUIFileResponse]
data: dict[str, str]
created_at: str
updated_at: str
class CacheEntry(NamedTuple):
"""Cache entry with value and expiration time."""
value: str
expires_at: float
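# --- TTL lookup sketch (illustrative only) ---
# Demonstrates the expiry pattern used by OpenWebUIStorage._get_knowledge_id:
# a cached value is only served while time.time() is still before expires_at;
# otherwise the entry is evicted so the next call refreshes from the API.
# The cache dict and key below are hypothetical.
def _example_ttl_lookup(cache: dict[str, CacheEntry], key: str) -> str | None:
    entry = cache.get(key)
    if entry is None:
        return None
    if time.time() >= entry.expires_at:
        del cache[key]
        return None
    return entry.value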
class OpenWebUIStorage(BaseStorage):
"""Storage adapter for Open WebUI knowledge endpoints."""
http_client: TypedHttpClient
_knowledge_cache: dict[str, CacheEntry]
_cache_ttl: float
def __init__(self, config: StorageConfig):
"""
Initialize Open WebUI storage.
Args:
config: Storage configuration
"""
super().__init__(config)
self.http_client = TypedHttpClient(
base_url=str(config.endpoint),
api_key=config.api_key,
timeout=30.0,
)
self._knowledge_cache = {}
self._cache_ttl = 300.0 # 5 minutes TTL
@override
async def initialize(self) -> None:
"""Initialize Open WebUI connection."""
try:
if self.config.collection_name:
await self._get_knowledge_id(
self.config.collection_name,
create=True,
)
except Exception as e:
raise StorageError(f"Failed to initialize Open WebUI: {e}") from e
async def _create_collection(self, name: str) -> str:
"""Create knowledge base in Open WebUI."""
response = await self.http_client.request(
"POST",
"/api/v1/knowledge/create",
json={
"name": name,
"description": "Documents ingested from various sources",
"data": {},
"access_control": None,
},
)
if response is None:
raise StorageError("Unexpected None response from knowledge base creation")
result = response.json()
knowledge_id = result.get("id")
if not knowledge_id or not isinstance(knowledge_id, str):
raise StorageError("Knowledge base creation failed: no ID returned")
return str(knowledge_id)
async def _fetch_knowledge_bases(self) -> list[OpenWebUIKnowledgeBase]:
"""Return the list of knowledge bases from the API."""
response = await self.http_client.request("GET", "/api/v1/knowledge/list")
if response is None:
return []
data = response.json()
if not isinstance(data, list):
return []
normalized: list[OpenWebUIKnowledgeBase] = []
for item in data:
if (
isinstance(item, dict)
and "id" in item
and "name" in item
and isinstance(item["id"], str)
and isinstance(item["name"], str)
):
# Create a new dict with known structure
kb_item: OpenWebUIKnowledgeBase = {
"id": item["id"],
"name": item["name"],
"description": item.get("description", ""),
"created_at": item.get("created_at", ""),
"updated_at": item.get("updated_at", ""),
}
if "files" in item and isinstance(item["files"], list):
kb_item["files"] = item["files"]
if "data" in item and isinstance(item["data"], dict):
kb_item["data"] = item["data"]
normalized.append(kb_item)
return normalized
async def _get_knowledge_id(
self,
name: str | None,
*,
create: bool,
) -> str | None:
"""Retrieve (and optionally create) a knowledge base identifier."""
target_raw = name or self.config.collection_name
target = str(target_raw) if target_raw else ""
if not target:
raise StorageError("Knowledge base name is required")
# Check cache with TTL
if cached_entry := self._knowledge_cache.get(target):
if time.time() < cached_entry.expires_at:
return cached_entry.value
else:
# Entry expired, remove it
del self._knowledge_cache[target]
knowledge_bases = await self._fetch_knowledge_bases()
for kb in knowledge_bases:
if kb.get("name") == target:
kb_id = kb.get("id")
if isinstance(kb_id, str):
expires_at = time.time() + self._cache_ttl
self._knowledge_cache[target] = CacheEntry(kb_id, expires_at)
return kb_id
if not create:
return None
knowledge_id = await self._create_collection(target)
expires_at = time.time() + self._cache_ttl
self._knowledge_cache[target] = CacheEntry(knowledge_id, expires_at)
return knowledge_id
@override
async def store(self, document: Document, *, collection_name: str | None = None) -> str:
"""
Store a document in Open WebUI as a file.
Args:
document: Document to store
Returns:
File ID
"""
try:
knowledge_id = await self._get_knowledge_id(
collection_name,
create=True,
)
if not knowledge_id:
raise StorageError("Knowledge base not initialized")
# Step 1: Upload document as file
# Use document title from metadata if available, otherwise fall back to ID
filename = document.metadata.get("title") or f"doc_{document.id}"
# Ensure filename has proper extension
if not filename.endswith((".txt", ".md", ".pdf", ".doc", ".docx")):
filename = f"{filename}.txt"
files = {"file": (filename, document.content.encode(), "text/plain")}
response = await self.http_client.request(
"POST",
"/api/v1/files/",
files=files,
params={"process": True, "process_in_background": False},
)
if response is None:
raise StorageError("Unexpected None response from file upload")
file_data = response.json()
file_id = file_data.get("id")
if not file_id or not isinstance(file_id, str):
raise StorageError("File upload failed: no file ID returned")
# Step 2: Add file to knowledge base
response = await self.http_client.request(
"POST", f"/api/v1/knowledge/{knowledge_id}/file/add", json={"file_id": file_id}
)
return str(file_id)
except Exception as e:
raise StorageError(f"Failed to store document: {e}") from e
@override
async def store_batch(
self, documents: list[Document], *, collection_name: str | None = None
) -> list[str]:
"""
Store multiple documents as files in batch.
Args:
documents: List of documents
Returns:
List of file IDs
"""
try:
knowledge_id = await self._get_knowledge_id(
collection_name,
create=True,
)
if not knowledge_id:
raise StorageError("Knowledge base not initialized")
async def upload_and_attach(doc: Document) -> str:
# Use document title from metadata if available, otherwise fall back to ID
filename = doc.metadata.get("title") or f"doc_{doc.id}"
# Ensure filename has proper extension
if not filename.endswith((".txt", ".md", ".pdf", ".doc", ".docx")):
filename = f"{filename}.txt"
files = {"file": (filename, doc.content.encode(), "text/plain")}
upload_response = await self.http_client.request(
"POST",
"/api/v1/files/",
files=files,
params={"process": True, "process_in_background": False},
)
if upload_response is None:
raise StorageError(
f"Unexpected None response from file upload for document {doc.id}"
)
file_data = upload_response.json()
file_id = file_data.get("id")
if not file_id or not isinstance(file_id, str):
raise StorageError(
f"File upload failed for document {doc.id}: no file ID returned"
)
await self.http_client.request(
"POST", f"/api/v1/knowledge/{knowledge_id}/file/add", json={"file_id": file_id}
)
return str(file_id)
tasks = [upload_and_attach(doc) for doc in documents]
results = await asyncio.gather(*tasks, return_exceptions=True)
file_ids: list[str] = []
failures: list[str] = []
for index, result in enumerate(results):
doc = documents[index]
if isinstance(result, Exception):
failures.append(f"{doc.id}: {result}")
else:
if isinstance(result, str):
file_ids.append(result)
if failures:
LOGGER.warning(
"OpenWebUI partial batch failure for knowledge base %s: %s",
self.config.collection_name,
", ".join(failures),
)
return file_ids
except Exception as e:
raise StorageError(f"Failed to store batch: {e}") from e
@override
async def retrieve(
self, document_id: str, *, collection_name: str | None = None
) -> Document | None:
"""
OpenWebUI doesn't support document retrieval by ID.
Args:
document_id: File ID (not supported)
collection_name: Collection name (not used)
        Raises:
            NotImplementedError: Always raised - OpenWebUI has no document retrieval API
        """
_ = document_id, collection_name # Mark as used
# OpenWebUI uses file-based storage without direct document retrieval
raise NotImplementedError("OpenWebUI doesn't support document retrieval by ID")
@override
async def check_exists(
self, document_id: str, *, collection_name: str | None = None, stale_after_days: int = 30
) -> bool:
"""
Check if a document exists in OpenWebUI knowledge base by searching files.
Args:
document_id: Document ID to check (usually based on source URL)
collection_name: Knowledge base name
stale_after_days: Consider document stale after this many days
Returns:
True if document exists and is not stale, False otherwise
"""
try:
from datetime import UTC, datetime, timedelta
# Get knowledge base
knowledge_id = await self._get_knowledge_id(collection_name, create=False)
if not knowledge_id:
return False
# Get detailed knowledge base info to access files
response = await self.http_client.request("GET", f"/api/v1/knowledge/{knowledge_id}")
if response is None:
return False
kb_data = response.json()
files = kb_data.get("files", [])
# Look for file with matching document ID or source URL in metadata
cutoff = datetime.now(UTC) - timedelta(days=stale_after_days)
def _parse_openwebui_timestamp(timestamp_str: str) -> datetime | None:
"""Parse OpenWebUI timestamp with proper timezone handling."""
try:
# Handle both 'Z' suffix and explicit timezone
normalized = timestamp_str.replace("Z", "+00:00")
parsed = datetime.fromisoformat(normalized)
# Ensure timezone awareness
if parsed.tzinfo is None:
parsed = parsed.replace(tzinfo=UTC)
return parsed
except (ValueError, AttributeError):
return None
def _check_file_freshness(file_info: dict[str, object]) -> bool:
"""Check if file is fresh enough based on creation date."""
created_at = file_info.get("created_at")
if not isinstance(created_at, str):
# No date info available, consider stale to be safe
return False
file_date = _parse_openwebui_timestamp(created_at)
return file_date is not None and file_date >= cutoff
for file_info in files:
if not isinstance(file_info, dict):
continue
file_id = file_info.get("id")
if str(file_id) == document_id:
return _check_file_freshness(file_info)
# Also check meta.source_url if available for URL-based document IDs
meta = file_info.get("meta", {})
if isinstance(meta, dict):
source_url = meta.get("source_url")
if source_url and document_id in str(source_url):
return _check_file_freshness(file_info)
return False
except Exception as e:
LOGGER.debug("Error checking document existence in OpenWebUI: %s", e)
return False
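    # Timestamp example (illustrative): OpenWebUI may report created_at as
    # "2024-05-01T12:00:00Z"; _parse_openwebui_timestamp rewrites the "Z"
    # suffix to "+00:00" for datetime.fromisoformat and coerces naive values
    # to UTC so the staleness cutoff comparison is always timezone-aware.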
@override
async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool:
"""
Remove a file from Open WebUI knowledge base.
Args:
document_id: File ID to remove
Returns:
True if removed successfully
"""
try:
knowledge_id = await self._get_knowledge_id(
collection_name,
create=False,
)
if not knowledge_id:
return False
# Remove file from knowledge base
await self.http_client.request(
"POST",
f"/api/v1/knowledge/{knowledge_id}/file/remove",
json={"file_id": document_id},
)
await self.http_client.request("DELETE", f"/api/v1/files/{document_id}", allow_404=True)
return True
except Exception as exc:
LOGGER.error("Error deleting file %s from OpenWebUI", document_id, exc_info=exc)
return False
async def list_collections(self) -> list[str]:
"""
List all available knowledge bases.
Returns:
List of knowledge base names
"""
try:
knowledge_bases = await self._fetch_knowledge_bases()
# Extract names from knowledge bases
return [
str(kb.get("name", f"knowledge_{kb.get('id', 'unknown')}") or "")
for kb in knowledge_bases
]
except Exception as e:
raise StorageError(f"Failed to list knowledge bases: {e}") from e
async def delete_collection(self, collection_name: str) -> bool:
"""
Delete a knowledge base by name.
Args:
collection_name: Name of the knowledge base to delete
Returns:
True if deleted successfully, False otherwise
"""
try:
knowledge_id = await self._get_knowledge_id(collection_name, create=False)
if not knowledge_id:
# Collection doesn't exist, consider it already deleted
return True
# Delete the knowledge base using the OpenWebUI API
await self.http_client.request(
"DELETE", f"/api/v1/knowledge/{knowledge_id}/delete", allow_404=True
)
# Remove from cache if it exists
if collection_name in self._knowledge_cache:
del self._knowledge_cache[collection_name]
LOGGER.info("Successfully deleted knowledge base: %s", collection_name)
return True
except Exception as e:
if hasattr(e, "response"):
response_attr = getattr(e, "response", None)
if response_attr is not None and hasattr(response_attr, "status_code"):
with contextlib.suppress(Exception):
status_code = response_attr.status_code
if status_code == 404:
LOGGER.info(
"Knowledge base %s was already deleted or not found",
collection_name,
)
return True
LOGGER.error(
"Error deleting knowledge base %s from OpenWebUI",
collection_name,
exc_info=e,
)
return False
async def _get_knowledge_base_count(self, kb: OpenWebUIKnowledgeBase) -> int:
"""Get the file count for a knowledge base."""
kb_id = kb.get("id")
name = kb.get("name", "Unknown")
if not kb_id:
return self._count_files_from_basic_info(kb)
return await self._count_files_from_detailed_info(str(kb_id), str(name), kb)
def _count_files_from_basic_info(self, kb: OpenWebUIKnowledgeBase) -> int:
"""Count files from basic knowledge base info."""
files = kb.get("files", [])
        return len(files) if isinstance(files, list) else 0
async def _count_files_from_detailed_info(
self, kb_id: str, name: str, kb: OpenWebUIKnowledgeBase
) -> int:
"""Count files by fetching detailed knowledge base info."""
try:
LOGGER.debug(f"Fetching detailed info for KB '{name}' from /api/v1/knowledge/{kb_id}")
detail_response = await self.http_client.request("GET", f"/api/v1/knowledge/{kb_id}")
if detail_response is None:
LOGGER.warning(f"Knowledge base '{name}' (ID: {kb_id}) not found")
return self._count_files_from_basic_info(kb)
detailed_kb = detail_response.json()
files = detailed_kb.get("files", [])
            count = len(files) if isinstance(files, list) else 0
LOGGER.info(f"Knowledge base '{name}' (ID: {kb_id}): found {count} files")
return count
except Exception as e:
LOGGER.warning(f"Failed to get detailed info for KB '{name}' (ID: {kb_id}): {e}")
return self._count_files_from_basic_info(kb)
async def describe_collections(self) -> list[CollectionSummary]:
"""Return metadata about each knowledge base."""
try:
knowledge_bases = await self._fetch_knowledge_bases()
collections: list[CollectionSummary] = []
for kb in knowledge_bases:
count = await self._get_knowledge_base_count(kb)
name = kb.get("name", "Unknown")
size_mb = count * 0.5 # rough heuristic
summary: CollectionSummary = {
"name": str(name),
"count": count,
"size_mb": float(size_mb),
}
collections.append(summary)
return collections
except Exception as e:
raise StorageError(f"Failed to describe knowledge bases: {e}") from e
async def count(self, *, collection_name: str | None = None) -> int:
"""
Get document count for a specific collection (knowledge base).
Args:
collection_name: Name of the knowledge base to count documents for
Returns:
Number of documents in the collection, 0 if collection not found
"""
if not collection_name:
# If no collection name provided, return total across all collections
try:
collections = await self.describe_collections()
return sum(
int(collection["count"]) if isinstance(collection["count"], (int, str)) else 0
for collection in collections
)
except Exception:
return 0
try:
# Get knowledge base by name and return its file count
kb = await self.get_knowledge_by_name(collection_name)
if not kb:
return 0
kb_id = kb.get("id")
if not kb_id:
return 0
# Get detailed knowledge base information to get accurate file count
detail_response = await self.http_client.request("GET", f"/api/v1/knowledge/{kb_id}")
if detail_response is None:
LOGGER.warning(f"Knowledge base '{collection_name}' (ID: {kb_id}) not found")
return self._count_files_from_basic_info(kb)
detailed_kb = detail_response.json()
files = detailed_kb.get("files", [])
count = len(files) if isinstance(files, list) else 0
LOGGER.debug(f"Count for collection '{collection_name}': {count} files")
return count
except Exception as e:
LOGGER.warning(f"Failed to get count for collection '{collection_name}': {e}")
return 0
async def get_knowledge_by_name(self, name: str) -> OpenWebUIKnowledgeBase | None:
"""
Get knowledge base details by name.
Args:
name: Knowledge base name
Returns:
Knowledge base details or None if not found
"""
try:
response = await self.http_client.request("GET", "/api/v1/knowledge/list")
if response is None:
return None
knowledge_bases = response.json()
# Find and properly type the knowledge base
for kb in knowledge_bases:
if (
isinstance(kb, dict)
and kb.get("name") == name
and "id" in kb
and isinstance(kb["id"], str)
):
# Create properly typed response
result: OpenWebUIKnowledgeBase = {
"id": kb["id"],
"name": str(kb["name"]),
"description": kb.get("description", ""),
"created_at": kb.get("created_at", ""),
"updated_at": kb.get("updated_at", ""),
}
if "files" in kb and isinstance(kb["files"], list):
result["files"] = kb["files"]
if "data" in kb and isinstance(kb["data"], dict):
result["data"] = kb["data"]
return result
return None
except Exception as e:
raise StorageError(f"Failed to get knowledge base by name: {e}") from e
async def __aenter__(self) -> "OpenWebUIStorage":
"""Async context manager entry."""
await self.initialize()
return self
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: object | None,
) -> None:
"""Async context manager exit."""
_ = exc_type, exc_val, exc_tb # Mark as used
await self.close()
async def list_documents(
self,
limit: int = 100,
offset: int = 0,
*,
collection_name: str | None = None,
) -> list[DocumentInfo]:
"""
List documents (files) in a knowledge base.
NOTE: This is a basic implementation that attempts to extract file information
from OpenWebUI knowledge bases. The actual file listing capabilities depend
on the OpenWebUI API version and may not include detailed file metadata.
Args:
limit: Maximum number of documents to return
offset: Number of documents to skip
collection_name: Knowledge base name
Returns:
List of document dictionaries with available metadata
"""
try:
# Use the knowledge base name or fall back to default
kb_name = collection_name or self.config.collection_name or "default"
# Try to get knowledge base details
knowledge_base = await self.get_knowledge_by_name(kb_name)
if not knowledge_base:
# If specific KB not found, return empty list with a note
return []
# Extract files if available (API structure may vary)
files = knowledge_base.get("files", [])
# Handle different possible API response structures
if not isinstance(files, list):
# Some API versions might structure this differently
# Try to handle gracefully
return [
{
"id": "unknown",
"title": f"Knowledge Base: {kb_name}",
"source_url": "",
"description": "OpenWebUI knowledge base (file details not available)",
"content_type": "text/plain",
"content_preview": "Document listing not fully supported for OpenWebUI",
"word_count": 0,
"timestamp": "",
}
]
# Apply pagination
paginated_files = files[offset : offset + limit]
# Convert to document format with safe field access
documents: list[DocumentInfo] = []
for i, file_info in enumerate(paginated_files):
# Safely extract fields with fallbacks
doc_id = str(file_info.get("id", f"file_{i}"))
# Try multiple ways to get filename from OpenWebUI API response
filename = None
# Check direct filename field
if "filename" in file_info:
filename = file_info["filename"]
# Check name field
elif "name" in file_info:
filename = file_info["name"]
# Check meta.name (from FileModelResponse schema)
elif isinstance(file_info.get("meta"), dict):
meta = file_info.get("meta")
if isinstance(meta, dict):
filename_value = meta.get("name")
if isinstance(filename_value, str):
filename = filename_value
# Final fallback
if not filename:
filename = f"file_{i}"
filename = str(filename)
# Extract size from meta if available
size = 0
meta = file_info.get("meta")
if isinstance(meta, dict):
size_value = meta.get("size", 0)
size = int(size_value) if isinstance(size_value, (int, float)) else 0
else:
size_value = file_info.get("size", 0)
size = int(size_value) if isinstance(size_value, (int, float)) else 0
# Estimate word count from file size (very rough approximation)
word_count = max(1, int(size / 6)) if isinstance(size, (int, float)) else 0
doc_info: DocumentInfo = {
"id": doc_id,
"title": filename,
"source_url": "", # OpenWebUI files don't typically have source URLs
"description": f"File: {filename}",
"content_type": str(file_info.get("content_type", "text/plain")),
"content_preview": f"File uploaded to OpenWebUI: {filename}",
"word_count": word_count,
"timestamp": str(file_info.get("created_at") or file_info.get("timestamp", "")),
}
documents.append(doc_info)
return documents
except Exception as e:
# Since OpenWebUI file listing API structure is not guaranteed,
# we gracefully fall back rather than raise an error
            LOGGER.warning("OpenWebUI document listing failed: %s", e)
# Return a placeholder entry indicating limited support
return [
{
"id": "api_error",
"title": f"Knowledge Base: {collection_name or 'default'}",
"source_url": "",
"description": "Document listing encountered an error - API compatibility issue",
"content_type": "text/plain",
"content_preview": f"Error: {str(e)[:100]}...",
"word_count": 0,
"timestamp": "",
}
]
async def close(self) -> None:
"""Close client connection."""
if hasattr(self, "http_client"):
await self.http_client.close()
</file>
<file path="ingest_pipeline/storage/weaviate.py">
"""Weaviate storage adapter."""
import asyncio
from collections.abc import AsyncGenerator, Callable, Mapping, Sequence
from datetime import UTC, datetime
from typing import Literal, Self, TypeAlias, TypeVar, cast, overload
from uuid import UUID
import weaviate
from typing_extensions import override
from weaviate.classes.config import Configure, DataType, Property
from weaviate.classes.data import DataObject
from weaviate.classes.query import Filter
from weaviate.collections import Collection
from weaviate.exceptions import (
WeaviateBatchError,
WeaviateConnectionError,
WeaviateQueryError,
)
from ..core.exceptions import StorageError
from ..core.models import Document, DocumentMetadata, IngestionSource, StorageConfig
from ..utils.vectorizer import Vectorizer
from .base import BaseStorage
from .types import CollectionSummary, DocumentInfo
VectorContainer: TypeAlias = Mapping[str, object] | Sequence[object] | None
T = TypeVar("T")
class WeaviateStorage(BaseStorage):
"""Storage adapter for Weaviate."""
client: weaviate.WeaviateClient | None
vectorizer: Vectorizer
_default_collection: str
def __init__(self, config: StorageConfig):
"""
Initialize Weaviate storage.
Args:
config: Storage configuration
"""
super().__init__(config)
self.client = None
self.vectorizer = Vectorizer(config)
self._default_collection = self._normalize_collection_name(config.collection_name)
async def _run_sync(self, func: Callable[..., T], *args: object, **kwargs: object) -> T:
"""
Run synchronous Weaviate operations in thread pool to avoid blocking event loop.
Args:
func: Synchronous function to run
*args: Positional arguments for the function
**kwargs: Keyword arguments for the function
Returns:
Result of the function call
Raises:
StorageError: If the operation fails
"""
try:
return await asyncio.to_thread(func, *args, **kwargs)
except (WeaviateConnectionError, WeaviateBatchError, WeaviateQueryError) as e:
raise StorageError(f"Weaviate operation failed: {e}") from e
except Exception as e:
raise StorageError(f"Unexpected error in Weaviate operation: {e}") from e
@override
async def initialize(self) -> None:
"""Initialize Weaviate client and create collection if needed."""
try:
# Let Weaviate client handle URL parsing
self.client = weaviate.WeaviateClient(
connection_params=weaviate.connect.ConnectionParams.from_url(
url=str(self.config.endpoint),
grpc_port=self.config.grpc_port or 50051,
),
additional_config=weaviate.classes.init.AdditionalConfig(
timeout=weaviate.classes.init.Timeout(init=30, query=60, insert=120),
),
)
# Connect to the client
await self._run_sync(self.client.connect)
# Ensure the default collection exists
await self._ensure_collection(self._default_collection)
except WeaviateConnectionError as e:
raise StorageError(f"Failed to connect to Weaviate: {e}") from e
except Exception as e:
raise StorageError(f"Failed to initialize Weaviate: {e}") from e
async def _create_collection(self, collection_name: str) -> None:
"""Create Weaviate collection with schema."""
if not self.client:
raise StorageError("Weaviate client not initialized")
try:
await self._run_sync(
self.client.collections.create,
name=collection_name,
properties=[
Property(
name="content", data_type=DataType.TEXT, description="Document content"
),
Property(name="source_url", data_type=DataType.TEXT, description="Source URL"),
Property(name="title", data_type=DataType.TEXT, description="Document title"),
Property(
name="description",
data_type=DataType.TEXT,
description="Document description",
),
Property(
name="timestamp", data_type=DataType.DATE, description="Ingestion timestamp"
),
Property(
name="content_type", data_type=DataType.TEXT, description="Content type"
),
Property(name="word_count", data_type=DataType.INT, description="Word count"),
Property(
name="char_count", data_type=DataType.INT, description="Character count"
),
Property(
name="source", data_type=DataType.TEXT, description="Ingestion source"
),
],
vectorizer_config=Configure.Vectorizer.none(),
)
except (WeaviateConnectionError, WeaviateBatchError) as e:
raise StorageError(f"Failed to create collection: {e}") from e
@staticmethod
def _extract_vector(vector_raw: VectorContainer) -> list[float] | None:
"""Normalize vector payloads returned by Weaviate into a float list."""
if isinstance(vector_raw, Mapping):
default_vector = vector_raw.get("default")
return WeaviateStorage._extract_vector(cast(VectorContainer, default_vector))
if not isinstance(vector_raw, Sequence) or isinstance(vector_raw, (str, bytes, bytearray)):
return None
items = list(vector_raw)
if not items:
return None
first_item = items[0]
if isinstance(first_item, (int, float)):
numeric_items = cast(list[int | float], items)
try:
return [float(value) for value in numeric_items]
except (TypeError, ValueError):
return None
if isinstance(first_item, Sequence) and not isinstance(first_item, (str, bytes, bytearray)):
inner_items = list(first_item)
if all(isinstance(item, (int, float)) for item in inner_items):
try:
numeric_inner = cast(list[int | float], inner_items)
return [float(item) for item in numeric_inner]
except (TypeError, ValueError):
return None
return None
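    # Normalization examples (illustrative values):
    #   {"default": [0.1, 0.2]}   -> [0.1, 0.2]   (named-vector mapping)
    #   [0.1, 0.2, 0.3]           -> [0.1, 0.2, 0.3]
    #   [[0.1, 0.2], [0.3, 0.4]]  -> [0.1, 0.2]   (first inner vector wins)
    #   "abc", b"abc", [] or None -> None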
@staticmethod
def _parse_source(source_raw: object) -> IngestionSource:
"""Safely normalize persistence source values into enum instances."""
if isinstance(source_raw, IngestionSource):
return source_raw
if isinstance(source_raw, str):
try:
return IngestionSource(source_raw)
except ValueError:
return IngestionSource.WEB
return IngestionSource.WEB
@staticmethod
@overload
def _coerce_properties(
properties: object,
*,
context: str,
) -> Mapping[str, object]: ...
@staticmethod
@overload
def _coerce_properties(
properties: object,
*,
context: str,
allow_missing: Literal[False],
) -> Mapping[str, object]: ...
@staticmethod
@overload
def _coerce_properties(
properties: object,
*,
context: str,
allow_missing: Literal[True],
) -> Mapping[str, object] | None: ...
@staticmethod
def _coerce_properties(
properties: object,
*,
context: str,
allow_missing: bool = False,
) -> Mapping[str, object] | None:
"""Ensure Weaviate properties payloads are mappings."""
if properties is None:
if allow_missing:
return None
raise StorageError(f"{context} returned object without properties")
if not isinstance(properties, Mapping):
raise StorageError(
f"{context} returned invalid properties payload of type {type(properties)!r}"
)
return cast(Mapping[str, object], properties)
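    # The overloads above exist purely for type narrowing: callers passing
    # allow_missing=True get Mapping | None back, while all other callers are
    # guaranteed a Mapping (or a StorageError), so no redundant None checks.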
@staticmethod
def _build_document_properties(doc: Document) -> dict[str, object]:
"""
Build Weaviate properties dict from document.
Args:
doc: Document to build properties for
Returns:
Properties dict suitable for Weaviate
"""
return {
"content": doc.content,
"source_url": doc.metadata["source_url"],
"title": doc.metadata.get("title", ""),
"description": doc.metadata.get("description", ""),
"timestamp": doc.metadata["timestamp"].isoformat(),
"content_type": doc.metadata["content_type"],
"word_count": doc.metadata["word_count"],
"char_count": doc.metadata["char_count"],
"source": doc.source.value,
}
def _normalize_collection_name(self, collection_name: str | None) -> str:
"""Return a canonicalized collection name, defaulting to configured value."""
candidate = collection_name or self.config.collection_name
if not candidate:
raise StorageError("Collection name is required")
if normalized := candidate.strip():
return normalized[0].upper() + normalized[1:]
else:
raise StorageError("Collection name cannot be empty")
async def _ensure_collection(self, collection_name: str) -> None:
"""Create the collection if missing."""
if not self.client:
raise StorageError("Weaviate client not initialized")
client = self.client
        # Run the blocking schema listing off the event loop
        existing = await self._run_sync(client.collections.list_all)
if collection_name not in existing:
await self._create_collection(collection_name)
async def _prepare_collection(
self,
collection_name: str | None,
*,
ensure_exists: bool,
) -> tuple[Collection, str]:
"""Return a ready collection handle and normalized name."""
normalized = self._normalize_collection_name(collection_name)
if not self.client:
raise StorageError("Weaviate client not initialized")
if ensure_exists:
await self._ensure_collection(normalized)
client = self.client
return client.collections.get(normalized), normalized
@override
async def store(self, document: Document, *, collection_name: str | None = None) -> str:
"""
Store a document in Weaviate.
Args:
document: Document to store
Returns:
Document ID
"""
try:
# Vectorize content if no vector provided
if document.vector is None:
document.vector = await self.vectorizer.vectorize(document.content)
collection, resolved_name = await self._prepare_collection(
collection_name, ensure_exists=True
)
# Prepare properties
properties = self._build_document_properties(document)
# Insert with vector
result = await self._run_sync(
collection.data.insert,
properties=properties,
vector=document.vector,
uuid=str(document.id),
)
return str(result)
except (WeaviateConnectionError, WeaviateBatchError, WeaviateQueryError) as e:
raise StorageError(f"Failed to store document: {e}") from e
@override
async def store_batch(
self, documents: list[Document], *, collection_name: str | None = None
) -> list[str]:
"""
Store multiple documents using proper batch operations.
Args:
documents: List of documents
Returns:
List of successfully stored document IDs
"""
try:
collection, resolved_name = await self._prepare_collection(
collection_name, ensure_exists=True
)
# Vectorize documents without vectors using batch processing
to_vectorize = [(i, doc) for i, doc in enumerate(documents) if doc.vector is None]
if to_vectorize:
contents = [doc.content for _, doc in to_vectorize]
vectors = await self.vectorizer.vectorize_batch(contents)
for (idx, _), vector in zip(to_vectorize, vectors, strict=False):
documents[idx].vector = vector
# Prepare batch data for insert_many
batch_objects = []
for doc in documents:
properties = self._build_document_properties(doc)
batch_objects.append(
DataObject(properties=properties, vector=doc.vector, uuid=str(doc.id))
)
# Insert batch using insert_many
response = await self._run_sync(collection.data.insert_many, batch_objects)
successful_ids: list[str] = []
error_indices = set(response.errors.keys()) if response else set()
for index, doc in enumerate(documents):
if index in error_indices:
continue
uuid_value = response.uuids.get(index) if response else None
successful_ids.append(str(uuid_value) if uuid_value is not None else str(doc.id))
if error_indices:
error_messages = ", ".join(
f"{documents[i].id}: {response.errors[i].message}"
for i in error_indices
if hasattr(response.errors[i], "message")
)
                import logging
                logging.warning(
                    "Weaviate partial batch failure for collection %s: %s",
                    resolved_name,
                    error_messages,
                )
return successful_ids
except (WeaviateBatchError, WeaviateConnectionError, WeaviateQueryError) as e:
raise StorageError(f"Failed to store batch: {e}") from e
@override
async def retrieve(
self, document_id: str, *, collection_name: str | None = None
) -> Document | None:
"""
Retrieve a document from Weaviate.
Args:
document_id: Document ID
Returns:
Document or None
"""
try:
collection, resolved_name = await self._prepare_collection(
collection_name, ensure_exists=False
)
result = await self._run_sync(collection.query.fetch_object_by_id, document_id)
if not result:
return None
# Reconstruct document
props = self._coerce_properties(
result.properties,
context="fetch_object_by_id",
)
# Parse timestamp to datetime for consistent metadata format
from datetime import UTC, datetime
timestamp_raw = props.get("timestamp")
timestamp_parsed: datetime
try:
if isinstance(timestamp_raw, str):
timestamp_parsed = datetime.fromisoformat(timestamp_raw)
if timestamp_parsed.tzinfo is None:
timestamp_parsed = timestamp_parsed.replace(tzinfo=UTC)
elif isinstance(timestamp_raw, datetime):
timestamp_parsed = timestamp_raw
if timestamp_parsed.tzinfo is None:
timestamp_parsed = timestamp_parsed.replace(tzinfo=UTC)
else:
timestamp_parsed = datetime.now(UTC)
except (ValueError, TypeError):
timestamp_parsed = datetime.now(UTC)
metadata_dict = {
"source_url": str(props["source_url"]),
"title": str(props.get("title")) if props.get("title") else None,
"description": str(props.get("description")) if props.get("description") else None,
"timestamp": timestamp_parsed,
"content_type": str(props["content_type"]),
"word_count": int(str(props["word_count"])),
"char_count": int(str(props["char_count"])),
}
metadata = cast(DocumentMetadata, cast(object, metadata_dict))
vector = self._extract_vector(cast(VectorContainer, result.vector))
return Document(
id=UUID(document_id),
content=str(props["content"]),
metadata=metadata,
vector=vector,
source=self._parse_source(props.get("source")),
collection=resolved_name,
)
except WeaviateQueryError as e:
raise StorageError(f"Query failed: {e}") from e
except WeaviateConnectionError as e:
# Connection issues should be logged and return None
import logging
logging.warning(f"Weaviate connection error retrieving document {document_id}: {e}")
return None
except Exception as e:
# Log unexpected errors for debugging
import logging
logging.warning(f"Unexpected error retrieving document {document_id}: {e}")
return None
def _build_search_metadata(self, props: Mapping[str, object]) -> DocumentMetadata:
"""Build metadata dictionary from Weaviate properties."""
metadata_dict = {
"source_url": str(props["source_url"]),
"title": str(props.get("title")) if props.get("title") else None,
"description": str(props.get("description")) if props.get("description") else None,
"timestamp": str(props["timestamp"]),
"content_type": str(props["content_type"]),
"word_count": int(str(props["word_count"])),
"char_count": int(str(props["char_count"])),
}
return cast(DocumentMetadata, cast(object, metadata_dict))
def _extract_search_score(self, result: object) -> float | None:
"""Extract and convert search score from result metadata."""
metadata_obj = getattr(result, "metadata", None)
if metadata_obj is None:
return None
raw_distance = getattr(metadata_obj, "distance", None)
if raw_distance is None:
return None
try:
distance_value = float(raw_distance)
return max(0.0, 1.0 - distance_value)
except (TypeError, ValueError) as e:
import logging
logging.debug(f"Invalid distance value {raw_distance}: {e}")
return None
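    # Worked example (illustrative): a reported distance of 0.25 converts to
    # a similarity score of max(0.0, 1.0 - 0.25) == 0.75; distances >= 1.0
    # clamp to a score of 0.0.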
def _build_search_document(
self,
result: object,
resolved_name: str,
) -> Document:
"""Build Document from Weaviate search result."""
props = self._coerce_properties(
getattr(result, "properties", None),
context="search result",
)
metadata = self._build_search_metadata(props)
vector_attr = getattr(result, "vector", None)
vector = self._extract_vector(cast(VectorContainer, vector_attr))
score_value = self._extract_search_score(result)
uuid_raw = getattr(result, "uuid", None)
if uuid_raw is None:
raise StorageError("Weaviate search result missing uuid")
uuid_value = uuid_raw if isinstance(uuid_raw, UUID) else UUID(str(uuid_raw))
return Document(
id=uuid_value,
content=str(props["content"]),
metadata=metadata,
vector=vector,
source=self._parse_source(props.get("source")),
collection=resolved_name,
score=score_value,
)
@override
async def search(
self,
query: str,
limit: int = 10,
threshold: float = 0.7,
*,
collection_name: str | None = None,
) -> AsyncGenerator[Document, None]:
"""
Search for documents in Weaviate using hybrid search.
Args:
query: Search query
limit: Maximum results
threshold: Similarity threshold (not used in hybrid search)
Yields:
Matching documents
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection, resolved_name = await self._prepare_collection(
collection_name, ensure_exists=False
)
# Try hybrid search first, fall back to BM25 keyword search
try:
response = await self._run_sync(
collection.query.hybrid, query=query, limit=limit, return_metadata=["score"]
)
except (WeaviateQueryError, StorageError):
# Fall back to BM25 if hybrid search is not supported or fails
response = await self._run_sync(
collection.query.bm25, query=query, limit=limit, return_metadata=["score"]
)
for obj in response.objects:
yield self._build_document_from_search(obj, resolved_name)
except (WeaviateQueryError, WeaviateConnectionError) as e:
raise StorageError(f"Search failed: {e}") from e
@override
async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool:
"""
Delete a document from Weaviate.
Args:
document_id: Document ID
Returns:
True if deleted
"""
try:
collection, _ = await self._prepare_collection(collection_name, ensure_exists=False)
await self._run_sync(collection.data.delete_by_id, document_id)
return True
except WeaviateQueryError as e:
raise StorageError(f"Delete operation failed: {e}") from e
except Exception:
return False
@override
async def count(self, *, collection_name: str | None = None) -> int:
"""
Get document count in collection.
Returns:
Number of documents
"""
try:
if not self.client:
return 0
collection, _ = await self._prepare_collection(collection_name, ensure_exists=False)
            # Run the blocking aggregate call off the event loop
            result = await self._run_sync(collection.aggregate.over_all, total_count=True)
return result.total_count or 0
except WeaviateQueryError as e:
raise StorageError(f"Count query failed: {e}") from e
except Exception:
return 0
async def list_collections(self) -> list[str]:
"""
List all available collections.
Returns:
List of collection names
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
client = self.client
            return list(await self._run_sync(client.collections.list_all))
except WeaviateConnectionError as e:
raise StorageError(f"Failed to list collections: {e}") from e
async def describe_collections(self) -> list[CollectionSummary]:
"""Return metadata for each Weaviate collection."""
if not self.client:
raise StorageError("Weaviate client not initialized")
try:
client = self.client
collections: list[CollectionSummary] = []
            for name in await self._run_sync(client.collections.list_all):
                collection_obj = client.collections.get(name)
                if not collection_obj:
                    continue
                aggregate = await self._run_sync(
                    collection_obj.aggregate.over_all, total_count=True
                )
                count = aggregate.total_count or 0
                size_mb = count * 0.01  # rough heuristic: ~0.01 MB per stored object
collection_summary: CollectionSummary = {
"name": name,
"count": count,
"size_mb": size_mb,
}
collections.append(collection_summary)
return collections
except Exception as e:
raise StorageError(f"Failed to describe collections: {e}") from e
async def sample_documents(
self, limit: int = 5, *, collection_name: str | None = None
) -> list[Document]:
"""
Get sample documents from the collection.
Args:
limit: Maximum number of documents to return
Returns:
List of sample documents
"""
try:
collection, resolved_name = await self._prepare_collection(
collection_name, ensure_exists=False
)
# Query for sample documents
response = await self._run_sync(collection.query.fetch_objects, limit=limit)
documents = []
for obj in response.objects:
# Convert back to Document format
props = self._coerce_properties(
getattr(obj, "properties", None),
context="sample_documents",
allow_missing=True,
)
if props is None:
continue
uuid_raw = getattr(obj, "uuid", None)
if uuid_raw is None:
continue
document_id = uuid_raw if isinstance(uuid_raw, UUID) else UUID(str(uuid_raw))
                # Rebuild the stored fields through the shared helpers instead of
                # duplicating the WeaviateField conversion logic inline
                doc = Document(
                    id=document_id,
                    content=str(props.get("content", "")),
                    source=self._parse_source(props.get("source")),
                    metadata=self._build_document_metadata(props),
                    collection=resolved_name,
                )
documents.append(doc)
return documents
except Exception as e:
raise StorageError(f"Failed to sample documents: {e}") from e
def _safe_convert_count(self, value: object) -> int:
"""Safely convert a value to integer count."""
if isinstance(value, (int, float)):
return int(value)
elif value:
return int(str(value))
else:
return 0
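    # Behavior sketch for the converter above (illustrative literals):
    #     _safe_convert_count(7)     -> 7
    #     _safe_convert_count(3.9)   -> 3   (int() truncates floats)
    #     _safe_convert_count("42")  -> 42  (truthy strings are re-parsed)
    #     _safe_convert_count(None)  -> 0
    #     _safe_convert_count("")    -> 0   (falsy values fall through)
    # Note: a non-numeric string such as "n/a" raises ValueError here.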
def _build_document_metadata(self, props: Mapping[str, object]) -> DocumentMetadata:
"""Build metadata from search document properties."""
return {
"source_url": str(props.get("source_url", "")),
"title": str(props.get("title", "")) if props.get("title") else None,
"description": str(props.get("description", "")) if props.get("description") else None,
"timestamp": datetime.fromisoformat(
str(props.get("timestamp", datetime.now(UTC).isoformat()))
),
"content_type": str(props.get("content_type", "text/plain")),
"word_count": self._safe_convert_count(props.get("word_count")),
"char_count": self._safe_convert_count(props.get("char_count")),
}
def _extract_document_score(self, obj: object) -> float | None:
"""Extract score from document search result."""
metadata_obj = getattr(obj, "metadata", None)
if metadata_obj is None:
return None
raw_score = getattr(metadata_obj, "score", None)
if raw_score is None:
return None
try:
return float(raw_score)
except (TypeError, ValueError) as e:
import logging
logging.debug(f"Invalid score value {raw_score}: {e}")
return None
def _build_document_from_search(
self,
obj: object,
resolved_name: str,
) -> Document:
"""Build Document from search document result."""
props = self._coerce_properties(
getattr(obj, "properties", None),
context="document search result",
)
metadata = self._build_document_metadata(props)
score_value = self._extract_document_score(obj)
uuid_raw = getattr(obj, "uuid", None)
if uuid_raw is None:
raise StorageError("Weaviate search document result missing uuid")
uuid_value = uuid_raw if isinstance(uuid_raw, UUID) else UUID(str(uuid_raw))
return Document(
id=uuid_value,
content=str(props.get("content", "")),
source=self._parse_source(props.get("source")),
metadata=metadata,
collection=resolved_name,
score=score_value,
)
async def search_documents(
self, query: str, limit: int = 10, *, collection_name: str | None = None
) -> list[Document]:
"""
Search documents in the collection.
Args:
query: Search query
limit: Maximum number of results
Returns:
List of matching documents
"""
# Delegate to the unified search method
results = []
async for document in self.search(query, limit=limit, collection_name=collection_name):
results.append(document)
return results
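    # Usage sketch (hypothetical): same matches as search(), collected eagerly
    # into a list; prefer iterating search() directly when streaming matters.
    #
    #     docs = await storage.search_documents("weaviate filters", limit=3)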
async def list_documents(
self,
limit: int = 100,
offset: int = 0,
*,
collection_name: str | None = None,
) -> list[DocumentInfo]:
"""
List documents in the collection with pagination.
Args:
limit: Maximum number of documents to return
offset: Number of documents to skip
Returns:
List of document dictionaries with id, title, source_url, and content preview
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection, _ = await self._prepare_collection(collection_name, ensure_exists=False)
# Query documents with pagination
response = await self._run_sync(
collection.query.fetch_objects,
limit=limit,
offset=offset,
return_metadata=["creation_time"],
)
documents: list[DocumentInfo] = []
for obj in response.objects:
props = self._coerce_properties(
obj.properties,
context="list_documents",
allow_missing=True,
)
if props is None:
continue
content = str(props.get("content", ""))
                word_count = self._safe_convert_count(props.get("word_count", 0))
doc_info: DocumentInfo = {
"id": str(obj.uuid),
"title": str(props.get("title", "Untitled")),
"source_url": str(props.get("source_url", "")),
"description": str(props.get("description", "")),
"content_type": str(props.get("content_type", "text/plain")),
"content_preview": (f"{content[:200]}..." if len(content) > 200 else content),
"word_count": word_count,
"timestamp": str(props.get("timestamp", "")),
}
documents.append(doc_info)
return documents
except Exception as e:
raise StorageError(f"Failed to list documents: {e}") from e
async def delete_documents(
self, document_ids: list[str], *, collection_name: str | None = None
) -> dict[str, bool]:
"""
Delete multiple documents from Weaviate.
Args:
document_ids: List of document IDs to delete
Returns:
Dictionary mapping document IDs to deletion success status
"""
results: dict[str, bool] = {}
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
if not document_ids:
return results
            collection, _ = await self._prepare_collection(collection_name, ensure_exists=False)
delete_filter = Filter.by_id().contains_any(document_ids)
response = await self._run_sync(
collection.data.delete_many, where=delete_filter, verbose=True
)
if objects := getattr(response, "objects", None):
for result_obj in objects:
if doc_uuid := str(getattr(result_obj, "uuid", "")):
results[doc_uuid] = bool(getattr(result_obj, "successful", False))
if len(results) < len(document_ids):
default_success = getattr(response, "failed", 0) == 0
for doc_id in document_ids:
_ = results.setdefault(doc_id, default_success)
return results
except Exception as e:
raise StorageError(f"Failed to delete documents: {e}") from e
async def delete_by_filter(
self, filter_dict: dict[str, str], *, collection_name: str | None = None
) -> int:
"""
Delete documents matching a filter.
Args:
filter_dict: Filter criteria (e.g., {"source_url": "example.com"})
Returns:
Number of documents deleted
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection, _ = await self._prepare_collection(collection_name, ensure_exists=False)
            # Build where filter (only source_url is supported today); reject
            # unknown keys instead of silently deleting unfiltered documents
            where_filter = None
            if "source_url" in filter_dict:
                where_filter = Filter.by_property("source_url").equal(filter_dict["source_url"])
            elif filter_dict:
                raise StorageError(f"Unsupported filter keys: {sorted(filter_dict)}")
# Get documents matching filter
if where_filter:
response = await self._run_sync(
collection.query.fetch_objects,
filters=where_filter,
limit=1000, # Max batch size
)
else:
response = await self._run_sync(
collection.query.fetch_objects,
limit=1000, # Max batch size
)
# Delete matching documents
deleted_count = 0
for obj in response.objects:
try:
await self._run_sync(collection.data.delete_by_id, obj.uuid)
deleted_count += 1
except Exception:
continue
return deleted_count
except Exception as e:
raise StorageError(f"Failed to delete by filter: {e}") from e
async def delete_collection(self, collection_name: str | None = None) -> bool:
"""
Delete the entire collection.
Returns:
True if successful
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
target = self._normalize_collection_name(collection_name)
            # Delete the collection using the client's collections API; the call
            # is synchronous, so run it off the event loop
            client = self.client
            await self._run_sync(client.collections.delete, target)
            return True
except Exception as e:
raise StorageError(f"Failed to delete collection: {e}") from e
async def __aenter__(self) -> Self:
"""Async context manager entry."""
return self
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: object | None,
) -> None:
"""Async context manager exit with proper cleanup."""
await self.close()
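    # Usage sketch (hypothetical): the context manager guarantees both the
    # Weaviate client and the vectorizer HTTP client are closed on exit.
    #
    #     async with storage:
    #         total = await storage.count()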
async def close(self) -> None:
"""Close client connection and vectorizer HTTP client."""
        import logging

        if self.client:
            try:
                client = self.client
                client.close()
            except (WeaviateConnectionError, AttributeError) as e:
                logging.warning(f"Error closing Weaviate client: {e}")
        # Close vectorizer HTTP client to prevent resource leaks
        try:
            await self.vectorizer.close()
        except (AttributeError, OSError) as e:
            logging.warning(f"Error closing vectorizer client: {e}")
def __del__(self) -> None:
"""Clean up client connection as fallback."""
if self.client:
try:
client = self.client
client.close()
except Exception:
pass # Ignore errors in destructor
</file>
</files>