This commit is contained in:
2025-09-19 08:31:36 +00:00
parent 2d01228f24
commit 8185eaaf93
16 changed files with 1382 additions and 1235 deletions

View File

@@ -46,7 +46,9 @@ uv run ruff check .
uv run ruff format .
# Type checking
uv run mypy ingest_pipeline
uv run pyrefly check ingest_pipeline/ tests/
uv run basedpyright
uv run sourcery review ingest_pipeline/ tests/ --fix
# Install dev dependencies
uv sync --dev

View File

@@ -2,10 +2,11 @@
from __future__ import annotations
from typing import Any
from typing import cast
from textual.app import ComposeResult
from textual.containers import Container, VerticalScroll
from textual.widget import Widget
from textual.widgets import Static
from typing_extensions import override
@@ -13,7 +14,7 @@ from typing_extensions import override
class ResponsiveGrid(Container):
"""Grid that auto-adjusts based on terminal size."""
DEFAULT_CSS = """
DEFAULT_CSS: str = """
ResponsiveGrid {
layout: grid;
grid-size: 1;
@@ -45,58 +46,64 @@ class ResponsiveGrid(Container):
def __init__(
self,
*children: Any,
*children: Widget,
columns: int = 1,
auto_fit: bool = False,
compact: bool = False,
**kwargs: Any,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
disabled: bool = False,
markup: bool = True,
) -> None:
"""Initialize responsive grid."""
super().__init__(*children, **kwargs)
self.columns = columns
self.auto_fit = auto_fit
self.compact = compact
super().__init__(*children, name=name, id=id, classes=classes, disabled=disabled, markup=markup)
self._columns: int = columns
self._auto_fit: bool = auto_fit
self._compact: bool = compact
def on_mount(self) -> None:
"""Apply responsive classes based on configuration."""
if self.auto_fit:
_ = self.add_class("auto-fit")
elif self.columns == 2:
_ = self.add_class("two-column")
elif self.columns == 3:
_ = self.add_class("three-column")
widget = cast(Widget, self)
if self._auto_fit:
widget.add_class("auto-fit")
elif self._columns == 2:
widget.add_class("two-column")
elif self._columns == 3:
widget.add_class("three-column")
if self.compact:
_ = self.add_class("compact")
if self._compact:
widget.add_class("compact")
def on_resize(self) -> None:
"""Adjust layout based on terminal size."""
if self.auto_fit:
if self._auto_fit:
# Let CSS handle auto-fit
return
terminal_width = self.size.width
widget = cast(Widget, self)
terminal_width = widget.size.width
if terminal_width < 60:
# Force single column on narrow terminals
_ = self.remove_class("two-column", "three-column")
self.styles.grid_size_columns = 1
self.styles.grid_columns = "1fr"
elif terminal_width < 100 and self.columns > 2:
widget.remove_class("two-column", "three-column")
widget.styles.grid_size_columns = 1
widget.styles.grid_columns = "1fr"
elif terminal_width < 100 and self._columns > 2:
# Force two columns on medium terminals
_ = self.remove_class("three-column")
_ = self.add_class("two-column")
self.styles.grid_size_columns = 2
self.styles.grid_columns = "1fr 1fr"
elif self.columns == 2:
_ = self.add_class("two-column")
elif self.columns == 3:
_ = self.add_class("three-column")
widget.remove_class("three-column")
widget.add_class("two-column")
widget.styles.grid_size_columns = 2
widget.styles.grid_columns = "1fr 1fr"
elif self._columns == 2:
widget.add_class("two-column")
elif self._columns == 3:
widget.add_class("three-column")
class CollapsibleSidebar(Container):
"""Sidebar that can be collapsed to save space."""
DEFAULT_CSS = """
DEFAULT_CSS: str = """
CollapsibleSidebar {
dock: left;
width: 25%;
@@ -133,11 +140,20 @@ class CollapsibleSidebar(Container):
}
"""
def __init__(self, *children: Any, collapsed: bool = False, **kwargs: Any) -> None:
def __init__(
self,
*children: Widget,
collapsed: bool = False,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
disabled: bool = False,
markup: bool = True,
) -> None:
"""Initialize collapsible sidebar."""
super().__init__(**kwargs)
self.collapsed = collapsed
self._children = children
super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)
self._collapsed: bool = collapsed
self._children: tuple[Widget, ...] = children
@override
def compose(self) -> ComposeResult:
@@ -148,8 +164,8 @@ class CollapsibleSidebar(Container):
def on_mount(self) -> None:
"""Apply initial collapsed state."""
if self.collapsed:
_ = self.add_class("collapsed")
if self._collapsed:
cast(Widget, self).add_class("collapsed")
def on_click(self) -> None:
"""Toggle sidebar when clicked."""
@@ -157,27 +173,28 @@ class CollapsibleSidebar(Container):
def toggle(self) -> None:
"""Toggle sidebar collapsed state."""
self.collapsed = not self.collapsed
if self.collapsed:
_ = self.add_class("collapsed")
self._collapsed = not self._collapsed
widget = cast(Widget, self)
if self._collapsed:
widget.add_class("collapsed")
else:
_ = self.remove_class("collapsed")
widget.remove_class("collapsed")
def expand_sidebar(self) -> None:
"""Expand sidebar."""
if self.collapsed:
if self._collapsed:
self.toggle()
def collapse_sidebar(self) -> None:
"""Collapse sidebar."""
if not self.collapsed:
if not self._collapsed:
self.toggle()
class TabularLayout(Container):
"""Optimized layout for data tables with optional sidebar."""
DEFAULT_CSS = """
DEFAULT_CSS: str = """
TabularLayout {
layout: horizontal;
height: 100%;
@@ -215,18 +232,22 @@ class TabularLayout(Container):
def __init__(
self,
table_widget: Any,
header_content: Any | None = None,
footer_content: Any | None = None,
sidebar_content: Any | None = None,
**kwargs: Any,
table_widget: Widget,
header_content: Widget | None = None,
footer_content: Widget | None = None,
sidebar_content: Widget | None = None,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
disabled: bool = False,
markup: bool = True,
) -> None:
"""Initialize tabular layout."""
super().__init__(**kwargs)
self.table_widget = table_widget
self.header_content = header_content
self.footer_content = footer_content
self.sidebar_content = sidebar_content
super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)
self.table_widget: Widget = table_widget
self.header_content: Widget | None = header_content
self.footer_content: Widget | None = footer_content
self.sidebar_content: Widget | None = sidebar_content
@override
def compose(self) -> ComposeResult:
@@ -247,7 +268,7 @@ class TabularLayout(Container):
class CardLayout(ResponsiveGrid):
"""Grid layout optimized for card-based content."""
DEFAULT_CSS = """
DEFAULT_CSS: str = """
CardLayout {
grid-gutter: 2;
padding: 2;
@@ -296,16 +317,23 @@ class CardLayout(ResponsiveGrid):
}
"""
def __init__(self, **kwargs: Any) -> None:
def __init__(
self,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
disabled: bool = False,
markup: bool = True,
) -> None:
"""Initialize card layout with default settings for cards."""
# Default to auto-fit cards with minimum width
super().__init__(auto_fit=True, **kwargs)
super().__init__(auto_fit=True, name=name, id=id, classes=classes, disabled=disabled, markup=markup)
class SplitPane(Container):
"""Resizable split pane layout."""
DEFAULT_CSS = """
DEFAULT_CSS: str = """
SplitPane {
layout: horizontal;
height: 100%;
@@ -345,36 +373,41 @@ class SplitPane(Container):
def __init__(
self,
left_content: Any,
right_content: Any,
left_content: Widget,
right_content: Widget,
vertical: bool = False,
split_ratio: float = 0.5,
**kwargs: Any,
name: str | None = None,
id: str | None = None,
classes: str | None = None,
disabled: bool = False,
markup: bool = True,
) -> None:
"""Initialize split pane."""
super().__init__(**kwargs)
self.left_content = left_content
self.right_content = right_content
self.vertical = vertical
self.split_ratio = split_ratio
super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)
self._left_content: Widget = left_content
self._right_content: Widget = right_content
self._vertical: bool = vertical
self._split_ratio: float = split_ratio
@override
def compose(self) -> ComposeResult:
"""Compose split pane layout."""
if self.vertical:
_ = self.add_class("vertical")
if self._vertical:
cast(Widget, self).add_class("vertical")
pane_classes = ("top-pane", "bottom-pane") if self.vertical else ("left-pane", "right-pane")
pane_classes = ("top-pane", "bottom-pane") if self._vertical else ("left-pane", "right-pane")
yield Container(self.left_content, classes=pane_classes[0])
yield Container(self._left_content, classes=pane_classes[0])
yield Static("", classes="splitter")
yield Container(self.right_content, classes=pane_classes[1])
yield Container(self._right_content, classes=pane_classes[1])
def on_mount(self) -> None:
"""Apply split ratio."""
if self.vertical:
self.query_one(f".{self.__class__.__name__} .top-pane").styles.height = f"{self.split_ratio * 100}%"
self.query_one(f".{self.__class__.__name__} .bottom-pane").styles.height = f"{(1 - self.split_ratio) * 100}%"
widget = cast(Widget, self)
if self._vertical:
widget.query_one(".top-pane").styles.height = f"{self._split_ratio * 100}%"
widget.query_one(".bottom-pane").styles.height = f"{(1 - self._split_ratio) * 100}%"
else:
self.query_one(f".{self.__class__.__name__} .left-pane").styles.width = f"{self.split_ratio * 100}%"
self.query_one(f".{self.__class__.__name__} .right-pane").styles.width = f"{(1 - self.split_ratio) * 100}%"
widget.query_one(".left-pane").styles.width = f"{self._split_ratio * 100}%"
widget.query_one(".right-pane").styles.width = f"{(1 - self._split_ratio) * 100}%"

View File

@@ -387,7 +387,12 @@ class CollectionOverviewScreen(Screen[None]):
async def list_openwebui_collections(self) -> list[CollectionInfo]:
"""List OpenWebUI collections with enhanced metadata."""
# Try to get OpenWebUI backend from storage manager if direct instance not available
openwebui_backend = self.openwebui or self.storage_manager.get_backend(StorageBackend.OPEN_WEBUI)
openwebui_backend = self.openwebui
if not openwebui_backend:
backend = self.storage_manager.get_backend(StorageBackend.OPEN_WEBUI)
if not isinstance(backend, OpenWebUIStorage):
return []
openwebui_backend = backend
if not openwebui_backend:
return []
@@ -578,42 +583,42 @@ class CollectionOverviewScreen(Screen[None]):
def action_tab_dashboard(self) -> None:
"""Switch to dashboard tab."""
tabs = self.query_one(TabbedContent)
tabs.active = "dashboard"
tabbed_content: TabbedContent = self.query_one(TabbedContent)
tabbed_content.active = "dashboard"
def action_tab_collections(self) -> None:
"""Switch to collections tab."""
tabs = self.query_one(TabbedContent)
tabs.active = "collections"
tabbed_content: TabbedContent = self.query_one(TabbedContent)
tabbed_content.active = "collections"
def action_tab_analytics(self) -> None:
"""Switch to analytics tab."""
tabs = self.query_one(TabbedContent)
tabs.active = "analytics"
tabbed_content: TabbedContent = self.query_one(TabbedContent)
tabbed_content.active = "analytics"
def action_next_tab(self) -> None:
"""Switch to next tab."""
tabs = self.query_one(TabbedContent)
tabbed_content: TabbedContent = self.query_one(TabbedContent)
tab_ids = ["dashboard", "collections", "analytics"]
current = tabs.active
current = tabbed_content.active
try:
current_index = tab_ids.index(current)
next_index = (current_index + 1) % len(tab_ids)
tabs.active = tab_ids[next_index]
tabbed_content.active = tab_ids[next_index]
except (ValueError, AttributeError):
tabs.active = tab_ids[0]
tabbed_content.active = tab_ids[0]
def action_prev_tab(self) -> None:
"""Switch to previous tab."""
tabs = self.query_one(TabbedContent)
tabbed_content: TabbedContent = self.query_one(TabbedContent)
tab_ids = ["dashboard", "collections", "analytics"]
current = tabs.active
current = tabbed_content.active
try:
current_index = tab_ids.index(current)
prev_index = (current_index - 1) % len(tab_ids)
tabs.active = tab_ids[prev_index]
tabbed_content.active = tab_ids[prev_index]
except (ValueError, AttributeError):
tabs.active = tab_ids[0]
tabbed_content.active = tab_ids[0]
def action_help(self) -> None:
"""Show help screen."""

View File

@@ -1,7 +1,9 @@
"""Dialog screens for confirmations and user interactions."""
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, ClassVar
from textual.app import ComposeResult
from textual.binding import Binding
@@ -21,16 +23,16 @@ class ConfirmDeleteScreen(Screen[None]):
"""Screen for confirming collection deletion."""
collection: CollectionInfo
parent_screen: "CollectionOverviewScreen"
parent_screen: CollectionOverviewScreen
BINDINGS = [
BINDINGS: list[Binding] = [
Binding("escape", "app.pop_screen", "Cancel"),
Binding("y", "confirm_delete", "Yes"),
Binding("n", "app.pop_screen", "No"),
Binding("enter", "confirm_delete", "Confirm"),
]
def __init__(self, collection: CollectionInfo, parent_screen: "CollectionOverviewScreen"):
def __init__(self, collection: CollectionInfo, parent_screen: CollectionOverviewScreen):
super().__init__()
self.collection = collection
self.parent_screen = parent_screen
@@ -74,7 +76,7 @@ class ConfirmDeleteScreen(Screen[None]):
try:
if self.collection["type"] == "weaviate" and self.parent_screen.weaviate:
# Delete Weaviate collection
if self.parent_screen.weaviate.client:
if self.parent_screen.weaviate.client and self.parent_screen.weaviate.client.collections:
self.parent_screen.weaviate.client.collections.delete(self.collection["name"])
self.notify(
f"Deleted Weaviate collection: {self.collection['name']}",
@@ -145,7 +147,7 @@ class ConfirmDocumentDeleteScreen(Screen[None]):
collection: CollectionInfo
parent_screen: "DocumentManagementScreen"
BINDINGS = [
BINDINGS: list[Binding] = [
Binding("escape", "app.pop_screen", "Cancel"),
Binding("y", "confirm_delete", "Yes"),
Binding("n", "app.pop_screen", "No"),
@@ -205,15 +207,16 @@ class ConfirmDocumentDeleteScreen(Screen[None]):
loading.display = True
try:
results: dict[str, bool] = {}
if hasattr(self.parent_screen, 'storage') and self.parent_screen.storage:
# Delete documents via storage
# The storage should have delete_documents method for weaviate
storage = self.parent_screen.storage
if hasattr(storage, 'delete_documents'):
results = await storage.delete_documents(
self.doc_ids,
collection_name=self.collection["name"],
)
self.doc_ids,
collection_name=self.collection["name"],
)
# Count successful deletions
successful = sum(bool(success) for success in results.values())
@@ -241,7 +244,7 @@ class LogViewerScreen(ModalScreen[None]):
_log_widget: RichLog | None
_log_file: Path | None
BINDINGS = [
BINDINGS: list[Binding] = [
Binding("escape", "close", "Close"),
Binding("ctrl+l", "close", "Close"),
Binding("s", "show_path", "Log File"),

View File

@@ -8,13 +8,14 @@ from typing import cast
from textual.app import ComposeResult
from textual.containers import Container, Horizontal
from textual.validation import Integer
from textual.widget import Widget
from textual.widgets import Button, Checkbox, Input, Label, Switch, TextArea
from typing_extensions import override
from ..models import FirecrawlOptions
class ScrapeOptionsForm(Container):
class ScrapeOptionsForm(Widget):
"""Form for configuring Firecrawl scraping options."""
DEFAULT_CSS = """
@@ -206,7 +207,7 @@ class ScrapeOptionsForm(Container):
self.query_one("#wait_for", Input).value = str(wait_for)
class MapOptionsForm(Container):
class MapOptionsForm(Widget):
"""Form for configuring site mapping options."""
DEFAULT_CSS = """
@@ -352,7 +353,7 @@ class MapOptionsForm(Container):
self.query_one("#max_depth", Input).value = str(max_depth)
class ExtractOptionsForm(Container):
class ExtractOptionsForm(Widget):
"""Form for configuring data extraction options."""
DEFAULT_CSS = """
@@ -532,7 +533,7 @@ class ExtractOptionsForm(Container):
prompt_widget.text = "Extract numerical data, metrics, and tabular information"
class FirecrawlConfigWidget(Container):
class FirecrawlConfigWidget(Widget):
"""Complete Firecrawl configuration widget with tabbed interface."""
DEFAULT_CSS = """

View File

@@ -7,6 +7,7 @@ from typing import Any
from textual import work
from textual.app import ComposeResult
from textual.containers import Container, Horizontal, Vertical, VerticalScroll
from textual.widget import Widget
from textual.widgets import Button, DataTable, Label, Markdown, ProgressBar, Static, Tree
from typing_extensions import override
@@ -14,7 +15,7 @@ from ....storage.r2r.storage import R2RStorage
from ..models import ChunkInfo, EntityInfo
class ChunkViewer(Container):
class ChunkViewer(Widget):
"""Widget for viewing document chunks with navigation."""
DEFAULT_CSS = """
@@ -56,10 +57,10 @@ class ChunkViewer(Container):
def __init__(self, r2r_storage: R2RStorage, document_id: str, **kwargs: Any) -> None:
"""Initialize chunk viewer."""
super().__init__(**kwargs)
self.r2r_storage = r2r_storage
self.document_id = document_id
self.r2r_storage: R2RStorage = r2r_storage
self.document_id: str = document_id
self.chunks: list[ChunkInfo] = []
self.current_chunk_index = 0
self.current_chunk_index: int = 0
@override
def compose(self) -> ComposeResult:
@@ -154,7 +155,7 @@ class ChunkViewer(Container):
self.update_chunk_display()
class EntityGraph(Container):
class EntityGraph(Widget):
"""Widget for visualizing extracted entities and relationships."""
DEFAULT_CSS = """
@@ -189,8 +190,8 @@ class EntityGraph(Container):
def __init__(self, r2r_storage: R2RStorage, document_id: str, **kwargs: Any) -> None:
"""Initialize entity graph."""
super().__init__(**kwargs)
self.r2r_storage = r2r_storage
self.document_id = document_id
self.r2r_storage: R2RStorage = r2r_storage
self.document_id: str = document_id
self.entities: list[EntityInfo] = []
@override
@@ -287,7 +288,7 @@ class EntityGraph(Container):
details_widget.update(details_text)
class CollectionStats(Container):
class CollectionStats(Widget):
"""Widget for showing R2R-specific collection statistics."""
DEFAULT_CSS = """
@@ -342,8 +343,8 @@ class CollectionStats(Container):
def __init__(self, r2r_storage: R2RStorage, collection_name: str, **kwargs: Any) -> None:
"""Initialize collection stats."""
super().__init__(**kwargs)
self.r2r_storage = r2r_storage
self.collection_name = collection_name
self.r2r_storage: R2RStorage = r2r_storage
self.collection_name: str = collection_name
@override
def compose(self) -> ComposeResult:
@@ -413,7 +414,7 @@ class CollectionStats(Container):
self.query_one("#processing_status", Static).update(f"Error: {e}")
class DocumentOverview(Container):
class DocumentOverview(Widget):
"""Widget for comprehensive document overview and statistics."""
DEFAULT_CSS = """
@@ -454,8 +455,8 @@ class DocumentOverview(Container):
def __init__(self, r2r_storage: R2RStorage, document_id: str, **kwargs: Any) -> None:
"""Initialize document overview."""
super().__init__(**kwargs)
self.r2r_storage = r2r_storage
self.document_id = document_id
self.r2r_storage: R2RStorage = r2r_storage
self.document_id: str = document_id
@override
def compose(self) -> ComposeResult:

View File

@@ -1,7 +1,7 @@
"""Application settings and configuration."""
from functools import lru_cache
from typing import Annotated, Literal
from typing import Annotated, ClassVar, Literal
from prefect.variables import Variable
from pydantic import Field, HttpUrl, model_validator
@@ -11,7 +11,7 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
"""Application settings."""
model_config = SettingsConfigDict(
model_config: ClassVar[SettingsConfigDict] = SettingsConfigDict(
env_file=".env",
env_file_encoding="utf-8",
case_sensitive=False,
@@ -154,8 +154,8 @@ class PrefectVariableConfig:
"""Helper class for managing Prefect variables with fallbacks to settings."""
def __init__(self) -> None:
self._settings = get_settings()
self._variable_names = [
self._settings: Settings = get_settings()
self._variable_names: list[str] = [
"default_batch_size", "max_file_size", "max_crawl_depth", "max_crawl_pages",
"default_storage_backend", "default_collection_prefix", "max_concurrent_tasks",
"request_timeout", "default_schedule_interval"
@@ -212,7 +212,7 @@ class PrefectVariableConfig:
async def get_ingestion_config_async(self) -> dict[str, str | int | float | None]:
"""Get all ingestion-related configuration variables asynchronously."""
result = {}
result: dict[str, str | int | float | None] = {}
for name in self._variable_names:
result[name] = await self.get_with_fallback_async(name)
return result

View File

@@ -2,7 +2,7 @@
from datetime import UTC, datetime
from enum import Enum
from typing import Annotated, TypedDict
from typing import Annotated, ClassVar, TypedDict
from uuid import UUID, uuid4
from prefect.blocks.core import Block
@@ -48,9 +48,9 @@ class VectorConfig(BaseModel):
class StorageConfig(Block):
"""Configuration for storage backend."""
_block_type_name = "Storage Configuration"
_block_type_slug = "storage-config"
_description = "Configures storage backend connections and settings for document ingestion"
_block_type_name: ClassVar[str] = "Storage Configuration"
_block_type_slug: ClassVar[str] = "storage-config"
_description: ClassVar[str] = "Configures storage backend connections and settings for document ingestion"
backend: StorageBackend
endpoint: HttpUrl
@@ -62,9 +62,9 @@ class StorageConfig(Block):
class FirecrawlConfig(Block):
"""Configuration for Firecrawl ingestion (operational parameters only)."""
_block_type_name = "Firecrawl Configuration"
_block_type_slug = "firecrawl-config"
_description = "Configures Firecrawl web scraping and crawling parameters"
_block_type_name: ClassVar[str] = "Firecrawl Configuration"
_block_type_slug: ClassVar[str] = "firecrawl-config"
_description: ClassVar[str] = "Configures Firecrawl web scraping and crawling parameters"
formats: list[str] = Field(default_factory=lambda: ["markdown", "html"])
max_depth: Annotated[int, Field(ge=1, le=20)] = 5
@@ -76,9 +76,9 @@ class FirecrawlConfig(Block):
class RepomixConfig(Block):
"""Configuration for Repomix ingestion."""
_block_type_name = "Repomix Configuration"
_block_type_slug = "repomix-config"
_description = "Configures repository ingestion patterns and file processing settings"
_block_type_name: ClassVar[str] = "Repomix Configuration"
_block_type_slug: ClassVar[str] = "repomix-config"
_description: ClassVar[str] = "Configures repository ingestion patterns and file processing settings"
include_patterns: list[str] = Field(
default_factory=lambda: ["*.py", "*.js", "*.ts", "*.md", "*.yaml", "*.json"]
@@ -93,9 +93,9 @@ class RepomixConfig(Block):
class R2RConfig(Block):
"""Configuration for R2R ingestion."""
_block_type_name = "R2R Configuration"
_block_type_slug = "r2r-config"
_description = "Configures R2R-specific ingestion settings including chunking and graph enrichment"
_block_type_name: ClassVar[str] = "R2R Configuration"
_block_type_slug: ClassVar[str] = "r2r-config"
_description: ClassVar[str] = "Configures R2R-specific ingestion settings including chunking and graph enrichment"
chunk_size: Annotated[int, Field(ge=100, le=8192)] = 1000
chunk_overlap: Annotated[int, Field(ge=0, le=1000)] = 200

View File

@@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Literal, TypeAlias, assert_never, cast
from prefect import flow, get_run_logger, task
from prefect.blocks.core import Block
from prefect.variables import Variable
from pydantic.types import SecretStr
from pydantic import SecretStr
from ..config.settings import Settings
from ..core.exceptions import IngestionError

View File

@@ -173,6 +173,33 @@ class BaseStorage(ABC):
"""
return []
async def delete_collection(self, collection_name: str) -> bool:
"""
Delete a collection (if supported by backend).
Args:
collection_name: Name of collection to delete
Returns:
True if deleted successfully, False if not supported
"""
return False
async def delete_documents(
self, document_ids: list[str], *, collection_name: str | None = None
) -> dict[str, bool]:
"""
Delete documents by IDs (if supported by backend).
Args:
document_ids: List of document IDs to delete
collection_name: Collection to delete from
Returns:
Dict mapping document IDs to success status, empty if not supported
"""
return {}
async def list_documents(
self,
limit: int = 100,

View File

@@ -2,11 +2,21 @@
import asyncio
import logging
from typing import Final, TypedDict, cast
from typing import TYPE_CHECKING, Final, TypedDict, cast
import httpx
from typing_extensions import override
if TYPE_CHECKING:
# Type checking imports - these will be ignored at runtime
from httpx import AsyncClient, ConnectError, HTTPStatusError, RequestError
else:
# Runtime imports that work properly
AsyncClient = httpx.AsyncClient
ConnectError = httpx.ConnectError
HTTPStatusError = httpx.HTTPStatusError
RequestError = httpx.RequestError
from ..core.exceptions import StorageError
from ..core.models import Document, StorageConfig
from .base import BaseStorage
@@ -17,7 +27,7 @@ LOGGER: Final[logging.Logger] = logging.getLogger(__name__)
class OpenWebUIStorage(BaseStorage):
"""Storage adapter for Open WebUI knowledge endpoints."""
client: httpx.AsyncClient
client: AsyncClient
_knowledge_cache: dict[str, str]
def __init__(self, config: StorageConfig):
@@ -33,7 +43,7 @@ class OpenWebUIStorage(BaseStorage):
if config.api_key:
headers["Authorization"] = f"Bearer {config.api_key}"
self.client = httpx.AsyncClient(
self.client = AsyncClient(
base_url=str(config.endpoint),
headers=headers,
timeout=30.0,
@@ -50,11 +60,11 @@ class OpenWebUIStorage(BaseStorage):
create=True,
)
except httpx.ConnectError as e:
except ConnectError as e:
raise StorageError(f"Connection to OpenWebUI failed: {e}") from e
except httpx.HTTPStatusError as e:
except HTTPStatusError as e:
raise StorageError(f"OpenWebUI returned error {e.response.status_code}: {e}") from e
except httpx.RequestError as e:
except RequestError as e:
raise StorageError(f"Request to OpenWebUI failed: {e}") from e
except Exception as e:
raise StorageError(f"Failed to initialize Open WebUI: {e}") from e
@@ -80,13 +90,13 @@ class OpenWebUIStorage(BaseStorage):
return str(knowledge_id)
except httpx.ConnectError as e:
except ConnectError as e:
raise StorageError(f"Connection to OpenWebUI failed during creation: {e}") from e
except httpx.HTTPStatusError as e:
except HTTPStatusError as e:
raise StorageError(
f"OpenWebUI returned error {e.response.status_code} during creation: {e}"
) from e
except httpx.RequestError as e:
except RequestError as e:
raise StorageError(f"Request to OpenWebUI failed during creation: {e}") from e
except Exception as e:
raise StorageError(f"Failed to create knowledge base: {e}") from e
@@ -182,11 +192,11 @@ class OpenWebUIStorage(BaseStorage):
return str(file_id)
except httpx.ConnectError as e:
except ConnectError as e:
raise StorageError(f"Connection to OpenWebUI failed: {e}") from e
except httpx.HTTPStatusError as e:
except HTTPStatusError as e:
raise StorageError(f"OpenWebUI returned error {e.response.status_code}: {e}") from e
except httpx.RequestError as e:
except RequestError as e:
raise StorageError(f"Request to OpenWebUI failed: {e}") from e
except Exception as e:
raise StorageError(f"Failed to store document: {e}") from e
@@ -263,13 +273,13 @@ class OpenWebUIStorage(BaseStorage):
return file_ids
except httpx.ConnectError as e:
except ConnectError as e:
raise StorageError(f"Connection to OpenWebUI failed during batch: {e}") from e
except httpx.HTTPStatusError as e:
except HTTPStatusError as e:
raise StorageError(
f"OpenWebUI returned error {e.response.status_code} during batch: {e}"
) from e
except httpx.RequestError as e:
except RequestError as e:
raise StorageError(f"Request to OpenWebUI failed during batch: {e}") from e
except Exception as e:
raise StorageError(f"Failed to store batch: {e}") from e
@@ -324,12 +334,12 @@ class OpenWebUIStorage(BaseStorage):
delete_response.raise_for_status()
return True
except httpx.ConnectError as exc:
except ConnectError as exc:
LOGGER.error(
"Failed to reach OpenWebUI when deleting file %s", document_id, exc_info=exc
)
return False
except httpx.HTTPStatusError as exc:
except HTTPStatusError as exc:
LOGGER.error(
"OpenWebUI returned status error %s when deleting file %s",
exc.response.status_code if exc.response else "unknown",
@@ -337,7 +347,7 @@ class OpenWebUIStorage(BaseStorage):
exc_info=exc,
)
return False
except httpx.RequestError as exc:
except RequestError as exc:
LOGGER.error("Request error deleting file %s from OpenWebUI", document_id, exc_info=exc)
return False
except Exception as exc:
@@ -360,11 +370,11 @@ class OpenWebUIStorage(BaseStorage):
for kb in knowledge_bases
]
except httpx.ConnectError as e:
except ConnectError as e:
raise StorageError(f"Connection to OpenWebUI failed: {e}") from e
except httpx.HTTPStatusError as e:
except HTTPStatusError as e:
raise StorageError(f"OpenWebUI returned error {e.response.status_code}: {e}") from e
except httpx.RequestError as e:
except RequestError as e:
raise StorageError(f"Request to OpenWebUI failed: {e}") from e
except Exception as e:
raise StorageError(f"Failed to list knowledge bases: {e}") from e
@@ -396,7 +406,7 @@ class OpenWebUIStorage(BaseStorage):
LOGGER.info("Successfully deleted knowledge base: %s", collection_name)
return True
except httpx.HTTPStatusError as e:
except HTTPStatusError as e:
# Handle 404 as success (already deleted)
if e.response.status_code == 404:
LOGGER.info("Knowledge base %s was already deleted or not found", collection_name)
@@ -408,14 +418,14 @@ class OpenWebUIStorage(BaseStorage):
exc_info=e,
)
return False
except httpx.ConnectError as e:
except ConnectError as e:
LOGGER.error(
"Failed to reach OpenWebUI when deleting knowledge base %s",
collection_name,
exc_info=e,
)
return False
except httpx.RequestError as e:
except RequestError as e:
LOGGER.error(
"Request error deleting knowledge base %s from OpenWebUI",
collection_name,

View File

@@ -110,45 +110,49 @@ class R2RStorage(BaseStorage):
async def _ensure_collection(self, collection_name: str) -> str:
"""Get or create collection by name."""
endpoint = self.endpoint
client = AsyncClient()
try:
endpoint = self.endpoint
client = AsyncClient()
try:
# List collections and find by name
response = await client.get(f"{endpoint}/v3/collections")
response.raise_for_status()
data: dict[str, object] = response.json()
# List collections and find by name
response = await client.get(f"{endpoint}/v3/collections")
response.raise_for_status()
data: dict[str, object] = response.json()
results = cast(list[dict[str, object]], data.get("results", []))
for collection in results:
if collection.get("name") == collection_name:
collection_id = str(collection.get("id"))
if collection_name == self.config.collection_name:
self.default_collection_id = collection_id
return collection_id
results = cast(list[dict[str, object]], data.get("results", []))
for collection in results:
if collection.get("name") == collection_name:
collection_id_raw = collection.get("id")
if collection_id_raw is None:
raise StorageError(f"Collection '{collection_name}' exists but has no ID")
collection_id = str(collection_id_raw)
if collection_name == self.config.collection_name:
self.default_collection_id = collection_id
return collection_id
# Create if not found
create_response = await client.post(
f"{endpoint}/v3/collections",
json={
"name": collection_name,
"description": f"Auto-created collection: {collection_name}",
},
)
create_response.raise_for_status()
created: dict[str, object] = create_response.json()
created_results = cast(dict[str, object], created.get("results", {}))
collection_id = str(created_results.get("id"))
# Create if not found
create_response = await client.post(
f"{endpoint}/v3/collections",
json={
"name": collection_name,
"description": f"Auto-created collection: {collection_name}",
},
)
create_response.raise_for_status()
created: dict[str, object] = create_response.json()
created_results = cast(dict[str, object], created.get("results", {}))
collection_id_raw = created_results.get("id")
if collection_id_raw is None:
raise StorageError("Failed to get collection ID from creation response")
collection_id = str(collection_id_raw)
if collection_name == self.config.collection_name:
self.default_collection_id = collection_id
return collection_id
finally:
await client.aclose()
if collection_name == self.config.collection_name:
self.default_collection_id = collection_id
return collection_id
except Exception as e:
raise StorageError(f"Failed to ensure collection '{collection_name}': {e}") from e
finally:
await client.aclose()
@override
async def store(self, document: Document, *, collection_name: str | None = None) -> str:
@@ -614,27 +618,27 @@ class R2RStorage(BaseStorage):
@override
async def count(self, *, collection_name: str | None = None) -> int:
"""Get document count in collection."""
endpoint = self.endpoint
client = AsyncClient()
try:
endpoint = self.endpoint
client = AsyncClient()
try:
# Get collections and find the count for the specific collection
response = await client.get(f"{endpoint}/v3/collections")
response.raise_for_status()
data: dict[str, object] = response.json()
# Get collections and find the count for the specific collection
response = await client.get(f"{endpoint}/v3/collections")
response.raise_for_status()
data: dict[str, object] = response.json()
target_collection = collection_name or self.config.collection_name
results = cast(list[dict[str, object]], data.get("results", []))
for collection in results:
if collection.get("name") == target_collection:
doc_count = collection.get("document_count", 0)
return _as_int(doc_count)
target_collection = collection_name or self.config.collection_name
results = cast(list[dict[str, object]], data.get("results", []))
for collection in results:
if collection.get("name") == target_collection:
doc_count = collection.get("document_count", 0)
return _as_int(doc_count)
return 0
finally:
await client.aclose()
# Collection not found
return 0
except Exception:
return 0
finally:
await client.aclose()
@override
async def close(self) -> None:
@@ -682,24 +686,23 @@ class R2RStorage(BaseStorage):
@override
async def list_collections(self) -> list[str]:
"""List all available collections."""
endpoint = self.endpoint
client = AsyncClient()
try:
endpoint = self.endpoint
client = AsyncClient()
try:
response = await client.get(f"{endpoint}/v3/collections")
response.raise_for_status()
data: dict[str, object] = response.json()
response = await client.get(f"{endpoint}/v3/collections")
response.raise_for_status()
data: dict[str, object] = response.json()
collection_names: list[str] = []
results = cast(list[dict[str, object]], data.get("results", []))
for entry in results:
if name := entry.get("name"):
collection_names.append(str(name))
return collection_names
finally:
await client.aclose()
collection_names: list[str] = []
results = cast(list[dict[str, object]], data.get("results", []))
for entry in results:
if name := entry.get("name"):
collection_names.append(str(name))
return collection_names
except Exception as e:
raise StorageError(f"Failed to list collections: {e}") from e
finally:
await client.aclose()
async def list_collections_detailed(self) -> list[dict[str, object]]:
"""List all available collections with detailed information."""

View File

@@ -75,7 +75,8 @@ class WeaviateStorage(BaseStorage):
if not self.client:
raise StorageError("Weaviate client not initialized")
try:
self.client.collections.create(
client = cast(weaviate.WeaviateClient, self.client)
client.collections.create(
name=collection_name,
properties=[
Property(
@@ -225,7 +226,8 @@ class WeaviateStorage(BaseStorage):
if not self.client:
raise StorageError("Weaviate client not initialized")
existing = self.client.collections.list_all()
client = cast(weaviate.WeaviateClient, self.client)
existing = client.collections.list_all()
if collection_name not in existing:
await self._create_collection(collection_name)
@@ -244,7 +246,8 @@ class WeaviateStorage(BaseStorage):
if ensure_exists:
await self._ensure_collection(normalized)
return self.client.collections.get(normalized), normalized
client = cast(weaviate.WeaviateClient, self.client)
return client.collections.get(normalized), normalized
@override
async def store(self, document: Document, *, collection_name: str | None = None) -> str:
@@ -585,7 +588,8 @@ class WeaviateStorage(BaseStorage):
if not self.client:
raise StorageError("Weaviate client not initialized")
return list(self.client.collections.list_all())
client = cast(weaviate.WeaviateClient, self.client)
return list(client.collections.list_all())
except Exception as e:
raise StorageError(f"Failed to list collections: {e}") from e
@@ -596,9 +600,10 @@ class WeaviateStorage(BaseStorage):
raise StorageError("Weaviate client not initialized")
try:
client = cast(weaviate.WeaviateClient, self.client)
collections: list[dict[str, object]] = []
for name in self.client.collections.list_all():
collection_obj = self.client.collections.get(name)
for name in client.collections.list_all():
collection_obj = client.collections.get(name)
if not collection_obj:
continue
@@ -970,7 +975,8 @@ class WeaviateStorage(BaseStorage):
target = self._normalize_collection_name(collection_name)
# Delete the collection using the client's collections API
self.client.collections.delete(target)
client = cast(weaviate.WeaviateClient, self.client)
client.collections.delete(target)
return True
@@ -994,7 +1000,8 @@ class WeaviateStorage(BaseStorage):
"""Close client connection."""
if self.client:
try:
self.client.close()
client = cast(weaviate.WeaviateClient, self.client)
client.close()
except Exception as e:
import logging
logging.warning(f"Error closing Weaviate client: {e}")
@@ -1003,6 +1010,7 @@ class WeaviateStorage(BaseStorage):
"""Clean up client connection as fallback."""
if self.client:
try:
self.client.close()
client = cast(weaviate.WeaviateClient, self.client)
client.close()
except Exception:
pass # Ignore errors in destructor

File diff suppressed because it is too large Load Diff

View File

@@ -1,17 +1,40 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Protocol
import pytest
from pydantic import HttpUrl
from textual.app import App
from ingest_pipeline.cli.tui.models import CollectionInfo
from ingest_pipeline.cli.tui.screens.dashboard import CollectionOverviewScreen
from ingest_pipeline.cli.tui.screens.documents import DocumentManagementScreen
from ingest_pipeline.cli.tui.widgets.tables import EnhancedDataTable
from ingest_pipeline.core.models import StorageBackend, StorageConfig
from ingest_pipeline.core.models import Document, StorageBackend, StorageConfig
from ingest_pipeline.storage.base import BaseStorage
if TYPE_CHECKING:
from ingest_pipeline.cli.tui.utils.storage_manager import StorageManager
class StorageManagerProtocol(Protocol):
"""Protocol for storage manager interface used in tests."""
is_initialized: bool
async def initialize_all_backends(self) -> dict[StorageBackend, bool]: ...
async def get_all_collections(self) -> list[CollectionInfo]: ...
def get_available_backends(self) -> list[StorageBackend]: ...
def get_backend(self, backend: StorageBackend) -> BaseStorage | None: ...
async def close_all(self) -> None: ...
class StorageManagerStub:
_collections: list[CollectionInfo]
_available: list[StorageBackend]
backends: dict[StorageBackend, BaseStorage]
is_initialized: bool
def __init__(self, collections: list[CollectionInfo], backends: dict[StorageBackend, BaseStorage]) -> None:
self._collections = collections
self._available = list(backends.keys())
@@ -40,7 +63,7 @@ class PassiveStorage(BaseStorage):
super().__init__(
StorageConfig(
backend=backend,
endpoint="http://example.com",
endpoint=HttpUrl("http://example.com"),
collection_name="collection",
)
)
@@ -48,10 +71,10 @@ class PassiveStorage(BaseStorage):
async def initialize(self) -> None:
return None
async def store(self, document, *, collection_name: str | None = None) -> str: # pragma: no cover - unused
async def store(self, document: Document, *, collection_name: str | None = None) -> str: # pragma: no cover - unused
raise NotImplementedError
async def store_batch(self, documents, *, collection_name: str | None = None): # pragma: no cover - unused
async def store_batch(self, documents: list[Document], *, collection_name: str | None = None) -> list[str]: # pragma: no cover - unused
raise NotImplementedError
async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool: # pragma: no cover - unused
@@ -59,6 +82,8 @@ class PassiveStorage(BaseStorage):
class ScreenTestApp(App[None]):
_screen: CollectionOverviewScreen
def __init__(self, screen: CollectionOverviewScreen) -> None:
super().__init__()
self._screen = screen
@@ -68,7 +93,7 @@ class ScreenTestApp(App[None]):
@pytest.mark.asyncio
async def test_collection_overview_table_reflects_collections(monkeypatch: pytest.MonkeyPatch) -> None:
async def test_collection_overview_table_reflects_collections() -> None:
collections: list[CollectionInfo] = [
{
"name": "docs_example_com",
@@ -98,10 +123,13 @@ async def test_collection_overview_table_reflects_collections(monkeypatch: pytes
},
)
# Type cast for the protocol - storage_manager implements the required interface
storage_manager_typed: StorageManager = storage_manager # type: ignore[assignment]
screen = CollectionOverviewScreen(
storage_manager,
weaviate=storage_manager.get_backend(StorageBackend.WEAVIATE),
openwebui=storage_manager.get_backend(StorageBackend.OPEN_WEBUI),
storage_manager_typed,
weaviate=None, # Use None and let the screen get it from storage_manager
openwebui=None, # Use None and let the screen get it from storage_manager
r2r=None,
)
@@ -142,7 +170,7 @@ class DocumentStorageStub(BaseStorage):
super().__init__(
StorageConfig(
backend=StorageBackend.WEAVIATE,
endpoint="http://example.com",
endpoint=HttpUrl("http://example.com"),
collection_name="docs",
)
)
@@ -150,16 +178,16 @@ class DocumentStorageStub(BaseStorage):
async def initialize(self) -> None:
return None
async def store(self, document, *, collection_name: str | None = None) -> str: # pragma: no cover - unused
async def store(self, document: Document, *, collection_name: str | None = None) -> str: # pragma: no cover - unused
raise NotImplementedError
async def store_batch(self, documents, *, collection_name: str | None = None): # pragma: no cover - unused
async def store_batch(self, documents: list[Document], *, collection_name: str | None = None) -> list[str]: # pragma: no cover - unused
raise NotImplementedError
async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool: # pragma: no cover - unused
raise NotImplementedError
async def list_documents(self, *, limit: int = 100, offset: int = 0, collection_name: str | None = None):
async def list_documents(self, limit: int = 100, offset: int = 0, *, collection_name: str | None = None) -> list[dict[str, object]]:
return [
{
"id": "doc-1234567890",
@@ -187,7 +215,9 @@ def build_collection(count: int) -> CollectionInfo:
class DocumentScreenTestApp(App[None]):
def __init__(self, screen) -> None:
_screen: DocumentManagementScreen
def __init__(self, screen: DocumentManagementScreen) -> None:
super().__init__()
self._screen = screen
@@ -208,8 +238,10 @@ async def test_document_management_screen_formats_rows() -> None:
assert row[1] == "Deep Dive into Markdown Rendering"
assert row[2] == "https://example.com/post"
assert row[3].startswith("Extensive article")
description: str = str(row[3])
doc_id: str = str(row[7])
assert description.startswith("Extensive article")
assert row[4] == "📝 md"
assert row[5] == "321"
assert row[6] == "03/04 10:15"
assert row[7].startswith("doc-1234")
assert doc_id.startswith("doc-1234")