Files
rag-manager/ingest_pipeline/storage/base.py
2025-09-19 08:31:36 +00:00

234 lines
6.6 KiB
Python

"""Base storage interface."""
from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator
from ..core.models import Document, StorageConfig
class BaseStorage(ABC):
"""Abstract base class for storage adapters."""
config: StorageConfig
def __init__(self, config: StorageConfig):
"""
Initialize storage adapter.
Args:
config: Storage configuration
"""
self.config = config
@property
def display_name(self) -> str:
"""Human-readable name for UI display."""
return self.__class__.__name__.replace("Storage", "")
@abstractmethod
async def initialize(self) -> None:
"""Initialize the storage backend and create collections if needed."""
pass # pragma: no cover
@abstractmethod
async def store(self, document: Document, *, collection_name: str | None = None) -> str:
"""
Store a single document.
Args:
document: Document to store
Returns:
Document ID
"""
pass # pragma: no cover
@abstractmethod
async def store_batch(
self, documents: list[Document], *, collection_name: str | None = None
) -> list[str]:
"""
Store multiple documents in batch.
Args:
documents: List of documents to store
Returns:
List of document IDs
"""
pass # pragma: no cover
async def retrieve(
self, document_id: str, *, collection_name: str | None = None
) -> Document | None:
"""
Retrieve a document by ID (if supported by backend).
Args:
document_id: Document ID
Returns:
Document or None if not found
Raises:
NotImplementedError: If backend doesn't support retrieval
"""
raise NotImplementedError(f"{self.__class__.__name__} doesn't support document retrieval")
async def check_exists(
self, document_id: str, *, collection_name: str | None = None, stale_after_days: int = 30
) -> bool:
"""
Check if a document exists and is not stale.
Args:
document_id: Document ID to check
collection_name: Collection to check in
stale_after_days: Consider document stale after this many days
Returns:
True if document exists and is not stale, False otherwise
"""
try:
document = await self.retrieve(document_id, collection_name=collection_name)
if document is None:
return False
# Check staleness if timestamp is available
if "timestamp" in document.metadata:
from datetime import UTC, datetime, timedelta
timestamp_obj = document.metadata["timestamp"]
if isinstance(timestamp_obj, datetime):
timestamp = timestamp_obj
cutoff = datetime.now(UTC) - timedelta(days=stale_after_days)
return timestamp >= cutoff
# If no timestamp, assume it exists and is valid
return True
except Exception:
# Backend doesn't support retrieval, assume doesn't exist
return False
def search(
self,
query: str,
limit: int = 10,
threshold: float = 0.7,
*,
collection_name: str | None = None,
) -> AsyncGenerator[Document, None]:
"""
Search for documents (if supported by backend).
Args:
query: Search query
limit: Maximum number of results
threshold: Similarity threshold
Yields:
Matching documents
Raises:
NotImplementedError: If backend doesn't support search
"""
raise NotImplementedError(f"{self.__class__.__name__} doesn't support search")
@abstractmethod
async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool:
"""
Delete a document.
Args:
document_id: Document ID
Returns:
True if deleted successfully
"""
pass # pragma: no cover
async def count(self, *, collection_name: str | None = None) -> int:
"""
Get total document count (if supported by backend).
Returns:
Number of documents, 0 if not supported
"""
return 0
async def list_collections(self) -> list[str]:
"""
List available collections (if supported by backend).
Returns:
List of collection names, empty list if not supported
"""
return []
async def describe_collections(self) -> list[dict[str, object]]:
"""
Describe available collections with metadata (if supported by backend).
Returns:
List of collection metadata dictionaries, empty list if not supported
"""
return []
async def delete_collection(self, collection_name: str) -> bool:
"""
Delete a collection (if supported by backend).
Args:
collection_name: Name of collection to delete
Returns:
True if deleted successfully, False if not supported
"""
return False
async def delete_documents(
self, document_ids: list[str], *, collection_name: str | None = None
) -> dict[str, bool]:
"""
Delete documents by IDs (if supported by backend).
Args:
document_ids: List of document IDs to delete
collection_name: Collection to delete from
Returns:
Dict mapping document IDs to success status, empty if not supported
"""
return {}
async def list_documents(
self,
limit: int = 100,
offset: int = 0,
*,
collection_name: str | None = None,
) -> list[dict[str, object]]:
"""
List documents in the storage backend (if supported).
Args:
limit: Maximum number of documents to return
offset: Number of documents to skip
collection_name: Collection to list documents from
Returns:
List of document dictionaries with metadata
Raises:
NotImplementedError: If backend doesn't support document listing
"""
raise NotImplementedError(f"{self.__class__.__name__} doesn't support document listing")
async def close(self) -> None:
"""
Close storage connections and cleanup resources.
Default implementation does nothing.
"""
# Default implementation - storage backends can override to cleanup connections
return None