Files
rag-manager/ingest_pipeline/storage/openwebui.py
2025-09-18 09:44:16 +00:00

711 lines
26 KiB
Python

"""Open WebUI storage adapter."""
import asyncio
import logging
from typing import Final, TypedDict, cast
import httpx
from typing_extensions import override
from ..core.exceptions import StorageError
from ..core.models import Document, StorageConfig
from .base import BaseStorage
LOGGER: Final[logging.Logger] = logging.getLogger(__name__)
class OpenWebUIStorage(BaseStorage):
    """Storage adapter for Open WebUI knowledge endpoints.

    Documents are uploaded as files and attached to a knowledge base.
    Knowledge base names are resolved to IDs once and remembered per instance.
    """

    # Async HTTP client used for every request to the Open WebUI API.
    client: httpx.AsyncClient
    # Maps knowledge base name -> knowledge base ID for names already resolved.
    _knowledge_cache: dict[str, str]
def __init__(self, config: StorageConfig):
    """
    Set up the adapter and its HTTP client.

    Args:
        config: Storage configuration. ``endpoint`` becomes the client's base
            URL; ``api_key``, when set, is sent as a bearer token.
    """
    super().__init__(config)
    # Only send an Authorization header when a key is actually configured.
    auth_headers: dict[str, str] = (
        {"Authorization": f"Bearer {config.api_key}"} if config.api_key else {}
    )
    base_url = str(config.endpoint)
    self.client = httpx.AsyncClient(base_url=base_url, headers=auth_headers, timeout=30.0)
    self._knowledge_cache = {}
@override
async def initialize(self) -> None:
    """
    Initialize Open WebUI connection.

    When a default collection name is configured, its knowledge base ID is
    resolved, creating the knowledge base if it does not exist yet. Without a
    configured name no request is made.

    Raises:
        StorageError: If the knowledge base cannot be resolved or created.
    """
    try:
        if self.config.collection_name:
            await self._get_knowledge_id(
                self.config.collection_name,
                create=True,
            )
    except StorageError:
        # Already a domain error with a specific message; letting it reach the
        # generic handler below would only stack a second prefix on top of it.
        raise
    except httpx.ConnectError as e:
        raise StorageError(f"Connection to OpenWebUI failed: {e}") from e
    except httpx.HTTPStatusError as e:
        raise StorageError(f"OpenWebUI returned error {e.response.status_code}: {e}") from e
    except httpx.RequestError as e:
        raise StorageError(f"Request to OpenWebUI failed: {e}") from e
    except Exception as e:
        raise StorageError(f"Failed to initialize Open WebUI: {e}") from e
async def _create_collection(self, name: str) -> str:
    """
    Create knowledge base in Open WebUI.

    Args:
        name: Name of the knowledge base to create

    Returns:
        ID of the newly created knowledge base

    Raises:
        StorageError: If the request fails or the response carries no string ID.
    """
    try:
        response = await self.client.post(
            "/api/v1/knowledge/create",
            json={
                "name": name,
                "description": "Documents ingested from various sources",
                "data": {},
                "access_control": None,
            },
        )
        response.raise_for_status()
        result = response.json()
        # A non-object body has no ID either; report it the same way instead of
        # surfacing an AttributeError message.
        knowledge_id = result.get("id") if isinstance(result, dict) else None
        if not knowledge_id or not isinstance(knowledge_id, str):
            raise StorageError("Knowledge base creation failed: no ID returned")
        return knowledge_id
    except StorageError:
        # Raised just above with its final message; do not re-wrap it.
        raise
    except httpx.ConnectError as e:
        raise StorageError(f"Connection to OpenWebUI failed during creation: {e}") from e
    except httpx.HTTPStatusError as e:
        raise StorageError(
            f"OpenWebUI returned error {e.response.status_code} during creation: {e}"
        ) from e
    except httpx.RequestError as e:
        raise StorageError(f"Request to OpenWebUI failed during creation: {e}") from e
    except Exception as e:
        raise StorageError(f"Failed to create knowledge base: {e}") from e
async def _fetch_knowledge_bases(self) -> list[dict[str, object]]:
    """Fetch the knowledge base list and normalize each entry's keys to ``str``.

    A payload that is not a JSON array yields an empty list; array items that
    are not objects are dropped.
    """
    response = await self.client.get("/api/v1/knowledge/list")
    response.raise_for_status()
    payload = response.json()
    if isinstance(payload, list):
        return [
            {str(key): value for key, value in entry.items()}
            for entry in payload
            if isinstance(entry, dict)
        ]
    return []
async def _get_knowledge_id(
self,
name: str | None,
*,
create: bool,
) -> str | None:
"""Retrieve (and optionally create) a knowledge base identifier."""
target_raw = name or self.config.collection_name
target = str(target_raw) if target_raw else ""
if not target:
raise StorageError("Knowledge base name is required")
if cached := self._knowledge_cache.get(target):
return cached
knowledge_bases = await self._fetch_knowledge_bases()
for kb in knowledge_bases:
if kb.get("name") == target:
kb_id = kb.get("id")
if isinstance(kb_id, str):
self._knowledge_cache[target] = kb_id
return kb_id
if not create:
return None
knowledge_id = await self._create_collection(target)
self._knowledge_cache[target] = knowledge_id
return knowledge_id
@override
async def store(self, document: Document, *, collection_name: str | None = None) -> str:
    """
    Store a document in Open WebUI as a file.

    The content is uploaded as a ``text/plain`` file, then the uploaded file
    is attached to the target knowledge base (created when missing).

    Args:
        document: Document to store
        collection_name: Knowledge base name; the configured collection name
            is used when omitted

    Returns:
        File ID

    Raises:
        StorageError: If the knowledge base cannot be resolved, the upload
            returns no file ID, or any request fails.
    """
    try:
        knowledge_id = await self._get_knowledge_id(
            collection_name,
            create=True,
        )
        if not knowledge_id:
            raise StorageError("Knowledge base not initialized")

        # Step 1: Upload document as file
        # Use document title from metadata if available, otherwise fall back to ID
        filename = document.metadata.get("title") or f"doc_{document.id}"
        # Ensure filename has proper extension
        if not filename.endswith(('.txt', '.md', '.pdf', '.doc', '.docx')):
            filename = f"{filename}.txt"
        files = {"file": (filename, document.content.encode(), "text/plain")}
        response = await self.client.post(
            "/api/v1/files/",
            files=files,
            params={"process": True, "process_in_background": False},
        )
        response.raise_for_status()
        file_data = response.json()
        file_id = file_data.get("id")
        if not file_id or not isinstance(file_id, str):
            raise StorageError("File upload failed: no file ID returned")

        # Step 2: Add file to knowledge base
        # NOTE(review): if this call fails the file uploaded in step 1 stays on
        # the server unattached - confirm whether it should be cleaned up.
        response = await self.client.post(
            f"/api/v1/knowledge/{knowledge_id}/file/add", json={"file_id": file_id}
        )
        response.raise_for_status()
        return str(file_id)
    except StorageError:
        # Raised above with its final message; the generic handler below would
        # otherwise prefix it a second time.
        raise
    except httpx.ConnectError as e:
        raise StorageError(f"Connection to OpenWebUI failed: {e}") from e
    except httpx.HTTPStatusError as e:
        raise StorageError(f"OpenWebUI returned error {e.response.status_code}: {e}") from e
    except httpx.RequestError as e:
        raise StorageError(f"Request to OpenWebUI failed: {e}") from e
    except Exception as e:
        raise StorageError(f"Failed to store document: {e}") from e
@override
async def store_batch(
    self, documents: list[Document], *, collection_name: str | None = None
) -> list[str]:
    """
    Store multiple documents as files in batch.

    All uploads run concurrently. A document that fails is logged and left
    out of the result; it does not abort the rest of the batch.

    Args:
        documents: List of documents
        collection_name: Knowledge base name; the configured collection name
            is used when omitted

    Returns:
        List of file IDs for the documents that were stored successfully

    Raises:
        StorageError: If the knowledge base itself cannot be resolved.
    """
    try:
        knowledge_id = await self._get_knowledge_id(
            collection_name,
            create=True,
        )
        if not knowledge_id:
            raise StorageError("Knowledge base not initialized")

        async def upload_and_attach(doc: Document) -> str:
            """Upload one document as a file and attach it to the knowledge base."""
            # Use document title from metadata if available, otherwise fall back to ID
            filename = doc.metadata.get("title") or f"doc_{doc.id}"
            # Ensure filename has proper extension
            if not filename.endswith(('.txt', '.md', '.pdf', '.doc', '.docx')):
                filename = f"{filename}.txt"
            files = {"file": (filename, doc.content.encode(), "text/plain")}
            upload_response = await self.client.post(
                "/api/v1/files/",
                files=files,
                params={"process": True, "process_in_background": False},
            )
            upload_response.raise_for_status()
            file_data = upload_response.json()
            file_id = file_data.get("id")
            if not file_id or not isinstance(file_id, str):
                raise StorageError(
                    f"File upload failed for document {doc.id}: no file ID returned"
                )
            attach_response = await self.client.post(
                f"/api/v1/knowledge/{knowledge_id}/file/add", json={"file_id": file_id}
            )
            attach_response.raise_for_status()
            return str(file_id)

        tasks = [upload_and_attach(doc) for doc in documents]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        file_ids: list[str] = []
        failures: list[str] = []
        for doc, result in zip(documents, results):
            # gather() can hand back BaseException subclasses such as
            # CancelledError; checking only Exception would let those leak
            # into the returned ID list.
            if isinstance(result, BaseException):
                failures.append(f"{doc.id}: {result}")
            else:
                file_ids.append(result)

        if failures:
            LOGGER.warning(
                "OpenWebUI partial batch failure for knowledge base %s: %s",
                # Report the knowledge base that was actually targeted.
                collection_name or self.config.collection_name,
                ", ".join(failures),
            )
        return file_ids
    except StorageError:
        # Raised above with its final message; do not re-wrap it.
        raise
    except httpx.ConnectError as e:
        raise StorageError(f"Connection to OpenWebUI failed during batch: {e}") from e
    except httpx.HTTPStatusError as e:
        raise StorageError(
            f"OpenWebUI returned error {e.response.status_code} during batch: {e}"
        ) from e
    except httpx.RequestError as e:
        raise StorageError(f"Request to OpenWebUI failed during batch: {e}") from e
    except Exception as e:
        raise StorageError(f"Failed to store batch: {e}") from e
@override
async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool:
    """
    Detach a file from its knowledge base and delete the file itself.

    Args:
        document_id: File ID to remove
        collection_name: Knowledge base name; the configured collection name
            is used when omitted

    Returns:
        True if removed successfully (a file that is already gone counts as
        removed), False when the knowledge base is unknown or any step fails
    """
    try:
        kb_id = await self._get_knowledge_id(collection_name, create=False)
        if kb_id:
            # Detach from the knowledge base first, then delete the file.
            detach_response = await self.client.post(
                f"/api/v1/knowledge/{kb_id}/file/remove",
                json={"file_id": document_id},
            )
            detach_response.raise_for_status()
            removal_response = await self.client.delete(f"/api/v1/files/{document_id}")
            # 404 on the file delete means there is nothing left to remove.
            if removal_response.status_code != 404:
                removal_response.raise_for_status()
            return True
        return False
    except httpx.ConnectError as exc:
        LOGGER.error(
            "Failed to reach OpenWebUI when deleting file %s", document_id, exc_info=exc
        )
        return False
    except httpx.HTTPStatusError as exc:
        status = exc.response.status_code if exc.response else "unknown"
        LOGGER.error(
            "OpenWebUI returned status error %s when deleting file %s",
            status,
            document_id,
            exc_info=exc,
        )
        return False
    except httpx.RequestError as exc:
        LOGGER.error("Request error deleting file %s from OpenWebUI", document_id, exc_info=exc)
        return False
    except Exception as exc:
        LOGGER.error("Unexpected error deleting file %s", document_id, exc_info=exc)
        return False
async def list_collections(self) -> list[str]:
    """
    Return the names of all knowledge bases.

    An entry without a ``name`` key is reported as ``knowledge_<id>``; an
    entry whose name is empty or null is reported as an empty string.

    Returns:
        List of knowledge base names

    Raises:
        StorageError: If the knowledge base list cannot be fetched.
    """
    try:
        names: list[str] = []
        for entry in await self._fetch_knowledge_bases():
            placeholder = f"knowledge_{entry.get('id', 'unknown')}"
            label = entry.get("name", placeholder)
            names.append(str(label or ""))
        return names
    except httpx.ConnectError as e:
        raise StorageError(f"Connection to OpenWebUI failed: {e}") from e
    except httpx.HTTPStatusError as e:
        raise StorageError(f"OpenWebUI returned error {e.response.status_code}: {e}") from e
    except httpx.RequestError as e:
        raise StorageError(f"Request to OpenWebUI failed: {e}") from e
    except Exception as e:
        raise StorageError(f"Failed to list knowledge bases: {e}") from e
async def delete_collection(self, collection_name: str) -> bool:
    """
    Delete a knowledge base by name.

    A knowledge base that does not exist (or answers 404 on delete) is treated
    as already deleted. In both the success and the 404 case the cached ID for
    the name is dropped so later calls do not reuse a dead ID.

    Args:
        collection_name: Name of the knowledge base to delete

    Returns:
        True if deleted successfully, False otherwise
    """
    try:
        knowledge_id = await self._get_knowledge_id(collection_name, create=False)
        if not knowledge_id:
            # Collection doesn't exist, consider it already deleted
            return True

        # Delete the knowledge base using the OpenWebUI API
        response = await self.client.delete(f"/api/v1/knowledge/{knowledge_id}/delete")
        response.raise_for_status()

        self._knowledge_cache.pop(collection_name, None)
        LOGGER.info("Successfully deleted knowledge base: %s", collection_name)
        return True
    except httpx.HTTPStatusError as e:
        # Handle 404 as success (already deleted)
        if e.response.status_code == 404:
            # The cached ID points at something that no longer exists; keeping
            # it would make later store() calls target the dead ID.
            self._knowledge_cache.pop(collection_name, None)
            LOGGER.info("Knowledge base %s was already deleted or not found", collection_name)
            return True
        LOGGER.error(
            "OpenWebUI returned error %s when deleting knowledge base %s",
            e.response.status_code,
            collection_name,
            exc_info=e,
        )
        return False
    except httpx.ConnectError as e:
        LOGGER.error(
            "Failed to reach OpenWebUI when deleting knowledge base %s",
            collection_name,
            exc_info=e,
        )
        return False
    except httpx.RequestError as e:
        LOGGER.error(
            "Request error deleting knowledge base %s from OpenWebUI",
            collection_name,
            exc_info=e,
        )
        return False
    except Exception as e:
        LOGGER.error("Unexpected error deleting knowledge base %s", collection_name, exc_info=e)
        return False
class CollectionSummary(TypedDict):
    """Structure describing a knowledge base summary."""

    # Knowledge base name as reported by the API.
    name: str
    # Number of files found in the knowledge base.
    count: int
    # Estimated size in megabytes (derived from the file count, not measured).
    size_mb: float
async def describe_collections(self) -> list[CollectionSummary]:
    """Return metadata about each knowledge base.

    For every knowledge base with an ID the detail endpoint is queried for its
    file list; if that fails, or the entry has no ID, the ``files`` field of
    the list entry is used instead. A payload that is not a JSON array is
    treated as "no knowledge bases".

    Returns:
        One summary per knowledge base: name, file count and an estimated
        size of 0.5 MB per file (a rough heuristic).

    Raises:
        StorageError: If the knowledge base listing request fails.
    """

    def files_and_count(payload: dict[str, object]) -> tuple[object, int]:
        """Return the raw ``files`` value (null becomes ``[]``) and its length if a list."""
        files = payload.get("files", [])
        if files is None:
            files = []
        return files, (len(files) if isinstance(files, list) else 0)

    try:
        # First get the list of knowledge bases
        response = await self.client.get("/api/v1/knowledge/")
        response.raise_for_status()
        knowledge_bases = response.json()
        if not isinstance(knowledge_bases, list):
            LOGGER.warning(
                "OpenWebUI returned %s instead of a knowledge base list",
                type(knowledge_bases).__name__,
            )
            return []

        # Lazy %-style arguments: the (potentially large) payloads are only
        # formatted when the corresponding log level is enabled.
        LOGGER.info("OpenWebUI returned %s knowledge bases", len(knowledge_bases))
        LOGGER.debug("Knowledge bases structure: %s", knowledge_bases)

        collections: list[OpenWebUIStorage.CollectionSummary] = []
        for kb in knowledge_bases:
            if not isinstance(kb, dict):
                continue
            kb_id = kb.get("id")
            name = kb.get("name", "Unknown")
            LOGGER.info("Processing knowledge base: '%s' (ID: %s)", name, kb_id)
            LOGGER.debug("KB structure: %s", kb)

            # Count taken from the list entry; used when there is no ID or
            # the detail request fails.
            _, fallback_count = files_and_count(kb)
            count = fallback_count

            if kb_id:
                try:
                    LOGGER.debug(
                        "Fetching detailed info for KB '%s' from /api/v1/knowledge/%s",
                        name,
                        kb_id,
                    )
                    detail_response = await self.client.get(f"/api/v1/knowledge/{kb_id}")
                    detail_response.raise_for_status()
                    detailed_kb = detail_response.json()
                    LOGGER.debug("Detailed KB response: %s", detailed_kb)
                    if not isinstance(detailed_kb, dict):
                        raise TypeError(
                            f"unexpected detail payload type {type(detailed_kb).__name__}"
                        )
                    files, count = files_and_count(detailed_kb)
                    LOGGER.info(
                        "Knowledge base '%s' (ID: %s): found %s files", name, kb_id, count
                    )
                    if count > 0:
                        LOGGER.debug("First file structure: %s", files[0])
                    else:
                        LOGGER.warning(
                            "Knowledge base '%s' has 0 files. Files field type: %s, value: %s",
                            name,
                            type(files),
                            files,
                        )
                except Exception as e:
                    LOGGER.warning(
                        "Failed to get detailed info for KB '%s' (ID: %s): %s", name, kb_id, e
                    )
                    # Fallback to basic files list if detailed fetch fails
                    count = fallback_count
                    LOGGER.info("Fallback count for KB '%s': %s", name, count)

            size_mb = count * 0.5  # rough heuristic
            summary: OpenWebUIStorage.CollectionSummary = {
                "name": str(name),
                "count": int(count),
                "size_mb": float(size_mb),
            }
            collections.append(summary)
        return collections
    except Exception as e:
        raise StorageError(f"Failed to describe knowledge bases: {e}") from e
async def count(self, *, collection_name: str | None = None) -> int:
"""
Get document count for a specific collection (knowledge base).
Args:
collection_name: Name of the knowledge base to count documents for
Returns:
Number of documents in the collection, 0 if collection not found
"""
if not collection_name:
# If no collection name provided, return total across all collections
try:
collections = await self.describe_collections()
return sum(collection["count"] for collection in collections)
except Exception:
return 0
try:
# Get knowledge base by name and return its file count
kb = await self.get_knowledge_by_name(collection_name)
if not kb:
return 0
kb_id = kb.get("id")
if not kb_id:
return 0
# Get detailed knowledge base information to get accurate file count
detail_response = await self.client.get(f"/api/v1/knowledge/{kb_id}")
detail_response.raise_for_status()
detailed_kb = detail_response.json()
files = detailed_kb.get("files", [])
count = len(files) if isinstance(files, list) else 0
LOGGER.debug(f"Count for collection '{collection_name}': {count} files")
return count
except Exception as e:
LOGGER.warning(f"Failed to get count for collection '{collection_name}': {e}")
return 0
async def get_knowledge_by_name(self, name: str) -> dict[str, object] | None:
"""
Get knowledge base details by name.
Args:
name: Knowledge base name
Returns:
Knowledge base details or None if not found
"""
try:
response = await self.client.get("/api/v1/knowledge/list")
response.raise_for_status()
knowledge_bases = response.json()
return next(
(
{str(k): v for k, v in kb.items()}
for kb in knowledge_bases
if isinstance(kb, dict) and kb.get("name") == name
),
None,
)
except Exception as e:
raise StorageError(f"Failed to get knowledge base by name: {e}") from e
async def __aenter__(self) -> "OpenWebUIStorage":
    """Async context manager entry: initialize the adapter and return it.

    If initialization fails the client is closed before the error propagates.
    """
    try:
        await self.initialize()
    except BaseException:
        # __aexit__ is not invoked when __aenter__ raises, so without this the
        # HTTP client created in __init__ would be left open.
        await self.close()
        raise
    return self
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: object | None,
) -> None:
"""Async context manager exit."""
await self.close()
async def list_documents(
self,
limit: int = 100,
offset: int = 0,
*,
collection_name: str | None = None,
) -> list[dict[str, object]]:
"""
List documents (files) in a knowledge base.
NOTE: This is a basic implementation that attempts to extract file information
from OpenWebUI knowledge bases. The actual file listing capabilities depend
on the OpenWebUI API version and may not include detailed file metadata.
Args:
limit: Maximum number of documents to return
offset: Number of documents to skip
collection_name: Knowledge base name
Returns:
List of document dictionaries with available metadata
"""
try:
# Use the knowledge base name or fall back to default
kb_name = collection_name or self.config.collection_name or "default"
# Try to get knowledge base details
knowledge_base = await self.get_knowledge_by_name(kb_name)
if not knowledge_base:
# If specific KB not found, return empty list with a note
return []
# Extract files if available (API structure may vary)
files = knowledge_base.get("files", [])
# Handle different possible API response structures
if not isinstance(files, list):
# Some API versions might structure this differently
# Try to handle gracefully
return [
{
"id": "unknown",
"title": f"Knowledge Base: {kb_name}",
"source_url": "",
"description": "OpenWebUI knowledge base (file details not available)",
"content_type": "text/plain",
"content_preview": "Document listing not fully supported for OpenWebUI",
"word_count": 0,
"timestamp": "",
}
]
# Apply pagination
paginated_files = files[offset : offset + limit]
# Convert to document format with safe field access
documents: list[dict[str, object]] = []
for i, file_info in enumerate(paginated_files):
if not isinstance(file_info, dict):
continue
# Safely extract fields with fallbacks
doc_id = str(file_info.get("id", f"file_{i}"))
# Try multiple ways to get filename from OpenWebUI API response
filename = None
# Check direct filename field
if "filename" in file_info:
filename = file_info["filename"]
# Check name field
elif "name" in file_info:
filename = file_info["name"]
# Check meta.name (from FileModelResponse schema)
elif isinstance(file_info.get("meta"), dict):
filename = file_info["meta"].get("name")
# Final fallback
if not filename:
filename = f"file_{i}"
filename = str(filename)
# Extract size from meta if available
size = 0
if isinstance(file_info.get("meta"), dict):
size = file_info["meta"].get("size", 0)
else:
size = file_info.get("size", 0)
# Estimate word count from file size (very rough approximation)
word_count = max(1, int(size / 6)) if isinstance(size, (int, float)) else 0
documents.append(
{
"id": doc_id,
"title": filename,
"source_url": "", # OpenWebUI files don't typically have source URLs
"description": f"File: {filename}",
"content_type": str(file_info.get("content_type", "text/plain")),
"content_preview": f"File uploaded to OpenWebUI: {filename}",
"word_count": word_count,
"timestamp": str(
file_info.get("created_at") or file_info.get("timestamp", "")
),
}
)
return documents
except Exception as e:
# Since OpenWebUI file listing API structure is not guaranteed,
# we gracefully fall back rather than raise an error
import logging
logging.warning(f"OpenWebUI document listing failed: {e}")
# Return a placeholder entry indicating limited support
return [
{
"id": "api_error",
"title": f"Knowledge Base: {collection_name or 'default'}",
"source_url": "",
"description": "Document listing encountered an error - API compatibility issue",
"content_type": "text/plain",
"content_preview": f"Error: {str(e)[:100]}...",
"word_count": 0,
"timestamp": "",
}
]
async def close(self) -> None:
    """Close client connection.

    Safe to call when the client was never created; an error while closing is
    logged as a warning and not raised.
    """
    client = getattr(self, "client", None)
    if not client:
        return
    try:
        await client.aclose()
    except Exception as e:
        # Use the module logger (not the root logger) like the rest of the adapter.
        LOGGER.warning("Error closing OpenWebUI client: %s", e)