commit 94ddcfeff6
2025-09-15 12:35:42 -04:00
94 changed files with 9583 additions and 0 deletions


@@ -0,0 +1,11 @@
{
"permissions": {
"allow": [
"mcp__context7__resolve-library-id",
"mcp__context7__get-library-docs",
"mcp__sequential-thinking__sequentialthinking"
],
"deny": [],
"ask": []
}
}

.env Normal file

@@ -0,0 +1,51 @@
WEAVIATE_IS_LOCAL=True
# URL can be just a host or full URL; defaults shown below
WCD_URL=http://weaviate.yo # or http://localhost:8080
# LOCAL_WEAVIATE_PORT=8080 # optional override
# LOCAL_WEAVIATE_GRPC_PORT=50051 # optional override
# No API key required for local unless you enabled local auth
# WCD_API_KEY=
# API Keys (only if not using local/self-hosted services)
FIRECRAWL_API_KEY=dummy-key
OPENWEBUI_API_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6IjlmNjEwODg2LWRhM2MtNDQ4YS05OWE0LTYyZGEyZjIyZjJiNiJ9.W-dqabcE4F-LQ--k2yrJM_KEBDB-wi1CmoahlN1tQbY
OPENWEBUI_API_URL=http://chat.lab
WEAVIATE_API_KEY=
OPENAI_API_KEY=sk-1234
LLM_API_KEY=sk-1234
# Endpoints
LLM_ENDPOINT=http://llm.lab
WEAVIATE_ENDPOINT=http://weaviate.yo
OPENWEBUI_ENDPOINT=http://chat.lab
FIRECRAWL_ENDPOINT=http://crawl.lab:30002
# Model Configuration
EMBEDDING_MODEL=ollama/bge-m3:latest
EMBEDDING_DIMENSION=1024
# Ingestion Settings
BATCH_SIZE=50
MAX_FILE_SIZE=1000000
MAX_CRAWL_DEPTH=5
MAX_CRAWL_PAGES=100
# Storage Settings
DEFAULT_STORAGE_BACKEND=weaviate
COLLECTION_PREFIX=docs
# Prefect Settings
PREFECT_API_URL=http://prefect.lab/api
PREFECT_API_KEY=0nR4WAkQ3q9MY1bjqATK6pVmolighvrS
PREFECT_WORK_POOL=default
# Scheduling
DEFAULT_SCHEDULE_INTERVAL=60
# Performance
MAX_CONCURRENT_TASKS=5
REQUEST_TIMEOUT=60
# Logging
LOG_LEVEL=INFO
FIRST_START_ELYSIA='1'

.env.example Normal file

@@ -0,0 +1,39 @@
# API Keys (only if not using local/self-hosted services)
FIRECRAWL_API_KEY=
OPENWEBUI_API_KEY=
WEAVIATE_API_KEY=
# Endpoints
LLM_ENDPOINT=http://llm.lab
WEAVIATE_ENDPOINT=http://weaviate.yo
OPENWEBUI_ENDPOINT=http://chat.lab
FIRECRAWL_ENDPOINT=http://crawl.lab:30002
# Model Configuration
EMBEDDING_MODEL=ollama/bge-m3:latest
EMBEDDING_DIMENSION=1024
# Ingestion Settings
BATCH_SIZE=50
MAX_FILE_SIZE=1000000
MAX_CRAWL_DEPTH=5
MAX_CRAWL_PAGES=100
# Storage Settings
DEFAULT_STORAGE_BACKEND=weaviate
COLLECTION_PREFIX=docs
# Prefect Settings
PREFECT_API_URL=
PREFECT_API_KEY=
PREFECT_WORK_POOL=default
# Scheduling
DEFAULT_SCHEDULE_INTERVAL=60
# Performance
MAX_CONCURRENT_TASKS=5
REQUEST_TIMEOUT=60
# Logging
LOG_LEVEL=INFO

CLAUDE.md Normal file

@@ -0,0 +1,100 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## Project Overview
This is a modular document ingestion pipeline that uses Prefect to orchestrate ingestion from web/documentation sites (via Firecrawl) and Git repositories (via Repomix) into a Weaviate vector database or Open WebUI knowledge endpoints.
## Development Commands
### Environment Setup
```bash
# Install dependencies using uv (required)
uv sync
# Activate virtual environment
source .venv/bin/activate
# Install repomix globally (required for repository ingestion)
npm install -g repomix
# Configure environment
cp .env.example .env
# Edit .env with your settings
```
### Running the Application
```bash
# One-time ingestion
python -m ingest_pipeline ingest <url> --type web --storage weaviate
# Schedule recurring ingestion
python -m ingest_pipeline schedule <name> <url> --type web --storage weaviate --cron "0 2 * * *"
# Start deployment server
python -m ingest_pipeline serve
# View configuration
python -m ingest_pipeline config
```
### Code Quality
```bash
# Run linting
uv run ruff check .
uv run ruff format .
# Type checking
uv run mypy ingest_pipeline
# Install dev dependencies
uv sync --dev
```
## Architecture
The pipeline follows a modular architecture with clear separation of concerns:
- **Ingestors** (`ingest_pipeline/ingestors/`): Abstract base class pattern for different data sources (Firecrawl for web, Repomix for repositories); a minimal sketch of this pattern follows the list
- **Storage Adapters** (`ingest_pipeline/storage/`): Abstract base class for storage backends (Weaviate, Open WebUI)
- **Prefect Flows** (`ingest_pipeline/flows/`): Orchestration layer using Prefect for scheduling and task management
- **CLI** (`ingest_pipeline/cli/main.py`): Typer-based command interface with commands: `ingest`, `schedule`, `serve`, `config`
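
A minimal sketch of the ingestor pattern (illustrative only; the real definitions live in `ingest_pipeline/ingestors/base.py` and method names may differ):

```python
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator


class BaseIngestor(ABC):
    """Interface implemented by the Firecrawl and Repomix ingestors."""

    @abstractmethod
    async def validate_source(self, url: str) -> bool:
        """Cheap pre-flight check before a full ingestion run."""

    @abstractmethod
    def ingest(self, url: str) -> AsyncIterator[dict[str, str]]:
        """Yield normalized documents from the source."""
```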
## Key Implementation Details
### Type Safety
- Strict typing enforced with no `Any` types allowed
- Modern typing syntax using `|` instead of `Union`
- Pydantic v2+ for all models and settings
- All models in `core/models.py` use TypedDict for metadata and strict Pydantic models
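
An illustrative shape for these models (field names are assumptions; see `core/models.py` for the real ones):

```python
from typing import TypedDict

from pydantic import BaseModel, ConfigDict


class DocumentMetadata(TypedDict):
    """Structured metadata attached to each ingested document."""
    source_url: str
    tags: list[str]


class Document(BaseModel):
    """Strict Pydantic v2 model: unknown fields are rejected, no implicit Any."""
    model_config = ConfigDict(strict=True, extra="forbid")

    title: str
    content: str
    metadata: DocumentMetadata
```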
### Configuration Management
- Settings loaded from `.env` file via Pydantic Settings
- Cached singleton pattern in `config/settings.py` using `@lru_cache`
- Environment-specific endpoints configured for local services (llm.lab, weaviate.yo, chat.lab)
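
The cached-singleton pattern reduces to roughly this (a sketch; the real `Settings` class defines many more fields):

```python
from functools import lru_cache

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Loaded from .env once per process; see .env.example for the full field list."""
    model_config = SettingsConfigDict(env_file=".env")

    llm_endpoint: str = "http://llm.lab"
    weaviate_endpoint: str = "http://weaviate.yo"
    embedding_model: str = "ollama/bge-m3:latest"


@lru_cache
def get_settings() -> Settings:
    """Return the cached, process-wide Settings instance."""
    return Settings()
```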
### Flow Orchestration
- Main ingestion flow in `flows/ingestion.py` with retry logic and task decorators
- Deployment scheduling in `flows/scheduler.py` supporting both cron and interval schedules
- Tasks use Prefect's `@task` decorator with retries and tags for monitoring
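
A hedged sketch of the task/flow shape (retry counts and names are illustrative, not copied from `flows/ingestion.py`):

```python
from prefect import flow, task


@task(retries=3, retry_delay_seconds=30, tags=["ingestion"])
async def fetch_documents(url: str) -> list[str]:
    """Fetch raw documents; Prefect retries transient failures automatically."""
    return []  # the real task delegates to the Firecrawl/Repomix ingestors


@flow(name="ingestion")
async def ingestion_flow(url: str) -> int:
    """Orchestrate fetch -> store; returns the number of documents fetched."""
    documents = await fetch_documents(url)
    return len(documents)
```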
### Storage Backends
- Weaviate: Uses batch ingestion with configurable batch size, automatic collection creation
- Open WebUI: Direct API integration for knowledge base management
- Both inherit from abstract `BaseStorage` class ensuring consistent interface
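
Roughly, the shared interface looks like this (only `initialize` and `search` are confirmed by CLI usage; `store_batch` is an assumed name):

```python
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator


class BaseStorage(ABC):
    """Interface shared by the Weaviate and Open WebUI adapters."""

    @abstractmethod
    async def initialize(self) -> None:
        """Connect and create the target collection if it is missing."""

    @abstractmethod
    async def store_batch(self, documents: list[dict[str, str]]) -> int:
        """Persist a batch of documents; return how many were stored."""

    @abstractmethod
    def search(self, query: str, limit: int = 10) -> AsyncIterator[object]:
        """Yield documents matching the query."""
```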
## Service Endpoints
- **LLM Proxy**: http://llm.lab (for embeddings and processing)
- **Weaviate**: http://weaviate.yo (vector database)
- **Open WebUI**: http://chat.lab (knowledge interface)
- **Firecrawl**: http://crawl.lab:30002 (web crawling service)
## Important Constraints
- Cyclomatic complexity must remain < 15 for all functions
- Maximum file size for ingestion: 1MB
- Batch size limits: 50-500 documents
- Concurrent task limit: 5 (configurable via MAX_CONCURRENT_TASKS)
- All async operations use proper async/await patterns

README.md Normal file

@@ -0,0 +1,150 @@
# Document Ingestion Pipeline
A modular, type-safe Python application using Prefect for scheduling ingestion jobs from web/documentation sites (via Firecrawl) and Git repositories (via Repomix) into Weaviate or Open WebUI knowledge endpoints.
## Features
- **Multiple Data Sources**:
- Web/documentation sites via Firecrawl
- Git repositories via Repomix
- **Multiple Storage Backends**:
- Weaviate vector database (self-hosted at http://weaviate.yo)
- Open WebUI knowledge endpoints (http://chat.lab)
- **Scheduling & Orchestration**:
- Prefect-based workflow orchestration
- Cron and interval-based scheduling
- Concurrent task execution
- **Type Safety**:
- Strict Python typing with no `Any` types
- Modern typing syntax (using `|` instead of `Union`)
- Pydantic models for validation
- **Code Quality**:
- Modular architecture
- Cyclomatic complexity < 15
- Clean separation of concerns
## Installation
```bash
# Install dependencies using uv (the supported workflow; see CLAUDE.md)
uv sync
# Install repomix globally (required for repository ingestion)
npm install -g repomix
# Copy and configure environment
cp .env.example .env
# Edit .env with your settings
```
## Usage
### One-time Ingestion
```bash
# Ingest a documentation site into Weaviate
python -m ingest_pipeline ingest https://docs.example.com --type web --storage weaviate
# Ingest a repository into Open WebUI
python -m ingest_pipeline ingest https://github.com/user/repo --type repository --storage open_webui
```
### Scheduled Ingestion
```bash
# Create a daily documentation crawl
python -m ingest_pipeline schedule daily-docs https://docs.example.com \
--type documentation \
--storage weaviate \
--cron "0 2 * * *"
# Create an hourly repository sync
python -m ingest_pipeline schedule repo-sync https://github.com/user/repo \
--type repository \
--storage open_webui \
--interval 60
```
### Serve Deployments
```bash
# Start serving scheduled deployments
python -m ingest_pipeline serve
```
### Configuration
```bash
# View current configuration
python -m ingest_pipeline config
```
## Architecture
```
ingest_pipeline/
├── core/ # Core models and exceptions
│ ├── models.py # Pydantic models with strict typing
│ └── exceptions.py # Custom exceptions
├── ingestors/ # Data source ingestors
│ ├── base.py # Abstract base ingestor
│ ├── firecrawl.py # Web/docs ingestion via Firecrawl
│ └── repomix.py # Repository ingestion via Repomix
├── storage/ # Storage adapters
│ ├── base.py # Abstract base storage
│ ├── weaviate.py # Weaviate adapter
│ └── openwebui.py # Open WebUI adapter
├── flows/ # Prefect flows
│ ├── ingestion.py # Main ingestion flow
│ └── scheduler.py # Deployment scheduling
├── config/ # Configuration management
│ └── settings.py # Settings with Pydantic
├── utils/ # Utilities
│ └── vectorizer.py # Text vectorization
└── cli/ # CLI interface
└── main.py # Typer-based CLI
```
## Environment Variables
- `FIRECRAWL_API_KEY`: API key for Firecrawl (optional)
- `LLM_ENDPOINT`: LLM proxy endpoint (default: http://llm.lab)
- `WEAVIATE_ENDPOINT`: Weaviate endpoint (default: http://weaviate.yo)
- `OPENWEBUI_ENDPOINT`: Open WebUI endpoint (default: http://chat.lab)
- `EMBEDDING_MODEL`: Model for embeddings (default: ollama/bge-m3:latest)
## Vectorization
The pipeline uses your LLM proxy at http://llm.lab with:
- Model: `ollama/gpt-oss:20b` for processing
- Embeddings: `ollama/bge-m3:latest` for vectorization
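
Assuming the proxy exposes an OpenAI-compatible `/v1/embeddings` route (an assumption; adjust the path for your proxy), fetching a vector looks roughly like:

```python
import os

import httpx


def embed(text: str) -> list[float]:
    """Fetch one embedding from the LLM proxy (OpenAI-style API assumed)."""
    response = httpx.post(
        "http://llm.lab/v1/embeddings",
        headers={"Authorization": f"Bearer {os.environ['LLM_API_KEY']}"},
        json={"model": "ollama/bge-m3:latest", "input": text},
        timeout=60,
    )
    response.raise_for_status()
    # Returned vectors should match EMBEDDING_DIMENSION=1024 for bge-m3
    return response.json()["data"][0]["embedding"]
```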
## Storage Backends
### Weaviate
- Endpoint: http://weaviate.yo
- Automatic collection creation
- Vector similarity search
- Batch ingestion support
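
A hedged sketch of batch ingestion with the v4 Python client (collection and property names are illustrative):

```python
import weaviate
from weaviate.util import generate_uuid5

# Assumes http://weaviate.yo serves on port 80 and gRPC on the default 50051;
# adjust host/port to your deployment.
client = weaviate.connect_to_local(host="weaviate.yo", port=80)
try:
    docs = client.collections.get("docs_example")
    with docs.batch.dynamic() as batch:
        batch.add_object(
            properties={"title": "Example", "content": "Hello, Weaviate"},
            uuid=generate_uuid5("https://docs.example.com/example"),
        )
finally:
    client.close()
```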
### Open WebUI
- Endpoint: http://chat.lab/docs
- Knowledge base integration
- Direct API access
- Document management
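
Listing knowledge bases uses the same `/api/v1/knowledge/` route the pipeline's `list-collections` command calls internally; a minimal standalone sketch:

```python
import asyncio
import os

import httpx


async def list_knowledge_bases() -> None:
    """Print each knowledge base and its file count."""
    async with httpx.AsyncClient(
        base_url="http://chat.lab",
        headers={"Authorization": f"Bearer {os.environ['OPENWEBUI_API_KEY']}"},
    ) as client:
        response = await client.get("/api/v1/knowledge/")
        response.raise_for_status()
        for kb in response.json():
            print(kb.get("name", "Unknown"), "-", len(kb.get("files", [])), "files")


asyncio.run(list_knowledge_bases())
```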
## Development
The codebase follows strict typing and quality standards:
- No use of `Any` type
- Modern Python typing syntax
- Cyclomatic complexity < 15
- Modular, testable architecture
## License
MIT

basedpyright.json Normal file

@@ -0,0 +1,24 @@
{
"include": [
"ingest_pipeline"
],
"exclude": [
"**/__pycache__",
"**/.pytest_cache",
"**/node_modules",
".venv"
],
"reportCallInDefaultInitializer": "none",
"reportUnknownVariableType": "warning",
"reportUnknownMemberType": "warning",
"reportUnknownArgumentType": "warning",
"reportUnknownLambdaType": "warning",
"reportUnknownParameterType": "warning",
"reportMissingParameterType": "warning",
"reportUnannotatedClassAttribute": "warning",
"reportAny": "warning",
"reportUnusedCallResult": "none",
"reportUnnecessaryIsInstance": "none",
"reportImplicitOverride": "none",
"reportDeprecated": "warning"
}

docs/elysia.md Normal file

@@ -0,0 +1,248 @@
38 async def output_resources(): │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/services/use │
│ r.py:223 in check_all_trees_timeout │
│ │
│ 220 │ │ Check all trees in all TreeManagers across all users and remove any │
│ not been active in the last tree_timeout. │
│ 221 │ │ """ │
│ 222 │ │ for user_id in self.users: │
│ ❱ 223 │ │ │ self.users[user_id]["tree_manager"].check_all_trees_timeout() │
│ 224 │ │
│ 225 │ def check_user_timeout(self, user_id: str): │
│ 226 │ │ """ │
╰───────────────────────────────────────────────────────────────────────────────────╯
KeyError: 'tree_manager'
[10:08:31] ERROR Job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 10:09:00 base.py:195
EDT)" raised an exception
╭──────────────────────── Traceback (most recent call last) ────────────────────────╮
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/apscheduler/executors/b │
│ ase.py:181 in run_coroutine_job │
│ │
│ 178 │ │ │
│ 179 │ │ logger.info('Running job "%s" (scheduled at %s)', job, run_time) │
│ 180 │ │ try: │
│ ❱ 181 │ │ │ retval = await job.func(*job.args, **job.kwargs) │
│ 182 │ │ except BaseException: │
│ 183 │ │ │ exc, tb = sys.exc_info()[1:] │
│ 184 │ │ │ formatted_tb = "".join(format_tb(tb)) │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/app.py:35 in │
│ check_timeouts │
│ │
│ 32 │
│ 33 async def check_timeouts(): │
│ 34 │ user_manager = get_user_manager() │
│ ❱ 35 │ await user_manager.check_all_trees_timeout() │
│ 36 │
│ 37 │
│ 38 async def output_resources(): │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/services/use │
│ r.py:223 in check_all_trees_timeout │
│ │
│ 220 │ │ Check all trees in all TreeManagers across all users and remove any │
│ not been active in the last tree_timeout. │
│ 221 │ │ """ │
│ 222 │ │ for user_id in self.users: │
│ ❱ 223 │ │ │ self.users[user_id]["tree_manager"].check_all_trees_timeout() │
│ 224 │ │
│ 225 │ def check_user_timeout(self, user_id: str): │
│ 226 │ │ """ │
╰───────────────────────────────────────────────────────────────────────────────────╯
KeyError: 'tree_manager'
[10:26:25] WARNING Run time of job "check_restart_clients (trigger: interval[0:00:31], next run at: base.py:176
2025-09-15 10:26:33 EDT)" was missed by 0:00:23.029499
WARNING Run time of job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 base.py:176
10:26:53 EDT)" was missed by 0:00:01.030848
WARNING Run time of job "output_resources (trigger: interval[0:18:23], next run at: base.py:176
2025-09-15 10:33:44 EDT)" was missed by 0:11:04.063842
[10:41:41] WARNING Run time of job "check_restart_clients (trigger: interval[0:00:31], next run at: base.py:176
2025-09-15 10:42:03 EDT)" was missed by 0:00:09.036380
WARNING Run time of job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 base.py:176
10:41:52 EDT)" was missed by 0:00:18.037363
WARNING Run time of job "output_resources (trigger: interval[0:18:23], next run at: base.py:176
2025-09-15 10:52:07 EDT)" was missed by 0:07:57.071763
[10:51:25] WARNING Run time of job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 base.py:176
10:51:32 EDT)" was missed by 0:00:21.808772
WARNING Run time of job "check_restart_clients (trigger: interval[0:00:31], next run at: base.py:176
2025-09-15 10:51:52 EDT)" was missed by 0:00:03.810823
[10:51:32] ERROR Job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 10:52:01 base.py:195
EDT)" raised an exception
╭──────────────────────── Traceback (most recent call last) ────────────────────────╮
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/apscheduler/executors/b │
│ ase.py:181 in run_coroutine_job │
│ │
│ 178 │ │ │
│ 179 │ │ logger.info('Running job "%s" (scheduled at %s)', job, run_time) │
│ 180 │ │ try: │
│ ❱ 181 │ │ │ retval = await job.func(*job.args, **job.kwargs) │
│ 182 │ │ except BaseException: │
│ 183 │ │ │ exc, tb = sys.exc_info()[1:] │
│ 184 │ │ │ formatted_tb = "".join(format_tb(tb)) │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/app.py:35 in │
│ check_timeouts │
│ │
│ 32 │
│ 33 async def check_timeouts(): │
│ 34 │ user_manager = get_user_manager() │
│ ❱ 35 │ await user_manager.check_all_trees_timeout() │
│ 36 │
│ 37 │
│ 38 async def output_resources(): │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/services/use │
│ r.py:223 in check_all_trees_timeout │
│ │
│ 220 │ │ Check all trees in all TreeManagers across all users and remove any │
│ not been active in the last tree_timeout. │
│ 221 │ │ """ │
│ 222 │ │ for user_id in self.users: │
│ ❱ 223 │ │ │ self.users[user_id]["tree_manager"].check_all_trees_timeout() │
│ 224 │ │
│ 225 │ def check_user_timeout(self, user_id: str): │
│ 226 │ │ """ │
╰───────────────────────────────────────────────────────────────────────────────────╯
KeyError: 'tree_manager'
[10:51:43] ERROR Unexpected error: 'client_manager' error_handlers.py:32
INFO: 127.0.0.1:50043 - "GET /feedback/metadata/b6c0f65db8197395b453a7777a5e4c44 HTTP/1.1" 500 Internal Server Error
ERROR: Exception in ASGI application
Traceback (most recent call last):
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/uvicorn/protocols/http/httptools_impl.py", line 409, in run_asgi
result = await app( # type: ignore[func-returns-value]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
return await self.app(scope, receive, send)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/fastapi/applications.py", line 1054, in __call__
await super().__call__(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/applications.py", line 113, in __call__
await self.middleware_stack(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/middleware/errors.py", line 186, in __call__
raise exc
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/middleware/errors.py", line 164, in __call__
await self.app(scope, receive, _send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/middleware/cors.py", line 85, in __call__
await self.app(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/middleware/exceptions.py", line 63, in __call__
await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
raise exc
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
await app(scope, receive, sender)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 716, in __call__
await self.middleware_stack(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 736, in app
await route.handle(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 290, in handle
await self.app(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 78, in app
await wrap_app_handling_exceptions(app, request)(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
raise exc
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
await app(scope, receive, sender)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 75, in app
response = await f(request)
^^^^^^^^^^^^^^^^
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/fastapi/routing.py", line 302, in app
raw_response = await run_endpoint_function(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/fastapi/routing.py", line 213, in run_endpoint_function
return await dependant.call(**values)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/routes/feedback.py", line 81, in run_feedback_metadata
client_manager: ClientManager = user["client_manager"]
~~~~^^^^^^^^^^^^^^^^^^
KeyError: 'client_manager'
ERROR HTTP error occurred: Not Found error_handlers.py:14
INFO: 127.0.0.1:50045 - "GET /icon.svg?d6c34577c7161f78 HTTP/1.1" 404 Not Found
INFO: 127.0.0.1:50045 - "GET /user/config/models HTTP/1.1" 200 OK
INFO: 127.0.0.1:50054 - "GET /user/config/models HTTP/1.1" 200 OK
[10:52:01] ERROR Job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 10:52:30 base.py:195
EDT)" raised an exception
╭──────────────────────── Traceback (most recent call last) ────────────────────────╮
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/apscheduler/executors/b │
│ ase.py:181 in run_coroutine_job │
│ │
│ 178 │ │ │
│ 179 │ │ logger.info('Running job "%s" (scheduled at %s)', job, run_time) │
│ 180 │ │ try: │
│ ❱ 181 │ │ │ retval = await job.func(*job.args, **job.kwargs) │
│ 182 │ │ except BaseException: │
│ 183 │ │ │ exc, tb = sys.exc_info()[1:] │
│ 184 │ │ │ formatted_tb = "".join(format_tb(tb)) │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/app.py:35 in │
│ check_timeouts │
│ │
│ 32 │
│ 33 async def check_timeouts(): │
│ 34 │ user_manager = get_user_manager() │
│ ❱ 35 │ await user_manager.check_all_trees_timeout() │
│ 36 │
│ 37 │
│ 38 async def output_resources(): │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/services/use │
│ r.py:223 in check_all_trees_timeout │
│ │
│ 220 │ │ Check all trees in all TreeManagers across all users and remove any │
│ not been active in the last tree_timeout. │
│ 221 │ │ """ │
│ 222 │ │ for user_id in self.users: │
│ ❱ 223 │ │ │ self.users[user_id]["tree_manager"].check_all_trees_timeout() │
│ 224 │ │
│ 225 │ def check_user_timeout(self, user_id: str): │
│ 226 │ │ """ │
╰───────────────────────────────────────────────────────────────────────────────────╯
KeyError: 'tree_manager'
[10:52:07] ERROR Job "output_resources (trigger: interval[0:18:23], next run at: 2025-09-15 11:10:30 base.py:195
EDT)" raised an exception
╭──────────────────────── Traceback (most recent call last) ────────────────────────╮
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/apscheduler/executors/b │
│ ase.py:181 in run_coroutine_job │
│ │
│ 178 │ │ │
│ 179 │ │ logger.info('Running job "%s" (scheduled at %s)', job, run_time) │
│ 180 │ │ try: │
│ ❱ 181 │ │ │ retval = await job.func(*job.args, **job.kwargs) │
│ 182 │ │ except BaseException: │
│ 183 │ │ │ exc, tb = sys.exc_info()[1:] │
│ 184 │ │ │ formatted_tb = "".join(format_tb(tb)) │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/app.py:40 in │
│ output_resources │
│ │
│ 37 │
│ 38 async def output_resources(): │
│ 39 │ user_manager = get_user_manager() │
│ ❱ 40 │ await print_resources(user_manager, save_to_file=True) │
│ 41 │
│ 42 │
│ 43 async def check_restart_clients(): │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/utils/resour │
│ ces.py:59 in print_resources │
│ │
│ 56 │ user_manager: UserManager | None = None, save_to_file: bool = False │
│ 57 ): │
│ 58 │ if user_manager is not None: │
│ ❱ 59 │ │ avg_user_memory, avg_tree_memory = await get_average_user_memory(us │
│ 60 │ │ # avg_user_requests = await get_average_user_requests(user_manager) │
│ 61 │ │ # num_users_db = await get_number_local_users_db(user_manager) │
│ 62 │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/utils/resour │
│ ces.py:37 in get_average_user_memory │
│ │
│ 34 │ avg_tree_memory = 0 │
│ 35 │ for user in user_manager.users.values(): │
│ 36 │ │ user_memory = 0 │
│ ❱ 37 │ │ for tree in user["tree_manager"].trees.values(): │
│ 38 │ │ │ user_memory += tree["tree"].detailed_memory_usage()["total"] / │
│ 39 │ │ │
│ 40 │ │ if len(user["tree_manager"].trees) > 0: │
╰───────────────────────────────────────────────────────────────────────────────────╯
KeyError: 'tree_manager'

docs/tagging.md Normal file

@@ -0,0 +1,108 @@
Here are clear written examples of **metadata tagging** in both Open WebUI and Weaviate, showing how you can associate tags and structured metadata with knowledge objects for RAG and semantic search.
***
### Example: Metadata Tagging in Open WebUI
You send a document to the Open WebUI API endpoint, attaching metadata and tags in the content field as a JSON string:
```http
POST http://localhost/api/v1/documents/create
Content-Type: application/json
{
"name": "policy_doc_2022",
"title": "2022 Policy Handbook",
"collection_name": "company_handbooks",
"filename": "policy_2022.pdf",
"content": "{\"tags\": [\"policy\", \"2022\", \"hr\"], \"source_url\": \"https://example.com/policy_2022.pdf\", \"author\": \"Jane Doe\"}"
}
```
- The `"tags"` field is a list of labels for classification (policy, 2022, hr).
- The `"source_url"` and `"author"` fields provide additional metadata useful for retrieval, audit, and filtering.[1][2]
For pipeline-based ingestion, you might design a function to extract and append metadata before vectorization:
```python
# Illustrative only: embed_with_metadata, chunk, document_url, and
# document_author are placeholders for your pipeline's own objects.
metadata = {
    "tags": ["policy", "2022"],
    "source_url": document_url,
    "author": document_author
}
embed_with_metadata(chunk, metadata)
```
This metadata becomes part of your retrieval context in RAG workflows.[1]
***
### Example: Metadata Tagging in Weaviate
In Weaviate, metadata and tags are defined directly in the schema and attached to each object when added:
**Schema definition:**
```json
{
"class": "Document",
"properties": [
{"name": "title", "dataType": ["text"]},
{"name": "tags", "dataType": ["text[]"]},
{"name": "source_url", "dataType": ["text"]},
{"name": "author", "dataType": ["text"]}
]
}
```
**Object creation example:**
```python
client.data_object.create(
data_object={
"title": "2022 Policy Handbook",
"tags": ["policy", "2022", "hr"],
"source_url": "https://example.com/policy_2022.pdf",
"author": "Jane Doe"
},
class_name="Document"
)
```
- The `"tags"` field is a text array, ideal for semantic filtering and faceting.
- Other fields store provenance metadata, supporting advanced queries and data governance.[3][4][5]
**Query with metadata filtering:**
```python
result = (
client.query
.get("Document", ["title", "tags", "author"])
.with_where({"path": ["tags"], "operator": "ContainsAny", "valueTextArray": ["policy", "hr"]})
.do()
)
```
This retrieves documents classified with either "policy" or "hr" tags.[4][3]
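The query above uses the legacy v3 client. With the v4 Python client, the same filter reads roughly as follows (assuming a connected v4 `client`):
```python
from weaviate.classes.query import Filter

documents = client.collections.get("Document")
result = documents.query.fetch_objects(
    filters=Filter.by_property("tags").contains_any(["policy", "hr"]),
    return_properties=["title", "tags", "author"],
)
for obj in result.objects:
    print(obj.properties["title"], obj.properties["tags"])
```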
***
Both platforms support **metadata tagging** for documents, which enables powerful RAG scenarios, detailed filtering, and context-rich retrievals.[5][2][3][4][1]
[1](https://www.reddit.com/r/OpenWebUI/comments/1hmmg9a/how_to_handle_metadata_during_vectorization/)
[2](https://github.com/open-webui/open-webui/discussions/4692)
[3](https://stackoverflow.com/questions/75006703/query-large-list-of-metadate-in-weaviate)
[4](https://weaviate.io/blog/enterprise-workflow-langchain-weaviate)
[5](https://docs.weaviate.io/academy/py/zero_to_mvp/schema_and_imports/schema)
[6](https://docs.weaviate.io/weaviate/api/graphql/additional-properties)
[7](https://weaviate.io/blog/sycamore-and-weaviate)
[8](https://docs.llamaindex.ai/en/stable/examples/vector_stores/WeaviateIndex_auto_retriever/)
[9](https://forum.weaviate.io/t/recommendations-for-metadata-or-knowledge-graphs/960)
[10](https://weaviate.io/blog/agent-workflow-automation-n8n-weaviate)
[11](https://github.com/open-webui/open-webui/discussions/9804)
[12](https://docs.quarkiverse.io/quarkus-langchain4j/dev/rag-weaviate.html)
[13](https://github.com/weaviate/weaviate-examples)
[14](https://docs.openwebui.com/getting-started/api-endpoints/)
[15](https://weaviate.io/blog/hybrid-search-for-web-developers)
[16](https://dev.to/stephenc222/how-to-use-weaviate-to-store-and-query-vector-embeddings-4b9b)
[17](https://helpdesk.egnyte.com/hc/en-us/articles/360035813612-Using-Metadata-in-the-WebUI)
[18](https://docs.datadoghq.com/integrations/weaviate/)
[19](https://docs.openwebui.com/features/)
[20](https://documentation.suse.com/suse-ai/1.0/html/openwebui-configuring/index.html)
[21](https://docs.openwebui.com/getting-started/env-configuration/)

ingest_pipeline/.env Normal file

@@ -0,0 +1,38 @@
# API Keys
FIRECRAWL_API_KEY=fc-your-api-key
OPENWEBUI_API_KEY=
WEAVIATE_API_KEY=
# Endpoints
LLM_ENDPOINT=http://llm.lab
WEAVIATE_ENDPOINT=http://weaviate.yo
OPENWEBUI_ENDPOINT=http://chat.lab
# Model Configuration
EMBEDDING_MODEL=ollama/bge-m3:latest
EMBEDDING_DIMENSION=1024
# Ingestion Settings
BATCH_SIZE=50
MAX_FILE_SIZE=1000000
MAX_CRAWL_DEPTH=5
MAX_CRAWL_PAGES=100
# Storage Settings
DEFAULT_STORAGE_BACKEND=weaviate
COLLECTION_PREFIX=docs
# Prefect Settings
PREFECT_API_URL=http://prefect.lab
PREFECT_API_KEY=0nR4WAkQ3q9MY1bjqATK6pVmolighvrS
PREFECT_WORK_POOL=default
# Scheduling
DEFAULT_SCHEDULE_INTERVAL=60
# Performance
MAX_CONCURRENT_TASKS=5
REQUEST_TIMEOUT=60
# Logging
LOG_LEVEL=INFO


@@ -0,0 +1,6 @@
"""Main entry point for the ingestion pipeline."""
from .cli.main import app
if __name__ == "__main__":
app()

Binary file not shown.


@@ -0,0 +1,5 @@
"""CLI module for the ingestion pipeline."""
from .main import app
__all__ = ["app"]

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

ingest_pipeline/cli/main.py Normal file

@@ -0,0 +1,616 @@
"""CLI interface for ingestion pipeline."""
import asyncio
from enum import Enum
import typer
from rich.console import Console
from rich.panel import Panel
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
from rich.table import Table
from ..config import get_settings
from ..core.models import IngestionResult
from ..flows.ingestion import create_ingestion_flow
from ..flows.scheduler import create_scheduled_deployment, serve_deployments
class SourceType(str, Enum):
"""Source types for ingestion."""
web = "web"
repository = "repository"
documentation = "documentation"
class StorageBackend(str, Enum):
"""Storage backend options."""
weaviate = "weaviate"
open_webui = "open_webui"
app = typer.Typer(
name="ingest",
help="🚀 Modern Document Ingestion Pipeline - Advanced web and repository processing",
rich_markup_mode="rich",
add_completion=False,
)
console = Console()
@app.callback()
def main(
version: bool = typer.Option(False, "--version", "-v", help="Show version information"),
) -> None:
"""
🚀 Modern Document Ingestion Pipeline
[bold cyan]Advanced document processing and management platform[/bold cyan]
Features:
• 🌐 Web scraping and crawling with Firecrawl
• 📦 Repository ingestion with Repomix
• 🗄️ Multiple storage backends (Weaviate, OpenWebUI)
• 📊 Modern TUI for collection management
• ⚡ Async processing with Prefect orchestration
• 🎨 Rich CLI with enhanced visuals
"""
if version:
console.print(
Panel(
"[bold magenta]Ingest Pipeline v0.1.0[/bold magenta]\n"
"[dim]Modern Document Ingestion & Management System[/dim]",
title="🚀 Version Info",
border_style="magenta"
)
)
raise typer.Exit()
@app.command()
def ingest(
source_url: str = typer.Argument(..., help="URL or path to ingest from"),
source_type: SourceType = typer.Option(SourceType.web, "--type", "-t", help="Type of source"),
storage: StorageBackend = typer.Option(
StorageBackend.weaviate, "--storage", "-s", help="Storage backend"
),
collection: str = typer.Option(
None, "--collection", "-c", help="Target collection name (auto-generated if not specified)"
),
validate: bool = typer.Option(
True, "--validate/--no-validate", help="Validate source before ingesting"
),
) -> None:
"""
🚀 Run a one-time ingestion job with enhanced progress tracking.
This command processes documents from various sources and stores them in
your chosen backend with full progress visualization.
"""
# Enhanced startup message
console.print(
Panel(
f"[bold cyan]🚀 Starting Modern Ingestion[/bold cyan]\n\n"
f"[yellow]Source:[/yellow] {source_url}\n"
f"[yellow]Type:[/yellow] {source_type.value.title()}\n"
f"[yellow]Storage:[/yellow] {storage.value.replace('_', ' ').title()}\n"
f"[yellow]Collection:[/yellow] {collection or '[dim]Auto-generated[/dim]'}",
title="🔥 Ingestion Configuration",
border_style="cyan"
)
)
async def run_with_progress() -> IngestionResult:
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
console=console,
) as progress:
task = progress.add_task("🔄 Processing documents...", total=100)
# Simulate progress updates during ingestion
progress.update(task, advance=20, description="🔗 Connecting to services...")
await asyncio.sleep(0.5)
progress.update(task, advance=30, description="📄 Fetching documents...")
result = await run_ingestion(
url=source_url,
source_type=source_type.value,
storage_backend=storage.value,
collection_name=collection,
validate_first=validate,
)
progress.update(task, advance=50, description="✅ Ingestion complete!")
return result
result = asyncio.run(run_with_progress())
# Enhanced results display
status_color = "green" if result.status.value == "completed" else "red"
# Create results table with enhanced styling
table = Table(
title="📊 Ingestion Results",
title_style="bold magenta",
border_style="cyan",
header_style="bold blue"
)
table.add_column("📋 Metric", style="cyan", no_wrap=True)
table.add_column("📈 Value", style=status_color, justify="right")
# Add enhanced status icon
status_icon = "✅" if result.status.value == "completed" else "❌"
table.add_row("Status", f"{status_icon} {result.status.value.title()}")
table.add_row("Documents Processed", f"📄 {result.documents_processed:,}")
table.add_row("Documents Failed", f"⚠️ {result.documents_failed:,}")
table.add_row("Duration", f"⏱️ {result.duration_seconds:.2f}s")
if result.error_messages:
error_text = "\n".join(f"• {error}" for error in result.error_messages[:3])
if len(result.error_messages) > 3:
error_text += f"\n... and {len(result.error_messages) - 3} more errors"
table.add_row("Errors", error_text)
console.print(table)
# Success celebration or error guidance
if result.status.value == "completed" and result.documents_processed > 0:
console.print(
Panel(
f"🎉 [bold green]Success![/bold green] {result.documents_processed} documents ingested\n\n"
f"💡 [dim]Try '[bold cyan]ingest modern[/bold cyan]' to explore your collections![/dim]",
title="✨ Ingestion Complete",
border_style="green"
)
)
elif result.error_messages:
console.print(
Panel(
"❌ [bold red]Ingestion encountered errors[/bold red]\n\n"
"💡 [dim]Check your configuration and try again[/dim]",
title="⚠️ Issues Detected",
border_style="red"
)
)
@app.command()
def schedule(
name: str = typer.Argument(..., help="Deployment name"),
source_url: str = typer.Argument(..., help="URL or path to ingest from"),
source_type: SourceType = typer.Option(SourceType.web, "--type", "-t", help="Type of source"),
storage: StorageBackend = typer.Option(
StorageBackend.weaviate, "--storage", "-s", help="Storage backend"
),
cron: str | None = typer.Option(None, "--cron", "-c", help="Cron expression for scheduling"),
interval: int = typer.Option(60, "--interval", "-i", help="Interval in minutes"),
serve_now: bool = typer.Option(False, "--serve/--no-serve", help="Start serving immediately"),
) -> None:
"""
Create a scheduled deployment for recurring ingestion.
"""
console.print(f"[bold blue]Creating deployment: {name}[/bold blue]")
deployment = create_scheduled_deployment(
name=name,
source_url=source_url,
source_type=source_type.value,
storage_backend=storage.value,
schedule_type="cron" if cron else "interval",
cron_expression=cron,
interval_minutes=interval,
)
console.print(f"[green]✓ Deployment '{name}' created[/green]")
if serve_now:
console.print("[yellow]Starting deployment server...[/yellow]")
serve_deployments([deployment])
@app.command()
def serve(
config_file: str | None = typer.Option(
None, "--config", "-c", help="Path to deployments config file"
),
ui: str | None = typer.Option(
None, "--ui", help="Launch user interface (options: tui, web)"
),
) -> None:
"""
🚀 Serve configured deployments with optional UI interface.
Launch the deployment server to run scheduled ingestion jobs,
optionally with a modern Terminal User Interface (TUI) or web interface.
"""
# Handle UI mode first
if ui == "tui":
console.print(
Panel(
"[bold cyan]🚀 Launching Enhanced TUI[/bold cyan]\n\n"
"[yellow]Features:[/yellow]\n"
"• 📊 Interactive collection management\n"
"• ⌨️ Enhanced keyboard navigation\n"
"• 🎨 Modern design with focus indicators\n"
"• 📄 Document browsing and search\n"
"• 🔄 Real-time status updates",
title="🎉 TUI Mode",
border_style="cyan"
)
)
from .tui import dashboard
dashboard()
return
elif ui == "web":
console.print("[red]Web UI not yet implemented. Use --ui tui for Terminal UI.[/red]")
return
elif ui:
console.print(f"[red]Unknown UI option: {ui}[/red]")
console.print("[yellow]Available options: tui, web[/yellow]")
return
# Normal deployment server mode
if config_file:
# Config-file loading is not implemented yet; a real implementation would parse YAML/JSON here
console.print(f"[yellow]Loading deployments from {config_file}[/yellow]")
deployments = []
else:
# Create example deployments
deployments = [
create_scheduled_deployment(
name="docs-daily",
source_url="https://docs.example.com",
source_type="documentation",
storage_backend="weaviate",
schedule_type="cron",
cron_expression="0 2 * * *", # Daily at 2 AM
),
create_scheduled_deployment(
name="repo-hourly",
source_url="https://github.com/example/repo",
source_type="repository",
storage_backend="open_webui",
schedule_type="interval",
interval_minutes=60,
),
]
console.print(
"[bold green]Starting deployment server with example deployments[/bold green]"
)
serve_deployments(deployments)
@app.command()
def tui() -> None:
"""
🚀 Launch the enhanced Terminal User Interface.
Quick shortcut for 'serve --ui tui' with modern keyboard navigation,
interactive collection management, and real-time status updates.
"""
console.print(
Panel(
"[bold cyan]🚀 Launching Enhanced TUI[/bold cyan]\n\n"
"[yellow]Features:[/yellow]\n"
"• 📊 Interactive collection management\n"
"• ⌨️ Enhanced keyboard navigation\n"
"• 🎨 Modern design with focus indicators\n"
"• 📄 Document browsing and search\n"
"• 🔄 Real-time status updates",
title="🎉 TUI Mode",
border_style="cyan"
)
)
from .tui import dashboard
dashboard()
@app.command()
def config() -> None:
"""
📋 Display current configuration with enhanced formatting.
Shows all configured endpoints, models, and settings in a beautiful
table format with status indicators.
"""
settings = get_settings()
console.print(
Panel(
"[bold cyan]⚙️ System Configuration[/bold cyan]\n"
"[dim]Current pipeline settings and endpoints[/dim]",
title="🔧 Configuration",
border_style="cyan"
)
)
# Enhanced configuration table
table = Table(
title="📊 Configuration Details",
title_style="bold magenta",
border_style="blue",
header_style="bold cyan",
show_lines=True
)
table.add_column("🏷️ Setting", style="cyan", no_wrap=True, width=25)
table.add_column("🎯 Value", style="yellow", overflow="fold")
table.add_column("📊 Status", style="green", width=12, justify="center")
# Add configuration rows with status indicators
def get_status_indicator(value: str | None) -> str:
return "✅ Set" if value else "❌ Missing"
table.add_row(
"🤖 LLM Endpoint",
str(settings.llm_endpoint),
"✅ Active"
)
table.add_row(
"🔥 Firecrawl Endpoint",
str(settings.firecrawl_endpoint),
"✅ Active"
)
table.add_row(
"🗄️ Weaviate Endpoint",
str(settings.weaviate_endpoint),
get_status_indicator(str(settings.weaviate_api_key) if settings.weaviate_api_key else None)
)
table.add_row(
"🌐 OpenWebUI Endpoint",
str(settings.openwebui_endpoint),
get_status_indicator(settings.openwebui_api_key)
)
table.add_row(
"🧠 Embedding Model",
settings.embedding_model,
"✅ Set"
)
table.add_row(
"💾 Default Storage",
settings.default_storage_backend.title(),
"✅ Set"
)
table.add_row(
"📦 Default Batch Size",
f"{settings.default_batch_size:,}",
"✅ Set"
)
table.add_row(
"⚡ Max Concurrent Tasks",
f"{settings.max_concurrent_tasks}",
"✅ Set"
)
console.print(table)
# Additional helpful information
console.print(
Panel(
"💡 [bold cyan]Quick Tips[/bold cyan]\n\n"
"• Use '[bold]ingest list-collections[/bold]' to view all collections\n"
"• Use '[bold]ingest search[/bold]' to search content\n"
"• Configure API keys in your [yellow].env[/yellow] file\n"
"• Default collection names are auto-generated from URLs",
title="🚀 Usage Tips",
border_style="green"
)
)
@app.command()
def list_collections() -> None:
"""
📋 List all collections across storage backends.
"""
console.print("[bold cyan]📚 Collection Overview[/bold cyan]")
asyncio.run(run_list_collections())
@app.command()
def search(
query: str = typer.Argument(..., help="Search query"),
collection: str = typer.Option(None, "--collection", "-c", help="Target collection"),
backend: StorageBackend = typer.Option(StorageBackend.weaviate, "--backend", "-b", help="Storage backend"),
limit: int = typer.Option(10, "--limit", "-l", help="Result limit"),
) -> None:
"""
🔍 Search across collections.
"""
console.print(f"[bold cyan]🔍 Searching for: {query}[/bold cyan]")
asyncio.run(run_search(query, collection, backend.value, limit))
async def run_ingestion(
url: str,
source_type: str,
storage_backend: str,
collection_name: str | None = None,
validate_first: bool = True
) -> IngestionResult:
"""
Run ingestion with support for targeted collections.
"""
# Auto-generate collection name if not provided
if not collection_name:
from urllib.parse import urlparse
parsed = urlparse(url)
domain = parsed.netloc.replace(".", "_").replace("-", "_")
collection_name = f"{domain}_{source_type}"
result = await create_ingestion_flow(
source_url=url,
source_type=source_type,
storage_backend=storage_backend,
collection_name=collection_name,
validate_first=validate_first,
)
return result
async def run_list_collections() -> None:
"""
List collections across storage backends.
"""
from ..config import get_settings
from ..core.models import StorageBackend, StorageConfig
from ..storage.openwebui import OpenWebUIStorage
from ..storage.weaviate import WeaviateStorage
settings = get_settings()
console.print("🔍 [bold cyan]Scanning storage backends...[/bold cyan]")
# Try to connect to Weaviate
weaviate_collections = []
try:
weaviate_config = StorageConfig(
backend=StorageBackend.WEAVIATE,
endpoint=settings.weaviate_endpoint,
api_key=settings.weaviate_api_key,
collection_name="default",
)
weaviate = WeaviateStorage(weaviate_config)
await weaviate.initialize()
collections_list = weaviate.client.collections.list_all() if weaviate.client else []
for collection in collections_list:
collection_obj = weaviate.client.collections.get(collection) if weaviate.client else None
if collection_obj:
count = collection_obj.aggregate.over_all(total_count=True).total_count or 0
weaviate_collections.append((collection, count))
except Exception as e:
console.print(f"❌ [red]Weaviate connection failed: {e}[/red]")
# Try to connect to OpenWebUI
openwebui_collections = []
try:
openwebui_config = StorageConfig(
backend=StorageBackend.OPEN_WEBUI,
endpoint=settings.openwebui_endpoint,
api_key=settings.openwebui_api_key,
collection_name="default",
)
openwebui = OpenWebUIStorage(openwebui_config)
await openwebui.initialize()
response = await openwebui.client.get("/api/v1/knowledge/")
response.raise_for_status()
knowledge_bases = response.json()
for kb in knowledge_bases:
name = kb.get("name", "Unknown")
file_count = len(kb.get("files", []))
openwebui_collections.append((name, file_count))
except Exception as e:
console.print(f"❌ [red]OpenWebUI connection failed: {e}[/red]")
# Display results
if weaviate_collections or openwebui_collections:
# Create results table
from rich.table import Table
table = Table(
title="📚 Collection Overview",
title_style="bold magenta",
border_style="cyan",
header_style="bold blue"
)
table.add_column("🏷️ Collection", style="cyan", no_wrap=True)
table.add_column("📊 Backend", style="yellow")
table.add_column("📄 Documents", style="green", justify="right")
# Add Weaviate collections
for name, count in weaviate_collections:
table.add_row(name, "🗄️ Weaviate", f"{count:,}")
# Add OpenWebUI collections
for name, count in openwebui_collections:
table.add_row(name, "🌐 OpenWebUI", f"{count:,}")
console.print(table)
else:
console.print("❌ [yellow]No collections found in any backend[/yellow]")
async def run_search(query: str, collection: str | None, backend: str, limit: int) -> None:
"""
Search across collections.
"""
from ..config import get_settings
from ..core.models import StorageBackend, StorageConfig
from ..storage.weaviate import WeaviateStorage
settings = get_settings()
console.print(f"🔍 Searching for: '[bold cyan]{query}[/bold cyan]'")
if collection:
console.print(f"📚 Target collection: [yellow]{collection}[/yellow]")
console.print(f"💾 Backend: [blue]{backend}[/blue]")
results = []
try:
if backend == "weaviate":
weaviate_config = StorageConfig(
backend=StorageBackend.WEAVIATE,
endpoint=settings.weaviate_endpoint,
api_key=settings.weaviate_api_key,
collection_name=collection or "default",
)
weaviate = WeaviateStorage(weaviate_config)
await weaviate.initialize()
results_generator = weaviate.search(query, limit=limit)
async for doc in results_generator:
results.append({
"title": getattr(doc, "title", "Untitled"),
"content": getattr(doc, "content", ""),
"score": getattr(doc, "score", 0.0),
"backend": "🗄️ Weaviate"
})
elif backend == "open_webui":
console.print("❌ [red]OpenWebUI search not yet implemented[/red]")
return
except Exception as e:
console.print(f"❌ [red]Search failed: {e}[/red]")
return
# Display results
if results:
from rich.table import Table
table = Table(
title=f"🔍 Search Results for '{query}'",
title_style="bold magenta",
border_style="green",
header_style="bold blue"
)
table.add_column("📄 Title", style="cyan", max_width=40)
table.add_column("📝 Preview", style="white", max_width=60)
table.add_column("📊 Score", style="yellow", justify="right")
for result in results[:limit]:
title = str(result["title"])
title_display = title[:40] + "..." if len(title) > 40 else title
content = str(result["content"])
content_display = content[:60] + "..." if len(content) > 60 else content
score = f"{result['score']:.3f}"
table.add_row(title_display, content_display, score)
console.print(table)
console.print(f"\n✅ [green]Found {len(results)} results[/green]")
else:
console.print("❌ [yellow]No results found[/yellow]")
if __name__ == "__main__":
app()


@@ -0,0 +1,13 @@
"""Enhanced TUI package with keyboard navigation and modular architecture."""
from .app import CollectionManagementApp
from .models import CollectionInfo, DocumentInfo
from .utils import dashboard, run_textual_tui
__all__ = [
"CollectionManagementApp",
"CollectionInfo",
"DocumentInfo",
"dashboard",
"run_textual_tui",
]


@@ -0,0 +1,181 @@
"""Main TUI application with enhanced keyboard navigation."""
from textual import events
from textual.app import App
from textual.binding import Binding
from ...storage.openwebui import OpenWebUIStorage
from ...storage.weaviate import WeaviateStorage
from .screens import CollectionOverviewScreen, HelpScreen
from .styles import TUI_CSS
class CollectionManagementApp(App[None]):
"""Enhanced modern Textual application with comprehensive keyboard navigation."""
CSS = TUI_CSS
BINDINGS = [
Binding("q", "quit", "Quit"),
Binding("ctrl+c", "quit", "Quit"),
Binding("ctrl+q", "quit", "Quit"),
Binding("f1", "help", "Help"),
Binding("ctrl+h", "help", "Help"),
Binding("?", "help", "Quick Help"),
# Global navigation shortcuts
Binding("ctrl+r", "refresh_current", "Refresh Current Screen"),
Binding("ctrl+w", "close_current", "Close Current Screen"),
# Tab navigation shortcuts
Binding("ctrl+1", "dashboard_tab", "Dashboard", show=False),
Binding("ctrl+2", "collections_tab", "Collections", show=False),
Binding("ctrl+3", "analytics_tab", "Analytics", show=False),
]
weaviate: WeaviateStorage | None
openwebui: OpenWebUIStorage | None
def __init__(
self, weaviate: WeaviateStorage | None = None, openwebui: OpenWebUIStorage | None = None
):
super().__init__()
self.weaviate = weaviate
self.openwebui = openwebui
def on_mount(self) -> None:
"""Initialize the enhanced app with better branding."""
self.title = "🚀 Enhanced Collection Management System"
self.sub_title = "Advanced Document Ingestion & Management Platform with Keyboard Navigation"
self.push_screen(CollectionOverviewScreen(self.weaviate, self.openwebui))
def action_help(self) -> None:
"""Show comprehensive help information with all keyboard shortcuts."""
help_md = """
# 🚀 Enhanced Collection Management System
## 🎯 Global Navigation
- **F1** / **Ctrl+H** / **?**: Show this help
- **Q** / **Ctrl+C** / **Ctrl+Q**: Quit application
- **Ctrl+R**: Refresh current screen
- **Ctrl+W**: Close current screen/dialog
- **Escape**: Go back/cancel current action
## 📑 Tab Navigation
- **Tab** / **Shift+Tab**: Switch between tabs
- **Ctrl+1**: Jump to Dashboard tab
- **Ctrl+2**: Jump to Collections tab
- **Ctrl+3**: Jump to Analytics tab
## 📚 Collections Management
- **R**: Refresh collections list
- **I**: Start new ingestion
- **M**: Manage documents in selected collection
- **S**: Search within selected collection
- **Ctrl+D**: Delete selected collection
## 🗂️ Table Navigation
- **Arrow Keys** / **J/K/H/L**: Navigate table cells (Vi-style)
- **Home** / **End**: Jump to first/last row
- **Page Up** / **Page Down**: Scroll by page
- **Enter**: Select/activate current row
- **Space**: Toggle row selection
- **Ctrl+A**: Select all items
- **Ctrl+Shift+A**: Clear all selections
## 📄 Document Management
- **Space**: Toggle document selection
- **Delete** / **Ctrl+D**: Delete selected documents
- **A**: Select all documents on page
- **N**: Clear selection
- **Page Up/Down**: Navigate between pages
- **Home/End**: Go to first/last page
## 🔍 Search Features
- **/** : Quick search (focus search field)
- **Ctrl+F**: Focus search input
- **Enter**: Perform search
- **F3**: Repeat last search
- **Ctrl+R**: Clear search results
- **Escape**: Clear search/exit search mode
## 📥 Ingestion Interface
- **1/2/3**: Select ingestion type (Web/Repository/Documentation)
- **Tab/Shift+Tab**: Navigate between fields
- **Enter**: Start ingestion process
- **Ctrl+I**: Quick start ingestion
- **Escape**: Cancel ingestion
## 🎨 Visual Features
- Enhanced focus indicators with colored borders
- Smooth keyboard navigation with visual feedback
- Status indicators with real-time updates
- Progress bars with detailed status messages
- Responsive design with accessibility features
## 💡 Pro Tips
- Use **Vi-style** navigation (J/K/H/L) for efficient movement
- **Tab** through interactive elements for keyboard-only operation
- Hold **Shift** with arrow keys for range selection (where supported)
- Use **Ctrl+** shortcuts for power user efficiency
- **Escape** is your friend - it cancels most operations safely
## 🚀 Performance Features
- Lazy loading for large collections
- Paginated document views
- Background refresh operations
- Efficient memory management
- Responsive UI updates
---
**Enjoy the enhanced keyboard-driven interface!** 🎉
*Press Escape, Enter, or Q to close this help.*
"""
self.push_screen(HelpScreen(help_md))
def action_refresh_current(self) -> None:
"""Refresh the current screen if it supports it."""
current_screen = self.screen
if hasattr(current_screen, "action_refresh"):
current_screen.action_refresh()
else:
self.notify("Current screen doesn't support refresh", severity="information")
def action_close_current(self) -> None:
"""Close current screen/dialog."""
if len(self.screen_stack) > 1: # Don't close the main screen
self.pop_screen()
else:
self.notify("Cannot close main screen. Use Q to quit.", severity="warning")
def action_dashboard_tab(self) -> None:
"""Switch to dashboard tab in current screen."""
current_screen = self.screen
if hasattr(current_screen, "action_tab_dashboard"):
current_screen.action_tab_dashboard()
def action_collections_tab(self) -> None:
"""Switch to collections tab in current screen."""
current_screen = self.screen
if hasattr(current_screen, "action_tab_collections"):
current_screen.action_tab_collections()
def action_analytics_tab(self) -> None:
"""Switch to analytics tab in current screen."""
current_screen = self.screen
if hasattr(current_screen, "action_tab_analytics"):
current_screen.action_tab_analytics()
def on_key(self, event: events.Key) -> None:
"""Handle global keyboard shortcuts."""
# Handle global shortcuts that might not be bound to specific actions
if event.key == "ctrl+shift+?":
# Alternative help shortcut
self.action_help()
event.prevent_default()
elif event.key == "ctrl+alt+r":
# Force refresh all connections
self.notify("🔄 Refreshing all connections...", severity="information")
# This could trigger a full reinit if needed
event.prevent_default()
# No else clause needed - just handle our events


@@ -0,0 +1,26 @@
"""Data models and TypedDict definitions for the TUI."""
from typing import TypedDict
class CollectionInfo(TypedDict):
"""Information about a collection."""
name: str
type: str
count: int
backend: str
status: str
last_updated: str
size_mb: float
class DocumentInfo(TypedDict):
"""Information about a document."""
id: str
title: str
source_url: str
content_preview: str
word_count: int
timestamp: str


@@ -0,0 +1,18 @@
"""Screen components for the TUI application."""
from .dashboard import CollectionOverviewScreen
from .dialogs import ConfirmDeleteScreen, ConfirmDocumentDeleteScreen
from .documents import DocumentManagementScreen
from .help import HelpScreen
from .ingestion import IngestionScreen
from .search import SearchScreen
__all__ = [
"CollectionOverviewScreen",
"IngestionScreen",
"SearchScreen",
"DocumentManagementScreen",
"ConfirmDeleteScreen",
"ConfirmDocumentDeleteScreen",
"HelpScreen",
]


@@ -0,0 +1,542 @@
"""Main dashboard screen with collections overview."""
from datetime import datetime
from textual import work
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Grid, Horizontal
from textual.reactive import reactive, var
from textual.screen import Screen
from textual.widgets import (
Button,
Footer,
Header,
LoadingIndicator,
Rule,
Static,
TabbedContent,
TabPane,
)
from typing_extensions import override
from ....storage.openwebui import OpenWebUIStorage
from ....storage.weaviate import WeaviateStorage
from ..models import CollectionInfo
from ..widgets import EnhancedDataTable, MetricsCard, StatusIndicator
class CollectionOverviewScreen(Screen[None]):
"""Enhanced dashboard with modern design and metrics."""
total_documents: int = 0
total_collections: int = 0
active_backends: int = 0
BINDINGS = [
Binding("q", "quit", "Quit"),
Binding("r", "refresh", "Refresh"),
Binding("i", "ingest", "Ingest"),
Binding("m", "manage", "Manage"),
Binding("s", "search", "Search"),
Binding("ctrl+d", "delete", "Delete"),
Binding("ctrl+1", "tab_dashboard", "Dashboard"),
Binding("ctrl+2", "tab_collections", "Collections"),
Binding("ctrl+3", "tab_analytics", "Analytics"),
Binding("tab", "next_tab", "Next Tab"),
Binding("shift+tab", "prev_tab", "Prev Tab"),
Binding("f1", "help", "Help"),
]
collections: var[list[CollectionInfo]] = var([])
is_loading: var[bool] = var(False)
selected_collection: reactive[CollectionInfo | None] = reactive(None)
def __init__(self, weaviate: WeaviateStorage | None, openwebui: OpenWebUIStorage | None):
super().__init__()
self.weaviate = weaviate
self.openwebui = openwebui
self.total_documents = 0
self.total_collections = 0
self.active_backends = 0
@override
def compose(self) -> ComposeResult:
yield Header(show_clock=True)
with TabbedContent(initial="dashboard"):
# Dashboard Tab
with TabPane("Dashboard", id="dashboard"):
yield Container(
Static("🚀 Collection Management System", classes="title"),
Static("Modern document ingestion and management platform", classes="subtitle"),
Rule(line_style="heavy"),
# Metrics Grid
Grid(
MetricsCard(
"Collections", str(self.total_collections), "Active collections"
),
MetricsCard("Documents", str(self.total_documents), "Total indexed"),
MetricsCard("Backends", str(self.active_backends), "Connected services"),
MetricsCard("Status", "Online", "System health"),
classes="responsive-grid metrics-grid",
),
Rule(line_style="dashed"),
# Quick Actions
Container(
Static("⚡ Quick Actions", classes="section-title"),
Horizontal(
Button("🔄 Refresh Data", id="quick_refresh", variant="primary"),
Button("📥 New Ingestion", id="quick_ingest", variant="success"),
Button("🔍 Search All", id="quick_search", variant="default"),
Button("⚙️ Settings", id="quick_settings", variant="default"),
classes="action_buttons",
),
classes="card",
),
# Recent Activity
Container(
Static("📊 Recent Activity", classes="section-title"),
Static(
"Loading recent activity...", id="activity_feed", classes="status-text"
),
classes="card",
),
classes="main_container",
)
# Collections Tab
with TabPane("Collections", id="collections"):
yield Container(
Static("📚 Collection Overview", classes="title"),
# Collection controls
Horizontal(
Button("🔄 Refresh", id="refresh_btn", variant="primary"),
Button("📥 Ingest", id="ingest_btn", variant="success"),
Button("🔧 Manage", id="manage_btn", variant="warning"),
Button("🗑️ Delete", id="delete_btn", variant="error"),
Button("🔍 Search", id="search_btn", variant="default"),
classes="button_bar",
),
# Collection table with enhanced navigation
EnhancedDataTable(id="collections_table", classes="enhanced-table"),
# Status bar
Container(
Static("Ready", id="status_text", classes="status-text"),
StatusIndicator("Ready", id="connection_status"),
classes="status-bar",
),
LoadingIndicator(id="loading", classes="pulse"),
classes="main_container",
)
# Analytics Tab
with TabPane("Analytics", id="analytics"):
yield Container(
Static("📈 Analytics & Insights", classes="title"),
# Analytics content
Container(
Static("🚧 Analytics Dashboard", classes="section-title"),
Static("Advanced analytics and insights coming soon!", classes="subtitle"),
# Placeholder charts area
Container(
Static("📊 Document Distribution", classes="chart-title"),
Static(
"Chart placeholder - integrate with visualization library",
classes="chart-placeholder",
),
classes="card",
),
Container(
Static("⏱️ Ingestion Timeline", classes="chart-title"),
Static("Timeline chart placeholder", classes="chart-placeholder"),
classes="card",
),
classes="analytics-grid",
),
classes="main_container",
)
yield Footer()
async def on_mount(self) -> None:
"""Initialize the screen with enhanced loading."""
self.query_one("#loading").display = False
self.update_metrics()
self.refresh_collections() # Don't await, let it run as a worker
def update_metrics(self) -> None:
"""Update dashboard metrics with enhanced calculations."""
self.total_collections = len(self.collections)
self.total_documents = sum(col["count"] for col in self.collections)
# Count active backends
self.active_backends = 0
if self.weaviate:
self.active_backends += 1
if self.openwebui:
self.active_backends += 1
# Update metrics cards if they exist
try:
dashboard_tab = self.query_one("#dashboard")
metrics_cards = dashboard_tab.query(MetricsCard)
if len(metrics_cards) >= 4:
# Update existing cards with formatted values
metrics_cards[0].query_one(".metrics-value", Static).update(
f"{self.total_collections:,}"
)
metrics_cards[1].query_one(".metrics-value", Static).update(
f"{self.total_documents:,}"
)
metrics_cards[2].query_one(".metrics-value", Static).update(
str(self.active_backends)
)
# Update status card based on system health
if self.active_backends > 0 and self.total_collections > 0:
status_text = "🟢 Healthy"
status_class = "status-active"
elif self.active_backends > 0:
status_text = "🟡 Ready"
status_class = "status-warning"
else:
status_text = "🔴 Offline"
status_class = "status-error"
metrics_cards[3].query_one(".metrics-value", Static).update(status_text)
metrics_cards[3].add_class(status_class)
except Exception:
pass # Cards might not be rendered yet
# Update activity feed with real data
try:
dashboard_tab = self.query_one("#dashboard")
activity_feed = dashboard_tab.query_one("#activity_feed", Static)
if self.collections:
recent_activity = []
for col in self.collections[:3]: # Show top 3 collections
recent_activity.append(
f"📚 {col['name']}: {col['count']:,} docs ({col.get('size_mb', 0):.1f} MB)"
)
                activity_text = "\n".join(recent_activity)
                if len(self.collections) > 3:
                    activity_text += f"\n... and {len(self.collections) - 3} more collections"
else:
activity_text = "No collections found. Start by creating your first ingestion!"
activity_feed.update(activity_text)
except Exception:
pass
@work(exclusive=True)
async def refresh_collections(self) -> None:
"""Refresh collection data with enhanced loading feedback."""
self.is_loading = True
loading_indicator = self.query_one("#loading")
status_text = self.query_one("#status_text", Static)
loading_indicator.display = True
status_text.update("🔄 Refreshing collections...")
try:
collections = []
# Get Weaviate collections
if self.weaviate:
try:
status_text.update("🔗 Connecting to Weaviate...")
await self.weaviate.initialize()
weaviate_collections = await self.list_weaviate_collections()
collections.extend(weaviate_collections)
status_text.update("✅ Weaviate collections loaded")
except Exception as e:
self.notify(f"❌ Weaviate error: {e}", severity="error")
status_text.update("❌ Weaviate connection failed")
# Get OpenWebUI collections
if self.openwebui:
try:
status_text.update("🔗 Connecting to OpenWebUI...")
await self.openwebui.initialize()
openwebui_collections = await self.list_openwebui_collections()
collections.extend(openwebui_collections)
status_text.update("✅ OpenWebUI collections loaded")
except Exception as e:
self.notify(f"❌ OpenWebUI error: {e}", severity="error")
status_text.update("❌ OpenWebUI connection failed")
self.collections = collections
await self.update_collections_table()
self.update_metrics()
status_text.update(f"✨ Ready - {len(collections)} collections loaded")
# Update connection status
connection_status = self.query_one("#connection_status", StatusIndicator)
if collections:
connection_status.update_status("✓ Active")
else:
connection_status.update_status("No Data")
except Exception as e:
status_text.update(f"❌ Error: {e}")
self.notify(f"Failed to refresh: {e}", severity="error")
finally:
self.is_loading = False
loading_indicator.display = False
async def list_weaviate_collections(self) -> list[CollectionInfo]:
"""List Weaviate collections with enhanced metadata."""
if not self.weaviate:
return []
try:
collections = []
collections_list = (
self.weaviate.client.collections.list_all()
if self.weaviate and self.weaviate.client
else []
)
for collection in collections_list:
collection_obj = (
self.weaviate.client.collections.get(collection)
if self.weaviate and self.weaviate.client
else None
)
if not collection_obj:
continue
count = collection_obj.aggregate.over_all(total_count=True).total_count or 0
# Estimate size
size_mb = count * 0.01 # Rough estimate
collection_info = CollectionInfo(
name=collection,
type="weaviate",
count=count,
backend="🗄️ Weaviate",
status="✓ Active",
last_updated=datetime.now().strftime("%Y-%m-%d %H:%M"),
size_mb=size_mb,
)
collections.append(collection_info)
return collections
except Exception as e:
self.notify(f"Error listing Weaviate collections: {e}", severity="error")
return []
async def list_openwebui_collections(self) -> list[CollectionInfo]:
"""List OpenWebUI collections with enhanced metadata."""
if not self.openwebui:
return []
try:
response = await self.openwebui.client.get("/api/v1/knowledge/")
response.raise_for_status()
knowledge_bases = response.json()
collections = []
for kb in knowledge_bases:
file_count = len(kb.get("files", []))
size_mb = file_count * 0.5 # Rough estimate
collection_info = CollectionInfo(
name=kb.get("name", "Unknown"),
type="openwebui",
count=file_count,
backend="🌐 OpenWebUI",
status="✓ Active",
last_updated=kb.get("updated_at", datetime.now().strftime("%Y-%m-%d %H:%M")),
size_mb=size_mb,
)
collections.append(collection_info)
return collections
except Exception as e:
self.notify(f"Error listing OpenWebUI collections: {e}", severity="error")
return []
async def update_collections_table(self) -> None:
"""Update the collections table with enhanced formatting."""
table = self.query_one("#collections_table", EnhancedDataTable)
table.clear()
# Add enhanced columns
table.add_columns("Collection", "Backend", "Documents", "Size", "Status", "Updated")
# Add rows with enhanced formatting
for collection in self.collections:
# Format size
size_str = f"{collection['size_mb']:.1f} MB"
if collection["size_mb"] > 1000:
size_str = f"{collection['size_mb'] / 1000:.1f} GB"
# Format document count
doc_count = f"{collection['count']:,}"
table.add_row(
collection["name"],
collection["backend"],
doc_count,
size_str,
collection["status"],
collection["last_updated"],
)
def get_selected_collection(self) -> CollectionInfo | None:
"""Get the currently selected collection."""
table = self.query_one("#collections_table", EnhancedDataTable)
try:
if table.cursor_coordinate.row < len(self.collections):
return self.collections[table.cursor_coordinate.row]
except (AttributeError, IndexError):
pass
return None
# Action methods
def action_refresh(self) -> None:
"""Refresh collections."""
self.refresh_collections()
def action_ingest(self) -> None:
"""Show enhanced ingestion dialog."""
selected = self.get_selected_collection()
if selected:
from .ingestion import IngestionScreen
self.app.push_screen(IngestionScreen(selected))
else:
self.notify("🔍 Please select a collection first", severity="warning")
def action_manage(self) -> None:
"""Manage documents in selected collection."""
selected = self.get_selected_collection()
if selected:
if selected["type"] == "weaviate":
from .documents import DocumentManagementScreen
self.app.push_screen(DocumentManagementScreen(selected, self.weaviate))
else:
self.notify(
"🚧 Document management only available for Weaviate", severity="warning"
)
else:
self.notify("🔍 Please select a collection first", severity="warning")
def action_search(self) -> None:
"""Search in selected collection."""
selected = self.get_selected_collection()
if selected:
from .search import SearchScreen
self.app.push_screen(SearchScreen(selected, self.weaviate, self.openwebui))
else:
self.notify("🔍 Please select a collection first", severity="warning")
def action_delete(self) -> None:
"""Delete selected collection."""
selected = self.get_selected_collection()
if selected:
from .dialogs import ConfirmDeleteScreen
self.app.push_screen(ConfirmDeleteScreen(selected, self))
else:
self.notify("🔍 Please select a collection first", severity="warning")
def action_tab_dashboard(self) -> None:
"""Switch to dashboard tab."""
tabs = self.query_one(TabbedContent)
tabs.active = "dashboard"
def action_tab_collections(self) -> None:
"""Switch to collections tab."""
tabs = self.query_one(TabbedContent)
tabs.active = "collections"
def action_tab_analytics(self) -> None:
"""Switch to analytics tab."""
tabs = self.query_one(TabbedContent)
tabs.active = "analytics"
def action_next_tab(self) -> None:
"""Switch to next tab."""
tabs = self.query_one(TabbedContent)
tab_ids = ["dashboard", "collections", "analytics"]
current = tabs.active
try:
current_index = tab_ids.index(current)
next_index = (current_index + 1) % len(tab_ids)
tabs.active = tab_ids[next_index]
except (ValueError, AttributeError):
tabs.active = tab_ids[0]
def action_prev_tab(self) -> None:
"""Switch to previous tab."""
tabs = self.query_one(TabbedContent)
tab_ids = ["dashboard", "collections", "analytics"]
current = tabs.active
try:
current_index = tab_ids.index(current)
prev_index = (current_index - 1) % len(tab_ids)
tabs.active = tab_ids[prev_index]
except (ValueError, AttributeError):
tabs.active = tab_ids[0]
def action_help(self) -> None:
"""Show help screen."""
from .help import HelpScreen
help_md = """
# 🚀 Modern Collection Management System
## Navigation
- **Tab** / **Shift+Tab**: Switch between tabs
- **Ctrl+1/2/3**: Direct tab access
- **Enter**: Activate selected item
- **Escape**: Go back/cancel
- **Arrow Keys**: Navigate within tables
- **Home/End**: Jump to first/last row
- **Page Up/Down**: Scroll by page
## Collections
- **R**: Refresh collections
- **I**: Start ingestion
- **M**: Manage documents
- **S**: Search collection
- **Ctrl+D**: Delete collection
## Table Navigation
- **Up/Down** or **J/K**: Navigate rows
- **Space**: Toggle selection
- **Ctrl+A**: Select all
- **Ctrl+Shift+A**: Clear selection
## General
- **Q** / **Ctrl+C**: Quit application
- **F1**: Show this help
Enjoy the enhanced interface! 🎉
"""
self.app.push_screen(HelpScreen(help_md))
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses with enhanced feedback."""
button_id = event.button.id
# Add visual feedback
event.button.add_class("pressed")
self.call_later(self.remove_pressed_class, event.button)
        if button_id in ("refresh_btn", "quick_refresh"):
            self.action_refresh()
        elif button_id in ("ingest_btn", "quick_ingest"):
            self.action_ingest()
        elif button_id == "manage_btn":
            self.action_manage()
        elif button_id == "delete_btn":
            self.action_delete()
        elif button_id in ("search_btn", "quick_search"):
            self.action_search()
        elif button_id == "quick_settings":
            self.notify("⚙️ Settings panel coming soon!", severity="information")
def remove_pressed_class(self, button: Button) -> None:
"""Remove pressed visual feedback class."""
button.remove_class("pressed")
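
`refresh_collections` relies on Textual's `@work(exclusive=True)`, so a second refresh cancels the one still in flight, which is why callers invoke it without `await`. A minimal, self-contained sketch of that pattern (all names here are illustrative, not from this module):

```python
import asyncio

from textual import work
from textual.app import App, ComposeResult
from textual.widgets import Static


class RefreshDemo(App[None]):
    BINDINGS = [("r", "refresh", "Refresh")]

    def compose(self) -> ComposeResult:
        yield Static("Press 'r' to refresh", id="status")

    def action_refresh(self) -> None:
        # Calling the decorated method schedules a worker; no await needed.
        self.refresh_data()

    @work(exclusive=True)
    async def refresh_data(self) -> None:
        status = self.query_one("#status", Static)
        status.update("Refreshing...")
        await asyncio.sleep(1.0)  # stand-in for slow backend calls
        status.update("Done")


if __name__ == "__main__":
    RefreshDemo().run()
```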

View File

@@ -0,0 +1,189 @@
"""Dialog screens for confirmations and user interactions."""
from typing import TYPE_CHECKING
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Horizontal
from textual.screen import Screen
from textual.widgets import Button, Footer, Header, LoadingIndicator, Static
from typing_extensions import override
from ..models import CollectionInfo
if TYPE_CHECKING:
from .dashboard import CollectionOverviewScreen
from .documents import DocumentManagementScreen
class ConfirmDeleteScreen(Screen[None]):
"""Screen for confirming collection deletion."""
collection: CollectionInfo
parent_screen: "CollectionOverviewScreen"
BINDINGS = [
Binding("escape", "app.pop_screen", "Cancel"),
Binding("y", "confirm_delete", "Yes"),
Binding("n", "app.pop_screen", "No"),
Binding("enter", "confirm_delete", "Confirm"),
]
def __init__(self, collection: CollectionInfo, parent_screen: "CollectionOverviewScreen"):
super().__init__()
self.collection = collection
self.parent_screen = parent_screen
@override
def compose(self) -> ComposeResult:
yield Header()
yield Container(
Static("⚠️ Confirm Deletion", classes="title warning"),
Static(f"Are you sure you want to delete collection '{self.collection['name']}'?"),
Static(f"Backend: {self.collection['backend']}"),
Static(f"Documents: {self.collection['count']:,}"),
Static("This action cannot be undone!", classes="warning"),
Static("Press Y to confirm, N or Escape to cancel", classes="subtitle"),
Horizontal(
Button("✅ Yes, Delete (Y)", id="yes_btn", variant="error"),
Button("❌ Cancel (N)", id="no_btn", variant="default"),
classes="action_buttons",
),
classes="main_container center",
)
yield Footer()
def on_mount(self) -> None:
"""Initialize the screen with focus on cancel button for safety."""
self.query_one("#no_btn").focus()
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "yes_btn":
self.action_confirm_delete()
elif event.button.id == "no_btn":
self.app.pop_screen()
def action_confirm_delete(self) -> None:
"""Confirm deletion."""
self.run_worker(self.delete_collection())
async def delete_collection(self) -> None:
"""Delete the collection."""
try:
if self.collection["type"] == "weaviate" and self.parent_screen.weaviate:
# Delete Weaviate collection
if self.parent_screen.weaviate and self.parent_screen.weaviate.client:
self.parent_screen.weaviate.client.collections.delete(self.collection["name"])
self.notify(
f"Deleted Weaviate collection: {self.collection['name']}",
severity="information",
)
elif self.collection["type"] == "openwebui" and self.parent_screen.openwebui:
# Delete OpenWebUI knowledge base
response = await self.parent_screen.openwebui.client.delete(
f"/api/v1/knowledge/{self.collection['name']}"
)
response.raise_for_status()
self.notify(
f"Deleted OpenWebUI collection: {self.collection['name']}",
severity="information",
)
# Refresh parent screen
self.parent_screen.refresh_collections() # Don't await, let it run as a worker
self.app.pop_screen()
except Exception as e:
self.notify(f"Failed to delete collection: {e}", severity="error")
class ConfirmDocumentDeleteScreen(Screen[None]):
"""Screen for confirming document deletion."""
doc_ids: list[str]
collection: CollectionInfo
parent_screen: "DocumentManagementScreen"
BINDINGS = [
Binding("escape", "app.pop_screen", "Cancel"),
Binding("y", "confirm_delete", "Yes"),
Binding("n", "app.pop_screen", "No"),
Binding("enter", "confirm_delete", "Confirm"),
]
def __init__(
self,
doc_ids: list[str],
collection: CollectionInfo,
parent_screen: "DocumentManagementScreen",
):
super().__init__()
self.doc_ids = doc_ids
self.collection = collection
self.parent_screen = parent_screen
@override
def compose(self) -> ComposeResult:
yield Header()
yield Container(
Static("⚠️ Confirm Document Deletion", classes="title warning"),
Static(
f"Are you sure you want to delete {len(self.doc_ids)} documents from '{self.collection['name']}'?"
),
Static("This action cannot be undone!", classes="warning"),
Static("Press Y to confirm, N or Escape to cancel", classes="subtitle"),
Horizontal(
Button("✅ Yes, Delete (Y)", id="yes_btn", variant="error"),
Button("❌ Cancel (N)", id="no_btn", variant="default"),
classes="action_buttons",
),
LoadingIndicator(id="loading"),
classes="main_container center",
)
yield Footer()
def on_mount(self) -> None:
"""Initialize the screen with focus on cancel button for safety."""
self.query_one("#loading").display = False
self.query_one("#no_btn").focus()
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "yes_btn":
self.action_confirm_delete()
elif event.button.id == "no_btn":
self.app.pop_screen()
def action_confirm_delete(self) -> None:
"""Confirm deletion."""
self.run_worker(self.delete_documents())
async def delete_documents(self) -> None:
"""Delete the selected documents."""
loading = self.query_one("#loading")
loading.display = True
try:
if self.parent_screen.weaviate:
# Delete documents
results = await self.parent_screen.weaviate.delete_documents(self.doc_ids)
# Count successful deletions
successful = sum(1 for success in results.values() if success)
failed = len(results) - successful
if successful > 0:
self.notify(f"Deleted {successful} documents", severity="information")
if failed > 0:
self.notify(f"Failed to delete {failed} documents", severity="error")
# Clear selection and refresh parent screen
self.parent_screen.selected_docs.clear()
await self.parent_screen.load_documents()
self.app.pop_screen()
except Exception as e:
self.notify(f"Failed to delete documents: {e}", severity="error")
finally:
loading.display = False
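
Both dialogs reach back into `parent_screen` to apply the deletion and refresh. Textual also supports handing a result back to the caller via `ModalScreen.dismiss()` and a `push_screen` callback; a hedged sketch of that alternative wiring, with illustrative names only:

```python
from textual.app import App, ComposeResult
from textual.screen import ModalScreen
from textual.widgets import Button


class ConfirmScreen(ModalScreen[bool]):
    def compose(self) -> ComposeResult:
        yield Button("Yes", id="yes", variant="error")
        yield Button("No", id="no")

    def on_button_pressed(self, event: Button.Pressed) -> None:
        self.dismiss(event.button.id == "yes")  # hand a bool back to the caller


class DemoApp(App[None]):
    def on_mount(self) -> None:
        self.push_screen(ConfirmScreen(), self.handle_confirm)

    def handle_confirm(self, confirmed: bool) -> None:
        self.notify(f"confirmed={confirmed}")


if __name__ == "__main__":
    DemoApp().run()
```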

View File

@@ -0,0 +1,279 @@
"""Document management screen with enhanced navigation."""
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Horizontal
from textual.screen import Screen
from textual.widgets import Button, Footer, Header, Label, LoadingIndicator, Static
from typing_extensions import override
from ....storage.weaviate import WeaviateStorage
from ..models import CollectionInfo, DocumentInfo
from ..widgets import EnhancedDataTable
class DocumentManagementScreen(Screen[None]):
"""Screen for managing documents within a collection with enhanced keyboard navigation."""
collection: CollectionInfo
weaviate: WeaviateStorage | None
documents: list[DocumentInfo]
selected_docs: set[str]
current_offset: int
page_size: int
BINDINGS = [
Binding("escape", "app.pop_screen", "Back"),
Binding("r", "refresh", "Refresh"),
Binding("delete", "delete_selected", "Delete Selected"),
Binding("a", "select_all", "Select All"),
Binding("ctrl+a", "select_all", "Select All"),
Binding("n", "select_none", "Clear Selection"),
Binding("ctrl+shift+a", "select_none", "Clear Selection"),
Binding("space", "toggle_selection", "Toggle Selection"),
Binding("ctrl+d", "delete_selected", "Delete Selected"),
Binding("pageup", "prev_page", "Previous Page"),
Binding("pagedown", "next_page", "Next Page"),
Binding("home", "first_page", "First Page"),
Binding("end", "last_page", "Last Page"),
]
def __init__(self, collection: CollectionInfo, weaviate: WeaviateStorage | None):
super().__init__()
self.collection = collection
self.weaviate = weaviate
self.documents: list[DocumentInfo] = []
self.selected_docs: set[str] = set()
self.current_offset = 0
self.page_size = 50
@override
def compose(self) -> ComposeResult:
yield Header()
yield Container(
Static(f"📄 Document Management: {self.collection['name']}", classes="title"),
Static(
f"Total Documents: {self.collection['count']:,} | Use Space to select, Delete to remove",
classes="subtitle"
),
Label(f"Page size: {self.page_size} documents"),
EnhancedDataTable(id="documents_table", classes="enhanced-table"),
Horizontal(
Button("🔄 Refresh", id="refresh_docs_btn", variant="primary"),
Button("🗑️ Delete Selected", id="delete_selected_btn", variant="error"),
Button("✅ Select All", id="select_all_btn", variant="default"),
Button("❌ Clear Selection", id="clear_selection_btn", variant="default"),
Button("⬅️ Previous Page", id="prev_page_btn", variant="default"),
Button("➡️ Next Page", id="next_page_btn", variant="default"),
classes="button_bar",
),
Label("", id="selection_status"),
Static("", id="page_info", classes="status-text"),
LoadingIndicator(id="loading"),
classes="main_container",
)
yield Footer()
async def on_mount(self) -> None:
"""Initialize the screen."""
self.query_one("#loading").display = False
# Setup documents table
table = self.query_one("#documents_table", EnhancedDataTable)
table.add_columns("", "Title", "Source URL", "Words", "ID")
        # Allow the table to take keyboard focus for row navigation
        table.can_focus = True
await self.load_documents()
async def load_documents(self) -> None:
"""Load documents from the collection."""
loading = self.query_one("#loading")
loading.display = True
try:
if self.weaviate:
# Set the collection name
self.weaviate.config.collection_name = self.collection["name"]
# Load documents with pagination
raw_docs = await self.weaviate.list_documents(
limit=self.page_size, offset=self.current_offset
)
# Cast to proper type with type checking
self.documents = [
DocumentInfo(
id=str(doc["id"]),
title=str(doc["title"]),
source_url=str(doc["source_url"]),
content_preview=str(doc["content_preview"]),
word_count=int(doc["word_count"])
if isinstance(doc["word_count"], (int, str))
and str(doc["word_count"]).isdigit()
else 0,
timestamp=str(doc["timestamp"]),
)
for doc in raw_docs
]
await self.update_table()
self.update_selection_status()
self.update_page_info()
except Exception as e:
self.notify(f"Error loading documents: {e}", severity="error")
finally:
loading.display = False
async def update_table(self) -> None:
"""Update the documents table."""
table = self.query_one("#documents_table", EnhancedDataTable)
table.clear()
# Re-add columns
table.add_columns("", "Title", "Source URL", "Words", "ID")
# Add rows
for doc in self.documents:
selected = "" if doc["id"] in self.selected_docs else ""
table.add_row(
selected,
doc.get("title", "Untitled")[:50],
doc.get("source_url", "")[:50],
str(doc.get("word_count", 0)),
doc["id"][:8] + "...", # Show truncated ID
)
def update_selection_status(self) -> None:
"""Update the selection status label."""
status_label = self.query_one("#selection_status", Label)
total_selected = len(self.selected_docs)
status_label.update(f"Selected: {total_selected} documents")
def update_page_info(self) -> None:
"""Update the page information."""
page_info = self.query_one("#page_info", Static)
total_docs = self.collection["count"]
start = self.current_offset + 1
end = min(self.current_offset + len(self.documents), total_docs)
page_num = (self.current_offset // self.page_size) + 1
total_pages = (total_docs + self.page_size - 1) // self.page_size
page_info.update(
f"Showing {start:,}-{end:,} of {total_docs:,} documents (Page {page_num} of {total_pages})"
)
def get_current_document(self) -> DocumentInfo | None:
"""Get the currently selected document."""
table = self.query_one("#documents_table", EnhancedDataTable)
try:
if 0 <= table.cursor_coordinate.row < len(self.documents):
return self.documents[table.cursor_coordinate.row]
except (AttributeError, IndexError):
pass
return None
# Action methods
def action_refresh(self) -> None:
"""Refresh the document list."""
self.run_worker(self.load_documents())
def action_toggle_selection(self) -> None:
"""Toggle selection of current row."""
doc = self.get_current_document()
if doc:
doc_id = doc["id"]
if doc_id in self.selected_docs:
self.selected_docs.remove(doc_id)
else:
self.selected_docs.add(doc_id)
self.run_worker(self.update_table())
self.update_selection_status()
def action_select_all(self) -> None:
"""Select all documents on current page."""
for doc in self.documents:
self.selected_docs.add(doc["id"])
self.run_worker(self.update_table())
self.update_selection_status()
def action_select_none(self) -> None:
"""Clear all selections."""
self.selected_docs.clear()
self.run_worker(self.update_table())
self.update_selection_status()
def action_delete_selected(self) -> None:
"""Delete selected documents."""
if self.selected_docs:
from .dialogs import ConfirmDocumentDeleteScreen
self.app.push_screen(
ConfirmDocumentDeleteScreen(list(self.selected_docs), self.collection, self)
)
else:
self.notify("No documents selected", severity="warning")
def action_next_page(self) -> None:
"""Go to next page."""
if self.current_offset + self.page_size < self.collection["count"]:
self.current_offset += self.page_size
self.run_worker(self.load_documents())
def action_prev_page(self) -> None:
"""Go to previous page."""
if self.current_offset >= self.page_size:
self.current_offset -= self.page_size
self.run_worker(self.load_documents())
def action_first_page(self) -> None:
"""Go to first page."""
if self.current_offset > 0:
self.current_offset = 0
self.run_worker(self.load_documents())
    def action_last_page(self) -> None:
        """Go to last page."""
        total_docs = self.collection["count"]
        if total_docs <= 0:
            return  # guard against a negative offset on an empty collection
        last_offset = ((total_docs - 1) // self.page_size) * self.page_size
        if self.current_offset != last_offset:
            self.current_offset = last_offset
            self.run_worker(self.load_documents())
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "refresh_docs_btn":
self.action_refresh()
elif event.button.id == "delete_selected_btn":
self.action_delete_selected()
elif event.button.id == "select_all_btn":
self.action_select_all()
elif event.button.id == "clear_selection_btn":
self.action_select_none()
elif event.button.id == "next_page_btn":
self.action_next_page()
elif event.button.id == "prev_page_btn":
self.action_prev_page()
def on_enhanced_data_table_row_toggled(self, event: EnhancedDataTable.RowToggled) -> None:
"""Handle row toggle from enhanced table."""
if 0 <= event.row_index < len(self.documents):
doc = self.documents[event.row_index]
doc_id = doc["id"]
if doc_id in self.selected_docs:
self.selected_docs.remove(doc_id)
else:
self.selected_docs.add(doc_id)
self.run_worker(self.update_table())
self.update_selection_status()
def on_enhanced_data_table_select_all(self, event: EnhancedDataTable.SelectAll) -> None:
"""Handle select all from enhanced table."""
self.action_select_all()
def on_enhanced_data_table_clear_selection(self, event: EnhancedDataTable.ClearSelection) -> None:
"""Handle clear selection from enhanced table."""
self.action_select_none()
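
The paging arithmetic above is the usual off-by-one minefield; here are the same formulas isolated as a plain function with a couple of sanity checks (this sketch assumes a full page of `page_size` rows is loaded, whereas `update_page_info` uses `len(self.documents)` for the visible end):

```python
def page_info(offset: int, page_size: int, total_docs: int) -> tuple[int, int, int, int]:
    start = offset + 1  # 1-based index of the first visible document
    end = min(offset + page_size, total_docs)
    page_num = (offset // page_size) + 1
    total_pages = (total_docs + page_size - 1) // page_size  # ceiling division
    return start, end, page_num, total_pages


assert page_info(0, 50, 120) == (1, 50, 1, 3)
assert page_info(100, 50, 120) == (101, 120, 3, 3)
# Offset of the last page, as computed in action_last_page (total_docs > 0):
assert ((120 - 1) // 50) * 50 == 100
```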

View File

@@ -0,0 +1,50 @@
"""Help screen with keyboard shortcuts and usage information."""
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, ScrollableContainer
from textual.screen import ModalScreen
from textual.widgets import Button, Markdown, Rule, Static
from typing_extensions import override
class HelpScreen(ModalScreen[None]):
"""Modern help screen with comprehensive keyboard shortcuts."""
help_content: str
BINDINGS = [
Binding("escape", "app.pop_screen", "Close"),
Binding("q", "app.pop_screen", "Close"),
Binding("enter", "app.pop_screen", "Close"),
Binding("f1", "app.pop_screen", "Close"),
]
def __init__(self, help_content: str):
super().__init__()
self.help_content = help_content
@override
def compose(self) -> ComposeResult:
with Container(classes="modal-container"):
yield Static("📚 Help & Keyboard Shortcuts", classes="title")
yield Static("Enhanced navigation and productivity features", classes="subtitle")
yield Rule(line_style="heavy")
with ScrollableContainer():
yield Markdown(self.help_content)
yield Container(
Button("✅ Got it! (Press Escape or Enter)", id="close_btn", variant="primary"),
classes="action_buttons center",
)
def on_mount(self) -> None:
"""Initialize the help screen."""
# Focus the close button
self.query_one("#close_btn").focus()
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Close help screen."""
if event.button.id == "close_btn":
self.app.pop_screen()

View File

@@ -0,0 +1,253 @@
"""Enhanced ingestion screen with better UX."""
import asyncio
from datetime import datetime
from textual import work
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Horizontal
from textual.screen import ModalScreen
from textual.widgets import Button, Input, Label, LoadingIndicator, Rule, Static
from typing_extensions import override
from ....core.models import IngestionJob, IngestionSource, StorageBackend
from ..models import CollectionInfo
from ..widgets import EnhancedProgressBar
class IngestionScreen(ModalScreen[None]):
"""Enhanced ingestion screen with better UX and keyboard navigation."""
collection: CollectionInfo
selected_type: IngestionSource
progress_value: int
BINDINGS = [
Binding("escape", "app.pop_screen", "Cancel"),
Binding("ctrl+i", "start_ingestion", "Start"),
Binding("1", "select_web", "Web", show=False),
Binding("2", "select_repo", "Repository", show=False),
Binding("3", "select_docs", "Documentation", show=False),
Binding("enter", "start_ingestion", "Start Ingestion"),
Binding("tab", "focus_next", "Next Field"),
Binding("shift+tab", "focus_previous", "Previous Field"),
]
def __init__(self, collection: CollectionInfo):
super().__init__()
self.collection = collection
self.selected_type = IngestionSource.WEB
self.progress_value = 0
@override
def compose(self) -> ComposeResult:
with Container(classes="modal-container"):
yield Static("📥 Modern Ingestion Interface", classes="title")
yield Static(
f"Target: {self.collection['name']} ({self.collection['backend']})",
classes="subtitle",
)
yield Rule()
# Enhanced input section
yield Container(
Label("🌐 Source URL:", classes="input-label"),
Input(
placeholder="https://docs.example.com or file:///path/to/repo",
id="url_input",
classes="modern-input",
),
Label("📋 Source Type (Press 1/2/3):", classes="input-label"),
Horizontal(
Button("🌐 Web (1)", id="web_btn", variant="primary", classes="type-button"),
Button(
"📦 Repository (2)", id="repo_btn", variant="default", classes="type-button"
),
Button(
"📖 Documentation (3)", id="docs_btn", variant="default", classes="type-button"
),
classes="type_buttons",
),
Rule(line_style="dashed"),
classes="input-section card",
)
# Enhanced Progress section
yield Container(
Label("🔄 Progress:", classes="progress-label"),
EnhancedProgressBar(id="enhanced_progress", total=100),
Static("Ready to start", id="progress_text", classes="status-text"),
classes="progress-section card",
)
# Action buttons
yield Horizontal(
Button("🚀 Start Ingestion", id="start_btn", variant="success"),
Button("❌ Cancel", id="cancel_btn", variant="error"),
classes="action_buttons",
)
yield LoadingIndicator(id="loading", classes="pulse")
def on_mount(self) -> None:
"""Initialize the screen."""
self.query_one("#loading").display = False
self.selected_type = IngestionSource.WEB
# Focus the URL input field by default
self.query_one("#url_input").focus()
def action_select_web(self) -> None:
"""Select web ingestion type."""
self.selected_type = IngestionSource.WEB
self.update_type_buttons("web")
def action_select_repo(self) -> None:
"""Select repository ingestion type."""
self.selected_type = IngestionSource.REPOSITORY
self.update_type_buttons("repo")
def action_select_docs(self) -> None:
"""Select documentation ingestion type."""
self.selected_type = IngestionSource.DOCUMENTATION
self.update_type_buttons("docs")
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses with enhanced feedback."""
button_id = event.button.id
if button_id == "web_btn":
self.action_select_web()
elif button_id == "repo_btn":
self.action_select_repo()
elif button_id == "docs_btn":
self.action_select_docs()
elif button_id == "start_btn":
self.action_start_ingestion()
elif button_id == "cancel_btn":
self.app.pop_screen()
def update_type_buttons(self, selected: str) -> None:
"""Update type button visual states."""
buttons = {
"web": self.query_one("#web_btn", Button),
"repo": self.query_one("#repo_btn", Button),
"docs": self.query_one("#docs_btn", Button),
}
for btn_type, button in buttons.items():
if btn_type == selected:
button.variant = "primary"
else:
button.variant = "default"
def on_input_submitted(self, event: Input.Submitted) -> None:
"""Handle URL input submission."""
if event.input.id == "url_input":
self.action_start_ingestion()
def action_start_ingestion(self) -> None:
"""Start the enhanced ingestion process."""
url_input = self.query_one("#url_input", Input)
if not url_input.value.strip():
self.notify("🔍 Please enter a source URL", severity="error")
url_input.focus()
return
self.perform_ingestion(url_input.value.strip())
@work(exclusive=True)
async def perform_ingestion(self, source_url: str) -> None:
"""Perform ingestion with enhanced progress tracking and better UX."""
loading = self.query_one("#loading")
enhanced_progress = self.query_one("#enhanced_progress", EnhancedProgressBar)
progress_text = self.query_one("#progress_text", Static)
try:
loading.display = True
# Enhanced progress tracking with better visual feedback
enhanced_progress.update_progress(5, "Initializing ingestion pipeline...")
progress_text.update("🚀 Starting modern ingestion process...")
await asyncio.sleep(0.3)
# Determine storage backend
storage_backend = (
StorageBackend.WEAVIATE
if self.collection["type"] == "weaviate"
else StorageBackend.OPEN_WEBUI
)
enhanced_progress.update_progress(15, "Creating ingestion job...")
progress_text.update("📋 Configuring job parameters...")
await asyncio.sleep(0.4)
# Create ingestion job
job = IngestionJob(
source_url=source_url,
source_type=self.selected_type,
storage_backend=storage_backend,
created_at=datetime.now(),
)
enhanced_progress.update_progress(25, "Loading ingestion modules...")
progress_text.update("⚡ Importing processing components...")
await asyncio.sleep(0.4)
from ....flows.ingestion import ingest_documents_task
enhanced_progress.update_progress(35, "Connecting to services...")
progress_text.update(f"🔗 Establishing connection to {storage_backend.value}...")
await asyncio.sleep(0.5)
enhanced_progress.update_progress(45, "Fetching source content...")
progress_text.update("📄 Retrieving documents from source...")
await asyncio.sleep(0.6)
# Simulate realistic progress steps
progress_steps = [
(55, "Parsing document structure...", "🔍 Analyzing content structure..."),
(65, "Extracting text content...", "📝 Processing text and metadata..."),
(75, "Generating embeddings...", "🧠 Creating vector embeddings..."),
(85, "Storing in database...", "💾 Persisting to storage backend..."),
(95, "Finalizing operation...", "🎯 Completing ingestion process..."),
]
for progress, status, text in progress_steps:
enhanced_progress.update_progress(progress, status)
progress_text.update(text)
await asyncio.sleep(0.7)
# Perform actual ingestion
successful, failed = await ingest_documents_task(
job, collection_name=self.collection["name"]
)
# Success handling with celebratory feedback
enhanced_progress.update_progress(100, "Completed successfully!")
progress_text.update(
f"🎉 Ingestion complete: {successful} documents added, {failed} failed"
)
# Show enhanced success notification
if successful > 0:
self.notify(
f"🎉 Successfully ingested {successful} documents!",
severity="information"
)
if failed > 0:
self.notify(f"⚠️ {failed} documents failed to process", severity="warning")
else:
self.notify("❌ No documents were successfully processed", severity="error")
# Keep results visible before closing
await asyncio.sleep(3)
self.app.pop_screen()
except Exception as e:
enhanced_progress.update_progress(0, "Ingestion failed")
progress_text.update(f"❌ Error occurred: {str(e)[:100]}")
self.notify(f"❌ Ingestion failed: {e}", severity="error")
await asyncio.sleep(2) # Show error before allowing interaction
finally:
loading.display = False
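
Behind the progress theater, the screen just builds an `IngestionJob` and awaits `ingest_documents_task`. A sketch of the same call outside the TUI; the absolute import paths are guessed from this commit's layout and may differ:

```python
import asyncio
from datetime import datetime

from ingest_pipeline.core.models import IngestionJob, IngestionSource, StorageBackend
from ingest_pipeline.flows.ingestion import ingest_documents_task


async def main() -> None:
    job = IngestionJob(
        source_url="https://docs.example.com",
        source_type=IngestionSource.WEB,
        storage_backend=StorageBackend.WEAVIATE,
        created_at=datetime.now(),
    )
    successful, failed = await ingest_documents_task(job, collection_name="docs_example")
    print(f"{successful} ingested, {failed} failed")


if __name__ == "__main__":
    asyncio.run(main())
```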

View File

@@ -0,0 +1,190 @@
"""Search screen for finding documents within collections."""
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container
from textual.screen import Screen
from textual.widgets import Button, Footer, Header, Input, LoadingIndicator, Static
from typing_extensions import override
from ....storage.openwebui import OpenWebUIStorage
from ....storage.weaviate import WeaviateStorage
from ..models import CollectionInfo
from ..widgets import EnhancedDataTable
class SearchScreen(Screen[None]):
"""Screen for searching within a collection with enhanced keyboard navigation."""
collection: CollectionInfo
weaviate: WeaviateStorage | None
openwebui: OpenWebUIStorage | None
BINDINGS = [
Binding("escape", "app.pop_screen", "Back"),
Binding("enter", "perform_search", "Search"),
Binding("ctrl+f", "focus_search", "Focus Search"),
Binding("f3", "perform_search", "Search Again"),
Binding("ctrl+r", "clear_results", "Clear Results"),
Binding("/", "focus_search", "Quick Search"),
]
def __init__(
self,
collection: CollectionInfo,
weaviate: WeaviateStorage | None,
openwebui: OpenWebUIStorage | None,
):
super().__init__()
self.collection = collection
self.weaviate = weaviate
self.openwebui = openwebui
@override
def compose(self) -> ComposeResult:
yield Header()
yield Container(
Static(
f"🔍 Search in: {self.collection['name']} ({self.collection['backend']})",
classes="title",
),
Static("Press / or Ctrl+F to focus search, Enter to search", classes="subtitle"),
Input(placeholder="Enter search query... (press Enter to search)", id="search_input"),
Button("🔍 Search", id="search_btn", variant="primary"),
Button("🗑️ Clear Results", id="clear_btn", variant="default"),
EnhancedDataTable(id="results_table"),
Static("Enter your search query to find relevant documents.", id="search_status", classes="status-text"),
LoadingIndicator(id="loading"),
classes="main_container",
)
yield Footer()
def on_mount(self) -> None:
"""Initialize the screen."""
self.query_one("#loading").display = False
# Setup results table
table = self.query_one("#results_table", EnhancedDataTable)
table.add_columns("Title", "Content Preview", "Score")
# Focus search input
self.query_one("#search_input").focus()
def action_focus_search(self) -> None:
"""Focus the search input field."""
search_input = self.query_one("#search_input", Input)
search_input.focus()
def action_clear_results(self) -> None:
"""Clear search results."""
table = self.query_one("#results_table", EnhancedDataTable)
table.clear()
table.add_columns("Title", "Content Preview", "Score")
status = self.query_one("#search_status", Static)
status.update("Search results cleared. Enter a new query to search.")
def on_input_submitted(self, event: Input.Submitted) -> None:
"""Handle search input submission."""
if event.input.id == "search_input":
self.action_perform_search()
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "search_btn":
self.action_perform_search()
elif event.button.id == "clear_btn":
self.action_clear_results()
def action_perform_search(self) -> None:
"""Perform search."""
search_input = self.query_one("#search_input", Input)
if not search_input.value.strip():
self.notify("Please enter a search query", severity="warning")
search_input.focus()
return
self.run_worker(self.search_collection(search_input.value.strip()))
async def search_collection(self, query: str) -> None:
"""Search the collection."""
loading = self.query_one("#loading")
table = self.query_one("#results_table", EnhancedDataTable)
status = self.query_one("#search_status", Static)
try:
loading.display = True
status.update(f"🔍 Searching for '{query}'...")
table.clear()
table.add_columns("Title", "Content Preview", "Score")
results = []
if self.collection["type"] == "weaviate" and self.weaviate:
results = await self.search_weaviate(query)
elif self.collection["type"] == "openwebui" and self.openwebui:
results = await self.search_openwebui(query)
# Add results to table
for result in results:
title = result.get("title", "Untitled")
content = result.get("content", "")
score = result.get("score", 0)
table.add_row(
title[:50] if isinstance(title, str) else str(title)[:50],
(content[:100] + "...")
if isinstance(content, str)
else str(content)[:100] + "...",
f"{score:.3f}" if isinstance(score, (int, float)) else str(score),
)
if not results:
status.update(f"No results found for '{query}'. Try different keywords.")
self.notify("No results found", severity="information")
else:
status.update(f"Found {len(results)} results for '{query}'. Use arrow keys to navigate.")
self.notify(f"Found {len(results)} results", severity="information")
# Focus the table for navigation
table.focus()
except Exception as e:
status.update(f"Search error: {e}")
self.notify(f"Search error: {e}", severity="error")
finally:
loading.display = False
async def search_weaviate(self, query: str) -> list[dict[str, str | float]]:
"""Search Weaviate collection."""
if not self.weaviate:
return []
try:
await self.weaviate.initialize()
results_generator = self.weaviate.search(query, limit=20)
results = [doc async for doc in results_generator]
# Convert Document objects to dict format expected by the UI
return [
{
"title": getattr(doc, "title", "Untitled"),
"content": getattr(doc, "content", ""),
"score": getattr(doc, "score", 0.0),
}
for doc in results
]
except Exception as e:
self.notify(f"Weaviate search error: {e}", severity="error")
return []
async def search_openwebui(self, query: str) -> list[dict[str, str | float]]:
"""Search OpenWebUI collection."""
if not self.openwebui:
return []
try:
# OpenWebUI does not have a direct search API, so return empty
# In a real implementation, you would need to implement search via their API
self.notify("OpenWebUI search not yet implemented", severity="warning")
return []
except Exception as e:
self.notify(f"OpenWebUI search error: {e}", severity="error")
return []
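
`search_weaviate` drains an async generator with an `async for` comprehension. A standalone toy version of that pattern, with a fake generator standing in for the Weaviate client:

```python
import asyncio
from collections.abc import AsyncIterator


async def fake_search(query: str, limit: int = 3) -> AsyncIterator[dict[str, str | float]]:
    for i in range(limit):
        await asyncio.sleep(0)  # stand-in for network latency
        yield {"title": f"{query} result {i}", "content": "...", "score": 1.0 - i * 0.1}


async def main() -> None:
    results = [doc async for doc in fake_search("vector databases")]
    for doc in results:
        print(f"{doc['title']}: {doc['score']:.3f}")


asyncio.run(main())
```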

View File

@@ -0,0 +1,346 @@
"""Modern CSS styles for the TUI application."""
# Enhanced modern CSS with better focus indicators and navigation feedback
TUI_CSS = """
/* Base styling */
Screen {
background: #1a1a1a;
}
* {
color: #ffffff;
}
/* Title styling */
.title {
text-align: center;
margin: 1;
color: #ffffff;
text-style: bold;
background: #333333;
padding: 1;
border: solid #0088cc;
}
.subtitle {
text-align: center;
margin: 1 0;
color: #cccccc;
text-style: italic;
background: #333333;
padding: 1;
}
/* Container styling */
.main_container {
margin: 1;
padding: 1;
background: #333333;
}
.card {
background: #333333;
padding: 1;
margin: 1;
color: #ffffff;
border: solid #444444;
}
.card:focus-within {
border: solid #0088cc;
}
/* Button styling with focus states */
Button {
background: #444444;
color: #ffffff;
margin: 0 1;
border: solid transparent;
}
Button:hover {
background: #0088cc;
color: #ffffff;
}
Button:focus {
border: solid #ffffff;
background: #0088cc;
}
Button.-primary {
background: #0088cc;
color: #ffffff;
}
Button.-success {
background: #28a745;
color: #ffffff;
}
Button.-error {
background: #dc3545;
color: #ffffff;
}
Button.-warning {
background: #ffc107;
color: #000000;
}
/* Enhanced DataTable with focus indicators */
DataTable {
background: #333333;
color: #ffffff;
border: solid #444444;
}
DataTable:focus {
border: solid #0088cc;
}
DataTable > .datatable--header {
background: #444444;
color: #ffffff;
text-style: bold;
}
DataTable > .datatable--cursor {
background: #0088cc;
color: #ffffff;
}
DataTable > .datatable--cursor-row {
background: #0066aa;
color: #ffffff;
}
/* Input styling */
Input {
background: #333333;
color: #ffffff;
border: solid #666666;
}
Input:focus {
border: solid #0088cc;
}
/* Header and Footer */
Header, Footer {
background: #333333;
color: #ffffff;
}
/* Tab styling with focus indicators */
Tab {
background: #333333;
color: #ffffff;
border: solid transparent;
}
Tab:focus {
border: solid #ffffff;
}
Tab.-active {
background: #0088cc;
color: #ffffff;
text-style: bold;
}
/* Label styling */
Label {
color: #ffffff;
}
/* Status indicators */
.status-active {
color: #28a745;
}
.status-error {
color: #dc3545;
}
.status-warning {
color: #ffc107;
}
/* Animations */
.pulse {
text-style: blink;
}
.glow {
background: #0088cc;
color: #ffffff;
}
.shimmer {
text-style: italic;
}
/* Metrics styling */
.metrics-value {
text-style: bold;
text-align: center;
color: #ffffff;
}
.metrics-label {
text-align: center;
color: #cccccc;
}
.metrics-description {
text-align: center;
color: #999999;
text-style: italic;
}
/* Section titles */
.section-title {
text-style: bold;
color: #ffffff;
margin: 1 0;
}
/* Status text */
.status-text {
color: #cccccc;
}
/* Button groups */
.button_bar {
margin: 1 0;
}
.action_buttons {
margin: 1;
text-align: center;
}
/* Progress styling */
.progress-label {
color: #ffffff;
margin: 1 0;
}
/* Responsive grid */
.responsive-grid {
grid-size: 4;
grid-gutter: 1;
}
.metrics-grid {
grid-size: 4;
grid-gutter: 1;
margin: 1;
}
/* Modal container */
.modal-container {
background: #333333;
border: solid #0088cc;
padding: 2;
margin: 2;
}
/* Chart placeholders */
.chart-title {
text-style: bold;
color: #ffffff;
margin: 1 0;
}
.chart-placeholder {
color: #999999;
text-style: italic;
text-align: center;
padding: 2;
}
/* Analytics grid */
.analytics-grid {
grid-size: 2;
grid-gutter: 1;
}
/* Enhanced table styling */
.enhanced-table {
background: #333333;
color: #ffffff;
border: solid #666666;
}
.enhanced-table:focus {
border: solid #0088cc;
}
/* Status bar */
.status-bar {
background: #444444;
color: #ffffff;
padding: 0 1;
}
/* Input section styling */
.input-section {
margin: 1;
padding: 1;
}
.input-label {
color: #ffffff;
margin: 1 0;
}
.modern-input {
background: #333333;
color: #ffffff;
border: solid #666666;
margin: 1 0;
}
.modern-input:focus {
border: solid #0088cc;
}
/* Type buttons */
.type_buttons {
margin: 1 0;
}
.type-button {
margin: 0 1;
}
/* Progress section */
.progress-section {
margin: 1;
padding: 1;
}
/* Center alignment */
.center {
text-align: center;
}
/* Warning styling */
.warning {
color: #ffc107;
text-style: bold;
}
/* Pressed button state */
.pressed {
background: #006699;
color: #ffffff;
}
/* Focus ring for better accessibility */
*:focus {
outline: solid #0088cc;
}
"""

View File

@@ -0,0 +1,5 @@
"""Utility functions for the TUI."""
from .runners import dashboard, run_textual_tui
__all__ = ["dashboard", "run_textual_tui"]

View File

@@ -0,0 +1,64 @@
"""TUI runner functions and initialization."""
import asyncio
from ....core.models import StorageBackend, StorageConfig
from ....storage.openwebui import OpenWebUIStorage
from ....storage.weaviate import WeaviateStorage
from ..app import CollectionManagementApp
async def run_textual_tui() -> None:
"""Run the enhanced modern TUI with better error handling and initialization."""
from ....config.settings import get_settings
settings = get_settings()
# Initialize storage backends with enhanced error handling
weaviate = None
openwebui = None
print("🚀 Initializing Modern Collection Management System...")
try:
print("🔗 Connecting to Weaviate...")
weaviate_config = StorageConfig(
backend=StorageBackend.WEAVIATE,
endpoint=settings.weaviate_endpoint,
api_key=settings.weaviate_api_key,
collection_name="default",
)
weaviate = WeaviateStorage(weaviate_config)
await weaviate.initialize()
print("✅ Weaviate connected successfully!")
except Exception as e:
print(f"⚠️ Weaviate connection failed: {e}")
try:
print("🔗 Connecting to OpenWebUI...")
openwebui_config = StorageConfig(
backend=StorageBackend.OPEN_WEBUI,
endpoint=settings.openwebui_endpoint,
api_key=settings.openwebui_api_key,
collection_name="default",
)
openwebui = OpenWebUIStorage(openwebui_config)
await openwebui.initialize()
print("✅ OpenWebUI connected successfully!")
except Exception as e:
print(f"⚠️ OpenWebUI connection failed: {e}")
if not weaviate and not openwebui:
print("❌ Error: Could not connect to any storage backend")
print("Please check your configuration and try again.")
return
print("🎉 Launching Enhanced TUI with Keyboard Navigation...")
app = CollectionManagementApp(weaviate, openwebui)
await app.run_async()
def dashboard() -> None:
"""Launch the modern collection dashboard."""
asyncio.run(run_textual_tui())
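
Assuming the package layout above, `dashboard` doubles as a synchronous entry point; a minimal sketch of invoking it directly:

```python
if __name__ == "__main__":
    # dashboard() blocks: it wraps run_textual_tui() in asyncio.run().
    dashboard()
```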

View File

@@ -0,0 +1,12 @@
"""Enhanced widgets with keyboard navigation support."""
from .cards import MetricsCard
from .indicators import EnhancedProgressBar, StatusIndicator
from .tables import EnhancedDataTable
__all__ = [
"MetricsCard",
"StatusIndicator",
"EnhancedProgressBar",
"EnhancedDataTable",
]

View File

@@ -0,0 +1,28 @@
"""Metrics card widget."""
from typing import Any
from textual.app import ComposeResult
from textual.widgets import Static
from typing_extensions import override
class MetricsCard(Static):
"""A modern metrics display card."""
title: str
value: str
description: str
def __init__(self, title: str, value: str, description: str = "", **kwargs: Any) -> None:
super().__init__(**kwargs)
self.title = title
self.value = value
self.description = description
@override
def compose(self) -> ComposeResult:
yield Static(self.value, classes="metrics-value")
yield Static(self.title, classes="metrics-label")
if self.description:
yield Static(self.description, classes="metrics-description")
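
A hedged sketch of composing `MetricsCard` in a grid, mirroring the dashboard's usage; it assumes `MetricsCard` is in scope and inlines the grid CSS that styles.py provides:

```python
from textual.app import App, ComposeResult
from textual.containers import Grid


class CardsDemo(App[None]):
    CSS = ".responsive-grid { grid-size: 4; grid-gutter: 1; }"

    def compose(self) -> ComposeResult:
        yield Grid(
            MetricsCard("Collections", "12", "Active collections"),
            MetricsCard("Documents", "34,567", "Total indexed"),
            MetricsCard("Backends", "2", "Connected services"),
            MetricsCard("Status", "Online", "System health"),
            classes="responsive-grid",
        )


if __name__ == "__main__":
    CardsDemo().run()
```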

View File

@@ -0,0 +1,86 @@
"""Status indicators and progress bars with enhanced visual feedback."""
from typing import Any
from textual.app import ComposeResult
from textual.widgets import ProgressBar, Static
from typing_extensions import override
class StatusIndicator(Static):
"""Modern status indicator with color coding and animations."""
status: str
def __init__(self, status: str, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.status = status
self.update_status(status)
def update_status(self, status: str) -> None:
"""Update the status display with enhanced visual feedback."""
self.status = status
# Remove previous status classes
self.remove_class("status-active", "status-error", "status-warning", "pulse", "glow")
if status.lower() in ["active", "online", "connected", "✓ active"]:
self.add_class("status-active")
self.add_class("glow")
self.update("🟢 " + status)
elif status.lower() in ["error", "failed", "offline", "disconnected"]:
self.add_class("status-error")
self.add_class("pulse")
self.update("🔴 " + status)
elif status.lower() in ["warning", "pending", "in_progress"]:
self.add_class("status-warning")
self.add_class("pulse")
self.update("🟡 " + status)
elif status.lower() in ["loading", "connecting"]:
self.add_class("shimmer")
self.update("🔄 " + status)
else:
self.update("" + status)
class EnhancedProgressBar(Static):
"""Enhanced progress bar with better visual feedback."""
total: int
progress: int
status_text: str
def __init__(self, total: int = 100, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.total = total
self.progress = 0
self.status_text = "Ready"
@override
def compose(self) -> ComposeResult:
yield Static("", id="progress_status", classes="progress-label")
yield ProgressBar(total=self.total, id="progress_bar", show_eta=True, classes="shimmer")
def update_progress(self, progress: int, status: str = "") -> None:
"""Update progress with enhanced feedback."""
self.progress = progress
if status:
self.status_text = status
# Update the progress bar
progress_bar = self.query_one("#progress_bar", ProgressBar)
progress_bar.update(progress=progress)
# Update status text with icons
status_display = self.query_one("#progress_status", Static)
if progress >= 100:
status_display.update(f"{self.status_text}")
progress_bar.add_class("glow")
elif progress >= 75:
status_display.update(f"🔥 {self.status_text}")
elif progress >= 50:
status_display.update(f"{self.status_text}")
elif progress >= 25:
status_display.update(f"🔄 {self.status_text}")
else:
status_display.update(f"🚀 {self.status_text}")

View File

@@ -0,0 +1,126 @@
"""Enhanced DataTable with improved keyboard navigation."""
from typing import Any
from textual import events
from textual.binding import Binding
from textual.message import Message
from textual.widgets import DataTable
class EnhancedDataTable(DataTable[Any]):
"""DataTable with enhanced keyboard navigation and visual feedback."""
BINDINGS = [
Binding("up,k", "cursor_up", "Cursor Up", show=False),
Binding("down,j", "cursor_down", "Cursor Down", show=False),
Binding("left,h", "cursor_left", "Cursor Left", show=False),
Binding("right,l", "cursor_right", "Cursor Right", show=False),
Binding("home", "cursor_home", "First Row", show=False),
Binding("end", "cursor_end", "Last Row", show=False),
Binding("pageup", "page_up", "Page Up", show=False),
Binding("pagedown", "page_down", "Page Down", show=False),
Binding("enter", "select_cursor", "Select", show=False),
Binding("space", "toggle_selection", "Toggle Selection", show=False),
Binding("ctrl+a", "select_all", "Select All", show=False),
Binding("ctrl+shift+a", "clear_selection", "Clear Selection", show=False),
]
def __init__(self, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.cursor_type = "row" # Default to row selection
self.zebra_stripes = True # Enable zebra striping for better visibility
self.show_cursor = True
def on_key(self, event: events.Key) -> None:
"""Handle additional keyboard shortcuts."""
if event.key == "ctrl+1":
# Jump to first column
self.move_cursor(column=0)
event.prevent_default()
elif event.key == "ctrl+9":
# Jump to last column
if self.columns:
self.move_cursor(column=len(self.columns) - 1)
event.prevent_default()
elif event.key == "/":
# Start quick search (to be implemented by parent)
self.post_message(self.QuickSearch(self))
event.prevent_default()
elif event.key == "escape":
# Clear selection or exit search
# Clear selection by calling action
self.action_clear_selection()
event.prevent_default()
# No else clause needed - just handle our events
def action_cursor_home(self) -> None:
"""Move cursor to first row."""
if self.row_count > 0:
self.move_cursor(row=0)
def action_cursor_end(self) -> None:
"""Move cursor to last row."""
if self.row_count > 0:
self.move_cursor(row=self.row_count - 1)
def action_page_up(self) -> None:
"""Move cursor up by visible page size."""
if self.row_count > 0:
page_size = max(1, self.size.height // 2) # Approximate visible rows
new_row = max(0, self.cursor_coordinate.row - page_size)
self.move_cursor(row=new_row)
def action_page_down(self) -> None:
"""Move cursor down by visible page size."""
if self.row_count > 0:
page_size = max(1, self.size.height // 2) # Approximate visible rows
new_row = min(self.row_count - 1, self.cursor_coordinate.row + page_size)
self.move_cursor(row=new_row)
def action_toggle_selection(self) -> None:
"""Toggle selection of current row."""
if self.row_count > 0:
current_row = self.cursor_coordinate.row
# This will be handled by the parent screen
self.post_message(self.RowToggled(self, current_row))
def action_select_all(self) -> None:
"""Select all rows."""
# This will be handled by the parent screen
self.post_message(self.SelectAll(self))
def action_clear_selection(self) -> None:
"""Clear all selections."""
# This will be handled by the parent screen
self.post_message(self.ClearSelection(self))
# Custom messages for enhanced functionality
class QuickSearch(Message):
"""Posted when user wants to start a quick search."""
def __init__(self, table: "EnhancedDataTable") -> None:
super().__init__()
self.table = table
class RowToggled(Message):
"""Posted when a row selection is toggled."""
def __init__(self, table: "EnhancedDataTable", row_index: int) -> None:
super().__init__()
self.table = table
self.row_index = row_index
class SelectAll(Message):
"""Posted when user wants to select all rows."""
def __init__(self, table: "EnhancedDataTable") -> None:
super().__init__()
self.table = table
class ClearSelection(Message):
"""Posted when user wants to clear selection."""
def __init__(self, table: "EnhancedDataTable") -> None:
super().__init__()
self.table = table
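
Textual derives handler names from the message class, so a parent receives `EnhancedDataTable.RowToggled` in `on_enhanced_data_table_row_toggled`, exactly as the document management screen does. A minimal sketch, assuming the widget is importable from this module:

```python
from textual.app import App, ComposeResult


class TableDemo(App[None]):
    def compose(self) -> ComposeResult:
        yield EnhancedDataTable(id="demo")

    def on_mount(self) -> None:
        table = self.query_one(EnhancedDataTable)
        table.add_columns("Name")
        table.add_row("first")
        table.add_row("second")

    def on_enhanced_data_table_row_toggled(self, event: EnhancedDataTable.RowToggled) -> None:
        self.notify(f"Toggled row {event.row_index}")


if __name__ == "__main__":
    TableDemo().run()
```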

View File

@@ -0,0 +1,5 @@
"""Configuration management."""
from .settings import Settings, get_settings
__all__ = ["Settings", "get_settings"]

View File

@@ -0,0 +1,103 @@
"""Application settings and configuration."""
from functools import lru_cache
from typing import Literal
from pydantic import Field, HttpUrl
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
"""Application settings."""
model_config = SettingsConfigDict(
env_file=".env",
env_file_encoding="utf-8",
case_sensitive=False,
extra="ignore", # Ignore extra environment variables
)
# API Keys
firecrawl_api_key: str | None = None
openwebui_api_key: str | None = None
weaviate_api_key: str | None = None
# Endpoints
llm_endpoint: HttpUrl = HttpUrl("http://llm.lab")
weaviate_endpoint: HttpUrl = HttpUrl("http://weaviate.yo")
openwebui_endpoint: HttpUrl = HttpUrl("http://chat.lab") # This will be the API URL
firecrawl_endpoint: HttpUrl = HttpUrl("http://crawl.lab:30002")
# Model Configuration
embedding_model: str = "ollama/bge-m3:latest"
embedding_dimension: int = 1024
# Ingestion Settings
default_batch_size: int = Field(default=50, gt=0, le=500)
max_file_size: int = 1_000_000
max_crawl_depth: int = Field(default=5, ge=1, le=20)
max_crawl_pages: int = Field(default=100, ge=1, le=1000)
# Storage Settings
default_storage_backend: Literal["weaviate", "open_webui"] = "weaviate"
default_collection_prefix: str = "docs"
# Prefect Settings
prefect_api_url: HttpUrl | None = None
prefect_api_key: str | None = None
prefect_work_pool: str = "default"
# Scheduling Defaults
default_schedule_interval: int = Field(default=60, ge=1, le=10080) # Max 1 week
# Performance Settings
max_concurrent_tasks: int = Field(default=5, ge=1, le=20)
request_timeout: int = Field(default=60, ge=10, le=300)
# Logging
log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"] = "INFO"
def get_storage_endpoint(self, backend: str) -> HttpUrl:
"""
Get endpoint for storage backend.
Args:
backend: Storage backend name
Returns:
Endpoint URL
"""
if backend == "weaviate":
return self.weaviate_endpoint
elif backend == "open_webui":
return self.openwebui_endpoint
else:
raise ValueError(f"Unknown backend: {backend}")
def get_api_key(self, service: str) -> str | None:
"""
Get API key for service.
Args:
service: Service name
Returns:
API key or None
"""
service_map = {
"firecrawl": self.firecrawl_api_key,
"openwebui": self.openwebui_api_key,
"weaviate": self.weaviate_api_key,
}
return service_map.get(service)
@lru_cache
def get_settings() -> Settings:
"""
Get cached settings instance.
Returns:
Settings instance
"""
return Settings()
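
A quick usage sketch; values resolve from `.env` via pydantic-settings, and the printed endpoint assumes the defaults above:

```python
from ingest_pipeline.config import get_settings

settings = get_settings()  # cached via lru_cache
print(settings.get_storage_endpoint("weaviate"))  # e.g. http://weaviate.yo/
print(settings.get_api_key("firecrawl"))          # None unless set in .env
```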

View File

@@ -0,0 +1,27 @@
"""Core module for ingestion pipeline."""
from .exceptions import (
IngestionError,
StorageError,
VectorizationError,
)
from .models import (
Document,
IngestionJob,
IngestionResult,
IngestionSource,
IngestionStatus,
StorageBackend,
)
__all__ = [
"Document",
"IngestionJob",
"IngestionResult",
"IngestionSource",
"IngestionStatus",
"StorageBackend",
"IngestionError",
"StorageError",
"VectorizationError",
]

View File

@@ -0,0 +1,31 @@
"""Custom exceptions for the ingestion pipeline."""
class IngestionError(Exception):
"""Base exception for ingestion errors."""
pass
class StorageError(IngestionError):
"""Exception for storage-related errors."""
pass
class VectorizationError(IngestionError):
"""Exception for vectorization errors."""
pass
class ConfigurationError(IngestionError):
"""Exception for configuration errors."""
pass
class SourceNotFoundError(IngestionError):
"""Exception when source cannot be found or accessed."""
pass

View File

@@ -0,0 +1,149 @@
"""Core data models with strict typing."""
from collections.abc import Callable
from datetime import UTC, datetime
from enum import Enum
from typing import TypedDict
from uuid import UUID, uuid4
from pydantic import BaseModel, Field, HttpUrl
class IngestionStatus(str, Enum):
"""Status of an ingestion job."""
PENDING = "pending"
IN_PROGRESS = "in_progress"
COMPLETED = "completed"
PARTIAL = "partial" # Some documents succeeded, some failed
FAILED = "failed"
CANCELLED = "cancelled"
class StorageBackend(str, Enum):
"""Available storage backends."""
WEAVIATE = "weaviate"
OPEN_WEBUI = "open_webui"
class IngestionSource(str, Enum):
"""Types of ingestion sources."""
WEB = "web"
REPOSITORY = "repository"
DOCUMENTATION = "documentation"
class VectorConfig(BaseModel):
"""Configuration for vectorization."""
model: str = Field(default="ollama/bge-m3:latest")
embedding_endpoint: HttpUrl = Field(default=HttpUrl("http://llm.lab"))
dimension: int = Field(default=1024)
batch_size: int = Field(default=100, gt=0, le=1000)
class StorageConfig(BaseModel):
"""Configuration for storage backend."""
backend: StorageBackend
endpoint: HttpUrl
api_key: str | None = Field(default=None)
collection_name: str = Field(default="documents")
batch_size: int = Field(default=100, gt=0, le=1000)
class FirecrawlConfig(BaseModel):
"""Configuration for Firecrawl ingestion (operational parameters only)."""
formats: list[str] = Field(default_factory=lambda: ["markdown", "html"])
max_depth: int = Field(default=5, ge=1, le=20)
limit: int = Field(default=100, ge=1, le=1000)
only_main_content: bool = Field(default=True)
include_subdomains: bool = Field(default=False)
class RepomixConfig(BaseModel):
"""Configuration for Repomix ingestion."""
include_patterns: list[str] = Field(
default_factory=lambda: ["*.py", "*.js", "*.ts", "*.md", "*.yaml", "*.json"]
)
exclude_patterns: list[str] = Field(
default_factory=lambda: ["**/node_modules/**", "**/__pycache__/**", "**/.git/**"]
)
max_file_size: int = Field(default=1_000_000) # 1MB
respect_gitignore: bool = Field(default=True)
class DocumentMetadata(TypedDict):
"""Metadata for a document."""
source_url: str
title: str | None
description: str | None
timestamp: datetime
content_type: str
word_count: int
char_count: int
class Document(BaseModel):
"""Represents a single document."""
id: UUID = Field(default_factory=uuid4)
content: str
metadata: DocumentMetadata
vector: list[float] | None = Field(default=None)
source: IngestionSource
collection: str = Field(default="documents")
class Config:
"""Pydantic configuration."""
json_encoders: dict[type, Callable[[UUID | datetime], str]] = {
            UUID: lambda v: str(v),
datetime: lambda v: v.isoformat() if isinstance(v, datetime) else str(v),
}
class IngestionJob(BaseModel):
"""Represents an ingestion job."""
id: UUID = Field(default_factory=uuid4)
source_type: IngestionSource
source_url: HttpUrl | str
status: IngestionStatus = Field(default=IngestionStatus.PENDING)
created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
updated_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
completed_at: datetime | None = Field(default=None)
error_message: str | None = Field(default=None)
document_count: int = Field(default=0)
storage_backend: StorageBackend
class Config:
"""Pydantic configuration."""
json_encoders: dict[type, Callable[[UUID | datetime], str]] = {
            UUID: lambda v: str(v),
datetime: lambda v: v.isoformat() if isinstance(v, datetime) else str(v),
}
class IngestionResult(BaseModel):
"""Result of an ingestion operation."""
job_id: UUID
status: IngestionStatus
documents_processed: int
documents_failed: int
duration_seconds: float
error_messages: list[str] = Field(default_factory=list)
class Config:
"""Pydantic configuration."""
json_encoders: dict[type, Callable[[UUID], str]] = {
UUID: lambda v: str(v),
}
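
A short sketch of constructing these models directly (field values are illustrative):

```python
from datetime import UTC, datetime

from ingest_pipeline.core.models import (
    Document,
    DocumentMetadata,
    IngestionJob,
    IngestionSource,
    StorageBackend,
)

metadata: DocumentMetadata = {
    "source_url": "https://example.com/docs",
    "title": "Example",
    "description": None,
    "timestamp": datetime.now(UTC),
    "content_type": "text/markdown",
    "word_count": 2,
    "char_count": 11,
}
doc = Document(content="hello world", metadata=metadata, source=IngestionSource.WEB)

job = IngestionJob(
    source_url="https://example.com/docs",
    source_type=IngestionSource.WEB,
    storage_backend=StorageBackend.WEAVIATE,
)
print(job.status)  # IngestionStatus.PENDING
```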

View File

@@ -0,0 +1,9 @@
"""Prefect flows for orchestration."""
from .ingestion import create_ingestion_flow
from .scheduler import create_scheduled_deployment
__all__ = [
"create_ingestion_flow",
"create_scheduled_deployment",
]

View File

@@ -0,0 +1,274 @@
"""Prefect flow for ingestion pipeline."""
from datetime import UTC, datetime
from typing import Literal
from prefect import flow, task
from ..core.exceptions import IngestionError
from ..core.models import (
FirecrawlConfig,
IngestionJob,
IngestionResult,
IngestionSource,
IngestionStatus,
RepomixConfig,
StorageBackend,
StorageConfig,
)
from ..ingestors import FirecrawlIngestor, RepomixIngestor
from ..storage import OpenWebUIStorage, WeaviateStorage
from ..storage.base import BaseStorage
@task(name="validate_source", retries=2, retry_delay_seconds=10, tags=["validation"])
async def validate_source_task(source_url: str, source_type: IngestionSource) -> bool:
"""
Validate that a source is accessible.
Args:
source_url: URL or path to source
source_type: Type of source
Returns:
True if valid
"""
if source_type == IngestionSource.WEB:
ingestor = FirecrawlIngestor()
elif source_type == IngestionSource.REPOSITORY:
ingestor = RepomixIngestor()
else:
raise ValueError(f"Unsupported source type: {source_type}")
result = await ingestor.validate_source(source_url)
return bool(result)
@task(name="initialize_storage", retries=3, retry_delay_seconds=5, tags=["storage"])
async def initialize_storage_task(config: StorageConfig) -> BaseStorage:
"""
Initialize storage backend.
Args:
config: Storage configuration
Returns:
Initialized storage adapter
"""
if config.backend == StorageBackend.WEAVIATE:
storage = WeaviateStorage(config)
elif config.backend == StorageBackend.OPEN_WEBUI:
storage = OpenWebUIStorage(config)
else:
raise ValueError(f"Unsupported backend: {config.backend}")
await storage.initialize()
return storage
@task(name="ingest_documents", retries=2, retry_delay_seconds=30, tags=["ingestion"])
async def ingest_documents_task(
    job: IngestionJob, collection_name: str | None = None, batch_size: int = 50
) -> tuple[int, int]:
"""
Ingest documents from source.
Args:
        job: Ingestion job configuration
        collection_name: Optional collection name; defaults to "docs_<source_type>"
        batch_size: Number of documents per batch
Returns:
Tuple of (processed_count, failed_count)
"""
# Select ingestor
if job.source_type == IngestionSource.WEB:
config = FirecrawlConfig()
ingestor = FirecrawlIngestor(config)
elif job.source_type == IngestionSource.REPOSITORY:
config = RepomixConfig()
ingestor = RepomixIngestor(config)
else:
raise ValueError(f"Unsupported source: {job.source_type}")
processed = 0
failed = 0
batch = []
# Initialize storage
from pydantic import HttpUrl
# Use provided collection name or generate default
if collection_name is None:
collection_name = f"docs_{job.source_type.value}"
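    # NOTE: endpoints below are hardcoded to the .env defaults rather than
    # read from get_settings(); adjust StorageConfig if your deployment differs.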
storage_config = StorageConfig(
backend=job.storage_backend,
endpoint=HttpUrl("http://weaviate.yo")
if job.storage_backend == StorageBackend.WEAVIATE
else HttpUrl("http://chat.lab"),
collection_name=collection_name,
)
if job.storage_backend == StorageBackend.WEAVIATE:
storage = WeaviateStorage(storage_config)
else:
storage = OpenWebUIStorage(storage_config)
await storage.initialize()
# Process documents
async for document in ingestor.ingest(job):
batch.append(document)
if len(batch) >= batch_size:
try:
stored_ids = await storage.store_batch(batch)
print(f"Successfully stored {len(stored_ids)} documents in batch")
processed += len(stored_ids)
failed += len(batch) - len(stored_ids)
except Exception as e:
print(f"Batch storage failed: {e}")
failed += len(batch)
batch = []
# Process remaining batch
if batch:
try:
stored_ids = await storage.store_batch(batch)
print(f"Successfully stored {len(stored_ids)} documents in final batch")
processed += len(stored_ids)
failed += len(batch) - len(stored_ids)
except Exception as e:
print(f"Final batch storage failed: {e}")
failed += len(batch)
return processed, failed
@task(name="update_job_status", tags=["tracking"])
async def update_job_status_task(
job: IngestionJob,
status: IngestionStatus,
processed: int = 0,
failed: int = 0,
error: str | None = None,
) -> IngestionJob:
"""
Update job status.
Args:
job: Ingestion job
status: New status
processed: Documents processed
failed: Documents failed
error: Error message if any
Returns:
Updated job
"""
job.status = status
job.updated_at = datetime.now(UTC)
job.document_count = processed
if status == IngestionStatus.COMPLETED:
job.completed_at = datetime.now(UTC)
if error:
job.error_message = error
return job
@flow(
name="ingestion_pipeline",
description="Main ingestion pipeline for documents",
retries=1,
retry_delay_seconds=60,
persist_result=True,
log_prints=True,
)
async def create_ingestion_flow(
source_url: str,
source_type: Literal["web", "repository", "documentation"],
storage_backend: Literal["weaviate", "open_webui"] = "weaviate",
collection_name: str | None = None,
validate_first: bool = True,
) -> IngestionResult:
"""
Main ingestion flow.
Args:
source_url: URL or path to source
source_type: Type of source
        storage_backend: Storage backend to use
        collection_name: Optional collection name passed through to ingestion
        validate_first: Whether to validate source first
Returns:
Ingestion result
"""
print(f"Starting ingestion from {source_url}")
# Create job
job = IngestionJob(
source_url=source_url,
source_type=IngestionSource(source_type),
storage_backend=StorageBackend(storage_backend),
status=IngestionStatus.PENDING,
)
start_time = datetime.now(UTC)
error_messages = []
processed = 0
failed = 0
try:
# Validate source if requested
if validate_first:
print("Validating source...")
is_valid = await validate_source_task(source_url, job.source_type)
if not is_valid:
raise IngestionError(f"Source validation failed: {source_url}")
# Update status to in progress
job = await update_job_status_task(job, IngestionStatus.IN_PROGRESS)
# Run ingestion
print("Ingesting documents...")
processed, failed = await ingest_documents_task(job, collection_name)
# Update final status
if failed > 0:
error_messages.append(f"{failed} documents failed to process")
# Set status based on results
if processed == 0 and failed > 0:
final_status = IngestionStatus.FAILED
elif failed > 0:
final_status = IngestionStatus.PARTIAL
else:
final_status = IngestionStatus.COMPLETED
job = await update_job_status_task(job, final_status, processed=processed, failed=failed)
print(f"Ingestion completed: {processed} processed, {failed} failed")
except Exception as e:
print(f"Ingestion failed: {e}")
error_messages.append(str(e))
# Don't reset counts - keep whatever was processed before the error
job = await update_job_status_task(job, IngestionStatus.FAILED,
processed=processed,
failed=failed,
error=str(e))
# Calculate duration
duration = (datetime.now(UTC) - start_time).total_seconds()
return IngestionResult(
job_id=job.id,
status=job.status,
documents_processed=processed,
documents_failed=failed,
duration_seconds=duration,
error_messages=error_messages,
)
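
A minimal sketch of invoking the flow directly (assumes a reachable source and the default endpoints):

```python
import asyncio

from ingest_pipeline.flows.ingestion import create_ingestion_flow


async def main() -> None:
    result = await create_ingestion_flow(
        source_url="https://example.com/docs",
        source_type="web",
        storage_backend="weaviate",
        collection_name="docs_demo",
    )
    print(result.status, result.documents_processed, result.documents_failed)


if __name__ == "__main__":
    asyncio.run(main())
```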

View File

@@ -0,0 +1,89 @@
"""Scheduler for Prefect deployments."""
from datetime import timedelta
from typing import Literal, Protocol, cast
from prefect import serve
from prefect.deployments.runner import RunnerDeployment
from prefect.schedules import Cron, Interval
from .ingestion import create_ingestion_flow
class FlowWithDeployment(Protocol):
"""Protocol for flows that have deployment methods."""
def to_deployment(
self,
name: str,
**kwargs: object,
) -> RunnerDeployment:
"""Create a deployment from this flow."""
...
def create_scheduled_deployment(
name: str,
source_url: str,
source_type: Literal["web", "repository", "documentation"],
storage_backend: Literal["weaviate", "open_webui"] = "weaviate",
schedule_type: Literal["cron", "interval"] = "interval",
cron_expression: str | None = None,
interval_minutes: int = 60,
tags: list[str] | None = None,
) -> RunnerDeployment:
"""
Create a scheduled deployment for ingestion.
Args:
name: Deployment name
source_url: Source to ingest from
source_type: Type of source
storage_backend: Storage backend
schedule_type: Type of schedule
cron_expression: Cron expression if using cron
interval_minutes: Interval in minutes if using interval
tags: Optional tags for deployment
Returns:
Deployment configuration
"""
# Create schedule
if schedule_type == "cron" and cron_expression:
schedule = Cron(cron_expression, timezone="UTC")
else:
schedule = Interval(timedelta(minutes=interval_minutes), timezone="UTC")
# Default tags
if tags is None:
tags = [source_type, storage_backend]
# Create deployment
# The flow decorator adds the to_deployment method at runtime
to_deployment = create_ingestion_flow.to_deployment
deployment = to_deployment(
name=name,
schedule=schedule,
parameters={
"source_url": source_url,
"source_type": source_type,
"storage_backend": storage_backend,
"validate_first": True,
},
tags=tags,
description=f"Scheduled ingestion from {source_url}",
)
    return cast("RunnerDeployment", deployment)
def serve_deployments(deployments: list[RunnerDeployment]) -> None:
"""
Serve multiple deployments.
Args:
deployments: List of deployment configurations
"""
serve(*deployments, limit=10)
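
For example, a nightly web ingestion might be registered and served like this (sketch; the names and cron string are placeholders):

```python
from ingest_pipeline.flows.scheduler import (
    create_scheduled_deployment,
    serve_deployments,
)

nightly = create_scheduled_deployment(
    name="docs-nightly",
    source_url="https://example.com/docs",
    source_type="web",
    storage_backend="weaviate",
    schedule_type="cron",
    cron_expression="0 2 * * *",
)
serve_deployments([nightly])  # blocks, serving up to 10 concurrent runs
```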

View File

@@ -0,0 +1,11 @@
"""Ingestors module for different data sources."""
from .base import BaseIngestor
from .firecrawl import FirecrawlIngestor
from .repomix import RepomixIngestor
__all__ = [
"BaseIngestor",
"FirecrawlIngestor",
"RepomixIngestor",
]

View File

@@ -0,0 +1,50 @@
"""Base ingestor interface."""
from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator
from ..core.models import Document, IngestionJob
class BaseIngestor(ABC):
"""Abstract base class for all ingestors."""
@abstractmethod
async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
"""
Ingest data from a source.
Args:
job: The ingestion job configuration
Yields:
Documents from the source
"""
return # type: ignore # pragma: no cover
yield # pragma: no cover
@abstractmethod
async def validate_source(self, source_url: str) -> bool:
"""
Validate if the source is accessible.
Args:
source_url: URL or path to the source
Returns:
True if source is valid and accessible
"""
pass # pragma: no cover
@abstractmethod
async def estimate_size(self, source_url: str) -> int:
"""
Estimate the number of documents in the source.
Args:
source_url: URL or path to the source
Returns:
Estimated number of documents
"""
pass # pragma: no cover
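
Concretely, a new source type only needs these three methods; a toy ingestor that yields a single fixed document might look like this (illustrative only):

```python
from collections.abc import AsyncGenerator
from datetime import UTC, datetime
from uuid import uuid4

from ingest_pipeline.core.models import Document, IngestionJob, IngestionSource
from ingest_pipeline.ingestors.base import BaseIngestor


class StaticIngestor(BaseIngestor):
    """Toy ingestor that yields one hardcoded document."""

    async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
        content = "hello from a static source"
        yield Document(
            id=uuid4(),
            content=content,
            metadata={
                "source_url": str(job.source_url),
                "title": "static",
                "description": None,
                "timestamp": datetime.now(UTC),
                "content_type": "text/plain",
                "word_count": len(content.split()),
                "char_count": len(content),
            },
            source=IngestionSource.WEB,
        )

    async def validate_source(self, source_url: str) -> bool:
        return True

    async def estimate_size(self, source_url: str) -> int:
        return 1
```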

View File

@@ -0,0 +1,229 @@
"""Firecrawl ingestor for web and documentation sites."""
import asyncio
from collections.abc import AsyncGenerator
from datetime import UTC, datetime
from uuid import uuid4
from firecrawl import AsyncFirecrawl
from typing_extensions import override
from ..config import get_settings
from ..core.models import (
Document,
DocumentMetadata,
FirecrawlConfig,
IngestionJob,
IngestionSource,
)
from .base import BaseIngestor
class FirecrawlIngestor(BaseIngestor):
"""Ingestor for web and documentation sites using Firecrawl."""
config: FirecrawlConfig
    client: AsyncFirecrawl  # SDK client instance
def __init__(self, config: FirecrawlConfig | None = None):
"""
Initialize Firecrawl ingestor.
Args:
config: Firecrawl configuration (for operational params only)
"""
self.config = config or FirecrawlConfig()
settings = get_settings()
# All connection details come from settings/.env
# For self-hosted instances, use a dummy API key if none is provided
# The SDK requires an API key even for self-hosted instances
api_key = settings.firecrawl_api_key or "no-key-required"
# AsyncFirecrawl automatically uses v2 endpoints
self.client = AsyncFirecrawl(api_key=api_key, api_url=str(settings.firecrawl_endpoint))
@override
async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
"""
Ingest documents from a web source.
Args:
job: The ingestion job configuration
Yields:
Documents from the web source
"""
url = str(job.source_url)
# First, map the site to understand its structure
site_map = await self._map_site(url)
# If map returns empty, just use the main URL
if not site_map:
site_map = [url]
# Process pages in batches
batch_size = 10
for i in range(0, len(site_map), batch_size):
batch_urls = site_map[i : i + batch_size]
documents = await self._scrape_batch(batch_urls)
for doc_data in documents:
yield self._create_document(doc_data, job)
@override
async def validate_source(self, source_url: str) -> bool:
"""
Validate if the web source is accessible.
Args:
source_url: URL to validate
Returns:
True if source is accessible
"""
try:
# Use SDK v2 endpoints for both self-hosted and cloud
result = await self.client.scrape(source_url, formats=["markdown"])
return result is not None and hasattr(result, "markdown")
except Exception:
return False
@override
async def estimate_size(self, source_url: str) -> int:
"""
Estimate the number of pages in the website.
Args:
source_url: URL of the website
Returns:
Estimated number of pages
"""
try:
site_map = await self._map_site(source_url)
return len(site_map) if site_map else 0
except Exception:
return 0
async def _map_site(self, url: str) -> list[str]:
"""
Map a website to get all URLs.
Args:
url: Base URL to map
Returns:
List of URLs found
"""
try:
# Use SDK v2 map endpoint
result = await self.client.map(url=url, limit=self.config.limit)
if result and hasattr(result, "links"):
# Extract URLs from the result
return [
link if isinstance(link, str) else getattr(link, "url", str(link))
for link in result.links
]
return []
except Exception as e:
# If map fails (might not be available in all versions), fall back to single URL
import logging
logging.warning(f"Map endpoint not available or failed: {e}. Using single URL.")
return [url]
async def _scrape_batch(self, urls: list[str]) -> list[dict[str, str]]:
"""
Scrape a batch of URLs.
Args:
urls: List of URLs to scrape
Returns:
List of scraped documents
"""
tasks = []
for url in urls:
task = self._scrape_single(url)
tasks.append(task)
results = await asyncio.gather(*tasks, return_exceptions=True)
documents = []
for result in results:
if isinstance(result, Exception):
continue
if result and isinstance(result, dict) and "markdown" in result:
documents.append(result)
return documents
async def _scrape_single(self, url: str) -> dict[str, str]:
"""
Scrape a single URL.
Args:
url: URL to scrape
Returns:
Scraped document data
"""
try:
# Use SDK v2 scrape endpoint
result = await self.client.scrape(url, formats=self.config.formats)
# Extract data from the result
if result:
# The SDK returns a ScrapeResult object with markdown and metadata
metadata = getattr(result, "metadata", {})
return {
"markdown": getattr(result, "markdown", ""),
"sourceURL": url,
"title": metadata.get("title", "")
if isinstance(metadata, dict)
else getattr(metadata, "title", ""),
"description": metadata.get("description", "")
if isinstance(metadata, dict)
else getattr(metadata, "description", ""),
}
return {}
except Exception as e:
import logging
logging.debug(f"Failed to scrape {url}: {e}")
return {}
def _create_document(self, doc_data: dict[str, str], job: IngestionJob) -> Document:
"""
Create a Document from scraped data.
Args:
doc_data: Scraped document data
job: The ingestion job
Returns:
Document instance
"""
content = doc_data.get("markdown", "")
metadata: DocumentMetadata = {
"source_url": doc_data.get("sourceURL", str(job.source_url)),
"title": doc_data.get("title"),
"description": doc_data.get("description"),
"timestamp": datetime.now(UTC),
"content_type": "text/markdown",
"word_count": len(content.split()),
"char_count": len(content),
}
return Document(
id=uuid4(),
content=content,
metadata=metadata,
source=IngestionSource.WEB,
collection=job.storage_backend.value,
)
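
A usage sketch, assuming a Firecrawl instance is reachable at the configured endpoint and the URL is a placeholder:

```python
import asyncio

from ingest_pipeline.core.models import IngestionJob, IngestionSource, StorageBackend
from ingest_pipeline.ingestors import FirecrawlIngestor


async def main() -> None:
    ingestor = FirecrawlIngestor()
    url = "https://example.com/docs"
    if not await ingestor.validate_source(url):
        raise SystemExit("source unreachable")
    print("estimated pages:", await ingestor.estimate_size(url))
    job = IngestionJob(
        source_url=url,
        source_type=IngestionSource.WEB,
        storage_backend=StorageBackend.WEAVIATE,
    )
    async for doc in ingestor.ingest(job):
        print(doc.metadata["title"], doc.metadata["word_count"])


if __name__ == "__main__":
    asyncio.run(main())
```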

View File

@@ -0,0 +1,339 @@
"""Repomix ingestor for Git repositories."""
import asyncio
import subprocess
import tempfile
from collections.abc import AsyncGenerator
from datetime import UTC, datetime
from pathlib import Path
from uuid import uuid4
from typing_extensions import override
from ..core.exceptions import IngestionError, SourceNotFoundError
from ..core.models import (
Document,
DocumentMetadata,
IngestionJob,
IngestionSource,
RepomixConfig,
)
from .base import BaseIngestor
class RepomixIngestor(BaseIngestor):
"""Ingestor for Git repositories using Repomix."""
config: RepomixConfig
def __init__(self, config: RepomixConfig | None = None):
"""
Initialize Repomix ingestor.
Args:
config: Repomix configuration
"""
self.config = config or RepomixConfig()
@override
async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
"""
Ingest documents from a Git repository.
Args:
job: The ingestion job configuration
Yields:
Documents from the repository
"""
repo_url = str(job.source_url)
with tempfile.TemporaryDirectory() as temp_dir:
# Clone the repository
repo_path = await self._clone_repository(repo_url, temp_dir)
# Run repomix to generate output
output_file = await self._run_repomix(repo_path)
# Parse and yield documents
documents = await self._parse_repomix_output(output_file, job)
for doc in documents:
yield doc
@override
async def validate_source(self, source_url: str) -> bool:
"""
Validate if the Git repository is accessible.
Args:
source_url: Git repository URL
Returns:
True if repository is accessible
"""
try:
# Test if we can list remote refs
result = await self._run_command(
["git", "ls-remote", "--heads", source_url], timeout=10
)
return result.returncode == 0
except Exception:
return False
@override
async def estimate_size(self, source_url: str) -> int:
"""
Estimate the number of files in the repository.
Args:
source_url: Git repository URL
Returns:
Estimated number of files
"""
try:
with tempfile.TemporaryDirectory() as temp_dir:
# Shallow clone to get file count
repo_path = await self._clone_repository(source_url, temp_dir, shallow=True)
# Count files matching patterns
file_count = 0
for pattern in self.config.include_patterns:
files = list(Path(repo_path).rglob(pattern))
file_count += len(files)
return file_count
except Exception:
return 0
async def _clone_repository(
self, repo_url: str, target_dir: str, shallow: bool = False
) -> Path:
"""
Clone a Git repository.
Args:
repo_url: Repository URL
target_dir: Directory to clone into
shallow: Whether to do a shallow clone
Returns:
Path to cloned repository
"""
repo_name = repo_url.split("/")[-1].replace(".git", "")
repo_path = Path(target_dir) / repo_name
cmd = ["git", "clone"]
if shallow:
cmd.extend(["--depth", "1"])
cmd.extend([repo_url, str(repo_path)])
result = await self._run_command(cmd, timeout=300)
if result.returncode != 0:
raise SourceNotFoundError(f"Failed to clone repository: {repo_url}")
return repo_path
async def _run_repomix(self, repo_path: Path) -> Path:
"""
Run repomix on a repository.
Args:
repo_path: Path to the repository
Returns:
Path to repomix output file
"""
output_file = repo_path / "repomix-output.md"
# Build repomix command
cmd = ["npx", "repomix", "--output", str(output_file)]
# Add include patterns
if self.config.include_patterns:
for pattern in self.config.include_patterns:
cmd.extend(["--include", pattern])
# Add exclude patterns
if self.config.exclude_patterns:
for pattern in self.config.exclude_patterns:
cmd.extend(["--exclude", pattern])
if self.config.respect_gitignore:
cmd.append("--respect-gitignore")
result = await self._run_command(cmd, cwd=str(repo_path), timeout=120)
if result.returncode != 0:
stderr_text = (
result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
)
raise IngestionError(f"Repomix failed: {stderr_text}")
return output_file
async def _parse_repomix_output(self, output_file: Path, job: IngestionJob) -> list[Document]:
"""
Parse repomix output into documents.
Args:
output_file: Path to repomix output
job: The ingestion job
Returns:
List of documents
"""
documents = []
try:
content = output_file.read_text()
# Split by file markers (repomix uses specific delimiters)
file_sections = self._split_by_files(content)
for file_path, file_content in file_sections.items():
if len(file_content) > self.config.max_file_size:
# Split large files into chunks
chunks = self._chunk_content(file_content)
for i, chunk in enumerate(chunks):
doc = self._create_document(file_path, chunk, job, chunk_index=i)
documents.append(doc)
else:
doc = self._create_document(file_path, file_content, job)
documents.append(doc)
except Exception as e:
raise IngestionError(f"Failed to parse repomix output: {e}") from e
return documents
def _split_by_files(self, content: str) -> dict[str, str]:
"""
Split repomix output by files.
Args:
content: Repomix output content
Returns:
Dictionary of file paths to content
"""
files: dict[str, str] = {}
current_file: str | None = None
current_content: list[str] = []
for line in content.split("\n"):
# Look for file markers (adjust based on actual repomix format)
if line.startswith("## File:") or line.startswith("### "):
if current_file:
files[current_file] = "\n".join(current_content)
current_file = line.replace("## File:", "").replace("### ", "").strip()
current_content = []
else:
current_content.append(line)
# Add last file
if current_file:
files[current_file] = "\n".join(current_content)
# If no file markers found, treat as single document
if not files:
files["repository"] = content
return files
def _chunk_content(self, content: str, chunk_size: int = 500000) -> list[str]:
"""
Split content into chunks.
Args:
content: Content to chunk
chunk_size: Maximum size per chunk
Returns:
List of content chunks
"""
chunks: list[str] = []
lines = content.split("\n")
current_chunk: list[str] = []
current_size = 0
for line in lines:
line_size = len(line) + 1 # +1 for newline
if current_size + line_size > chunk_size and current_chunk:
chunks.append("\n".join(current_chunk))
current_chunk = []
current_size = 0
current_chunk.append(line)
current_size += line_size
if current_chunk:
chunks.append("\n".join(current_chunk))
return chunks
def _create_document(
self, file_path: str, content: str, job: IngestionJob, chunk_index: int = 0
) -> Document:
"""
Create a Document from repository content.
Args:
file_path: Path to the file in repository
content: File content
job: The ingestion job
chunk_index: Index if content is chunked
Returns:
Document instance
"""
metadata: DocumentMetadata = {
"source_url": str(job.source_url),
"title": f"{file_path}" + (f" (chunk {chunk_index})" if chunk_index > 0 else ""),
"description": f"Repository file: {file_path}",
"timestamp": datetime.now(UTC),
"content_type": "text/plain",
"word_count": len(content.split()),
"char_count": len(content),
}
return Document(
id=uuid4(),
content=content,
metadata=metadata,
source=IngestionSource.REPOSITORY,
collection=job.storage_backend.value,
)
async def _run_command(
self, cmd: list[str], cwd: str | None = None, timeout: int = 60
) -> subprocess.CompletedProcess[bytes]:
"""
Run a shell command asynchronously.
Args:
cmd: Command and arguments
cwd: Working directory
timeout: Command timeout in seconds
Returns:
Completed process result
"""
proc = await asyncio.create_subprocess_exec(
*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, cwd=cwd
)
try:
stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
return subprocess.CompletedProcess(
cmd,
proc.returncode or 0,
stdout,
stderr,
)
except TimeoutError as e:
proc.kill()
raise IngestionError(f"Command timed out: {' '.join(cmd)}") from e
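
The repository path works the same way; this sketch assumes `git` and `npx repomix` are on PATH and uses a placeholder repository URL:

```python
import asyncio

from ingest_pipeline.core.models import (
    IngestionJob,
    IngestionSource,
    RepomixConfig,
    StorageBackend,
)
from ingest_pipeline.ingestors import RepomixIngestor


async def main() -> None:
    ingestor = RepomixIngestor(RepomixConfig(include_patterns=["*.md"]))
    job = IngestionJob(
        source_url="https://github.com/example/repo.git",
        source_type=IngestionSource.REPOSITORY,
        storage_backend=StorageBackend.WEAVIATE,
    )
    async for doc in ingestor.ingest(job):
        print(doc.metadata["title"])


if __name__ == "__main__":
    asyncio.run(main())
```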

View File

@@ -0,0 +1,11 @@
"""Storage adapters for different backends."""
from .base import BaseStorage
from .openwebui import OpenWebUIStorage
from .weaviate import WeaviateStorage
__all__ = [
"BaseStorage",
"WeaviateStorage",
"OpenWebUIStorage",
]

View File

@@ -0,0 +1,106 @@
"""Base storage interface."""
from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator
from ..core.models import Document, StorageConfig
class BaseStorage(ABC):
"""Abstract base class for storage adapters."""
config: StorageConfig
def __init__(self, config: StorageConfig):
"""
Initialize storage adapter.
Args:
config: Storage configuration
"""
self.config = config
@abstractmethod
async def initialize(self) -> None:
"""Initialize the storage backend and create collections if needed."""
pass # pragma: no cover
@abstractmethod
async def store(self, document: Document) -> str:
"""
Store a single document.
Args:
document: Document to store
Returns:
Document ID
"""
pass # pragma: no cover
@abstractmethod
async def store_batch(self, documents: list[Document]) -> list[str]:
"""
Store multiple documents in batch.
Args:
documents: List of documents to store
Returns:
List of document IDs
"""
pass # pragma: no cover
@abstractmethod
async def retrieve(self, document_id: str) -> Document | None:
"""
Retrieve a document by ID.
Args:
document_id: Document ID
Returns:
Document or None if not found
"""
pass # pragma: no cover
@abstractmethod
async def search(
self, query: str, limit: int = 10, threshold: float = 0.7
) -> AsyncGenerator[Document, None]:
"""
Search for documents.
Args:
query: Search query
limit: Maximum number of results
threshold: Similarity threshold
Yields:
Matching documents
"""
return # type: ignore # pragma: no cover
yield # pragma: no cover
@abstractmethod
async def delete(self, document_id: str) -> bool:
"""
Delete a document.
Args:
document_id: Document ID
Returns:
True if deleted successfully
"""
pass # pragma: no cover
@abstractmethod
async def count(self) -> int:
"""
Get total document count.
Returns:
Number of documents
"""
pass # pragma: no cover

View File

@@ -0,0 +1,296 @@
"""Open WebUI storage adapter."""
from collections.abc import AsyncGenerator
from uuid import UUID
import httpx
from typing_extensions import override
from ..core.exceptions import StorageError
from ..core.models import Document, StorageConfig
from ..utils.vectorizer import Vectorizer
from .base import BaseStorage
class OpenWebUIStorage(BaseStorage):
"""Storage adapter for Open WebUI knowledge endpoints."""
client: httpx.AsyncClient
vectorizer: Vectorizer
def __init__(self, config: StorageConfig):
"""
Initialize Open WebUI storage.
Args:
config: Storage configuration
"""
super().__init__(config)
        headers = {"Content-Type": "application/json"}
        if config.api_key:
            headers["Authorization"] = f"Bearer {config.api_key}"
        self.client = httpx.AsyncClient(
            base_url=str(config.endpoint),
            headers=headers,
            timeout=30.0,
        )
self.vectorizer = Vectorizer(config)
@override
async def initialize(self) -> None:
"""Initialize Open WebUI connection."""
try:
# Test connection with OpenWebUI knowledge API
response = await self.client.get("/api/v1/knowledge/")
response.raise_for_status()
# Check if collection (knowledge base) exists, create if not
knowledge_bases = response.json()
collection_exists = any(
kb.get("name") == self.config.collection_name for kb in knowledge_bases
)
if not collection_exists:
await self._create_collection()
except Exception as e:
raise StorageError(f"Failed to initialize Open WebUI: {e}") from e
async def _create_collection(self) -> None:
"""Create knowledge base in Open WebUI."""
try:
response = await self.client.post(
"/api/v1/knowledge/create",
json={
"name": self.config.collection_name,
"description": "Documents ingested from various sources"
},
)
response.raise_for_status()
except Exception as e:
raise StorageError(f"Failed to create knowledge base: {e}") from e
@override
async def store(self, document: Document) -> str:
"""
Store a document in Open WebUI.
Args:
document: Document to store
Returns:
Document ID
"""
try:
# Vectorize if needed
if document.vector is None:
document.vector = await self.vectorizer.vectorize(document.content)
# Prepare document data
doc_data = {
"id": str(document.id),
"collection": self.config.collection_name,
"content": document.content,
"metadata": {
**document.metadata,
"timestamp": document.metadata["timestamp"].isoformat(),
"source": document.source.value,
},
"embedding": document.vector,
}
# Store document
response = await self.client.post(
f"/api/knowledge/collections/{self.config.collection_name}/documents", json=doc_data
)
response.raise_for_status()
result = response.json()
document_id = result.get("id") if isinstance(result, dict) else None
return str(document_id) if document_id else str(document.id)
except Exception as e:
raise StorageError(f"Failed to store document: {e}") from e
@override
async def store_batch(self, documents: list[Document]) -> list[str]:
"""
Store multiple documents in batch.
Args:
documents: List of documents
Returns:
List of document IDs
"""
try:
# Vectorize documents without vectors
for doc in documents:
if doc.vector is None:
doc.vector = await self.vectorizer.vectorize(doc.content)
# Prepare batch data
batch_data = []
for doc in documents:
batch_data.append(
{
"id": str(doc.id),
"content": doc.content,
"metadata": {
**doc.metadata,
"timestamp": doc.metadata["timestamp"].isoformat(),
"source": doc.source.value,
},
"embedding": doc.vector,
}
)
# Store batch
response = await self.client.post(
f"/api/knowledge/collections/{self.config.collection_name}/documents/batch",
json={"documents": batch_data},
)
response.raise_for_status()
result = response.json()
ids = result.get("ids") if isinstance(result, dict) else None
return ids if isinstance(ids, list) else [str(doc.id) for doc in documents]
except Exception as e:
raise StorageError(f"Failed to store batch: {e}") from e
@override
async def retrieve(self, document_id: str) -> Document | None:
"""
Retrieve a document from Open WebUI.
Args:
document_id: Document ID
Returns:
Document or None
"""
try:
response = await self.client.get(
f"/api/knowledge/collections/{self.config.collection_name}/documents/{document_id}"
)
if response.status_code == 404:
return None
response.raise_for_status()
data = response.json()
# Reconstruct document
metadata = data.get("metadata", {})
return Document(
id=UUID(document_id),
content=data["content"],
metadata=metadata,
vector=data.get("embedding"),
source=metadata.get("source", "unknown"),
collection=self.config.collection_name,
)
except Exception:
return None
@override
async def search(
self, query: str, limit: int = 10, threshold: float = 0.7
) -> AsyncGenerator[Document, None]:
"""
Search for documents in Open WebUI.
Args:
query: Search query
limit: Maximum results
threshold: Similarity threshold
Yields:
Matching documents
"""
try:
# Vectorize query
query_vector = await self.vectorizer.vectorize(query)
# Perform search
response = await self.client.post(
f"/api/knowledge/collections/{self.config.collection_name}/search",
json={
"query": query,
"embedding": query_vector,
"limit": limit,
"threshold": threshold,
},
)
response.raise_for_status()
results = response.json()
for result in results.get("documents", []):
metadata = result.get("metadata", {})
doc = Document(
id=result["id"],
content=result["content"],
metadata=metadata,
vector=result.get("embedding"),
source=metadata.get("source", "unknown"),
collection=self.config.collection_name,
)
yield doc
except Exception as e:
raise StorageError(f"Search failed: {e}") from e
    @override
    async def delete(self, document_id: str) -> bool:
"""
Delete a document from Open WebUI.
Args:
document_id: Document ID
Returns:
True if deleted
"""
try:
response = await self.client.delete(
f"/api/knowledge/collections/{self.config.collection_name}/documents/{document_id}"
)
return response.status_code in [200, 204]
except Exception:
return False
    @override
    async def count(self) -> int:
"""
Get document count in collection.
Returns:
Number of documents
"""
try:
response = await self.client.get(
f"/api/knowledge/collections/{self.config.collection_name}/stats"
)
response.raise_for_status()
stats = response.json()
count = stats.get("document_count") if isinstance(stats, dict) else None
return int(count) if isinstance(count, (int, str)) else 0
except Exception:
return 0
async def __aenter__(self) -> "OpenWebUIStorage":
"""Async context manager entry."""
await self.initialize()
return self
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: object | None,
) -> None:
"""Async context manager exit."""
await self.client.aclose()
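
Because the adapter implements the async context manager protocol, a search sketch looks like this (endpoint and key are placeholders):

```python
import asyncio

from pydantic import HttpUrl

from ingest_pipeline.core.models import StorageBackend, StorageConfig
from ingest_pipeline.storage import OpenWebUIStorage


async def main() -> None:
    config = StorageConfig(
        backend=StorageBackend.OPEN_WEBUI,
        endpoint=HttpUrl("http://chat.lab"),
        api_key="sk-...",  # placeholder
        collection_name="docs",
    )
    async with OpenWebUIStorage(config) as storage:
        print("documents stored:", await storage.count())
        async for doc in storage.search("ingestion pipeline", limit=3):
            print(doc.metadata.get("title"))


if __name__ == "__main__":
    asyncio.run(main())
```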

View File

@@ -0,0 +1,703 @@
"""Weaviate storage adapter."""
from collections.abc import AsyncGenerator
from datetime import UTC, datetime
from typing import cast
from urllib.parse import urlparse
from uuid import UUID
import weaviate
from typing_extensions import override
from weaviate.classes.config import Configure, DataType, Property
from ..core.exceptions import StorageError
from ..core.models import Document, DocumentMetadata, IngestionSource, StorageConfig
from ..utils.vectorizer import Vectorizer
from .base import BaseStorage
class WeaviateStorage(BaseStorage):
"""Storage adapter for Weaviate."""
client: weaviate.WeaviateClient | None
vectorizer: Vectorizer
collection_name: str
def __init__(self, config: StorageConfig):
"""
Initialize Weaviate storage.
Args:
config: Storage configuration
"""
super().__init__(config)
self.client = None
self.vectorizer = Vectorizer(config)
self.collection_name = config.collection_name.capitalize()
@override
async def initialize(self) -> None:
"""Initialize Weaviate client and create collection if needed."""
try:
            # Connect to Weaviate
            # Parse endpoint - Weaviate expects host and port separately;
            # urlparse copes with trailing slashes, paths, and explicit ports
            parsed = urlparse(str(self.config.endpoint))
            host = parsed.hostname or "localhost"
            # For reverse proxy setups with no explicit port, default to 80
            http_port = parsed.port or 80
# For reverse proxy setups, use HTTP-only connection
self.client = weaviate.WeaviateClient(
connection_params=weaviate.connect.ConnectionParams.from_url(
url=f"http://{host}:{http_port}",
grpc_port=50051, # Default gRPC port but will be ignored
),
skip_init_checks=True, # Skip gRPC health checks
additional_config=weaviate.classes.init.AdditionalConfig(
timeout=weaviate.classes.init.Timeout(init=30, query=60, insert=120),
)
)
# Connect to the client
self.client.connect()
# Check if collection exists
collections = self.client.collections.list_all()
if self.collection_name not in collections:
await self._create_collection()
except Exception as e:
raise StorageError(f"Failed to initialize Weaviate: {e}") from e
async def _create_collection(self) -> None:
"""Create Weaviate collection with schema."""
if not self.client:
raise StorageError("Weaviate client not initialized")
try:
self.client.collections.create(
name=self.collection_name,
properties=[
Property(
name="content", data_type=DataType.TEXT, description="Document content"
),
Property(name="source_url", data_type=DataType.TEXT, description="Source URL"),
Property(name="title", data_type=DataType.TEXT, description="Document title"),
Property(
name="description",
data_type=DataType.TEXT,
description="Document description",
),
Property(
name="timestamp", data_type=DataType.DATE, description="Ingestion timestamp"
),
Property(
name="content_type", data_type=DataType.TEXT, description="Content type"
),
Property(name="word_count", data_type=DataType.INT, description="Word count"),
Property(
name="char_count", data_type=DataType.INT, description="Character count"
),
Property(
name="source", data_type=DataType.TEXT, description="Ingestion source"
),
],
vectorizer_config=Configure.Vectorizer.none(),
)
except Exception as e:
raise StorageError(f"Failed to create collection: {e}") from e
@override
async def store(self, document: Document) -> str:
"""
Store a document in Weaviate.
Args:
document: Document to store
Returns:
Document ID
"""
try:
# Vectorize content if no vector provided
if document.vector is None:
document.vector = await self.vectorizer.vectorize(document.content)
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
# Prepare properties
properties = {
"content": document.content,
"source_url": document.metadata["source_url"],
"title": document.metadata.get("title", ""),
"description": document.metadata.get("description", ""),
"timestamp": document.metadata["timestamp"].isoformat(),
"content_type": document.metadata["content_type"],
"word_count": document.metadata["word_count"],
"char_count": document.metadata["char_count"],
"source": document.source.value,
}
# Insert with vector
result = collection.data.insert(
properties=properties, vector=document.vector, uuid=str(document.id)
)
return str(result)
except Exception as e:
raise StorageError(f"Failed to store document: {e}") from e
@override
async def store_batch(self, documents: list[Document]) -> list[str]:
"""
Store multiple documents in batch.
Args:
documents: List of documents
Returns:
List of successfully stored document IDs
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
# Vectorize documents without vectors
for doc in documents:
if doc.vector is None:
doc.vector = await self.vectorizer.vectorize(doc.content)
# Try individual inserts to avoid gRPC batch issues
successful_ids: list[str] = []
for doc in documents:
try:
properties = {
"content": doc.content,
"source_url": doc.metadata["source_url"],
"title": doc.metadata.get("title", ""),
"description": doc.metadata.get("description", ""),
"timestamp": doc.metadata["timestamp"].isoformat(),
"content_type": doc.metadata["content_type"],
"word_count": doc.metadata["word_count"],
"char_count": doc.metadata["char_count"],
"source": doc.source.value,
}
# Insert individual document
collection.data.insert(
properties=properties,
vector=doc.vector,
uuid=str(doc.id)
)
successful_ids.append(str(doc.id))
except Exception as e:
print(f"Failed to store document {doc.id}: {e}")
continue
if not successful_ids:
raise StorageError("All documents in batch failed to store")
return successful_ids
except Exception as e:
raise StorageError(f"Failed to store batch: {e}") from e
@override
async def retrieve(self, document_id: str) -> Document | None:
"""
Retrieve a document from Weaviate.
Args:
document_id: Document ID
Returns:
Document or None
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
result = collection.query.fetch_object_by_id(document_id)
if not result:
return None
# Reconstruct document
props = result.properties
metadata_dict = {
"source_url": str(props["source_url"]),
"title": str(props.get("title")) if props.get("title") else None,
"description": str(props.get("description")) if props.get("description") else None,
"timestamp": str(props["timestamp"]),
"content_type": str(props["content_type"]),
"word_count": int(str(props["word_count"])),
"char_count": int(str(props["char_count"])),
}
metadata = cast(DocumentMetadata, cast(object, metadata_dict))
vector_raw = result.vector.get("default") if result.vector else None
vector: list[float] | None = None
if isinstance(vector_raw, list) and vector_raw:
first_elem = vector_raw[0]
if isinstance(first_elem, list):
# Nested list - take first one and ensure all elements are numbers
nested_vector = first_elem
try:
vector = [float(x) for x in nested_vector if isinstance(x, (int, float))]
except (ValueError, TypeError):
vector = None
else:
# Flat list - ensure all elements are numbers
try:
vector = [float(x) for x in vector_raw if isinstance(x, (int, float))]
except (ValueError, TypeError):
vector = None
return Document(
id=UUID(document_id),
content=str(props["content"]),
metadata=metadata,
vector=vector,
                source=IngestionSource(str(props.get("source", "web"))),
collection=self.collection_name,
)
except Exception:
return None
@override
async def search(
self, query: str, limit: int = 10, threshold: float = 0.7
) -> AsyncGenerator[Document, None]:
"""
Search for documents in Weaviate.
Args:
query: Search query
limit: Maximum results
threshold: Similarity threshold
Yields:
Matching documents
"""
try:
# Vectorize query
query_vector = await self.vectorizer.vectorize(query)
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
# Perform vector search
results = collection.query.near_vector(
near_vector=query_vector,
limit=limit,
distance=1 - threshold, # Convert similarity to distance
return_metadata=["distance"],
)
for result in results.objects:
props = result.properties
metadata_dict = {
"source_url": str(props["source_url"]),
"title": str(props.get("title")) if props.get("title") else None,
"description": str(props.get("description"))
if props.get("description")
else None,
"timestamp": str(props["timestamp"]),
"content_type": str(props["content_type"]),
"word_count": int(str(props["word_count"])),
"char_count": int(str(props["char_count"])),
}
metadata = cast(DocumentMetadata, cast(object, metadata_dict))
vector_raw = result.vector.get("default") if result.vector else None
vector: list[float] | None = None
if isinstance(vector_raw, list) and vector_raw:
first_elem = vector_raw[0]
if isinstance(first_elem, list):
# Nested list - take first one and ensure all elements are numbers
nested_vector = first_elem
try:
vector = [
float(x) for x in nested_vector if isinstance(x, (int, float))
]
except (ValueError, TypeError):
vector = None
else:
# Flat list - ensure all elements are numbers
try:
vector = [float(x) for x in vector_raw if isinstance(x, (int, float))]
except (ValueError, TypeError):
vector = None
doc = Document(
id=result.uuid,
content=str(props["content"]),
metadata=metadata,
vector=vector,
                    source=IngestionSource(str(props.get("source", "web"))),
collection=self.collection_name,
)
yield doc
except Exception as e:
raise StorageError(f"Search failed: {e}") from e
@override
async def delete(self, document_id: str) -> bool:
"""
Delete a document from Weaviate.
Args:
document_id: Document ID
Returns:
True if deleted
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
collection.data.delete_by_id(document_id)
return True
except Exception:
return False
@override
async def count(self) -> int:
"""
Get document count in collection.
Returns:
Number of documents
"""
try:
if not self.client:
return 0
collection = self.client.collections.get(self.collection_name)
result = collection.aggregate.over_all(total_count=True)
return result.total_count or 0
except Exception:
return 0
async def list_collections(self) -> list[str]:
"""
List all available collections.
Returns:
List of collection names
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
return list(self.client.collections.list_all())
except Exception as e:
raise StorageError(f"Failed to list collections: {e}") from e
async def sample_documents(self, limit: int = 5) -> list[Document]:
"""
Get sample documents from the collection.
Args:
limit: Maximum number of documents to return
Returns:
List of sample documents
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
# Query for sample documents
response = collection.query.fetch_objects(limit=limit)
documents = []
for obj in response.objects:
# Convert back to Document format
props = obj.properties
# Safely convert WeaviateField values
word_count_val = props.get("word_count")
if isinstance(word_count_val, (int, float)):
word_count = int(word_count_val)
elif word_count_val:
word_count = int(str(word_count_val))
else:
word_count = 0
char_count_val = props.get("char_count")
if isinstance(char_count_val, (int, float)):
char_count = int(char_count_val)
elif char_count_val:
char_count = int(str(char_count_val))
else:
char_count = 0
doc = Document(
id=obj.uuid,
content=str(props.get("content", "")),
source=IngestionSource(str(props.get("source", "web"))),
metadata={
"source_url": str(props.get("source_url", "")),
"title": str(props.get("title", "")) if props.get("title") else None,
"description": str(props.get("description", "")) if props.get("description") else None,
"timestamp": datetime.fromisoformat(str(props.get("timestamp", datetime.now(UTC).isoformat()))),
"content_type": str(props.get("content_type", "text/plain")),
"word_count": word_count,
"char_count": char_count,
}
)
documents.append(doc)
return documents
except Exception as e:
raise StorageError(f"Failed to sample documents: {e}") from e
async def search_documents(self, query: str, limit: int = 10) -> list[Document]:
"""
Search documents in the collection.
Args:
query: Search query
limit: Maximum number of results
Returns:
List of matching documents
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
# Try hybrid search first, fall back to BM25 keyword search
try:
response = collection.query.hybrid(
query=query,
limit=limit,
return_metadata=["score"]
)
except Exception:
# Fall back to BM25 keyword search if hybrid search fails
response = collection.query.bm25(
query=query,
limit=limit,
return_metadata=["score"]
)
documents = []
for obj in response.objects:
# Convert back to Document format
props = obj.properties
# Safely convert WeaviateField values
word_count_val = props.get("word_count")
if isinstance(word_count_val, (int, float)):
word_count = int(word_count_val)
elif word_count_val:
word_count = int(str(word_count_val))
else:
word_count = 0
char_count_val = props.get("char_count")
if isinstance(char_count_val, (int, float)):
char_count = int(char_count_val)
elif char_count_val:
char_count = int(str(char_count_val))
else:
char_count = 0
# Build metadata - note that search_score is not part of DocumentMetadata
metadata: DocumentMetadata = {
"source_url": str(props.get("source_url", "")),
"title": str(props.get("title", "")) if props.get("title") else None,
"description": str(props.get("description", "")) if props.get("description") else None,
"timestamp": datetime.fromisoformat(str(props.get("timestamp", datetime.now(UTC).isoformat()))),
"content_type": str(props.get("content_type", "text/plain")),
"word_count": word_count,
"char_count": char_count,
}
doc = Document(
id=obj.uuid,
content=str(props.get("content", "")),
source=IngestionSource(str(props.get("source", "web"))),
metadata=metadata
)
documents.append(doc)
return documents
except Exception as e:
raise StorageError(f"Failed to search documents: {e}") from e
async def list_documents(self, limit: int = 100, offset: int = 0) -> list[dict[str, str | int]]:
"""
List documents in the collection with pagination.
Args:
limit: Maximum number of documents to return
offset: Number of documents to skip
Returns:
List of document dictionaries with id, title, source_url, and content preview
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
# Query documents with pagination
response = collection.query.fetch_objects(
limit=limit,
offset=offset,
return_metadata=["creation_time"]
)
documents = []
for obj in response.objects:
props = obj.properties
content = str(props.get("content", ""))
word_count_value = props.get("word_count", 0)
# Convert WeaviateField to int
if isinstance(word_count_value, (int, float)):
word_count = int(word_count_value)
elif word_count_value:
word_count = int(str(word_count_value))
else:
word_count = 0
doc_info: dict[str, str | int] = {
"id": str(obj.uuid),
"title": str(props.get("title", "Untitled")),
"source_url": str(props.get("source_url", "")),
"content_preview": content[:200] + "..." if len(content) > 200 else content,
"word_count": word_count,
"timestamp": str(props.get("timestamp", "")),
}
documents.append(doc_info)
return documents
except Exception as e:
raise StorageError(f"Failed to list documents: {e}") from e
async def delete_documents(self, document_ids: list[str]) -> dict[str, bool]:
"""
Delete multiple documents from Weaviate.
Args:
document_ids: List of document IDs to delete
Returns:
Dictionary mapping document IDs to deletion success status
"""
results = {}
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
for doc_id in document_ids:
try:
collection.data.delete_by_id(doc_id)
results[doc_id] = True
except Exception:
results[doc_id] = False
return results
except Exception as e:
raise StorageError(f"Failed to delete documents: {e}") from e
async def delete_by_filter(self, filter_dict: dict[str, str]) -> int:
"""
Delete documents matching a filter.
Args:
filter_dict: Filter criteria (e.g., {"source_url": "example.com"})
Returns:
Number of documents deleted
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
# Build where filter
where_filter = None
if "source_url" in filter_dict:
from weaviate.classes.query import Filter
where_filter = Filter.by_property("source_url").equal(filter_dict["source_url"])
# Get documents matching filter
if where_filter:
response = collection.query.fetch_objects(
filters=where_filter,
limit=1000 # Max batch size
)
else:
response = collection.query.fetch_objects(
limit=1000 # Max batch size
)
# Delete matching documents
deleted_count = 0
for obj in response.objects:
try:
collection.data.delete_by_id(obj.uuid)
deleted_count += 1
except Exception:
continue
return deleted_count
except Exception as e:
raise StorageError(f"Failed to delete by filter: {e}") from e
async def delete_collection(self) -> bool:
"""
Delete the entire collection.
Returns:
True if successful
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
# Delete the collection using the client's collections API
self.client.collections.delete(self.collection_name)
return True
except Exception as e:
raise StorageError(f"Failed to delete collection: {e}") from e
def __del__(self) -> None:
"""Clean up client connection."""
if self.client:
self.client.close()
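
An end-to-end sketch against a local Weaviate; note the collection name is capitalized internally (`docs_demo` becomes `Docs_demo`):

```python
import asyncio

from pydantic import HttpUrl

from ingest_pipeline.core.models import StorageBackend, StorageConfig
from ingest_pipeline.storage import WeaviateStorage


async def main() -> None:
    config = StorageConfig(
        backend=StorageBackend.WEAVIATE,
        endpoint=HttpUrl("http://weaviate.yo"),
        collection_name="docs_demo",
    )
    storage = WeaviateStorage(config)
    await storage.initialize()
    print("collections:", await storage.list_collections())
    print("count:", await storage.count())
    for doc in await storage.search_documents("pipeline", limit=5):
        print(doc.metadata["title"])


if __name__ == "__main__":
    asyncio.run(main())
```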

View File

@@ -0,0 +1,6 @@
"""Utility modules."""
from .metadata_tagger import MetadataTagger
from .vectorizer import Vectorizer
__all__ = ["MetadataTagger", "Vectorizer"]

View File

@@ -0,0 +1,269 @@
"""Metadata tagger for enriching documents with AI-generated tags and metadata."""
import json
from datetime import UTC, datetime
from typing import TypedDict
import httpx
from ..core.exceptions import IngestionError
from ..core.models import Document
class DocumentMetadata(TypedDict, total=False):
"""Structured metadata for documents."""
tags: list[str]
category: str
summary: str
key_topics: list[str]
document_type: str
language: str
technical_level: str
class MetadataTagger:
"""Generates metadata tags for documents using language models."""
endpoint: str
model: str
client: httpx.AsyncClient
def __init__(
self,
llm_endpoint: str = "http://llm.lab",
model: str = "openai/gpt-4o-mini",
):
"""
Initialize metadata tagger.
Args:
llm_endpoint: LLM API endpoint
model: Model to use for tagging
"""
self.endpoint = llm_endpoint
self.model = model
# Get API key from environment
import os
from pathlib import Path
from dotenv import load_dotenv
# Load .env from the project root
env_path = Path(__file__).parent.parent.parent / ".env"
load_dotenv(env_path)
api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") or ""
headers = {"Content-Type": "application/json"}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
self.client = httpx.AsyncClient(timeout=60.0, headers=headers)
async def tag_document(
self, document: Document, custom_instructions: str | None = None
) -> Document:
"""
Analyze document and generate metadata tags.
Args:
document: Document to tag
custom_instructions: Optional custom instructions for tagging
Returns:
Document with enriched metadata
"""
if not document.content:
return document
try:
# Generate metadata using LLM
metadata = await self._generate_metadata(
document.content,
document.metadata.get("title") if document.metadata else None,
custom_instructions
)
# Merge with existing metadata, guarding against a missing metadata dict
# and preserving the required fields
from ..core.models import DocumentMetadata as CoreDocumentMetadata
existing = document.metadata or {}
updated_metadata: CoreDocumentMetadata = {
"source_url": existing.get("source_url", ""),
"title": existing.get("title"),  # the generated metadata carries no title field
"description": metadata.get("summary") or existing.get("description"),
"timestamp": existing.get("timestamp", datetime.now(UTC)),
"content_type": existing.get("content_type", "text/plain"),
"word_count": existing.get("word_count", len(document.content.split())),
"char_count": existing.get("char_count", len(document.content)),
}
# Store additional metadata as extra fields in the document's metadata
# Note: Since DocumentMetadata is a TypedDict, we can only include the defined fields
# Additional metadata like tags, category, etc. would need to be stored separately
# or the DocumentMetadata model would need to be extended
document.metadata = updated_metadata
return document
except Exception as e:
raise IngestionError(f"Failed to tag document: {e}") from e
async def tag_batch(
self,
documents: list[Document],
custom_instructions: str | None = None,
) -> list[Document]:
"""
Tag multiple documents with metadata.
Args:
documents: Documents to tag
custom_instructions: Optional custom instructions
Returns:
Documents with enriched metadata
"""
tagged_docs: list[Document] = []
for doc in documents:
tagged_doc = await self.tag_document(doc, custom_instructions)
tagged_docs.append(tagged_doc)
return tagged_docs
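# tag_batch processes documents sequentially. If throughput matters, a
# bounded-concurrency variant is a small change (sketch, assuming asyncio is
# imported and a cap such as MAX_CONCURRENT_TASKS=5):
#
#     sem = asyncio.Semaphore(5)
#     async def _tag_one(doc: Document) -> Document:
#         async with sem:
#             return await self.tag_document(doc, custom_instructions)
#     tagged_docs = list(await asyncio.gather(*(_tag_one(d) for d in documents)))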
async def _generate_metadata(
self,
content: str,
title: str | None = None,
custom_instructions: str | None = None,
) -> DocumentMetadata:
"""
Generate metadata using LLM.
Args:
content: Document content
title: Document title
custom_instructions: Optional custom instructions
Returns:
Generated metadata dictionary
"""
# Prepare the prompt
system_prompt = """You are a document metadata tagger. Analyze the given content and generate relevant metadata.
Return a JSON object with the following structure:
{
"tags": ["tag1", "tag2", ...], # 3-7 relevant topic tags
"category": "string", # Main category
"summary": "string", # 1-2 sentence summary
"key_topics": ["topic1", "topic2", ...], # Main topics discussed
"document_type": "string", # Type of document (e.g., "technical", "tutorial", "reference")
"language": "string", # Primary language (e.g., "en", "es")
"technical_level": "string" # One of: "beginner", "intermediate", "advanced"
}"""
if custom_instructions:
system_prompt += f"\n\nAdditional instructions: {custom_instructions}"
# Prepare user prompt
user_prompt = "Document to analyze:\n"
if title:
user_prompt += f"Title: {title}\n"
user_prompt += f"Content:\n{content[:3000]}" # Limit content length
# Call LLM
response = await self.client.post(
f"{self.endpoint}/v1/chat/completions",
json={
"model": self.model,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
],
"temperature": 0.3,
"max_tokens": 500,
"response_format": {"type": "json_object"},
},
)
response.raise_for_status()
result = response.json()
if not isinstance(result, dict):
raise IngestionError("Invalid response format from LLM")
# Extract content from response
choices = result.get("choices", [])
if not choices:
raise IngestionError("No response from LLM")
message = choices[0].get("message", {})
content_str = message.get("content", "{}")
try:
metadata = json.loads(content_str)
except json.JSONDecodeError as e:
raise IngestionError(f"Failed to parse LLM response: {e}") from e
# Validate and sanitize metadata
return self._sanitize_metadata(metadata)
def _sanitize_metadata(self, metadata: dict[str, object]) -> DocumentMetadata:
"""
Sanitize and validate metadata.
Args:
metadata: Raw metadata from LLM
Returns:
Sanitized metadata
"""
sanitized: DocumentMetadata = {}
# Tags
if "tags" in metadata and isinstance(metadata["tags"], list):
tags = [str(tag).lower().strip() for tag in metadata["tags"][:10]]
sanitized["tags"] = [tag for tag in tags if tag]
# Category
if "category" in metadata:
sanitized["category"] = str(metadata["category"]).strip()
# Summary
if "summary" in metadata:
summary = str(metadata["summary"]).strip()
if summary:
sanitized["summary"] = summary[:500] # Limit length
# Key topics
if "key_topics" in metadata and isinstance(metadata["key_topics"], list):
topics = [str(topic).strip() for topic in metadata["key_topics"][:10]]
sanitized["key_topics"] = [topic for topic in topics if topic]
# Document type
if "document_type" in metadata:
sanitized["document_type"] = str(metadata["document_type"]).strip()
# Language
if "language" in metadata:
lang = str(metadata["language"]).strip().lower()
if len(lang) == 2: # Basic validation for ISO 639-1
sanitized["language"] = lang
# Technical level
if "technical_level" in metadata:
level = str(metadata["technical_level"]).strip().lower()
if level in ["beginner", "intermediate", "advanced"]:
sanitized["technical_level"] = level
return sanitized
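# Worked example (illustrative): invalid values are dropped rather than raised.
#
#     raw = {"tags": [" Python ", "WEAVIATE"], "language": "en", "technical_level": "expert"}
#     tagger._sanitize_metadata(raw)
#     # -> {"tags": ["python", "weaviate"], "language": "en"}  ("expert" is not an accepted level)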
async def __aenter__(self) -> "MetadataTagger":
"""Async context manager entry."""
return self
async def __aexit__(self, *args: object) -> None:
"""Async context manager exit."""
await self.client.aclose()
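# Usage sketch (endpoint and document are hypothetical): entering the async
# context manager guarantees the underlying httpx client is closed on exit.
#
#     async with MetadataTagger(llm_endpoint="http://llm.lab") as tagger:
#         tagged = await tagger.tag_document(document)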

View File

@@ -0,0 +1,220 @@
"""Vectorizer utility for generating embeddings."""
from types import TracebackType
from typing import Self
import httpx
from ..core.exceptions import VectorizationError
from ..core.models import StorageConfig, VectorConfig
class Vectorizer:
"""Handles text vectorization using LLM endpoints."""
endpoint: str
model: str
dimension: int
client: httpx.AsyncClient
def __init__(self, config: StorageConfig | VectorConfig):
"""
Initialize vectorizer.
Args:
config: Configuration with embedding details
"""
if isinstance(config, StorageConfig):
# When given a StorageConfig, use the pipeline's default embedding
# settings (the config object is not consulted for them)
self.endpoint = "http://llm.lab"
self.model = "ollama/bge-m3:latest"
self.dimension = 1024
else:
self.endpoint = str(config.embedding_endpoint)
self.model = config.model
self.dimension = config.dimension
# Get API key from environment
import os
from dotenv import load_dotenv
from pathlib import Path
# Load .env from the project root
env_path = Path(__file__).parent.parent.parent / ".env"
load_dotenv(env_path)
api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") or ""
headers = {"Content-Type": "application/json"}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
self.client = httpx.AsyncClient(timeout=60.0, headers=headers)
async def vectorize(self, text: str) -> list[float]:
"""
Generate embedding vector for text.
Args:
text: Text to vectorize
Returns:
Embedding vector
"""
if not text:
raise VectorizationError("Cannot vectorize empty text")
try:
# Prepare request based on model type
if "ollama" in self.model:
response = await self._ollama_embed(text)
else:
response = await self._openai_embed(text)
return response
except Exception as e:
raise VectorizationError(f"Vectorization failed: {e}") from e
async def vectorize_batch(self, texts: list[str]) -> list[list[float]]:
"""
Generate embeddings for multiple texts.
Args:
texts: List of texts to vectorize
Returns:
List of embedding vectors
"""
vectors: list[list[float]] = []
for text in texts:
vector = await self.vectorize(text)
vectors.append(vector)
return vectors
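# vectorize_batch issues one request per text. OpenAI-compatible /v1/embeddings
# endpoints commonly accept a list as "input", so a single batched request is a
# possible optimization (sketch; confirm your backend supports list input):
#
#     response = await self.client.post(
#         f"{self.endpoint}/v1/embeddings",
#         json={"model": self.model, "input": texts},
#     )
#     vectors = [item["embedding"] for item in response.json()["data"]]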
async def _ollama_embed(self, text: str) -> list[float]:
"""
Generate embedding using Ollama via an OpenAI-compatible endpoint.
Args:
text: Text to embed
Returns:
Embedding vector
"""
# Ollama models are served through the same OpenAI-compatible /v1/embeddings
# route, with the full model name (e.g. "ollama/bge-m3:latest") passed through
# unchanged, so request and response handling is identical to _openai_embed.
return await self._openai_embed(text)
async def _openai_embed(self, text: str) -> list[float]:
"""
Generate embedding using OpenAI-compatible API.
Args:
text: Text to embed
Returns:
Embedding vector
"""
response = await self.client.post(
f"{self.endpoint}/v1/embeddings",
json={
"model": self.model,
"input": text,
},
)
_ = response.raise_for_status()
response_data = response.json()
if not isinstance(response_data, dict):
raise VectorizationError("Invalid response format from OpenAI API")
embeddings_raw = response_data.get("data", [])
if not isinstance(embeddings_raw, list) or not embeddings_raw:
raise VectorizationError("No embeddings returned")
first_embedding_data = embeddings_raw[0]
if not isinstance(first_embedding_data, dict):
raise VectorizationError("Invalid embedding data format")
embedding_raw = first_embedding_data.get("embedding")
if not isinstance(embedding_raw, list):
raise VectorizationError("Invalid embedding format")
# Convert to float list and validate
embedding: list[float] = []
for item in embedding_raw:
if isinstance(item, (int, float)):
embedding.append(float(item))
else:
raise VectorizationError(f"Invalid embedding value type: {type(item)}")
# Coerce to the configured dimension; truncation and zero-padding are lossy,
# so this mainly guards against a misconfigured EMBEDDING_DIMENSION
if len(embedding) != self.dimension:
if len(embedding) > self.dimension:
embedding = embedding[: self.dimension]
else:
embedding.extend([0.0] * (self.dimension - len(embedding)))
return embedding
async def __aenter__(self) -> Self:
"""Async context manager entry."""
return self
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: TracebackType | None,
) -> None:
"""Async context manager exit."""
await self.client.aclose()
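# Usage sketch (values are illustrative; VectorConfig fields match those read
# in __init__ above):
#
#     config = VectorConfig(
#         embedding_endpoint="http://llm.lab",
#         model="ollama/bge-m3:latest",
#         dimension=1024,
#     )
#     async with Vectorizer(config) as vectorizer:
#         vector = await vectorizer.vectorize("hello world")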

78
pyproject.toml Normal file
View File

@@ -0,0 +1,78 @@
[project]
name = "ingest-pipeline"
version = "0.1.0"
description = "Document ingestion pipeline with Prefect orchestration"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"prefect>=2.14.0",
"pydantic>=2.5.0",
"pydantic-settings>=2.1.0",
"firecrawl-py>=0.0.1",
"gitpython>=3.1.40",
"weaviate-client>=4.4.0",
"httpx>=0.25.0",
"typer>=0.9.0",
"rich>=13.7.0",
"textual>=0.50.0",
"python-dotenv>=1.0.0",
]
[project.scripts]
ingest = "ingest_pipeline.cli.main:app"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["ingest_pipeline"]
[tool.uv]
dev-dependencies = [
"pytest>=7.4.0",
"pytest-asyncio>=0.21.0",
"pytest-cov>=4.1.0",
"mypy>=1.7.0",
"ruff>=0.1.0",
"basedpyright>=1.31.4",
]
[tool.ruff]
line-length = 100
target-version = "py311"
[tool.ruff.lint]
select = [
"E", # pycodestyle errors
"W", # pycodestyle warnings
"F", # pyflakes
"I", # isort
"B", # flake8-bugbear
"C4", # flake8-comprehensions
"UP", # pyupgrade
]
ignore = [
"E501", # line too long (handled by formatter)
]
[tool.ruff.lint.per-file-ignores]
"ingest_pipeline/cli/main.py" = ["B008"] # Typer uses function calls in defaults
[tool.mypy]
python_version = "3.11"
strict = true
warn_return_any = true
warn_unused_configs = true
ignore_missing_imports = true
# Allow AsyncGenerator types in overrides
disable_error_code = ["override"]
[tool.pytest.ini_options]
asyncio_mode = "auto"
testpaths = ["tests"]
pythonpath = ["."]
[tool.coverage.run]
source = ["ingest_pipeline"]
omit = ["*/tests/*", "*/__main__.py"]

3
tui Executable file
View File

@@ -0,0 +1,3 @@
#!/usr/bin/env bash
cd "$(dirname "$0")"
uv run python -m ingest_pipeline tui

2771
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff