init
11 .claude/settings.local.json Normal file
@@ -0,0 +1,11 @@
{
  "permissions": {
    "allow": [
      "mcp__context7__resolve-library-id",
      "mcp__context7__get-library-docs",
      "mcp__sequential-thinking__sequentialthinking"
    ],
    "deny": [],
    "ask": []
  }
}
51 .env Normal file
@@ -0,0 +1,51 @@
WEAVIATE_IS_LOCAL=True

# URL can be just a host or full URL; defaults shown below
WCD_URL=http://weaviate.yo # or http://localhost:8080
# LOCAL_WEAVIATE_PORT=8080 # optional override
# LOCAL_WEAVIATE_GRPC_PORT=50051 # optional override

# No API key required for local unless you enabled local auth
# WCD_API_KEY=
# API Keys (only if not using local/self-hosted services)
FIRECRAWL_API_KEY=dummy-key
OPENWEBUI_API_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6IjlmNjEwODg2LWRhM2MtNDQ4YS05OWE0LTYyZGEyZjIyZjJiNiJ9.W-dqabcE4F-LQ--k2yrJM_KEBDB-wi1CmoahlN1tQbY
OPENWEBUI_API_URL=http://chat.lab
WEAVIATE_API_KEY=
OPENAI_API_KEY=sk-1234
LLM_API_KEY=sk-1234
# Endpoints
LLM_ENDPOINT=http://llm.lab
WEAVIATE_ENDPOINT=http://weaviate.yo
OPENWEBUI_ENDPOINT=http://chat.lab
FIRECRAWL_ENDPOINT=http://crawl.lab:30002

# Model Configuration
EMBEDDING_MODEL=ollama/bge-m3:latest
EMBEDDING_DIMENSION=1024

# Ingestion Settings
BATCH_SIZE=50
MAX_FILE_SIZE=1000000
MAX_CRAWL_DEPTH=5
MAX_CRAWL_PAGES=100

# Storage Settings
DEFAULT_STORAGE_BACKEND=weaviate
COLLECTION_PREFIX=docs

# Prefect Settings
PREFECT_API_URL=http://prefect.lab/api
PREFECT_API_KEY=0nR4WAkQ3q9MY1bjqATK6pVmolighvrS
PREFECT_WORK_POOL=default

# Scheduling
DEFAULT_SCHEDULE_INTERVAL=60

# Performance
MAX_CONCURRENT_TASKS=5
REQUEST_TIMEOUT=60

# Logging
LOG_LEVEL=INFO
FIRST_START_ELYSIA='1'
39 .env.example Normal file
@@ -0,0 +1,39 @@
# API Keys (only if not using local/self-hosted services)
FIRECRAWL_API_KEY=
OPENWEBUI_API_KEY=
WEAVIATE_API_KEY=

# Endpoints
LLM_ENDPOINT=http://llm.lab
WEAVIATE_ENDPOINT=http://weaviate.yo
OPENWEBUI_ENDPOINT=http://chat.lab
FIRECRAWL_ENDPOINT=http://crawl.lab:30002

# Model Configuration
EMBEDDING_MODEL=ollama/bge-m3:latest
EMBEDDING_DIMENSION=1024

# Ingestion Settings
BATCH_SIZE=50
MAX_FILE_SIZE=1000000
MAX_CRAWL_DEPTH=5
MAX_CRAWL_PAGES=100

# Storage Settings
DEFAULT_STORAGE_BACKEND=weaviate
COLLECTION_PREFIX=docs

# Prefect Settings
PREFECT_API_URL=
PREFECT_API_KEY=
PREFECT_WORK_POOL=default

# Scheduling
DEFAULT_SCHEDULE_INTERVAL=60

# Performance
MAX_CONCURRENT_TASKS=5
REQUEST_TIMEOUT=60

# Logging
LOG_LEVEL=INFO
100 CLAUDE.md Normal file
@@ -0,0 +1,100 @@
# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Project Overview

This is a modular document ingestion pipeline that uses Prefect to orchestrate ingestion from web/documentation sites (via Firecrawl) and Git repositories (via Repomix) into a Weaviate vector database or Open WebUI knowledge endpoints.

## Development Commands

### Environment Setup
```bash
# Install dependencies using uv (required)
uv sync

# Activate virtual environment
source .venv/bin/activate

# Install repomix globally (required for repository ingestion)
npm install -g repomix

# Configure environment
cp .env.example .env
# Edit .env with your settings
```

### Running the Application
```bash
# One-time ingestion
python -m ingest_pipeline ingest <url> --type web --storage weaviate

# Schedule recurring ingestion
python -m ingest_pipeline schedule <name> <url> --type web --storage weaviate --cron "0 2 * * *"

# Start deployment server
python -m ingest_pipeline serve

# View configuration
python -m ingest_pipeline config
```

### Code Quality
```bash
# Run linting
uv run ruff check .
uv run ruff format .

# Type checking
uv run mypy ingest_pipeline

# Install dev dependencies
uv sync --dev
```

## Architecture

The pipeline follows a modular architecture with clear separation of concerns:

- **Ingestors** (`ingest_pipeline/ingestors/`): Abstract base class pattern for different data sources (Firecrawl for web, Repomix for repositories); see the sketch below
- **Storage Adapters** (`ingest_pipeline/storage/`): Abstract base class for storage backends (Weaviate, Open WebUI)
- **Prefect Flows** (`ingest_pipeline/flows/`): Orchestration layer using Prefect for scheduling and task management
- **CLI** (`ingest_pipeline/cli/main.py`): Typer-based command interface with commands: `ingest`, `schedule`, `serve`, `config`
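A minimal sketch of the ingestor base-class pattern (the real interface lives in `ingestors/base.py`; the method names here are assumptions):

```python
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator


class BaseIngestor(ABC):
    """Common contract for the Firecrawl and Repomix ingestors (illustrative)."""

    @abstractmethod
    async def validate_source(self, source_url: str) -> bool:
        """Check that the source is reachable before a full ingestion run."""

    @abstractmethod
    def ingest(self, source_url: str) -> AsyncIterator[dict[str, str]]:
        """Yield documents one at a time so large sources can stream in batches."""
```

Each concrete ingestor then only implements source-specific fetching; the flow layer stays unchanged.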
## Key Implementation Details

### Type Safety
- Strict typing enforced with no `Any` types allowed
- Modern typing syntax using `|` instead of `Union`
- Pydantic v2+ for all models and settings
- All models in `core/models.py` use TypedDict for metadata and strict Pydantic models (pattern sketched below)
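A minimal sketch of that pattern, with illustrative field names rather than the actual `core/models.py` contents:

```python
from typing import TypedDict

from pydantic import BaseModel, ConfigDict


class DocumentMetadata(TypedDict, total=False):
    """Typed metadata dictionary; the keys shown are illustrative."""

    source_url: str
    tags: list[str]


class Document(BaseModel):
    """Strict Pydantic v2 model: unknown fields are rejected."""

    model_config = ConfigDict(extra="forbid")

    title: str
    content: str
    metadata: DocumentMetadata
```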
### Configuration Management
- Settings loaded from `.env` file via Pydantic Settings
- Cached singleton pattern in `config/settings.py` using `@lru_cache` (see the sketch below)
- Environment-specific endpoints configured for local services (llm.lab, weaviate.yo, chat.lab)
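A minimal sketch of the cached-singleton pattern, assuming field names that mirror the `.env` keys (the real `config/settings.py` may differ):

```python
from functools import lru_cache

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Validated once from .env; field names mirror the env keys."""

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    llm_endpoint: str = "http://llm.lab"
    weaviate_endpoint: str = "http://weaviate.yo"
    openwebui_endpoint: str = "http://chat.lab"
    embedding_model: str = "ollama/bge-m3:latest"


@lru_cache
def get_settings() -> Settings:
    # Cached singleton: every caller shares one validated instance.
    return Settings()
```

Because of `@lru_cache`, `Settings()` is constructed and validated only once per process; repeated `get_settings()` calls are effectively free.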
### Flow Orchestration
- Main ingestion flow in `flows/ingestion.py` with retry logic and task decorators
- Deployment scheduling in `flows/scheduler.py` supporting both cron and interval schedules
- Tasks use Prefect's `@task` decorator with retries and tags for monitoring (see the sketch below)
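A minimal sketch of that task/flow shape (names and retry values are illustrative, not the actual `flows/ingestion.py`):

```python
from prefect import flow, task


@task(retries=3, retry_delay_seconds=30, tags=["ingestion"])
async def fetch_documents(source_url: str) -> list[str]:
    # Placeholder body; the real task delegates to Firecrawl or Repomix.
    return [f"document fetched from {source_url}"]


@flow(name="ingestion")
async def ingestion_flow(source_url: str) -> int:
    documents = await fetch_documents(source_url)
    return len(documents)
```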
### Storage Backends
- Weaviate: Uses batch ingestion with configurable batch size, automatic collection creation
- Open WebUI: Direct API integration for knowledge base management
- Both inherit from abstract `BaseStorage` class ensuring a consistent interface (sketched below)
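A minimal sketch of what that shared interface could look like; `initialize` appears in the CLI code, while the other method name is an assumption:

```python
from abc import ABC, abstractmethod


class BaseStorage(ABC):
    """Shared contract for the Weaviate and Open WebUI adapters (illustrative)."""

    @abstractmethod
    async def initialize(self) -> None:
        """Open connections and ensure the target collection exists."""

    @abstractmethod
    async def store_batch(self, documents: list[dict[str, str]]) -> int:
        """Persist one batch of documents; return how many were stored."""
```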
## Service Endpoints

- **LLM Proxy**: http://llm.lab (for embeddings and processing)
- **Weaviate**: http://weaviate.yo (vector database)
- **Open WebUI**: http://chat.lab (knowledge interface)
- **Firecrawl**: http://crawl.lab:30002 (web crawling service)

## Important Constraints

- Cyclomatic complexity must remain < 15 for all functions
- Maximum file size for ingestion: 1MB
- Batch size limits: 50-500 documents
- Concurrent task limit: 5 (configurable via MAX_CONCURRENT_TASKS)
- All async operations use proper async/await patterns
150 README.md Normal file
@@ -0,0 +1,150 @@
# Document Ingestion Pipeline

A modular, type-safe Python application using Prefect for scheduling ingestion jobs from web/documentation sites (via Firecrawl) and Git repositories (via Repomix) into Weaviate or Open WebUI knowledge endpoints.

## Features

- **Multiple Data Sources**:
  - Web/documentation sites via Firecrawl
  - Git repositories via Repomix

- **Multiple Storage Backends**:
  - Weaviate vector database (self-hosted at http://weaviate.yo)
  - Open WebUI knowledge endpoints (http://chat.lab)

- **Scheduling & Orchestration**:
  - Prefect-based workflow orchestration
  - Cron and interval-based scheduling
  - Concurrent task execution

- **Type Safety**:
  - Strict Python typing with no `Any` types
  - Modern typing syntax (using `|` instead of `Union`)
  - Pydantic models for validation

- **Code Quality**:
  - Modular architecture
  - Cyclomatic complexity < 15
  - Clean separation of concerns

## Installation

```bash
# Install dependencies
pip install -r requirements.txt

# Install repomix globally (required for repository ingestion)
npm install -g repomix

# Copy and configure environment
cp .env.example .env
# Edit .env with your settings
```

## Usage

### One-time Ingestion

```bash
# Ingest a documentation site into Weaviate
python -m ingest_pipeline ingest https://docs.example.com --type web --storage weaviate

# Ingest a repository into Open WebUI
python -m ingest_pipeline ingest https://github.com/user/repo --type repository --storage open_webui
```

### Scheduled Ingestion

```bash
# Create a daily documentation crawl
python -m ingest_pipeline schedule daily-docs https://docs.example.com \
    --type documentation \
    --storage weaviate \
    --cron "0 2 * * *"

# Create an hourly repository sync
python -m ingest_pipeline schedule repo-sync https://github.com/user/repo \
    --type repository \
    --storage open_webui \
    --interval 60
```

### Serve Deployments

```bash
# Start serving scheduled deployments
python -m ingest_pipeline serve
```

### Configuration

```bash
# View current configuration
python -m ingest_pipeline config
```

## Architecture

```
ingest_pipeline/
├── core/                # Core models and exceptions
│   ├── models.py        # Pydantic models with strict typing
│   └── exceptions.py    # Custom exceptions
├── ingestors/           # Data source ingestors
│   ├── base.py          # Abstract base ingestor
│   ├── firecrawl.py     # Web/docs ingestion via Firecrawl
│   └── repomix.py       # Repository ingestion via Repomix
├── storage/             # Storage adapters
│   ├── base.py          # Abstract base storage
│   ├── weaviate.py      # Weaviate adapter
│   └── openwebui.py     # Open WebUI adapter
├── flows/               # Prefect flows
│   ├── ingestion.py     # Main ingestion flow
│   └── scheduler.py     # Deployment scheduling
├── config/              # Configuration management
│   └── settings.py      # Settings with Pydantic
├── utils/               # Utilities
│   └── vectorizer.py    # Text vectorization
└── cli/                 # CLI interface
    └── main.py          # Typer-based CLI
```

## Environment Variables

- `FIRECRAWL_API_KEY`: API key for Firecrawl (optional)
- `LLM_ENDPOINT`: LLM proxy endpoint (default: http://llm.lab)
- `WEAVIATE_ENDPOINT`: Weaviate endpoint (default: http://weaviate.yo)
- `OPENWEBUI_ENDPOINT`: Open WebUI endpoint (default: http://chat.lab)
- `EMBEDDING_MODEL`: Model for embeddings (default: ollama/bge-m3:latest)

## Vectorization

The pipeline uses your LLM proxy at http://llm.lab with:
- Model: `ollama/gpt-oss:20b` for processing
- Embeddings: `ollama/bge-m3:latest` for vectorization (see the call sketch below)
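A minimal sketch of an embedding call through the proxy, assuming it exposes an OpenAI-compatible `/v1/embeddings` route (the path and response shape are assumptions, not confirmed by this repo):

```python
import httpx


def embed_text(text: str) -> list[float]:
    """Request an embedding vector for one chunk of text."""
    response = httpx.post(
        "http://llm.lab/v1/embeddings",  # assumed OpenAI-compatible proxy route
        json={"model": "ollama/bge-m3:latest", "input": text},
        timeout=60,
    )
    response.raise_for_status()
    # OpenAI-style payload: {"data": [{"embedding": [...]}], ...}
    return response.json()["data"][0]["embedding"]
```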
## Storage Backends

### Weaviate
- Endpoint: http://weaviate.yo
- Automatic collection creation
- Vector similarity search
- Batch ingestion support

### Open WebUI
- Endpoint: http://chat.lab/docs
- Knowledge base integration
- Direct API access
- Document management

## Development

The codebase follows strict typing and quality standards:
- No use of `Any` type
- Modern Python typing syntax
- Cyclomatic complexity < 15
- Modular, testable architecture

## License

MIT
24 basedpyright.json Normal file
@@ -0,0 +1,24 @@
{
  "include": [
    "ingest_pipeline"
  ],
  "exclude": [
    "**/__pycache__",
    "**/.pytest_cache",
    "**/node_modules",
    ".venv"
  ],
  "reportCallInDefaultInitializer": "none",
  "reportUnknownVariableType": "warning",
  "reportUnknownMemberType": "warning",
  "reportUnknownArgumentType": "warning",
  "reportUnknownLambdaType": "warning",
  "reportUnknownParameterType": "warning",
  "reportMissingParameterType": "warning",
  "reportUnannotatedClassAttribute": "warning",
  "reportAny": "warning",
  "reportUnusedCallResult": "none",
  "reportUnnecessaryIsInstance": "none",
  "reportImplicitOverride": "none",
  "reportDeprecated": "warning"
}
248 docs/elysia.md Normal file
@@ -0,0 +1,248 @@
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/services/user.py", line 223, in check_all_trees_timeout
    self.users[user_id]["tree_manager"].check_all_trees_timeout()
KeyError: 'tree_manager'
[10:08:31] ERROR    Job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 10:09:00 EDT)" raised an exception    base.py:195
Traceback (most recent call last):
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/apscheduler/executors/base.py", line 181, in run_coroutine_job
    retval = await job.func(*job.args, **job.kwargs)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/app.py", line 35, in check_timeouts
    await user_manager.check_all_trees_timeout()
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/services/user.py", line 223, in check_all_trees_timeout
    self.users[user_id]["tree_manager"].check_all_trees_timeout()
KeyError: 'tree_manager'
[10:26:25] WARNING  Run time of job "check_restart_clients (trigger: interval[0:00:31], next run at: 2025-09-15 10:26:33 EDT)" was missed by 0:00:23.029499    base.py:176
           WARNING  Run time of job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 10:26:53 EDT)" was missed by 0:00:01.030848    base.py:176
           WARNING  Run time of job "output_resources (trigger: interval[0:18:23], next run at: 2025-09-15 10:33:44 EDT)" was missed by 0:11:04.063842    base.py:176
[10:41:41] WARNING  Run time of job "check_restart_clients (trigger: interval[0:00:31], next run at: 2025-09-15 10:42:03 EDT)" was missed by 0:00:09.036380    base.py:176
           WARNING  Run time of job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 10:41:52 EDT)" was missed by 0:00:18.037363    base.py:176
           WARNING  Run time of job "output_resources (trigger: interval[0:18:23], next run at: 2025-09-15 10:52:07 EDT)" was missed by 0:07:57.071763    base.py:176
[10:51:25] WARNING  Run time of job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 10:51:32 EDT)" was missed by 0:00:21.808772    base.py:176
           WARNING  Run time of job "check_restart_clients (trigger: interval[0:00:31], next run at: 2025-09-15 10:51:52 EDT)" was missed by 0:00:03.810823    base.py:176
[10:51:32] ERROR    Job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 10:52:01 EDT)" raised an exception    base.py:195
Traceback (most recent call last):
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/apscheduler/executors/base.py", line 181, in run_coroutine_job
    retval = await job.func(*job.args, **job.kwargs)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/app.py", line 35, in check_timeouts
    await user_manager.check_all_trees_timeout()
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/services/user.py", line 223, in check_all_trees_timeout
    self.users[user_id]["tree_manager"].check_all_trees_timeout()
KeyError: 'tree_manager'
[10:51:43] ERROR    Unexpected error: 'client_manager'    error_handlers.py:32
INFO:     127.0.0.1:50043 - "GET /feedback/metadata/b6c0f65db8197395b453a7777a5e4c44 HTTP/1.1" 500 Internal Server Error
ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/uvicorn/protocols/http/httptools_impl.py", line 409, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/fastapi/applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/applications.py", line 113, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/middleware/errors.py", line 186, in __call__
    raise exc
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/middleware/errors.py", line 164, in __call__
    await self.app(scope, receive, _send)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/middleware/cors.py", line 85, in __call__
    await self.app(scope, receive, send)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/middleware/exceptions.py", line 63, in __call__
    await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
    raise exc
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
    await app(scope, receive, sender)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 716, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 736, in app
    await route.handle(scope, receive, send)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 290, in handle
    await self.app(scope, receive, send)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 78, in app
    await wrap_app_handling_exceptions(app, request)(scope, receive, send)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
    raise exc
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
    await app(scope, receive, sender)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 75, in app
    response = await f(request)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/fastapi/routing.py", line 302, in app
    raw_response = await run_endpoint_function(
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/fastapi/routing.py", line 213, in run_endpoint_function
    return await dependant.call(**values)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/routes/feedback.py", line 81, in run_feedback_metadata
    client_manager: ClientManager = user["client_manager"]
KeyError: 'client_manager'
           ERROR    HTTP error occurred: Not Found    error_handlers.py:14
INFO:     127.0.0.1:50045 - "GET /icon.svg?d6c34577c7161f78 HTTP/1.1" 404 Not Found
INFO:     127.0.0.1:50045 - "GET /user/config/models HTTP/1.1" 200 OK
INFO:     127.0.0.1:50054 - "GET /user/config/models HTTP/1.1" 200 OK
[10:52:01] ERROR    Job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 10:52:30 EDT)" raised an exception    base.py:195
Traceback (most recent call last):
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/apscheduler/executors/base.py", line 181, in run_coroutine_job
    retval = await job.func(*job.args, **job.kwargs)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/app.py", line 35, in check_timeouts
    await user_manager.check_all_trees_timeout()
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/services/user.py", line 223, in check_all_trees_timeout
    self.users[user_id]["tree_manager"].check_all_trees_timeout()
KeyError: 'tree_manager'
[10:52:07] ERROR    Job "output_resources (trigger: interval[0:18:23], next run at: 2025-09-15 11:10:30 EDT)" raised an exception    base.py:195
Traceback (most recent call last):
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/apscheduler/executors/base.py", line 181, in run_coroutine_job
    retval = await job.func(*job.args, **job.kwargs)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/app.py", line 40, in output_resources
    await print_resources(user_manager, save_to_file=True)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/utils/resources.py", line 59, in print_resources
    avg_user_memory, avg_tree_memory = await get_average_user_memory(user_manager)
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/utils/resources.py", line 37, in get_average_user_memory
    for tree in user["tree_manager"].trees.values():
KeyError: 'tree_manager'
108 docs/tagging.md Normal file
@@ -0,0 +1,108 @@
Here are clear written examples of **metadata tagging** in both Open WebUI and Weaviate, showing how you can associate tags and structured metadata with knowledge objects for RAG and semantic search.

***

### Example: Metadata Tagging in Open WebUI

You send a document to the Open WebUI API endpoint, attaching metadata and tags in the content field as a JSON string:

```json
POST http://localhost/api/v1/documents/create
Content-Type: application/json

{
  "name": "policy_doc_2022",
  "title": "2022 Policy Handbook",
  "collection_name": "company_handbooks",
  "filename": "policy_2022.pdf",
  "content": "{\"tags\": [\"policy\", \"2022\", \"hr\"], \"source_url\": \"https://example.com/policy_2022.pdf\", \"author\": \"Jane Doe\"}"
}
```

- The `"tags"` field is a list of labels for classification (policy, 2022, hr).
- The `"source_url"` and `"author"` fields provide additional metadata useful for retrieval, audit, and filtering.[1][2]

For pipeline-based ingestion, you might design a function to extract and append metadata before vectorization:

```python
# embed_with_metadata, chunk, document_url, and document_author are
# illustrative placeholders for your own pipeline helpers and variables.
metadata = {
    "tags": ["policy", "2022"],
    "source_url": document_url,
    "author": document_author,
}
embed_with_metadata(chunk, metadata)
```

This metadata becomes part of your retrieval context in RAG workflows.[1]
***

### Example: Metadata Tagging in Weaviate

In Weaviate, metadata and tags are defined directly in the schema and attached to each object when added:

**Schema definition:**

```json
{
  "class": "Document",
  "properties": [
    {"name": "title", "dataType": ["text"]},
    {"name": "tags", "dataType": ["text[]"]},
    {"name": "source_url", "dataType": ["text"]},
    {"name": "author", "dataType": ["text"]}
  ]
}
```

**Object creation example:**

```python
client.data_object.create(
    data_object={
        "title": "2022 Policy Handbook",
        "tags": ["policy", "2022", "hr"],
        "source_url": "https://example.com/policy_2022.pdf",
        "author": "Jane Doe"
    },
    class_name="Document"
)
```

- The `"tags"` field is a text array, ideal for semantic filtering and faceting.
- Other fields store provenance metadata, supporting advanced queries and data governance.[3][4][5]

**Query with metadata filtering:**
```python
result = (
    client.query
    .get("Document", ["title", "tags", "author"])
    .with_where({"path": ["tags"], "operator": "ContainsAny", "valueTextArray": ["policy", "hr"]})
    .do()
)
```

This retrieves documents classified with either "policy" or "hr" tags.[3][4]
***

Both platforms support **metadata tagging** for documents, which enables powerful RAG scenarios, detailed filtering, and context-rich retrievals.[1][2][3][4][5]

[1](https://www.reddit.com/r/OpenWebUI/comments/1hmmg9a/how_to_handle_metadata_during_vectorization/)
[2](https://github.com/open-webui/open-webui/discussions/4692)
[3](https://stackoverflow.com/questions/75006703/query-large-list-of-metadate-in-weaviate)
[4](https://weaviate.io/blog/enterprise-workflow-langchain-weaviate)
[5](https://docs.weaviate.io/academy/py/zero_to_mvp/schema_and_imports/schema)
[6](https://docs.weaviate.io/weaviate/api/graphql/additional-properties)
[7](https://weaviate.io/blog/sycamore-and-weaviate)
[8](https://docs.llamaindex.ai/en/stable/examples/vector_stores/WeaviateIndex_auto_retriever/)
[9](https://forum.weaviate.io/t/recommendations-for-metadata-or-knowledge-graphs/960)
[10](https://weaviate.io/blog/agent-workflow-automation-n8n-weaviate)
[11](https://github.com/open-webui/open-webui/discussions/9804)
[12](https://docs.quarkiverse.io/quarkus-langchain4j/dev/rag-weaviate.html)
[13](https://github.com/weaviate/weaviate-examples)
[14](https://docs.openwebui.com/getting-started/api-endpoints/)
[15](https://weaviate.io/blog/hybrid-search-for-web-developers)
[16](https://dev.to/stephenc222/how-to-use-weaviate-to-store-and-query-vector-embeddings-4b9b)
[17](https://helpdesk.egnyte.com/hc/en-us/articles/360035813612-Using-Metadata-in-the-WebUI)
[18](https://docs.datadoghq.com/integrations/weaviate/)
[19](https://docs.openwebui.com/features/)
[20](https://documentation.suse.com/suse-ai/1.0/html/openwebui-configuring/index.html)
[21](https://docs.openwebui.com/getting-started/env-configuration/)
38 ingest_pipeline/.env Normal file
@@ -0,0 +1,38 @@
# API Keys
FIRECRAWL_API_KEY=fc-your-api-key
OPENWEBUI_API_KEY=
WEAVIATE_API_KEY=

# Endpoints
LLM_ENDPOINT=http://llm.lab
WEAVIATE_ENDPOINT=http://weaviate.yo
OPENWEBUI_ENDPOINT=http://chat.lab

# Model Configuration
EMBEDDING_MODEL=ollama/bge-m3:latest
EMBEDDING_DIMENSION=1024

# Ingestion Settings
BATCH_SIZE=50
MAX_FILE_SIZE=1000000
MAX_CRAWL_DEPTH=5
MAX_CRAWL_PAGES=100

# Storage Settings
DEFAULT_STORAGE_BACKEND=weaviate
COLLECTION_PREFIX=docs

# Prefect Settings
PREFECT_API_URL=http://prefect.lab
PREFECT_API_KEY=0nR4WAkQ3q9MY1bjqATK6pVmolighvrS
PREFECT_WORK_POOL=default

# Scheduling
DEFAULT_SCHEDULE_INTERVAL=60

# Performance
MAX_CONCURRENT_TASKS=5
REQUEST_TIMEOUT=60

# Logging
LOG_LEVEL=INFO
6 ingest_pipeline/__main__.py Normal file
@@ -0,0 +1,6 @@
"""Main entry point for the ingestion pipeline."""

from .cli.main import app

if __name__ == "__main__":
    app()
BIN ingest_pipeline/__pycache__/__main__.cpython-312.pyc Normal file
Binary file not shown.
5 ingest_pipeline/cli/__init__.py Normal file
@@ -0,0 +1,5 @@
"""CLI module for the ingestion pipeline."""

from .main import app

__all__ = ["app"]
BIN ingest_pipeline/cli/__pycache__/__init__.cpython-312.pyc Normal file
Binary file not shown.
BIN ingest_pipeline/cli/__pycache__/__init__.cpython-313.pyc Normal file
Binary file not shown.
BIN ingest_pipeline/cli/__pycache__/main.cpython-312.pyc Normal file
Binary file not shown.
BIN ingest_pipeline/cli/__pycache__/main.cpython-313.pyc Normal file
Binary file not shown.
BIN ingest_pipeline/cli/__pycache__/tui.cpython-312.pyc Normal file
Binary file not shown.
BIN ingest_pipeline/cli/__pycache__/tui.cpython-313.pyc Normal file
Binary file not shown.
616 ingest_pipeline/cli/main.py Normal file
@@ -0,0 +1,616 @@
"""CLI interface for ingestion pipeline."""

import asyncio
from enum import Enum

import typer
from rich.console import Console
from rich.panel import Panel
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
from rich.table import Table

from ..config import get_settings
from ..core.models import IngestionResult
from ..flows.ingestion import create_ingestion_flow
from ..flows.scheduler import create_scheduled_deployment, serve_deployments


class SourceType(str, Enum):
    """Source types for ingestion."""

    web = "web"
    repository = "repository"
    documentation = "documentation"


class StorageBackend(str, Enum):
    """Storage backend options."""

    weaviate = "weaviate"
    open_webui = "open_webui"


app = typer.Typer(
    name="ingest",
    help="🚀 Modern Document Ingestion Pipeline - Advanced web and repository processing",
    rich_markup_mode="rich",
    add_completion=False,
)
console = Console()


@app.callback()
def main(
    version: bool = typer.Option(False, "--version", "-v", help="Show version information"),
) -> None:
    """
    🚀 Modern Document Ingestion Pipeline

    [bold cyan]Advanced document processing and management platform[/bold cyan]

    Features:
    • 🌐 Web scraping and crawling with Firecrawl
    • 📦 Repository ingestion with Repomix
    • 🗄️ Multiple storage backends (Weaviate, OpenWebUI)
    • 📊 Modern TUI for collection management
    • ⚡ Async processing with Prefect orchestration
    • 🎨 Rich CLI with enhanced visuals
    """
    if version:
        console.print(
            Panel(
                "[bold magenta]Ingest Pipeline v0.1.0[/bold magenta]\n"
                "[dim]Modern Document Ingestion & Management System[/dim]",
                title="🚀 Version Info",
                border_style="magenta"
            )
        )
        raise typer.Exit()


@app.command()
def ingest(
    source_url: str = typer.Argument(..., help="URL or path to ingest from"),
    source_type: SourceType = typer.Option(SourceType.web, "--type", "-t", help="Type of source"),
    storage: StorageBackend = typer.Option(
        StorageBackend.weaviate, "--storage", "-s", help="Storage backend"
    ),
    collection: str | None = typer.Option(
        None, "--collection", "-c", help="Target collection name (auto-generated if not specified)"
    ),
    validate: bool = typer.Option(
        True, "--validate/--no-validate", help="Validate source before ingesting"
    ),
) -> None:
    """
    🚀 Run a one-time ingestion job with enhanced progress tracking.

    This command processes documents from various sources and stores them in
    your chosen backend with full progress visualization.
    """
    # Enhanced startup message
    console.print(
        Panel(
            f"[bold cyan]🚀 Starting Modern Ingestion[/bold cyan]\n\n"
            f"[yellow]Source:[/yellow] {source_url}\n"
            f"[yellow]Type:[/yellow] {source_type.value.title()}\n"
            f"[yellow]Storage:[/yellow] {storage.value.replace('_', ' ').title()}\n"
            f"[yellow]Collection:[/yellow] {collection or '[dim]Auto-generated[/dim]'}",
            title="🔥 Ingestion Configuration",
            border_style="cyan"
        )
    )

    async def run_with_progress() -> IngestionResult:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        ) as progress:
            task = progress.add_task("🔄 Processing documents...", total=100)

            # Simulate progress updates during ingestion
            progress.update(task, advance=20, description="🔗 Connecting to services...")
            await asyncio.sleep(0.5)

            progress.update(task, advance=30, description="📄 Fetching documents...")
            result = await run_ingestion(
                url=source_url,
                source_type=source_type.value,
                storage_backend=storage.value,
                collection_name=collection,
                validate_first=validate,
            )

            progress.update(task, advance=50, description="✅ Ingestion complete!")
            return result

    result = asyncio.run(run_with_progress())

    # Enhanced results display
    status_color = "green" if result.status.value == "completed" else "red"

    # Create results table with enhanced styling
    table = Table(
        title="📊 Ingestion Results",
        title_style="bold magenta",
        border_style="cyan",
        header_style="bold blue"
    )
    table.add_column("📋 Metric", style="cyan", no_wrap=True)
    table.add_column("📈 Value", style=status_color, justify="right")

    # Add enhanced status icon
    status_icon = "✅" if result.status.value == "completed" else "❌"
    table.add_row("Status", f"{status_icon} {result.status.value.title()}")

    table.add_row("Documents Processed", f"📄 {result.documents_processed:,}")
    table.add_row("Documents Failed", f"⚠️ {result.documents_failed:,}")
    table.add_row("Duration", f"⏱️ {result.duration_seconds:.2f}s")

    if result.error_messages:
        error_text = "\n".join(f"❌ {error}" for error in result.error_messages[:3])
        if len(result.error_messages) > 3:
            error_text += f"\n... and {len(result.error_messages) - 3} more errors"
        table.add_row("Errors", error_text)

    console.print(table)

    # Success celebration or error guidance
    if result.status.value == "completed" and result.documents_processed > 0:
        console.print(
            Panel(
                f"🎉 [bold green]Success![/bold green] {result.documents_processed} documents ingested\n\n"
                f"💡 [dim]Try '[bold cyan]ingest modern[/bold cyan]' to explore your collections![/dim]",
                title="✨ Ingestion Complete",
                border_style="green"
            )
        )
    elif result.error_messages:
        console.print(
            Panel(
                "❌ [bold red]Ingestion encountered errors[/bold red]\n\n"
                "💡 [dim]Check your configuration and try again[/dim]",
                title="⚠️ Issues Detected",
                border_style="red"
            )
        )


@app.command()
def schedule(
    name: str = typer.Argument(..., help="Deployment name"),
    source_url: str = typer.Argument(..., help="URL or path to ingest from"),
    source_type: SourceType = typer.Option(SourceType.web, "--type", "-t", help="Type of source"),
    storage: StorageBackend = typer.Option(
        StorageBackend.weaviate, "--storage", "-s", help="Storage backend"
    ),
    cron: str | None = typer.Option(None, "--cron", "-c", help="Cron expression for scheduling"),
    interval: int = typer.Option(60, "--interval", "-i", help="Interval in minutes"),
    serve_now: bool = typer.Option(False, "--serve/--no-serve", help="Start serving immediately"),
) -> None:
    """
    Create a scheduled deployment for recurring ingestion.
    """
    console.print(f"[bold blue]Creating deployment: {name}[/bold blue]")

    deployment = create_scheduled_deployment(
        name=name,
        source_url=source_url,
        source_type=source_type.value,
        storage_backend=storage.value,
        schedule_type="cron" if cron else "interval",
        cron_expression=cron,
        interval_minutes=interval,
    )

    console.print(f"[green]✓ Deployment '{name}' created[/green]")

    if serve_now:
        console.print("[yellow]Starting deployment server...[/yellow]")
        serve_deployments([deployment])


@app.command()
def serve(
    config_file: str | None = typer.Option(
        None, "--config", "-c", help="Path to deployments config file"
    ),
    ui: str | None = typer.Option(
        None, "--ui", help="Launch user interface (options: tui, web)"
    ),
) -> None:
    """
    🚀 Serve configured deployments with optional UI interface.

    Launch the deployment server to run scheduled ingestion jobs,
    optionally with a modern Terminal User Interface (TUI) or web interface.
    """
    # Handle UI mode first
    if ui == "tui":
        console.print(
            Panel(
                "[bold cyan]🚀 Launching Enhanced TUI[/bold cyan]\n\n"
                "[yellow]Features:[/yellow]\n"
                "• 📊 Interactive collection management\n"
                "• ⌨️ Enhanced keyboard navigation\n"
                "• 🎨 Modern design with focus indicators\n"
                "• 📄 Document browsing and search\n"
                "• 🔄 Real-time status updates",
                title="🎉 TUI Mode",
                border_style="cyan"
            )
        )
        from .tui import dashboard
        dashboard()
        return
    elif ui == "web":
        console.print("[red]Web UI not yet implemented. Use --ui tui for Terminal UI.[/red]")
        return
    elif ui:
        console.print(f"[red]Unknown UI option: {ui}[/red]")
        console.print("[yellow]Available options: tui, web[/yellow]")
        return

    # Normal deployment server mode
    if config_file:
        # Load deployments from config
        console.print(f"[yellow]Loading deployments from {config_file}[/yellow]")
        # Implementation would load YAML/JSON config
    else:
        # Create example deployments
        deployments = [
            create_scheduled_deployment(
                name="docs-daily",
                source_url="https://docs.example.com",
                source_type="documentation",
                storage_backend="weaviate",
                schedule_type="cron",
                cron_expression="0 2 * * *",  # Daily at 2 AM
            ),
            create_scheduled_deployment(
                name="repo-hourly",
                source_url="https://github.com/example/repo",
                source_type="repository",
                storage_backend="open_webui",
                schedule_type="interval",
                interval_minutes=60,
            ),
        ]

        console.print(
            "[bold green]Starting deployment server with example deployments[/bold green]"
        )
        serve_deployments(deployments)


@app.command()
def tui() -> None:
    """
    🚀 Launch the enhanced Terminal User Interface.

    Quick shortcut for 'serve --ui tui' with modern keyboard navigation,
    interactive collection management, and real-time status updates.
    """
    console.print(
        Panel(
            "[bold cyan]🚀 Launching Enhanced TUI[/bold cyan]\n\n"
            "[yellow]Features:[/yellow]\n"
            "• 📊 Interactive collection management\n"
            "• ⌨️ Enhanced keyboard navigation\n"
            "• 🎨 Modern design with focus indicators\n"
            "• 📄 Document browsing and search\n"
            "• 🔄 Real-time status updates",
            title="🎉 TUI Mode",
            border_style="cyan"
        )
    )
    from .tui import dashboard
    dashboard()


@app.command()
def config() -> None:
    """
    📋 Display current configuration with enhanced formatting.

    Shows all configured endpoints, models, and settings in a beautiful
    table format with status indicators.
    """
    settings = get_settings()

    console.print(
        Panel(
            "[bold cyan]⚙️ System Configuration[/bold cyan]\n"
            "[dim]Current pipeline settings and endpoints[/dim]",
            title="🔧 Configuration",
            border_style="cyan"
        )
    )

    # Enhanced configuration table
    table = Table(
        title="📊 Configuration Details",
        title_style="bold magenta",
        border_style="blue",
        header_style="bold cyan",
        show_lines=True
    )
    table.add_column("🏷️ Setting", style="cyan", no_wrap=True, width=25)
    table.add_column("🎯 Value", style="yellow", overflow="fold")
    table.add_column("📊 Status", style="green", width=12, justify="center")

    # Add configuration rows with status indicators
    def get_status_indicator(value: str | None) -> str:
        return "✅ Set" if value else "❌ Missing"

    table.add_row(
        "🤖 LLM Endpoint",
        str(settings.llm_endpoint),
        "✅ Active"
    )
    table.add_row(
        "🔥 Firecrawl Endpoint",
        str(settings.firecrawl_endpoint),
        "✅ Active"
    )
    table.add_row(
        "🗄️ Weaviate Endpoint",
        str(settings.weaviate_endpoint),
        get_status_indicator(str(settings.weaviate_api_key) if settings.weaviate_api_key else None)
    )
    table.add_row(
        "🌐 OpenWebUI Endpoint",
        str(settings.openwebui_endpoint),
        get_status_indicator(settings.openwebui_api_key)
    )
    table.add_row(
        "🧠 Embedding Model",
        settings.embedding_model,
        "✅ Set"
    )
    table.add_row(
        "💾 Default Storage",
        settings.default_storage_backend.title(),
        "✅ Set"
    )
    table.add_row(
        "📦 Default Batch Size",
        f"{settings.default_batch_size:,}",
        "✅ Set"
    )
    table.add_row(
        "⚡ Max Concurrent Tasks",
        f"{settings.max_concurrent_tasks}",
        "✅ Set"
    )

    console.print(table)

    # Additional helpful information
    console.print(
        Panel(
            "💡 [bold cyan]Quick Tips[/bold cyan]\n\n"
            "• Use '[bold]ingest list-collections[/bold]' to view all collections\n"
            "• Use '[bold]ingest search[/bold]' to search content\n"
            "• Configure API keys in your [yellow].env[/yellow] file\n"
            "• Default collection names are auto-generated from URLs",
            title="🚀 Usage Tips",
            border_style="green"
        )
    )


@app.command()
def list_collections() -> None:
    """
    📋 List all collections across storage backends.
    """
    console.print("[bold cyan]📚 Collection Overview[/bold cyan]")
    asyncio.run(run_list_collections())


@app.command()
def search(
    query: str = typer.Argument(..., help="Search query"),
    collection: str | None = typer.Option(None, "--collection", "-c", help="Target collection"),
    backend: StorageBackend = typer.Option(StorageBackend.weaviate, "--backend", "-b", help="Storage backend"),
    limit: int = typer.Option(10, "--limit", "-l", help="Result limit"),
) -> None:
    """
    🔍 Search across collections.
    """
    console.print(f"[bold cyan]🔍 Searching for: {query}[/bold cyan]")
    asyncio.run(run_search(query, collection, backend.value, limit))


async def run_ingestion(
    url: str,
    source_type: str,
    storage_backend: str,
    collection_name: str | None = None,
    validate_first: bool = True
) -> IngestionResult:
    """
    Run ingestion with support for targeted collections.
    """
    # Auto-generate collection name if not provided
    if not collection_name:
        from urllib.parse import urlparse
        parsed = urlparse(url)
        domain = parsed.netloc.replace(".", "_").replace("-", "_")
        collection_name = f"{domain}_{source_type}"

    result = await create_ingestion_flow(
        source_url=url,
        source_type=source_type,
        storage_backend=storage_backend,
        collection_name=collection_name,
        validate_first=validate_first,
    )
    return result


async def run_list_collections() -> None:
    """
    List collections across storage backends.
    """
    from ..config import get_settings
    from ..core.models import StorageBackend, StorageConfig
    from ..storage.openwebui import OpenWebUIStorage
    from ..storage.weaviate import WeaviateStorage

    settings = get_settings()

    console.print("🔍 [bold cyan]Scanning storage backends...[/bold cyan]")

    # Try to connect to Weaviate
    weaviate_collections = []
    try:
        weaviate_config = StorageConfig(
            backend=StorageBackend.WEAVIATE,
            endpoint=settings.weaviate_endpoint,
            api_key=settings.weaviate_api_key,
            collection_name="default",
        )
        weaviate = WeaviateStorage(weaviate_config)
        await weaviate.initialize()

        collections_list = weaviate.client.collections.list_all() if weaviate.client else []
        for collection in collections_list:
            collection_obj = weaviate.client.collections.get(collection) if weaviate.client else None
            if collection_obj:
                count = collection_obj.aggregate.over_all(total_count=True).total_count or 0
                weaviate_collections.append((collection, count))
    except Exception as e:
        console.print(f"❌ [red]Weaviate connection failed: {e}[/red]")

    # Try to connect to OpenWebUI
    openwebui_collections = []
    try:
        openwebui_config = StorageConfig(
            backend=StorageBackend.OPEN_WEBUI,
            endpoint=settings.openwebui_endpoint,
            api_key=settings.openwebui_api_key,
            collection_name="default",
        )
        openwebui = OpenWebUIStorage(openwebui_config)
        await openwebui.initialize()

        response = await openwebui.client.get("/api/v1/knowledge/")
        response.raise_for_status()
        knowledge_bases = response.json()

        for kb in knowledge_bases:
            name = kb.get("name", "Unknown")
            file_count = len(kb.get("files", []))
            openwebui_collections.append((name, file_count))
    except Exception as e:
        console.print(f"❌ [red]OpenWebUI connection failed: {e}[/red]")

    # Display results
    if weaviate_collections or openwebui_collections:
        # Create results table
        from rich.table import Table
        table = Table(
            title="📚 Collection Overview",
            title_style="bold magenta",
            border_style="cyan",
            header_style="bold blue"
        )
        table.add_column("🏷️ Collection", style="cyan", no_wrap=True)
        table.add_column("📊 Backend", style="yellow")
        table.add_column("📄 Documents", style="green", justify="right")

        # Add Weaviate collections
        for name, count in weaviate_collections:
            table.add_row(name, "🗄️ Weaviate", f"{count:,}")

        # Add OpenWebUI collections
        for name, count in openwebui_collections:
            table.add_row(name, "🌐 OpenWebUI", f"{count:,}")

        console.print(table)
    else:
        console.print("❌ [yellow]No collections found in any backend[/yellow]")


async def run_search(query: str, collection: str | None, backend: str, limit: int) -> None:
    """
    Search across collections.
    """
    from ..config import get_settings
    from ..core.models import StorageBackend, StorageConfig
    from ..storage.weaviate import WeaviateStorage

    settings = get_settings()

    console.print(f"🔍 Searching for: '[bold cyan]{query}[/bold cyan]'")
    if collection:
        console.print(f"📚 Target collection: [yellow]{collection}[/yellow]")
    console.print(f"💾 Backend: [blue]{backend}[/blue]")

    results = []

    try:
        if backend == "weaviate":
            weaviate_config = StorageConfig(
                backend=StorageBackend.WEAVIATE,
                endpoint=settings.weaviate_endpoint,
                api_key=settings.weaviate_api_key,
                collection_name=collection or "default",
            )
            weaviate = WeaviateStorage(weaviate_config)
            await weaviate.initialize()

            results_generator = weaviate.search(query, limit=limit)
            async for doc in results_generator:
                results.append({
                    "title": getattr(doc, "title", "Untitled"),
                    "content": getattr(doc, "content", ""),
                    "score": getattr(doc, "score", 0.0),
                    "backend": "🗄️ Weaviate"
                })

        elif backend == "open_webui":
            console.print("❌ [red]OpenWebUI search not yet implemented[/red]")
            return

    except Exception as e:
        console.print(f"❌ [red]Search failed: {e}[/red]")
        return

    # Display results
    if results:
        from rich.table import Table
        table = Table(
            title=f"🔍 Search Results for '{query}'",
            title_style="bold magenta",
            border_style="green",
            header_style="bold blue"
        )
        table.add_column("📄 Title", style="cyan", max_width=40)
        table.add_column("📝 Preview", style="white", max_width=60)
        table.add_column("📊 Score", style="yellow", justify="right")

        for result in results[:limit]:
            title = str(result["title"])
            title_display = title[:40] + "..." if len(title) > 40 else title

            content = str(result["content"])
            content_display = content[:60] + "..." if len(content) > 60 else content

            score = f"{result['score']:.3f}"

            table.add_row(title_display, content_display, score)

        console.print(table)
        console.print(f"\n✅ [green]Found {len(results)} results[/green]")
    else:
        console.print("❌ [yellow]No results found[/yellow]")


if __name__ == "__main__":
    app()
13 ingest_pipeline/cli/tui/__init__.py Normal file
@@ -0,0 +1,13 @@
"""Enhanced TUI package with keyboard navigation and modular architecture."""

from .app import CollectionManagementApp
from .models import CollectionInfo, DocumentInfo
from .utils import dashboard, run_textual_tui

__all__ = [
    "CollectionManagementApp",
    "CollectionInfo",
    "DocumentInfo",
    "dashboard",
    "run_textual_tui",
]
BIN
ingest_pipeline/cli/tui/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
ingest_pipeline/cli/tui/__pycache__/app.cpython-312.pyc
Normal file
Binary file not shown.
BIN
ingest_pipeline/cli/tui/__pycache__/models.cpython-312.pyc
Normal file
Binary file not shown.
BIN
ingest_pipeline/cli/tui/__pycache__/styles.cpython-312.pyc
Normal file
Binary file not shown.
181
ingest_pipeline/cli/tui/app.py
Normal file
@@ -0,0 +1,181 @@
"""Main TUI application with enhanced keyboard navigation."""

from textual import events
from textual.app import App
from textual.binding import Binding

from ...storage.openwebui import OpenWebUIStorage
from ...storage.weaviate import WeaviateStorage
from .screens import CollectionOverviewScreen, HelpScreen
from .styles import TUI_CSS


class CollectionManagementApp(App[None]):
    """Enhanced modern Textual application with comprehensive keyboard navigation."""

    CSS = TUI_CSS

    BINDINGS = [
        Binding("q", "quit", "Quit"),
        Binding("ctrl+c", "quit", "Quit"),
        Binding("ctrl+q", "quit", "Quit"),
        Binding("f1", "help", "Help"),
        Binding("ctrl+h", "help", "Help"),
        Binding("?", "help", "Quick Help"),
        # Global navigation shortcuts
        Binding("ctrl+r", "refresh_current", "Refresh Current Screen"),
        Binding("ctrl+w", "close_current", "Close Current Screen"),
        # Tab navigation shortcuts
        Binding("ctrl+1", "dashboard_tab", "Dashboard", show=False),
        Binding("ctrl+2", "collections_tab", "Collections", show=False),
        Binding("ctrl+3", "analytics_tab", "Analytics", show=False),
    ]

    weaviate: WeaviateStorage | None
    openwebui: OpenWebUIStorage | None

    def __init__(
        self, weaviate: WeaviateStorage | None = None, openwebui: OpenWebUIStorage | None = None
    ):
        super().__init__()
        self.weaviate = weaviate
        self.openwebui = openwebui

    def on_mount(self) -> None:
        """Initialize the enhanced app with better branding."""
        self.title = "🚀 Enhanced Collection Management System"
        self.sub_title = "Advanced Document Ingestion & Management Platform with Keyboard Navigation"
        self.push_screen(CollectionOverviewScreen(self.weaviate, self.openwebui))

    def action_help(self) -> None:
        """Show comprehensive help information with all keyboard shortcuts."""
        help_md = """
# 🚀 Enhanced Collection Management System

## 🎯 Global Navigation
- **F1** / **Ctrl+H** / **?**: Show this help
- **Q** / **Ctrl+C** / **Ctrl+Q**: Quit application
- **Ctrl+R**: Refresh current screen
- **Ctrl+W**: Close current screen/dialog
- **Escape**: Go back/cancel current action

## 📑 Tab Navigation
- **Tab** / **Shift+Tab**: Switch between tabs
- **Ctrl+1**: Jump to Dashboard tab
- **Ctrl+2**: Jump to Collections tab
- **Ctrl+3**: Jump to Analytics tab

## 📚 Collections Management
- **R**: Refresh collections list
- **I**: Start new ingestion
- **M**: Manage documents in selected collection
- **S**: Search within selected collection
- **Ctrl+D**: Delete selected collection

## 🗂️ Table Navigation
- **Arrow Keys** / **J/K/H/L**: Navigate table cells (Vi-style)
- **Home** / **End**: Jump to first/last row
- **Page Up** / **Page Down**: Scroll by page
- **Enter**: Select/activate current row
- **Space**: Toggle row selection
- **Ctrl+A**: Select all items
- **Ctrl+Shift+A**: Clear all selections

## 📄 Document Management
- **Space**: Toggle document selection
- **Delete** / **Ctrl+D**: Delete selected documents
- **A**: Select all documents on page
- **N**: Clear selection
- **Page Up/Down**: Navigate between pages
- **Home/End**: Go to first/last page

## 🔍 Search Features
- **/**: Quick search (focus search field)
- **Ctrl+F**: Focus search input
- **Enter**: Perform search
- **F3**: Repeat last search
- **Ctrl+R**: Clear search results
- **Escape**: Clear search/exit search mode

## 📥 Ingestion Interface
- **1/2/3**: Select ingestion type (Web/Repository/Documentation)
- **Tab/Shift+Tab**: Navigate between fields
- **Enter**: Start ingestion process
- **Ctrl+I**: Quick start ingestion
- **Escape**: Cancel ingestion

## 🎨 Visual Features
- Enhanced focus indicators with colored borders
- Smooth keyboard navigation with visual feedback
- Status indicators with real-time updates
- Progress bars with detailed status messages
- Responsive design with accessibility features

## 💡 Pro Tips
- Use **Vi-style** navigation (J/K/H/L) for efficient movement
- **Tab** through interactive elements for keyboard-only operation
- Hold **Shift** with arrow keys for range selection (where supported)
- Use **Ctrl+** shortcuts for power user efficiency
- **Escape** is your friend - it cancels most operations safely

## 🚀 Performance Features
- Lazy loading for large collections
- Paginated document views
- Background refresh operations
- Efficient memory management
- Responsive UI updates

---

**Enjoy the enhanced keyboard-driven interface!** 🎉

*Press Escape, Enter, or Q to close this help.*
"""
        self.push_screen(HelpScreen(help_md))

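    # The handlers below dispatch to the active screen via hasattr() duck
    # typing: a screen opts in simply by defining the matching method
    # (e.g. action_refresh or action_tab_dashboard).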
    def action_refresh_current(self) -> None:
        """Refresh the current screen if it supports it."""
        current_screen = self.screen
        if hasattr(current_screen, "action_refresh"):
            current_screen.action_refresh()
        else:
            self.notify("Current screen doesn't support refresh", severity="information")

    def action_close_current(self) -> None:
        """Close current screen/dialog."""
        if len(self.screen_stack) > 1:  # Don't close the main screen
            self.pop_screen()
        else:
            self.notify("Cannot close main screen. Use Q to quit.", severity="warning")

    def action_dashboard_tab(self) -> None:
        """Switch to dashboard tab in current screen."""
        current_screen = self.screen
        if hasattr(current_screen, "action_tab_dashboard"):
            current_screen.action_tab_dashboard()

    def action_collections_tab(self) -> None:
        """Switch to collections tab in current screen."""
        current_screen = self.screen
        if hasattr(current_screen, "action_tab_collections"):
            current_screen.action_tab_collections()

    def action_analytics_tab(self) -> None:
        """Switch to analytics tab in current screen."""
        current_screen = self.screen
        if hasattr(current_screen, "action_tab_analytics"):
            current_screen.action_tab_analytics()

    def on_key(self, event: events.Key) -> None:
        """Handle global keyboard shortcuts."""
        # Handle global shortcuts that might not be bound to specific actions
        if event.key == "ctrl+shift+?":
            # Alternative help shortcut
            self.action_help()
            event.prevent_default()
        elif event.key == "ctrl+alt+r":
            # Force refresh all connections
            self.notify("🔄 Refreshing all connections...", severity="information")
            # This could trigger a full reinit if needed
            event.prevent_default()
        # No else clause needed - just handle our events
26
ingest_pipeline/cli/tui/models.py
Normal file
@@ -0,0 +1,26 @@
"""Data models and TypedDict definitions for the TUI."""

from typing import TypedDict


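# TypedDict keeps these records as plain dicts at runtime while giving the
# type checker a fixed field schema for each row.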
class CollectionInfo(TypedDict):
    """Information about a collection."""

    name: str
    type: str
    count: int
    backend: str
    status: str
    last_updated: str
    size_mb: float


class DocumentInfo(TypedDict):
    """Information about a document."""

    id: str
    title: str
    source_url: str
    content_preview: str
    word_count: int
    timestamp: str
18
ingest_pipeline/cli/tui/screens/__init__.py
Normal file
@@ -0,0 +1,18 @@
"""Screen components for the TUI application."""

from .dashboard import CollectionOverviewScreen
from .dialogs import ConfirmDeleteScreen, ConfirmDocumentDeleteScreen
from .documents import DocumentManagementScreen
from .help import HelpScreen
from .ingestion import IngestionScreen
from .search import SearchScreen

__all__ = [
    "CollectionOverviewScreen",
    "IngestionScreen",
    "SearchScreen",
    "DocumentManagementScreen",
    "ConfirmDeleteScreen",
    "ConfirmDocumentDeleteScreen",
    "HelpScreen",
]
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
ingest_pipeline/cli/tui/screens/__pycache__/help.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
542
ingest_pipeline/cli/tui/screens/dashboard.py
Normal file
@@ -0,0 +1,542 @@
"""Main dashboard screen with collections overview."""

from datetime import datetime

from textual import work
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Grid, Horizontal
from textual.reactive import reactive, var
from textual.screen import Screen
from textual.widgets import (
    Button,
    Footer,
    Header,
    LoadingIndicator,
    Rule,
    Static,
    TabbedContent,
    TabPane,
)
from typing_extensions import override

from ....storage.openwebui import OpenWebUIStorage
from ....storage.weaviate import WeaviateStorage
from ..models import CollectionInfo
from ..widgets import EnhancedDataTable, MetricsCard, StatusIndicator


class CollectionOverviewScreen(Screen[None]):
    """Enhanced dashboard with modern design and metrics."""

    total_documents: int = 0
    total_collections: int = 0
    active_backends: int = 0

    BINDINGS = [
        Binding("q", "quit", "Quit"),
        Binding("r", "refresh", "Refresh"),
        Binding("i", "ingest", "Ingest"),
        Binding("m", "manage", "Manage"),
        Binding("s", "search", "Search"),
        Binding("ctrl+d", "delete", "Delete"),
        Binding("ctrl+1", "tab_dashboard", "Dashboard"),
        Binding("ctrl+2", "tab_collections", "Collections"),
        Binding("ctrl+3", "tab_analytics", "Analytics"),
        Binding("tab", "next_tab", "Next Tab"),
        Binding("shift+tab", "prev_tab", "Prev Tab"),
        Binding("f1", "help", "Help"),
    ]

    collections: var[list[CollectionInfo]] = var([])
    is_loading: var[bool] = var(False)
    selected_collection: reactive[CollectionInfo | None] = reactive(None)

    def __init__(self, weaviate: WeaviateStorage | None, openwebui: OpenWebUIStorage | None):
        super().__init__()
        self.weaviate = weaviate
        self.openwebui = openwebui
        self.total_documents = 0
        self.total_collections = 0
        self.active_backends = 0

    @override
    def compose(self) -> ComposeResult:
        yield Header(show_clock=True)

        with TabbedContent("Dashboard", "Collections", "Analytics"):
            # Dashboard Tab
            with TabPane("Dashboard", id="dashboard"):
                yield Container(
                    Static("🚀 Collection Management System", classes="title"),
                    Static("Modern document ingestion and management platform", classes="subtitle"),
                    Rule(line_style="heavy"),
                    # Metrics Grid
                    Grid(
                        MetricsCard(
                            "Collections", str(self.total_collections), "Active collections"
                        ),
                        MetricsCard("Documents", str(self.total_documents), "Total indexed"),
                        MetricsCard("Backends", str(self.active_backends), "Connected services"),
                        MetricsCard("Status", "Online", "System health"),
                        classes="responsive-grid metrics-grid",
                    ),
                    Rule(line_style="dashed"),
                    # Quick Actions
                    Container(
                        Static("⚡ Quick Actions", classes="section-title"),
                        Horizontal(
                            Button("🔄 Refresh Data", id="quick_refresh", variant="primary"),
                            Button("📥 New Ingestion", id="quick_ingest", variant="success"),
                            Button("🔍 Search All", id="quick_search", variant="default"),
                            Button("⚙️ Settings", id="quick_settings", variant="default"),
                            classes="action_buttons",
                        ),
                        classes="card",
                    ),
                    # Recent Activity
                    Container(
                        Static("📊 Recent Activity", classes="section-title"),
                        Static(
                            "Loading recent activity...", id="activity_feed", classes="status-text"
                        ),
                        classes="card",
                    ),
                    classes="main_container",
                )

            # Collections Tab
            with TabPane("Collections", id="collections"):
                yield Container(
                    Static("📚 Collection Overview", classes="title"),
                    # Collection controls
                    Horizontal(
                        Button("🔄 Refresh", id="refresh_btn", variant="primary"),
                        Button("📥 Ingest", id="ingest_btn", variant="success"),
                        Button("🔧 Manage", id="manage_btn", variant="warning"),
                        Button("🗑️ Delete", id="delete_btn", variant="error"),
                        Button("🔍 Search", id="search_btn", variant="default"),
                        classes="button_bar",
                    ),
                    # Collection table with enhanced navigation
                    EnhancedDataTable(id="collections_table", classes="enhanced-table"),
                    # Status bar
                    Container(
                        Static("Ready", id="status_text", classes="status-text"),
                        StatusIndicator("Ready", id="connection_status"),
                        classes="status-bar",
                    ),
                    LoadingIndicator(id="loading", classes="pulse"),
                    classes="main_container",
                )

            # Analytics Tab
            with TabPane("Analytics", id="analytics"):
                yield Container(
                    Static("📈 Analytics & Insights", classes="title"),
                    # Analytics content
                    Container(
                        Static("🚧 Analytics Dashboard", classes="section-title"),
                        Static("Advanced analytics and insights coming soon!", classes="subtitle"),
                        # Placeholder charts area
                        Container(
                            Static("📊 Document Distribution", classes="chart-title"),
                            Static(
                                "Chart placeholder - integrate with visualization library",
                                classes="chart-placeholder",
                            ),
                            classes="card",
                        ),
                        Container(
                            Static("⏱️ Ingestion Timeline", classes="chart-title"),
                            Static("Timeline chart placeholder", classes="chart-placeholder"),
                            classes="card",
                        ),
                        classes="analytics-grid",
                    ),
                    classes="main_container",
                )

        yield Footer()

    async def on_mount(self) -> None:
        """Initialize the screen with enhanced loading."""
        self.query_one("#loading").display = False
        self.update_metrics()
        self.refresh_collections()  # Don't await, let it run as a worker

    def update_metrics(self) -> None:
        """Update dashboard metrics with enhanced calculations."""
        self.total_collections = len(self.collections)
        self.total_documents = sum(col["count"] for col in self.collections)

        # Count active backends
        self.active_backends = 0
        if self.weaviate:
            self.active_backends += 1
        if self.openwebui:
            self.active_backends += 1

        # Update metrics cards if they exist
        try:
            dashboard_tab = self.query_one("#dashboard")
            metrics_cards = dashboard_tab.query(MetricsCard)
            if len(metrics_cards) >= 4:
                # Update existing cards with formatted values
                metrics_cards[0].query_one(".metrics-value", Static).update(
                    f"{self.total_collections:,}"
                )
                metrics_cards[1].query_one(".metrics-value", Static).update(
                    f"{self.total_documents:,}"
                )
                metrics_cards[2].query_one(".metrics-value", Static).update(
                    str(self.active_backends)
                )

                # Update status card based on system health
                if self.active_backends > 0 and self.total_collections > 0:
                    status_text = "🟢 Healthy"
                    status_class = "status-active"
                elif self.active_backends > 0:
                    status_text = "🟡 Ready"
                    status_class = "status-warning"
                else:
                    status_text = "🔴 Offline"
                    status_class = "status-error"

                metrics_cards[3].query_one(".metrics-value", Static).update(status_text)
                metrics_cards[3].add_class(status_class)

        except Exception:
            pass  # Cards might not be rendered yet

        # Update activity feed with real data
        try:
            dashboard_tab = self.query_one("#dashboard")
            activity_feed = dashboard_tab.query_one("#activity_feed", Static)
            if self.collections:
                recent_activity = []
                for col in self.collections[:3]:  # Show top 3 collections
                    recent_activity.append(
                        f"📚 {col['name']}: {col['count']:,} docs ({col.get('size_mb', 0):.1f} MB)"
                    )
                activity_text = "\n".join(recent_activity)
                if len(self.collections) > 3:
                    activity_text += f"\n... and {len(self.collections) - 3} more collections"
            else:
                activity_text = "No collections found. Start by creating your first ingestion!"

            activity_feed.update(activity_text)
        except Exception:
            pass

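    # @work turns this method into a Textual worker; exclusive=True cancels
    # any refresh that is still running before starting a new one, so rapid
    # refresh requests cannot pile up.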
    @work(exclusive=True)
    async def refresh_collections(self) -> None:
        """Refresh collection data with enhanced loading feedback."""
        self.is_loading = True
        loading_indicator = self.query_one("#loading")
        status_text = self.query_one("#status_text", Static)

        loading_indicator.display = True
        status_text.update("🔄 Refreshing collections...")

        try:
            collections = []

            # Get Weaviate collections
            if self.weaviate:
                try:
                    status_text.update("🔗 Connecting to Weaviate...")
                    await self.weaviate.initialize()
                    weaviate_collections = await self.list_weaviate_collections()
                    collections.extend(weaviate_collections)
                    status_text.update("✅ Weaviate collections loaded")
                except Exception as e:
                    self.notify(f"❌ Weaviate error: {e}", severity="error")
                    status_text.update("❌ Weaviate connection failed")

            # Get OpenWebUI collections
            if self.openwebui:
                try:
                    status_text.update("🔗 Connecting to OpenWebUI...")
                    await self.openwebui.initialize()
                    openwebui_collections = await self.list_openwebui_collections()
                    collections.extend(openwebui_collections)
                    status_text.update("✅ OpenWebUI collections loaded")
                except Exception as e:
                    self.notify(f"❌ OpenWebUI error: {e}", severity="error")
                    status_text.update("❌ OpenWebUI connection failed")

            self.collections = collections
            await self.update_collections_table()
            self.update_metrics()
            status_text.update(f"✨ Ready - {len(collections)} collections loaded")

            # Update connection status
            connection_status = self.query_one("#connection_status", StatusIndicator)
            if collections:
                connection_status.update_status("✓ Active")
            else:
                connection_status.update_status("No Data")

        except Exception as e:
            status_text.update(f"❌ Error: {e}")
            self.notify(f"Failed to refresh: {e}", severity="error")
        finally:
            self.is_loading = False
            loading_indicator.display = False

    async def list_weaviate_collections(self) -> list[CollectionInfo]:
        """List Weaviate collections with enhanced metadata."""
        if not self.weaviate:
            return []

        try:
            collections = []
            collections_list = (
                self.weaviate.client.collections.list_all()
                if self.weaviate and self.weaviate.client
                else []
            )
            for collection in collections_list:
                collection_obj = (
                    self.weaviate.client.collections.get(collection)
                    if self.weaviate and self.weaviate.client
                    else None
                )
                if not collection_obj:
                    continue
                count = collection_obj.aggregate.over_all(total_count=True).total_count or 0

                # Estimate size
                size_mb = count * 0.01  # Rough estimate

                collection_info = CollectionInfo(
                    name=collection,
                    type="weaviate",
                    count=count,
                    backend="🗄️ Weaviate",
                    status="✓ Active",
                    last_updated=datetime.now().strftime("%Y-%m-%d %H:%M"),
                    size_mb=size_mb,
                )
                collections.append(collection_info)

            return collections
        except Exception as e:
            self.notify(f"Error listing Weaviate collections: {e}", severity="error")
            return []

    async def list_openwebui_collections(self) -> list[CollectionInfo]:
        """List OpenWebUI collections with enhanced metadata."""
        if not self.openwebui:
            return []

        try:
            response = await self.openwebui.client.get("/api/v1/knowledge/")
            response.raise_for_status()
            knowledge_bases = response.json()

            collections = []
            for kb in knowledge_bases:
                file_count = len(kb.get("files", []))
                size_mb = file_count * 0.5  # Rough estimate

                collection_info = CollectionInfo(
                    name=kb.get("name", "Unknown"),
                    type="openwebui",
                    count=file_count,
                    backend="🌐 OpenWebUI",
                    status="✓ Active",
                    last_updated=kb.get("updated_at", datetime.now().strftime("%Y-%m-%d %H:%M")),
                    size_mb=size_mb,
                )
                collections.append(collection_info)

            return collections
        except Exception as e:
            self.notify(f"Error listing OpenWebUI collections: {e}", severity="error")
            return []

    async def update_collections_table(self) -> None:
        """Update the collections table with enhanced formatting."""
        table = self.query_one("#collections_table", EnhancedDataTable)
        table.clear()

        # Add enhanced columns
        table.add_columns("Collection", "Backend", "Documents", "Size", "Status", "Updated")

        # Add rows with enhanced formatting
        for collection in self.collections:
            # Format size
            size_str = f"{collection['size_mb']:.1f} MB"
            if collection["size_mb"] > 1000:
                size_str = f"{collection['size_mb'] / 1000:.1f} GB"

            # Format document count
            doc_count = f"{collection['count']:,}"

            table.add_row(
                collection["name"],
                collection["backend"],
                doc_count,
                size_str,
                collection["status"],
                collection["last_updated"],
            )

    def get_selected_collection(self) -> CollectionInfo | None:
        """Get the currently selected collection."""
        table = self.query_one("#collections_table", EnhancedDataTable)
        try:
            if table.cursor_coordinate.row < len(self.collections):
                return self.collections[table.cursor_coordinate.row]
        except (AttributeError, IndexError):
            pass
        return None

    # Action methods
    def action_refresh(self) -> None:
        """Refresh collections."""
        self.refresh_collections()

    def action_ingest(self) -> None:
        """Show enhanced ingestion dialog."""
        selected = self.get_selected_collection()
        if selected:
            from .ingestion import IngestionScreen
            self.app.push_screen(IngestionScreen(selected))
        else:
            self.notify("🔍 Please select a collection first", severity="warning")

    def action_manage(self) -> None:
        """Manage documents in selected collection."""
        selected = self.get_selected_collection()
        if selected:
            if selected["type"] == "weaviate":
                from .documents import DocumentManagementScreen
                self.app.push_screen(DocumentManagementScreen(selected, self.weaviate))
            else:
                self.notify(
                    "🚧 Document management only available for Weaviate", severity="warning"
                )
        else:
            self.notify("🔍 Please select a collection first", severity="warning")

    def action_search(self) -> None:
        """Search in selected collection."""
        selected = self.get_selected_collection()
        if selected:
            from .search import SearchScreen
            self.app.push_screen(SearchScreen(selected, self.weaviate, self.openwebui))
        else:
            self.notify("🔍 Please select a collection first", severity="warning")

    def action_delete(self) -> None:
        """Delete selected collection."""
        selected = self.get_selected_collection()
        if selected:
            from .dialogs import ConfirmDeleteScreen
            self.app.push_screen(ConfirmDeleteScreen(selected, self))
        else:
            self.notify("🔍 Please select a collection first", severity="warning")

    def action_tab_dashboard(self) -> None:
        """Switch to dashboard tab."""
        tabs = self.query_one(TabbedContent)
        tabs.active = "dashboard"

    def action_tab_collections(self) -> None:
        """Switch to collections tab."""
        tabs = self.query_one(TabbedContent)
        tabs.active = "collections"

    def action_tab_analytics(self) -> None:
        """Switch to analytics tab."""
        tabs = self.query_one(TabbedContent)
        tabs.active = "analytics"

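    # Tab cycling below relies on modulo arithmetic: Next on the last tab
    # wraps to the first, and Prev on the first wraps to the last.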
    def action_next_tab(self) -> None:
        """Switch to next tab."""
        tabs = self.query_one(TabbedContent)
        tab_ids = ["dashboard", "collections", "analytics"]
        current = tabs.active
        try:
            current_index = tab_ids.index(current)
            next_index = (current_index + 1) % len(tab_ids)
            tabs.active = tab_ids[next_index]
        except (ValueError, AttributeError):
            tabs.active = tab_ids[0]

    def action_prev_tab(self) -> None:
        """Switch to previous tab."""
        tabs = self.query_one(TabbedContent)
        tab_ids = ["dashboard", "collections", "analytics"]
        current = tabs.active
        try:
            current_index = tab_ids.index(current)
            prev_index = (current_index - 1) % len(tab_ids)
            tabs.active = tab_ids[prev_index]
        except (ValueError, AttributeError):
            tabs.active = tab_ids[0]

    def action_help(self) -> None:
        """Show help screen."""
        from .help import HelpScreen
        help_md = """
# 🚀 Modern Collection Management System

## Navigation
- **Tab** / **Shift+Tab**: Switch between tabs
- **Ctrl+1/2/3**: Direct tab access
- **Enter**: Activate selected item
- **Escape**: Go back/cancel
- **Arrow Keys**: Navigate within tables
- **Home/End**: Jump to first/last row
- **Page Up/Down**: Scroll by page

## Collections
- **R**: Refresh collections
- **I**: Start ingestion
- **M**: Manage documents
- **S**: Search collection
- **Ctrl+D**: Delete collection

## Table Navigation
- **Up/Down** or **J/K**: Navigate rows
- **Space**: Toggle selection
- **Ctrl+A**: Select all
- **Ctrl+Shift+A**: Clear selection

## General
- **Q** / **Ctrl+C**: Quit application
- **F1**: Show this help

Enjoy the enhanced interface! 🎉
"""
        self.app.push_screen(HelpScreen(help_md))

    def on_button_pressed(self, event: Button.Pressed) -> None:
        """Handle button presses with enhanced feedback."""
        button_id = event.button.id

        # Add visual feedback
        event.button.add_class("pressed")
        self.call_later(self.remove_pressed_class, event.button)

        if button_id in ("refresh_btn", "quick_refresh"):
            self.action_refresh()
        elif button_id in ("ingest_btn", "quick_ingest"):
            self.action_ingest()
        elif button_id == "manage_btn":
            self.action_manage()
        elif button_id == "delete_btn":
            self.action_delete()
        elif button_id in ("search_btn", "quick_search"):
            self.action_search()
        elif button_id == "quick_settings":
            self.notify("⚙️ Settings panel coming soon!", severity="information")

    def remove_pressed_class(self, button: Button) -> None:
        """Remove pressed visual feedback class."""
        button.remove_class("pressed")
189
ingest_pipeline/cli/tui/screens/dialogs.py
Normal file
@@ -0,0 +1,189 @@
"""Dialog screens for confirmations and user interactions."""

from typing import TYPE_CHECKING

from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Horizontal
from textual.screen import Screen
from textual.widgets import Button, Footer, Header, LoadingIndicator, Static
from typing_extensions import override

from ..models import CollectionInfo

if TYPE_CHECKING:
    from .dashboard import CollectionOverviewScreen
    from .documents import DocumentManagementScreen


class ConfirmDeleteScreen(Screen[None]):
    """Screen for confirming collection deletion."""

    collection: CollectionInfo
    parent_screen: "CollectionOverviewScreen"

    BINDINGS = [
        Binding("escape", "app.pop_screen", "Cancel"),
        Binding("y", "confirm_delete", "Yes"),
        Binding("n", "app.pop_screen", "No"),
        Binding("enter", "confirm_delete", "Confirm"),
    ]

    def __init__(self, collection: CollectionInfo, parent_screen: "CollectionOverviewScreen"):
        super().__init__()
        self.collection = collection
        self.parent_screen = parent_screen

    @override
    def compose(self) -> ComposeResult:
        yield Header()
        yield Container(
            Static("⚠️ Confirm Deletion", classes="title warning"),
            Static(f"Are you sure you want to delete collection '{self.collection['name']}'?"),
            Static(f"Backend: {self.collection['backend']}"),
            Static(f"Documents: {self.collection['count']:,}"),
            Static("This action cannot be undone!", classes="warning"),
            Static("Press Y to confirm, N or Escape to cancel", classes="subtitle"),
            Horizontal(
                Button("✅ Yes, Delete (Y)", id="yes_btn", variant="error"),
                Button("❌ Cancel (N)", id="no_btn", variant="default"),
                classes="action_buttons",
            ),
            classes="main_container center",
        )
        yield Footer()

    def on_mount(self) -> None:
        """Initialize the screen with focus on cancel button for safety."""
        self.query_one("#no_btn").focus()

    def on_button_pressed(self, event: Button.Pressed) -> None:
        """Handle button presses."""
        if event.button.id == "yes_btn":
            self.action_confirm_delete()
        elif event.button.id == "no_btn":
            self.app.pop_screen()

    def action_confirm_delete(self) -> None:
        """Confirm deletion."""
        self.run_worker(self.delete_collection())

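    # Deletion is irreversible. The Weaviate branch uses the v4 client's
    # synchronous collections.delete(); the OpenWebUI branch sends an HTTP
    # DELETE to the knowledge API (assuming the name doubles as the key).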
    async def delete_collection(self) -> None:
        """Delete the collection."""
        try:
            if self.collection["type"] == "weaviate" and self.parent_screen.weaviate:
                # Delete Weaviate collection
                if self.parent_screen.weaviate and self.parent_screen.weaviate.client:
                    self.parent_screen.weaviate.client.collections.delete(self.collection["name"])
                    self.notify(
                        f"Deleted Weaviate collection: {self.collection['name']}",
                        severity="information",
                    )
            elif self.collection["type"] == "openwebui" and self.parent_screen.openwebui:
                # Delete OpenWebUI knowledge base
                response = await self.parent_screen.openwebui.client.delete(
                    f"/api/v1/knowledge/{self.collection['name']}"
                )
                response.raise_for_status()
                self.notify(
                    f"Deleted OpenWebUI collection: {self.collection['name']}",
                    severity="information",
                )

            # Refresh parent screen
            self.parent_screen.refresh_collections()  # Don't await, let it run as a worker
            self.app.pop_screen()

        except Exception as e:
            self.notify(f"Failed to delete collection: {e}", severity="error")


class ConfirmDocumentDeleteScreen(Screen[None]):
    """Screen for confirming document deletion."""

    doc_ids: list[str]
    collection: CollectionInfo
    parent_screen: "DocumentManagementScreen"

    BINDINGS = [
        Binding("escape", "app.pop_screen", "Cancel"),
        Binding("y", "confirm_delete", "Yes"),
        Binding("n", "app.pop_screen", "No"),
        Binding("enter", "confirm_delete", "Confirm"),
    ]

    def __init__(
        self,
        doc_ids: list[str],
        collection: CollectionInfo,
        parent_screen: "DocumentManagementScreen",
    ):
        super().__init__()
        self.doc_ids = doc_ids
        self.collection = collection
        self.parent_screen = parent_screen

    @override
    def compose(self) -> ComposeResult:
        yield Header()
        yield Container(
            Static("⚠️ Confirm Document Deletion", classes="title warning"),
            Static(
                f"Are you sure you want to delete {len(self.doc_ids)} documents from '{self.collection['name']}'?"
            ),
            Static("This action cannot be undone!", classes="warning"),
            Static("Press Y to confirm, N or Escape to cancel", classes="subtitle"),
            Horizontal(
                Button("✅ Yes, Delete (Y)", id="yes_btn", variant="error"),
                Button("❌ Cancel (N)", id="no_btn", variant="default"),
                classes="action_buttons",
            ),
            LoadingIndicator(id="loading"),
            classes="main_container center",
        )
        yield Footer()

    def on_mount(self) -> None:
        """Initialize the screen with focus on cancel button for safety."""
        self.query_one("#loading").display = False
        self.query_one("#no_btn").focus()

    def on_button_pressed(self, event: Button.Pressed) -> None:
        """Handle button presses."""
        if event.button.id == "yes_btn":
            self.action_confirm_delete()
        elif event.button.id == "no_btn":
            self.app.pop_screen()

    def action_confirm_delete(self) -> None:
        """Confirm deletion."""
        self.run_worker(self.delete_documents())

    async def delete_documents(self) -> None:
        """Delete the selected documents."""
        loading = self.query_one("#loading")
        loading.display = True

        try:
            if self.parent_screen.weaviate:
                # Delete documents
                results = await self.parent_screen.weaviate.delete_documents(self.doc_ids)

                # Count successful deletions
                successful = sum(1 for success in results.values() if success)
                failed = len(results) - successful

                if successful > 0:
                    self.notify(f"Deleted {successful} documents", severity="information")
                if failed > 0:
                    self.notify(f"Failed to delete {failed} documents", severity="error")

            # Clear selection and refresh parent screen
            self.parent_screen.selected_docs.clear()
            await self.parent_screen.load_documents()
            self.app.pop_screen()

        except Exception as e:
            self.notify(f"Failed to delete documents: {e}", severity="error")
        finally:
            loading.display = False
279
ingest_pipeline/cli/tui/screens/documents.py
Normal file
@@ -0,0 +1,279 @@
"""Document management screen with enhanced navigation."""

from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Horizontal
from textual.screen import Screen
from textual.widgets import Button, Footer, Header, Label, LoadingIndicator, Static
from typing_extensions import override

from ....storage.weaviate import WeaviateStorage
from ..models import CollectionInfo, DocumentInfo
from ..widgets import EnhancedDataTable


class DocumentManagementScreen(Screen[None]):
    """Screen for managing documents within a collection with enhanced keyboard navigation."""

    collection: CollectionInfo
    weaviate: WeaviateStorage | None
    documents: list[DocumentInfo]
    selected_docs: set[str]
    current_offset: int
    page_size: int

    BINDINGS = [
        Binding("escape", "app.pop_screen", "Back"),
        Binding("r", "refresh", "Refresh"),
        Binding("delete", "delete_selected", "Delete Selected"),
        Binding("a", "select_all", "Select All"),
        Binding("ctrl+a", "select_all", "Select All"),
        Binding("n", "select_none", "Clear Selection"),
        Binding("ctrl+shift+a", "select_none", "Clear Selection"),
        Binding("space", "toggle_selection", "Toggle Selection"),
        Binding("ctrl+d", "delete_selected", "Delete Selected"),
        Binding("pageup", "prev_page", "Previous Page"),
        Binding("pagedown", "next_page", "Next Page"),
        Binding("home", "first_page", "First Page"),
        Binding("end", "last_page", "Last Page"),
    ]

    def __init__(self, collection: CollectionInfo, weaviate: WeaviateStorage | None):
        super().__init__()
        self.collection = collection
        self.weaviate = weaviate
        self.documents: list[DocumentInfo] = []
        self.selected_docs: set[str] = set()
        self.current_offset = 0
        self.page_size = 50

    @override
    def compose(self) -> ComposeResult:
        yield Header()
        yield Container(
            Static(f"📄 Document Management: {self.collection['name']}", classes="title"),
            Static(
                f"Total Documents: {self.collection['count']:,} | Use Space to select, Delete to remove",
                classes="subtitle",
            ),
            Label(f"Page size: {self.page_size} documents"),
            EnhancedDataTable(id="documents_table", classes="enhanced-table"),
            Horizontal(
                Button("🔄 Refresh", id="refresh_docs_btn", variant="primary"),
                Button("🗑️ Delete Selected", id="delete_selected_btn", variant="error"),
                Button("✅ Select All", id="select_all_btn", variant="default"),
                Button("❌ Clear Selection", id="clear_selection_btn", variant="default"),
                Button("⬅️ Previous Page", id="prev_page_btn", variant="default"),
                Button("➡️ Next Page", id="next_page_btn", variant="default"),
                classes="button_bar",
            ),
            Label("", id="selection_status"),
            Static("", id="page_info", classes="status-text"),
            LoadingIndicator(id="loading"),
            classes="main_container",
        )
        yield Footer()

    async def on_mount(self) -> None:
        """Initialize the screen."""
        self.query_one("#loading").display = False

        # Setup documents table
        table = self.query_one("#documents_table", EnhancedDataTable)
        table.add_columns("✓", "Title", "Source URL", "Words", "ID")

        # Set up message handling for table events
        table.can_focus = True

        await self.load_documents()

    async def load_documents(self) -> None:
        """Load documents from the collection."""
        loading = self.query_one("#loading")
        loading.display = True

        try:
            if self.weaviate:
                # Set the collection name
                self.weaviate.config.collection_name = self.collection["name"]

                # Load documents with pagination
                raw_docs = await self.weaviate.list_documents(
                    limit=self.page_size, offset=self.current_offset
                )
                # Cast to proper type with type checking
                self.documents = [
                    DocumentInfo(
                        id=str(doc["id"]),
                        title=str(doc["title"]),
                        source_url=str(doc["source_url"]),
                        content_preview=str(doc["content_preview"]),
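                        # Defensive cast: word_count may arrive as an int or a
                        # numeric string; anything else falls back to 0.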
                        word_count=int(doc["word_count"])
                        if isinstance(doc["word_count"], (int, str))
                        and str(doc["word_count"]).isdigit()
                        else 0,
                        timestamp=str(doc["timestamp"]),
                    )
                    for doc in raw_docs
                ]

            await self.update_table()
            self.update_selection_status()
            self.update_page_info()

        except Exception as e:
            self.notify(f"Error loading documents: {e}", severity="error")
        finally:
            loading.display = False

    async def update_table(self) -> None:
        """Update the documents table."""
        table = self.query_one("#documents_table", EnhancedDataTable)
        table.clear()

        # Re-add columns
        table.add_columns("✓", "Title", "Source URL", "Words", "ID")

        # Add rows
        for doc in self.documents:
            selected = "✓" if doc["id"] in self.selected_docs else ""
            table.add_row(
                selected,
                doc.get("title", "Untitled")[:50],
                doc.get("source_url", "")[:50],
                str(doc.get("word_count", 0)),
                doc["id"][:8] + "...",  # Show truncated ID
            )

    def update_selection_status(self) -> None:
        """Update the selection status label."""
        status_label = self.query_one("#selection_status", Label)
        total_selected = len(self.selected_docs)
        status_label.update(f"Selected: {total_selected} documents")

    def update_page_info(self) -> None:
        """Update the page information."""
        page_info = self.query_one("#page_info", Static)
        total_docs = self.collection["count"]
        start = self.current_offset + 1
        end = min(self.current_offset + len(self.documents), total_docs)
        page_num = (self.current_offset // self.page_size) + 1
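        # Ceiling division: (total + size - 1) // size rounds up so a
        # partial final page still counts as a full page.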
        total_pages = (total_docs + self.page_size - 1) // self.page_size

        page_info.update(
            f"Showing {start:,}-{end:,} of {total_docs:,} documents (Page {page_num} of {total_pages})"
        )

    def get_current_document(self) -> DocumentInfo | None:
        """Get the currently selected document."""
        table = self.query_one("#documents_table", EnhancedDataTable)
        try:
            if 0 <= table.cursor_coordinate.row < len(self.documents):
                return self.documents[table.cursor_coordinate.row]
        except (AttributeError, IndexError):
            pass
        return None

    # Action methods
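    # Note: several sync action_* handlers below pass coroutines to
    # self.run_worker(), which runs them on Textual's worker machinery so a
    # key press can trigger an async reload without blocking the UI.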
    def action_refresh(self) -> None:
        """Refresh the document list."""
        self.run_worker(self.load_documents())

    def action_toggle_selection(self) -> None:
        """Toggle selection of current row."""
        doc = self.get_current_document()
        if doc:
            doc_id = doc["id"]
            if doc_id in self.selected_docs:
                self.selected_docs.remove(doc_id)
            else:
                self.selected_docs.add(doc_id)

            self.run_worker(self.update_table())
            self.update_selection_status()

    def action_select_all(self) -> None:
        """Select all documents on current page."""
        for doc in self.documents:
            self.selected_docs.add(doc["id"])
        self.run_worker(self.update_table())
        self.update_selection_status()

    def action_select_none(self) -> None:
        """Clear all selections."""
        self.selected_docs.clear()
        self.run_worker(self.update_table())
        self.update_selection_status()

    def action_delete_selected(self) -> None:
        """Delete selected documents."""
        if self.selected_docs:
            from .dialogs import ConfirmDocumentDeleteScreen
            self.app.push_screen(
                ConfirmDocumentDeleteScreen(list(self.selected_docs), self.collection, self)
            )
        else:
            self.notify("No documents selected", severity="warning")

    def action_next_page(self) -> None:
        """Go to next page."""
        if self.current_offset + self.page_size < self.collection["count"]:
            self.current_offset += self.page_size
            self.run_worker(self.load_documents())

    def action_prev_page(self) -> None:
        """Go to previous page."""
        if self.current_offset >= self.page_size:
            self.current_offset -= self.page_size
            self.run_worker(self.load_documents())

    def action_first_page(self) -> None:
        """Go to first page."""
        if self.current_offset > 0:
            self.current_offset = 0
            self.run_worker(self.load_documents())

    def action_last_page(self) -> None:
        """Go to last page."""
        total_docs = self.collection["count"]
        last_offset = ((total_docs - 1) // self.page_size) * self.page_size
        if self.current_offset != last_offset:
            self.current_offset = last_offset
            self.run_worker(self.load_documents())

    def on_button_pressed(self, event: Button.Pressed) -> None:
        """Handle button presses."""
        if event.button.id == "refresh_docs_btn":
            self.action_refresh()
        elif event.button.id == "delete_selected_btn":
            self.action_delete_selected()
        elif event.button.id == "select_all_btn":
            self.action_select_all()
        elif event.button.id == "clear_selection_btn":
            self.action_select_none()
        elif event.button.id == "next_page_btn":
            self.action_next_page()
        elif event.button.id == "prev_page_btn":
            self.action_prev_page()

    def on_enhanced_data_table_row_toggled(self, event: EnhancedDataTable.RowToggled) -> None:
        """Handle row toggle from enhanced table."""
        if 0 <= event.row_index < len(self.documents):
            doc = self.documents[event.row_index]
            doc_id = doc["id"]

            if doc_id in self.selected_docs:
                self.selected_docs.remove(doc_id)
            else:
                self.selected_docs.add(doc_id)

            self.run_worker(self.update_table())
            self.update_selection_status()

    def on_enhanced_data_table_select_all(self, event: EnhancedDataTable.SelectAll) -> None:
        """Handle select all from enhanced table."""
        self.action_select_all()

    def on_enhanced_data_table_clear_selection(self, event: EnhancedDataTable.ClearSelection) -> None:
        """Handle clear selection from enhanced table."""
        self.action_select_none()
50
ingest_pipeline/cli/tui/screens/help.py
Normal file
@@ -0,0 +1,50 @@
"""Help screen with keyboard shortcuts and usage information."""

from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, ScrollableContainer
from textual.screen import ModalScreen
from textual.widgets import Button, Markdown, Rule, Static
from typing_extensions import override


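# ModalScreen is drawn on top of the calling screen, so the help overlay
# can be popped without disturbing the underlying state.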
class HelpScreen(ModalScreen[None]):
    """Modern help screen with comprehensive keyboard shortcuts."""

    help_content: str

    BINDINGS = [
        Binding("escape", "app.pop_screen", "Close"),
        Binding("q", "app.pop_screen", "Close"),
        Binding("enter", "app.pop_screen", "Close"),
        Binding("f1", "app.pop_screen", "Close"),
    ]

    def __init__(self, help_content: str):
        super().__init__()
        self.help_content = help_content

    @override
    def compose(self) -> ComposeResult:
        with Container(classes="modal-container"):
            yield Static("📚 Help & Keyboard Shortcuts", classes="title")
            yield Static("Enhanced navigation and productivity features", classes="subtitle")
            yield Rule(line_style="heavy")

            with ScrollableContainer():
                yield Markdown(self.help_content)

            yield Container(
                Button("✅ Got it! (Press Escape or Enter)", id="close_btn", variant="primary"),
                classes="action_buttons center",
            )

    def on_mount(self) -> None:
        """Initialize the help screen."""
        # Focus the close button
        self.query_one("#close_btn").focus()

    def on_button_pressed(self, event: Button.Pressed) -> None:
        """Close help screen."""
        if event.button.id == "close_btn":
            self.app.pop_screen()
253
ingest_pipeline/cli/tui/screens/ingestion.py
Normal file
@@ -0,0 +1,253 @@
"""Enhanced ingestion screen with better UX."""

import asyncio
from datetime import datetime

from textual import work
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Horizontal
from textual.screen import ModalScreen
from textual.widgets import Button, Input, Label, LoadingIndicator, Rule, Static
from typing_extensions import override

from ....core.models import IngestionJob, IngestionSource, StorageBackend
from ..models import CollectionInfo
from ..widgets import EnhancedProgressBar


class IngestionScreen(ModalScreen[None]):
    """Enhanced ingestion screen with better UX and keyboard navigation."""

    collection: CollectionInfo
    selected_type: IngestionSource
    progress_value: int

    BINDINGS = [
        Binding("escape", "app.pop_screen", "Cancel"),
        Binding("ctrl+i", "start_ingestion", "Start"),
        Binding("1", "select_web", "Web", show=False),
        Binding("2", "select_repo", "Repository", show=False),
        Binding("3", "select_docs", "Documentation", show=False),
        Binding("enter", "start_ingestion", "Start Ingestion"),
        Binding("tab", "focus_next", "Next Field"),
        Binding("shift+tab", "focus_previous", "Previous Field"),
    ]

    def __init__(self, collection: CollectionInfo):
        super().__init__()
        self.collection = collection
        self.selected_type = IngestionSource.WEB
        self.progress_value = 0

    @override
    def compose(self) -> ComposeResult:
        with Container(classes="modal-container"):
            yield Static("📥 Modern Ingestion Interface", classes="title")
            yield Static(
                f"Target: {self.collection['name']} ({self.collection['backend']})",
                classes="subtitle",
            )
            yield Rule()

            # Enhanced input section
            yield Container(
                Label("🌐 Source URL:", classes="input-label"),
                Input(
                    placeholder="https://docs.example.com or file:///path/to/repo",
                    id="url_input",
                    classes="modern-input",
                ),
                Label("📋 Source Type (Press 1/2/3):", classes="input-label"),
                Horizontal(
                    Button("🌐 Web (1)", id="web_btn", variant="primary", classes="type-button"),
                    Button(
                        "📦 Repository (2)", id="repo_btn", variant="default", classes="type-button"
                    ),
                    Button(
                        "📖 Documentation (3)", id="docs_btn", variant="default", classes="type-button"
                    ),
                    classes="type_buttons",
                ),
                Rule(line_style="dashed"),
                classes="input-section card",
            )

            # Enhanced Progress section
            yield Container(
                Label("🔄 Progress:", classes="progress-label"),
                EnhancedProgressBar(id="enhanced_progress", total=100),
                Static("Ready to start", id="progress_text", classes="status-text"),
                classes="progress-section card",
            )

            # Action buttons
            yield Horizontal(
                Button("🚀 Start Ingestion", id="start_btn", variant="success"),
                Button("❌ Cancel", id="cancel_btn", variant="error"),
                classes="action_buttons",
            )

            yield LoadingIndicator(id="loading", classes="pulse")

    def on_mount(self) -> None:
        """Initialize the screen."""
        self.query_one("#loading").display = False
        self.selected_type = IngestionSource.WEB
        # Focus the URL input field by default
        self.query_one("#url_input").focus()

    def action_select_web(self) -> None:
        """Select web ingestion type."""
        self.selected_type = IngestionSource.WEB
        self.update_type_buttons("web")

    def action_select_repo(self) -> None:
        """Select repository ingestion type."""
        self.selected_type = IngestionSource.REPOSITORY
        self.update_type_buttons("repo")

    def action_select_docs(self) -> None:
        """Select documentation ingestion type."""
        self.selected_type = IngestionSource.DOCUMENTATION
        self.update_type_buttons("docs")

    def on_button_pressed(self, event: Button.Pressed) -> None:
        """Handle button presses with enhanced feedback."""
        button_id = event.button.id

        if button_id == "web_btn":
            self.action_select_web()
        elif button_id == "repo_btn":
            self.action_select_repo()
        elif button_id == "docs_btn":
            self.action_select_docs()
        elif button_id == "start_btn":
            self.action_start_ingestion()
        elif button_id == "cancel_btn":
            self.app.pop_screen()

    def update_type_buttons(self, selected: str) -> None:
        """Update type button visual states."""
        buttons = {
            "web": self.query_one("#web_btn", Button),
            "repo": self.query_one("#repo_btn", Button),
            "docs": self.query_one("#docs_btn", Button),
        }

        for btn_type, button in buttons.items():
            if btn_type == selected:
                button.variant = "primary"
            else:
                button.variant = "default"

    def on_input_submitted(self, event: Input.Submitted) -> None:
        """Handle URL input submission."""
        if event.input.id == "url_input":
            self.action_start_ingestion()

    def action_start_ingestion(self) -> None:
        """Start the enhanced ingestion process."""
        url_input = self.query_one("#url_input", Input)
        if not url_input.value.strip():
            self.notify("🔍 Please enter a source URL", severity="error")
            url_input.focus()
            return

        self.perform_ingestion(url_input.value.strip())

    @work(exclusive=True)
    async def perform_ingestion(self, source_url: str) -> None:
        """Perform ingestion with enhanced progress tracking and better UX."""
        loading = self.query_one("#loading")
        enhanced_progress = self.query_one("#enhanced_progress", EnhancedProgressBar)
        progress_text = self.query_one("#progress_text", Static)

        try:
            loading.display = True

            # Enhanced progress tracking with better visual feedback
            enhanced_progress.update_progress(5, "Initializing ingestion pipeline...")
            progress_text.update("🚀 Starting modern ingestion process...")
            await asyncio.sleep(0.3)

            # Determine storage backend
            storage_backend = (
                StorageBackend.WEAVIATE
                if self.collection["type"] == "weaviate"
                else StorageBackend.OPEN_WEBUI
            )

            enhanced_progress.update_progress(15, "Creating ingestion job...")
            progress_text.update("📋 Configuring job parameters...")
            await asyncio.sleep(0.4)

            # Create ingestion job
            job = IngestionJob(
                source_url=source_url,
                source_type=self.selected_type,
                storage_backend=storage_backend,
                created_at=datetime.now(),
            )

            enhanced_progress.update_progress(25, "Loading ingestion modules...")
            progress_text.update("⚡ Importing processing components...")
            await asyncio.sleep(0.4)

            from ....flows.ingestion import ingest_documents_task

            enhanced_progress.update_progress(35, "Connecting to services...")
            progress_text.update(f"🔗 Establishing connection to {storage_backend.value}...")
            await asyncio.sleep(0.5)

            enhanced_progress.update_progress(45, "Fetching source content...")
            progress_text.update("📄 Retrieving documents from source...")
            await asyncio.sleep(0.6)

            # Simulate realistic progress steps
            progress_steps = [
                (55, "Parsing document structure...", "🔍 Analyzing content structure..."),
                (65, "Extracting text content...", "📝 Processing text and metadata..."),
                (75, "Generating embeddings...", "🧠 Creating vector embeddings..."),
                (85, "Storing in database...", "💾 Persisting to storage backend..."),
                (95, "Finalizing operation...", "🎯 Completing ingestion process..."),
            ]

            for progress, status, text in progress_steps:
                enhanced_progress.update_progress(progress, status)
                progress_text.update(text)
                await asyncio.sleep(0.7)

            # Perform actual ingestion
            successful, failed = await ingest_documents_task(
                job, collection_name=self.collection["name"]
            )

            # Success handling with celebratory feedback
            enhanced_progress.update_progress(100, "Completed successfully!")
            progress_text.update(
                f"🎉 Ingestion complete: {successful} documents added, {failed} failed"
            )

            # Show enhanced success notification
            if successful > 0:
                self.notify(
                    f"🎉 Successfully ingested {successful} documents!",
                    severity="information",
                )
                if failed > 0:
                    self.notify(f"⚠️ {failed} documents failed to process", severity="warning")
            else:
                self.notify("❌ No documents were successfully processed", severity="error")

            # Keep results visible before closing
            await asyncio.sleep(3)
            self.app.pop_screen()

        except Exception as e:
            enhanced_progress.update_progress(0, "Ingestion failed")
            progress_text.update(f"❌ Error occurred: {str(e)[:100]}")
            self.notify(f"❌ Ingestion failed: {e}", severity="error")
            await asyncio.sleep(2)  # Show error before allowing interaction
        finally:
            loading.display = False
190  ingest_pipeline/cli/tui/screens/search.py  Normal file
@@ -0,0 +1,190 @@
"""Search screen for finding documents within collections."""

from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container
from textual.screen import Screen
from textual.widgets import Button, Footer, Header, Input, LoadingIndicator, Static
from typing_extensions import override

from ....storage.openwebui import OpenWebUIStorage
from ....storage.weaviate import WeaviateStorage
from ..models import CollectionInfo
from ..widgets import EnhancedDataTable


class SearchScreen(Screen[None]):
    """Screen for searching within a collection with enhanced keyboard navigation."""

    collection: CollectionInfo
    weaviate: WeaviateStorage | None
    openwebui: OpenWebUIStorage | None

    BINDINGS = [
        Binding("escape", "app.pop_screen", "Back"),
        Binding("enter", "perform_search", "Search"),
        Binding("ctrl+f", "focus_search", "Focus Search"),
        Binding("f3", "perform_search", "Search Again"),
        Binding("ctrl+r", "clear_results", "Clear Results"),
        Binding("/", "focus_search", "Quick Search"),
    ]

    def __init__(
        self,
        collection: CollectionInfo,
        weaviate: WeaviateStorage | None,
        openwebui: OpenWebUIStorage | None,
    ):
        super().__init__()
        self.collection = collection
        self.weaviate = weaviate
        self.openwebui = openwebui

    @override
    def compose(self) -> ComposeResult:
        yield Header()
        yield Container(
            Static(
                f"🔍 Search in: {self.collection['name']} ({self.collection['backend']})",
                classes="title",
            ),
            Static("Press / or Ctrl+F to focus search, Enter to search", classes="subtitle"),
            Input(placeholder="Enter search query... (press Enter to search)", id="search_input"),
            Button("🔍 Search", id="search_btn", variant="primary"),
            Button("🗑️ Clear Results", id="clear_btn", variant="default"),
            EnhancedDataTable(id="results_table"),
            Static("Enter your search query to find relevant documents.", id="search_status", classes="status-text"),
            LoadingIndicator(id="loading"),
            classes="main_container",
        )
        yield Footer()

    def on_mount(self) -> None:
        """Initialize the screen."""
        self.query_one("#loading").display = False

        # Setup results table
        table = self.query_one("#results_table", EnhancedDataTable)
        table.add_columns("Title", "Content Preview", "Score")

        # Focus search input
        self.query_one("#search_input").focus()

    def action_focus_search(self) -> None:
        """Focus the search input field."""
        search_input = self.query_one("#search_input", Input)
        search_input.focus()

    def action_clear_results(self) -> None:
        """Clear search results."""
        table = self.query_one("#results_table", EnhancedDataTable)
        table.clear()
        table.add_columns("Title", "Content Preview", "Score")

        status = self.query_one("#search_status", Static)
        status.update("Search results cleared. Enter a new query to search.")

    def on_input_submitted(self, event: Input.Submitted) -> None:
        """Handle search input submission."""
        if event.input.id == "search_input":
            self.action_perform_search()

    def on_button_pressed(self, event: Button.Pressed) -> None:
        """Handle button presses."""
        if event.button.id == "search_btn":
            self.action_perform_search()
        elif event.button.id == "clear_btn":
            self.action_clear_results()

    def action_perform_search(self) -> None:
        """Perform search."""
        search_input = self.query_one("#search_input", Input)
        if not search_input.value.strip():
            self.notify("Please enter a search query", severity="warning")
            search_input.focus()
            return

        self.run_worker(self.search_collection(search_input.value.strip()))

    async def search_collection(self, query: str) -> None:
        """Search the collection."""
        loading = self.query_one("#loading")
        table = self.query_one("#results_table", EnhancedDataTable)
        status = self.query_one("#search_status", Static)

        try:
            loading.display = True
            status.update(f"🔍 Searching for '{query}'...")
            table.clear()
            table.add_columns("Title", "Content Preview", "Score")

            results = []

            if self.collection["type"] == "weaviate" and self.weaviate:
                results = await self.search_weaviate(query)
            elif self.collection["type"] == "openwebui" and self.openwebui:
                results = await self.search_openwebui(query)

            # Add results to table
            for result in results:
                title = result.get("title", "Untitled")
                content = result.get("content", "")
                score = result.get("score", 0)
                table.add_row(
                    title[:50] if isinstance(title, str) else str(title)[:50],
                    (content[:100] + "...")
                    if isinstance(content, str)
                    else str(content)[:100] + "...",
                    f"{score:.3f}" if isinstance(score, (int, float)) else str(score),
                )

            if not results:
                status.update(f"No results found for '{query}'. Try different keywords.")
                self.notify("No results found", severity="information")
            else:
                status.update(f"Found {len(results)} results for '{query}'. Use arrow keys to navigate.")
                self.notify(f"Found {len(results)} results", severity="information")
                # Focus the table for navigation
                table.focus()

        except Exception as e:
            status.update(f"Search error: {e}")
            self.notify(f"Search error: {e}", severity="error")
        finally:
            loading.display = False

    async def search_weaviate(self, query: str) -> list[dict[str, str | float]]:
        """Search Weaviate collection."""
        if not self.weaviate:
            return []

        try:
            await self.weaviate.initialize()
            results_generator = self.weaviate.search(query, limit=20)
            results = [doc async for doc in results_generator]
            # Convert Document objects to dict format expected by the UI
            return [
                {
                    "title": getattr(doc, "title", "Untitled"),
                    "content": getattr(doc, "content", ""),
                    "score": getattr(doc, "score", 0.0),
                }
                for doc in results
            ]
        except Exception as e:
            self.notify(f"Weaviate search error: {e}", severity="error")
            return []

    async def search_openwebui(self, query: str) -> list[dict[str, str | float]]:
        """Search OpenWebUI collection."""
        if not self.openwebui:
            return []

        try:
            # OpenWebUI does not have a direct search API, so return empty.
            # In a real implementation, you would need to implement search via their API.
            self.notify("OpenWebUI search not yet implemented", severity="warning")
            return []
        except Exception as e:
            self.notify(f"OpenWebUI search error: {e}", severity="error")
            return []
346  ingest_pipeline/cli/tui/styles.py  Normal file
@@ -0,0 +1,346 @@
"""Modern CSS styles for the TUI application."""

# Enhanced modern CSS with better focus indicators and navigation feedback
TUI_CSS = """
/* Base styling */
Screen {
    background: #1a1a1a;
}

* {
    color: #ffffff;
}

/* Title styling */
.title {
    text-align: center;
    margin: 1;
    color: #ffffff;
    text-style: bold;
    background: #333333;
    padding: 1;
    border: solid #0088cc;
}

.subtitle {
    text-align: center;
    margin: 1 0;
    color: #cccccc;
    text-style: italic;
    background: #333333;
    padding: 1;
}

/* Container styling */
.main_container {
    margin: 1;
    padding: 1;
    background: #333333;
}

.card {
    background: #333333;
    padding: 1;
    margin: 1;
    color: #ffffff;
    border: solid #444444;
}

.card:focus-within {
    border: solid #0088cc;
}

/* Button styling with focus states */
Button {
    background: #444444;
    color: #ffffff;
    margin: 0 1;
    border: solid transparent;
}

Button:hover {
    background: #0088cc;
    color: #ffffff;
}

Button:focus {
    border: solid #ffffff;
    background: #0088cc;
}

Button.-primary {
    background: #0088cc;
    color: #ffffff;
}

Button.-success {
    background: #28a745;
    color: #ffffff;
}

Button.-error {
    background: #dc3545;
    color: #ffffff;
}

Button.-warning {
    background: #ffc107;
    color: #000000;
}

/* Enhanced DataTable with focus indicators */
DataTable {
    background: #333333;
    color: #ffffff;
    border: solid #444444;
}

DataTable:focus {
    border: solid #0088cc;
}

DataTable > .datatable--header {
    background: #444444;
    color: #ffffff;
    text-style: bold;
}

DataTable > .datatable--cursor {
    background: #0088cc;
    color: #ffffff;
}

DataTable > .datatable--cursor-row {
    background: #0066aa;
    color: #ffffff;
}

/* Input styling */
Input {
    background: #333333;
    color: #ffffff;
    border: solid #666666;
}

Input:focus {
    border: solid #0088cc;
}

/* Header and Footer */
Header, Footer {
    background: #333333;
    color: #ffffff;
}

/* Tab styling with focus indicators */
Tab {
    background: #333333;
    color: #ffffff;
    border: solid transparent;
}

Tab:focus {
    border: solid #ffffff;
}

Tab.-active {
    background: #0088cc;
    color: #ffffff;
    text-style: bold;
}

/* Label styling */
Label {
    color: #ffffff;
}

/* Status indicators */
.status-active {
    color: #28a745;
}

.status-error {
    color: #dc3545;
}

.status-warning {
    color: #ffc107;
}

/* Animations */
.pulse {
    text-style: blink;
}

.glow {
    background: #0088cc;
    color: #ffffff;
}

.shimmer {
    text-style: italic;
}

/* Metrics styling */
.metrics-value {
    text-style: bold;
    text-align: center;
    color: #ffffff;
}

.metrics-label {
    text-align: center;
    color: #cccccc;
}

.metrics-description {
    text-align: center;
    color: #999999;
    text-style: italic;
}

/* Section titles */
.section-title {
    text-style: bold;
    color: #ffffff;
    margin: 1 0;
}

/* Status text */
.status-text {
    color: #cccccc;
}

/* Button groups */
.button_bar {
    margin: 1 0;
}

.action_buttons {
    margin: 1;
    text-align: center;
}

/* Progress styling */
.progress-label {
    color: #ffffff;
    margin: 1 0;
}

/* Responsive grid */
.responsive-grid {
    grid-size: 4;
    grid-gutter: 1;
}

.metrics-grid {
    grid-size: 4;
    grid-gutter: 1;
    margin: 1;
}

/* Modal container */
.modal-container {
    background: #333333;
    border: solid #0088cc;
    padding: 2;
    margin: 2;
}

/* Chart placeholders */
.chart-title {
    text-style: bold;
    color: #ffffff;
    margin: 1 0;
}

.chart-placeholder {
    color: #999999;
    text-style: italic;
    text-align: center;
    padding: 2;
}

/* Analytics grid */
.analytics-grid {
    grid-size: 2;
    grid-gutter: 1;
}

/* Enhanced table styling */
.enhanced-table {
    background: #333333;
    color: #ffffff;
    border: solid #666666;
}

.enhanced-table:focus {
    border: solid #0088cc;
}

/* Status bar */
.status-bar {
    background: #444444;
    color: #ffffff;
    padding: 0 1;
}

/* Input section styling */
.input-section {
    margin: 1;
    padding: 1;
}

.input-label {
    color: #ffffff;
    margin: 1 0;
}

.modern-input {
    background: #333333;
    color: #ffffff;
    border: solid #666666;
    margin: 1 0;
}

.modern-input:focus {
    border: solid #0088cc;
}

/* Type buttons */
.type_buttons {
    margin: 1 0;
}

.type-button {
    margin: 0 1;
}

/* Progress section */
.progress-section {
    margin: 1;
    padding: 1;
}

/* Center alignment */
.center {
    text-align: center;
}

/* Warning styling */
.warning {
    color: #ffc107;
    text-style: bold;
}

/* Pressed button state */
.pressed {
    background: #006699;
    color: #ffffff;
}

/* Focus ring for better accessibility */
*:focus {
    outline: solid #0088cc;
}
"""
5  ingest_pipeline/cli/tui/utils/__init__.py  Normal file
@@ -0,0 +1,5 @@
"""Utility functions for the TUI."""

from .runners import dashboard, run_textual_tui

__all__ = ["dashboard", "run_textual_tui"]
64  ingest_pipeline/cli/tui/utils/runners.py  Normal file
@@ -0,0 +1,64 @@
"""TUI runner functions and initialization."""

import asyncio

from ....core.models import StorageBackend, StorageConfig
from ....storage.openwebui import OpenWebUIStorage
from ....storage.weaviate import WeaviateStorage
from ..app import CollectionManagementApp


async def run_textual_tui() -> None:
    """Run the enhanced modern TUI with better error handling and initialization."""
    from ....config.settings import get_settings

    settings = get_settings()

    # Initialize storage backends with enhanced error handling
    weaviate = None
    openwebui = None

    print("🚀 Initializing Modern Collection Management System...")

    try:
        print("🔗 Connecting to Weaviate...")
        weaviate_config = StorageConfig(
            backend=StorageBackend.WEAVIATE,
            endpoint=settings.weaviate_endpoint,
            api_key=settings.weaviate_api_key,
            collection_name="default",
        )
        weaviate = WeaviateStorage(weaviate_config)
        await weaviate.initialize()
        print("✅ Weaviate connected successfully!")
    except Exception as e:
        print(f"⚠️ Weaviate connection failed: {e}")

    try:
        print("🔗 Connecting to OpenWebUI...")
        openwebui_config = StorageConfig(
            backend=StorageBackend.OPEN_WEBUI,
            endpoint=settings.openwebui_endpoint,
            api_key=settings.openwebui_api_key,
            collection_name="default",
        )
        openwebui = OpenWebUIStorage(openwebui_config)
        await openwebui.initialize()
        print("✅ OpenWebUI connected successfully!")
    except Exception as e:
        print(f"⚠️ OpenWebUI connection failed: {e}")

    if not weaviate and not openwebui:
        print("❌ Error: Could not connect to any storage backend")
        print("Please check your configuration and try again.")
        return

    print("🎉 Launching Enhanced TUI with Keyboard Navigation...")

    app = CollectionManagementApp(weaviate, openwebui)
    await app.run_async()


def dashboard() -> None:
    """Launch the modern collection dashboard."""
    asyncio.run(run_textual_tui())
12  ingest_pipeline/cli/tui/widgets/__init__.py  Normal file
@@ -0,0 +1,12 @@
"""Enhanced widgets with keyboard navigation support."""

from .cards import MetricsCard
from .indicators import EnhancedProgressBar, StatusIndicator
from .tables import EnhancedDataTable

__all__ = [
    "MetricsCard",
    "StatusIndicator",
    "EnhancedProgressBar",
    "EnhancedDataTable",
]
28  ingest_pipeline/cli/tui/widgets/cards.py  Normal file
@@ -0,0 +1,28 @@
"""Metrics card widget."""

from typing import Any

from textual.app import ComposeResult
from textual.widgets import Static
from typing_extensions import override


class MetricsCard(Static):
    """A modern metrics display card."""

    title: str
    value: str
    description: str

    def __init__(self, title: str, value: str, description: str = "", **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self.title = title
        self.value = value
        self.description = description

    @override
    def compose(self) -> ComposeResult:
        yield Static(self.value, classes="metrics-value")
        yield Static(self.title, classes="metrics-label")
        if self.description:
            yield Static(self.description, classes="metrics-description")
86  ingest_pipeline/cli/tui/widgets/indicators.py  Normal file
@@ -0,0 +1,86 @@
"""Status indicators and progress bars with enhanced visual feedback."""

from typing import Any

from textual.app import ComposeResult
from textual.widgets import ProgressBar, Static
from typing_extensions import override


class StatusIndicator(Static):
    """Modern status indicator with color coding and animations."""

    status: str

    def __init__(self, status: str, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self.status = status
        self.update_status(status)

    def update_status(self, status: str) -> None:
        """Update the status display with enhanced visual feedback."""
        self.status = status

        # Remove previous status classes
        self.remove_class("status-active", "status-error", "status-warning", "pulse", "glow")

        if status.lower() in ["active", "online", "connected", "✓ active"]:
            self.add_class("status-active")
            self.add_class("glow")
            self.update("🟢 " + status)
        elif status.lower() in ["error", "failed", "offline", "disconnected"]:
            self.add_class("status-error")
            self.add_class("pulse")
            self.update("🔴 " + status)
        elif status.lower() in ["warning", "pending", "in_progress"]:
            self.add_class("status-warning")
            self.add_class("pulse")
            self.update("🟡 " + status)
        elif status.lower() in ["loading", "connecting"]:
            self.add_class("shimmer")
            self.update("🔄 " + status)
        else:
            self.update("⚪ " + status)


class EnhancedProgressBar(Static):
    """Enhanced progress bar with better visual feedback."""

    total: int
    progress: int
    status_text: str

    def __init__(self, total: int = 100, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self.total = total
        self.progress = 0
        self.status_text = "Ready"

    @override
    def compose(self) -> ComposeResult:
        yield Static("", id="progress_status", classes="progress-label")
        yield ProgressBar(total=self.total, id="progress_bar", show_eta=True, classes="shimmer")

    def update_progress(self, progress: int, status: str = "") -> None:
        """Update progress with enhanced feedback."""
        self.progress = progress
        if status:
            self.status_text = status

        # Update the progress bar
        progress_bar = self.query_one("#progress_bar", ProgressBar)
        progress_bar.update(progress=progress)

        # Update status text with icons
        status_display = self.query_one("#progress_status", Static)
        if progress >= 100:
            status_display.update(f"✅ {self.status_text}")
            progress_bar.add_class("glow")
        elif progress >= 75:
            status_display.update(f"🔥 {self.status_text}")
        elif progress >= 50:
            status_display.update(f"⚡ {self.status_text}")
        elif progress >= 25:
            status_display.update(f"🔄 {self.status_text}")
        else:
            status_display.update(f"🚀 {self.status_text}")
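
Usage sketch (illustrative, not part of this commit): how these two widgets are meant to be driven from a Textual app. The app class and widget ids here are placeholders.

from textual.app import App, ComposeResult

from ingest_pipeline.cli.tui.widgets.indicators import EnhancedProgressBar, StatusIndicator


class DemoApp(App[None]):
    def compose(self) -> ComposeResult:
        yield StatusIndicator("connecting", id="status")
        yield EnhancedProgressBar(total=100, id="progress")

    def on_mount(self) -> None:
        # Re-color the indicator and advance the bar with a status message
        self.query_one("#status", StatusIndicator).update_status("active")
        self.query_one("#progress", EnhancedProgressBar).update_progress(50, "Halfway there")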
126  ingest_pipeline/cli/tui/widgets/tables.py  Normal file
@@ -0,0 +1,126 @@
"""Enhanced DataTable with improved keyboard navigation."""

from typing import Any

from textual import events
from textual.binding import Binding
from textual.message import Message
from textual.widgets import DataTable


class EnhancedDataTable(DataTable[Any]):
    """DataTable with enhanced keyboard navigation and visual feedback."""

    BINDINGS = [
        Binding("up,k", "cursor_up", "Cursor Up", show=False),
        Binding("down,j", "cursor_down", "Cursor Down", show=False),
        Binding("left,h", "cursor_left", "Cursor Left", show=False),
        Binding("right,l", "cursor_right", "Cursor Right", show=False),
        Binding("home", "cursor_home", "First Row", show=False),
        Binding("end", "cursor_end", "Last Row", show=False),
        Binding("pageup", "page_up", "Page Up", show=False),
        Binding("pagedown", "page_down", "Page Down", show=False),
        Binding("enter", "select_cursor", "Select", show=False),
        Binding("space", "toggle_selection", "Toggle Selection", show=False),
        Binding("ctrl+a", "select_all", "Select All", show=False),
        Binding("ctrl+shift+a", "clear_selection", "Clear Selection", show=False),
    ]

    def __init__(self, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self.cursor_type = "row"  # Default to row selection
        self.zebra_stripes = True  # Enable zebra striping for better visibility
        self.show_cursor = True

    def on_key(self, event: events.Key) -> None:
        """Handle additional keyboard shortcuts."""
        if event.key == "ctrl+1":
            # Jump to first column
            self.move_cursor(column=0)
            event.prevent_default()
        elif event.key == "ctrl+9":
            # Jump to last column
            if self.columns:
                self.move_cursor(column=len(self.columns) - 1)
            event.prevent_default()
        elif event.key == "/":
            # Start quick search (to be implemented by parent)
            self.post_message(self.QuickSearch(self))
            event.prevent_default()
        elif event.key == "escape":
            # Clear selection or exit search
            self.action_clear_selection()
            event.prevent_default()
        # No else clause needed - just handle our events

    def action_cursor_home(self) -> None:
        """Move cursor to first row."""
        if self.row_count > 0:
            self.move_cursor(row=0)

    def action_cursor_end(self) -> None:
        """Move cursor to last row."""
        if self.row_count > 0:
            self.move_cursor(row=self.row_count - 1)

    def action_page_up(self) -> None:
        """Move cursor up by visible page size."""
        if self.row_count > 0:
            page_size = max(1, self.size.height // 2)  # Approximate visible rows
            new_row = max(0, self.cursor_coordinate.row - page_size)
            self.move_cursor(row=new_row)

    def action_page_down(self) -> None:
        """Move cursor down by visible page size."""
        if self.row_count > 0:
            page_size = max(1, self.size.height // 2)  # Approximate visible rows
            new_row = min(self.row_count - 1, self.cursor_coordinate.row + page_size)
            self.move_cursor(row=new_row)

    def action_toggle_selection(self) -> None:
        """Toggle selection of current row."""
        if self.row_count > 0:
            current_row = self.cursor_coordinate.row
            # This will be handled by the parent screen
            self.post_message(self.RowToggled(self, current_row))

    def action_select_all(self) -> None:
        """Select all rows."""
        # This will be handled by the parent screen
        self.post_message(self.SelectAll(self))

    def action_clear_selection(self) -> None:
        """Clear all selections."""
        # This will be handled by the parent screen
        self.post_message(self.ClearSelection(self))

    # Custom messages for enhanced functionality
    class QuickSearch(Message):
        """Posted when user wants to start a quick search."""

        def __init__(self, table: "EnhancedDataTable") -> None:
            super().__init__()
            self.table = table

    class RowToggled(Message):
        """Posted when a row selection is toggled."""

        def __init__(self, table: "EnhancedDataTable", row_index: int) -> None:
            super().__init__()
            self.table = table
            self.row_index = row_index

    class SelectAll(Message):
        """Posted when user wants to select all rows."""

        def __init__(self, table: "EnhancedDataTable") -> None:
            super().__init__()
            self.table = table

    class ClearSelection(Message):
        """Posted when user wants to clear selection."""

        def __init__(self, table: "EnhancedDataTable") -> None:
            super().__init__()
            self.table = table
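
Sketch of the receiving side (illustrative, not part of this commit): the comments above say selection messages "will be handled by the parent screen". Assuming Textual's usual handler-name derivation for nested message classes, a parent screen would look roughly like this; the screen class and its selection state are placeholders.

from textual.screen import Screen

from ingest_pipeline.cli.tui.widgets.tables import EnhancedDataTable


class CollectionsScreen(Screen[None]):
    selected_rows: set[int] = set()

    def on_enhanced_data_table_row_toggled(self, message: EnhancedDataTable.RowToggled) -> None:
        # Flip membership of the toggled row in this screen's selection state
        if message.row_index in self.selected_rows:
            self.selected_rows.discard(message.row_index)
        else:
            self.selected_rows.add(message.row_index)

    def on_enhanced_data_table_clear_selection(self, message: EnhancedDataTable.ClearSelection) -> None:
        self.selected_rows.clear()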
5  ingest_pipeline/config/__init__.py  Normal file
@@ -0,0 +1,5 @@
"""Configuration management."""

from .settings import Settings, get_settings

__all__ = ["Settings", "get_settings"]
BIN  ingest_pipeline/config/__pycache__/__init__.cpython-312.pyc  Normal file
Binary file not shown.
BIN  ingest_pipeline/config/__pycache__/settings.cpython-312.pyc  Normal file
Binary file not shown.
103  ingest_pipeline/config/settings.py  Normal file
@@ -0,0 +1,103 @@
"""Application settings and configuration."""

from functools import lru_cache
from typing import Literal

from pydantic import Field, HttpUrl
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Application settings."""

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",  # Ignore extra environment variables
    )

    # API Keys
    firecrawl_api_key: str | None = None
    openwebui_api_key: str | None = None
    weaviate_api_key: str | None = None

    # Endpoints
    llm_endpoint: HttpUrl = HttpUrl("http://llm.lab")
    weaviate_endpoint: HttpUrl = HttpUrl("http://weaviate.yo")
    openwebui_endpoint: HttpUrl = HttpUrl("http://chat.lab")  # This will be the API URL
    firecrawl_endpoint: HttpUrl = HttpUrl("http://crawl.lab:30002")

    # Model Configuration
    embedding_model: str = "ollama/bge-m3:latest"
    embedding_dimension: int = 1024

    # Ingestion Settings
    default_batch_size: int = Field(default=50, gt=0, le=500)
    max_file_size: int = 1_000_000
    max_crawl_depth: int = Field(default=5, ge=1, le=20)
    max_crawl_pages: int = Field(default=100, ge=1, le=1000)

    # Storage Settings
    default_storage_backend: Literal["weaviate", "open_webui"] = "weaviate"
    default_collection_prefix: str = "docs"

    # Prefect Settings
    prefect_api_url: HttpUrl | None = None
    prefect_api_key: str | None = None
    prefect_work_pool: str = "default"

    # Scheduling Defaults
    default_schedule_interval: int = Field(default=60, ge=1, le=10080)  # Max 1 week

    # Performance Settings
    max_concurrent_tasks: int = Field(default=5, ge=1, le=20)
    request_timeout: int = Field(default=60, ge=10, le=300)

    # Logging
    log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"] = "INFO"

    def get_storage_endpoint(self, backend: str) -> HttpUrl:
        """
        Get endpoint for storage backend.

        Args:
            backend: Storage backend name

        Returns:
            Endpoint URL
        """
        if backend == "weaviate":
            return self.weaviate_endpoint
        elif backend == "open_webui":
            return self.openwebui_endpoint
        else:
            raise ValueError(f"Unknown backend: {backend}")

    def get_api_key(self, service: str) -> str | None:
        """
        Get API key for service.

        Args:
            service: Service name

        Returns:
            API key or None
        """
        service_map = {
            "firecrawl": self.firecrawl_api_key,
            "openwebui": self.openwebui_api_key,
            "weaviate": self.weaviate_api_key,
        }
        return service_map.get(service)


@lru_cache
def get_settings() -> Settings:
    """
    Get cached settings instance.

    Returns:
        Settings instance
    """
    return Settings()
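
Usage sketch (illustrative, not part of this commit): pydantic-settings resolves each field from the environment (and .env) case-insensitively by field name, falling back to the defaults above.

from ingest_pipeline.config.settings import get_settings

settings = get_settings()  # cached via lru_cache; reads .env on first call
print(settings.weaviate_endpoint)        # http://weaviate.yo unless WEAVIATE_ENDPOINT overrides it
print(settings.get_api_key("weaviate"))  # None when no key is configured
print(settings.get_storage_endpoint("open_webui"))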
27  ingest_pipeline/core/__init__.py  Normal file
@@ -0,0 +1,27 @@
"""Core module for ingestion pipeline."""

from .exceptions import (
    IngestionError,
    StorageError,
    VectorizationError,
)
from .models import (
    Document,
    IngestionJob,
    IngestionResult,
    IngestionSource,
    IngestionStatus,
    StorageBackend,
)

__all__ = [
    "Document",
    "IngestionJob",
    "IngestionResult",
    "IngestionSource",
    "IngestionStatus",
    "StorageBackend",
    "IngestionError",
    "StorageError",
    "VectorizationError",
]
BIN  ingest_pipeline/core/__pycache__/__init__.cpython-312.pyc  Normal file
Binary file not shown.
BIN  ingest_pipeline/core/__pycache__/exceptions.cpython-312.pyc  Normal file
Binary file not shown.
BIN  ingest_pipeline/core/__pycache__/models.cpython-312.pyc  Normal file
Binary file not shown.
31  ingest_pipeline/core/exceptions.py  Normal file
@@ -0,0 +1,31 @@
"""Custom exceptions for the ingestion pipeline."""


class IngestionError(Exception):
    """Base exception for ingestion errors."""

    pass


class StorageError(IngestionError):
    """Exception for storage-related errors."""

    pass


class VectorizationError(IngestionError):
    """Exception for vectorization errors."""

    pass


class ConfigurationError(IngestionError):
    """Exception for configuration errors."""

    pass


class SourceNotFoundError(IngestionError):
    """Exception when a source cannot be found or accessed."""

    pass
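
Usage sketch (illustrative, not part of this commit): because every error subclasses IngestionError, callers can catch the whole hierarchy at one point.

from ingest_pipeline.core.exceptions import IngestionError, StorageError

try:
    raise StorageError("backend unavailable")
except IngestionError as exc:  # also catches StorageError, VectorizationError, etc.
    print(f"ingestion failed: {exc}")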
149  ingest_pipeline/core/models.py  Normal file
@@ -0,0 +1,149 @@
"""Core data models with strict typing."""

from collections.abc import Callable
from datetime import UTC, datetime
from enum import Enum
from typing import TypedDict
from uuid import UUID, uuid4

from pydantic import BaseModel, Field, HttpUrl


class IngestionStatus(str, Enum):
    """Status of an ingestion job."""

    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    PARTIAL = "partial"  # Some documents succeeded, some failed
    FAILED = "failed"
    CANCELLED = "cancelled"


class StorageBackend(str, Enum):
    """Available storage backends."""

    WEAVIATE = "weaviate"
    OPEN_WEBUI = "open_webui"


class IngestionSource(str, Enum):
    """Types of ingestion sources."""

    WEB = "web"
    REPOSITORY = "repository"
    DOCUMENTATION = "documentation"


class VectorConfig(BaseModel):
    """Configuration for vectorization."""

    model: str = Field(default="ollama/bge-m3:latest")
    embedding_endpoint: HttpUrl = Field(default=HttpUrl("http://llm.lab"))
    dimension: int = Field(default=1024)
    batch_size: int = Field(default=100, gt=0, le=1000)


class StorageConfig(BaseModel):
    """Configuration for storage backend."""

    backend: StorageBackend
    endpoint: HttpUrl
    api_key: str | None = Field(default=None)
    collection_name: str = Field(default="documents")
    batch_size: int = Field(default=100, gt=0, le=1000)


class FirecrawlConfig(BaseModel):
    """Configuration for Firecrawl ingestion (operational parameters only)."""

    formats: list[str] = Field(default_factory=lambda: ["markdown", "html"])
    max_depth: int = Field(default=5, ge=1, le=20)
    limit: int = Field(default=100, ge=1, le=1000)
    only_main_content: bool = Field(default=True)
    include_subdomains: bool = Field(default=False)


class RepomixConfig(BaseModel):
    """Configuration for Repomix ingestion."""

    include_patterns: list[str] = Field(
        default_factory=lambda: ["*.py", "*.js", "*.ts", "*.md", "*.yaml", "*.json"]
    )
    exclude_patterns: list[str] = Field(
        default_factory=lambda: ["**/node_modules/**", "**/__pycache__/**", "**/.git/**"]
    )
    max_file_size: int = Field(default=1_000_000)  # 1 MB
    respect_gitignore: bool = Field(default=True)


class DocumentMetadata(TypedDict):
    """Metadata for a document."""

    source_url: str
    title: str | None
    description: str | None
    timestamp: datetime
    content_type: str
    word_count: int
    char_count: int


class Document(BaseModel):
    """Represents a single document."""

    id: UUID = Field(default_factory=uuid4)
    content: str
    metadata: DocumentMetadata
    vector: list[float] | None = Field(default=None)
    source: IngestionSource
    collection: str = Field(default="documents")

    class Config:
        """Pydantic configuration."""

        json_encoders: dict[type, Callable[[UUID | datetime], str]] = {
            UUID: lambda v: str(v),
            datetime: lambda v: v.isoformat() if isinstance(v, datetime) else str(v),
        }


class IngestionJob(BaseModel):
    """Represents an ingestion job."""

    id: UUID = Field(default_factory=uuid4)
    source_type: IngestionSource
    source_url: HttpUrl | str
    status: IngestionStatus = Field(default=IngestionStatus.PENDING)
    created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
    updated_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
    completed_at: datetime | None = Field(default=None)
    error_message: str | None = Field(default=None)
    document_count: int = Field(default=0)
    storage_backend: StorageBackend

    class Config:
        """Pydantic configuration."""

        json_encoders: dict[type, Callable[[UUID | datetime], str]] = {
            UUID: lambda v: str(v),
            datetime: lambda v: v.isoformat() if isinstance(v, datetime) else str(v),
        }


class IngestionResult(BaseModel):
    """Result of an ingestion operation."""

    job_id: UUID
    status: IngestionStatus
    documents_processed: int
    documents_failed: int
    duration_seconds: float
    error_messages: list[str] = Field(default_factory=list)

    class Config:
        """Pydantic configuration."""

        json_encoders: dict[type, Callable[[UUID], str]] = {
            UUID: lambda v: str(v),
        }
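
Construction sketch (illustrative, not part of this commit; the example.com URL and field values are placeholders):

from datetime import UTC, datetime

from ingest_pipeline.core.models import (
    Document,
    DocumentMetadata,
    IngestionJob,
    IngestionSource,
    StorageBackend,
)

job = IngestionJob(
    source_url="https://docs.example.com",
    source_type=IngestionSource.WEB,
    storage_backend=StorageBackend.WEAVIATE,
)

meta: DocumentMetadata = {
    "source_url": "https://docs.example.com/page",
    "title": "Example",
    "description": None,
    "timestamp": datetime.now(UTC),
    "content_type": "text/markdown",
    "word_count": 2,
    "char_count": 11,
}
doc = Document(content="hello world", metadata=meta, source=IngestionSource.WEB)
assert doc.vector is None and doc.collection == "documents"  # defaults apply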
9  ingest_pipeline/flows/__init__.py  Normal file
@@ -0,0 +1,9 @@
"""Prefect flows for orchestration."""

from .ingestion import create_ingestion_flow
from .scheduler import create_scheduled_deployment

__all__ = [
    "create_ingestion_flow",
    "create_scheduled_deployment",
]
BIN  ingest_pipeline/flows/__pycache__/__init__.cpython-312.pyc  Normal file
Binary file not shown.
BIN  ingest_pipeline/flows/__pycache__/ingestion.cpython-312.pyc  Normal file
Binary file not shown.
BIN  ingest_pipeline/flows/__pycache__/scheduler.cpython-312.pyc  Normal file
Binary file not shown.
274  ingest_pipeline/flows/ingestion.py  Normal file
@@ -0,0 +1,274 @@
"""Prefect flow for ingestion pipeline."""

from datetime import UTC, datetime
from typing import Literal

from prefect import flow, task

from ..core.exceptions import IngestionError
from ..core.models import (
    FirecrawlConfig,
    IngestionJob,
    IngestionResult,
    IngestionSource,
    IngestionStatus,
    RepomixConfig,
    StorageBackend,
    StorageConfig,
)
from ..ingestors import FirecrawlIngestor, RepomixIngestor
from ..storage import OpenWebUIStorage, WeaviateStorage
from ..storage.base import BaseStorage


@task(name="validate_source", retries=2, retry_delay_seconds=10, tags=["validation"])
async def validate_source_task(source_url: str, source_type: IngestionSource) -> bool:
    """
    Validate that a source is accessible.

    Args:
        source_url: URL or path to source
        source_type: Type of source

    Returns:
        True if valid
    """
    if source_type == IngestionSource.WEB:
        ingestor = FirecrawlIngestor()
    elif source_type == IngestionSource.REPOSITORY:
        ingestor = RepomixIngestor()
    else:
        raise ValueError(f"Unsupported source type: {source_type}")

    result = await ingestor.validate_source(source_url)
    return bool(result)


@task(name="initialize_storage", retries=3, retry_delay_seconds=5, tags=["storage"])
async def initialize_storage_task(config: StorageConfig) -> BaseStorage:
    """
    Initialize storage backend.

    Args:
        config: Storage configuration

    Returns:
        Initialized storage adapter
    """
    if config.backend == StorageBackend.WEAVIATE:
        storage = WeaviateStorage(config)
    elif config.backend == StorageBackend.OPEN_WEBUI:
        storage = OpenWebUIStorage(config)
    else:
        raise ValueError(f"Unsupported backend: {config.backend}")

    await storage.initialize()
    return storage


@task(name="ingest_documents", retries=2, retry_delay_seconds=30, tags=["ingestion"])
async def ingest_documents_task(
    job: IngestionJob, collection_name: str | None = None, batch_size: int = 50
) -> tuple[int, int]:
    """
    Ingest documents from source.

    Args:
        job: Ingestion job configuration
        collection_name: Target collection name (defaults to one derived from the source type)
        batch_size: Number of documents per batch

    Returns:
        Tuple of (processed_count, failed_count)
    """
    # Select ingestor
    if job.source_type == IngestionSource.WEB:
        config = FirecrawlConfig()
        ingestor = FirecrawlIngestor(config)
    elif job.source_type == IngestionSource.REPOSITORY:
        config = RepomixConfig()
        ingestor = RepomixIngestor(config)
    else:
        raise ValueError(f"Unsupported source: {job.source_type}")

    processed = 0
    failed = 0
    batch = []

    # Initialize storage
    from pydantic import HttpUrl

    # Use provided collection name or generate default
    if collection_name is None:
        collection_name = f"docs_{job.source_type.value}"

    storage_config = StorageConfig(
        backend=job.storage_backend,
        endpoint=HttpUrl("http://weaviate.yo")
        if job.storage_backend == StorageBackend.WEAVIATE
        else HttpUrl("http://chat.lab"),
        collection_name=collection_name,
    )

    if job.storage_backend == StorageBackend.WEAVIATE:
        storage = WeaviateStorage(storage_config)
    else:
        storage = OpenWebUIStorage(storage_config)

    await storage.initialize()

    # Process documents
    async for document in ingestor.ingest(job):
        batch.append(document)

        if len(batch) >= batch_size:
            try:
                stored_ids = await storage.store_batch(batch)
                print(f"Successfully stored {len(stored_ids)} documents in batch")
                processed += len(stored_ids)
                failed += len(batch) - len(stored_ids)
            except Exception as e:
                print(f"Batch storage failed: {e}")
                failed += len(batch)
            batch = []

    # Process remaining batch
    if batch:
        try:
            stored_ids = await storage.store_batch(batch)
            print(f"Successfully stored {len(stored_ids)} documents in final batch")
            processed += len(stored_ids)
            failed += len(batch) - len(stored_ids)
        except Exception as e:
            print(f"Final batch storage failed: {e}")
            failed += len(batch)

    return processed, failed


@task(name="update_job_status", tags=["tracking"])
async def update_job_status_task(
    job: IngestionJob,
    status: IngestionStatus,
    processed: int = 0,
    failed: int = 0,
    error: str | None = None,
) -> IngestionJob:
    """
    Update job status.

    Args:
        job: Ingestion job
        status: New status
        processed: Documents processed
        failed: Documents failed
        error: Error message if any

    Returns:
        Updated job
    """
    job.status = status
    job.updated_at = datetime.now(UTC)
    job.document_count = processed

    if status == IngestionStatus.COMPLETED:
        job.completed_at = datetime.now(UTC)

    if error:
        job.error_message = error

    return job


@flow(
    name="ingestion_pipeline",
    description="Main ingestion pipeline for documents",
    retries=1,
    retry_delay_seconds=60,
    persist_result=True,
    log_prints=True,
)
async def create_ingestion_flow(
    source_url: str,
    source_type: Literal["web", "repository", "documentation"],
    storage_backend: Literal["weaviate", "open_webui"] = "weaviate",
    collection_name: str | None = None,
    validate_first: bool = True,
) -> IngestionResult:
    """
    Main ingestion flow.

    Args:
        source_url: URL or path to source
        source_type: Type of source
        storage_backend: Storage backend to use
        collection_name: Optional target collection name
        validate_first: Whether to validate source first

    Returns:
        Ingestion result
    """
    print(f"Starting ingestion from {source_url}")

    # Create job
    job = IngestionJob(
        source_url=source_url,
        source_type=IngestionSource(source_type),
        storage_backend=StorageBackend(storage_backend),
        status=IngestionStatus.PENDING,
    )

    start_time = datetime.now(UTC)
    error_messages = []
    processed = 0
    failed = 0

    try:
        # Validate source if requested
        if validate_first:
            print("Validating source...")
            is_valid = await validate_source_task(source_url, job.source_type)

            if not is_valid:
                raise IngestionError(f"Source validation failed: {source_url}")

        # Update status to in progress
        job = await update_job_status_task(job, IngestionStatus.IN_PROGRESS)

        # Run ingestion
        print("Ingesting documents...")
        processed, failed = await ingest_documents_task(job, collection_name)

        # Update final status
        if failed > 0:
            error_messages.append(f"{failed} documents failed to process")

        # Set status based on results
        if processed == 0 and failed > 0:
            final_status = IngestionStatus.FAILED
        elif failed > 0:
            final_status = IngestionStatus.PARTIAL
        else:
            final_status = IngestionStatus.COMPLETED

        job = await update_job_status_task(job, final_status, processed=processed, failed=failed)

        print(f"Ingestion completed: {processed} processed, {failed} failed")

    except Exception as e:
        print(f"Ingestion failed: {e}")
        error_messages.append(str(e))

        # Don't reset counts - keep whatever was processed before the error
        job = await update_job_status_task(
            job, IngestionStatus.FAILED, processed=processed, failed=failed, error=str(e)
        )

    # Calculate duration
    duration = (datetime.now(UTC) - start_time).total_seconds()

    return IngestionResult(
        job_id=job.id,
        status=job.status,
        documents_processed=processed,
        documents_failed=failed,
        duration_seconds=duration,
        error_messages=error_messages,
    )
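
Invocation sketch (illustrative, not part of this commit): the flow is an async function, so it can be awaited directly for an ad-hoc run; the URL and collection name are placeholders.

import asyncio

from ingest_pipeline.flows.ingestion import create_ingestion_flow


async def main() -> None:
    result = await create_ingestion_flow(
        source_url="https://docs.example.com",  # placeholder source
        source_type="web",
        storage_backend="weaviate",
        collection_name="docs_example",
    )
    print(result.status, result.documents_processed, result.documents_failed)


asyncio.run(main())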
89  ingest_pipeline/flows/scheduler.py  Normal file
@@ -0,0 +1,89 @@
"""Scheduler for Prefect deployments."""

from datetime import timedelta
from typing import Literal, Protocol, cast

from prefect import serve
from prefect.deployments.runner import RunnerDeployment
from prefect.schedules import Cron, Interval

from .ingestion import create_ingestion_flow


class FlowWithDeployment(Protocol):
    """Protocol for flows that have deployment methods."""

    def to_deployment(
        self,
        name: str,
        **kwargs: object,
    ) -> RunnerDeployment:
        """Create a deployment from this flow."""
        ...


def create_scheduled_deployment(
    name: str,
    source_url: str,
    source_type: Literal["web", "repository", "documentation"],
    storage_backend: Literal["weaviate", "open_webui"] = "weaviate",
    schedule_type: Literal["cron", "interval"] = "interval",
    cron_expression: str | None = None,
    interval_minutes: int = 60,
    tags: list[str] | None = None,
) -> RunnerDeployment:
    """
    Create a scheduled deployment for ingestion.

    Args:
        name: Deployment name
        source_url: Source to ingest from
        source_type: Type of source
        storage_backend: Storage backend
        schedule_type: Type of schedule
        cron_expression: Cron expression if using cron
        interval_minutes: Interval in minutes if using interval
        tags: Optional tags for deployment

    Returns:
        Deployment configuration
    """
    # Create schedule
    if schedule_type == "cron" and cron_expression:
        schedule = Cron(cron_expression, timezone="UTC")
    else:
        schedule = Interval(timedelta(minutes=interval_minutes), timezone="UTC")

    # Default tags
    if tags is None:
        tags = [source_type, storage_backend]

    # Create deployment
    # The flow decorator adds the to_deployment method at runtime
    to_deployment = create_ingestion_flow.to_deployment
    deployment = to_deployment(
        name=name,
        schedule=schedule,
        parameters={
            "source_url": source_url,
            "source_type": source_type,
            "storage_backend": storage_backend,
            "validate_first": True,
        },
        tags=tags,
        description=f"Scheduled ingestion from {source_url}",
    )

    return cast("RunnerDeployment", deployment)


def serve_deployments(deployments: list[RunnerDeployment]) -> None:
    """
    Serve multiple deployments.

    Args:
        deployments: List of deployment configurations
    """
    serve(*deployments, limit=10)
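
Scheduling sketch (illustrative, not part of this commit; the deployment name and URL are placeholders):

from ingest_pipeline.flows.scheduler import create_scheduled_deployment, serve_deployments

deployment = create_scheduled_deployment(
    name="docs-hourly",                     # placeholder deployment name
    source_url="https://docs.example.com",  # placeholder source
    source_type="web",
    schedule_type="interval",
    interval_minutes=60,
)
serve_deployments([deployment])  # blocks, running the flow on schedule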
11  ingest_pipeline/ingestors/__init__.py  Normal file
@@ -0,0 +1,11 @@
"""Ingestors module for different data sources."""

from .base import BaseIngestor
from .firecrawl import FirecrawlIngestor
from .repomix import RepomixIngestor

__all__ = [
    "BaseIngestor",
    "FirecrawlIngestor",
    "RepomixIngestor",
]
BIN  ingest_pipeline/ingestors/__pycache__/__init__.cpython-312.pyc  Normal file
Binary file not shown.
BIN  ingest_pipeline/ingestors/__pycache__/base.cpython-312.pyc  Normal file
Binary file not shown.
BIN  ingest_pipeline/ingestors/__pycache__/firecrawl.cpython-312.pyc  Normal file
Binary file not shown.
BIN  ingest_pipeline/ingestors/__pycache__/repomix.cpython-312.pyc  Normal file
Binary file not shown.
50  ingest_pipeline/ingestors/base.py  Normal file
@@ -0,0 +1,50 @@
"""Base ingestor interface."""

from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator

from ..core.models import Document, IngestionJob


class BaseIngestor(ABC):
    """Abstract base class for all ingestors."""

    @abstractmethod
    async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
        """
        Ingest data from a source.

        Args:
            job: The ingestion job configuration

        Yields:
            Documents from the source
        """
        # Unreachable return/yield pair: the bare yield makes this an async
        # generator for typing purposes while producing no values here.
        return  # type: ignore  # pragma: no cover
        yield  # pragma: no cover

    @abstractmethod
    async def validate_source(self, source_url: str) -> bool:
        """
        Validate if the source is accessible.

        Args:
            source_url: URL or path to the source

        Returns:
            True if source is valid and accessible
        """
        pass  # pragma: no cover

    @abstractmethod
    async def estimate_size(self, source_url: str) -> int:
        """
        Estimate the number of documents in the source.

        Args:
            source_url: URL or path to the source

        Returns:
            Estimated number of documents
        """
        pass  # pragma: no cover
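
Subclassing sketch (illustrative, not part of this commit): a hypothetical ingestor that satisfies the interface by yielding documents from an in-memory list.

from collections.abc import AsyncGenerator

from ingest_pipeline.core.models import Document, IngestionJob
from ingest_pipeline.ingestors.base import BaseIngestor


class StaticIngestor(BaseIngestor):
    """Hypothetical ingestor backed by a pre-built list of documents."""

    def __init__(self, documents: list[Document]) -> None:
        self._documents = documents

    async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
        for doc in self._documents:
            yield doc

    async def validate_source(self, source_url: str) -> bool:
        return True  # nothing external to check

    async def estimate_size(self, source_url: str) -> int:
        return len(self._documents)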
229  ingest_pipeline/ingestors/firecrawl.py  Normal file
@@ -0,0 +1,229 @@
"""Firecrawl ingestor for web and documentation sites."""

import asyncio
from collections.abc import AsyncGenerator
from datetime import UTC, datetime
from typing import Any
from uuid import uuid4

from firecrawl import AsyncFirecrawl
from typing_extensions import override

from ..config import get_settings
from ..core.models import (
    Document,
    DocumentMetadata,
    FirecrawlConfig,
    IngestionJob,
    IngestionSource,
)
from .base import BaseIngestor


class FirecrawlIngestor(BaseIngestor):
    """Ingestor for web and documentation sites using Firecrawl."""

    config: FirecrawlConfig
    client: Any  # AsyncFirecrawl client instance

    def __init__(self, config: FirecrawlConfig | None = None):
        """
        Initialize Firecrawl ingestor.

        Args:
            config: Firecrawl configuration (for operational params only)
        """
        self.config = config or FirecrawlConfig()
        settings = get_settings()

        # All connection details come from settings/.env.
        # For self-hosted instances, use a dummy API key if none is provided;
        # the SDK requires an API key even for self-hosted instances.
        api_key = settings.firecrawl_api_key or "no-key-required"

        # AsyncFirecrawl automatically uses v2 endpoints
        self.client = AsyncFirecrawl(api_key=api_key, api_url=str(settings.firecrawl_endpoint))

    @override
    async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
        """
        Ingest documents from a web source.

        Args:
            job: The ingestion job configuration

        Yields:
            Documents from the web source
        """
        url = str(job.source_url)

        # First, map the site to understand its structure
        site_map = await self._map_site(url)

        # If map returns empty, just use the main URL
        if not site_map:
            site_map = [url]

        # Process pages in batches
        batch_size = 10
        for i in range(0, len(site_map), batch_size):
            batch_urls = site_map[i : i + batch_size]
            documents = await self._scrape_batch(batch_urls)

            for doc_data in documents:
                yield self._create_document(doc_data, job)

    @override
    async def validate_source(self, source_url: str) -> bool:
        """
        Validate if the web source is accessible.

        Args:
            source_url: URL to validate

        Returns:
            True if source is accessible
        """
        try:
            # Use SDK v2 endpoints for both self-hosted and cloud
            result = await self.client.scrape(source_url, formats=["markdown"])
            return result is not None and hasattr(result, "markdown")
        except Exception:
            return False

    @override
    async def estimate_size(self, source_url: str) -> int:
        """
        Estimate the number of pages in the website.

        Args:
            source_url: URL of the website

        Returns:
            Estimated number of pages
        """
        try:
            site_map = await self._map_site(source_url)
            return len(site_map) if site_map else 0
        except Exception:
            return 0

    async def _map_site(self, url: str) -> list[str]:
        """
        Map a website to get all URLs.

        Args:
            url: Base URL to map

        Returns:
            List of URLs found
        """
        try:
            # Use SDK v2 map endpoint
            result = await self.client.map(url=url, limit=self.config.limit)

            if result and hasattr(result, "links"):
                # Extract URLs from the result
                return [
                    link if isinstance(link, str) else getattr(link, "url", str(link))
                    for link in result.links
                ]
            return []
        except Exception as e:
            # If map fails (might not be available in all versions), fall back to single URL
            import logging

            logging.warning(f"Map endpoint not available or failed: {e}. Using single URL.")
            return [url]

    async def _scrape_batch(self, urls: list[str]) -> list[dict[str, str]]:
        """
        Scrape a batch of URLs.

        Args:
            urls: List of URLs to scrape

        Returns:
            List of scraped documents
        """
        tasks = []
        for url in urls:
            task = self._scrape_single(url)
            tasks.append(task)

        results = await asyncio.gather(*tasks, return_exceptions=True)

        documents = []
        for result in results:
            if isinstance(result, Exception):
                continue
            if result and isinstance(result, dict) and "markdown" in result:
|
||||
documents.append(result)
|
||||
|
||||
return documents
|
||||
|
||||
async def _scrape_single(self, url: str) -> dict[str, str]:
|
||||
"""
|
||||
Scrape a single URL.
|
||||
|
||||
Args:
|
||||
url: URL to scrape
|
||||
|
||||
Returns:
|
||||
Scraped document data
|
||||
"""
|
||||
try:
|
||||
# Use SDK v2 scrape endpoint
|
||||
result = await self.client.scrape(url, formats=self.config.formats)
|
||||
|
||||
# Extract data from the result
|
||||
if result:
|
||||
# The SDK returns a ScrapeResult object with markdown and metadata
|
||||
metadata = getattr(result, "metadata", {})
|
||||
return {
|
||||
"markdown": getattr(result, "markdown", ""),
|
||||
"sourceURL": url,
|
||||
"title": metadata.get("title", "")
|
||||
if isinstance(metadata, dict)
|
||||
else getattr(metadata, "title", ""),
|
||||
"description": metadata.get("description", "")
|
||||
if isinstance(metadata, dict)
|
||||
else getattr(metadata, "description", ""),
|
||||
}
|
||||
return {}
|
||||
except Exception as e:
|
||||
import logging
|
||||
|
||||
logging.debug(f"Failed to scrape {url}: {e}")
|
||||
return {}
|
||||
|
||||
def _create_document(self, doc_data: dict[str, str], job: IngestionJob) -> Document:
|
||||
"""
|
||||
Create a Document from scraped data.
|
||||
|
||||
Args:
|
||||
doc_data: Scraped document data
|
||||
job: The ingestion job
|
||||
|
||||
Returns:
|
||||
Document instance
|
||||
"""
|
||||
content = doc_data.get("markdown", "")
|
||||
|
||||
metadata: DocumentMetadata = {
|
||||
"source_url": doc_data.get("sourceURL", str(job.source_url)),
|
||||
"title": doc_data.get("title"),
|
||||
"description": doc_data.get("description"),
|
||||
"timestamp": datetime.now(UTC),
|
||||
"content_type": "text/markdown",
|
||||
"word_count": len(content.split()),
|
||||
"char_count": len(content),
|
||||
}
|
||||
|
||||
return Document(
|
||||
id=uuid4(),
|
||||
content=content,
|
||||
metadata=metadata,
|
||||
source=IngestionSource.WEB,
|
||||
collection=job.storage_backend.value,
|
||||
)
|
||||
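A quick usage sketch for the ingestor above, assuming FIRECRAWL_ENDPOINT is configured in the environment; the docs URL is illustrative. Only validate_source and estimate_size are exercised here, since they take a bare URL rather than a full IngestionJob.

    # Illustrative usage sketch, not part of this commit.
    import asyncio

    from ingest_pipeline.ingestors.firecrawl import FirecrawlIngestor


    async def main() -> None:
        ingestor = FirecrawlIngestor()
        url = "https://docs.example.com"  # hypothetical source
        if not await ingestor.validate_source(url):
            raise SystemExit("source unreachable")
        print("estimated pages:", await ingestor.estimate_size(url))


    asyncio.run(main())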
339
ingest_pipeline/ingestors/repomix.py
Normal file
@@ -0,0 +1,339 @@
"""Repomix ingestor for Git repositories."""

import asyncio
import subprocess
import tempfile
from collections.abc import AsyncGenerator
from datetime import UTC, datetime
from pathlib import Path
from uuid import uuid4

from typing_extensions import override

from ..core.exceptions import IngestionError, SourceNotFoundError
from ..core.models import (
    Document,
    DocumentMetadata,
    IngestionJob,
    IngestionSource,
    RepomixConfig,
)
from .base import BaseIngestor


class RepomixIngestor(BaseIngestor):
    """Ingestor for Git repositories using Repomix."""

    config: RepomixConfig

    def __init__(self, config: RepomixConfig | None = None):
        """
        Initialize Repomix ingestor.

        Args:
            config: Repomix configuration
        """
        self.config = config or RepomixConfig()

    @override
    async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
        """
        Ingest documents from a Git repository.

        Args:
            job: The ingestion job configuration

        Yields:
            Documents from the repository
        """
        repo_url = str(job.source_url)

        with tempfile.TemporaryDirectory() as temp_dir:
            # Clone the repository
            repo_path = await self._clone_repository(repo_url, temp_dir)

            # Run repomix to generate output
            output_file = await self._run_repomix(repo_path)

            # Parse and yield documents
            documents = await self._parse_repomix_output(output_file, job)
            for doc in documents:
                yield doc

    @override
    async def validate_source(self, source_url: str) -> bool:
        """
        Validate if the Git repository is accessible.

        Args:
            source_url: Git repository URL

        Returns:
            True if repository is accessible
        """
        try:
            # Test if we can list remote refs
            result = await self._run_command(
                ["git", "ls-remote", "--heads", source_url], timeout=10
            )
            return result.returncode == 0
        except Exception:
            return False

    @override
    async def estimate_size(self, source_url: str) -> int:
        """
        Estimate the number of files in the repository.

        Args:
            source_url: Git repository URL

        Returns:
            Estimated number of files
        """
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                # Shallow clone to get file count
                repo_path = await self._clone_repository(source_url, temp_dir, shallow=True)

                # Count files matching patterns
                file_count = 0
                for pattern in self.config.include_patterns:
                    files = list(Path(repo_path).rglob(pattern))
                    file_count += len(files)

                return file_count
        except Exception:
            return 0

    async def _clone_repository(
        self, repo_url: str, target_dir: str, shallow: bool = False
    ) -> Path:
        """
        Clone a Git repository.

        Args:
            repo_url: Repository URL
            target_dir: Directory to clone into
            shallow: Whether to do a shallow clone

        Returns:
            Path to cloned repository
        """
        repo_name = repo_url.split("/")[-1].replace(".git", "")
        repo_path = Path(target_dir) / repo_name

        cmd = ["git", "clone"]
        if shallow:
            cmd.extend(["--depth", "1"])
        cmd.extend([repo_url, str(repo_path)])

        result = await self._run_command(cmd, timeout=300)

        if result.returncode != 0:
            raise SourceNotFoundError(f"Failed to clone repository: {repo_url}")

        return repo_path

    async def _run_repomix(self, repo_path: Path) -> Path:
        """
        Run repomix on a repository.

        Args:
            repo_path: Path to the repository

        Returns:
            Path to repomix output file
        """
        output_file = repo_path / "repomix-output.md"

        # Build repomix command
        cmd = ["npx", "repomix", "--output", str(output_file)]

        # Add include patterns
        if self.config.include_patterns:
            for pattern in self.config.include_patterns:
                cmd.extend(["--include", pattern])

        # Add exclude patterns
        if self.config.exclude_patterns:
            for pattern in self.config.exclude_patterns:
                cmd.extend(["--exclude", pattern])

        if self.config.respect_gitignore:
            cmd.append("--respect-gitignore")

        result = await self._run_command(cmd, cwd=str(repo_path), timeout=120)

        if result.returncode != 0:
            stderr_text = (
                result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
            )
            raise IngestionError(f"Repomix failed: {stderr_text}")

        return output_file

    async def _parse_repomix_output(self, output_file: Path, job: IngestionJob) -> list[Document]:
        """
        Parse repomix output into documents.

        Args:
            output_file: Path to repomix output
            job: The ingestion job

        Returns:
            List of documents
        """
        documents = []

        try:
            content = output_file.read_text()

            # Split by file markers (repomix uses specific delimiters)
            file_sections = self._split_by_files(content)

            for file_path, file_content in file_sections.items():
                if len(file_content) > self.config.max_file_size:
                    # Split large files into chunks
                    chunks = self._chunk_content(file_content)
                    for i, chunk in enumerate(chunks):
                        doc = self._create_document(file_path, chunk, job, chunk_index=i)
                        documents.append(doc)
                else:
                    doc = self._create_document(file_path, file_content, job)
                    documents.append(doc)

        except Exception as e:
            raise IngestionError(f"Failed to parse repomix output: {e}") from e

        return documents

    def _split_by_files(self, content: str) -> dict[str, str]:
        """
        Split repomix output by files.

        Args:
            content: Repomix output content

        Returns:
            Dictionary of file paths to content
        """
        files: dict[str, str] = {}
        current_file: str | None = None
        current_content: list[str] = []

        for line in content.split("\n"):
            # Look for file markers (adjust based on actual repomix format)
            if line.startswith("## File:") or line.startswith("### "):
                if current_file:
                    files[current_file] = "\n".join(current_content)
                current_file = line.replace("## File:", "").replace("### ", "").strip()
                current_content = []
            else:
                current_content.append(line)

        # Add last file
        if current_file:
            files[current_file] = "\n".join(current_content)

        # If no file markers found, treat as single document
        if not files:
            files["repository"] = content

        return files

    def _chunk_content(self, content: str, chunk_size: int = 500000) -> list[str]:
        """
        Split content into chunks.

        Args:
            content: Content to chunk
            chunk_size: Maximum size per chunk

        Returns:
            List of content chunks
        """
        chunks: list[str] = []
        lines = content.split("\n")
        current_chunk: list[str] = []
        current_size = 0

        for line in lines:
            line_size = len(line) + 1  # +1 for newline

            if current_size + line_size > chunk_size and current_chunk:
                chunks.append("\n".join(current_chunk))
                current_chunk = []
                current_size = 0

            current_chunk.append(line)
            current_size += line_size

        if current_chunk:
            chunks.append("\n".join(current_chunk))

        return chunks

    def _create_document(
        self, file_path: str, content: str, job: IngestionJob, chunk_index: int = 0
    ) -> Document:
        """
        Create a Document from repository content.

        Args:
            file_path: Path to the file in repository
            content: File content
            job: The ingestion job
            chunk_index: Index if content is chunked

        Returns:
            Document instance
        """
        metadata: DocumentMetadata = {
            "source_url": str(job.source_url),
            "title": f"{file_path}" + (f" (chunk {chunk_index})" if chunk_index > 0 else ""),
            "description": f"Repository file: {file_path}",
            "timestamp": datetime.now(UTC),
            "content_type": "text/plain",
            "word_count": len(content.split()),
            "char_count": len(content),
        }

        return Document(
            id=uuid4(),
            content=content,
            metadata=metadata,
            source=IngestionSource.REPOSITORY,
            collection=job.storage_backend.value,
        )

    async def _run_command(
        self, cmd: list[str], cwd: str | None = None, timeout: int = 60
    ) -> subprocess.CompletedProcess[bytes]:
        """
        Run a shell command asynchronously.

        Args:
            cmd: Command and arguments
            cwd: Working directory
            timeout: Command timeout in seconds

        Returns:
            Completed process result
        """
        proc = await asyncio.create_subprocess_exec(
            *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, cwd=cwd
        )

        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)

            return subprocess.CompletedProcess(
                cmd,
                proc.returncode or 0,
                stdout,
                stderr,
            )
        except TimeoutError as e:
            proc.kill()
            raise IngestionError(f"Command timed out: {' '.join(cmd)}") from e
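The chunking logic above can be sanity-checked in isolation; a minimal sketch, assuming direct access to the (private) _chunk_content helper and using synthetic content:

    # Illustrative check of the chunker, not part of this commit.
    from ingest_pipeline.ingestors.repomix import RepomixIngestor

    ingestor = RepomixIngestor()
    content = "\n".join(f"line {i}" for i in range(100_000))
    chunks = ingestor._chunk_content(content, chunk_size=500_000)
    # Each chunk stays under the size budget; lines are never split mid-line.
    assert all(len(c) <= 500_000 for c in chunks)
    print(len(chunks), "chunks")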
11
ingest_pipeline/storage/__init__.py
Normal file
@@ -0,0 +1,11 @@
"""Storage adapters for different backends."""

from .base import BaseStorage
from .openwebui import OpenWebUIStorage
from .weaviate import WeaviateStorage

__all__ = [
    "BaseStorage",
    "WeaviateStorage",
    "OpenWebUIStorage",
]
BIN
ingest_pipeline/storage/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
ingest_pipeline/storage/__pycache__/base.cpython-312.pyc
Normal file
Binary file not shown.
BIN
ingest_pipeline/storage/__pycache__/openwebui.cpython-312.pyc
Normal file
Binary file not shown.
BIN
ingest_pipeline/storage/__pycache__/weaviate.cpython-312.pyc
Normal file
Binary file not shown.
106
ingest_pipeline/storage/base.py
Normal file
@@ -0,0 +1,106 @@
"""Base storage interface."""

from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator

from ..core.models import Document, StorageConfig


class BaseStorage(ABC):
    """Abstract base class for storage adapters."""

    config: StorageConfig

    def __init__(self, config: StorageConfig):
        """
        Initialize storage adapter.

        Args:
            config: Storage configuration
        """
        self.config = config

    @abstractmethod
    async def initialize(self) -> None:
        """Initialize the storage backend and create collections if needed."""
        pass  # pragma: no cover

    @abstractmethod
    async def store(self, document: Document) -> str:
        """
        Store a single document.

        Args:
            document: Document to store

        Returns:
            Document ID
        """
        pass  # pragma: no cover

    @abstractmethod
    async def store_batch(self, documents: list[Document]) -> list[str]:
        """
        Store multiple documents in batch.

        Args:
            documents: List of documents to store

        Returns:
            List of document IDs
        """
        pass  # pragma: no cover

    @abstractmethod
    async def retrieve(self, document_id: str) -> Document | None:
        """
        Retrieve a document by ID.

        Args:
            document_id: Document ID

        Returns:
            Document or None if not found
        """
        pass  # pragma: no cover

    @abstractmethod
    async def search(
        self, query: str, limit: int = 10, threshold: float = 0.7
    ) -> AsyncGenerator[Document, None]:
        """
        Search for documents.

        Args:
            query: Search query
            limit: Maximum number of results
            threshold: Similarity threshold

        Yields:
            Matching documents
        """
        return  # type: ignore # pragma: no cover
        yield  # pragma: no cover

    @abstractmethod
    async def delete(self, document_id: str) -> bool:
        """
        Delete a document.

        Args:
            document_id: Document ID

        Returns:
            True if deleted successfully
        """
        pass  # pragma: no cover

    @abstractmethod
    async def count(self) -> int:
        """
        Get total document count.

        Returns:
            Number of documents
        """
        pass  # pragma: no cover
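Because every adapter implements this interface, pipeline code can stay backend-agnostic. A minimal sketch of such a helper (the reingest function is hypothetical, not part of this commit):

    # Hypothetical backend-agnostic helper written against BaseStorage.
    from ingest_pipeline.core.models import Document
    from ingest_pipeline.storage.base import BaseStorage


    async def reingest(storage: BaseStorage, documents: list[Document]) -> int:
        """Initialize the backend, store a batch, and report the stored count."""
        await storage.initialize()
        ids = await storage.store_batch(documents)
        return len(ids)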
296
ingest_pipeline/storage/openwebui.py
Normal file
@@ -0,0 +1,296 @@
"""Open WebUI storage adapter."""

from collections.abc import AsyncGenerator
from uuid import UUID

import httpx
from typing_extensions import override

from ..core.exceptions import StorageError
from ..core.models import Document, StorageConfig
from ..utils.vectorizer import Vectorizer
from .base import BaseStorage


class OpenWebUIStorage(BaseStorage):
    """Storage adapter for Open WebUI knowledge endpoints."""

    client: httpx.AsyncClient
    vectorizer: Vectorizer

    def __init__(self, config: StorageConfig):
        """
        Initialize Open WebUI storage.

        Args:
            config: Storage configuration
        """
        super().__init__(config)

        self.client = httpx.AsyncClient(
            base_url=str(config.endpoint),
            headers={
                "Authorization": f"Bearer {config.api_key}" if config.api_key else "",
                "Content-Type": "application/json",
            },
            timeout=30.0,
        )
        self.vectorizer = Vectorizer(config)

    @override
    async def initialize(self) -> None:
        """Initialize Open WebUI connection."""
        try:
            # Test connection with OpenWebUI knowledge API
            response = await self.client.get("/api/v1/knowledge/")
            response.raise_for_status()

            # Check if collection (knowledge base) exists, create if not
            knowledge_bases = response.json()
            collection_exists = any(
                kb.get("name") == self.config.collection_name for kb in knowledge_bases
            )

            if not collection_exists:
                await self._create_collection()

        except Exception as e:
            raise StorageError(f"Failed to initialize Open WebUI: {e}") from e

    async def _create_collection(self) -> None:
        """Create knowledge base in Open WebUI."""
        try:
            response = await self.client.post(
                "/api/v1/knowledge/create",
                json={
                    "name": self.config.collection_name,
                    "description": "Documents ingested from various sources",
                },
            )
            response.raise_for_status()
        except Exception as e:
            raise StorageError(f"Failed to create knowledge base: {e}") from e

    @override
    async def store(self, document: Document) -> str:
        """
        Store a document in Open WebUI.

        Args:
            document: Document to store

        Returns:
            Document ID
        """
        try:
            # Vectorize if needed
            if document.vector is None:
                document.vector = await self.vectorizer.vectorize(document.content)

            # Prepare document data
            doc_data = {
                "id": str(document.id),
                "collection": self.config.collection_name,
                "content": document.content,
                "metadata": {
                    **document.metadata,
                    "timestamp": document.metadata["timestamp"].isoformat(),
                    "source": document.source.value,
                },
                "embedding": document.vector,
            }

            # Store document
            response = await self.client.post(
                f"/api/knowledge/collections/{self.config.collection_name}/documents", json=doc_data
            )
            response.raise_for_status()

            result = response.json()
            document_id = result.get("id") if isinstance(result, dict) else None
            return str(document_id) if document_id else str(document.id)

        except Exception as e:
            raise StorageError(f"Failed to store document: {e}") from e

    @override
    async def store_batch(self, documents: list[Document]) -> list[str]:
        """
        Store multiple documents in batch.

        Args:
            documents: List of documents

        Returns:
            List of document IDs
        """
        try:
            # Vectorize documents without vectors
            for doc in documents:
                if doc.vector is None:
                    doc.vector = await self.vectorizer.vectorize(doc.content)

            # Prepare batch data
            batch_data = []
            for doc in documents:
                batch_data.append(
                    {
                        "id": str(doc.id),
                        "content": doc.content,
                        "metadata": {
                            **doc.metadata,
                            "timestamp": doc.metadata["timestamp"].isoformat(),
                            "source": doc.source.value,
                        },
                        "embedding": doc.vector,
                    }
                )

            # Store batch
            response = await self.client.post(
                f"/api/knowledge/collections/{self.config.collection_name}/documents/batch",
                json={"documents": batch_data},
            )
            response.raise_for_status()

            result = response.json()
            ids = result.get("ids") if isinstance(result, dict) else None
            return ids if isinstance(ids, list) else [str(doc.id) for doc in documents]

        except Exception as e:
            raise StorageError(f"Failed to store batch: {e}") from e

    @override
    async def retrieve(self, document_id: str) -> Document | None:
        """
        Retrieve a document from Open WebUI.

        Args:
            document_id: Document ID

        Returns:
            Document or None
        """
        try:
            response = await self.client.get(
                f"/api/knowledge/collections/{self.config.collection_name}/documents/{document_id}"
            )

            if response.status_code == 404:
                return None

            response.raise_for_status()
            data = response.json()

            # Reconstruct document
            metadata = data.get("metadata", {})
            return Document(
                id=UUID(document_id),
                content=data["content"],
                metadata=metadata,
                vector=data.get("embedding"),
                source=metadata.get("source", "unknown"),
                collection=self.config.collection_name,
            )

        except Exception:
            return None

    @override
    async def search(
        self, query: str, limit: int = 10, threshold: float = 0.7
    ) -> AsyncGenerator[Document, None]:
        """
        Search for documents in Open WebUI.

        Args:
            query: Search query
            limit: Maximum results
            threshold: Similarity threshold

        Yields:
            Matching documents
        """
        try:
            # Vectorize query
            query_vector = await self.vectorizer.vectorize(query)

            # Perform search
            response = await self.client.post(
                f"/api/knowledge/collections/{self.config.collection_name}/search",
                json={
                    "query": query,
                    "embedding": query_vector,
                    "limit": limit,
                    "threshold": threshold,
                },
            )
            response.raise_for_status()

            results = response.json()

            for result in results.get("documents", []):
                metadata = result.get("metadata", {})
                doc = Document(
                    id=result["id"],
                    content=result["content"],
                    metadata=metadata,
                    vector=result.get("embedding"),
                    source=metadata.get("source", "unknown"),
                    collection=self.config.collection_name,
                )
                yield doc

        except Exception as e:
            raise StorageError(f"Search failed: {e}") from e

    @override
    async def delete(self, document_id: str) -> bool:
        """
        Delete a document from Open WebUI.

        Args:
            document_id: Document ID

        Returns:
            True if deleted
        """
        try:
            response = await self.client.delete(
                f"/api/knowledge/collections/{self.config.collection_name}/documents/{document_id}"
            )
            return response.status_code in [200, 204]
        except Exception:
            return False

    @override
    async def count(self) -> int:
        """
        Get document count in collection.

        Returns:
            Number of documents
        """
        try:
            response = await self.client.get(
                f"/api/knowledge/collections/{self.config.collection_name}/stats"
            )
            response.raise_for_status()

            stats = response.json()
            count = stats.get("document_count") if isinstance(stats, dict) else None
            return int(count) if isinstance(count, (int, str)) else 0
        except Exception:
            return 0

    async def __aenter__(self) -> "OpenWebUIStorage":
        """Async context manager entry."""
        await self.initialize()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object | None,
    ) -> None:
        """Async context manager exit."""
        await self.client.aclose()
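Since the adapter defines __aenter__/__aexit__, it can be driven as an async context manager. A usage sketch, with illustrative endpoint and collection values and assuming StorageConfig accepts these keyword arguments:

    # Illustrative usage sketch, not part of this commit.
    import asyncio

    from ingest_pipeline.core.models import StorageConfig
    from ingest_pipeline.storage.openwebui import OpenWebUIStorage


    async def main() -> None:
        config = StorageConfig(
            endpoint="http://chat.lab",  # assumed StorageConfig field names
            collection_name="docs",
            api_key=None,
        )
        # initialize() runs on entry; the httpx client is closed on exit
        async with OpenWebUIStorage(config) as storage:
            print("documents in knowledge base:", await storage.count())


    asyncio.run(main())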
703
ingest_pipeline/storage/weaviate.py
Normal file
@@ -0,0 +1,703 @@
"""Weaviate storage adapter."""

from collections.abc import AsyncGenerator
from datetime import UTC, datetime
from typing import cast
from uuid import UUID

import weaviate
from typing_extensions import override
from weaviate.classes.config import Configure, DataType, Property

from ..core.exceptions import StorageError
from ..core.models import Document, DocumentMetadata, IngestionSource, StorageConfig
from ..utils.vectorizer import Vectorizer
from .base import BaseStorage


class WeaviateStorage(BaseStorage):
    """Storage adapter for Weaviate."""

    client: weaviate.WeaviateClient | None
    vectorizer: Vectorizer
    collection_name: str

    def __init__(self, config: StorageConfig):
        """
        Initialize Weaviate storage.

        Args:
            config: Storage configuration
        """
        super().__init__(config)
        self.client = None
        self.vectorizer = Vectorizer(config)
        self.collection_name = config.collection_name.capitalize()

    @override
    async def initialize(self) -> None:
        """Initialize Weaviate client and create collection if needed."""
        try:
            # Connect to Weaviate.
            # Parse the endpoint - Weaviate expects just the hostname without protocol.
            endpoint_str = str(self.config.endpoint).replace("http://", "").replace("https://", "")

            # Split host and port if a port is specified in the URL
            if ":" in endpoint_str and "/" not in endpoint_str:
                # Only split if it's a port number (no path)
                host, port_str = endpoint_str.rsplit(":", 1)
                http_port = int(port_str) if port_str.isdigit() else 80
            else:
                # Remove any path if present
                host = endpoint_str.split("/")[0]
                # For reverse proxy setups, use port 80
                http_port = 80

            # For reverse proxy setups, use an HTTP-only connection
            self.client = weaviate.WeaviateClient(
                connection_params=weaviate.connect.ConnectionParams.from_url(
                    url=f"http://{host}:{http_port}",
                    grpc_port=50051,  # Default gRPC port but will be ignored
                ),
                skip_init_checks=True,  # Skip gRPC health checks
                additional_config=weaviate.classes.init.AdditionalConfig(
                    timeout=weaviate.classes.init.Timeout(init=30, query=60, insert=120),
                ),
            )

            # Connect to the client
            self.client.connect()

            # Check if collection exists
            collections = self.client.collections.list_all()

            if self.collection_name not in collections:
                await self._create_collection()

        except Exception as e:
            raise StorageError(f"Failed to initialize Weaviate: {e}") from e

    async def _create_collection(self) -> None:
        """Create Weaviate collection with schema."""
        if not self.client:
            raise StorageError("Weaviate client not initialized")
        try:
            self.client.collections.create(
                name=self.collection_name,
                properties=[
                    Property(
                        name="content", data_type=DataType.TEXT, description="Document content"
                    ),
                    Property(name="source_url", data_type=DataType.TEXT, description="Source URL"),
                    Property(name="title", data_type=DataType.TEXT, description="Document title"),
                    Property(
                        name="description",
                        data_type=DataType.TEXT,
                        description="Document description",
                    ),
                    Property(
                        name="timestamp", data_type=DataType.DATE, description="Ingestion timestamp"
                    ),
                    Property(
                        name="content_type", data_type=DataType.TEXT, description="Content type"
                    ),
                    Property(name="word_count", data_type=DataType.INT, description="Word count"),
                    Property(
                        name="char_count", data_type=DataType.INT, description="Character count"
                    ),
                    Property(
                        name="source", data_type=DataType.TEXT, description="Ingestion source"
                    ),
                ],
                vectorizer_config=Configure.Vectorizer.none(),
            )
        except Exception as e:
            raise StorageError(f"Failed to create collection: {e}") from e

    @override
    async def store(self, document: Document) -> str:
        """
        Store a document in Weaviate.

        Args:
            document: Document to store

        Returns:
            Document ID
        """
        try:
            # Vectorize content if no vector provided
            if document.vector is None:
                document.vector = await self.vectorizer.vectorize(document.content)

            if not self.client:
                raise StorageError("Weaviate client not initialized")
            collection = self.client.collections.get(self.collection_name)

            # Prepare properties
            properties = {
                "content": document.content,
                "source_url": document.metadata["source_url"],
                "title": document.metadata.get("title", ""),
                "description": document.metadata.get("description", ""),
                "timestamp": document.metadata["timestamp"].isoformat(),
                "content_type": document.metadata["content_type"],
                "word_count": document.metadata["word_count"],
                "char_count": document.metadata["char_count"],
                "source": document.source.value,
            }

            # Insert with vector
            result = collection.data.insert(
                properties=properties, vector=document.vector, uuid=str(document.id)
            )

            return str(result)

        except Exception as e:
            raise StorageError(f"Failed to store document: {e}") from e

    @override
    async def store_batch(self, documents: list[Document]) -> list[str]:
        """
        Store multiple documents in batch.

        Args:
            documents: List of documents

        Returns:
            List of successfully stored document IDs
        """
        try:
            if not self.client:
                raise StorageError("Weaviate client not initialized")
            collection = self.client.collections.get(self.collection_name)

            # Vectorize documents without vectors
            for doc in documents:
                if doc.vector is None:
                    doc.vector = await self.vectorizer.vectorize(doc.content)

            # Use individual inserts to avoid gRPC batch issues
            successful_ids: list[str] = []

            for doc in documents:
                try:
                    properties = {
                        "content": doc.content,
                        "source_url": doc.metadata["source_url"],
                        "title": doc.metadata.get("title", ""),
                        "description": doc.metadata.get("description", ""),
                        "timestamp": doc.metadata["timestamp"].isoformat(),
                        "content_type": doc.metadata["content_type"],
                        "word_count": doc.metadata["word_count"],
                        "char_count": doc.metadata["char_count"],
                        "source": doc.source.value,
                    }

                    # Insert individual document
                    collection.data.insert(
                        properties=properties,
                        vector=doc.vector,
                        uuid=str(doc.id),
                    )
                    successful_ids.append(str(doc.id))

                except Exception as e:
                    print(f"Failed to store document {doc.id}: {e}")
                    continue

            if not successful_ids:
                raise StorageError("All documents in batch failed to store")

            return successful_ids

        except Exception as e:
            raise StorageError(f"Failed to store batch: {e}") from e

    @override
    async def retrieve(self, document_id: str) -> Document | None:
        """
        Retrieve a document from Weaviate.

        Args:
            document_id: Document ID

        Returns:
            Document or None
        """
        try:
            if not self.client:
                raise StorageError("Weaviate client not initialized")
            collection = self.client.collections.get(self.collection_name)
            result = collection.query.fetch_object_by_id(document_id)

            if not result:
                return None

            # Reconstruct document
            props = result.properties
            metadata_dict = {
                "source_url": str(props["source_url"]),
                "title": str(props.get("title")) if props.get("title") else None,
                "description": str(props.get("description")) if props.get("description") else None,
                "timestamp": str(props["timestamp"]),
                "content_type": str(props["content_type"]),
                "word_count": int(str(props["word_count"])),
                "char_count": int(str(props["char_count"])),
            }
            metadata = cast(DocumentMetadata, cast(object, metadata_dict))

            vector_raw = result.vector.get("default") if result.vector else None
            vector: list[float] | None = None
            if isinstance(vector_raw, list) and vector_raw:
                first_elem = vector_raw[0]
                if isinstance(first_elem, list):
                    # Nested list - take the first one and ensure all elements are numbers
                    nested_vector = first_elem
                    try:
                        vector = [float(x) for x in nested_vector if isinstance(x, (int, float))]
                    except (ValueError, TypeError):
                        vector = None
                else:
                    # Flat list - ensure all elements are numbers
                    try:
                        vector = [float(x) for x in vector_raw if isinstance(x, (int, float))]
                    except (ValueError, TypeError):
                        vector = None

            return Document(
                id=UUID(document_id),
                content=str(props["content"]),
                metadata=metadata,
                vector=vector,
                source=IngestionSource.WEB,  # Default to WEB
                collection=self.collection_name,
            )

        except Exception:
            return None

    @override
    async def search(
        self, query: str, limit: int = 10, threshold: float = 0.7
    ) -> AsyncGenerator[Document, None]:
        """
        Search for documents in Weaviate.

        Args:
            query: Search query
            limit: Maximum results
            threshold: Similarity threshold

        Yields:
            Matching documents
        """
        try:
            # Vectorize query
            query_vector = await self.vectorizer.vectorize(query)

            if not self.client:
                raise StorageError("Weaviate client not initialized")
            collection = self.client.collections.get(self.collection_name)

            # Perform vector search
            results = collection.query.near_vector(
                near_vector=query_vector,
                limit=limit,
                distance=1 - threshold,  # Convert similarity to distance
                return_metadata=["distance"],
            )

            for result in results.objects:
                props = result.properties
                metadata_dict = {
                    "source_url": str(props["source_url"]),
                    "title": str(props.get("title")) if props.get("title") else None,
                    "description": str(props.get("description"))
                    if props.get("description")
                    else None,
                    "timestamp": str(props["timestamp"]),
                    "content_type": str(props["content_type"]),
                    "word_count": int(str(props["word_count"])),
                    "char_count": int(str(props["char_count"])),
                }
                metadata = cast(DocumentMetadata, cast(object, metadata_dict))

                vector_raw = result.vector.get("default") if result.vector else None
                vector: list[float] | None = None
                if isinstance(vector_raw, list) and vector_raw:
                    first_elem = vector_raw[0]
                    if isinstance(first_elem, list):
                        # Nested list - take the first one and ensure all elements are numbers
                        nested_vector = first_elem
                        try:
                            vector = [
                                float(x) for x in nested_vector if isinstance(x, (int, float))
                            ]
                        except (ValueError, TypeError):
                            vector = None
                    else:
                        # Flat list - ensure all elements are numbers
                        try:
                            vector = [float(x) for x in vector_raw if isinstance(x, (int, float))]
                        except (ValueError, TypeError):
                            vector = None

                doc = Document(
                    id=result.uuid,
                    content=str(props["content"]),
                    metadata=metadata,
                    vector=vector,
                    source=IngestionSource.WEB,  # Default to WEB
                    collection=self.collection_name,
                )
                yield doc

        except Exception as e:
            raise StorageError(f"Search failed: {e}") from e

    @override
    async def delete(self, document_id: str) -> bool:
        """
        Delete a document from Weaviate.

        Args:
            document_id: Document ID

        Returns:
            True if deleted
        """
        try:
            if not self.client:
                raise StorageError("Weaviate client not initialized")
            collection = self.client.collections.get(self.collection_name)
            collection.data.delete_by_id(document_id)
            return True
        except Exception:
            return False

    @override
    async def count(self) -> int:
        """
        Get document count in collection.

        Returns:
            Number of documents
        """
        try:
            if not self.client:
                return 0
            collection = self.client.collections.get(self.collection_name)
            result = collection.aggregate.over_all(total_count=True)
            return result.total_count or 0
        except Exception:
            return 0

    async def list_collections(self) -> list[str]:
        """
        List all available collections.

        Returns:
            List of collection names
        """
        try:
            if not self.client:
                raise StorageError("Weaviate client not initialized")

            return list(self.client.collections.list_all())

        except Exception as e:
            raise StorageError(f"Failed to list collections: {e}") from e

    async def sample_documents(self, limit: int = 5) -> list[Document]:
        """
        Get sample documents from the collection.

        Args:
            limit: Maximum number of documents to return

        Returns:
            List of sample documents
        """
        try:
            if not self.client:
                raise StorageError("Weaviate client not initialized")

            collection = self.client.collections.get(self.collection_name)

            # Query for sample documents
            response = collection.query.fetch_objects(limit=limit)

            documents = []
            for obj in response.objects:
                # Convert back to Document format
                props = obj.properties
                # Safely convert WeaviateField values
                word_count_val = props.get("word_count")
                if isinstance(word_count_val, (int, float)):
                    word_count = int(word_count_val)
                elif word_count_val:
                    word_count = int(str(word_count_val))
                else:
                    word_count = 0

                char_count_val = props.get("char_count")
                if isinstance(char_count_val, (int, float)):
                    char_count = int(char_count_val)
                elif char_count_val:
                    char_count = int(str(char_count_val))
                else:
                    char_count = 0

                doc = Document(
                    id=obj.uuid,
                    content=str(props.get("content", "")),
                    source=IngestionSource(str(props.get("source", "web"))),
                    metadata={
                        "source_url": str(props.get("source_url", "")),
                        "title": str(props.get("title", "")) if props.get("title") else None,
                        "description": str(props.get("description", ""))
                        if props.get("description")
                        else None,
                        "timestamp": datetime.fromisoformat(
                            str(props.get("timestamp", datetime.now(UTC).isoformat()))
                        ),
                        "content_type": str(props.get("content_type", "text/plain")),
                        "word_count": word_count,
                        "char_count": char_count,
                    },
                )
                documents.append(doc)

            return documents

        except Exception as e:
            raise StorageError(f"Failed to sample documents: {e}") from e

    async def search_documents(self, query: str, limit: int = 10) -> list[Document]:
        """
        Search documents in the collection.

        Args:
            query: Search query
            limit: Maximum number of results

        Returns:
            List of matching documents
        """
        try:
            if not self.client:
                raise StorageError("Weaviate client not initialized")

            collection = self.client.collections.get(self.collection_name)

            # Try hybrid search first, fall back to BM25 keyword search
            try:
                response = collection.query.hybrid(
                    query=query,
                    limit=limit,
                    return_metadata=["score"],
                )
            except Exception:
                # Fall back to BM25 keyword search if hybrid search fails
                response = collection.query.bm25(
                    query=query,
                    limit=limit,
                    return_metadata=["score"],
                )

            documents = []
            for obj in response.objects:
                # Convert back to Document format
                props = obj.properties

                # Safely convert WeaviateField values
                word_count_val = props.get("word_count")
                if isinstance(word_count_val, (int, float)):
                    word_count = int(word_count_val)
                elif word_count_val:
                    word_count = int(str(word_count_val))
                else:
                    word_count = 0

                char_count_val = props.get("char_count")
                if isinstance(char_count_val, (int, float)):
                    char_count = int(char_count_val)
                elif char_count_val:
                    char_count = int(str(char_count_val))
                else:
                    char_count = 0

                # Build metadata - note that search_score is not part of DocumentMetadata
                metadata: DocumentMetadata = {
                    "source_url": str(props.get("source_url", "")),
                    "title": str(props.get("title", "")) if props.get("title") else None,
                    "description": str(props.get("description", ""))
                    if props.get("description")
                    else None,
                    "timestamp": datetime.fromisoformat(
                        str(props.get("timestamp", datetime.now(UTC).isoformat()))
                    ),
                    "content_type": str(props.get("content_type", "text/plain")),
                    "word_count": word_count,
                    "char_count": char_count,
                }

                doc = Document(
                    id=obj.uuid,
                    content=str(props.get("content", "")),
                    source=IngestionSource(str(props.get("source", "web"))),
                    metadata=metadata,
                )
                documents.append(doc)

            return documents

        except Exception as e:
            raise StorageError(f"Failed to search documents: {e}") from e

    async def list_documents(self, limit: int = 100, offset: int = 0) -> list[dict[str, str | int]]:
        """
        List documents in the collection with pagination.

        Args:
            limit: Maximum number of documents to return
            offset: Number of documents to skip

        Returns:
            List of document dictionaries with id, title, source_url, and content preview
        """
        try:
            if not self.client:
                raise StorageError("Weaviate client not initialized")

            collection = self.client.collections.get(self.collection_name)

            # Query documents with pagination
            response = collection.query.fetch_objects(
                limit=limit,
                offset=offset,
                return_metadata=["creation_time"],
            )

            documents = []
            for obj in response.objects:
                props = obj.properties
                content = str(props.get("content", ""))
                word_count_value = props.get("word_count", 0)
                # Convert WeaviateField to int
                if isinstance(word_count_value, (int, float)):
                    word_count = int(word_count_value)
                elif word_count_value:
                    word_count = int(str(word_count_value))
                else:
                    word_count = 0

                doc_info: dict[str, str | int] = {
                    "id": str(obj.uuid),
                    "title": str(props.get("title", "Untitled")),
                    "source_url": str(props.get("source_url", "")),
                    "content_preview": content[:200] + "..." if len(content) > 200 else content,
                    "word_count": word_count,
                    "timestamp": str(props.get("timestamp", "")),
                }
                documents.append(doc_info)

            return documents

        except Exception as e:
            raise StorageError(f"Failed to list documents: {e}") from e

    async def delete_documents(self, document_ids: list[str]) -> dict[str, bool]:
        """
        Delete multiple documents from Weaviate.

        Args:
            document_ids: List of document IDs to delete

        Returns:
            Dictionary mapping document IDs to deletion success status
        """
        results = {}

        try:
            if not self.client:
                raise StorageError("Weaviate client not initialized")

            collection = self.client.collections.get(self.collection_name)

            for doc_id in document_ids:
                try:
                    collection.data.delete_by_id(doc_id)
                    results[doc_id] = True
                except Exception:
                    results[doc_id] = False

            return results

        except Exception as e:
            raise StorageError(f"Failed to delete documents: {e}") from e

    async def delete_by_filter(self, filter_dict: dict[str, str]) -> int:
        """
        Delete documents matching a filter.

        Args:
            filter_dict: Filter criteria (e.g., {"source_url": "example.com"})

        Returns:
            Number of documents deleted
        """
        try:
            if not self.client:
                raise StorageError("Weaviate client not initialized")

            collection = self.client.collections.get(self.collection_name)

            # Build where filter
            where_filter = None
            if "source_url" in filter_dict:
                from weaviate.classes.query import Filter

                where_filter = Filter.by_property("source_url").equal(filter_dict["source_url"])

            # Get documents matching the filter
            if where_filter:
                response = collection.query.fetch_objects(
                    filters=where_filter,
                    limit=1000,  # Max batch size
                )
            else:
                response = collection.query.fetch_objects(
                    limit=1000  # Max batch size
                )

            # Delete matching documents
            deleted_count = 0
            for obj in response.objects:
                try:
                    collection.data.delete_by_id(obj.uuid)
                    deleted_count += 1
                except Exception:
                    continue

            return deleted_count

        except Exception as e:
            raise StorageError(f"Failed to delete by filter: {e}") from e

    async def delete_collection(self) -> bool:
        """
        Delete the entire collection.

        Returns:
            True if successful
        """
        try:
            if not self.client:
                raise StorageError("Weaviate client not initialized")

            # Delete the collection using the client's collections API
            self.client.collections.delete(self.collection_name)

            return True

        except Exception as e:
            raise StorageError(f"Failed to delete collection: {e}") from e

    def __del__(self) -> None:
        """Clean up client connection."""
        if self.client:
            self.client.close()
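A usage sketch for the Weaviate adapter; the endpoint and collection values are illustrative, and the StorageConfig keyword arguments are assumed to match the fields the adapter reads above:

    # Illustrative usage sketch, not part of this commit.
    import asyncio

    from ingest_pipeline.core.models import StorageConfig
    from ingest_pipeline.storage.weaviate import WeaviateStorage


    async def main() -> None:
        storage = WeaviateStorage(
            StorageConfig(
                endpoint="http://weaviate.yo",  # assumed StorageConfig field names
                collection_name="docs",
                api_key=None,
            )
        )
        await storage.initialize()  # connects and creates the collection if missing
        print("collections:", await storage.list_collections())
        print("document count:", await storage.count())


    asyncio.run(main())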
6
ingest_pipeline/utils/__init__.py
Normal file
@@ -0,0 +1,6 @@
"""Utility modules."""

from .metadata_tagger import MetadataTagger
from .vectorizer import Vectorizer

__all__ = ["MetadataTagger", "Vectorizer"]
BIN
ingest_pipeline/utils/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
ingest_pipeline/utils/__pycache__/vectorizer.cpython-312.pyc
Normal file
Binary file not shown.
269
ingest_pipeline/utils/metadata_tagger.py
Normal file
269
ingest_pipeline/utils/metadata_tagger.py
Normal file
@@ -0,0 +1,269 @@
|
||||
"""Metadata tagger for enriching documents with AI-generated tags and metadata."""
|
||||
|
||||
import json
|
||||
from datetime import UTC, datetime
|
||||
from typing import TypedDict
|
||||
|
||||
import httpx
|
||||
|
||||
from ..core.exceptions import IngestionError
|
||||
from ..core.models import Document
|
||||
|
||||
|
||||
class DocumentMetadata(TypedDict, total=False):
|
||||
"""Structured metadata for documents."""
|
||||
|
||||
tags: list[str]
|
||||
category: str
|
||||
summary: str
|
||||
key_topics: list[str]
|
||||
document_type: str
|
||||
language: str
|
||||
technical_level: str
|
||||
|
||||
|
||||
class MetadataTagger:
|
||||
"""Generates metadata tags for documents using language models."""
|
||||
|
||||
endpoint: str
|
||||
model: str
|
||||
client: httpx.AsyncClient
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
llm_endpoint: str = "http://llm.lab",
|
||||
model: str = "openai/gpt-4o-mini",
|
||||
):
|
||||
"""
|
||||
Initialize metadata tagger.
|
||||
|
||||
Args:
|
||||
llm_endpoint: LLM API endpoint
|
||||
model: Model to use for tagging
|
||||
"""
|
||||
self.endpoint = llm_endpoint
|
||||
self.model = model
|
||||
|
||||
# Get API key from environment
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load .env from the project root
|
||||
env_path = Path(__file__).parent.parent.parent / ".env"
|
||||
load_dotenv(env_path)
|
||||
|
||||
api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") or ""
|
||||
|
||||
headers = {"Content-Type": "application/json"}
|
||||
if api_key:
|
||||
headers["Authorization"] = f"Bearer {api_key}"
|
||||
|
||||
self.client = httpx.AsyncClient(timeout=60.0, headers=headers)
|
||||
|
||||
async def tag_document(
|
||||
self, document: Document, custom_instructions: str | None = None
|
||||
) -> Document:
|
||||
"""
|
||||
Analyze document and generate metadata tags.
|
||||
|
||||
Args:
|
||||
document: Document to tag
|
||||
custom_instructions: Optional custom instructions for tagging
|
||||
|
||||
Returns:
|
||||
Document with enriched metadata
|
||||
"""
|
||||
if not document.content:
|
||||
return document
|
||||
|
||||
try:
|
||||
# Generate metadata using LLM
|
||||
metadata = await self._generate_metadata(
|
||||
document.content,
|
||||
document.metadata.get("title") if document.metadata else None,
|
||||
custom_instructions
|
||||
)
|
||||
|
||||
# Merge with existing metadata - preserve required fields
|
||||
from ..core.models import DocumentMetadata as CoreDocumentMetadata
|
||||
|
||||
updated_metadata: CoreDocumentMetadata = {
|
||||
"source_url": document.metadata.get("source_url", ""),
|
||||
"title": metadata.get("title") or document.metadata.get("title"),
|
||||
"description": metadata.get("summary") or document.metadata.get("description"),
|
||||
"timestamp": document.metadata.get("timestamp", datetime.now(UTC)),
|
||||
"content_type": document.metadata.get("content_type", "text/plain"),
|
||||
"word_count": document.metadata.get("word_count", len(document.content.split())),
|
||||
"char_count": document.metadata.get("char_count", len(document.content)),
|
||||
}
|
||||
|
||||
# Store additional metadata as extra fields in the document's metadata
|
||||
# Note: Since DocumentMetadata is a TypedDict, we can only include the defined fields
|
||||
# Additional metadata like tags, category, etc. would need to be stored separately
|
||||
# or the DocumentMetadata model would need to be extended
|
||||
|
||||
document.metadata = updated_metadata
|
||||
|
||||
return document
|
||||
|
||||
except Exception as e:
|
||||
raise IngestionError(f"Failed to tag document: {e}") from e
|
||||
|
||||
async def tag_batch(
|
||||
self,
|
||||
documents: list[Document],
|
||||
custom_instructions: str | None = None,
|
||||
) -> list[Document]:
|
||||
"""
|
||||
Tag multiple documents with metadata.
|
||||
|
||||
Args:
|
||||
documents: Documents to tag
|
||||
custom_instructions: Optional custom instructions
|
||||
|
||||
Returns:
|
||||
Documents with enriched metadata
|
||||
"""
|
||||
tagged_docs: list[Document] = []
|
||||
|
||||
for doc in documents:
|
||||
tagged_doc = await self.tag_document(doc, custom_instructions)
|
||||
tagged_docs.append(tagged_doc)
|
||||
|
||||
return tagged_docs
|
||||
|
||||
    async def _generate_metadata(
        self,
        content: str,
        title: str | None = None,
        custom_instructions: str | None = None,
    ) -> DocumentMetadata:
        """
        Generate metadata using LLM.

        Args:
            content: Document content
            title: Document title
            custom_instructions: Optional custom instructions

        Returns:
            Generated metadata dictionary
        """
        # Prepare the prompt
        system_prompt = """You are a document metadata tagger. Analyze the given content and generate relevant metadata.

Return a JSON object with the following structure:
{
    "tags": ["tag1", "tag2", ...],  # 3-7 relevant topic tags
    "category": "string",  # Main category
    "summary": "string",  # 1-2 sentence summary
    "key_topics": ["topic1", "topic2", ...],  # Main topics discussed
    "document_type": "string",  # Type of document (e.g., "technical", "tutorial", "reference")
    "language": "string",  # Primary language (e.g., "en", "es")
    "technical_level": "string"  # One of: "beginner", "intermediate", "advanced"
}"""

        if custom_instructions:
            system_prompt += f"\n\nAdditional instructions: {custom_instructions}"

        # Prepare user prompt
        user_prompt = "Document to analyze:\n"
        if title:
            user_prompt += f"Title: {title}\n"
        user_prompt += f"Content:\n{content[:3000]}"  # Limit content length

        # Call LLM
        response = await self.client.post(
            f"{self.endpoint}/v1/chat/completions",
            json={
                "model": self.model,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                "temperature": 0.3,
                "max_tokens": 500,
                "response_format": {"type": "json_object"},
            },
        )
        response.raise_for_status()

        result = response.json()
        if not isinstance(result, dict):
            raise IngestionError("Invalid response format from LLM")

        # Extract content from response
        choices = result.get("choices", [])
        if not choices:
            raise IngestionError("No response from LLM")

        message = choices[0].get("message", {})
        content_str = message.get("content", "{}")

        try:
            metadata = json.loads(content_str)
        except json.JSONDecodeError as e:
            raise IngestionError(f"Failed to parse LLM response: {e}") from e

        # Validate and sanitize metadata
        return self._sanitize_metadata(metadata)

    def _sanitize_metadata(self, metadata: dict[str, object]) -> DocumentMetadata:
        """
        Sanitize and validate metadata.

        Args:
            metadata: Raw metadata from LLM

        Returns:
            Sanitized metadata
        """
        sanitized: DocumentMetadata = {}

        # Tags
        if "tags" in metadata and isinstance(metadata["tags"], list):
            tags = [str(tag).lower().strip() for tag in metadata["tags"][:10]]
            sanitized["tags"] = [tag for tag in tags if tag]

        # Category
        if "category" in metadata:
            sanitized["category"] = str(metadata["category"]).strip()

        # Summary
        if "summary" in metadata:
            summary = str(metadata["summary"]).strip()
            if summary:
                sanitized["summary"] = summary[:500]  # Limit length

        # Key topics
        if "key_topics" in metadata and isinstance(metadata["key_topics"], list):
            topics = [str(topic).strip() for topic in metadata["key_topics"][:10]]
            sanitized["key_topics"] = [topic for topic in topics if topic]

        # Document type
        if "document_type" in metadata:
            sanitized["document_type"] = str(metadata["document_type"]).strip()

        # Language
        if "language" in metadata:
            lang = str(metadata["language"]).strip().lower()
            if len(lang) == 2:  # Basic validation for ISO 639-1
                sanitized["language"] = lang

        # Technical level
        if "technical_level" in metadata:
            level = str(metadata["technical_level"]).strip().lower()
            if level in ["beginner", "intermediate", "advanced"]:
                sanitized["technical_level"] = level

        return sanitized

    async def __aenter__(self) -> "MetadataTagger":
        """Async context manager entry."""
        return self

    async def __aexit__(self, *args: object) -> None:
        """Async context manager exit."""
        await self.client.aclose()
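A minimal usage sketch of the tagger's async context-manager protocol, for orientation only. The constructor arguments sit earlier in this file, outside this excerpt, so `make_tagger()` below is a hypothetical factory standing in for the real construction:

    # Hypothetical driver; make_tagger() stands in for the real constructor call.
    import asyncio

    async def tag_all(documents):
        async with make_tagger() as tagger:
            # tag_batch() runs tag_document() over each item sequentially.
            return await tagger.tag_batch(
                documents,
                custom_instructions="Prefer infrastructure-related tags",
            )

    # tagged = asyncio.run(tag_all(docs))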
220
ingest_pipeline/utils/vectorizer.py
Normal file
@@ -0,0 +1,220 @@
"""Vectorizer utility for generating embeddings."""

import os
from pathlib import Path
from types import TracebackType
from typing import Self

import httpx
from dotenv import load_dotenv

from ..core.exceptions import VectorizationError
from ..core.models import StorageConfig, VectorConfig


class Vectorizer:
    """Handles text vectorization using LLM endpoints."""

    endpoint: str
    model: str
    dimension: int
    client: httpx.AsyncClient

    def __init__(self, config: StorageConfig | VectorConfig):
        """
        Initialize vectorizer.

        Args:
            config: Configuration with embedding details
        """
        if isinstance(config, StorageConfig):
            # StorageConfig carries no embedding details, so fall back to the
            # default local endpoint and model.
            self.endpoint = "http://llm.lab"
            self.model = "ollama/bge-m3:latest"
            self.dimension = 1024
        else:
            self.endpoint = str(config.embedding_endpoint)
            self.model = config.model
            self.dimension = config.dimension

        # Get API key from the environment; load .env from the project root
        env_path = Path(__file__).parent.parent.parent / ".env"
        load_dotenv(env_path)

        api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") or ""

        headers = {"Content-Type": "application/json"}
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"

        self.client = httpx.AsyncClient(timeout=60.0, headers=headers)
    async def vectorize(self, text: str) -> list[float]:
        """
        Generate embedding vector for text.

        Args:
            text: Text to vectorize

        Returns:
            Embedding vector
        """
        if not text:
            raise VectorizationError("Cannot vectorize empty text")

        try:
            # Route the request based on model type
            if "ollama" in self.model:
                return await self._ollama_embed(text)
            return await self._openai_embed(text)

        except Exception as e:
            raise VectorizationError(f"Vectorization failed: {e}") from e

    async def vectorize_batch(self, texts: list[str]) -> list[list[float]]:
        """
        Generate embeddings for multiple texts.

        Args:
            texts: List of texts to vectorize

        Returns:
            List of embedding vectors
        """
        vectors: list[list[float]] = []

        for text in texts:
            vector = await self.vectorize(text)
            vectors.append(vector)

        return vectors

    async def _ollama_embed(self, text: str) -> list[float]:
        """
        Generate embedding using Ollama via OpenAI-compatible endpoint.

        Args:
            text: Text to embed

        Returns:
            Embedding vector
        """
        # Keep the full model name for OpenAI-compatible endpoints
        model_name = self.model

        # Use OpenAI-compatible endpoint for ollama models
        response = await self.client.post(
            f"{self.endpoint}/v1/embeddings",
            json={
                "model": model_name,
                "input": text,
            },
        )
        _ = response.raise_for_status()

        response_data = response.json()
        if not isinstance(response_data, dict):
            raise VectorizationError("Invalid response format from OpenAI-compatible API")

        # Parse OpenAI-compatible response format
        embeddings_raw = response_data.get("data", [])
        if not isinstance(embeddings_raw, list) or not embeddings_raw:
            raise VectorizationError("No embeddings returned")

        first_embedding_data = embeddings_raw[0]
        if not isinstance(first_embedding_data, dict):
            raise VectorizationError("Invalid embedding data format")

        embedding_raw = first_embedding_data.get("embedding")
        if not isinstance(embedding_raw, list):
            raise VectorizationError("Invalid embedding format")

        # Convert to float list and validate
        embedding: list[float] = []
        for item in embedding_raw:
            if isinstance(item, (int, float)):
                embedding.append(float(item))
            else:
                raise VectorizationError(f"Invalid embedding value type: {type(item)}")

        # Ensure correct dimension: truncate or zero-pad as needed
        if len(embedding) != self.dimension:
            if len(embedding) > self.dimension:
                embedding = embedding[: self.dimension]
            else:
                embedding.extend([0.0] * (self.dimension - len(embedding)))

        return embedding

    async def _openai_embed(self, text: str) -> list[float]:
        """
        Generate embedding using OpenAI-compatible API.

        Args:
            text: Text to embed

        Returns:
            Embedding vector
        """
        response = await self.client.post(
            f"{self.endpoint}/v1/embeddings",
            json={
                "model": self.model,
                "input": text,
            },
        )
        _ = response.raise_for_status()

        response_data = response.json()
        if not isinstance(response_data, dict):
            raise VectorizationError("Invalid response format from OpenAI API")

        embeddings_raw = response_data.get("data", [])
        if not isinstance(embeddings_raw, list) or not embeddings_raw:
            raise VectorizationError("No embeddings returned")

        first_embedding_data = embeddings_raw[0]
        if not isinstance(first_embedding_data, dict):
            raise VectorizationError("Invalid embedding data format")

        embedding_raw = first_embedding_data.get("embedding")
        if not isinstance(embedding_raw, list):
            raise VectorizationError("Invalid embedding format")

        # Convert to float list and validate
        embedding: list[float] = []
        for item in embedding_raw:
            if isinstance(item, (int, float)):
                embedding.append(float(item))
            else:
                raise VectorizationError(f"Invalid embedding value type: {type(item)}")

        # Ensure correct dimension: truncate or zero-pad as needed
        if len(embedding) != self.dimension:
            if len(embedding) > self.dimension:
                embedding = embedding[: self.dimension]
            else:
                embedding.extend([0.0] * (self.dimension - len(embedding)))

        return embedding

    async def __aenter__(self) -> Self:
        """Async context manager entry."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Async context manager exit."""
        await self.client.aclose()
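A minimal end-to-end sketch of the vectorizer above. It assumes VectorConfig accepts `embedding_endpoint`, `model`, and `dimension` as keyword arguments (the three fields `__init__` reads); the endpoint and model values mirror the defaults hardcoded for the StorageConfig branch:

    import asyncio

    from ingest_pipeline.core.models import VectorConfig
    from ingest_pipeline.utils.vectorizer import Vectorizer

    async def embed(texts: list[str]) -> list[list[float]]:
        # Keyword names are an assumption; __init__ only reads these attributes.
        config = VectorConfig(
            embedding_endpoint="http://llm.lab",
            model="ollama/bge-m3:latest",
            dimension=1024,
        )
        # The async context manager closes the underlying httpx client on exit.
        async with Vectorizer(config) as vectorizer:
            return await vectorizer.vectorize_batch(texts)

    # vectors = asyncio.run(embed(["hybrid search", "vector database"]))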
78
pyproject.toml
Normal file
@@ -0,0 +1,78 @@
[project]
name = "ingest-pipeline"
version = "0.1.0"
description = "Document ingestion pipeline with Prefect orchestration"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "prefect>=2.14.0",
    "pydantic>=2.5.0",
    "pydantic-settings>=2.1.0",
    "firecrawl-py>=0.0.1",
    "gitpython>=3.1.40",
    "weaviate-client>=4.4.0",
    "httpx>=0.25.0",
    "typer>=0.9.0",
    "rich>=13.7.0",
    "textual>=0.50.0",
    "python-dotenv>=1.0.0",
]

[project.scripts]
ingest = "ingest_pipeline.cli.main:app"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["ingest_pipeline"]

[tool.uv]
dev-dependencies = [
    "pytest>=7.4.0",
    "pytest-asyncio>=0.21.0",
    "pytest-cov>=4.1.0",
    "mypy>=1.7.0",
    "ruff>=0.1.0",
    "basedpyright>=1.31.4",
]

[tool.ruff]
line-length = 100
target-version = "py311"

[tool.ruff.lint]
select = [
    "E",   # pycodestyle errors
    "W",   # pycodestyle warnings
    "F",   # pyflakes
    "I",   # isort
    "B",   # flake8-bugbear
    "C4",  # flake8-comprehensions
    "UP",  # pyupgrade
]
ignore = [
    "E501",  # line too long (handled by formatter)
]

[tool.ruff.lint.per-file-ignores]
"ingest_pipeline/cli/main.py" = ["B008"]  # Typer uses function calls in defaults

[tool.mypy]
python_version = "3.11"
strict = true
warn_return_any = true
warn_unused_configs = true
ignore_missing_imports = true
# Allow AsyncGenerator types in overrides
disable_error_code = ["override"]

[tool.pytest.ini_options]
asyncio_mode = "auto"
testpaths = ["tests"]
pythonpath = ["."]

[tool.coverage.run]
source = ["ingest_pipeline"]
omit = ["*/tests/*", "*/__main__.py"]
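The `ingest` console script above maps to a Typer app; an equivalent programmatic entry point (a sketch, inferred only from the `ingest_pipeline.cli.main:app` target above) would be:

    # Equivalent of the `ingest` console script from [project.scripts].
    from ingest_pipeline.cli.main import app

    if __name__ == "__main__":
        app()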
3
tui
Executable file
@@ -0,0 +1,3 @@
#!/usr/bin/env bash
cd "$(dirname "$0")"
uv run python -m ingest_pipeline tui