commit 94ddcfeff6
2025-09-15 12:35:42 -04:00
94 changed files with 9583 additions and 0 deletions


@@ -0,0 +1,11 @@
{
"permissions": {
"allow": [
"mcp__context7__resolve-library-id",
"mcp__context7__get-library-docs",
"mcp__sequential-thinking__sequentialthinking"
],
"deny": [],
"ask": []
}
}

.env Normal file

@@ -0,0 +1,51 @@
WEAVIATE_IS_LOCAL=True
# URL can be just a host or full URL; defaults shown below
WCD_URL=http://weaviate.yo # or http://localhost:8080
# LOCAL_WEAVIATE_PORT=8080 # optional override
# LOCAL_WEAVIATE_GRPC_PORT=50051 # optional override
# No API key required for local unless you enabled local auth
# WCD_API_KEY=
# API Keys (only if not using local/self-hosted services)
FIRECRAWL_API_KEY=dummy-key
OPENWEBUI_API_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6IjlmNjEwODg2LWRhM2MtNDQ4YS05OWE0LTYyZGEyZjIyZjJiNiJ9.W-dqabcE4F-LQ--k2yrJM_KEBDB-wi1CmoahlN1tQbY
OPENWEBUI_API_URL=http://chat.lab
WEAVIATE_API_KEY=
OPENAI_API_KEY=sk-1234
LLM_API_KEY=sk-1234
# Endpoints
LLM_ENDPOINT=http://llm.lab
WEAVIATE_ENDPOINT=http://weaviate.yo
OPENWEBUI_ENDPOINT=http://chat.lab
FIRECRAWL_ENDPOINT=http://crawl.lab:30002
# Model Configuration
EMBEDDING_MODEL=ollama/bge-m3:latest
EMBEDDING_DIMENSION=1024
# Ingestion Settings
BATCH_SIZE=50
MAX_FILE_SIZE=1000000
MAX_CRAWL_DEPTH=5
MAX_CRAWL_PAGES=100
# Storage Settings
DEFAULT_STORAGE_BACKEND=weaviate
COLLECTION_PREFIX=docs
# Prefect Settings
PREFECT_API_URL=http://prefect.lab/api
PREFECT_API_KEY=0nR4WAkQ3q9MY1bjqATK6pVmolighvrS
PREFECT_WORK_POOL=default
# Scheduling
DEFAULT_SCHEDULE_INTERVAL=60
# Performance
MAX_CONCURRENT_TASKS=5
REQUEST_TIMEOUT=60
# Logging
LOG_LEVEL=INFO
FIRST_START_ELYSIA='1'

.env.example Normal file

@@ -0,0 +1,39 @@
# API Keys (only if not using local/self-hosted services)
FIRECRAWL_API_KEY=
OPENWEBUI_API_KEY=
WEAVIATE_API_KEY=
# Endpoints
LLM_ENDPOINT=http://llm.lab
WEAVIATE_ENDPOINT=http://weaviate.yo
OPENWEBUI_ENDPOINT=http://chat.lab
FIRECRAWL_ENDPOINT=http://crawl.lab:30002
# Model Configuration
EMBEDDING_MODEL=ollama/bge-m3:latest
EMBEDDING_DIMENSION=1024
# Ingestion Settings
BATCH_SIZE=50
MAX_FILE_SIZE=1000000
MAX_CRAWL_DEPTH=5
MAX_CRAWL_PAGES=100
# Storage Settings
DEFAULT_STORAGE_BACKEND=weaviate
COLLECTION_PREFIX=docs
# Prefect Settings
PREFECT_API_URL=
PREFECT_API_KEY=
PREFECT_WORK_POOL=default
# Scheduling
DEFAULT_SCHEDULE_INTERVAL=60
# Performance
MAX_CONCURRENT_TASKS=5
REQUEST_TIMEOUT=60
# Logging
LOG_LEVEL=INFO

CLAUDE.md Normal file

@@ -0,0 +1,100 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## Project Overview
This is a modular document ingestion pipeline that uses Prefect to orchestrate ingestion from web/documentation sites (via Firecrawl) and Git repositories (via Repomix) into a Weaviate vector database or Open WebUI knowledge endpoints.
## Development Commands
### Environment Setup
```bash
# Install dependencies using uv (required)
uv sync
# Activate virtual environment
source .venv/bin/activate
# Install repomix globally (required for repository ingestion)
npm install -g repomix
# Configure environment
cp .env.example .env
# Edit .env with your settings
```
### Running the Application
```bash
# One-time ingestion
python -m ingest_pipeline ingest <url> --type web --storage weaviate
# Schedule recurring ingestion
python -m ingest_pipeline schedule <name> <url> --type web --storage weaviate --cron "0 2 * * *"
# Start deployment server
python -m ingest_pipeline serve
# View configuration
python -m ingest_pipeline config
```
### Code Quality
```bash
# Run linting
uv run ruff check .
uv run ruff format .
# Type checking
uv run mypy ingest_pipeline
# Install dev dependencies
uv sync --dev
```
## Architecture
The pipeline follows a modular architecture with clear separation of concerns:
- **Ingestors** (`ingest_pipeline/ingestors/`): Abstract base class pattern for different data sources (Firecrawl for web, Repomix for repositories); a minimal sketch of this pattern follows the list
- **Storage Adapters** (`ingest_pipeline/storage/`): Abstract base class for storage backends (Weaviate, Open WebUI)
- **Prefect Flows** (`ingest_pipeline/flows/`): Orchestration layer using Prefect for scheduling and task management
- **CLI** (`ingest_pipeline/cli/main.py`): Typer-based command interface with commands: `ingest`, `schedule`, `serve`, `config`
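
A minimal sketch of the ingestor pattern (illustrative only; the real definitions live in `ingest_pipeline/ingestors/base.py` and method names may differ):

```python
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator


class BaseIngestor(ABC):
    """Interface implemented by the Firecrawl and Repomix ingestors."""

    @abstractmethod
    async def validate_source(self, url: str) -> bool:
        """Cheap pre-flight check before a full ingestion run."""

    @abstractmethod
    def ingest(self, url: str) -> AsyncIterator[dict[str, str]]:
        """Yield normalized documents from the source."""
```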
## Key Implementation Details
### Type Safety
- Strict typing enforced with no `Any` types allowed
- Modern typing syntax using `|` instead of `Union`
- Pydantic v2+ for all models and settings
- All models in `core/models.py` use TypedDict for metadata and strict Pydantic models
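
An illustrative shape for these models (field names are assumptions; see `core/models.py` for the real ones):

```python
from typing import TypedDict

from pydantic import BaseModel, ConfigDict


class DocumentMetadata(TypedDict):
    """Structured metadata attached to each ingested document."""
    source_url: str
    tags: list[str]


class Document(BaseModel):
    """Strict Pydantic v2 model: unknown fields are rejected, no implicit Any."""
    model_config = ConfigDict(strict=True, extra="forbid")

    title: str
    content: str
    metadata: DocumentMetadata
```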
### Configuration Management
- Settings loaded from `.env` file via Pydantic Settings
- Cached singleton pattern in `config/settings.py` using `@lru_cache`
- Environment-specific endpoints configured for local services (llm.lab, weaviate.yo, chat.lab)
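
The cached-singleton pattern reduces to roughly this (a sketch; the real `Settings` class defines many more fields):

```python
from functools import lru_cache

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Loaded from .env once per process; see .env.example for the full field list."""
    model_config = SettingsConfigDict(env_file=".env")

    llm_endpoint: str = "http://llm.lab"
    weaviate_endpoint: str = "http://weaviate.yo"
    embedding_model: str = "ollama/bge-m3:latest"


@lru_cache
def get_settings() -> Settings:
    """Return the cached, process-wide Settings instance."""
    return Settings()
```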
### Flow Orchestration
- Main ingestion flow in `flows/ingestion.py` with retry logic and task decorators
- Deployment scheduling in `flows/scheduler.py` supporting both cron and interval schedules
- Tasks use Prefect's `@task` decorator with retries and tags for monitoring
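
A hedged sketch of the task/flow shape (retry counts and names are illustrative, not copied from `flows/ingestion.py`):

```python
from prefect import flow, task


@task(retries=3, retry_delay_seconds=30, tags=["ingestion"])
async def fetch_documents(url: str) -> list[str]:
    """Fetch raw documents; Prefect retries transient failures automatically."""
    return []  # the real task delegates to the Firecrawl/Repomix ingestors


@flow(name="ingestion")
async def ingestion_flow(url: str) -> int:
    """Orchestrate fetch -> store; returns the number of documents fetched."""
    documents = await fetch_documents(url)
    return len(documents)
```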
### Storage Backends
- Weaviate: Uses batch ingestion with configurable batch size, automatic collection creation
- Open WebUI: Direct API integration for knowledge base management
- Both inherit from abstract `BaseStorage` class ensuring consistent interface
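
Roughly, the shared interface looks like this (only `initialize` and `search` are confirmed by CLI usage; `store_batch` is an assumed name):

```python
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator


class BaseStorage(ABC):
    """Interface shared by the Weaviate and Open WebUI adapters."""

    @abstractmethod
    async def initialize(self) -> None:
        """Connect and create the target collection if it is missing."""

    @abstractmethod
    async def store_batch(self, documents: list[dict[str, str]]) -> int:
        """Persist a batch of documents; return how many were stored."""

    @abstractmethod
    def search(self, query: str, limit: int = 10) -> AsyncIterator[object]:
        """Yield documents matching the query."""
```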
## Service Endpoints
- **LLM Proxy**: http://llm.lab (for embeddings and processing)
- **Weaviate**: http://weaviate.yo (vector database)
- **Open WebUI**: http://chat.lab (knowledge interface)
- **Firecrawl**: http://crawl.lab:30002 (web crawling service)
## Important Constraints
- Cyclomatic complexity must remain < 15 for all functions
- Maximum file size for ingestion: 1MB
- Batch size limits: 50-500 documents
- Concurrent task limit: 5 (configurable via MAX_CONCURRENT_TASKS)
- All async operations use proper async/await patterns

README.md Normal file

@@ -0,0 +1,150 @@
# Document Ingestion Pipeline
A modular, type-safe Python application using Prefect for scheduling ingestion jobs from web/documentation sites (via Firecrawl) and Git repositories (via Repomix) into Weaviate or Open WebUI knowledge endpoints.
## Features
- **Multiple Data Sources**:
- Web/documentation sites via Firecrawl
- Git repositories via Repomix
- **Multiple Storage Backends**:
- Weaviate vector database (self-hosted at http://weaviate.yo)
- Open WebUI knowledge endpoints (http://chat.lab)
- **Scheduling & Orchestration**:
- Prefect-based workflow orchestration
- Cron and interval-based scheduling
- Concurrent task execution
- **Type Safety**:
- Strict Python typing with no `Any` types
- Modern typing syntax (using `|` instead of `Union`)
- Pydantic models for validation
- **Code Quality**:
- Modular architecture
- Cyclomatic complexity < 15
- Clean separation of concerns
## Installation
```bash
# Install dependencies using uv (the supported workflow; see CLAUDE.md)
uv sync
# Install repomix globally (required for repository ingestion)
npm install -g repomix
# Copy and configure environment
cp .env.example .env
# Edit .env with your settings
```
## Usage
### One-time Ingestion
```bash
# Ingest a documentation site into Weaviate
python -m ingest_pipeline ingest https://docs.example.com --type web --storage weaviate
# Ingest a repository into Open WebUI
python -m ingest_pipeline ingest https://github.com/user/repo --type repository --storage open_webui
```
### Scheduled Ingestion
```bash
# Create a daily documentation crawl
python -m ingest_pipeline schedule daily-docs https://docs.example.com \
--type documentation \
--storage weaviate \
--cron "0 2 * * *"
# Create an hourly repository sync
python -m ingest_pipeline schedule repo-sync https://github.com/user/repo \
--type repository \
--storage open_webui \
--interval 60
```
### Serve Deployments
```bash
# Start serving scheduled deployments
python -m ingest_pipeline serve
```
### Configuration
```bash
# View current configuration
python -m ingest_pipeline config
```
## Architecture
```
ingest_pipeline/
├── core/ # Core models and exceptions
│ ├── models.py # Pydantic models with strict typing
│ └── exceptions.py # Custom exceptions
├── ingestors/ # Data source ingestors
│ ├── base.py # Abstract base ingestor
│ ├── firecrawl.py # Web/docs ingestion via Firecrawl
│ └── repomix.py # Repository ingestion via Repomix
├── storage/ # Storage adapters
│ ├── base.py # Abstract base storage
│ ├── weaviate.py # Weaviate adapter
│ └── openwebui.py # Open WebUI adapter
├── flows/ # Prefect flows
│ ├── ingestion.py # Main ingestion flow
│ └── scheduler.py # Deployment scheduling
├── config/ # Configuration management
│ └── settings.py # Settings with Pydantic
├── utils/ # Utilities
│ └── vectorizer.py # Text vectorization
└── cli/ # CLI interface
└── main.py # Typer-based CLI
```
## Environment Variables
- `FIRECRAWL_API_KEY`: API key for Firecrawl (optional)
- `LLM_ENDPOINT`: LLM proxy endpoint (default: http://llm.lab)
- `WEAVIATE_ENDPOINT`: Weaviate endpoint (default: http://weaviate.yo)
- `OPENWEBUI_ENDPOINT`: Open WebUI endpoint (default: http://chat.lab)
- `EMBEDDING_MODEL`: Model for embeddings (default: ollama/bge-m3:latest)
## Vectorization
The pipeline uses your LLM proxy at http://llm.lab with:
- Model: `ollama/gpt-oss:20b` for processing
- Embeddings: `ollama/bge-m3:latest` for vectorization
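
Assuming the proxy exposes an OpenAI-compatible `/v1/embeddings` route (an assumption; adjust the path for your proxy), fetching a vector looks roughly like:

```python
import os

import httpx


def embed(text: str) -> list[float]:
    """Fetch one embedding from the LLM proxy (OpenAI-style API assumed)."""
    response = httpx.post(
        "http://llm.lab/v1/embeddings",
        headers={"Authorization": f"Bearer {os.environ['LLM_API_KEY']}"},
        json={"model": "ollama/bge-m3:latest", "input": text},
        timeout=60,
    )
    response.raise_for_status()
    # Returned vectors should match EMBEDDING_DIMENSION=1024 for bge-m3
    return response.json()["data"][0]["embedding"]
```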
## Storage Backends
### Weaviate
- Endpoint: http://weaviate.yo
- Automatic collection creation
- Vector similarity search
- Batch ingestion support
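
A hedged sketch of batch ingestion with the v4 Python client (collection and property names are illustrative):

```python
import weaviate
from weaviate.util import generate_uuid5

# Assumes http://weaviate.yo serves on port 80 and gRPC on the default 50051;
# adjust host/port to your deployment.
client = weaviate.connect_to_local(host="weaviate.yo", port=80)
try:
    docs = client.collections.get("docs_example")
    with docs.batch.dynamic() as batch:
        batch.add_object(
            properties={"title": "Example", "content": "Hello, Weaviate"},
            uuid=generate_uuid5("https://docs.example.com/example"),
        )
finally:
    client.close()
```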
### Open WebUI
- Endpoint: http://chat.lab/docs
- Knowledge base integration
- Direct API access
- Document management
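
Listing knowledge bases uses the same `/api/v1/knowledge/` route the pipeline's `list-collections` command calls internally; a minimal standalone sketch:

```python
import asyncio
import os

import httpx


async def list_knowledge_bases() -> None:
    """Print each knowledge base and its file count."""
    async with httpx.AsyncClient(
        base_url="http://chat.lab",
        headers={"Authorization": f"Bearer {os.environ['OPENWEBUI_API_KEY']}"},
    ) as client:
        response = await client.get("/api/v1/knowledge/")
        response.raise_for_status()
        for kb in response.json():
            print(kb.get("name", "Unknown"), "-", len(kb.get("files", [])), "files")


asyncio.run(list_knowledge_bases())
```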
## Development
The codebase follows strict typing and quality standards:
- No use of `Any` type
- Modern Python typing syntax
- Cyclomatic complexity < 15
- Modular, testable architecture
## License
MIT

basedpyright.json Normal file

@@ -0,0 +1,24 @@
{
"include": [
"ingest_pipeline"
],
"exclude": [
"**/__pycache__",
"**/.pytest_cache",
"**/node_modules",
".venv"
],
"reportCallInDefaultInitializer": "none",
"reportUnknownVariableType": "warning",
"reportUnknownMemberType": "warning",
"reportUnknownArgumentType": "warning",
"reportUnknownLambdaType": "warning",
"reportUnknownParameterType": "warning",
"reportMissingParameterType": "warning",
"reportUnannotatedClassAttribute": "warning",
"reportAny": "warning",
"reportUnusedCallResult": "none",
"reportUnnecessaryIsInstance": "none",
"reportImplicitOverride": "none",
"reportDeprecated": "warning"
}

docs/elysia.md Normal file

@@ -0,0 +1,248 @@
38 async def output_resources(): │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/services/use │
│ r.py:223 in check_all_trees_timeout │
│ │
│ 220 │ │ Check all trees in all TreeManagers across all users and remove any │
│ not been active in the last tree_timeout. │
│ 221 │ │ """ │
│ 222 │ │ for user_id in self.users: │
│ ❱ 223 │ │ │ self.users[user_id]["tree_manager"].check_all_trees_timeout() │
│ 224 │ │
│ 225 │ def check_user_timeout(self, user_id: str): │
│ 226 │ │ """ │
╰───────────────────────────────────────────────────────────────────────────────────╯
KeyError: 'tree_manager'
[10:08:31] ERROR Job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 10:09:00 base.py:195
EDT)" raised an exception
╭──────────────────────── Traceback (most recent call last) ────────────────────────╮
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/apscheduler/executors/b │
│ ase.py:181 in run_coroutine_job │
│ │
│ 178 │ │ │
│ 179 │ │ logger.info('Running job "%s" (scheduled at %s)', job, run_time) │
│ 180 │ │ try: │
│ ❱ 181 │ │ │ retval = await job.func(*job.args, **job.kwargs) │
│ 182 │ │ except BaseException: │
│ 183 │ │ │ exc, tb = sys.exc_info()[1:] │
│ 184 │ │ │ formatted_tb = "".join(format_tb(tb)) │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/app.py:35 in │
│ check_timeouts │
│ │
│ 32 │
│ 33 async def check_timeouts(): │
│ 34 │ user_manager = get_user_manager() │
│ ❱ 35 │ await user_manager.check_all_trees_timeout() │
│ 36 │
│ 37 │
│ 38 async def output_resources(): │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/services/use │
│ r.py:223 in check_all_trees_timeout │
│ │
│ 220 │ │ Check all trees in all TreeManagers across all users and remove any │
│ not been active in the last tree_timeout. │
│ 221 │ │ """ │
│ 222 │ │ for user_id in self.users: │
│ ❱ 223 │ │ │ self.users[user_id]["tree_manager"].check_all_trees_timeout() │
│ 224 │ │
│ 225 │ def check_user_timeout(self, user_id: str): │
│ 226 │ │ """ │
╰───────────────────────────────────────────────────────────────────────────────────╯
KeyError: 'tree_manager'
[10:26:25] WARNING Run time of job "check_restart_clients (trigger: interval[0:00:31], next run at: base.py:176
2025-09-15 10:26:33 EDT)" was missed by 0:00:23.029499
WARNING Run time of job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 base.py:176
10:26:53 EDT)" was missed by 0:00:01.030848
WARNING Run time of job "output_resources (trigger: interval[0:18:23], next run at: base.py:176
2025-09-15 10:33:44 EDT)" was missed by 0:11:04.063842
[10:41:41] WARNING Run time of job "check_restart_clients (trigger: interval[0:00:31], next run at: base.py:176
2025-09-15 10:42:03 EDT)" was missed by 0:00:09.036380
WARNING Run time of job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 base.py:176
10:41:52 EDT)" was missed by 0:00:18.037363
WARNING Run time of job "output_resources (trigger: interval[0:18:23], next run at: base.py:176
2025-09-15 10:52:07 EDT)" was missed by 0:07:57.071763
[10:51:25] WARNING Run time of job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 base.py:176
10:51:32 EDT)" was missed by 0:00:21.808772
WARNING Run time of job "check_restart_clients (trigger: interval[0:00:31], next run at: base.py:176
2025-09-15 10:51:52 EDT)" was missed by 0:00:03.810823
[10:51:32] ERROR Job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 10:52:01 base.py:195
EDT)" raised an exception
╭──────────────────────── Traceback (most recent call last) ────────────────────────╮
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/apscheduler/executors/b │
│ ase.py:181 in run_coroutine_job │
│ │
│ 178 │ │ │
│ 179 │ │ logger.info('Running job "%s" (scheduled at %s)', job, run_time) │
│ 180 │ │ try: │
│ ❱ 181 │ │ │ retval = await job.func(*job.args, **job.kwargs) │
│ 182 │ │ except BaseException: │
│ 183 │ │ │ exc, tb = sys.exc_info()[1:] │
│ 184 │ │ │ formatted_tb = "".join(format_tb(tb)) │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/app.py:35 in │
│ check_timeouts │
│ │
│ 32 │
│ 33 async def check_timeouts(): │
│ 34 │ user_manager = get_user_manager() │
│ ❱ 35 │ await user_manager.check_all_trees_timeout() │
│ 36 │
│ 37 │
│ 38 async def output_resources(): │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/services/use │
│ r.py:223 in check_all_trees_timeout │
│ │
│ 220 │ │ Check all trees in all TreeManagers across all users and remove any │
│ not been active in the last tree_timeout. │
│ 221 │ │ """ │
│ 222 │ │ for user_id in self.users: │
│ ❱ 223 │ │ │ self.users[user_id]["tree_manager"].check_all_trees_timeout() │
│ 224 │ │
│ 225 │ def check_user_timeout(self, user_id: str): │
│ 226 │ │ """ │
╰───────────────────────────────────────────────────────────────────────────────────╯
KeyError: 'tree_manager'
[10:51:43] ERROR Unexpected error: 'client_manager' error_handlers.py:32
INFO: 127.0.0.1:50043 - "GET /feedback/metadata/b6c0f65db8197395b453a7777a5e4c44 HTTP/1.1" 500 Internal Server Error
ERROR: Exception in ASGI application
Traceback (most recent call last):
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/uvicorn/protocols/http/httptools_impl.py", line 409, in run_asgi
result = await app( # type: ignore[func-returns-value]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
return await self.app(scope, receive, send)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/fastapi/applications.py", line 1054, in __call__
await super().__call__(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/applications.py", line 113, in __call__
await self.middleware_stack(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/middleware/errors.py", line 186, in __call__
raise exc
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/middleware/errors.py", line 164, in __call__
await self.app(scope, receive, _send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/middleware/cors.py", line 85, in __call__
await self.app(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/middleware/exceptions.py", line 63, in __call__
await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
raise exc
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
await app(scope, receive, sender)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 716, in __call__
await self.middleware_stack(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 736, in app
await route.handle(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 290, in handle
await self.app(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 78, in app
await wrap_app_handling_exceptions(app, request)(scope, receive, send)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app
raise exc
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/_exception_handler.py", line 42, in wrapped_app
await app(scope, receive, sender)
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/starlette/routing.py", line 75, in app
response = await f(request)
^^^^^^^^^^^^^^^^
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/fastapi/routing.py", line 302, in app
raw_response = await run_endpoint_function(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/fastapi/routing.py", line 213, in run_endpoint_function
return await dependant.call(**values)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/routes/feedback.py", line 81, in run_feedback_metadata
client_manager: ClientManager = user["client_manager"]
~~~~^^^^^^^^^^^^^^^^^^
KeyError: 'client_manager'
ERROR HTTP error occurred: Not Found error_handlers.py:14
INFO: 127.0.0.1:50045 - "GET /icon.svg?d6c34577c7161f78 HTTP/1.1" 404 Not Found
INFO: 127.0.0.1:50045 - "GET /user/config/models HTTP/1.1" 200 OK
INFO: 127.0.0.1:50054 - "GET /user/config/models HTTP/1.1" 200 OK
[10:52:01] ERROR Job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 10:52:30 base.py:195
EDT)" raised an exception
╭──────────────────────── Traceback (most recent call last) ────────────────────────╮
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/apscheduler/executors/b │
│ ase.py:181 in run_coroutine_job │
│ │
│ 178 │ │ │
│ 179 │ │ logger.info('Running job "%s" (scheduled at %s)', job, run_time) │
│ 180 │ │ try: │
│ ❱ 181 │ │ │ retval = await job.func(*job.args, **job.kwargs) │
│ 182 │ │ except BaseException: │
│ 183 │ │ │ exc, tb = sys.exc_info()[1:] │
│ 184 │ │ │ formatted_tb = "".join(format_tb(tb)) │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/app.py:35 in │
│ check_timeouts │
│ │
│ 32 │
│ 33 async def check_timeouts(): │
│ 34 │ user_manager = get_user_manager() │
│ ❱ 35 │ await user_manager.check_all_trees_timeout() │
│ 36 │
│ 37 │
│ 38 async def output_resources(): │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/services/use │
│ r.py:223 in check_all_trees_timeout │
│ │
│ 220 │ │ Check all trees in all TreeManagers across all users and remove any │
│ not been active in the last tree_timeout. │
│ 221 │ │ """ │
│ 222 │ │ for user_id in self.users: │
│ ❱ 223 │ │ │ self.users[user_id]["tree_manager"].check_all_trees_timeout() │
│ 224 │ │
│ 225 │ def check_user_timeout(self, user_id: str): │
│ 226 │ │ """ │
╰───────────────────────────────────────────────────────────────────────────────────╯
KeyError: 'tree_manager'
[10:52:07] ERROR Job "output_resources (trigger: interval[0:18:23], next run at: 2025-09-15 11:10:30 base.py:195
EDT)" raised an exception
╭──────────────────────── Traceback (most recent call last) ────────────────────────╮
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/apscheduler/executors/b │
│ ase.py:181 in run_coroutine_job │
│ │
│ 178 │ │ │
│ 179 │ │ logger.info('Running job "%s" (scheduled at %s)', job, run_time) │
│ 180 │ │ try: │
│ ❱ 181 │ │ │ retval = await job.func(*job.args, **job.kwargs) │
│ 182 │ │ except BaseException: │
│ 183 │ │ │ exc, tb = sys.exc_info()[1:] │
│ 184 │ │ │ formatted_tb = "".join(format_tb(tb)) │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/app.py:40 in │
│ output_resources │
│ │
│ 37 │
│ 38 async def output_resources(): │
│ 39 │ user_manager = get_user_manager() │
│ ❱ 40 │ await print_resources(user_manager, save_to_file=True) │
│ 41 │
│ 42 │
│ 43 async def check_restart_clients(): │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/utils/resour │
│ ces.py:59 in print_resources │
│ │
│ 56 │ user_manager: UserManager | None = None, save_to_file: bool = False │
│ 57 ): │
│ 58 │ if user_manager is not None: │
│ ❱ 59 │ │ avg_user_memory, avg_tree_memory = await get_average_user_memory(us │
│ 60 │ │ # avg_user_requests = await get_average_user_requests(user_manager) │
│ 61 │ │ # num_users_db = await get_number_local_users_db(user_manager) │
│ 62 │
│ │
│ /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/utils/resour │
│ ces.py:37 in get_average_user_memory │
│ │
│ 34 │ avg_tree_memory = 0 │
│ 35 │ for user in user_manager.users.values(): │
│ 36 │ │ user_memory = 0 │
│ ❱ 37 │ │ for tree in user["tree_manager"].trees.values(): │
│ 38 │ │ │ user_memory += tree["tree"].detailed_memory_usage()["total"] / │
│ 39 │ │ │
│ 40 │ │ if len(user["tree_manager"].trees) > 0: │
╰───────────────────────────────────────────────────────────────────────────────────╯
KeyError: 'tree_manager'

docs/tagging.md Normal file

@@ -0,0 +1,108 @@
Here are clear written examples of **metadata tagging** in both Open WebUI and Weaviate, showing how you can associate tags and structured metadata with knowledge objects for RAG and semantic search.
***
### Example: Metadata Tagging in Open WebUI
You send a document to the Open WebUI API endpoint, attaching metadata and tags in the content field as a JSON string:
```http
POST http://localhost/api/v1/documents/create
Content-Type: application/json
{
"name": "policy_doc_2022",
"title": "2022 Policy Handbook",
"collection_name": "company_handbooks",
"filename": "policy_2022.pdf",
"content": "{\"tags\": [\"policy\", \"2022\", \"hr\"], \"source_url\": \"https://example.com/policy_2022.pdf\", \"author\": \"Jane Doe\"}"
}
```
- The `"tags"` field is a list of labels for classification (policy, 2022, hr).
- The `"source_url"` and `"author"` fields provide additional metadata useful for retrieval, audit, and filtering.[1][2]
For pipeline-based ingestion, you might design a function to extract and append metadata before vectorization:
```python
# Illustrative only: embed_with_metadata, chunk, document_url, and
# document_author are placeholders for your pipeline's own objects.
metadata = {
    "tags": ["policy", "2022"],
    "source_url": document_url,
    "author": document_author
}
embed_with_metadata(chunk, metadata)
```
This metadata becomes part of your retrieval context in RAG workflows.[1]
***
### Example: Metadata Tagging in Weaviate
In Weaviate, metadata and tags are defined directly in the schema and attached to each object when added:
**Schema definition:**
```json
{
"class": "Document",
"properties": [
{"name": "title", "dataType": ["text"]},
{"name": "tags", "dataType": ["text[]"]},
{"name": "source_url", "dataType": ["text"]},
{"name": "author", "dataType": ["text"]}
]
}
```
**Object creation example:**
```python
client.data_object.create(
data_object={
"title": "2022 Policy Handbook",
"tags": ["policy", "2022", "hr"],
"source_url": "https://example.com/policy_2022.pdf",
"author": "Jane Doe"
},
class_name="Document"
)
```
- The `"tags"` field is a text array, ideal for semantic filtering and faceting.
- Other fields store provenance metadata, supporting advanced queries and data governance.[3][4][5]
**Query with metadata filtering:**
```python
result = (
client.query
.get("Document", ["title", "tags", "author"])
.with_where({"path": ["tags"], "operator": "ContainsAny", "valueTextArray": ["policy", "hr"]})
.do()
)
```
This retrieves documents classified with either "policy" or "hr" tags.[4][3]
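The query above uses the legacy v3 client. With the v4 Python client, the same filter reads roughly as follows (assuming a connected v4 `client`):
```python
from weaviate.classes.query import Filter

documents = client.collections.get("Document")
result = documents.query.fetch_objects(
    filters=Filter.by_property("tags").contains_any(["policy", "hr"]),
    return_properties=["title", "tags", "author"],
)
for obj in result.objects:
    print(obj.properties["title"], obj.properties["tags"])
```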
***
Both platforms support **metadata tagging** for documents, which enables powerful RAG scenarios, detailed filtering, and context-rich retrievals.[5][2][3][4][1]
[1](https://www.reddit.com/r/OpenWebUI/comments/1hmmg9a/how_to_handle_metadata_during_vectorization/)
[2](https://github.com/open-webui/open-webui/discussions/4692)
[3](https://stackoverflow.com/questions/75006703/query-large-list-of-metadate-in-weaviate)
[4](https://weaviate.io/blog/enterprise-workflow-langchain-weaviate)
[5](https://docs.weaviate.io/academy/py/zero_to_mvp/schema_and_imports/schema)
[6](https://docs.weaviate.io/weaviate/api/graphql/additional-properties)
[7](https://weaviate.io/blog/sycamore-and-weaviate)
[8](https://docs.llamaindex.ai/en/stable/examples/vector_stores/WeaviateIndex_auto_retriever/)
[9](https://forum.weaviate.io/t/recommendations-for-metadata-or-knowledge-graphs/960)
[10](https://weaviate.io/blog/agent-workflow-automation-n8n-weaviate)
[11](https://github.com/open-webui/open-webui/discussions/9804)
[12](https://docs.quarkiverse.io/quarkus-langchain4j/dev/rag-weaviate.html)
[13](https://github.com/weaviate/weaviate-examples)
[14](https://docs.openwebui.com/getting-started/api-endpoints/)
[15](https://weaviate.io/blog/hybrid-search-for-web-developers)
[16](https://dev.to/stephenc222/how-to-use-weaviate-to-store-and-query-vector-embeddings-4b9b)
[17](https://helpdesk.egnyte.com/hc/en-us/articles/360035813612-Using-Metadata-in-the-WebUI)
[18](https://docs.datadoghq.com/integrations/weaviate/)
[19](https://docs.openwebui.com/features/)
[20](https://documentation.suse.com/suse-ai/1.0/html/openwebui-configuring/index.html)
[21](https://docs.openwebui.com/getting-started/env-configuration/)

ingest_pipeline/.env Normal file

@@ -0,0 +1,38 @@
# API Keys
FIRECRAWL_API_KEY=fc-your-api-key
OPENWEBUI_API_KEY=
WEAVIATE_API_KEY=
# Endpoints
LLM_ENDPOINT=http://llm.lab
WEAVIATE_ENDPOINT=http://weaviate.yo
OPENWEBUI_ENDPOINT=http://chat.lab
# Model Configuration
EMBEDDING_MODEL=ollama/bge-m3:latest
EMBEDDING_DIMENSION=1024
# Ingestion Settings
BATCH_SIZE=50
MAX_FILE_SIZE=1000000
MAX_CRAWL_DEPTH=5
MAX_CRAWL_PAGES=100
# Storage Settings
DEFAULT_STORAGE_BACKEND=weaviate
COLLECTION_PREFIX=docs
# Prefect Settings
PREFECT_API_URL=http://prefect.lab
PREFECT_API_KEY=0nR4WAkQ3q9MY1bjqATK6pVmolighvrS
PREFECT_WORK_POOL=default
# Scheduling
DEFAULT_SCHEDULE_INTERVAL=60
# Performance
MAX_CONCURRENT_TASKS=5
REQUEST_TIMEOUT=60
# Logging
LOG_LEVEL=INFO


@@ -0,0 +1,6 @@
"""Main entry point for the ingestion pipeline."""
from .cli.main import app
if __name__ == "__main__":
app()

Binary file not shown.


@@ -0,0 +1,5 @@
"""CLI module for the ingestion pipeline."""
from .main import app
__all__ = ["app"]

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

ingest_pipeline/cli/main.py Normal file

@@ -0,0 +1,616 @@
"""CLI interface for ingestion pipeline."""
import asyncio
from enum import Enum
import typer
from rich.console import Console
from rich.panel import Panel
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
from rich.table import Table
from ..config import get_settings
from ..core.models import IngestionResult
from ..flows.ingestion import create_ingestion_flow
from ..flows.scheduler import create_scheduled_deployment, serve_deployments
class SourceType(str, Enum):
"""Source types for ingestion."""
web = "web"
repository = "repository"
documentation = "documentation"
class StorageBackend(str, Enum):
"""Storage backend options."""
weaviate = "weaviate"
open_webui = "open_webui"
app = typer.Typer(
name="ingest",
help="🚀 Modern Document Ingestion Pipeline - Advanced web and repository processing",
rich_markup_mode="rich",
add_completion=False,
)
console = Console()
@app.callback()
def main(
version: bool = typer.Option(False, "--version", "-v", help="Show version information"),
) -> None:
"""
🚀 Modern Document Ingestion Pipeline
[bold cyan]Advanced document processing and management platform[/bold cyan]
Features:
• 🌐 Web scraping and crawling with Firecrawl
• 📦 Repository ingestion with Repomix
• 🗄️ Multiple storage backends (Weaviate, OpenWebUI)
• 📊 Modern TUI for collection management
• ⚡ Async processing with Prefect orchestration
• 🎨 Rich CLI with enhanced visuals
"""
if version:
console.print(
Panel(
"[bold magenta]Ingest Pipeline v0.1.0[/bold magenta]\n"
"[dim]Modern Document Ingestion & Management System[/dim]",
title="🚀 Version Info",
border_style="magenta"
)
)
raise typer.Exit()
@app.command()
def ingest(
source_url: str = typer.Argument(..., help="URL or path to ingest from"),
source_type: SourceType = typer.Option(SourceType.web, "--type", "-t", help="Type of source"),
storage: StorageBackend = typer.Option(
StorageBackend.weaviate, "--storage", "-s", help="Storage backend"
),
collection: str = typer.Option(
None, "--collection", "-c", help="Target collection name (auto-generated if not specified)"
),
validate: bool = typer.Option(
True, "--validate/--no-validate", help="Validate source before ingesting"
),
) -> None:
"""
🚀 Run a one-time ingestion job with enhanced progress tracking.
This command processes documents from various sources and stores them in
your chosen backend with full progress visualization.
"""
# Enhanced startup message
console.print(
Panel(
f"[bold cyan]🚀 Starting Modern Ingestion[/bold cyan]\n\n"
f"[yellow]Source:[/yellow] {source_url}\n"
f"[yellow]Type:[/yellow] {source_type.value.title()}\n"
f"[yellow]Storage:[/yellow] {storage.value.replace('_', ' ').title()}\n"
f"[yellow]Collection:[/yellow] {collection or '[dim]Auto-generated[/dim]'}",
title="🔥 Ingestion Configuration",
border_style="cyan"
)
)
async def run_with_progress() -> IngestionResult:
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
console=console,
) as progress:
task = progress.add_task("🔄 Processing documents...", total=100)
# Simulate progress updates during ingestion
progress.update(task, advance=20, description="🔗 Connecting to services...")
await asyncio.sleep(0.5)
progress.update(task, advance=30, description="📄 Fetching documents...")
result = await run_ingestion(
url=source_url,
source_type=source_type.value,
storage_backend=storage.value,
collection_name=collection,
validate_first=validate,
)
progress.update(task, advance=50, description="✅ Ingestion complete!")
return result
result = asyncio.run(run_with_progress())
# Enhanced results display
status_color = "green" if result.status.value == "completed" else "red"
# Create results table with enhanced styling
table = Table(
title="📊 Ingestion Results",
title_style="bold magenta",
border_style="cyan",
header_style="bold blue"
)
table.add_column("📋 Metric", style="cyan", no_wrap=True)
table.add_column("📈 Value", style=status_color, justify="right")
# Add enhanced status icon
status_icon = "✅" if result.status.value == "completed" else "❌"
table.add_row("Status", f"{status_icon} {result.status.value.title()}")
table.add_row("Documents Processed", f"📄 {result.documents_processed:,}")
table.add_row("Documents Failed", f"⚠️ {result.documents_failed:,}")
table.add_row("Duration", f"⏱️ {result.duration_seconds:.2f}s")
if result.error_messages:
error_text = "\n".join(f"• {error}" for error in result.error_messages[:3])
if len(result.error_messages) > 3:
error_text += f"\n... and {len(result.error_messages) - 3} more errors"
table.add_row("Errors", error_text)
console.print(table)
# Success celebration or error guidance
if result.status.value == "completed" and result.documents_processed > 0:
console.print(
Panel(
f"🎉 [bold green]Success![/bold green] {result.documents_processed} documents ingested\n\n"
f"💡 [dim]Try '[bold cyan]ingest modern[/bold cyan]' to explore your collections![/dim]",
title="✨ Ingestion Complete",
border_style="green"
)
)
elif result.error_messages:
console.print(
Panel(
"❌ [bold red]Ingestion encountered errors[/bold red]\n\n"
"💡 [dim]Check your configuration and try again[/dim]",
title="⚠️ Issues Detected",
border_style="red"
)
)
@app.command()
def schedule(
name: str = typer.Argument(..., help="Deployment name"),
source_url: str = typer.Argument(..., help="URL or path to ingest from"),
source_type: SourceType = typer.Option(SourceType.web, "--type", "-t", help="Type of source"),
storage: StorageBackend = typer.Option(
StorageBackend.weaviate, "--storage", "-s", help="Storage backend"
),
cron: str | None = typer.Option(None, "--cron", "-c", help="Cron expression for scheduling"),
interval: int = typer.Option(60, "--interval", "-i", help="Interval in minutes"),
serve_now: bool = typer.Option(False, "--serve/--no-serve", help="Start serving immediately"),
) -> None:
"""
Create a scheduled deployment for recurring ingestion.
"""
console.print(f"[bold blue]Creating deployment: {name}[/bold blue]")
deployment = create_scheduled_deployment(
name=name,
source_url=source_url,
source_type=source_type.value,
storage_backend=storage.value,
schedule_type="cron" if cron else "interval",
cron_expression=cron,
interval_minutes=interval,
)
console.print(f"[green]✓ Deployment '{name}' created[/green]")
if serve_now:
console.print("[yellow]Starting deployment server...[/yellow]")
serve_deployments([deployment])
@app.command()
def serve(
config_file: str | None = typer.Option(
None, "--config", "-c", help="Path to deployments config file"
),
ui: str | None = typer.Option(
None, "--ui", help="Launch user interface (options: tui, web)"
),
) -> None:
"""
🚀 Serve configured deployments with optional UI interface.
Launch the deployment server to run scheduled ingestion jobs,
optionally with a modern Terminal User Interface (TUI) or web interface.
"""
# Handle UI mode first
if ui == "tui":
console.print(
Panel(
"[bold cyan]🚀 Launching Enhanced TUI[/bold cyan]\n\n"
"[yellow]Features:[/yellow]\n"
"• 📊 Interactive collection management\n"
"• ⌨️ Enhanced keyboard navigation\n"
"• 🎨 Modern design with focus indicators\n"
"• 📄 Document browsing and search\n"
"• 🔄 Real-time status updates",
title="🎉 TUI Mode",
border_style="cyan"
)
)
from .tui import dashboard
dashboard()
return
elif ui == "web":
console.print("[red]Web UI not yet implemented. Use --ui tui for Terminal UI.[/red]")
return
elif ui:
console.print(f"[red]Unknown UI option: {ui}[/red]")
console.print("[yellow]Available options: tui, web[/yellow]")
return
# Normal deployment server mode
if config_file:
# Config-file loading is not implemented yet; a real implementation would parse YAML/JSON here
console.print(f"[yellow]Loading deployments from {config_file}[/yellow]")
deployments = []
else:
# Create example deployments
deployments = [
create_scheduled_deployment(
name="docs-daily",
source_url="https://docs.example.com",
source_type="documentation",
storage_backend="weaviate",
schedule_type="cron",
cron_expression="0 2 * * *", # Daily at 2 AM
),
create_scheduled_deployment(
name="repo-hourly",
source_url="https://github.com/example/repo",
source_type="repository",
storage_backend="open_webui",
schedule_type="interval",
interval_minutes=60,
),
]
console.print(
"[bold green]Starting deployment server with example deployments[/bold green]"
)
serve_deployments(deployments)
@app.command()
def tui() -> None:
"""
🚀 Launch the enhanced Terminal User Interface.
Quick shortcut for 'serve --ui tui' with modern keyboard navigation,
interactive collection management, and real-time status updates.
"""
console.print(
Panel(
"[bold cyan]🚀 Launching Enhanced TUI[/bold cyan]\n\n"
"[yellow]Features:[/yellow]\n"
"• 📊 Interactive collection management\n"
"• ⌨️ Enhanced keyboard navigation\n"
"• 🎨 Modern design with focus indicators\n"
"• 📄 Document browsing and search\n"
"• 🔄 Real-time status updates",
title="🎉 TUI Mode",
border_style="cyan"
)
)
from .tui import dashboard
dashboard()
@app.command()
def config() -> None:
"""
📋 Display current configuration with enhanced formatting.
Shows all configured endpoints, models, and settings in a beautiful
table format with status indicators.
"""
settings = get_settings()
console.print(
Panel(
"[bold cyan]⚙️ System Configuration[/bold cyan]\n"
"[dim]Current pipeline settings and endpoints[/dim]",
title="🔧 Configuration",
border_style="cyan"
)
)
# Enhanced configuration table
table = Table(
title="📊 Configuration Details",
title_style="bold magenta",
border_style="blue",
header_style="bold cyan",
show_lines=True
)
table.add_column("🏷️ Setting", style="cyan", no_wrap=True, width=25)
table.add_column("🎯 Value", style="yellow", overflow="fold")
table.add_column("📊 Status", style="green", width=12, justify="center")
# Add configuration rows with status indicators
def get_status_indicator(value: str | None) -> str:
return "✅ Set" if value else "❌ Missing"
table.add_row(
"🤖 LLM Endpoint",
str(settings.llm_endpoint),
"✅ Active"
)
table.add_row(
"🔥 Firecrawl Endpoint",
str(settings.firecrawl_endpoint),
"✅ Active"
)
table.add_row(
"🗄️ Weaviate Endpoint",
str(settings.weaviate_endpoint),
get_status_indicator(str(settings.weaviate_api_key) if settings.weaviate_api_key else None)
)
table.add_row(
"🌐 OpenWebUI Endpoint",
str(settings.openwebui_endpoint),
get_status_indicator(settings.openwebui_api_key)
)
table.add_row(
"🧠 Embedding Model",
settings.embedding_model,
"✅ Set"
)
table.add_row(
"💾 Default Storage",
settings.default_storage_backend.title(),
"✅ Set"
)
table.add_row(
"📦 Default Batch Size",
f"{settings.default_batch_size:,}",
"✅ Set"
)
table.add_row(
"⚡ Max Concurrent Tasks",
f"{settings.max_concurrent_tasks}",
"✅ Set"
)
console.print(table)
# Additional helpful information
console.print(
Panel(
"💡 [bold cyan]Quick Tips[/bold cyan]\n\n"
"• Use '[bold]ingest list-collections[/bold]' to view all collections\n"
"• Use '[bold]ingest search[/bold]' to search content\n"
"• Configure API keys in your [yellow].env[/yellow] file\n"
"• Default collection names are auto-generated from URLs",
title="🚀 Usage Tips",
border_style="green"
)
)
@app.command()
def list_collections() -> None:
"""
📋 List all collections across storage backends.
"""
console.print("[bold cyan]📚 Collection Overview[/bold cyan]")
asyncio.run(run_list_collections())
@app.command()
def search(
query: str = typer.Argument(..., help="Search query"),
collection: str = typer.Option(None, "--collection", "-c", help="Target collection"),
backend: StorageBackend = typer.Option(StorageBackend.weaviate, "--backend", "-b", help="Storage backend"),
limit: int = typer.Option(10, "--limit", "-l", help="Result limit"),
) -> None:
"""
🔍 Search across collections.
"""
console.print(f"[bold cyan]🔍 Searching for: {query}[/bold cyan]")
asyncio.run(run_search(query, collection, backend.value, limit))
async def run_ingestion(
url: str,
source_type: str,
storage_backend: str,
collection_name: str | None = None,
validate_first: bool = True
) -> IngestionResult:
"""
Run ingestion with support for targeted collections.
"""
# Auto-generate collection name if not provided
if not collection_name:
from urllib.parse import urlparse
parsed = urlparse(url)
domain = parsed.netloc.replace(".", "_").replace("-", "_")
collection_name = f"{domain}_{source_type}"
result = await create_ingestion_flow(
source_url=url,
source_type=source_type,
storage_backend=storage_backend,
collection_name=collection_name,
validate_first=validate_first,
)
return result
async def run_list_collections() -> None:
"""
List collections across storage backends.
"""
from ..config import get_settings
from ..core.models import StorageBackend, StorageConfig
from ..storage.openwebui import OpenWebUIStorage
from ..storage.weaviate import WeaviateStorage
settings = get_settings()
console.print("🔍 [bold cyan]Scanning storage backends...[/bold cyan]")
# Try to connect to Weaviate
weaviate_collections = []
try:
weaviate_config = StorageConfig(
backend=StorageBackend.WEAVIATE,
endpoint=settings.weaviate_endpoint,
api_key=settings.weaviate_api_key,
collection_name="default",
)
weaviate = WeaviateStorage(weaviate_config)
await weaviate.initialize()
collections_list = weaviate.client.collections.list_all() if weaviate.client else []
for collection in collections_list:
collection_obj = weaviate.client.collections.get(collection) if weaviate.client else None
if collection_obj:
count = collection_obj.aggregate.over_all(total_count=True).total_count or 0
weaviate_collections.append((collection, count))
except Exception as e:
console.print(f"❌ [red]Weaviate connection failed: {e}[/red]")
# Try to connect to OpenWebUI
openwebui_collections = []
try:
openwebui_config = StorageConfig(
backend=StorageBackend.OPEN_WEBUI,
endpoint=settings.openwebui_endpoint,
api_key=settings.openwebui_api_key,
collection_name="default",
)
openwebui = OpenWebUIStorage(openwebui_config)
await openwebui.initialize()
response = await openwebui.client.get("/api/v1/knowledge/")
response.raise_for_status()
knowledge_bases = response.json()
for kb in knowledge_bases:
name = kb.get("name", "Unknown")
file_count = len(kb.get("files", []))
openwebui_collections.append((name, file_count))
except Exception as e:
console.print(f"❌ [red]OpenWebUI connection failed: {e}[/red]")
# Display results
if weaviate_collections or openwebui_collections:
# Create results table
from rich.table import Table
table = Table(
title="📚 Collection Overview",
title_style="bold magenta",
border_style="cyan",
header_style="bold blue"
)
table.add_column("🏷️ Collection", style="cyan", no_wrap=True)
table.add_column("📊 Backend", style="yellow")
table.add_column("📄 Documents", style="green", justify="right")
# Add Weaviate collections
for name, count in weaviate_collections:
table.add_row(name, "🗄️ Weaviate", f"{count:,}")
# Add OpenWebUI collections
for name, count in openwebui_collections:
table.add_row(name, "🌐 OpenWebUI", f"{count:,}")
console.print(table)
else:
console.print("❌ [yellow]No collections found in any backend[/yellow]")
async def run_search(query: str, collection: str | None, backend: str, limit: int) -> None:
"""
Search across collections.
"""
from ..config import get_settings
from ..core.models import StorageBackend, StorageConfig
from ..storage.weaviate import WeaviateStorage
settings = get_settings()
console.print(f"🔍 Searching for: '[bold cyan]{query}[/bold cyan]'")
if collection:
console.print(f"📚 Target collection: [yellow]{collection}[/yellow]")
console.print(f"💾 Backend: [blue]{backend}[/blue]")
results = []
try:
if backend == "weaviate":
weaviate_config = StorageConfig(
backend=StorageBackend.WEAVIATE,
endpoint=settings.weaviate_endpoint,
api_key=settings.weaviate_api_key,
collection_name=collection or "default",
)
weaviate = WeaviateStorage(weaviate_config)
await weaviate.initialize()
results_generator = weaviate.search(query, limit=limit)
async for doc in results_generator:
results.append({
"title": getattr(doc, "title", "Untitled"),
"content": getattr(doc, "content", ""),
"score": getattr(doc, "score", 0.0),
"backend": "🗄️ Weaviate"
})
elif backend == "open_webui":
console.print("❌ [red]OpenWebUI search not yet implemented[/red]")
return
except Exception as e:
console.print(f"❌ [red]Search failed: {e}[/red]")
return
# Display results
if results:
from rich.table import Table
table = Table(
title=f"🔍 Search Results for '{query}'",
title_style="bold magenta",
border_style="green",
header_style="bold blue"
)
table.add_column("📄 Title", style="cyan", max_width=40)
table.add_column("📝 Preview", style="white", max_width=60)
table.add_column("📊 Score", style="yellow", justify="right")
for result in results[:limit]:
title = str(result["title"])
title_display = title[:40] + "..." if len(title) > 40 else title
content = str(result["content"])
content_display = content[:60] + "..." if len(content) > 60 else content
score = f"{result['score']:.3f}"
table.add_row(title_display, content_display, score)
console.print(table)
console.print(f"\n✅ [green]Found {len(results)} results[/green]")
else:
console.print("❌ [yellow]No results found[/yellow]")
if __name__ == "__main__":
app()


@@ -0,0 +1,13 @@
"""Enhanced TUI package with keyboard navigation and modular architecture."""
from .app import CollectionManagementApp
from .models import CollectionInfo, DocumentInfo
from .utils import dashboard, run_textual_tui
__all__ = [
"CollectionManagementApp",
"CollectionInfo",
"DocumentInfo",
"dashboard",
"run_textual_tui",
]


@@ -0,0 +1,181 @@
"""Main TUI application with enhanced keyboard navigation."""
from textual import events
from textual.app import App
from textual.binding import Binding
from ...storage.openwebui import OpenWebUIStorage
from ...storage.weaviate import WeaviateStorage
from .screens import CollectionOverviewScreen, HelpScreen
from .styles import TUI_CSS
class CollectionManagementApp(App[None]):
"""Enhanced modern Textual application with comprehensive keyboard navigation."""
CSS = TUI_CSS
BINDINGS = [
Binding("q", "quit", "Quit"),
Binding("ctrl+c", "quit", "Quit"),
Binding("ctrl+q", "quit", "Quit"),
Binding("f1", "help", "Help"),
Binding("ctrl+h", "help", "Help"),
Binding("?", "help", "Quick Help"),
# Global navigation shortcuts
Binding("ctrl+r", "refresh_current", "Refresh Current Screen"),
Binding("ctrl+w", "close_current", "Close Current Screen"),
# Tab navigation shortcuts
Binding("ctrl+1", "dashboard_tab", "Dashboard", show=False),
Binding("ctrl+2", "collections_tab", "Collections", show=False),
Binding("ctrl+3", "analytics_tab", "Analytics", show=False),
]
weaviate: WeaviateStorage | None
openwebui: OpenWebUIStorage | None
def __init__(
self, weaviate: WeaviateStorage | None = None, openwebui: OpenWebUIStorage | None = None
):
super().__init__()
self.weaviate = weaviate
self.openwebui = openwebui
def on_mount(self) -> None:
"""Initialize the enhanced app with better branding."""
self.title = "🚀 Enhanced Collection Management System"
self.sub_title = "Advanced Document Ingestion & Management Platform with Keyboard Navigation"
self.push_screen(CollectionOverviewScreen(self.weaviate, self.openwebui))
def action_help(self) -> None:
"""Show comprehensive help information with all keyboard shortcuts."""
help_md = """
# 🚀 Enhanced Collection Management System
## 🎯 Global Navigation
- **F1** / **Ctrl+H** / **?**: Show this help
- **Q** / **Ctrl+C** / **Ctrl+Q**: Quit application
- **Ctrl+R**: Refresh current screen
- **Ctrl+W**: Close current screen/dialog
- **Escape**: Go back/cancel current action
## 📑 Tab Navigation
- **Tab** / **Shift+Tab**: Switch between tabs
- **Ctrl+1**: Jump to Dashboard tab
- **Ctrl+2**: Jump to Collections tab
- **Ctrl+3**: Jump to Analytics tab
## 📚 Collections Management
- **R**: Refresh collections list
- **I**: Start new ingestion
- **M**: Manage documents in selected collection
- **S**: Search within selected collection
- **Ctrl+D**: Delete selected collection
## 🗂️ Table Navigation
- **Arrow Keys** / **J/K/H/L**: Navigate table cells (Vi-style)
- **Home** / **End**: Jump to first/last row
- **Page Up** / **Page Down**: Scroll by page
- **Enter**: Select/activate current row
- **Space**: Toggle row selection
- **Ctrl+A**: Select all items
- **Ctrl+Shift+A**: Clear all selections
## 📄 Document Management
- **Space**: Toggle document selection
- **Delete** / **Ctrl+D**: Delete selected documents
- **A**: Select all documents on page
- **N**: Clear selection
- **Page Up/Down**: Navigate between pages
- **Home/End**: Go to first/last page
## 🔍 Search Features
- **/** : Quick search (focus search field)
- **Ctrl+F**: Focus search input
- **Enter**: Perform search
- **F3**: Repeat last search
- **Ctrl+R**: Clear search results
- **Escape**: Clear search/exit search mode
## 📥 Ingestion Interface
- **1/2/3**: Select ingestion type (Web/Repository/Documentation)
- **Tab/Shift+Tab**: Navigate between fields
- **Enter**: Start ingestion process
- **Ctrl+I**: Quick start ingestion
- **Escape**: Cancel ingestion
## 🎨 Visual Features
- Enhanced focus indicators with colored borders
- Smooth keyboard navigation with visual feedback
- Status indicators with real-time updates
- Progress bars with detailed status messages
- Responsive design with accessibility features
## 💡 Pro Tips
- Use **Vi-style** navigation (J/K/H/L) for efficient movement
- **Tab** through interactive elements for keyboard-only operation
- Hold **Shift** with arrow keys for range selection (where supported)
- Use **Ctrl+** shortcuts for power user efficiency
- **Escape** is your friend - it cancels most operations safely
## 🚀 Performance Features
- Lazy loading for large collections
- Paginated document views
- Background refresh operations
- Efficient memory management
- Responsive UI updates
---
**Enjoy the enhanced keyboard-driven interface!** 🎉
*Press Escape, Enter, or Q to close this help.*
"""
self.push_screen(HelpScreen(help_md))
def action_refresh_current(self) -> None:
"""Refresh the current screen if it supports it."""
current_screen = self.screen
if hasattr(current_screen, "action_refresh"):
current_screen.action_refresh()
else:
self.notify("Current screen doesn't support refresh", severity="information")
def action_close_current(self) -> None:
"""Close current screen/dialog."""
if len(self.screen_stack) > 1: # Don't close the main screen
self.pop_screen()
else:
self.notify("Cannot close main screen. Use Q to quit.", severity="warning")
def action_dashboard_tab(self) -> None:
"""Switch to dashboard tab in current screen."""
current_screen = self.screen
if hasattr(current_screen, "action_tab_dashboard"):
current_screen.action_tab_dashboard()
def action_collections_tab(self) -> None:
"""Switch to collections tab in current screen."""
current_screen = self.screen
if hasattr(current_screen, "action_tab_collections"):
current_screen.action_tab_collections()
def action_analytics_tab(self) -> None:
"""Switch to analytics tab in current screen."""
current_screen = self.screen
if hasattr(current_screen, "action_tab_analytics"):
current_screen.action_tab_analytics()
def on_key(self, event: events.Key) -> None:
"""Handle global keyboard shortcuts."""
# Handle global shortcuts that might not be bound to specific actions
if event.key == "ctrl+shift+?":
# Alternative help shortcut
self.action_help()
event.prevent_default()
elif event.key == "ctrl+alt+r":
# Force refresh all connections
self.notify("🔄 Refreshing all connections...", severity="information")
# This could trigger a full reinit if needed
event.prevent_default()
# No else clause needed - just handle our events


@@ -0,0 +1,26 @@
"""Data models and TypedDict definitions for the TUI."""
from typing import TypedDict
class CollectionInfo(TypedDict):
"""Information about a collection."""
name: str
type: str
count: int
backend: str
status: str
last_updated: str
size_mb: float
class DocumentInfo(TypedDict):
"""Information about a document."""
id: str
title: str
source_url: str
content_preview: str
word_count: int
timestamp: str


@@ -0,0 +1,18 @@
"""Screen components for the TUI application."""
from .dashboard import CollectionOverviewScreen
from .dialogs import ConfirmDeleteScreen, ConfirmDocumentDeleteScreen
from .documents import DocumentManagementScreen
from .help import HelpScreen
from .ingestion import IngestionScreen
from .search import SearchScreen
__all__ = [
"CollectionOverviewScreen",
"IngestionScreen",
"SearchScreen",
"DocumentManagementScreen",
"ConfirmDeleteScreen",
"ConfirmDocumentDeleteScreen",
"HelpScreen",
]


@@ -0,0 +1,542 @@
"""Main dashboard screen with collections overview."""
from datetime import datetime
from textual import work
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Grid, Horizontal
from textual.reactive import reactive, var
from textual.screen import Screen
from textual.widgets import (
Button,
Footer,
Header,
LoadingIndicator,
Rule,
Static,
TabbedContent,
TabPane,
)
from typing_extensions import override
from ....storage.openwebui import OpenWebUIStorage
from ....storage.weaviate import WeaviateStorage
from ..models import CollectionInfo
from ..widgets import EnhancedDataTable, MetricsCard, StatusIndicator
class CollectionOverviewScreen(Screen[None]):
"""Enhanced dashboard with modern design and metrics."""
total_documents: int = 0
total_collections: int = 0
active_backends: int = 0
BINDINGS = [
Binding("q", "quit", "Quit"),
Binding("r", "refresh", "Refresh"),
Binding("i", "ingest", "Ingest"),
Binding("m", "manage", "Manage"),
Binding("s", "search", "Search"),
Binding("ctrl+d", "delete", "Delete"),
Binding("ctrl+1", "tab_dashboard", "Dashboard"),
Binding("ctrl+2", "tab_collections", "Collections"),
Binding("ctrl+3", "tab_analytics", "Analytics"),
Binding("tab", "next_tab", "Next Tab"),
Binding("shift+tab", "prev_tab", "Prev Tab"),
Binding("f1", "help", "Help"),
]
collections: var[list[CollectionInfo]] = var([])
is_loading: var[bool] = var(False)
selected_collection: reactive[CollectionInfo | None] = reactive(None)
def __init__(self, weaviate: WeaviateStorage | None, openwebui: OpenWebUIStorage | None):
super().__init__()
self.weaviate = weaviate
self.openwebui = openwebui
self.total_documents = 0
self.total_collections = 0
self.active_backends = 0
@override
def compose(self) -> ComposeResult:
yield Header(show_clock=True)
with TabbedContent(initial="dashboard"):
# Dashboard Tab
with TabPane("Dashboard", id="dashboard"):
yield Container(
Static("🚀 Collection Management System", classes="title"),
Static("Modern document ingestion and management platform", classes="subtitle"),
Rule(line_style="heavy"),
# Metrics Grid
Grid(
MetricsCard(
"Collections", str(self.total_collections), "Active collections"
),
MetricsCard("Documents", str(self.total_documents), "Total indexed"),
MetricsCard("Backends", str(self.active_backends), "Connected services"),
MetricsCard("Status", "Online", "System health"),
classes="responsive-grid metrics-grid",
),
Rule(line_style="dashed"),
# Quick Actions
Container(
Static("⚡ Quick Actions", classes="section-title"),
Horizontal(
Button("🔄 Refresh Data", id="quick_refresh", variant="primary"),
Button("📥 New Ingestion", id="quick_ingest", variant="success"),
Button("🔍 Search All", id="quick_search", variant="default"),
Button("⚙️ Settings", id="quick_settings", variant="default"),
classes="action_buttons",
),
classes="card",
),
# Recent Activity
Container(
Static("📊 Recent Activity", classes="section-title"),
Static(
"Loading recent activity...", id="activity_feed", classes="status-text"
),
classes="card",
),
classes="main_container",
)
# Collections Tab
with TabPane("Collections", id="collections"):
yield Container(
Static("📚 Collection Overview", classes="title"),
# Collection controls
Horizontal(
Button("🔄 Refresh", id="refresh_btn", variant="primary"),
Button("📥 Ingest", id="ingest_btn", variant="success"),
Button("🔧 Manage", id="manage_btn", variant="warning"),
Button("🗑️ Delete", id="delete_btn", variant="error"),
Button("🔍 Search", id="search_btn", variant="default"),
classes="button_bar",
),
# Collection table with enhanced navigation
EnhancedDataTable(id="collections_table", classes="enhanced-table"),
# Status bar
Container(
Static("Ready", id="status_text", classes="status-text"),
StatusIndicator("Ready", id="connection_status"),
classes="status-bar",
),
LoadingIndicator(id="loading", classes="pulse"),
classes="main_container",
)
# Analytics Tab
with TabPane("Analytics", id="analytics"):
yield Container(
Static("📈 Analytics & Insights", classes="title"),
# Analytics content
Container(
Static("🚧 Analytics Dashboard", classes="section-title"),
Static("Advanced analytics and insights coming soon!", classes="subtitle"),
# Placeholder charts area
Container(
Static("📊 Document Distribution", classes="chart-title"),
Static(
"Chart placeholder - integrate with visualization library",
classes="chart-placeholder",
),
classes="card",
),
Container(
Static("⏱️ Ingestion Timeline", classes="chart-title"),
Static("Timeline chart placeholder", classes="chart-placeholder"),
classes="card",
),
classes="analytics-grid",
),
classes="main_container",
)
yield Footer()
async def on_mount(self) -> None:
"""Initialize the screen with enhanced loading."""
self.query_one("#loading").display = False
self.update_metrics()
self.refresh_collections() # Don't await, let it run as a worker
def update_metrics(self) -> None:
"""Update dashboard metrics with enhanced calculations."""
self.total_collections = len(self.collections)
self.total_documents = sum(col["count"] for col in self.collections)
# Count active backends
self.active_backends = 0
if self.weaviate:
self.active_backends += 1
if self.openwebui:
self.active_backends += 1
# Update metrics cards if they exist
try:
dashboard_tab = self.query_one("#dashboard")
metrics_cards = dashboard_tab.query(MetricsCard)
if len(metrics_cards) >= 4:
# Update existing cards with formatted values
metrics_cards[0].query_one(".metrics-value", Static).update(
f"{self.total_collections:,}"
)
metrics_cards[1].query_one(".metrics-value", Static).update(
f"{self.total_documents:,}"
)
metrics_cards[2].query_one(".metrics-value", Static).update(
str(self.active_backends)
)
# Update status card based on system health
if self.active_backends > 0 and self.total_collections > 0:
status_text = "🟢 Healthy"
status_class = "status-active"
elif self.active_backends > 0:
status_text = "🟡 Ready"
status_class = "status-warning"
else:
status_text = "🔴 Offline"
status_class = "status-error"
metrics_cards[3].query_one(".metrics-value", Static).update(status_text)
metrics_cards[3].add_class(status_class)
except Exception:
pass # Cards might not be rendered yet
# Update activity feed with real data
try:
dashboard_tab = self.query_one("#dashboard")
activity_feed = dashboard_tab.query_one("#activity_feed", Static)
if self.collections:
recent_activity = []
for col in self.collections[:3]: # Show top 3 collections
recent_activity.append(
f"📚 {col['name']}: {col['count']:,} docs ({col.get('size_mb', 0):.1f} MB)"
)
                activity_text = "\n".join(recent_activity)
                if len(self.collections) > 3:
                    activity_text += f"\n... and {len(self.collections) - 3} more collections"
else:
activity_text = "No collections found. Start by creating your first ingestion!"
activity_feed.update(activity_text)
except Exception:
pass
@work(exclusive=True)
async def refresh_collections(self) -> None:
"""Refresh collection data with enhanced loading feedback."""
self.is_loading = True
loading_indicator = self.query_one("#loading")
status_text = self.query_one("#status_text", Static)
loading_indicator.display = True
status_text.update("🔄 Refreshing collections...")
try:
collections = []
# Get Weaviate collections
if self.weaviate:
try:
status_text.update("🔗 Connecting to Weaviate...")
await self.weaviate.initialize()
weaviate_collections = await self.list_weaviate_collections()
collections.extend(weaviate_collections)
status_text.update("✅ Weaviate collections loaded")
except Exception as e:
self.notify(f"❌ Weaviate error: {e}", severity="error")
status_text.update("❌ Weaviate connection failed")
# Get OpenWebUI collections
if self.openwebui:
try:
status_text.update("🔗 Connecting to OpenWebUI...")
await self.openwebui.initialize()
openwebui_collections = await self.list_openwebui_collections()
collections.extend(openwebui_collections)
status_text.update("✅ OpenWebUI collections loaded")
except Exception as e:
self.notify(f"❌ OpenWebUI error: {e}", severity="error")
status_text.update("❌ OpenWebUI connection failed")
self.collections = collections
await self.update_collections_table()
self.update_metrics()
status_text.update(f"✨ Ready - {len(collections)} collections loaded")
# Update connection status
connection_status = self.query_one("#connection_status", StatusIndicator)
if collections:
connection_status.update_status("✓ Active")
else:
connection_status.update_status("No Data")
except Exception as e:
status_text.update(f"❌ Error: {e}")
self.notify(f"Failed to refresh: {e}", severity="error")
finally:
self.is_loading = False
loading_indicator.display = False
async def list_weaviate_collections(self) -> list[CollectionInfo]:
"""List Weaviate collections with enhanced metadata."""
if not self.weaviate:
return []
try:
collections = []
collections_list = (
self.weaviate.client.collections.list_all()
if self.weaviate and self.weaviate.client
else []
)
for collection in collections_list:
collection_obj = (
self.weaviate.client.collections.get(collection)
if self.weaviate and self.weaviate.client
else None
)
if not collection_obj:
continue
count = collection_obj.aggregate.over_all(total_count=True).total_count or 0
# Estimate size
size_mb = count * 0.01 # Rough estimate
collection_info = CollectionInfo(
name=collection,
type="weaviate",
count=count,
backend="🗄️ Weaviate",
status="✓ Active",
last_updated=datetime.now().strftime("%Y-%m-%d %H:%M"),
size_mb=size_mb,
)
collections.append(collection_info)
return collections
except Exception as e:
self.notify(f"Error listing Weaviate collections: {e}", severity="error")
return []
async def list_openwebui_collections(self) -> list[CollectionInfo]:
"""List OpenWebUI collections with enhanced metadata."""
if not self.openwebui:
return []
try:
response = await self.openwebui.client.get("/api/v1/knowledge/")
response.raise_for_status()
knowledge_bases = response.json()
collections = []
for kb in knowledge_bases:
file_count = len(kb.get("files", []))
size_mb = file_count * 0.5 # Rough estimate
collection_info = CollectionInfo(
name=kb.get("name", "Unknown"),
type="openwebui",
count=file_count,
backend="🌐 OpenWebUI",
status="✓ Active",
last_updated=kb.get("updated_at", datetime.now().strftime("%Y-%m-%d %H:%M")),
size_mb=size_mb,
)
collections.append(collection_info)
return collections
except Exception as e:
self.notify(f"Error listing OpenWebUI collections: {e}", severity="error")
return []
async def update_collections_table(self) -> None:
"""Update the collections table with enhanced formatting."""
table = self.query_one("#collections_table", EnhancedDataTable)
table.clear()
# Add enhanced columns
table.add_columns("Collection", "Backend", "Documents", "Size", "Status", "Updated")
# Add rows with enhanced formatting
for collection in self.collections:
# Format size
size_str = f"{collection['size_mb']:.1f} MB"
if collection["size_mb"] > 1000:
size_str = f"{collection['size_mb'] / 1000:.1f} GB"
# Format document count
doc_count = f"{collection['count']:,}"
table.add_row(
collection["name"],
collection["backend"],
doc_count,
size_str,
collection["status"],
collection["last_updated"],
)
def get_selected_collection(self) -> CollectionInfo | None:
"""Get the currently selected collection."""
table = self.query_one("#collections_table", EnhancedDataTable)
try:
if table.cursor_coordinate.row < len(self.collections):
return self.collections[table.cursor_coordinate.row]
except (AttributeError, IndexError):
pass
return None
# Action methods
def action_refresh(self) -> None:
"""Refresh collections."""
self.refresh_collections()
def action_ingest(self) -> None:
"""Show enhanced ingestion dialog."""
selected = self.get_selected_collection()
if selected:
from .ingestion import IngestionScreen
self.app.push_screen(IngestionScreen(selected))
else:
self.notify("🔍 Please select a collection first", severity="warning")
def action_manage(self) -> None:
"""Manage documents in selected collection."""
selected = self.get_selected_collection()
if selected:
if selected["type"] == "weaviate":
from .documents import DocumentManagementScreen
self.app.push_screen(DocumentManagementScreen(selected, self.weaviate))
else:
self.notify(
"🚧 Document management only available for Weaviate", severity="warning"
)
else:
self.notify("🔍 Please select a collection first", severity="warning")
def action_search(self) -> None:
"""Search in selected collection."""
selected = self.get_selected_collection()
if selected:
from .search import SearchScreen
self.app.push_screen(SearchScreen(selected, self.weaviate, self.openwebui))
else:
self.notify("🔍 Please select a collection first", severity="warning")
def action_delete(self) -> None:
"""Delete selected collection."""
selected = self.get_selected_collection()
if selected:
from .dialogs import ConfirmDeleteScreen
self.app.push_screen(ConfirmDeleteScreen(selected, self))
else:
self.notify("🔍 Please select a collection first", severity="warning")
def action_tab_dashboard(self) -> None:
"""Switch to dashboard tab."""
tabs = self.query_one(TabbedContent)
tabs.active = "dashboard"
def action_tab_collections(self) -> None:
"""Switch to collections tab."""
tabs = self.query_one(TabbedContent)
tabs.active = "collections"
def action_tab_analytics(self) -> None:
"""Switch to analytics tab."""
tabs = self.query_one(TabbedContent)
tabs.active = "analytics"
def action_next_tab(self) -> None:
"""Switch to next tab."""
tabs = self.query_one(TabbedContent)
tab_ids = ["dashboard", "collections", "analytics"]
current = tabs.active
try:
current_index = tab_ids.index(current)
next_index = (current_index + 1) % len(tab_ids)
tabs.active = tab_ids[next_index]
except (ValueError, AttributeError):
tabs.active = tab_ids[0]
def action_prev_tab(self) -> None:
"""Switch to previous tab."""
tabs = self.query_one(TabbedContent)
tab_ids = ["dashboard", "collections", "analytics"]
current = tabs.active
try:
current_index = tab_ids.index(current)
prev_index = (current_index - 1) % len(tab_ids)
tabs.active = tab_ids[prev_index]
except (ValueError, AttributeError):
tabs.active = tab_ids[0]
def action_help(self) -> None:
"""Show help screen."""
from .help import HelpScreen
help_md = """
# 🚀 Modern Collection Management System
## Navigation
- **Tab** / **Shift+Tab**: Switch between tabs
- **Ctrl+1/2/3**: Direct tab access
- **Enter**: Activate selected item
- **Escape**: Go back/cancel
- **Arrow Keys**: Navigate within tables
- **Home/End**: Jump to first/last row
- **Page Up/Down**: Scroll by page
## Collections
- **R**: Refresh collections
- **I**: Start ingestion
- **M**: Manage documents
- **S**: Search collection
- **Ctrl+D**: Delete collection
## Table Navigation
- **Up/Down** or **J/K**: Navigate rows
- **Space**: Toggle selection
- **Ctrl+A**: Select all
- **Ctrl+Shift+A**: Clear selection
## General
- **Q** / **Ctrl+C**: Quit application
- **F1**: Show this help
Enjoy the enhanced interface! 🎉
"""
self.app.push_screen(HelpScreen(help_md))
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses with enhanced feedback."""
button_id = event.button.id
# Add visual feedback
event.button.add_class("pressed")
self.call_later(self.remove_pressed_class, event.button)
        if button_id in ("refresh_btn", "quick_refresh"):
            self.action_refresh()
        elif button_id in ("ingest_btn", "quick_ingest"):
            self.action_ingest()
        elif button_id == "manage_btn":
            self.action_manage()
        elif button_id == "delete_btn":
            self.action_delete()
        elif button_id in ("search_btn", "quick_search"):
            self.action_search()
        elif button_id == "quick_settings":
            self.notify("⚙️ Settings panel coming soon!", severity="information")
def remove_pressed_class(self, button: Button) -> None:
"""Remove pressed visual feedback class."""
button.remove_class("pressed")
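
`refresh_collections` relies on Textual's `@work(exclusive=True)`, so a second refresh cancels the one still in flight, which is why callers invoke it without `await`. A minimal, self-contained sketch of that pattern (all names here are illustrative, not from this module):

```python
import asyncio

from textual import work
from textual.app import App, ComposeResult
from textual.widgets import Static


class RefreshDemo(App[None]):
    BINDINGS = [("r", "refresh", "Refresh")]

    def compose(self) -> ComposeResult:
        yield Static("Press 'r' to refresh", id="status")

    def action_refresh(self) -> None:
        # Calling the decorated method schedules a worker; no await needed.
        self.refresh_data()

    @work(exclusive=True)
    async def refresh_data(self) -> None:
        status = self.query_one("#status", Static)
        status.update("Refreshing...")
        await asyncio.sleep(1.0)  # stand-in for slow backend calls
        status.update("Done")


if __name__ == "__main__":
    RefreshDemo().run()
```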

View File

@@ -0,0 +1,189 @@
"""Dialog screens for confirmations and user interactions."""
from typing import TYPE_CHECKING
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Horizontal
from textual.screen import Screen
from textual.widgets import Button, Footer, Header, LoadingIndicator, Static
from typing_extensions import override
from ..models import CollectionInfo
if TYPE_CHECKING:
from .dashboard import CollectionOverviewScreen
from .documents import DocumentManagementScreen
class ConfirmDeleteScreen(Screen[None]):
"""Screen for confirming collection deletion."""
collection: CollectionInfo
parent_screen: "CollectionOverviewScreen"
BINDINGS = [
Binding("escape", "app.pop_screen", "Cancel"),
Binding("y", "confirm_delete", "Yes"),
Binding("n", "app.pop_screen", "No"),
Binding("enter", "confirm_delete", "Confirm"),
]
def __init__(self, collection: CollectionInfo, parent_screen: "CollectionOverviewScreen"):
super().__init__()
self.collection = collection
self.parent_screen = parent_screen
@override
def compose(self) -> ComposeResult:
yield Header()
yield Container(
Static("⚠️ Confirm Deletion", classes="title warning"),
Static(f"Are you sure you want to delete collection '{self.collection['name']}'?"),
Static(f"Backend: {self.collection['backend']}"),
Static(f"Documents: {self.collection['count']:,}"),
Static("This action cannot be undone!", classes="warning"),
Static("Press Y to confirm, N or Escape to cancel", classes="subtitle"),
Horizontal(
Button("✅ Yes, Delete (Y)", id="yes_btn", variant="error"),
Button("❌ Cancel (N)", id="no_btn", variant="default"),
classes="action_buttons",
),
classes="main_container center",
)
yield Footer()
def on_mount(self) -> None:
"""Initialize the screen with focus on cancel button for safety."""
self.query_one("#no_btn").focus()
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "yes_btn":
self.action_confirm_delete()
elif event.button.id == "no_btn":
self.app.pop_screen()
def action_confirm_delete(self) -> None:
"""Confirm deletion."""
self.run_worker(self.delete_collection())
async def delete_collection(self) -> None:
"""Delete the collection."""
try:
if self.collection["type"] == "weaviate" and self.parent_screen.weaviate:
# Delete Weaviate collection
if self.parent_screen.weaviate and self.parent_screen.weaviate.client:
self.parent_screen.weaviate.client.collections.delete(self.collection["name"])
self.notify(
f"Deleted Weaviate collection: {self.collection['name']}",
severity="information",
)
elif self.collection["type"] == "openwebui" and self.parent_screen.openwebui:
# Delete OpenWebUI knowledge base
response = await self.parent_screen.openwebui.client.delete(
f"/api/v1/knowledge/{self.collection['name']}"
)
response.raise_for_status()
self.notify(
f"Deleted OpenWebUI collection: {self.collection['name']}",
severity="information",
)
# Refresh parent screen
self.parent_screen.refresh_collections() # Don't await, let it run as a worker
self.app.pop_screen()
except Exception as e:
self.notify(f"Failed to delete collection: {e}", severity="error")
class ConfirmDocumentDeleteScreen(Screen[None]):
"""Screen for confirming document deletion."""
doc_ids: list[str]
collection: CollectionInfo
parent_screen: "DocumentManagementScreen"
BINDINGS = [
Binding("escape", "app.pop_screen", "Cancel"),
Binding("y", "confirm_delete", "Yes"),
Binding("n", "app.pop_screen", "No"),
Binding("enter", "confirm_delete", "Confirm"),
]
def __init__(
self,
doc_ids: list[str],
collection: CollectionInfo,
parent_screen: "DocumentManagementScreen",
):
super().__init__()
self.doc_ids = doc_ids
self.collection = collection
self.parent_screen = parent_screen
@override
def compose(self) -> ComposeResult:
yield Header()
yield Container(
Static("⚠️ Confirm Document Deletion", classes="title warning"),
Static(
f"Are you sure you want to delete {len(self.doc_ids)} documents from '{self.collection['name']}'?"
),
Static("This action cannot be undone!", classes="warning"),
Static("Press Y to confirm, N or Escape to cancel", classes="subtitle"),
Horizontal(
Button("✅ Yes, Delete (Y)", id="yes_btn", variant="error"),
Button("❌ Cancel (N)", id="no_btn", variant="default"),
classes="action_buttons",
),
LoadingIndicator(id="loading"),
classes="main_container center",
)
yield Footer()
def on_mount(self) -> None:
"""Initialize the screen with focus on cancel button for safety."""
self.query_one("#loading").display = False
self.query_one("#no_btn").focus()
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "yes_btn":
self.action_confirm_delete()
elif event.button.id == "no_btn":
self.app.pop_screen()
def action_confirm_delete(self) -> None:
"""Confirm deletion."""
self.run_worker(self.delete_documents())
async def delete_documents(self) -> None:
"""Delete the selected documents."""
loading = self.query_one("#loading")
loading.display = True
try:
if self.parent_screen.weaviate:
# Delete documents
results = await self.parent_screen.weaviate.delete_documents(self.doc_ids)
# Count successful deletions
successful = sum(1 for success in results.values() if success)
failed = len(results) - successful
if successful > 0:
self.notify(f"Deleted {successful} documents", severity="information")
if failed > 0:
self.notify(f"Failed to delete {failed} documents", severity="error")
# Clear selection and refresh parent screen
self.parent_screen.selected_docs.clear()
await self.parent_screen.load_documents()
self.app.pop_screen()
except Exception as e:
self.notify(f"Failed to delete documents: {e}", severity="error")
finally:
loading.display = False
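
Both dialogs reach back into `parent_screen` to apply the deletion and refresh. Textual also supports handing a result back to the caller via `ModalScreen.dismiss()` and a `push_screen` callback; a hedged sketch of that alternative wiring, with illustrative names only:

```python
from textual.app import App, ComposeResult
from textual.screen import ModalScreen
from textual.widgets import Button


class ConfirmScreen(ModalScreen[bool]):
    def compose(self) -> ComposeResult:
        yield Button("Yes", id="yes", variant="error")
        yield Button("No", id="no")

    def on_button_pressed(self, event: Button.Pressed) -> None:
        self.dismiss(event.button.id == "yes")  # hand a bool back to the caller


class DemoApp(App[None]):
    def on_mount(self) -> None:
        self.push_screen(ConfirmScreen(), self.handle_confirm)

    def handle_confirm(self, confirmed: bool) -> None:
        self.notify(f"confirmed={confirmed}")


if __name__ == "__main__":
    DemoApp().run()
```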

View File

@@ -0,0 +1,279 @@
"""Document management screen with enhanced navigation."""
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Horizontal
from textual.screen import Screen
from textual.widgets import Button, Footer, Header, Label, LoadingIndicator, Static
from typing_extensions import override
from ....storage.weaviate import WeaviateStorage
from ..models import CollectionInfo, DocumentInfo
from ..widgets import EnhancedDataTable
class DocumentManagementScreen(Screen[None]):
"""Screen for managing documents within a collection with enhanced keyboard navigation."""
collection: CollectionInfo
weaviate: WeaviateStorage | None
documents: list[DocumentInfo]
selected_docs: set[str]
current_offset: int
page_size: int
BINDINGS = [
Binding("escape", "app.pop_screen", "Back"),
Binding("r", "refresh", "Refresh"),
Binding("delete", "delete_selected", "Delete Selected"),
Binding("a", "select_all", "Select All"),
Binding("ctrl+a", "select_all", "Select All"),
Binding("n", "select_none", "Clear Selection"),
Binding("ctrl+shift+a", "select_none", "Clear Selection"),
Binding("space", "toggle_selection", "Toggle Selection"),
Binding("ctrl+d", "delete_selected", "Delete Selected"),
Binding("pageup", "prev_page", "Previous Page"),
Binding("pagedown", "next_page", "Next Page"),
Binding("home", "first_page", "First Page"),
Binding("end", "last_page", "Last Page"),
]
def __init__(self, collection: CollectionInfo, weaviate: WeaviateStorage | None):
super().__init__()
self.collection = collection
self.weaviate = weaviate
self.documents: list[DocumentInfo] = []
self.selected_docs: set[str] = set()
self.current_offset = 0
self.page_size = 50
@override
def compose(self) -> ComposeResult:
yield Header()
yield Container(
Static(f"📄 Document Management: {self.collection['name']}", classes="title"),
Static(
f"Total Documents: {self.collection['count']:,} | Use Space to select, Delete to remove",
classes="subtitle"
),
Label(f"Page size: {self.page_size} documents"),
EnhancedDataTable(id="documents_table", classes="enhanced-table"),
Horizontal(
Button("🔄 Refresh", id="refresh_docs_btn", variant="primary"),
Button("🗑️ Delete Selected", id="delete_selected_btn", variant="error"),
Button("✅ Select All", id="select_all_btn", variant="default"),
Button("❌ Clear Selection", id="clear_selection_btn", variant="default"),
Button("⬅️ Previous Page", id="prev_page_btn", variant="default"),
Button("➡️ Next Page", id="next_page_btn", variant="default"),
classes="button_bar",
),
Label("", id="selection_status"),
Static("", id="page_info", classes="status-text"),
LoadingIndicator(id="loading"),
classes="main_container",
)
yield Footer()
async def on_mount(self) -> None:
"""Initialize the screen."""
self.query_one("#loading").display = False
# Setup documents table
table = self.query_one("#documents_table", EnhancedDataTable)
table.add_columns("", "Title", "Source URL", "Words", "ID")
        # Allow the table to take keyboard focus for row navigation
        table.can_focus = True
await self.load_documents()
async def load_documents(self) -> None:
"""Load documents from the collection."""
loading = self.query_one("#loading")
loading.display = True
try:
if self.weaviate:
# Set the collection name
self.weaviate.config.collection_name = self.collection["name"]
# Load documents with pagination
raw_docs = await self.weaviate.list_documents(
limit=self.page_size, offset=self.current_offset
)
# Cast to proper type with type checking
self.documents = [
DocumentInfo(
id=str(doc["id"]),
title=str(doc["title"]),
source_url=str(doc["source_url"]),
content_preview=str(doc["content_preview"]),
word_count=int(doc["word_count"])
if isinstance(doc["word_count"], (int, str))
and str(doc["word_count"]).isdigit()
else 0,
timestamp=str(doc["timestamp"]),
)
for doc in raw_docs
]
await self.update_table()
self.update_selection_status()
self.update_page_info()
except Exception as e:
self.notify(f"Error loading documents: {e}", severity="error")
finally:
loading.display = False
async def update_table(self) -> None:
"""Update the documents table."""
table = self.query_one("#documents_table", EnhancedDataTable)
table.clear()
# Re-add columns
table.add_columns("", "Title", "Source URL", "Words", "ID")
# Add rows
for doc in self.documents:
selected = "" if doc["id"] in self.selected_docs else ""
table.add_row(
selected,
doc.get("title", "Untitled")[:50],
doc.get("source_url", "")[:50],
str(doc.get("word_count", 0)),
doc["id"][:8] + "...", # Show truncated ID
)
def update_selection_status(self) -> None:
"""Update the selection status label."""
status_label = self.query_one("#selection_status", Label)
total_selected = len(self.selected_docs)
status_label.update(f"Selected: {total_selected} documents")
def update_page_info(self) -> None:
"""Update the page information."""
page_info = self.query_one("#page_info", Static)
total_docs = self.collection["count"]
start = self.current_offset + 1
end = min(self.current_offset + len(self.documents), total_docs)
page_num = (self.current_offset // self.page_size) + 1
total_pages = (total_docs + self.page_size - 1) // self.page_size
page_info.update(
f"Showing {start:,}-{end:,} of {total_docs:,} documents (Page {page_num} of {total_pages})"
)
def get_current_document(self) -> DocumentInfo | None:
"""Get the currently selected document."""
table = self.query_one("#documents_table", EnhancedDataTable)
try:
if 0 <= table.cursor_coordinate.row < len(self.documents):
return self.documents[table.cursor_coordinate.row]
except (AttributeError, IndexError):
pass
return None
# Action methods
def action_refresh(self) -> None:
"""Refresh the document list."""
self.run_worker(self.load_documents())
def action_toggle_selection(self) -> None:
"""Toggle selection of current row."""
doc = self.get_current_document()
if doc:
doc_id = doc["id"]
if doc_id in self.selected_docs:
self.selected_docs.remove(doc_id)
else:
self.selected_docs.add(doc_id)
self.run_worker(self.update_table())
self.update_selection_status()
def action_select_all(self) -> None:
"""Select all documents on current page."""
for doc in self.documents:
self.selected_docs.add(doc["id"])
self.run_worker(self.update_table())
self.update_selection_status()
def action_select_none(self) -> None:
"""Clear all selections."""
self.selected_docs.clear()
self.run_worker(self.update_table())
self.update_selection_status()
def action_delete_selected(self) -> None:
"""Delete selected documents."""
if self.selected_docs:
from .dialogs import ConfirmDocumentDeleteScreen
self.app.push_screen(
ConfirmDocumentDeleteScreen(list(self.selected_docs), self.collection, self)
)
else:
self.notify("No documents selected", severity="warning")
def action_next_page(self) -> None:
"""Go to next page."""
if self.current_offset + self.page_size < self.collection["count"]:
self.current_offset += self.page_size
self.run_worker(self.load_documents())
def action_prev_page(self) -> None:
"""Go to previous page."""
if self.current_offset >= self.page_size:
self.current_offset -= self.page_size
self.run_worker(self.load_documents())
def action_first_page(self) -> None:
"""Go to first page."""
if self.current_offset > 0:
self.current_offset = 0
self.run_worker(self.load_documents())
    def action_last_page(self) -> None:
        """Go to last page."""
        total_docs = self.collection["count"]
        if total_docs <= 0:
            return  # guard against a negative offset on an empty collection
        last_offset = ((total_docs - 1) // self.page_size) * self.page_size
        if self.current_offset != last_offset:
            self.current_offset = last_offset
            self.run_worker(self.load_documents())
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "refresh_docs_btn":
self.action_refresh()
elif event.button.id == "delete_selected_btn":
self.action_delete_selected()
elif event.button.id == "select_all_btn":
self.action_select_all()
elif event.button.id == "clear_selection_btn":
self.action_select_none()
elif event.button.id == "next_page_btn":
self.action_next_page()
elif event.button.id == "prev_page_btn":
self.action_prev_page()
def on_enhanced_data_table_row_toggled(self, event: EnhancedDataTable.RowToggled) -> None:
"""Handle row toggle from enhanced table."""
if 0 <= event.row_index < len(self.documents):
doc = self.documents[event.row_index]
doc_id = doc["id"]
if doc_id in self.selected_docs:
self.selected_docs.remove(doc_id)
else:
self.selected_docs.add(doc_id)
self.run_worker(self.update_table())
self.update_selection_status()
def on_enhanced_data_table_select_all(self, event: EnhancedDataTable.SelectAll) -> None:
"""Handle select all from enhanced table."""
self.action_select_all()
def on_enhanced_data_table_clear_selection(self, event: EnhancedDataTable.ClearSelection) -> None:
"""Handle clear selection from enhanced table."""
self.action_select_none()
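
The paging arithmetic above is the usual off-by-one minefield; here are the same formulas isolated as a plain function with a couple of sanity checks (this sketch assumes a full page of `page_size` rows is loaded, whereas `update_page_info` uses `len(self.documents)` for the visible end):

```python
def page_info(offset: int, page_size: int, total_docs: int) -> tuple[int, int, int, int]:
    start = offset + 1  # 1-based index of the first visible document
    end = min(offset + page_size, total_docs)
    page_num = (offset // page_size) + 1
    total_pages = (total_docs + page_size - 1) // page_size  # ceiling division
    return start, end, page_num, total_pages


assert page_info(0, 50, 120) == (1, 50, 1, 3)
assert page_info(100, 50, 120) == (101, 120, 3, 3)
# Offset of the last page, as computed in action_last_page (total_docs > 0):
assert ((120 - 1) // 50) * 50 == 100
```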

View File

@@ -0,0 +1,50 @@
"""Help screen with keyboard shortcuts and usage information."""
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, ScrollableContainer
from textual.screen import ModalScreen
from textual.widgets import Button, Markdown, Rule, Static
from typing_extensions import override
class HelpScreen(ModalScreen[None]):
"""Modern help screen with comprehensive keyboard shortcuts."""
help_content: str
BINDINGS = [
Binding("escape", "app.pop_screen", "Close"),
Binding("q", "app.pop_screen", "Close"),
Binding("enter", "app.pop_screen", "Close"),
Binding("f1", "app.pop_screen", "Close"),
]
def __init__(self, help_content: str):
super().__init__()
self.help_content = help_content
@override
def compose(self) -> ComposeResult:
with Container(classes="modal-container"):
yield Static("📚 Help & Keyboard Shortcuts", classes="title")
yield Static("Enhanced navigation and productivity features", classes="subtitle")
yield Rule(line_style="heavy")
with ScrollableContainer():
yield Markdown(self.help_content)
yield Container(
Button("✅ Got it! (Press Escape or Enter)", id="close_btn", variant="primary"),
classes="action_buttons center",
)
def on_mount(self) -> None:
"""Initialize the help screen."""
# Focus the close button
self.query_one("#close_btn").focus()
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Close help screen."""
if event.button.id == "close_btn":
self.app.pop_screen()

View File

@@ -0,0 +1,253 @@
"""Enhanced ingestion screen with better UX."""
import asyncio
from datetime import datetime
from textual import work
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Horizontal
from textual.screen import ModalScreen
from textual.widgets import Button, Input, Label, LoadingIndicator, Rule, Static
from typing_extensions import override
from ....core.models import IngestionJob, IngestionSource, StorageBackend
from ..models import CollectionInfo
from ..widgets import EnhancedProgressBar
class IngestionScreen(ModalScreen[None]):
"""Enhanced ingestion screen with better UX and keyboard navigation."""
collection: CollectionInfo
selected_type: IngestionSource
progress_value: int
BINDINGS = [
Binding("escape", "app.pop_screen", "Cancel"),
Binding("ctrl+i", "start_ingestion", "Start"),
Binding("1", "select_web", "Web", show=False),
Binding("2", "select_repo", "Repository", show=False),
Binding("3", "select_docs", "Documentation", show=False),
Binding("enter", "start_ingestion", "Start Ingestion"),
Binding("tab", "focus_next", "Next Field"),
Binding("shift+tab", "focus_previous", "Previous Field"),
]
def __init__(self, collection: CollectionInfo):
super().__init__()
self.collection = collection
self.selected_type = IngestionSource.WEB
self.progress_value = 0
@override
def compose(self) -> ComposeResult:
with Container(classes="modal-container"):
yield Static("📥 Modern Ingestion Interface", classes="title")
yield Static(
f"Target: {self.collection['name']} ({self.collection['backend']})",
classes="subtitle",
)
yield Rule()
# Enhanced input section
yield Container(
Label("🌐 Source URL:", classes="input-label"),
Input(
placeholder="https://docs.example.com or file:///path/to/repo",
id="url_input",
classes="modern-input",
),
Label("📋 Source Type (Press 1/2/3):", classes="input-label"),
Horizontal(
Button("🌐 Web (1)", id="web_btn", variant="primary", classes="type-button"),
Button(
"📦 Repository (2)", id="repo_btn", variant="default", classes="type-button"
),
Button(
"📖 Documentation (3)", id="docs_btn", variant="default", classes="type-button"
),
classes="type_buttons",
),
Rule(line_style="dashed"),
classes="input-section card",
)
# Enhanced Progress section
yield Container(
Label("🔄 Progress:", classes="progress-label"),
EnhancedProgressBar(id="enhanced_progress", total=100),
Static("Ready to start", id="progress_text", classes="status-text"),
classes="progress-section card",
)
# Action buttons
yield Horizontal(
Button("🚀 Start Ingestion", id="start_btn", variant="success"),
Button("❌ Cancel", id="cancel_btn", variant="error"),
classes="action_buttons",
)
yield LoadingIndicator(id="loading", classes="pulse")
def on_mount(self) -> None:
"""Initialize the screen."""
self.query_one("#loading").display = False
self.selected_type = IngestionSource.WEB
# Focus the URL input field by default
self.query_one("#url_input").focus()
def action_select_web(self) -> None:
"""Select web ingestion type."""
self.selected_type = IngestionSource.WEB
self.update_type_buttons("web")
def action_select_repo(self) -> None:
"""Select repository ingestion type."""
self.selected_type = IngestionSource.REPOSITORY
self.update_type_buttons("repo")
def action_select_docs(self) -> None:
"""Select documentation ingestion type."""
self.selected_type = IngestionSource.DOCUMENTATION
self.update_type_buttons("docs")
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses with enhanced feedback."""
button_id = event.button.id
if button_id == "web_btn":
self.action_select_web()
elif button_id == "repo_btn":
self.action_select_repo()
elif button_id == "docs_btn":
self.action_select_docs()
elif button_id == "start_btn":
self.action_start_ingestion()
elif button_id == "cancel_btn":
self.app.pop_screen()
def update_type_buttons(self, selected: str) -> None:
"""Update type button visual states."""
buttons = {
"web": self.query_one("#web_btn", Button),
"repo": self.query_one("#repo_btn", Button),
"docs": self.query_one("#docs_btn", Button),
}
for btn_type, button in buttons.items():
if btn_type == selected:
button.variant = "primary"
else:
button.variant = "default"
def on_input_submitted(self, event: Input.Submitted) -> None:
"""Handle URL input submission."""
if event.input.id == "url_input":
self.action_start_ingestion()
def action_start_ingestion(self) -> None:
"""Start the enhanced ingestion process."""
url_input = self.query_one("#url_input", Input)
if not url_input.value.strip():
self.notify("🔍 Please enter a source URL", severity="error")
url_input.focus()
return
self.perform_ingestion(url_input.value.strip())
@work(exclusive=True)
async def perform_ingestion(self, source_url: str) -> None:
"""Perform ingestion with enhanced progress tracking and better UX."""
loading = self.query_one("#loading")
enhanced_progress = self.query_one("#enhanced_progress", EnhancedProgressBar)
progress_text = self.query_one("#progress_text", Static)
try:
loading.display = True
# Enhanced progress tracking with better visual feedback
enhanced_progress.update_progress(5, "Initializing ingestion pipeline...")
progress_text.update("🚀 Starting modern ingestion process...")
await asyncio.sleep(0.3)
# Determine storage backend
storage_backend = (
StorageBackend.WEAVIATE
if self.collection["type"] == "weaviate"
else StorageBackend.OPEN_WEBUI
)
enhanced_progress.update_progress(15, "Creating ingestion job...")
progress_text.update("📋 Configuring job parameters...")
await asyncio.sleep(0.4)
# Create ingestion job
job = IngestionJob(
source_url=source_url,
source_type=self.selected_type,
storage_backend=storage_backend,
created_at=datetime.now(),
)
enhanced_progress.update_progress(25, "Loading ingestion modules...")
progress_text.update("⚡ Importing processing components...")
await asyncio.sleep(0.4)
from ....flows.ingestion import ingest_documents_task
enhanced_progress.update_progress(35, "Connecting to services...")
progress_text.update(f"🔗 Establishing connection to {storage_backend.value}...")
await asyncio.sleep(0.5)
enhanced_progress.update_progress(45, "Fetching source content...")
progress_text.update("📄 Retrieving documents from source...")
await asyncio.sleep(0.6)
# Simulate realistic progress steps
progress_steps = [
(55, "Parsing document structure...", "🔍 Analyzing content structure..."),
(65, "Extracting text content...", "📝 Processing text and metadata..."),
(75, "Generating embeddings...", "🧠 Creating vector embeddings..."),
(85, "Storing in database...", "💾 Persisting to storage backend..."),
(95, "Finalizing operation...", "🎯 Completing ingestion process..."),
]
for progress, status, text in progress_steps:
enhanced_progress.update_progress(progress, status)
progress_text.update(text)
await asyncio.sleep(0.7)
# Perform actual ingestion
successful, failed = await ingest_documents_task(
job, collection_name=self.collection["name"]
)
# Success handling with celebratory feedback
enhanced_progress.update_progress(100, "Completed successfully!")
progress_text.update(
f"🎉 Ingestion complete: {successful} documents added, {failed} failed"
)
# Show enhanced success notification
if successful > 0:
self.notify(
f"🎉 Successfully ingested {successful} documents!",
severity="information"
)
if failed > 0:
self.notify(f"⚠️ {failed} documents failed to process", severity="warning")
else:
self.notify("❌ No documents were successfully processed", severity="error")
# Keep results visible before closing
await asyncio.sleep(3)
self.app.pop_screen()
except Exception as e:
enhanced_progress.update_progress(0, "Ingestion failed")
progress_text.update(f"❌ Error occurred: {str(e)[:100]}")
self.notify(f"❌ Ingestion failed: {e}", severity="error")
await asyncio.sleep(2) # Show error before allowing interaction
finally:
loading.display = False
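
Behind the progress theater, the screen just builds an `IngestionJob` and awaits `ingest_documents_task`. A sketch of the same call outside the TUI; the absolute import paths are guessed from this commit's layout and may differ:

```python
import asyncio
from datetime import datetime

from ingest_pipeline.core.models import IngestionJob, IngestionSource, StorageBackend
from ingest_pipeline.flows.ingestion import ingest_documents_task


async def main() -> None:
    job = IngestionJob(
        source_url="https://docs.example.com",
        source_type=IngestionSource.WEB,
        storage_backend=StorageBackend.WEAVIATE,
        created_at=datetime.now(),
    )
    successful, failed = await ingest_documents_task(job, collection_name="docs_example")
    print(f"{successful} ingested, {failed} failed")


if __name__ == "__main__":
    asyncio.run(main())
```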

View File

@@ -0,0 +1,190 @@
"""Search screen for finding documents within collections."""
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container
from textual.screen import Screen
from textual.widgets import Button, Footer, Header, Input, LoadingIndicator, Static
from typing_extensions import override
from ....storage.openwebui import OpenWebUIStorage
from ....storage.weaviate import WeaviateStorage
from ..models import CollectionInfo
from ..widgets import EnhancedDataTable
class SearchScreen(Screen[None]):
"""Screen for searching within a collection with enhanced keyboard navigation."""
collection: CollectionInfo
weaviate: WeaviateStorage | None
openwebui: OpenWebUIStorage | None
BINDINGS = [
Binding("escape", "app.pop_screen", "Back"),
Binding("enter", "perform_search", "Search"),
Binding("ctrl+f", "focus_search", "Focus Search"),
Binding("f3", "perform_search", "Search Again"),
Binding("ctrl+r", "clear_results", "Clear Results"),
Binding("/", "focus_search", "Quick Search"),
]
def __init__(
self,
collection: CollectionInfo,
weaviate: WeaviateStorage | None,
openwebui: OpenWebUIStorage | None,
):
super().__init__()
self.collection = collection
self.weaviate = weaviate
self.openwebui = openwebui
@override
def compose(self) -> ComposeResult:
yield Header()
yield Container(
Static(
f"🔍 Search in: {self.collection['name']} ({self.collection['backend']})",
classes="title",
),
Static("Press / or Ctrl+F to focus search, Enter to search", classes="subtitle"),
Input(placeholder="Enter search query... (press Enter to search)", id="search_input"),
Button("🔍 Search", id="search_btn", variant="primary"),
Button("🗑️ Clear Results", id="clear_btn", variant="default"),
EnhancedDataTable(id="results_table"),
Static("Enter your search query to find relevant documents.", id="search_status", classes="status-text"),
LoadingIndicator(id="loading"),
classes="main_container",
)
yield Footer()
def on_mount(self) -> None:
"""Initialize the screen."""
self.query_one("#loading").display = False
# Setup results table
table = self.query_one("#results_table", EnhancedDataTable)
table.add_columns("Title", "Content Preview", "Score")
# Focus search input
self.query_one("#search_input").focus()
def action_focus_search(self) -> None:
"""Focus the search input field."""
search_input = self.query_one("#search_input", Input)
search_input.focus()
def action_clear_results(self) -> None:
"""Clear search results."""
table = self.query_one("#results_table", EnhancedDataTable)
table.clear()
table.add_columns("Title", "Content Preview", "Score")
status = self.query_one("#search_status", Static)
status.update("Search results cleared. Enter a new query to search.")
def on_input_submitted(self, event: Input.Submitted) -> None:
"""Handle search input submission."""
if event.input.id == "search_input":
self.action_perform_search()
def on_button_pressed(self, event: Button.Pressed) -> None:
"""Handle button presses."""
if event.button.id == "search_btn":
self.action_perform_search()
elif event.button.id == "clear_btn":
self.action_clear_results()
def action_perform_search(self) -> None:
"""Perform search."""
search_input = self.query_one("#search_input", Input)
if not search_input.value.strip():
self.notify("Please enter a search query", severity="warning")
search_input.focus()
return
self.run_worker(self.search_collection(search_input.value.strip()))
async def search_collection(self, query: str) -> None:
"""Search the collection."""
loading = self.query_one("#loading")
table = self.query_one("#results_table", EnhancedDataTable)
status = self.query_one("#search_status", Static)
try:
loading.display = True
status.update(f"🔍 Searching for '{query}'...")
table.clear()
table.add_columns("Title", "Content Preview", "Score")
results = []
if self.collection["type"] == "weaviate" and self.weaviate:
results = await self.search_weaviate(query)
elif self.collection["type"] == "openwebui" and self.openwebui:
results = await self.search_openwebui(query)
# Add results to table
for result in results:
title = result.get("title", "Untitled")
content = result.get("content", "")
score = result.get("score", 0)
table.add_row(
title[:50] if isinstance(title, str) else str(title)[:50],
(content[:100] + "...")
if isinstance(content, str)
else str(content)[:100] + "...",
f"{score:.3f}" if isinstance(score, (int, float)) else str(score),
)
if not results:
status.update(f"No results found for '{query}'. Try different keywords.")
self.notify("No results found", severity="information")
else:
status.update(f"Found {len(results)} results for '{query}'. Use arrow keys to navigate.")
self.notify(f"Found {len(results)} results", severity="information")
# Focus the table for navigation
table.focus()
except Exception as e:
status.update(f"Search error: {e}")
self.notify(f"Search error: {e}", severity="error")
finally:
loading.display = False
async def search_weaviate(self, query: str) -> list[dict[str, str | float]]:
"""Search Weaviate collection."""
if not self.weaviate:
return []
try:
await self.weaviate.initialize()
results_generator = self.weaviate.search(query, limit=20)
results = [doc async for doc in results_generator]
# Convert Document objects to dict format expected by the UI
return [
{
"title": getattr(doc, "title", "Untitled"),
"content": getattr(doc, "content", ""),
"score": getattr(doc, "score", 0.0),
}
for doc in results
]
except Exception as e:
self.notify(f"Weaviate search error: {e}", severity="error")
return []
async def search_openwebui(self, query: str) -> list[dict[str, str | float]]:
"""Search OpenWebUI collection."""
if not self.openwebui:
return []
try:
# OpenWebUI does not have a direct search API, so return empty
# In a real implementation, you would need to implement search via their API
self.notify("OpenWebUI search not yet implemented", severity="warning")
return []
except Exception as e:
self.notify(f"OpenWebUI search error: {e}", severity="error")
return []
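
`search_weaviate` drains an async generator with an `async for` comprehension. A standalone toy version of that pattern, with a fake generator standing in for the Weaviate client:

```python
import asyncio
from collections.abc import AsyncIterator


async def fake_search(query: str, limit: int = 3) -> AsyncIterator[dict[str, str | float]]:
    for i in range(limit):
        await asyncio.sleep(0)  # stand-in for network latency
        yield {"title": f"{query} result {i}", "content": "...", "score": 1.0 - i * 0.1}


async def main() -> None:
    results = [doc async for doc in fake_search("vector databases")]
    for doc in results:
        print(f"{doc['title']}: {doc['score']:.3f}")


asyncio.run(main())
```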

View File

@@ -0,0 +1,346 @@
"""Modern CSS styles for the TUI application."""
# Enhanced modern CSS with better focus indicators and navigation feedback
TUI_CSS = """
/* Base styling */
Screen {
background: #1a1a1a;
}
* {
color: #ffffff;
}
/* Title styling */
.title {
text-align: center;
margin: 1;
color: #ffffff;
text-style: bold;
background: #333333;
padding: 1;
border: solid #0088cc;
}
.subtitle {
text-align: center;
margin: 1 0;
color: #cccccc;
text-style: italic;
background: #333333;
padding: 1;
}
/* Container styling */
.main_container {
margin: 1;
padding: 1;
background: #333333;
}
.card {
background: #333333;
padding: 1;
margin: 1;
color: #ffffff;
border: solid #444444;
}
.card:focus-within {
border: solid #0088cc;
}
/* Button styling with focus states */
Button {
background: #444444;
color: #ffffff;
margin: 0 1;
border: solid transparent;
}
Button:hover {
background: #0088cc;
color: #ffffff;
}
Button:focus {
border: solid #ffffff;
background: #0088cc;
}
Button.-primary {
background: #0088cc;
color: #ffffff;
}
Button.-success {
background: #28a745;
color: #ffffff;
}
Button.-error {
background: #dc3545;
color: #ffffff;
}
Button.-warning {
background: #ffc107;
color: #000000;
}
/* Enhanced DataTable with focus indicators */
DataTable {
background: #333333;
color: #ffffff;
border: solid #444444;
}
DataTable:focus {
border: solid #0088cc;
}
DataTable > .datatable--header {
background: #444444;
color: #ffffff;
text-style: bold;
}
DataTable > .datatable--cursor {
background: #0088cc;
color: #ffffff;
}
DataTable > .datatable--cursor-row {
background: #0066aa;
color: #ffffff;
}
/* Input styling */
Input {
background: #333333;
color: #ffffff;
border: solid #666666;
}
Input:focus {
border: solid #0088cc;
}
/* Header and Footer */
Header, Footer {
background: #333333;
color: #ffffff;
}
/* Tab styling with focus indicators */
Tab {
background: #333333;
color: #ffffff;
border: solid transparent;
}
Tab:focus {
border: solid #ffffff;
}
Tab.-active {
background: #0088cc;
color: #ffffff;
text-style: bold;
}
/* Label styling */
Label {
color: #ffffff;
}
/* Status indicators */
.status-active {
color: #28a745;
}
.status-error {
color: #dc3545;
}
.status-warning {
color: #ffc107;
}
/* Animations */
.pulse {
text-style: blink;
}
.glow {
background: #0088cc;
color: #ffffff;
}
.shimmer {
text-style: italic;
}
/* Metrics styling */
.metrics-value {
text-style: bold;
text-align: center;
color: #ffffff;
}
.metrics-label {
text-align: center;
color: #cccccc;
}
.metrics-description {
text-align: center;
color: #999999;
text-style: italic;
}
/* Section titles */
.section-title {
text-style: bold;
color: #ffffff;
margin: 1 0;
}
/* Status text */
.status-text {
color: #cccccc;
}
/* Button groups */
.button_bar {
margin: 1 0;
}
.action_buttons {
margin: 1;
text-align: center;
}
/* Progress styling */
.progress-label {
color: #ffffff;
margin: 1 0;
}
/* Responsive grid */
.responsive-grid {
grid-size: 4;
grid-gutter: 1;
}
.metrics-grid {
grid-size: 4;
grid-gutter: 1;
margin: 1;
}
/* Modal container */
.modal-container {
background: #333333;
border: solid #0088cc;
padding: 2;
margin: 2;
}
/* Chart placeholders */
.chart-title {
text-style: bold;
color: #ffffff;
margin: 1 0;
}
.chart-placeholder {
color: #999999;
text-style: italic;
text-align: center;
padding: 2;
}
/* Analytics grid */
.analytics-grid {
grid-size: 2;
grid-gutter: 1;
}
/* Enhanced table styling */
.enhanced-table {
background: #333333;
color: #ffffff;
border: solid #666666;
}
.enhanced-table:focus {
border: solid #0088cc;
}
/* Status bar */
.status-bar {
background: #444444;
color: #ffffff;
padding: 0 1;
}
/* Input section styling */
.input-section {
margin: 1;
padding: 1;
}
.input-label {
color: #ffffff;
margin: 1 0;
}
.modern-input {
background: #333333;
color: #ffffff;
border: solid #666666;
margin: 1 0;
}
.modern-input:focus {
border: solid #0088cc;
}
/* Type buttons */
.type_buttons {
margin: 1 0;
}
.type-button {
margin: 0 1;
}
/* Progress section */
.progress-section {
margin: 1;
padding: 1;
}
/* Center alignment */
.center {
text-align: center;
}
/* Warning styling */
.warning {
color: #ffc107;
text-style: bold;
}
/* Pressed button state */
.pressed {
background: #006699;
color: #ffffff;
}
/* Focus ring for better accessibility */
*:focus {
outline: solid #0088cc;
}
"""

View File

@@ -0,0 +1,5 @@
"""Utility functions for the TUI."""
from .runners import dashboard, run_textual_tui
__all__ = ["dashboard", "run_textual_tui"]

View File

@@ -0,0 +1,64 @@
"""TUI runner functions and initialization."""
import asyncio
from ....core.models import StorageBackend, StorageConfig
from ....storage.openwebui import OpenWebUIStorage
from ....storage.weaviate import WeaviateStorage
from ..app import CollectionManagementApp
async def run_textual_tui() -> None:
"""Run the enhanced modern TUI with better error handling and initialization."""
from ....config.settings import get_settings
settings = get_settings()
# Initialize storage backends with enhanced error handling
weaviate = None
openwebui = None
print("🚀 Initializing Modern Collection Management System...")
try:
print("🔗 Connecting to Weaviate...")
weaviate_config = StorageConfig(
backend=StorageBackend.WEAVIATE,
endpoint=settings.weaviate_endpoint,
api_key=settings.weaviate_api_key,
collection_name="default",
)
weaviate = WeaviateStorage(weaviate_config)
await weaviate.initialize()
print("✅ Weaviate connected successfully!")
except Exception as e:
print(f"⚠️ Weaviate connection failed: {e}")
try:
print("🔗 Connecting to OpenWebUI...")
openwebui_config = StorageConfig(
backend=StorageBackend.OPEN_WEBUI,
endpoint=settings.openwebui_endpoint,
api_key=settings.openwebui_api_key,
collection_name="default",
)
openwebui = OpenWebUIStorage(openwebui_config)
await openwebui.initialize()
print("✅ OpenWebUI connected successfully!")
except Exception as e:
print(f"⚠️ OpenWebUI connection failed: {e}")
if not weaviate and not openwebui:
print("❌ Error: Could not connect to any storage backend")
print("Please check your configuration and try again.")
return
print("🎉 Launching Enhanced TUI with Keyboard Navigation...")
app = CollectionManagementApp(weaviate, openwebui)
await app.run_async()
def dashboard() -> None:
"""Launch the modern collection dashboard."""
asyncio.run(run_textual_tui())
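
Assuming the package layout above, `dashboard` doubles as a synchronous entry point; a minimal sketch of invoking it directly:

```python
if __name__ == "__main__":
    # dashboard() blocks: it wraps run_textual_tui() in asyncio.run().
    dashboard()
```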

View File

@@ -0,0 +1,12 @@
"""Enhanced widgets with keyboard navigation support."""
from .cards import MetricsCard
from .indicators import EnhancedProgressBar, StatusIndicator
from .tables import EnhancedDataTable
__all__ = [
"MetricsCard",
"StatusIndicator",
"EnhancedProgressBar",
"EnhancedDataTable",
]

View File

@@ -0,0 +1,28 @@
"""Metrics card widget."""
from typing import Any
from textual.app import ComposeResult
from textual.widgets import Static
from typing_extensions import override
class MetricsCard(Static):
"""A modern metrics display card."""
title: str
value: str
description: str
def __init__(self, title: str, value: str, description: str = "", **kwargs: Any) -> None:
super().__init__(**kwargs)
self.title = title
self.value = value
self.description = description
@override
def compose(self) -> ComposeResult:
yield Static(self.value, classes="metrics-value")
yield Static(self.title, classes="metrics-label")
if self.description:
yield Static(self.description, classes="metrics-description")
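
A hedged sketch of composing `MetricsCard` in a grid, mirroring the dashboard's usage; it assumes `MetricsCard` is in scope and inlines the grid CSS that styles.py provides:

```python
from textual.app import App, ComposeResult
from textual.containers import Grid


class CardsDemo(App[None]):
    CSS = ".responsive-grid { grid-size: 4; grid-gutter: 1; }"

    def compose(self) -> ComposeResult:
        yield Grid(
            MetricsCard("Collections", "12", "Active collections"),
            MetricsCard("Documents", "34,567", "Total indexed"),
            MetricsCard("Backends", "2", "Connected services"),
            MetricsCard("Status", "Online", "System health"),
            classes="responsive-grid",
        )


if __name__ == "__main__":
    CardsDemo().run()
```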

View File

@@ -0,0 +1,86 @@
"""Status indicators and progress bars with enhanced visual feedback."""
from typing import Any
from textual.app import ComposeResult
from textual.widgets import ProgressBar, Static
from typing_extensions import override
class StatusIndicator(Static):
"""Modern status indicator with color coding and animations."""
status: str
def __init__(self, status: str, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.status = status
self.update_status(status)
def update_status(self, status: str) -> None:
"""Update the status display with enhanced visual feedback."""
self.status = status
# Remove previous status classes
self.remove_class("status-active", "status-error", "status-warning", "pulse", "glow")
if status.lower() in ["active", "online", "connected", "✓ active"]:
self.add_class("status-active")
self.add_class("glow")
self.update("🟢 " + status)
elif status.lower() in ["error", "failed", "offline", "disconnected"]:
self.add_class("status-error")
self.add_class("pulse")
self.update("🔴 " + status)
elif status.lower() in ["warning", "pending", "in_progress"]:
self.add_class("status-warning")
self.add_class("pulse")
self.update("🟡 " + status)
elif status.lower() in ["loading", "connecting"]:
self.add_class("shimmer")
self.update("🔄 " + status)
else:
self.update("" + status)
class EnhancedProgressBar(Static):
"""Enhanced progress bar with better visual feedback."""
total: int
progress: int
status_text: str
def __init__(self, total: int = 100, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.total = total
self.progress = 0
self.status_text = "Ready"
@override
def compose(self) -> ComposeResult:
yield Static("", id="progress_status", classes="progress-label")
yield ProgressBar(total=self.total, id="progress_bar", show_eta=True, classes="shimmer")
def update_progress(self, progress: int, status: str = "") -> None:
"""Update progress with enhanced feedback."""
self.progress = progress
if status:
self.status_text = status
# Update the progress bar
progress_bar = self.query_one("#progress_bar", ProgressBar)
progress_bar.update(progress=progress)
# Update status text with icons
status_display = self.query_one("#progress_status", Static)
if progress >= 100:
status_display.update(f"{self.status_text}")
progress_bar.add_class("glow")
elif progress >= 75:
status_display.update(f"🔥 {self.status_text}")
elif progress >= 50:
status_display.update(f"{self.status_text}")
elif progress >= 25:
status_display.update(f"🔄 {self.status_text}")
else:
status_display.update(f"🚀 {self.status_text}")

View File

@@ -0,0 +1,126 @@
"""Enhanced DataTable with improved keyboard navigation."""
from typing import Any
from textual import events
from textual.binding import Binding
from textual.message import Message
from textual.widgets import DataTable
class EnhancedDataTable(DataTable[Any]):
"""DataTable with enhanced keyboard navigation and visual feedback."""
BINDINGS = [
Binding("up,k", "cursor_up", "Cursor Up", show=False),
Binding("down,j", "cursor_down", "Cursor Down", show=False),
Binding("left,h", "cursor_left", "Cursor Left", show=False),
Binding("right,l", "cursor_right", "Cursor Right", show=False),
Binding("home", "cursor_home", "First Row", show=False),
Binding("end", "cursor_end", "Last Row", show=False),
Binding("pageup", "page_up", "Page Up", show=False),
Binding("pagedown", "page_down", "Page Down", show=False),
Binding("enter", "select_cursor", "Select", show=False),
Binding("space", "toggle_selection", "Toggle Selection", show=False),
Binding("ctrl+a", "select_all", "Select All", show=False),
Binding("ctrl+shift+a", "clear_selection", "Clear Selection", show=False),
]
def __init__(self, **kwargs: Any) -> None:
super().__init__(**kwargs)
self.cursor_type = "row" # Default to row selection
self.zebra_stripes = True # Enable zebra striping for better visibility
self.show_cursor = True
def on_key(self, event: events.Key) -> None:
"""Handle additional keyboard shortcuts."""
if event.key == "ctrl+1":
# Jump to first column
self.move_cursor(column=0)
event.prevent_default()
elif event.key == "ctrl+9":
# Jump to last column
if self.columns:
self.move_cursor(column=len(self.columns) - 1)
event.prevent_default()
elif event.key == "/":
# Start quick search (to be implemented by parent)
self.post_message(self.QuickSearch(self))
event.prevent_default()
elif event.key == "escape":
# Clear selection or exit search
# Clear selection by calling action
self.action_clear_selection()
event.prevent_default()
# No else clause needed - just handle our events
def action_cursor_home(self) -> None:
"""Move cursor to first row."""
if self.row_count > 0:
self.move_cursor(row=0)
def action_cursor_end(self) -> None:
"""Move cursor to last row."""
if self.row_count > 0:
self.move_cursor(row=self.row_count - 1)
def action_page_up(self) -> None:
"""Move cursor up by visible page size."""
if self.row_count > 0:
page_size = max(1, self.size.height // 2) # Approximate visible rows
new_row = max(0, self.cursor_coordinate.row - page_size)
self.move_cursor(row=new_row)
def action_page_down(self) -> None:
"""Move cursor down by visible page size."""
if self.row_count > 0:
page_size = max(1, self.size.height // 2) # Approximate visible rows
new_row = min(self.row_count - 1, self.cursor_coordinate.row + page_size)
self.move_cursor(row=new_row)
def action_toggle_selection(self) -> None:
"""Toggle selection of current row."""
if self.row_count > 0:
current_row = self.cursor_coordinate.row
# This will be handled by the parent screen
self.post_message(self.RowToggled(self, current_row))
def action_select_all(self) -> None:
"""Select all rows."""
# This will be handled by the parent screen
self.post_message(self.SelectAll(self))
def action_clear_selection(self) -> None:
"""Clear all selections."""
# This will be handled by the parent screen
self.post_message(self.ClearSelection(self))
# Custom messages for enhanced functionality
class QuickSearch(Message):
"""Posted when user wants to start a quick search."""
def __init__(self, table: "EnhancedDataTable") -> None:
super().__init__()
self.table = table
class RowToggled(Message):
"""Posted when a row selection is toggled."""
def __init__(self, table: "EnhancedDataTable", row_index: int) -> None:
super().__init__()
self.table = table
self.row_index = row_index
class SelectAll(Message):
"""Posted when user wants to select all rows."""
def __init__(self, table: "EnhancedDataTable") -> None:
super().__init__()
self.table = table
class ClearSelection(Message):
"""Posted when user wants to clear selection."""
def __init__(self, table: "EnhancedDataTable") -> None:
super().__init__()
self.table = table
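
Textual derives handler names from the message class, so a parent receives `EnhancedDataTable.RowToggled` in `on_enhanced_data_table_row_toggled`, exactly as the document management screen does. A minimal sketch, assuming the widget is importable from this module:

```python
from textual.app import App, ComposeResult


class TableDemo(App[None]):
    def compose(self) -> ComposeResult:
        yield EnhancedDataTable(id="demo")

    def on_mount(self) -> None:
        table = self.query_one(EnhancedDataTable)
        table.add_columns("Name")
        table.add_row("first")
        table.add_row("second")

    def on_enhanced_data_table_row_toggled(self, event: EnhancedDataTable.RowToggled) -> None:
        self.notify(f"Toggled row {event.row_index}")


if __name__ == "__main__":
    TableDemo().run()
```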

View File

@@ -0,0 +1,5 @@
"""Configuration management."""
from .settings import Settings, get_settings
__all__ = ["Settings", "get_settings"]

View File

@@ -0,0 +1,103 @@
"""Application settings and configuration."""
from functools import lru_cache
from typing import Literal
from pydantic import Field, HttpUrl
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
"""Application settings."""
model_config = SettingsConfigDict(
env_file=".env",
env_file_encoding="utf-8",
case_sensitive=False,
extra="ignore", # Ignore extra environment variables
)
# API Keys
firecrawl_api_key: str | None = None
openwebui_api_key: str | None = None
weaviate_api_key: str | None = None
# Endpoints
llm_endpoint: HttpUrl = HttpUrl("http://llm.lab")
weaviate_endpoint: HttpUrl = HttpUrl("http://weaviate.yo")
openwebui_endpoint: HttpUrl = HttpUrl("http://chat.lab") # This will be the API URL
firecrawl_endpoint: HttpUrl = HttpUrl("http://crawl.lab:30002")
# Model Configuration
embedding_model: str = "ollama/bge-m3:latest"
embedding_dimension: int = 1024
# Ingestion Settings
default_batch_size: int = Field(default=50, gt=0, le=500)
max_file_size: int = 1_000_000
max_crawl_depth: int = Field(default=5, ge=1, le=20)
max_crawl_pages: int = Field(default=100, ge=1, le=1000)
# Storage Settings
default_storage_backend: Literal["weaviate", "open_webui"] = "weaviate"
default_collection_prefix: str = "docs"
# Prefect Settings
prefect_api_url: HttpUrl | None = None
prefect_api_key: str | None = None
prefect_work_pool: str = "default"
# Scheduling Defaults
default_schedule_interval: int = Field(default=60, ge=1, le=10080) # Max 1 week
# Performance Settings
max_concurrent_tasks: int = Field(default=5, ge=1, le=20)
request_timeout: int = Field(default=60, ge=10, le=300)
# Logging
log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"] = "INFO"
def get_storage_endpoint(self, backend: str) -> HttpUrl:
"""
Get endpoint for storage backend.
Args:
backend: Storage backend name
Returns:
Endpoint URL
"""
if backend == "weaviate":
return self.weaviate_endpoint
elif backend == "open_webui":
return self.openwebui_endpoint
else:
raise ValueError(f"Unknown backend: {backend}")
def get_api_key(self, service: str) -> str | None:
"""
Get API key for service.
Args:
service: Service name
Returns:
API key or None
"""
service_map = {
"firecrawl": self.firecrawl_api_key,
"openwebui": self.openwebui_api_key,
"weaviate": self.weaviate_api_key,
}
return service_map.get(service)
@lru_cache
def get_settings() -> Settings:
"""
Get cached settings instance.
Returns:
Settings instance
"""
return Settings()
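
A quick usage sketch; values resolve from `.env` via pydantic-settings, and the printed endpoint assumes the defaults above:

```python
from ingest_pipeline.config import get_settings

settings = get_settings()  # cached via lru_cache
print(settings.get_storage_endpoint("weaviate"))  # e.g. http://weaviate.yo/
print(settings.get_api_key("firecrawl"))          # None unless set in .env
```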

View File

@@ -0,0 +1,27 @@
"""Core module for ingestion pipeline."""
from .exceptions import (
IngestionError,
StorageError,
VectorizationError,
)
from .models import (
Document,
IngestionJob,
IngestionResult,
IngestionSource,
IngestionStatus,
StorageBackend,
)
__all__ = [
"Document",
"IngestionJob",
"IngestionResult",
"IngestionSource",
"IngestionStatus",
"StorageBackend",
"IngestionError",
"StorageError",
"VectorizationError",
]

View File

@@ -0,0 +1,31 @@
"""Custom exceptions for the ingestion pipeline."""
class IngestionError(Exception):
"""Base exception for ingestion errors."""
pass
class StorageError(IngestionError):
"""Exception for storage-related errors."""
pass
class VectorizationError(IngestionError):
"""Exception for vectorization errors."""
pass
class ConfigurationError(IngestionError):
"""Exception for configuration errors."""
pass
class SourceNotFoundError(IngestionError):
"""Exception when source cannot be found or accessed."""
pass

View File

@@ -0,0 +1,149 @@
"""Core data models with strict typing."""
from collections.abc import Callable
from datetime import UTC, datetime
from enum import Enum
from typing import TypedDict
from uuid import UUID, uuid4
from pydantic import BaseModel, Field, HttpUrl
class IngestionStatus(str, Enum):
"""Status of an ingestion job."""
PENDING = "pending"
IN_PROGRESS = "in_progress"
COMPLETED = "completed"
PARTIAL = "partial" # Some documents succeeded, some failed
FAILED = "failed"
CANCELLED = "cancelled"
class StorageBackend(str, Enum):
"""Available storage backends."""
WEAVIATE = "weaviate"
OPEN_WEBUI = "open_webui"
class IngestionSource(str, Enum):
"""Types of ingestion sources."""
WEB = "web"
REPOSITORY = "repository"
DOCUMENTATION = "documentation"
class VectorConfig(BaseModel):
"""Configuration for vectorization."""
model: str = Field(default="ollama/bge-m3:latest")
embedding_endpoint: HttpUrl = Field(default=HttpUrl("http://llm.lab"))
dimension: int = Field(default=1024)
batch_size: int = Field(default=100, gt=0, le=1000)
class StorageConfig(BaseModel):
"""Configuration for storage backend."""
backend: StorageBackend
endpoint: HttpUrl
api_key: str | None = Field(default=None)
collection_name: str = Field(default="documents")
batch_size: int = Field(default=100, gt=0, le=1000)
class FirecrawlConfig(BaseModel):
"""Configuration for Firecrawl ingestion (operational parameters only)."""
formats: list[str] = Field(default_factory=lambda: ["markdown", "html"])
max_depth: int = Field(default=5, ge=1, le=20)
limit: int = Field(default=100, ge=1, le=1000)
only_main_content: bool = Field(default=True)
include_subdomains: bool = Field(default=False)
class RepomixConfig(BaseModel):
"""Configuration for Repomix ingestion."""
include_patterns: list[str] = Field(
default_factory=lambda: ["*.py", "*.js", "*.ts", "*.md", "*.yaml", "*.json"]
)
exclude_patterns: list[str] = Field(
default_factory=lambda: ["**/node_modules/**", "**/__pycache__/**", "**/.git/**"]
)
max_file_size: int = Field(default=1_000_000) # 1MB
respect_gitignore: bool = Field(default=True)
class DocumentMetadata(TypedDict):
"""Metadata for a document."""
source_url: str
title: str | None
description: str | None
timestamp: datetime
content_type: str
word_count: int
char_count: int
class Document(BaseModel):
"""Represents a single document."""
id: UUID = Field(default_factory=uuid4)
content: str
metadata: DocumentMetadata
vector: list[float] | None = Field(default=None)
source: IngestionSource
collection: str = Field(default="documents")
class Config:
"""Pydantic configuration."""
json_encoders: dict[type, Callable[[UUID | datetime], str]] = {
            UUID: lambda v: str(v),
datetime: lambda v: v.isoformat() if isinstance(v, datetime) else str(v),
}
class IngestionJob(BaseModel):
"""Represents an ingestion job."""
id: UUID = Field(default_factory=uuid4)
source_type: IngestionSource
source_url: HttpUrl | str
status: IngestionStatus = Field(default=IngestionStatus.PENDING)
created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
updated_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
completed_at: datetime | None = Field(default=None)
error_message: str | None = Field(default=None)
document_count: int = Field(default=0)
storage_backend: StorageBackend
class Config:
"""Pydantic configuration."""
json_encoders: dict[type, Callable[[UUID | datetime], str]] = {
            UUID: lambda v: str(v),
datetime: lambda v: v.isoformat() if isinstance(v, datetime) else str(v),
}
class IngestionResult(BaseModel):
"""Result of an ingestion operation."""
job_id: UUID
status: IngestionStatus
documents_processed: int
documents_failed: int
duration_seconds: float
error_messages: list[str] = Field(default_factory=list)
class Config:
"""Pydantic configuration."""
json_encoders: dict[type, Callable[[UUID], str]] = {
UUID: lambda v: str(v),
}
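
A short sketch of constructing these models directly (field values are illustrative):

```python
from datetime import UTC, datetime

from ingest_pipeline.core.models import (
    Document,
    DocumentMetadata,
    IngestionJob,
    IngestionSource,
    StorageBackend,
)

metadata: DocumentMetadata = {
    "source_url": "https://example.com/docs",
    "title": "Example",
    "description": None,
    "timestamp": datetime.now(UTC),
    "content_type": "text/markdown",
    "word_count": 2,
    "char_count": 11,
}
doc = Document(content="hello world", metadata=metadata, source=IngestionSource.WEB)

job = IngestionJob(
    source_url="https://example.com/docs",
    source_type=IngestionSource.WEB,
    storage_backend=StorageBackend.WEAVIATE,
)
print(job.status)  # IngestionStatus.PENDING
```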

View File

@@ -0,0 +1,9 @@
"""Prefect flows for orchestration."""
from .ingestion import create_ingestion_flow
from .scheduler import create_scheduled_deployment
__all__ = [
"create_ingestion_flow",
"create_scheduled_deployment",
]

View File

@@ -0,0 +1,274 @@
"""Prefect flow for ingestion pipeline."""
from datetime import UTC, datetime
from typing import Literal
from prefect import flow, task
from ..core.exceptions import IngestionError
from ..core.models import (
FirecrawlConfig,
IngestionJob,
IngestionResult,
IngestionSource,
IngestionStatus,
RepomixConfig,
StorageBackend,
StorageConfig,
)
from ..ingestors import FirecrawlIngestor, RepomixIngestor
from ..storage import OpenWebUIStorage, WeaviateStorage
from ..storage.base import BaseStorage
@task(name="validate_source", retries=2, retry_delay_seconds=10, tags=["validation"])
async def validate_source_task(source_url: str, source_type: IngestionSource) -> bool:
"""
Validate that a source is accessible.
Args:
source_url: URL or path to source
source_type: Type of source
Returns:
True if valid
"""
if source_type == IngestionSource.WEB:
ingestor = FirecrawlIngestor()
elif source_type == IngestionSource.REPOSITORY:
ingestor = RepomixIngestor()
else:
raise ValueError(f"Unsupported source type: {source_type}")
result = await ingestor.validate_source(source_url)
return bool(result)
@task(name="initialize_storage", retries=3, retry_delay_seconds=5, tags=["storage"])
async def initialize_storage_task(config: StorageConfig) -> BaseStorage:
"""
Initialize storage backend.
Args:
config: Storage configuration
Returns:
Initialized storage adapter
"""
if config.backend == StorageBackend.WEAVIATE:
storage = WeaviateStorage(config)
elif config.backend == StorageBackend.OPEN_WEBUI:
storage = OpenWebUIStorage(config)
else:
raise ValueError(f"Unsupported backend: {config.backend}")
await storage.initialize()
return storage
@task(name="ingest_documents", retries=2, retry_delay_seconds=30, tags=["ingestion"])
async def ingest_documents_task(
    job: IngestionJob, collection_name: str | None = None, batch_size: int = 50
) -> tuple[int, int]:
"""
Ingest documents from source.
Args:
        job: Ingestion job configuration
        collection_name: Optional collection name; defaults to "docs_<source_type>"
        batch_size: Number of documents per batch
Returns:
Tuple of (processed_count, failed_count)
"""
# Select ingestor
if job.source_type == IngestionSource.WEB:
config = FirecrawlConfig()
ingestor = FirecrawlIngestor(config)
elif job.source_type == IngestionSource.REPOSITORY:
config = RepomixConfig()
ingestor = RepomixIngestor(config)
else:
raise ValueError(f"Unsupported source: {job.source_type}")
processed = 0
failed = 0
batch = []
# Initialize storage
from pydantic import HttpUrl
# Use provided collection name or generate default
if collection_name is None:
collection_name = f"docs_{job.source_type.value}"
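    # NOTE: endpoints below are hardcoded to the .env defaults rather than
    # read from get_settings(); adjust StorageConfig if your deployment differs.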
storage_config = StorageConfig(
backend=job.storage_backend,
endpoint=HttpUrl("http://weaviate.yo")
if job.storage_backend == StorageBackend.WEAVIATE
else HttpUrl("http://chat.lab"),
collection_name=collection_name,
)
if job.storage_backend == StorageBackend.WEAVIATE:
storage = WeaviateStorage(storage_config)
else:
storage = OpenWebUIStorage(storage_config)
await storage.initialize()
# Process documents
async for document in ingestor.ingest(job):
batch.append(document)
if len(batch) >= batch_size:
try:
stored_ids = await storage.store_batch(batch)
print(f"Successfully stored {len(stored_ids)} documents in batch")
processed += len(stored_ids)
failed += len(batch) - len(stored_ids)
except Exception as e:
print(f"Batch storage failed: {e}")
failed += len(batch)
batch = []
# Process remaining batch
if batch:
try:
stored_ids = await storage.store_batch(batch)
print(f"Successfully stored {len(stored_ids)} documents in final batch")
processed += len(stored_ids)
failed += len(batch) - len(stored_ids)
except Exception as e:
print(f"Final batch storage failed: {e}")
failed += len(batch)
return processed, failed
@task(name="update_job_status", tags=["tracking"])
async def update_job_status_task(
job: IngestionJob,
status: IngestionStatus,
processed: int = 0,
failed: int = 0,
error: str | None = None,
) -> IngestionJob:
"""
Update job status.
Args:
job: Ingestion job
status: New status
processed: Documents processed
failed: Documents failed
error: Error message if any
Returns:
Updated job
"""
job.status = status
job.updated_at = datetime.now(UTC)
job.document_count = processed
if status == IngestionStatus.COMPLETED:
job.completed_at = datetime.now(UTC)
if error:
job.error_message = error
return job
@flow(
name="ingestion_pipeline",
description="Main ingestion pipeline for documents",
retries=1,
retry_delay_seconds=60,
persist_result=True,
log_prints=True,
)
async def create_ingestion_flow(
source_url: str,
source_type: Literal["web", "repository", "documentation"],
storage_backend: Literal["weaviate", "open_webui"] = "weaviate",
collection_name: str | None = None,
validate_first: bool = True,
) -> IngestionResult:
"""
Main ingestion flow.
Args:
source_url: URL or path to source
source_type: Type of source
        storage_backend: Storage backend to use
        collection_name: Optional collection name passed through to ingestion
        validate_first: Whether to validate source first
Returns:
Ingestion result
"""
print(f"Starting ingestion from {source_url}")
# Create job
job = IngestionJob(
source_url=source_url,
source_type=IngestionSource(source_type),
storage_backend=StorageBackend(storage_backend),
status=IngestionStatus.PENDING,
)
start_time = datetime.now(UTC)
error_messages = []
processed = 0
failed = 0
try:
# Validate source if requested
if validate_first:
print("Validating source...")
is_valid = await validate_source_task(source_url, job.source_type)
if not is_valid:
raise IngestionError(f"Source validation failed: {source_url}")
# Update status to in progress
job = await update_job_status_task(job, IngestionStatus.IN_PROGRESS)
# Run ingestion
print("Ingesting documents...")
processed, failed = await ingest_documents_task(job, collection_name)
# Update final status
if failed > 0:
error_messages.append(f"{failed} documents failed to process")
# Set status based on results
if processed == 0 and failed > 0:
final_status = IngestionStatus.FAILED
elif failed > 0:
final_status = IngestionStatus.PARTIAL
else:
final_status = IngestionStatus.COMPLETED
job = await update_job_status_task(job, final_status, processed=processed, failed=failed)
print(f"Ingestion completed: {processed} processed, {failed} failed")
except Exception as e:
print(f"Ingestion failed: {e}")
error_messages.append(str(e))
# Don't reset counts - keep whatever was processed before the error
job = await update_job_status_task(job, IngestionStatus.FAILED,
processed=processed,
failed=failed,
error=str(e))
# Calculate duration
duration = (datetime.now(UTC) - start_time).total_seconds()
return IngestionResult(
job_id=job.id,
status=job.status,
documents_processed=processed,
documents_failed=failed,
duration_seconds=duration,
error_messages=error_messages,
)
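
A minimal sketch of invoking the flow directly (assumes a reachable source and the default endpoints):

```python
import asyncio

from ingest_pipeline.flows.ingestion import create_ingestion_flow


async def main() -> None:
    result = await create_ingestion_flow(
        source_url="https://example.com/docs",
        source_type="web",
        storage_backend="weaviate",
        collection_name="docs_demo",
    )
    print(result.status, result.documents_processed, result.documents_failed)


if __name__ == "__main__":
    asyncio.run(main())
```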

View File

@@ -0,0 +1,89 @@
"""Scheduler for Prefect deployments."""
from datetime import timedelta
from typing import Literal, Protocol, cast
from prefect import serve
from prefect.deployments.runner import RunnerDeployment
from prefect.schedules import Cron, Interval
from .ingestion import create_ingestion_flow
class FlowWithDeployment(Protocol):
"""Protocol for flows that have deployment methods."""
def to_deployment(
self,
name: str,
**kwargs: object,
) -> RunnerDeployment:
"""Create a deployment from this flow."""
...
def create_scheduled_deployment(
name: str,
source_url: str,
source_type: Literal["web", "repository", "documentation"],
storage_backend: Literal["weaviate", "open_webui"] = "weaviate",
schedule_type: Literal["cron", "interval"] = "interval",
cron_expression: str | None = None,
interval_minutes: int = 60,
tags: list[str] | None = None,
) -> RunnerDeployment:
"""
Create a scheduled deployment for ingestion.
Args:
name: Deployment name
source_url: Source to ingest from
source_type: Type of source
storage_backend: Storage backend
schedule_type: Type of schedule
cron_expression: Cron expression if using cron
interval_minutes: Interval in minutes if using interval
tags: Optional tags for deployment
Returns:
Deployment configuration
"""
# Create schedule
if schedule_type == "cron" and cron_expression:
schedule = Cron(cron_expression, timezone="UTC")
else:
schedule = Interval(timedelta(minutes=interval_minutes), timezone="UTC")
# Default tags
if tags is None:
tags = [source_type, storage_backend]
# Create deployment
# The flow decorator adds the to_deployment method at runtime
to_deployment = create_ingestion_flow.to_deployment
deployment = to_deployment(
name=name,
schedule=schedule,
parameters={
"source_url": source_url,
"source_type": source_type,
"storage_backend": storage_backend,
"validate_first": True,
},
tags=tags,
description=f"Scheduled ingestion from {source_url}",
)
    return cast("RunnerDeployment", deployment)
def serve_deployments(deployments: list[RunnerDeployment]) -> None:
"""
Serve multiple deployments.
Args:
deployments: List of deployment configurations
"""
serve(*deployments, limit=10)
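
For example, a nightly web ingestion might be registered and served like this (sketch; the names and cron string are placeholders):

```python
from ingest_pipeline.flows.scheduler import (
    create_scheduled_deployment,
    serve_deployments,
)

nightly = create_scheduled_deployment(
    name="docs-nightly",
    source_url="https://example.com/docs",
    source_type="web",
    storage_backend="weaviate",
    schedule_type="cron",
    cron_expression="0 2 * * *",
)
serve_deployments([nightly])  # blocks, serving up to 10 concurrent runs
```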

View File

@@ -0,0 +1,11 @@
"""Ingestors module for different data sources."""
from .base import BaseIngestor
from .firecrawl import FirecrawlIngestor
from .repomix import RepomixIngestor
__all__ = [
"BaseIngestor",
"FirecrawlIngestor",
"RepomixIngestor",
]

View File

@@ -0,0 +1,50 @@
"""Base ingestor interface."""
from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator
from ..core.models import Document, IngestionJob
class BaseIngestor(ABC):
"""Abstract base class for all ingestors."""
@abstractmethod
async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
"""
Ingest data from a source.
Args:
job: The ingestion job configuration
Yields:
Documents from the source
"""
return # type: ignore # pragma: no cover
yield # pragma: no cover
@abstractmethod
async def validate_source(self, source_url: str) -> bool:
"""
Validate if the source is accessible.
Args:
source_url: URL or path to the source
Returns:
True if source is valid and accessible
"""
pass # pragma: no cover
@abstractmethod
async def estimate_size(self, source_url: str) -> int:
"""
Estimate the number of documents in the source.
Args:
source_url: URL or path to the source
Returns:
Estimated number of documents
"""
pass # pragma: no cover
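
Concretely, a new source type only needs these three methods; a toy ingestor that yields a single fixed document might look like this (illustrative only):

```python
from collections.abc import AsyncGenerator
from datetime import UTC, datetime
from uuid import uuid4

from ingest_pipeline.core.models import Document, IngestionJob, IngestionSource
from ingest_pipeline.ingestors.base import BaseIngestor


class StaticIngestor(BaseIngestor):
    """Toy ingestor that yields one hardcoded document."""

    async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
        content = "hello from a static source"
        yield Document(
            id=uuid4(),
            content=content,
            metadata={
                "source_url": str(job.source_url),
                "title": "static",
                "description": None,
                "timestamp": datetime.now(UTC),
                "content_type": "text/plain",
                "word_count": len(content.split()),
                "char_count": len(content),
            },
            source=IngestionSource.WEB,
        )

    async def validate_source(self, source_url: str) -> bool:
        return True

    async def estimate_size(self, source_url: str) -> int:
        return 1
```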

View File

@@ -0,0 +1,229 @@
"""Firecrawl ingestor for web and documentation sites."""
import asyncio
from collections.abc import AsyncGenerator
from datetime import UTC, datetime
from uuid import uuid4
from firecrawl import AsyncFirecrawl
from typing_extensions import override
from ..config import get_settings
from ..core.models import (
Document,
DocumentMetadata,
FirecrawlConfig,
IngestionJob,
IngestionSource,
)
from .base import BaseIngestor
class FirecrawlIngestor(BaseIngestor):
"""Ingestor for web and documentation sites using Firecrawl."""
config: FirecrawlConfig
    client: AsyncFirecrawl  # SDK client instance
def __init__(self, config: FirecrawlConfig | None = None):
"""
Initialize Firecrawl ingestor.
Args:
config: Firecrawl configuration (for operational params only)
"""
self.config = config or FirecrawlConfig()
settings = get_settings()
# All connection details come from settings/.env
# For self-hosted instances, use a dummy API key if none is provided
# The SDK requires an API key even for self-hosted instances
api_key = settings.firecrawl_api_key or "no-key-required"
# AsyncFirecrawl automatically uses v2 endpoints
self.client = AsyncFirecrawl(api_key=api_key, api_url=str(settings.firecrawl_endpoint))
@override
async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
"""
Ingest documents from a web source.
Args:
job: The ingestion job configuration
Yields:
Documents from the web source
"""
url = str(job.source_url)
# First, map the site to understand its structure
site_map = await self._map_site(url)
# If map returns empty, just use the main URL
if not site_map:
site_map = [url]
# Process pages in batches
batch_size = 10
for i in range(0, len(site_map), batch_size):
batch_urls = site_map[i : i + batch_size]
documents = await self._scrape_batch(batch_urls)
for doc_data in documents:
yield self._create_document(doc_data, job)
@override
async def validate_source(self, source_url: str) -> bool:
"""
Validate if the web source is accessible.
Args:
source_url: URL to validate
Returns:
True if source is accessible
"""
try:
# Use SDK v2 endpoints for both self-hosted and cloud
result = await self.client.scrape(source_url, formats=["markdown"])
return result is not None and hasattr(result, "markdown")
except Exception:
return False
@override
async def estimate_size(self, source_url: str) -> int:
"""
Estimate the number of pages in the website.
Args:
source_url: URL of the website
Returns:
Estimated number of pages
"""
try:
site_map = await self._map_site(source_url)
return len(site_map) if site_map else 0
except Exception:
return 0
async def _map_site(self, url: str) -> list[str]:
"""
Map a website to get all URLs.
Args:
url: Base URL to map
Returns:
List of URLs found
"""
try:
# Use SDK v2 map endpoint
result = await self.client.map(url=url, limit=self.config.limit)
if result and hasattr(result, "links"):
# Extract URLs from the result
return [
link if isinstance(link, str) else getattr(link, "url", str(link))
for link in result.links
]
return []
except Exception as e:
# If map fails (might not be available in all versions), fall back to single URL
import logging
logging.warning(f"Map endpoint not available or failed: {e}. Using single URL.")
return [url]
async def _scrape_batch(self, urls: list[str]) -> list[dict[str, str]]:
"""
Scrape a batch of URLs.
Args:
urls: List of URLs to scrape
Returns:
List of scraped documents
"""
tasks = []
for url in urls:
task = self._scrape_single(url)
tasks.append(task)
results = await asyncio.gather(*tasks, return_exceptions=True)
documents = []
for result in results:
if isinstance(result, Exception):
continue
if result and isinstance(result, dict) and "markdown" in result:
documents.append(result)
return documents
async def _scrape_single(self, url: str) -> dict[str, str]:
"""
Scrape a single URL.
Args:
url: URL to scrape
Returns:
Scraped document data
"""
try:
# Use SDK v2 scrape endpoint
result = await self.client.scrape(url, formats=self.config.formats)
# Extract data from the result
if result:
# The SDK returns a ScrapeResult object with markdown and metadata
metadata = getattr(result, "metadata", {})
return {
"markdown": getattr(result, "markdown", ""),
"sourceURL": url,
"title": metadata.get("title", "")
if isinstance(metadata, dict)
else getattr(metadata, "title", ""),
"description": metadata.get("description", "")
if isinstance(metadata, dict)
else getattr(metadata, "description", ""),
}
return {}
except Exception as e:
import logging
logging.debug(f"Failed to scrape {url}: {e}")
return {}
def _create_document(self, doc_data: dict[str, str], job: IngestionJob) -> Document:
"""
Create a Document from scraped data.
Args:
doc_data: Scraped document data
job: The ingestion job
Returns:
Document instance
"""
content = doc_data.get("markdown", "")
metadata: DocumentMetadata = {
"source_url": doc_data.get("sourceURL", str(job.source_url)),
"title": doc_data.get("title"),
"description": doc_data.get("description"),
"timestamp": datetime.now(UTC),
"content_type": "text/markdown",
"word_count": len(content.split()),
"char_count": len(content),
}
return Document(
id=uuid4(),
content=content,
metadata=metadata,
source=IngestionSource.WEB,
collection=job.storage_backend.value,
)
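
A usage sketch, assuming a Firecrawl instance is reachable at the configured endpoint and the URL is a placeholder:

```python
import asyncio

from ingest_pipeline.core.models import IngestionJob, IngestionSource, StorageBackend
from ingest_pipeline.ingestors import FirecrawlIngestor


async def main() -> None:
    ingestor = FirecrawlIngestor()
    url = "https://example.com/docs"
    if not await ingestor.validate_source(url):
        raise SystemExit("source unreachable")
    print("estimated pages:", await ingestor.estimate_size(url))
    job = IngestionJob(
        source_url=url,
        source_type=IngestionSource.WEB,
        storage_backend=StorageBackend.WEAVIATE,
    )
    async for doc in ingestor.ingest(job):
        print(doc.metadata["title"], doc.metadata["word_count"])


if __name__ == "__main__":
    asyncio.run(main())
```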

View File

@@ -0,0 +1,339 @@
"""Repomix ingestor for Git repositories."""
import asyncio
import subprocess
import tempfile
from collections.abc import AsyncGenerator
from datetime import UTC, datetime
from pathlib import Path
from uuid import uuid4
from typing_extensions import override
from ..core.exceptions import IngestionError, SourceNotFoundError
from ..core.models import (
Document,
DocumentMetadata,
IngestionJob,
IngestionSource,
RepomixConfig,
)
from .base import BaseIngestor
class RepomixIngestor(BaseIngestor):
"""Ingestor for Git repositories using Repomix."""
config: RepomixConfig
def __init__(self, config: RepomixConfig | None = None):
"""
Initialize Repomix ingestor.
Args:
config: Repomix configuration
"""
self.config = config or RepomixConfig()
@override
async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
"""
Ingest documents from a Git repository.
Args:
job: The ingestion job configuration
Yields:
Documents from the repository
"""
repo_url = str(job.source_url)
with tempfile.TemporaryDirectory() as temp_dir:
# Clone the repository
repo_path = await self._clone_repository(repo_url, temp_dir)
# Run repomix to generate output
output_file = await self._run_repomix(repo_path)
# Parse and yield documents
documents = await self._parse_repomix_output(output_file, job)
for doc in documents:
yield doc
@override
async def validate_source(self, source_url: str) -> bool:
"""
Validate if the Git repository is accessible.
Args:
source_url: Git repository URL
Returns:
True if repository is accessible
"""
try:
# Test if we can list remote refs
result = await self._run_command(
["git", "ls-remote", "--heads", source_url], timeout=10
)
return result.returncode == 0
except Exception:
return False
@override
async def estimate_size(self, source_url: str) -> int:
"""
Estimate the number of files in the repository.
Args:
source_url: Git repository URL
Returns:
Estimated number of files
"""
try:
with tempfile.TemporaryDirectory() as temp_dir:
# Shallow clone to get file count
repo_path = await self._clone_repository(source_url, temp_dir, shallow=True)
# Count files matching patterns
file_count = 0
for pattern in self.config.include_patterns:
files = list(Path(repo_path).rglob(pattern))
file_count += len(files)
return file_count
except Exception:
return 0
async def _clone_repository(
self, repo_url: str, target_dir: str, shallow: bool = False
) -> Path:
"""
Clone a Git repository.
Args:
repo_url: Repository URL
target_dir: Directory to clone into
shallow: Whether to do a shallow clone
Returns:
Path to cloned repository
"""
repo_name = repo_url.split("/")[-1].replace(".git", "")
repo_path = Path(target_dir) / repo_name
cmd = ["git", "clone"]
if shallow:
cmd.extend(["--depth", "1"])
cmd.extend([repo_url, str(repo_path)])
result = await self._run_command(cmd, timeout=300)
if result.returncode != 0:
raise SourceNotFoundError(f"Failed to clone repository: {repo_url}")
return repo_path
async def _run_repomix(self, repo_path: Path) -> Path:
"""
Run repomix on a repository.
Args:
repo_path: Path to the repository
Returns:
Path to repomix output file
"""
output_file = repo_path / "repomix-output.md"
# Build repomix command
cmd = ["npx", "repomix", "--output", str(output_file)]
# Add include patterns
if self.config.include_patterns:
for pattern in self.config.include_patterns:
cmd.extend(["--include", pattern])
# Add exclude patterns
if self.config.exclude_patterns:
for pattern in self.config.exclude_patterns:
cmd.extend(["--exclude", pattern])
if self.config.respect_gitignore:
cmd.append("--respect-gitignore")
result = await self._run_command(cmd, cwd=str(repo_path), timeout=120)
if result.returncode != 0:
stderr_text = (
result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
)
raise IngestionError(f"Repomix failed: {stderr_text}")
return output_file
async def _parse_repomix_output(self, output_file: Path, job: IngestionJob) -> list[Document]:
"""
Parse repomix output into documents.
Args:
output_file: Path to repomix output
job: The ingestion job
Returns:
List of documents
"""
documents = []
try:
content = output_file.read_text()
# Split by file markers (repomix uses specific delimiters)
file_sections = self._split_by_files(content)
for file_path, file_content in file_sections.items():
if len(file_content) > self.config.max_file_size:
# Split large files into chunks
chunks = self._chunk_content(file_content)
for i, chunk in enumerate(chunks):
doc = self._create_document(file_path, chunk, job, chunk_index=i)
documents.append(doc)
else:
doc = self._create_document(file_path, file_content, job)
documents.append(doc)
except Exception as e:
raise IngestionError(f"Failed to parse repomix output: {e}") from e
return documents
def _split_by_files(self, content: str) -> dict[str, str]:
"""
Split repomix output by files.
Args:
content: Repomix output content
Returns:
Dictionary of file paths to content
"""
files: dict[str, str] = {}
current_file: str | None = None
current_content: list[str] = []
for line in content.split("\n"):
# Look for file markers (adjust based on actual repomix format)
if line.startswith("## File:") or line.startswith("### "):
if current_file:
files[current_file] = "\n".join(current_content)
current_file = line.replace("## File:", "").replace("### ", "").strip()
current_content = []
else:
current_content.append(line)
# Add last file
if current_file:
files[current_file] = "\n".join(current_content)
# If no file markers found, treat as single document
if not files:
files["repository"] = content
return files
def _chunk_content(self, content: str, chunk_size: int = 500000) -> list[str]:
"""
Split content into chunks.
Args:
content: Content to chunk
chunk_size: Maximum size per chunk
Returns:
List of content chunks
"""
chunks: list[str] = []
lines = content.split("\n")
current_chunk: list[str] = []
current_size = 0
for line in lines:
line_size = len(line) + 1 # +1 for newline
if current_size + line_size > chunk_size and current_chunk:
chunks.append("\n".join(current_chunk))
current_chunk = []
current_size = 0
current_chunk.append(line)
current_size += line_size
if current_chunk:
chunks.append("\n".join(current_chunk))
return chunks
def _create_document(
self, file_path: str, content: str, job: IngestionJob, chunk_index: int = 0
) -> Document:
"""
Create a Document from repository content.
Args:
file_path: Path to the file in repository
content: File content
job: The ingestion job
chunk_index: Index if content is chunked
Returns:
Document instance
"""
metadata: DocumentMetadata = {
"source_url": str(job.source_url),
"title": f"{file_path}" + (f" (chunk {chunk_index})" if chunk_index > 0 else ""),
"description": f"Repository file: {file_path}",
"timestamp": datetime.now(UTC),
"content_type": "text/plain",
"word_count": len(content.split()),
"char_count": len(content),
}
return Document(
id=uuid4(),
content=content,
metadata=metadata,
source=IngestionSource.REPOSITORY,
collection=job.storage_backend.value,
)
async def _run_command(
self, cmd: list[str], cwd: str | None = None, timeout: int = 60
) -> subprocess.CompletedProcess[bytes]:
"""
Run a shell command asynchronously.
Args:
cmd: Command and arguments
cwd: Working directory
timeout: Command timeout in seconds
Returns:
Completed process result
"""
proc = await asyncio.create_subprocess_exec(
*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, cwd=cwd
)
try:
stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
return subprocess.CompletedProcess(
cmd,
proc.returncode or 0,
stdout,
stderr,
)
except TimeoutError as e:
proc.kill()
raise IngestionError(f"Command timed out: {' '.join(cmd)}") from e
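
The repository path works the same way; this sketch assumes `git` and `npx repomix` are on PATH and uses a placeholder repository URL:

```python
import asyncio

from ingest_pipeline.core.models import (
    IngestionJob,
    IngestionSource,
    RepomixConfig,
    StorageBackend,
)
from ingest_pipeline.ingestors import RepomixIngestor


async def main() -> None:
    ingestor = RepomixIngestor(RepomixConfig(include_patterns=["*.md"]))
    job = IngestionJob(
        source_url="https://github.com/example/repo.git",
        source_type=IngestionSource.REPOSITORY,
        storage_backend=StorageBackend.WEAVIATE,
    )
    async for doc in ingestor.ingest(job):
        print(doc.metadata["title"])


if __name__ == "__main__":
    asyncio.run(main())
```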

View File

@@ -0,0 +1,11 @@
"""Storage adapters for different backends."""
from .base import BaseStorage
from .openwebui import OpenWebUIStorage
from .weaviate import WeaviateStorage
__all__ = [
"BaseStorage",
"WeaviateStorage",
"OpenWebUIStorage",
]

View File

@@ -0,0 +1,106 @@
"""Base storage interface."""
from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator
from ..core.models import Document, StorageConfig
class BaseStorage(ABC):
"""Abstract base class for storage adapters."""
config: StorageConfig
def __init__(self, config: StorageConfig):
"""
Initialize storage adapter.
Args:
config: Storage configuration
"""
self.config = config
@abstractmethod
async def initialize(self) -> None:
"""Initialize the storage backend and create collections if needed."""
pass # pragma: no cover
@abstractmethod
async def store(self, document: Document) -> str:
"""
Store a single document.
Args:
document: Document to store
Returns:
Document ID
"""
pass # pragma: no cover
@abstractmethod
async def store_batch(self, documents: list[Document]) -> list[str]:
"""
Store multiple documents in batch.
Args:
documents: List of documents to store
Returns:
List of document IDs
"""
pass # pragma: no cover
@abstractmethod
async def retrieve(self, document_id: str) -> Document | None:
"""
Retrieve a document by ID.
Args:
document_id: Document ID
Returns:
Document or None if not found
"""
pass # pragma: no cover
@abstractmethod
async def search(
self, query: str, limit: int = 10, threshold: float = 0.7
) -> AsyncGenerator[Document, None]:
"""
Search for documents.
Args:
query: Search query
limit: Maximum number of results
threshold: Similarity threshold
Yields:
Matching documents
"""
return # type: ignore # pragma: no cover
yield # pragma: no cover
@abstractmethod
async def delete(self, document_id: str) -> bool:
"""
Delete a document.
Args:
document_id: Document ID
Returns:
True if deleted successfully
"""
pass # pragma: no cover
@abstractmethod
async def count(self) -> int:
"""
Get total document count.
Returns:
Number of documents
"""
pass # pragma: no cover

View File

@@ -0,0 +1,296 @@
"""Open WebUI storage adapter."""
from collections.abc import AsyncGenerator
from uuid import UUID
import httpx
from typing_extensions import override
from ..core.exceptions import StorageError
from ..core.models import Document, StorageConfig
from ..utils.vectorizer import Vectorizer
from .base import BaseStorage
class OpenWebUIStorage(BaseStorage):
"""Storage adapter for Open WebUI knowledge endpoints."""
client: httpx.AsyncClient
vectorizer: Vectorizer
def __init__(self, config: StorageConfig):
"""
Initialize Open WebUI storage.
Args:
config: Storage configuration
"""
super().__init__(config)
        headers = {"Content-Type": "application/json"}
        if config.api_key:
            headers["Authorization"] = f"Bearer {config.api_key}"
        self.client = httpx.AsyncClient(
            base_url=str(config.endpoint),
            headers=headers,
            timeout=30.0,
        )
self.vectorizer = Vectorizer(config)
@override
async def initialize(self) -> None:
"""Initialize Open WebUI connection."""
try:
# Test connection with OpenWebUI knowledge API
response = await self.client.get("/api/v1/knowledge/")
response.raise_for_status()
# Check if collection (knowledge base) exists, create if not
knowledge_bases = response.json()
collection_exists = any(
kb.get("name") == self.config.collection_name for kb in knowledge_bases
)
if not collection_exists:
await self._create_collection()
except Exception as e:
raise StorageError(f"Failed to initialize Open WebUI: {e}") from e
async def _create_collection(self) -> None:
"""Create knowledge base in Open WebUI."""
try:
response = await self.client.post(
"/api/v1/knowledge/create",
json={
"name": self.config.collection_name,
"description": "Documents ingested from various sources"
},
)
response.raise_for_status()
except Exception as e:
raise StorageError(f"Failed to create knowledge base: {e}") from e
@override
async def store(self, document: Document) -> str:
"""
Store a document in Open WebUI.
Args:
document: Document to store
Returns:
Document ID
"""
try:
# Vectorize if needed
if document.vector is None:
document.vector = await self.vectorizer.vectorize(document.content)
# Prepare document data
doc_data = {
"id": str(document.id),
"collection": self.config.collection_name,
"content": document.content,
"metadata": {
**document.metadata,
"timestamp": document.metadata["timestamp"].isoformat(),
"source": document.source.value,
},
"embedding": document.vector,
}
# Store document
response = await self.client.post(
f"/api/knowledge/collections/{self.config.collection_name}/documents", json=doc_data
)
response.raise_for_status()
result = response.json()
document_id = result.get("id") if isinstance(result, dict) else None
return str(document_id) if document_id else str(document.id)
except Exception as e:
raise StorageError(f"Failed to store document: {e}") from e
@override
async def store_batch(self, documents: list[Document]) -> list[str]:
"""
Store multiple documents in batch.
Args:
documents: List of documents
Returns:
List of document IDs
"""
try:
# Vectorize documents without vectors
for doc in documents:
if doc.vector is None:
doc.vector = await self.vectorizer.vectorize(doc.content)
# Prepare batch data
batch_data = []
for doc in documents:
batch_data.append(
{
"id": str(doc.id),
"content": doc.content,
"metadata": {
**doc.metadata,
"timestamp": doc.metadata["timestamp"].isoformat(),
"source": doc.source.value,
},
"embedding": doc.vector,
}
)
# Store batch
response = await self.client.post(
f"/api/knowledge/collections/{self.config.collection_name}/documents/batch",
json={"documents": batch_data},
)
response.raise_for_status()
result = response.json()
ids = result.get("ids") if isinstance(result, dict) else None
return ids if isinstance(ids, list) else [str(doc.id) for doc in documents]
except Exception as e:
raise StorageError(f"Failed to store batch: {e}") from e
@override
async def retrieve(self, document_id: str) -> Document | None:
"""
Retrieve a document from Open WebUI.
Args:
document_id: Document ID
Returns:
Document or None
"""
try:
response = await self.client.get(
f"/api/knowledge/collections/{self.config.collection_name}/documents/{document_id}"
)
if response.status_code == 404:
return None
response.raise_for_status()
data = response.json()
# Reconstruct document
metadata = data.get("metadata", {})
return Document(
id=UUID(document_id),
content=data["content"],
metadata=metadata,
vector=data.get("embedding"),
source=metadata.get("source", "unknown"),
collection=self.config.collection_name,
)
except Exception:
return None
@override
async def search(
self, query: str, limit: int = 10, threshold: float = 0.7
) -> AsyncGenerator[Document, None]:
"""
Search for documents in Open WebUI.
Args:
query: Search query
limit: Maximum results
threshold: Similarity threshold
Yields:
Matching documents
"""
try:
# Vectorize query
query_vector = await self.vectorizer.vectorize(query)
# Perform search
response = await self.client.post(
f"/api/knowledge/collections/{self.config.collection_name}/search",
json={
"query": query,
"embedding": query_vector,
"limit": limit,
"threshold": threshold,
},
)
response.raise_for_status()
results = response.json()
for result in results.get("documents", []):
metadata = result.get("metadata", {})
doc = Document(
id=result["id"],
content=result["content"],
metadata=metadata,
vector=result.get("embedding"),
source=metadata.get("source", "unknown"),
collection=self.config.collection_name,
)
yield doc
except Exception as e:
raise StorageError(f"Search failed: {e}") from e
    @override
    async def delete(self, document_id: str) -> bool:
"""
Delete a document from Open WebUI.
Args:
document_id: Document ID
Returns:
True if deleted
"""
try:
response = await self.client.delete(
f"/api/knowledge/collections/{self.config.collection_name}/documents/{document_id}"
)
return response.status_code in [200, 204]
except Exception:
return False
    @override
    async def count(self) -> int:
"""
Get document count in collection.
Returns:
Number of documents
"""
try:
response = await self.client.get(
f"/api/knowledge/collections/{self.config.collection_name}/stats"
)
response.raise_for_status()
stats = response.json()
count = stats.get("document_count") if isinstance(stats, dict) else None
return int(count) if isinstance(count, (int, str)) else 0
except Exception:
return 0
async def __aenter__(self) -> "OpenWebUIStorage":
"""Async context manager entry."""
await self.initialize()
return self
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: object | None,
) -> None:
"""Async context manager exit."""
await self.client.aclose()
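
Because the adapter implements the async context manager protocol, a search sketch looks like this (endpoint and key are placeholders):

```python
import asyncio

from pydantic import HttpUrl

from ingest_pipeline.core.models import StorageBackend, StorageConfig
from ingest_pipeline.storage import OpenWebUIStorage


async def main() -> None:
    config = StorageConfig(
        backend=StorageBackend.OPEN_WEBUI,
        endpoint=HttpUrl("http://chat.lab"),
        api_key="sk-...",  # placeholder
        collection_name="docs",
    )
    async with OpenWebUIStorage(config) as storage:
        print("documents stored:", await storage.count())
        async for doc in storage.search("ingestion pipeline", limit=3):
            print(doc.metadata.get("title"))


if __name__ == "__main__":
    asyncio.run(main())
```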

View File

@@ -0,0 +1,703 @@
"""Weaviate storage adapter."""
from collections.abc import AsyncGenerator
from datetime import UTC, datetime
from typing import cast
from urllib.parse import urlparse
from uuid import UUID
import weaviate
from typing_extensions import override
from weaviate.classes.config import Configure, DataType, Property
from ..core.exceptions import StorageError
from ..core.models import Document, DocumentMetadata, IngestionSource, StorageConfig
from ..utils.vectorizer import Vectorizer
from .base import BaseStorage
class WeaviateStorage(BaseStorage):
"""Storage adapter for Weaviate."""
client: weaviate.WeaviateClient | None
vectorizer: Vectorizer
collection_name: str
def __init__(self, config: StorageConfig):
"""
Initialize Weaviate storage.
Args:
config: Storage configuration
"""
super().__init__(config)
self.client = None
self.vectorizer = Vectorizer(config)
self.collection_name = config.collection_name.capitalize()
@override
async def initialize(self) -> None:
"""Initialize Weaviate client and create collection if needed."""
try:
            # Connect to Weaviate
            # Parse endpoint - Weaviate expects host and port separately;
            # urlparse copes with trailing slashes, paths, and explicit ports
            parsed = urlparse(str(self.config.endpoint))
            host = parsed.hostname or "localhost"
            # For reverse proxy setups with no explicit port, default to 80
            http_port = parsed.port or 80
# For reverse proxy setups, use HTTP-only connection
self.client = weaviate.WeaviateClient(
connection_params=weaviate.connect.ConnectionParams.from_url(
url=f"http://{host}:{http_port}",
grpc_port=50051, # Default gRPC port but will be ignored
),
skip_init_checks=True, # Skip gRPC health checks
additional_config=weaviate.classes.init.AdditionalConfig(
timeout=weaviate.classes.init.Timeout(init=30, query=60, insert=120),
)
)
# Connect to the client
self.client.connect()
# Check if collection exists
collections = self.client.collections.list_all()
if self.collection_name not in collections:
await self._create_collection()
except Exception as e:
raise StorageError(f"Failed to initialize Weaviate: {e}") from e
async def _create_collection(self) -> None:
"""Create Weaviate collection with schema."""
if not self.client:
raise StorageError("Weaviate client not initialized")
try:
self.client.collections.create(
name=self.collection_name,
properties=[
Property(
name="content", data_type=DataType.TEXT, description="Document content"
),
Property(name="source_url", data_type=DataType.TEXT, description="Source URL"),
Property(name="title", data_type=DataType.TEXT, description="Document title"),
Property(
name="description",
data_type=DataType.TEXT,
description="Document description",
),
Property(
name="timestamp", data_type=DataType.DATE, description="Ingestion timestamp"
),
Property(
name="content_type", data_type=DataType.TEXT, description="Content type"
),
Property(name="word_count", data_type=DataType.INT, description="Word count"),
Property(
name="char_count", data_type=DataType.INT, description="Character count"
),
Property(
name="source", data_type=DataType.TEXT, description="Ingestion source"
),
],
vectorizer_config=Configure.Vectorizer.none(),
)
except Exception as e:
raise StorageError(f"Failed to create collection: {e}") from e
@override
async def store(self, document: Document) -> str:
"""
Store a document in Weaviate.
Args:
document: Document to store
Returns:
Document ID
"""
try:
# Vectorize content if no vector provided
if document.vector is None:
document.vector = await self.vectorizer.vectorize(document.content)
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
# Prepare properties
properties = {
"content": document.content,
"source_url": document.metadata["source_url"],
"title": document.metadata.get("title", ""),
"description": document.metadata.get("description", ""),
"timestamp": document.metadata["timestamp"].isoformat(),
"content_type": document.metadata["content_type"],
"word_count": document.metadata["word_count"],
"char_count": document.metadata["char_count"],
"source": document.source.value,
}
# Insert with vector
result = collection.data.insert(
properties=properties, vector=document.vector, uuid=str(document.id)
)
return str(result)
except Exception as e:
raise StorageError(f"Failed to store document: {e}") from e
@override
async def store_batch(self, documents: list[Document]) -> list[str]:
"""
Store multiple documents in batch.
Args:
documents: List of documents
Returns:
List of successfully stored document IDs
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
# Vectorize documents without vectors
for doc in documents:
if doc.vector is None:
doc.vector = await self.vectorizer.vectorize(doc.content)
# Try individual inserts to avoid gRPC batch issues
successful_ids: list[str] = []
for doc in documents:
try:
properties = {
"content": doc.content,
"source_url": doc.metadata["source_url"],
"title": doc.metadata.get("title", ""),
"description": doc.metadata.get("description", ""),
"timestamp": doc.metadata["timestamp"].isoformat(),
"content_type": doc.metadata["content_type"],
"word_count": doc.metadata["word_count"],
"char_count": doc.metadata["char_count"],
"source": doc.source.value,
}
# Insert individual document
collection.data.insert(
properties=properties,
vector=doc.vector,
uuid=str(doc.id)
)
successful_ids.append(str(doc.id))
except Exception as e:
print(f"Failed to store document {doc.id}: {e}")
continue
if not successful_ids:
raise StorageError("All documents in batch failed to store")
return successful_ids
except Exception as e:
raise StorageError(f"Failed to store batch: {e}") from e
@override
async def retrieve(self, document_id: str) -> Document | None:
"""
Retrieve a document from Weaviate.
Args:
document_id: Document ID
Returns:
Document or None
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
result = collection.query.fetch_object_by_id(document_id)
if not result:
return None
# Reconstruct document
props = result.properties
metadata_dict = {
"source_url": str(props["source_url"]),
"title": str(props.get("title")) if props.get("title") else None,
"description": str(props.get("description")) if props.get("description") else None,
"timestamp": str(props["timestamp"]),
"content_type": str(props["content_type"]),
"word_count": int(str(props["word_count"])),
"char_count": int(str(props["char_count"])),
}
metadata = cast(DocumentMetadata, cast(object, metadata_dict))
vector_raw = result.vector.get("default") if result.vector else None
vector: list[float] | None = None
if isinstance(vector_raw, list) and vector_raw:
first_elem = vector_raw[0]
if isinstance(first_elem, list):
# Nested list - take first one and ensure all elements are numbers
nested_vector = first_elem
try:
vector = [float(x) for x in nested_vector if isinstance(x, (int, float))]
except (ValueError, TypeError):
vector = None
else:
# Flat list - ensure all elements are numbers
try:
vector = [float(x) for x in vector_raw if isinstance(x, (int, float))]
except (ValueError, TypeError):
vector = None
return Document(
id=UUID(document_id),
content=str(props["content"]),
metadata=metadata,
vector=vector,
                source=IngestionSource(str(props.get("source", "web"))),
collection=self.collection_name,
)
except Exception:
return None
@override
async def search(
self, query: str, limit: int = 10, threshold: float = 0.7
) -> AsyncGenerator[Document, None]:
"""
Search for documents in Weaviate.
Args:
query: Search query
limit: Maximum results
threshold: Similarity threshold
Yields:
Matching documents
"""
try:
# Vectorize query
query_vector = await self.vectorizer.vectorize(query)
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
# Perform vector search
results = collection.query.near_vector(
near_vector=query_vector,
limit=limit,
distance=1 - threshold, # Convert similarity to distance
return_metadata=["distance"],
)
for result in results.objects:
props = result.properties
metadata_dict = {
"source_url": str(props["source_url"]),
"title": str(props.get("title")) if props.get("title") else None,
"description": str(props.get("description"))
if props.get("description")
else None,
"timestamp": str(props["timestamp"]),
"content_type": str(props["content_type"]),
"word_count": int(str(props["word_count"])),
"char_count": int(str(props["char_count"])),
}
metadata = cast(DocumentMetadata, cast(object, metadata_dict))
vector_raw = result.vector.get("default") if result.vector else None
vector: list[float] | None = None
if isinstance(vector_raw, list) and vector_raw:
first_elem = vector_raw[0]
if isinstance(first_elem, list):
# Nested list - take first one and ensure all elements are numbers
nested_vector = first_elem
try:
vector = [
float(x) for x in nested_vector if isinstance(x, (int, float))
]
except (ValueError, TypeError):
vector = None
else:
# Flat list - ensure all elements are numbers
try:
vector = [float(x) for x in vector_raw if isinstance(x, (int, float))]
except (ValueError, TypeError):
vector = None
doc = Document(
id=result.uuid,
content=str(props["content"]),
metadata=metadata,
vector=vector,
                    source=IngestionSource(str(props.get("source", "web"))),
collection=self.collection_name,
)
yield doc
except Exception as e:
raise StorageError(f"Search failed: {e}") from e
@override
async def delete(self, document_id: str) -> bool:
"""
Delete a document from Weaviate.
Args:
document_id: Document ID
Returns:
True if deleted
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
collection.data.delete_by_id(document_id)
return True
except Exception:
return False
@override
async def count(self) -> int:
"""
Get document count in collection.
Returns:
Number of documents
"""
try:
if not self.client:
return 0
collection = self.client.collections.get(self.collection_name)
result = collection.aggregate.over_all(total_count=True)
return result.total_count or 0
except Exception:
return 0
async def list_collections(self) -> list[str]:
"""
List all available collections.
Returns:
List of collection names
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
return list(self.client.collections.list_all())
except Exception as e:
raise StorageError(f"Failed to list collections: {e}") from e
async def sample_documents(self, limit: int = 5) -> list[Document]:
"""
Get sample documents from the collection.
Args:
limit: Maximum number of documents to return
Returns:
List of sample documents
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
# Query for sample documents
response = collection.query.fetch_objects(limit=limit)
documents = []
for obj in response.objects:
# Convert back to Document format
props = obj.properties
# Safely convert WeaviateField values
word_count_val = props.get("word_count")
if isinstance(word_count_val, (int, float)):
word_count = int(word_count_val)
elif word_count_val:
word_count = int(str(word_count_val))
else:
word_count = 0
char_count_val = props.get("char_count")
if isinstance(char_count_val, (int, float)):
char_count = int(char_count_val)
elif char_count_val:
char_count = int(str(char_count_val))
else:
char_count = 0
doc = Document(
id=obj.uuid,
content=str(props.get("content", "")),
source=IngestionSource(str(props.get("source", "web"))),
metadata={
"source_url": str(props.get("source_url", "")),
"title": str(props.get("title", "")) if props.get("title") else None,
"description": str(props.get("description", "")) if props.get("description") else None,
"timestamp": datetime.fromisoformat(str(props.get("timestamp", datetime.now(UTC).isoformat()))),
"content_type": str(props.get("content_type", "text/plain")),
"word_count": word_count,
"char_count": char_count,
}
)
documents.append(doc)
return documents
except Exception as e:
raise StorageError(f"Failed to sample documents: {e}") from e
async def search_documents(self, query: str, limit: int = 10) -> list[Document]:
"""
Search documents in the collection.
Args:
query: Search query
limit: Maximum number of results
Returns:
List of matching documents
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
# Try hybrid search first, fall back to BM25 keyword search
try:
response = collection.query.hybrid(
query=query,
limit=limit,
return_metadata=["score"]
)
except Exception:
# Fall back to BM25 keyword search if hybrid search fails
response = collection.query.bm25(
query=query,
limit=limit,
return_metadata=["score"]
)
documents = []
for obj in response.objects:
# Convert back to Document format
props = obj.properties
# Safely convert WeaviateField values
word_count_val = props.get("word_count")
if isinstance(word_count_val, (int, float)):
word_count = int(word_count_val)
elif word_count_val:
word_count = int(str(word_count_val))
else:
word_count = 0
char_count_val = props.get("char_count")
if isinstance(char_count_val, (int, float)):
char_count = int(char_count_val)
elif char_count_val:
char_count = int(str(char_count_val))
else:
char_count = 0
# Build metadata - note that search_score is not part of DocumentMetadata
metadata: DocumentMetadata = {
"source_url": str(props.get("source_url", "")),
"title": str(props.get("title", "")) if props.get("title") else None,
"description": str(props.get("description", "")) if props.get("description") else None,
"timestamp": datetime.fromisoformat(str(props.get("timestamp", datetime.now(UTC).isoformat()))),
"content_type": str(props.get("content_type", "text/plain")),
"word_count": word_count,
"char_count": char_count,
}
doc = Document(
id=obj.uuid,
content=str(props.get("content", "")),
source=IngestionSource(str(props.get("source", "web"))),
metadata=metadata
)
documents.append(doc)
return documents
except Exception as e:
raise StorageError(f"Failed to search documents: {e}") from e
async def list_documents(self, limit: int = 100, offset: int = 0) -> list[dict[str, str | int]]:
"""
List documents in the collection with pagination.
Args:
limit: Maximum number of documents to return
offset: Number of documents to skip
Returns:
List of document dictionaries with id, title, source_url, and content preview
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
# Query documents with pagination
response = collection.query.fetch_objects(
limit=limit,
offset=offset,
return_metadata=["creation_time"]
)
documents = []
for obj in response.objects:
props = obj.properties
content = str(props.get("content", ""))
word_count_value = props.get("word_count", 0)
# Convert WeaviateField to int
if isinstance(word_count_value, (int, float)):
word_count = int(word_count_value)
elif word_count_value:
word_count = int(str(word_count_value))
else:
word_count = 0
doc_info: dict[str, str | int] = {
"id": str(obj.uuid),
"title": str(props.get("title", "Untitled")),
"source_url": str(props.get("source_url", "")),
"content_preview": content[:200] + "..." if len(content) > 200 else content,
"word_count": word_count,
"timestamp": str(props.get("timestamp", "")),
}
documents.append(doc_info)
return documents
except Exception as e:
raise StorageError(f"Failed to list documents: {e}") from e
async def delete_documents(self, document_ids: list[str]) -> dict[str, bool]:
"""
Delete multiple documents from Weaviate.
Args:
document_ids: List of document IDs to delete
Returns:
Dictionary mapping document IDs to deletion success status
"""
results = {}
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
for doc_id in document_ids:
try:
collection.data.delete_by_id(doc_id)
results[doc_id] = True
except Exception:
results[doc_id] = False
return results
except Exception as e:
raise StorageError(f"Failed to delete documents: {e}") from e
async def delete_by_filter(self, filter_dict: dict[str, str]) -> int:
"""
Delete documents matching a filter.
Args:
filter_dict: Filter criteria (e.g., {"source_url": "example.com"})
Returns:
Number of documents deleted
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
collection = self.client.collections.get(self.collection_name)
# Build where filter
where_filter = None
if "source_url" in filter_dict:
from weaviate.classes.query import Filter
where_filter = Filter.by_property("source_url").equal(filter_dict["source_url"])
# Get documents matching filter
if where_filter:
response = collection.query.fetch_objects(
filters=where_filter,
limit=1000 # Max batch size
)
else:
response = collection.query.fetch_objects(
limit=1000 # Max batch size
)
# Delete matching documents
deleted_count = 0
for obj in response.objects:
try:
collection.data.delete_by_id(obj.uuid)
deleted_count += 1
except Exception:
continue
return deleted_count
except Exception as e:
raise StorageError(f"Failed to delete by filter: {e}") from e
async def delete_collection(self) -> bool:
"""
Delete the entire collection.
Returns:
True if successful
"""
try:
if not self.client:
raise StorageError("Weaviate client not initialized")
# Delete the collection using the client's collections API
self.client.collections.delete(self.collection_name)
return True
except Exception as e:
raise StorageError(f"Failed to delete collection: {e}") from e
def __del__(self) -> None:
"""Clean up client connection."""
if self.client:
self.client.close()
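
An end-to-end sketch against a local Weaviate; note the collection name is capitalized internally (`docs_demo` becomes `Docs_demo`):

```python
import asyncio

from pydantic import HttpUrl

from ingest_pipeline.core.models import StorageBackend, StorageConfig
from ingest_pipeline.storage import WeaviateStorage


async def main() -> None:
    config = StorageConfig(
        backend=StorageBackend.WEAVIATE,
        endpoint=HttpUrl("http://weaviate.yo"),
        collection_name="docs_demo",
    )
    storage = WeaviateStorage(config)
    await storage.initialize()
    print("collections:", await storage.list_collections())
    print("count:", await storage.count())
    for doc in await storage.search_documents("pipeline", limit=5):
        print(doc.metadata["title"])


if __name__ == "__main__":
    asyncio.run(main())
```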

View File

@@ -0,0 +1,6 @@
"""Utility modules."""
from .metadata_tagger import MetadataTagger
from .vectorizer import Vectorizer
__all__ = ["MetadataTagger", "Vectorizer"]

View File

@@ -0,0 +1,269 @@
"""Metadata tagger for enriching documents with AI-generated tags and metadata."""
import json
from datetime import UTC, datetime
from typing import TypedDict
import httpx
from ..core.exceptions import IngestionError
from ..core.models import Document
class DocumentMetadata(TypedDict, total=False):
"""Structured metadata for documents."""
tags: list[str]
category: str
summary: str
key_topics: list[str]
document_type: str
language: str
technical_level: str
class MetadataTagger:
"""Generates metadata tags for documents using language models."""
endpoint: str
model: str
client: httpx.AsyncClient
def __init__(
self,
llm_endpoint: str = "http://llm.lab",
model: str = "openai/gpt-4o-mini",
):
"""
Initialize metadata tagger.
Args:
llm_endpoint: LLM API endpoint
model: Model to use for tagging
"""
self.endpoint = llm_endpoint
self.model = model
# Get API key from environment
import os
from pathlib import Path
from dotenv import load_dotenv
# Load .env from the project root
env_path = Path(__file__).parent.parent.parent / ".env"
load_dotenv(env_path)
api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") or ""
headers = {"Content-Type": "application/json"}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
self.client = httpx.AsyncClient(timeout=60.0, headers=headers)
async def tag_document(
self, document: Document, custom_instructions: str | None = None
) -> Document:
"""
Analyze document and generate metadata tags.
Args:
document: Document to tag
custom_instructions: Optional custom instructions for tagging
Returns:
Document with enriched metadata
"""
if not document.content:
return document
try:
# Generate metadata using LLM
metadata = await self._generate_metadata(
document.content,
document.metadata.get("title") if document.metadata else None,
custom_instructions
)
# Merge with existing metadata, guarding against a missing metadata dict
# and preserving the required fields
from ..core.models import DocumentMetadata as CoreDocumentMetadata
existing = document.metadata or {}
updated_metadata: CoreDocumentMetadata = {
"source_url": existing.get("source_url", ""),
"title": existing.get("title"),  # the generated metadata carries no title field
"description": metadata.get("summary") or existing.get("description"),
"timestamp": existing.get("timestamp", datetime.now(UTC)),
"content_type": existing.get("content_type", "text/plain"),
"word_count": existing.get("word_count", len(document.content.split())),
"char_count": existing.get("char_count", len(document.content)),
}
# Store additional metadata as extra fields in the document's metadata
# Note: Since DocumentMetadata is a TypedDict, we can only include the defined fields
# Additional metadata like tags, category, etc. would need to be stored separately
# or the DocumentMetadata model would need to be extended
document.metadata = updated_metadata
return document
except Exception as e:
raise IngestionError(f"Failed to tag document: {e}") from e
async def tag_batch(
self,
documents: list[Document],
custom_instructions: str | None = None,
) -> list[Document]:
"""
Tag multiple documents with metadata.
Args:
documents: Documents to tag
custom_instructions: Optional custom instructions
Returns:
Documents with enriched metadata
"""
tagged_docs: list[Document] = []
for doc in documents:
tagged_doc = await self.tag_document(doc, custom_instructions)
tagged_docs.append(tagged_doc)
return tagged_docs
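# tag_batch processes documents sequentially. If throughput matters, a
# bounded-concurrency variant is a small change (sketch, assuming asyncio is
# imported and a cap such as MAX_CONCURRENT_TASKS=5):
#
#     sem = asyncio.Semaphore(5)
#     async def _tag_one(doc: Document) -> Document:
#         async with sem:
#             return await self.tag_document(doc, custom_instructions)
#     tagged_docs = list(await asyncio.gather(*(_tag_one(d) for d in documents)))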
async def _generate_metadata(
self,
content: str,
title: str | None = None,
custom_instructions: str | None = None,
) -> DocumentMetadata:
"""
Generate metadata using LLM.
Args:
content: Document content
title: Document title
custom_instructions: Optional custom instructions
Returns:
Generated metadata dictionary
"""
# Prepare the prompt
system_prompt = """You are a document metadata tagger. Analyze the given content and generate relevant metadata.
Return a JSON object with the following structure:
{
"tags": ["tag1", "tag2", ...], # 3-7 relevant topic tags
"category": "string", # Main category
"summary": "string", # 1-2 sentence summary
"key_topics": ["topic1", "topic2", ...], # Main topics discussed
"document_type": "string", # Type of document (e.g., "technical", "tutorial", "reference")
"language": "string", # Primary language (e.g., "en", "es")
"technical_level": "string" # One of: "beginner", "intermediate", "advanced"
}"""
if custom_instructions:
system_prompt += f"\n\nAdditional instructions: {custom_instructions}"
# Prepare user prompt
user_prompt = "Document to analyze:\n"
if title:
user_prompt += f"Title: {title}\n"
user_prompt += f"Content:\n{content[:3000]}" # Limit content length
# Call LLM
response = await self.client.post(
f"{self.endpoint}/v1/chat/completions",
json={
"model": self.model,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
],
"temperature": 0.3,
"max_tokens": 500,
"response_format": {"type": "json_object"},
},
)
response.raise_for_status()
result = response.json()
if not isinstance(result, dict):
raise IngestionError("Invalid response format from LLM")
# Extract content from response
choices = result.get("choices", [])
if not choices:
raise IngestionError("No response from LLM")
message = choices[0].get("message", {})
content_str = message.get("content", "{}")
try:
metadata = json.loads(content_str)
except json.JSONDecodeError as e:
raise IngestionError(f"Failed to parse LLM response: {e}") from e
# Validate and sanitize metadata
return self._sanitize_metadata(metadata)
def _sanitize_metadata(self, metadata: dict[str, object]) -> DocumentMetadata:
"""
Sanitize and validate metadata.
Args:
metadata: Raw metadata from LLM
Returns:
Sanitized metadata
"""
sanitized: DocumentMetadata = {}
# Tags
if "tags" in metadata and isinstance(metadata["tags"], list):
tags = [str(tag).lower().strip() for tag in metadata["tags"][:10]]
sanitized["tags"] = [tag for tag in tags if tag]
# Category
if "category" in metadata:
sanitized["category"] = str(metadata["category"]).strip()
# Summary
if "summary" in metadata:
summary = str(metadata["summary"]).strip()
if summary:
sanitized["summary"] = summary[:500] # Limit length
# Key topics
if "key_topics" in metadata and isinstance(metadata["key_topics"], list):
topics = [str(topic).strip() for topic in metadata["key_topics"][:10]]
sanitized["key_topics"] = [topic for topic in topics if topic]
# Document type
if "document_type" in metadata:
sanitized["document_type"] = str(metadata["document_type"]).strip()
# Language
if "language" in metadata:
lang = str(metadata["language"]).strip().lower()
if len(lang) == 2: # Basic validation for ISO 639-1
sanitized["language"] = lang
# Technical level
if "technical_level" in metadata:
level = str(metadata["technical_level"]).strip().lower()
if level in ["beginner", "intermediate", "advanced"]:
sanitized["technical_level"] = level
return sanitized
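# Worked example (illustrative): invalid values are dropped rather than raised.
#
#     raw = {"tags": [" Python ", "WEAVIATE"], "language": "en", "technical_level": "expert"}
#     tagger._sanitize_metadata(raw)
#     # -> {"tags": ["python", "weaviate"], "language": "en"}  ("expert" is not an accepted level)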
async def __aenter__(self) -> "MetadataTagger":
"""Async context manager entry."""
return self
async def __aexit__(self, *args: object) -> None:
"""Async context manager exit."""
await self.client.aclose()
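# Usage sketch (endpoint and document are hypothetical): entering the async
# context manager guarantees the underlying httpx client is closed on exit.
#
#     async with MetadataTagger(llm_endpoint="http://llm.lab") as tagger:
#         tagged = await tagger.tag_document(document)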

View File

@@ -0,0 +1,220 @@
"""Vectorizer utility for generating embeddings."""
from types import TracebackType
from typing import Self
import httpx
from ..core.exceptions import VectorizationError
from ..core.models import StorageConfig, VectorConfig
class Vectorizer:
"""Handles text vectorization using LLM endpoints."""
endpoint: str
model: str
dimension: int
client: httpx.AsyncClient
def __init__(self, config: StorageConfig | VectorConfig):
"""
Initialize vectorizer.
Args:
config: Configuration with embedding details
"""
if isinstance(config, StorageConfig):
# When given a StorageConfig, use the pipeline's default embedding
# settings (the config object is not consulted for them)
self.endpoint = "http://llm.lab"
self.model = "ollama/bge-m3:latest"
self.dimension = 1024
else:
self.endpoint = str(config.embedding_endpoint)
self.model = config.model
self.dimension = config.dimension
# Get API key from environment
import os
from dotenv import load_dotenv
from pathlib import Path
# Load .env from the project root
env_path = Path(__file__).parent.parent.parent / ".env"
load_dotenv(env_path)
api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") or ""
headers = {"Content-Type": "application/json"}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
self.client = httpx.AsyncClient(timeout=60.0, headers=headers)
async def vectorize(self, text: str) -> list[float]:
"""
Generate embedding vector for text.
Args:
text: Text to vectorize
Returns:
Embedding vector
"""
if not text:
raise VectorizationError("Cannot vectorize empty text")
try:
# Prepare request based on model type
if "ollama" in self.model:
response = await self._ollama_embed(text)
else:
response = await self._openai_embed(text)
return response
except Exception as e:
raise VectorizationError(f"Vectorization failed: {e}") from e
async def vectorize_batch(self, texts: list[str]) -> list[list[float]]:
"""
Generate embeddings for multiple texts.
Args:
texts: List of texts to vectorize
Returns:
List of embedding vectors
"""
vectors: list[list[float]] = []
for text in texts:
vector = await self.vectorize(text)
vectors.append(vector)
return vectors
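# vectorize_batch issues one request per text. OpenAI-compatible /v1/embeddings
# endpoints commonly accept a list as "input", so a single batched request is a
# possible optimization (sketch; confirm your backend supports list input):
#
#     response = await self.client.post(
#         f"{self.endpoint}/v1/embeddings",
#         json={"model": self.model, "input": texts},
#     )
#     vectors = [item["embedding"] for item in response.json()["data"]]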
async def _ollama_embed(self, text: str) -> list[float]:
"""
Generate embedding using Ollama via an OpenAI-compatible endpoint.
Args:
text: Text to embed
Returns:
Embedding vector
"""
# Ollama models are served through the same OpenAI-compatible /v1/embeddings
# route, with the full model name (e.g. "ollama/bge-m3:latest") passed through
# unchanged, so request and response handling is identical to _openai_embed.
return await self._openai_embed(text)
async def _openai_embed(self, text: str) -> list[float]:
"""
Generate embedding using OpenAI-compatible API.
Args:
text: Text to embed
Returns:
Embedding vector
"""
response = await self.client.post(
f"{self.endpoint}/v1/embeddings",
json={
"model": self.model,
"input": text,
},
)
_ = response.raise_for_status()
response_data = response.json()
if not isinstance(response_data, dict):
raise VectorizationError("Invalid response format from OpenAI API")
embeddings_raw = response_data.get("data", [])
if not isinstance(embeddings_raw, list) or not embeddings_raw:
raise VectorizationError("No embeddings returned")
first_embedding_data = embeddings_raw[0]
if not isinstance(first_embedding_data, dict):
raise VectorizationError("Invalid embedding data format")
embedding_raw = first_embedding_data.get("embedding")
if not isinstance(embedding_raw, list):
raise VectorizationError("Invalid embedding format")
# Convert to float list and validate
embedding: list[float] = []
for item in embedding_raw:
if isinstance(item, (int, float)):
embedding.append(float(item))
else:
raise VectorizationError(f"Invalid embedding value type: {type(item)}")
# Coerce to the configured dimension; truncation and zero-padding are lossy,
# so this mainly guards against a misconfigured EMBEDDING_DIMENSION
if len(embedding) != self.dimension:
if len(embedding) > self.dimension:
embedding = embedding[: self.dimension]
else:
embedding.extend([0.0] * (self.dimension - len(embedding)))
return embedding
async def __aenter__(self) -> Self:
"""Async context manager entry."""
return self
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: TracebackType | None,
) -> None:
"""Async context manager exit."""
await self.client.aclose()
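# Usage sketch (values are illustrative; VectorConfig fields match those read
# in __init__ above):
#
#     config = VectorConfig(
#         embedding_endpoint="http://llm.lab",
#         model="ollama/bge-m3:latest",
#         dimension=1024,
#     )
#     async with Vectorizer(config) as vectorizer:
#         vector = await vectorizer.vectorize("hello world")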

78
pyproject.toml Normal file
View File

@@ -0,0 +1,78 @@
[project]
name = "ingest-pipeline"
version = "0.1.0"
description = "Document ingestion pipeline with Prefect orchestration"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"prefect>=2.14.0",
"pydantic>=2.5.0",
"pydantic-settings>=2.1.0",
"firecrawl-py>=0.0.1",
"gitpython>=3.1.40",
"weaviate-client>=4.4.0",
"httpx>=0.25.0",
"typer>=0.9.0",
"rich>=13.7.0",
"textual>=0.50.0",
"python-dotenv>=1.0.0",
]
[project.scripts]
ingest = "ingest_pipeline.cli.main:app"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["ingest_pipeline"]
[tool.uv]
dev-dependencies = [
"pytest>=7.4.0",
"pytest-asyncio>=0.21.0",
"pytest-cov>=4.1.0",
"mypy>=1.7.0",
"ruff>=0.1.0",
"basedpyright>=1.31.4",
]
[tool.ruff]
line-length = 100
target-version = "py311"
[tool.ruff.lint]
select = [
"E", # pycodestyle errors
"W", # pycodestyle warnings
"F", # pyflakes
"I", # isort
"B", # flake8-bugbear
"C4", # flake8-comprehensions
"UP", # pyupgrade
]
ignore = [
"E501", # line too long (handled by formatter)
]
[tool.ruff.lint.per-file-ignores]
"ingest_pipeline/cli/main.py" = ["B008"] # Typer uses function calls in defaults
[tool.mypy]
python_version = "3.11"
strict = true
warn_return_any = true
warn_unused_configs = true
ignore_missing_imports = true
# Allow AsyncGenerator types in overrides
disable_error_code = ["override"]
[tool.pytest.ini_options]
asyncio_mode = "auto"
testpaths = ["tests"]
pythonpath = ["."]
[tool.coverage.run]
source = ["ingest_pipeline"]
omit = ["*/tests/*", "*/__main__.py"]

3
tui Executable file
View File

@@ -0,0 +1,3 @@
#!/usr/bin/env bash
cd "$(dirname "$0")"
uv run python -m ingest_pipeline tui

2771
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff