This commit is contained in:
2025-09-19 13:34:17 +00:00
parent 97bca3809e
commit d482923804
53 changed files with 8013 additions and 7173 deletions

.env
View File

@@ -25,6 +25,7 @@ FIRECRAWL_ENDPOINT=http://crawl.lab:30002
# Model Configuration
EMBEDDING_MODEL=ollama/bge-m3:latest
EMBEDDING_DIMENSION=1024
METADATA_MODEL=fireworks/glm-4p5-air
# Ingestion Settings
BATCH_SIZE=50

View File

@@ -2,6 +2,8 @@
FIRECRAWL_API_KEY=
OPENWEBUI_API_KEY=
WEAVIATE_API_KEY=
LLM_API_KEY=
OPENAI_API_KEY=
# Endpoints
LLM_ENDPOINT=http://llm.lab
@@ -12,6 +14,7 @@ FIRECRAWL_ENDPOINT=http://crawl.lab:30002
# Model Configuration
EMBEDDING_MODEL=ollama/bge-m3:latest
EMBEDDING_DIMENSION=1024
METADATA_MODEL=fireworks/glm-4p5-air
# Ingestion Settings
BATCH_SIZE=50

.vscode/settings.json vendored
View File

@@ -7,14 +7,23 @@
"python.linting.mypyPath": "./.venv/bin/mypy",
"python.linting.pylintEnabled": false,
"python.linting.flake8Enabled": false,
"python.analysis.typeCheckingMode": "basic",
"python.analysis.typeCheckingMode": "strict",
"python.analysis.autoImportCompletions": true,
"python.analysis.stubPath": "./.venv/lib/python3.12/site-packages",
"python.analysis.memory.keepLibraryAst": false,
"python.analysis.indexing": true,
"python.analysis.packageIndexDepths": [
{
"name": "",
"depth": 2
}
],
"basedpyright.analysis.typeCheckingMode": "standard",
"basedpyright.analysis.autoSearchPaths": true,
"basedpyright.analysis.autoImportCompletions": true,
"basedpyright.analysis.diagnosticMode": "workspace",
"basedpyright.analysis.stubPath": "./.venv/lib/python3.12/site-packages",
"basedpyright.analysis.useLibraryCodeForTypes": false,
"basedpyright.analysis.extraPaths": [
"./ingest_pipeline",
"./.venv/lib/python3.12/site-packages"
@@ -29,9 +38,33 @@
"./.venv/lib/python3.12/site-packages"
],
"files.exclude": {
".mypy_cache": true,
"**/__pycache__": true,
"**/.pytest_cache": true,
"**/node_modules": true,
".mypy_cache": true
}
"**/.ruff": true,
"**/.uv**": true,
"**/.venv": true,
"**/node_modules": true
},
"python.analysis.enableTroubleshootMissingImports": true,
"python.analysis.generateWithTypeAnnotation": true,
"python.analysis.inlayHints.callArgumentNames": "partial",
"python.analysis.languageServerMode": "full",
"python.analysis.regenerateStdLibIndices": true,
"python.analysis.typeEvaluation.enableExperimentalFeatures": true,
"python.analysis.typeEvaluation.strictDictionaryInference": true,
"python.analysis.typeEvaluation.strictListInference": true,
"python.analysis.typeEvaluation.strictSetInference": true,
"python.terminal.activateEnvInCurrentTerminal": true,
"python.testing.pytestEnabled": true,
"python.useEnvironmentsExtension": true,
"editor.formatOnSave": true,
"mcp": {},
"python.pyrefly.displayTypeErrors": "force-on",
"python-envs.defaultEnvManager": "ms-python.python:venv",
"python-envs.defaultPackageManager": "charliermarsh.ruff:uv",
"python-envs.pythonProjects": [],
"python.analysis.fixAll": [],
"python.analysis.includeAliasesFromUserFiles": true,
"python.analysis.showOnlyDirectDependenciesInAutoImport": true
}

View File

@@ -1,106 +1,263 @@
# TUI Feeds
## Codebase Analysis Report: RAG Manager Ingestion Pipeline
This guide explains how the terminal dashboard surfaces collection activity and status signals so new backends can plug in without duplicating UI logic.
**Status:** Validated against current codebase implementation
**Target:** Enhanced implementation guidance for efficient agent execution
***
This analysis has been validated against the actual codebase structure and provides implementation-specific details for executing recommended improvements. The codebase demonstrates solid architecture with clear separation of concerns between ingestion flows, storage adapters, and TUI components.
## Activity Feed
### Architecture Overview
- **Storage Backends**: Weaviate, OpenWebUI, R2R with unified `BaseStorage` interface
- **TUI Framework**: Textual-based with reactive components and async worker patterns
- **Orchestration**: Prefect flows with retry logic and progress callbacks
- **Configuration**: Pydantic-based settings with environment variable support
- **Primary surface:** `#activity_feed` widget inside `DashboardScreen` (`ingest_pipeline/cli/tui/screens/dashboard.py`).
- **Data source:** `self.collections`, populated by `refresh_collections()` after gathering payloads from Weaviate and OpenWebUI via `describe_collections()`.
- **Selection logic:** `_generate_activity_text()` formats the three most recent `CollectionInfo` entries and appends an aggregate line when additional collections exist.
- **Empty state:** Presents the call-to-action _“🚀 No collections found…”_, encouraging the user to launch an ingestion run.
- **Icons:** `_get_content_type_icon()` maps collection names containing `web`, `doc`, or `repo` to 🌐/📖/📦 respectively, and falls back to 📄. Update this helper when introducing new naming conventions.
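The helper is small enough to read at a glance; a minimal sketch of its shape, with the mappings assumed from the conventions above:

```python
def _get_content_type_icon(self, collection_name: str) -> str:
    """Map a collection name to a feed icon (sketch; mappings assumed)."""
    lowered = collection_name.lower()
    if "web" in lowered:
        return "🌐"  # web crawls
    if "doc" in lowered:
        return "📖"  # documentation sites
    if "repo" in lowered:
        return "📦"  # repositories
    return "📄"  # default
```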
### Validated Implementation Analysis
### When it refreshes
### 1. Bug Fixes & Potential Issues
1. `refresh_collections()` loads data for each connected backend and caches it in `self.collections`.
2. `_update_activity_feed()` is triggered from `update_metrics()` immediately after metrics cards recompute.
3. The Static widget updates with a newline-delimited summary, keeping the dashboard reactive without rerendering the entire layout.
These are areas where the code may not function as intended or could lead to errors.
To surface a new backend, extend either `list_weaviate_collections()` or `list_openwebui_collections()` with the additional source (or introduce a new list helper) and ensure the resulting dictionaries match the `CollectionInfo` contract.
* <details>
<summary>
<b>HIGH PRIORITY: `R2RStorage.store_batch` inefficient looping (Lines 161-179)</b>
</summary>
***
* **File:** `ingest_pipeline/storage/r2r/storage.py:161-179`
* **Issue:** CONFIRMED - Method loops through documents calling `_store_single_document` individually
* **Impact:** ~5-10x performance degradation for batch operations
* **Implementation:** Check R2R v3 API for bulk endpoints; current implementation uses `/v3/documents` per document
* **Effort:** Medium (API research + refactor)
* **Priority:** High - affects all R2R ingestion workflows
</details>
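Until a bulk endpoint is confirmed, one interim option is to fan the existing per-document calls out concurrently. This is a sketch under that assumption, reusing the `_store_single_document` helper named above and assuming it returns the stored document id; `Document` comes from the real module:

```python
import asyncio
import logging

LOGGER = logging.getLogger(__name__)

async def store_batch(self, documents: list["Document"]) -> list[str]:
    """Interim sketch: remove serial round-trips by issuing the existing
    per-document requests concurrently. A true R2R v3 bulk endpoint, if
    one exists, would still be preferable."""
    results = await asyncio.gather(
        *(self._store_single_document(doc) for doc in documents),
        return_exceptions=True,
    )
    stored: list[str] = []
    for doc, result in zip(documents, results, strict=True):
        if isinstance(result, BaseException):
            LOGGER.warning("Failed to store document %s: %s", doc.id, result)
        else:
            stored.append(result)
    return stored
```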
## Status Ticker
* <details>
<summary>
<b>MEDIUM PRIORITY: Mixed HTTP client usage in `R2RStorage` (Lines 80, 99, 258)</b>
</summary>
- **Widget:** `#status_text` Static component under the metrics card cluster.
- **Lifecycle:** `refresh_collections()` pushes human-readable messages as each backend initializes, succeeds, or fails, ending with a ready state.
- **Problem reporting:** Failures bubble into rich notifications via `self.notify` and remain visible in the ticker until the next refresh attempt.
- **System health badge:** `_update_status_card()` converts backend counts into 🟢/🟡/🔴 badges so operators can judge connectivity at a glance.
* **File:** `ingest_pipeline/storage/r2r/storage.py:80,99,258`
* **Issue:** VALIDATED - Mixes `R2RAsyncClient` (line 80) with direct `httpx.AsyncClient` (lines 99, 258)
* **Specific Methods:** `initialize()`, `_ensure_collection()`, `_attempt_document_creation()`
* **Impact:** Inconsistent auth/header handling, connection pooling inefficiency
* **Implementation:** Extend `R2RAsyncClient` or create adapter pattern for missing endpoints
* **Test Coverage:** Check if affected methods have unit tests before refactoring
* **Effort:** Medium (requires SDK analysis)
</details>
When adding a backend integration, hook into the progress text updates inside `refresh_collections()` so the ticker narrates each stage consistently.
* <details>
<summary>
<b>MEDIUM PRIORITY: TUI blocking during storage init (Line 91)</b>
</summary>
***
* **File:** `ingest_pipeline/cli/tui/utils/runners.py:91`
* **Issue:** CONFIRMED - `await storage_manager.initialize_all_backends()` blocks TUI startup
* **Current Implementation:** 30s timeout per backend in `StorageManager.initialize_all_backends()`
* **User Impact:** Frozen terminal for up to 90s if all backends timeout
* **Solution:** Move to `CollectionOverviewScreen.on_mount()` as `@work` task
* **Dependencies:** `dashboard.py:304` already has worker pattern for `refresh_collections`
* **Implementation:** Use existing loading indicators and status updates (lines 308-312)
* **Effort:** Low (pattern exists, needs relocation)
</details>
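A sketch of the relocation using Textual's worker decorator; `self.app.storage_manager` is an assumed attribute path:

```python
from textual import work
from textual.screen import Screen

class CollectionOverviewScreen(Screen[None]):
    def on_mount(self) -> None:
        # Render immediately; backends come up in the background worker.
        self.initialize_backends()

    @work(exclusive=True)
    async def initialize_backends(self) -> None:
        """Background initialization so a slow backend cannot freeze startup."""
        status = await self.app.storage_manager.initialize_all_backends()
        ready = sum(1 for ok in status.values() if ok)
        self.notify(f"{ready}/{len(status)} storage backends ready")
        self.refresh_collections()  # existing worker-backed refresh
```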
## Notifications & Progress
* <details>
<summary>
<b>LOW PRIORITY: Weak URL validation in `IngestionScreen` (Lines 240-260)</b>
</summary>
- **Toast notifications:** All feed-relevant exceptions use `self.notify` with severity hints, keeping the activity feed focused on successful runs.
- **Ingestion progress:** `IngestionScreen.perform_ingestion()` (same module) drives the animated progress bar and sends celebratory/failure messages that complement the dashboard feed once control returns to the main screen.
* **File:** `ingest_pipeline/cli/tui/screens/ingestion.py:240-260`
* **Issue:** CONFIRMED - Method accepts `foo/bar` as valid (line 258)
* **Security Risk:** Medium - malicious URLs could be passed to ingestors
* **Current Logic:** Basic prefix checks only (http/https/file://)
* **Enhancement:** Add `pathlib.Path.exists()` for file:// paths, `.git` directory check for repos
* **Dependencies:** Import `pathlib` and add proper regex validation
* **Alternative:** Use `validators` library (not currently imported)
* **Effort:** Low (validation logic only)
</details>
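A standard-library-only sketch of the tightened check; the accepted source kinds follow the prefixes above, and the hostname heuristic is an assumption:

```python
import re
from pathlib import Path
from urllib.parse import urlparse

_HTTP_URL = re.compile(r"^https?://", re.IGNORECASE)

def is_valid_source(raw: str) -> bool:
    """Reject inputs like 'foo/bar' that bare prefix checks accept."""
    if raw.startswith("file://"):
        return Path(urlparse(raw).path).exists()
    if _HTTP_URL.match(raw):
        netloc = urlparse(raw).netloc
        return bool(netloc) and ("." in netloc or netloc.startswith("localhost"))
    # Treat anything else as a local repository path and require .git.
    candidate = Path(raw)
    return candidate.is_dir() and (candidate / ".git").is_dir()
```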
***
### 2. Code Redundancy & Refactoring Opportunities
## Extending the Feed System
These suggestions aim to make the code more concise, maintainable, and reusable (DRY: Don't Repeat Yourself).
1. Return a fully populated `CollectionInfo` (name, type, backend label, status, last_updated, size_mb, count); a populated example follows this list.
2. Call `update_metrics()` after mutating `self.collections` so both metrics cards and the activity feed stay in sync.
3. Adjust `_get_content_type_icon()` or `_format_collection_item()` if the new source warrants distinct labeling.
4. Update end-to-end tests or manual runbooks to verify the ticker, notifications, and activity feed stay coherent after integration.
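A hypothetical loader for a new backend, mirroring the existing Weaviate/OpenWebUI helpers; the `newbackend` client attribute and its label are placeholders:

```python
from datetime import datetime

async def list_newbackend_collections(self) -> list[CollectionInfo]:
    """Hypothetical example: populate every CollectionInfo field."""
    overview = await self.newbackend.describe_collections()  # placeholder client
    return [
        CollectionInfo(
            name=str(item.get("name", "Unknown")),
            type="newbackend",
            count=int(item.get("count", 0)),
            backend="🧪 NewBackend",
            status="✓ Active",
            last_updated=datetime.now().strftime("%Y-%m-%d %H:%M"),
            size_mb=float(item.get("size_mb", 0.0)),
        )
        for item in overview
    ]
```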
* <details>
<summary>
<b>HIGH IMPACT: Redundant collection logic in dashboard (Lines 356-424)</b>
</summary>
***
* **File:** `ingest_pipeline/cli/tui/screens/dashboard.py:356-424`
* **Issue:** CONFIRMED - `list_weaviate_collections()` and `list_openwebui_collections()` duplicate `StorageManager.get_all_collections()`
* **Code Duplication:** ~70 lines of redundant collection listing logic
* **Architecture Violation:** UI layer coupled to specific storage implementations
* **Current Usage:** `refresh_collections()` calls `get_all_collections()` (line 327), making these methods obsolete
* **Action:** DELETE methods `list_weaviate_collections` and `list_openwebui_collections`
* **Impact:** Code reduction ~70 lines, improved maintainability
* **Risk:** Low - methods appear unused in current flow
* **Effort:** Low (deletion only)
</details>
## Implementation Status (September 17, 2025)
* <details>
<summary>
<b>MEDIUM IMPACT: Repetitive backend init pattern (Lines 255-291)</b>
</summary>
| Component | Responsibility | Location |
| --- | --- | --- |
| Activity feed rendering | `_update_activity_feed`, `_generate_activity_text`, `_format_collection_item` | `ingest_pipeline/cli/tui/screens/dashboard.py` |
| Backend loaders | `list_weaviate_collections`, `list_openwebui_collections` | `ingest_pipeline/cli/tui/screens/dashboard.py` |
| Status ticker & health badge | `_update_status_card`, `refresh_collections` progress updates | `ingest_pipeline/cli/tui/screens/dashboard.py` |
| Ingestion progress hand-off | `perform_ingestion` success/error notifications | `ingest_pipeline/cli/tui/screens/ingestion.py` |
* **File:** `ingest_pipeline/cli/tui/utils/storage_manager.py:255-291`
* **Issue:** CONFIRMED - Pattern repeated 3x for each backend type
* **Code Structure:** Check settings → Create config → Add task (12 lines × 3 backends)
* **Current Backends:** Weaviate (258-267), OpenWebUI (270-279), R2R (282-291)
* **Refactor Pattern:** Create `BackendConfig` dataclass with `(backend_type, endpoint_setting, api_key_setting, storage_class)`
* **Implementation:** Loop over config list, reducing ~36 lines to ~15 lines
* **Extensibility:** Adding new backend becomes one-line config addition
* **Testing:** Ensure `asyncio.gather()` behavior unchanged (line 296)
* **Effort:** Medium (requires dataclass design + testing)
</details>
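A sketch of the proposed table-driven setup; the dataclass fields are assumptions, and `StorageBackend`, the storage classes, and `Settings` are the module's existing imports:

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class BackendSpec:
    backend: "StorageBackend"
    endpoint: str | None
    api_key: str | None
    storage_cls: type["BaseStorage"]

def _backend_specs(settings: "Settings") -> list[BackendSpec]:
    """One entry per backend; adding a backend becomes one line here."""
    return [
        BackendSpec(StorageBackend.WEAVIATE, settings.weaviate_endpoint,
                    settings.weaviate_api_key, WeaviateStorage),
        BackendSpec(StorageBackend.OPEN_WEBUI, settings.openwebui_endpoint,
                    settings.openwebui_api_key, OpenWebUIStorage),
        BackendSpec(StorageBackend.R2R, settings.r2r_endpoint,
                    settings.r2r_api_key, R2RStorage),
    ]
```

Inside `initialize_all_backends()`, a single loop over `_backend_specs(self.settings)` would then build each `StorageConfig` and append the init task, replacing the three copies.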
***
* <details>
<summary>
<b>MEDIUM IMPACT: Repeated Prefect block loading pattern (Lines 266-311)</b>
</summary>
## Multi-Storage Ingestion Refactor Plan
* **File:** `ingest_pipeline/flows/ingestion.py:266-311`
* **Issue:** CONFIRMED - Pattern in `_create_ingestor()` and `_create_storage()` methods
* **Duplication:** `Block.aload()` + fallback logic repeated 4x across both methods
* **Variable Resolution:** Batch size logic (lines 244-255) also needs abstraction
* **Helper Functions Needed:**
- `load_block_with_fallback(block_slug: str, default_config: T) -> T`
- `resolve_prefect_variable(var_name: str, default: T, type_cast: Type[T]) -> T`
* **Impact:** Cleaner flow logic, better error handling, type safety
* **Lines Reduced:** ~20 lines of repetitive code
* **Effort:** Medium (requires generic typing)
</details>
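A sketch of the two helpers; `Block.aload()` matches the pattern already used in the flow, while the exact `Variable.get` signature may vary across Prefect versions:

```python
from collections.abc import Callable
from typing import TypeVar

from prefect.blocks.core import Block
from prefect.variables import Variable

B = TypeVar("B", bound=Block)
V = TypeVar("V")

async def load_block_with_fallback(block_slug: str, default_config: B) -> B:
    """Prefer the configured block; keep the default on any load failure."""
    try:
        block = await Block.aload(block_slug)
        return block if isinstance(block, type(default_config)) else default_config
    except Exception:
        return default_config

def resolve_prefect_variable(name: str, default: V, coerce: Callable[[object], V]) -> V:
    """Read a Prefect variable, coercing it and falling back on errors."""
    try:
        raw = Variable.get(name, default=None)
        return coerce(raw) if raw is not None else default
    except Exception:
        return default
```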
### 0. Guardrails and Baseline
- Activate the virtual environment (`source .venv/bin/activate`) before running any tooling.
- Capture current lint, type, and test status (`uv run basedpyright`, `uv run ruff check`, `uv run pytest`) to compare after the refactor.
- Record the existing ingestion modal behaviour (screenshots or a short `textual run --dev ingest_pipeline/cli/tui` demo) to verify UX parity later.
### 3. User Experience (UX) Enhancements
### 1. Storage Layer Enhancements
- Graduate `MultiStorageAdapter` into `ingest_pipeline/storage/` so it can be reused outside the TUI package.
- Extend `BaseStorage` with a descriptive `display_name` property that downstream UIs can show without hard-coding labels.
- Harden the adapter: aggregate per-backend failures, short-circuit `close()` safely, and surface a structured result containing `success_ids` and `failed_targets`.
- Add `StorageManager.build_multi_adapter(backends: Sequence[StorageBackend])` that returns an initialised adapter (invokes `initialize()` on each child) and memoises singletons for reuse inside the session.
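A sketch of that helper, assuming a small `_adapters` cache dict is added to `StorageManager`:

```python
from collections.abc import Sequence

async def build_multi_adapter(
    self, backends: Sequence[StorageBackend]
) -> MultiStorageAdapter:
    """Return an initialised adapter, memoised per backend combination."""
    key = tuple(sorted(b.value for b in backends))
    if key in self._adapters:  # assumed: dict[tuple[str, ...], MultiStorageAdapter]
        return self._adapters[key]
    storages = [self.backends[b] for b in backends if b in self.backends]
    if not storages:
        raise StorageError("No initialised backends match the requested set")
    adapter = MultiStorageAdapter(storages)
    await adapter.initialize()
    self._adapters[key] = adapter
    return adapter
```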
These are suggestions to make your TUI more powerful, intuitive, and enjoyable for the user.
### 2. Application Wiring
- Refactor `CollectionManagementApp` to accept a `StorageManager` plus optional cached clients, removing direct constructor parameters for Weaviate/OpenWebUI.
- Update all screens (`dashboard.py`, `documents.py`, `search.py`, dialogs) to pull storages through the shared manager instead of owning bespoke references.
- Expose a capability flag (e.g., `StorageCapabilities.REPLICATION`) so the dashboard can badge backends that support multi-target ingestion.
* <details>
<summary>
<b>HIGH IMPACT: Document content viewer modal (Add to documents.py)</b>
</summary>
### 3. Ingestion Modal UX
- Replace the single-backend select with a checkbox group generated from `StorageManager.get_available_backends()`; preserve keyboard shortcuts (`1`, `2`, `3`, plus `ctrl+shift+<n>` for toggling if feasible).
- Default the selection to the collection's current backend but allow "Select All"/"Clear" convenience buttons.
- Persist the latest selection inside a lightweight config file (for example `~/.config/rag-manager/tui.json`) to improve repeated runs.
* **Target File:** `ingest_pipeline/cli/tui/screens/documents.py`
* **Current State:** READY - `DocumentManagementScreen` has table selection (line 212)
* **Implementation:**
- Add `Binding("v", "view_document", "View")` to BINDINGS (line 27)
- Create `DocumentContentModal(ModalScreen)` with `ScrollableContainer` + `Markdown`
- Use existing `get_current_document()` method (line 212)
- Fetch full content via `storage.retrieve(document_id)`
* **Dependencies:** Import `ModalScreen`, `ScrollableContainer`, `Markdown` from textual
* **User Value:** HIGH - essential for content inspection workflow
* **Effort:** Low-Medium (~50 lines of modal code)
* **Pattern:** Follow existing modal patterns in codebase
</details>
### 4. Flow Integration
- Update `IngestionScreen.perform_ingestion()` to build the multi-adapter, pass it to `ingest_documents_task`, and capture per-backend success/failure counts for feed reporting.
- Teach `ingest_pipeline/flows/ingestion.py` helpers to recognise the adapter (inspect for `fanout_targets`) and log progress per backend, while keeping Firecrawl→R2R flow single-target until replication lands there.
- Ensure partial failures propagate as `IngestionStatus.PARTIAL` with an error message enumerating the failing targets.
* <details>
<summary>
<b>HIGH IMPACT: Analytics tab visualization (Lines 164-189)</b>
</summary>
### 5. Feeds, Ticker, and Notifications
- Extend `_generate_activity_text()` to append the backend list (e.g., `→ weaviate + open_webui`) when a multi-target run finishes; see the sketch after this list.
- Add per-backend status lines to the progress ticker so operators know which replication stage is executing.
- Emit granular toast notifications: success summary plus warning toasts for any backend that failed to store documents.
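A minimal sketch of the suffix formatting for `_generate_activity_text()`; the `targets` list is an assumed addition to `CollectionInfo` recording which backends received the run:

```python
def _format_targets(targets: list[str]) -> str:
    """Render the replication suffix, e.g. ' → weaviate + open_webui'."""
    return f" → {' + '.join(targets)}" if len(targets) > 1 else ""
```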
* **Target File:** `ingest_pipeline/cli/tui/screens/dashboard.py:164-189`
* **Current State:** PLACEHOLDER - Static widgets with dummy content
* **Data Source:** Use existing `self.collections` (line 65) populated by `refresh_collections()`
* **Implementation Options:**
1. **Simple Text Chart:** ASCII bar chart using existing collections data
2. **textual-plotext:** Add dependency + bar chart widget
3. **Custom Widget:** Simple bar visualization with Static widgets
* **Metrics to Show:**
- Documents per collection (data available)
- Storage usage per backend (calculated in `_calculate_metrics()`)
- Ingestion timeline (requires timestamp tracking)
* **Effort:** Low-Medium (depends on visualization complexity)
* **Dependencies:** Consider `textual-plotext` or pure ASCII approach
</details>
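A sketch of the pure-ASCII option: dependency-free, and the output drops straight into a Static widget:

```python
def ascii_bar_chart(counts: dict[str, int], width: int = 30) -> str:
    """Render documents-per-collection as a simple bar chart string."""
    if not counts:
        return "No collections yet"
    peak = max(counts.values()) or 1
    label_width = max(len(name) for name in counts)
    rows: list[str] = []
    for name, count in sorted(counts.items(), key=lambda kv: -kv[1]):
        bar = "█" * max(1, round(count / peak * width))
        rows.append(f"{name:<{label_width}} {bar} {count:,}")
    return "\n".join(rows)
```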
### 6. Validation
- Add unit coverage for `MultiStorageAdapter` (full success, partial failure, close semantics) under `ingest_pipeline/tests/storage/`; a test sketch follows this list.
- Create a focused TUI smoke test that opens the ingestion modal, toggles multiple checkboxes, and asserts the resulting progress copy.
- Re-run `uv run basedpyright`, `uv run ruff check`, and the targeted pytest suite before and after changes; address new diagnostics immediately.
- Optionally script a headless `textual run` that simulates ingestion across two mock storages to guard against regressions.
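A hedged test sketch for the partial-failure case; `FakeStorage`, `make_document()`, and the structured result shape are hypothetical until the adapter hardening above lands, and pytest-asyncio is assumed:

```python
import pytest

@pytest.mark.asyncio
async def test_partial_failure_surfaces_failed_targets() -> None:
    good, bad = FakeStorage(), FakeStorage(fail=True)  # hypothetical fakes
    adapter = MultiStorageAdapter([good, bad])
    await adapter.initialize()
    result = await adapter.store_batch([make_document()])  # hypothetical factory
    assert result.success_ids
    assert result.failed_targets == [bad.display_name]
```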
* <details>
<summary>
<b>MEDIUM IMPACT: Global search implementation (Button exists, needs screen)</b>
</summary>
### 7. Documentation and Rollout
- Update this document and `README.md` with refreshed screenshots/GIFs demonstrating multi-backend ingestion.
- Draft release notes covering required configuration (API keys for every backend) and outline rollback instructions (git tag + revert steps).
- Brief support/playbook owners on interpreting the enriched feed/ticker signals so incidents can be triaged quickly.
* **Target File:** `ingest_pipeline/cli/tui/screens/dashboard.py`
* **Current State:** READY - "Search All" button exists (line 122), handler stubbed
* **Backend Support:** `StorageManager.search_across_backends()` method exists (line 413-441)
* **Implementation:**
- Create `GlobalSearchScreen(ModalScreen)` with search input + results table
- Use existing `search_across_backends()` method for data
- Add "Backend" column to results table showing data source
- Handle async search with loading indicators
* **Current Limitation:** Search currently works only for Weaviate (line 563) and needs to be extended
* **Data Flow:** Input → `storage_manager.search_across_backends()` → Results display
* **Effort:** Medium (~100 lines for new screen + search logic)
</details>
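A skeleton of the proposed screen; `search_across_backends()` is the existing manager method cited above, while the column choices and the `self.app.storage_manager` path are assumptions:

```python
from textual.app import ComposeResult
from textual.containers import Container
from textual.screen import ModalScreen
from textual.widgets import DataTable, Input

class GlobalSearchScreen(ModalScreen[None]):
    """Modal search across every initialised backend."""

    def compose(self) -> ComposeResult:
        yield Container(
            Input(placeholder="Search all backends…", id="query"),
            DataTable(id="results"),
        )

    def on_mount(self) -> None:
        self.query_one("#results", DataTable).add_columns("Backend", "Title", "Score")

    async def on_input_submitted(self, event: Input.Submitted) -> None:
        results = await self.app.storage_manager.search_across_backends(event.value)
        table = self.query_one("#results", DataTable)
        table.clear()
        for backend, docs in results.items():
            for doc in docs:
                table.add_row(backend.value, doc.title, f"{doc.score or 0.0:.2f}")
```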
* <details>
<summary>
<b>MEDIUM IMPACT: R2R advanced features integration (Widgets ready)</b>
</summary>
* **Target File:** `ingest_pipeline/cli/tui/screens/documents.py`
* **Available Widgets:** CONFIRMED - `ChunkViewer`, `EntityGraph`, `CollectionStats`, `DocumentOverview` in `r2r_widgets.py`
* **Current Implementation:** Basic document table only, R2R-specific features unused
* **Integration Points:**
- Add "R2R Details" button when `collection["type"] == "r2r"` (conditional UI)
- Create `R2RDocumentDetailsScreen` using existing widgets
- Use `StorageManager.get_r2r_storage()` method (exists at line 442)
* **R2R Methods Available:**
- `get_document_chunks()`, `extract_entities()`, `get_document_overview()`
* **User Value:** Medium-High for R2R users, showcases advanced features
* **Effort:** Low-Medium (widgets exist, need screen integration)
</details>
* <details>
<summary>
<b>LOW IMPACT: Create collection dialog (Backend methods exist)</b>
</summary>
* **Target File:** `ingest_pipeline/cli/tui/screens/dashboard.py`
* **Backend Support:** CONFIRMED - `create_collection()` method exists for R2R storage (line 690)
* **Current State:** No "Create Collection" button in existing UI
* **Implementation:**
- Add "New Collection" button to dashboard action buttons
- Create `CreateCollectionModal` with name input + backend checkboxes
- Iterate over `storage_manager.get_available_backends()` for backend selection
- Call `storage.create_collection()` on selected backends
* **Backend Compatibility:** Check which storage backends support collection creation
* **User Value:** Low-Medium (manual workflow, not critical)
* **Effort:** Low-Medium (~75 lines for modal + integration)
</details>
## Implementation Priority Matrix
### Quick Wins (High Impact, Low Effort)
1. **Delete redundant collection methods** (dashboard.py:356-424) - 5 min
2. **Fix TUI startup blocking** (runners.py:91) - 15 min
3. **Document content viewer modal** (documents.py) - 30 min
### High Impact Fixes (Medium Effort)
1. **R2R batch operation optimization** (storage.py:161-179) - Research R2R v3 API + implementation
2. **Analytics tab visualization** (dashboard.py:164-189) - Choose visualization approach + implement
3. **Backend initialization refactoring** (storage_manager.py:255-291) - Dataclass design + testing
### Technical Debt (Long-term)
1. **R2R client consistency** (storage.py) - SDK analysis + refactoring
2. **Prefect block loading helpers** (ingestion.py:266-311) - Generic typing + testing
3. **URL validation enhancement** (ingestion.py:240-260) - Security + validation logic
### Feature Enhancements (User Value)
1. **Global search implementation** - Medium effort, requires search backend extension
2. **R2R advanced features integration** - Showcase existing widget capabilities
3. **Create collection dialog** - Nice-to-have administrative feature
## Agent Execution Notes
**Context Efficiency Tips:**
- Focus on one priority tier at a time
- Read specific file ranges mentioned in line numbers
- Use existing patterns (worker decorators, modal screens, async methods)
- Test changes incrementally, especially async operations
- Verify import dependencies before implementation
**Architecture Constraints:**
- Maintain async/await patterns throughout
- Follow Textual reactive widget patterns
- Preserve Prefect flow structure for orchestration
- Keep storage backend abstraction intact
The codebase demonstrates excellent architectural foundations; these enhancements build upon existing strengths rather than requiring structural changes.

View File

@@ -206,7 +206,11 @@ class CollectionOverviewScreen(Screen[None]):
"""Calculate basic metrics from collections."""
self.total_collections = len(self.collections)
self.total_documents = sum(col["count"] for col in self.collections)
self.active_backends = sum([bool(self.weaviate), bool(self.openwebui), bool(self.r2r)])
# Calculate active backends from storage manager if individual storages are None
if self.weaviate is None and self.openwebui is None and self.r2r is None:
self.active_backends = len(self.storage_manager.get_available_backends())
else:
self.active_backends = sum([bool(self.weaviate), bool(self.openwebui), bool(self.r2r)])
def _update_metrics_cards(self) -> None:
"""Update the metrics cards display."""
@@ -353,75 +357,6 @@ class CollectionOverviewScreen(Screen[None]):
self.is_loading = False
loading_indicator.display = False
async def list_weaviate_collections(self) -> list[CollectionInfo]:
"""List Weaviate collections with enhanced metadata."""
if not self.weaviate:
return []
try:
overview = await self.weaviate.describe_collections()
collections: list[CollectionInfo] = []
for item in overview:
count_raw = item.get("count", 0)
count_val = int(count_raw) if isinstance(count_raw, (int, str)) else 0
size_mb_raw = item.get("size_mb", 0.0)
size_mb_val = float(size_mb_raw) if isinstance(size_mb_raw, (int, float, str)) else 0.0
collections.append(
CollectionInfo(
name=str(item.get("name", "Unknown")),
type="weaviate",
count=count_val,
backend="🗄️ Weaviate",
status="✓ Active",
last_updated=datetime.now().strftime("%Y-%m-%d %H:%M"),
size_mb=size_mb_val,
)
)
return collections
except Exception as e:
self.notify(f"Error listing Weaviate collections: {e}", severity="error", markup=False)
return []
async def list_openwebui_collections(self) -> list[CollectionInfo]:
"""List OpenWebUI collections with enhanced metadata."""
# Try to get OpenWebUI backend from storage manager if direct instance not available
openwebui_backend = self.openwebui
if not openwebui_backend:
backend = self.storage_manager.get_backend(StorageBackend.OPEN_WEBUI)
if not isinstance(backend, OpenWebUIStorage):
return []
openwebui_backend = backend
if not openwebui_backend:
return []
try:
overview = await openwebui_backend.describe_collections()
collections: list[CollectionInfo] = []
for item in overview:
count_raw = item.get("count", 0)
count_val = int(count_raw) if isinstance(count_raw, (int, str)) else 0
size_mb_raw = item.get("size_mb", 0.0)
size_mb_val = float(size_mb_raw) if isinstance(size_mb_raw, (int, float, str)) else 0.0
collection_name = str(item.get("name", "Unknown"))
collections.append(
CollectionInfo(
name=collection_name,
type="openwebui",
count=count_val,
backend="🌐 OpenWebUI",
status="✓ Active",
last_updated=datetime.now().strftime("%Y-%m-%d %H:%M"),
size_mb=size_mb_val,
)
)
return collections
except Exception as e:
self.notify(f"Error listing OpenWebUI collections: {e}", severity="error", markup=False)
return []
async def update_collections_table(self) -> None:
"""Update the collections table with enhanced formatting."""

View File

@@ -3,7 +3,7 @@
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING, ClassVar
from typing import TYPE_CHECKING
from textual.app import ComposeResult
from textual.binding import Binding
@@ -15,6 +15,7 @@ from typing_extensions import override
from ..models import CollectionInfo
if TYPE_CHECKING:
from ..app import CollectionManagementApp
from .dashboard import CollectionOverviewScreen
from .documents import DocumentManagementScreen
@@ -25,7 +26,12 @@ class ConfirmDeleteScreen(Screen[None]):
collection: CollectionInfo
parent_screen: CollectionOverviewScreen
BINDINGS: list[Binding] = [
@property
def app(self) -> CollectionManagementApp: # type: ignore[override]
"""Return the typed app instance."""
return super().app # type: ignore[return-value]
BINDINGS = [
Binding("escape", "app.pop_screen", "Cancel"),
Binding("y", "confirm_delete", "Yes"),
Binding("n", "app.pop_screen", "No"),
@@ -132,12 +138,16 @@ class ConfirmDeleteScreen(Screen[None]):
return
# Refresh parent screen after a short delay to ensure deletion is processed
self.call_later(lambda _: self.parent_screen.refresh_collections(), 0.5) # 500ms delay
self.call_later(self._refresh_parent_collections, 0.5) # 500ms delay
self.app.pop_screen()
except Exception as e:
self.notify(f"Failed to delete collection: {e}", severity="error", markup=False)
def _refresh_parent_collections(self) -> None:
"""Helper method to refresh parent collections."""
self.parent_screen.refresh_collections()
class ConfirmDocumentDeleteScreen(Screen[None]):
@@ -145,9 +155,14 @@ class ConfirmDocumentDeleteScreen(Screen[None]):
doc_ids: list[str]
collection: CollectionInfo
parent_screen: "DocumentManagementScreen"
parent_screen: DocumentManagementScreen
BINDINGS: list[Binding] = [
@property
def app(self) -> CollectionManagementApp: # type: ignore[override]
"""Return the typed app instance."""
return super().app # type: ignore[return-value]
BINDINGS = [
Binding("escape", "app.pop_screen", "Cancel"),
Binding("y", "confirm_delete", "Yes"),
Binding("n", "app.pop_screen", "No"),
@@ -158,7 +173,7 @@ class ConfirmDocumentDeleteScreen(Screen[None]):
self,
doc_ids: list[str],
collection: CollectionInfo,
parent_screen: "DocumentManagementScreen",
parent_screen: DocumentManagementScreen,
):
super().__init__()
self.doc_ids = doc_ids
@@ -244,7 +259,12 @@ class LogViewerScreen(ModalScreen[None]):
_log_widget: RichLog | None
_log_file: Path | None
BINDINGS: list[Binding] = [
@property
def app(self) -> CollectionManagementApp: # type: ignore[override]
"""Return the typed app instance."""
return super().app # type: ignore[return-value]
BINDINGS = [
Binding("escape", "close", "Close"),
Binding("ctrl+l", "close", "Close"),
Binding("s", "show_path", "Log File"),
@@ -272,13 +292,13 @@ class LogViewerScreen(ModalScreen[None]):
self._log_widget = self.query_one(RichLog)
if hasattr(self.app, 'attach_log_viewer'):
self.app.attach_log_viewer(self)
self.app.attach_log_viewer(self) # type: ignore[arg-type]
def on_unmount(self) -> None:
"""Detach from the parent application when closed."""
if hasattr(self.app, 'detach_log_viewer'):
self.app.detach_log_viewer(self)
self.app.detach_log_viewer(self) # type: ignore[arg-type]
def _get_log_widget(self) -> RichLog:
if self._log_widget is None:

View File

@@ -4,9 +4,9 @@ from datetime import datetime
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Horizontal
from textual.screen import Screen
from textual.widgets import Button, Footer, Header, Label, LoadingIndicator, Static
from textual.containers import Container, Horizontal, ScrollableContainer
from textual.screen import ModalScreen, Screen
from textual.widgets import Button, Footer, Header, Label, LoadingIndicator, Markdown, Static
from typing_extensions import override
from ....storage.base import BaseStorage
@@ -27,6 +27,7 @@ class DocumentManagementScreen(Screen[None]):
BINDINGS = [
Binding("escape", "app.pop_screen", "Back"),
Binding("r", "refresh", "Refresh"),
Binding("v", "view_document", "View"),
Binding("delete", "delete_selected", "Delete Selected"),
Binding("a", "select_all", "Select All"),
Binding("ctrl+a", "select_all", "Select All"),
@@ -324,3 +325,112 @@ class DocumentManagementScreen(Screen[None]):
) -> None:
"""Handle clear selection from enhanced table."""
self.action_select_none()
def action_view_document(self) -> None:
"""View the content of the currently selected document."""
if doc := self.get_current_document():
if self.storage:
self.app.push_screen(DocumentContentModal(doc, self.storage, self.collection["name"]))
else:
self.notify("No storage backend available", severity="error")
else:
self.notify("No document selected", severity="warning")
class DocumentContentModal(ModalScreen[None]):
"""Modal screen for viewing document content."""
DEFAULT_CSS = """
DocumentContentModal {
align: center middle;
}
DocumentContentModal > Container {
width: 90%;
height: 85%;
background: $surface;
border: thick $primary;
}
DocumentContentModal .modal-header {
background: $primary;
color: $text;
padding: 1;
dock: top;
height: 3;
}
DocumentContentModal .modal-content {
padding: 1;
height: 1fr;
}
"""
BINDINGS = [
Binding("escape", "app.pop_screen", "Close"),
Binding("q", "app.pop_screen", "Close"),
]
def __init__(self, document: DocumentInfo, storage: BaseStorage, collection_name: str):
super().__init__()
self.document = document
self.storage = storage
self.collection_name = collection_name
def compose(self) -> ComposeResult:
yield Container(
Static(
f"📄 Document: {self.document['title'][:60]}{'...' if len(self.document['title']) > 60 else ''}",
classes="modal-header"
),
ScrollableContainer(
Markdown("Loading document content...", id="document_content"),
LoadingIndicator(id="content_loading"),
classes="modal-content"
)
)
async def on_mount(self) -> None:
"""Load and display the document content."""
content_widget = self.query_one("#document_content", Markdown)
loading = self.query_one("#content_loading")
try:
# Get full document content
doc_content = await self.storage.retrieve(
self.document["id"],
collection_name=self.collection_name
)
# Format content for display
if isinstance(doc_content, str):
formatted_content = f"""# {self.document['title']}
**Source:** {self.document.get('source_url', 'N/A')}
**Type:** {self.document.get('content_type', 'text/plain')}
**Words:** {self.document.get('word_count', 0):,}
**Timestamp:** {self.document.get('timestamp', 'N/A')}
---
{doc_content}
"""
else:
formatted_content = f"""# {self.document['title']}
**Source:** {self.document.get('source_url', 'N/A')}
**Type:** {self.document.get('content_type', 'text/plain')}
**Words:** {self.document.get('word_count', 0):,}
**Timestamp:** {self.document.get('timestamp', 'N/A')}
---
*Content format not supported for display*
"""
content_widget.update(formatted_content)
except Exception as e:
content_widget.update(f"# Error Loading Document\n\nFailed to load document content: {e}")
finally:
loading.display = False

View File

@@ -2,7 +2,20 @@
from dataclasses import dataclass
from enum import Enum
from typing import Any
from typing import Protocol
from textual.app import App
# Type alias for Textual apps with unknown return type
TextualApp = App[object]
class AppProtocol(Protocol):
"""Protocol for apps that support CSS and refresh."""
def refresh(self) -> None:
"""Refresh the app."""
...
class ThemeType(Enum):
@@ -181,8 +194,8 @@ class ThemeManager:
"""Manages theme selection and CSS generation."""
def __init__(self, default_theme: ThemeType = ThemeType.DARK):
self.current_theme = default_theme
self._themes = {
self.current_theme: ThemeType = default_theme
self._themes: dict[ThemeType, ColorPalette] = {
ThemeType.DARK: ThemeRegistry.get_enhanced_dark(),
ThemeType.LIGHT: ThemeRegistry.get_light(),
ThemeType.HIGH_CONTRAST: ThemeRegistry.get_high_contrast(),
@@ -1106,18 +1119,16 @@ def get_css_for_theme(theme_type: ThemeType) -> str:
return css
def apply_theme_to_app(app: object, theme_type: ThemeType) -> None:
def apply_theme_to_app(app: TextualApp | AppProtocol, theme_type: ThemeType) -> None:
"""Apply a theme to a Textual app instance."""
try:
css = set_theme(theme_type)
if hasattr(app, "stylesheet"):
app.stylesheet.clear()
app.stylesheet.parse(css)
elif hasattr(app, "CSS"):
# Set CSS using the standard Textual approach
if hasattr(app, "CSS") or isinstance(app, App):
setattr(app, "CSS", css)
elif hasattr(app, "refresh"):
# Fallback: try to refresh the app with new CSS
app.refresh()
# Refresh the app to apply new CSS
if hasattr(app, "refresh"):
app.refresh()
except Exception as e:
# Graceful fallback - log but don't crash the UI
import logging
@@ -1127,9 +1138,9 @@ def apply_theme_to_app(app: object, theme_type: ThemeType) -> None:
class ThemeSwitcher:
"""Helper class for managing theme switching in TUI applications."""
def __init__(self, app: object | None = None) -> None:
self.app = app
self.theme_history = [ThemeType.DARK]
def __init__(self, app: TextualApp | AppProtocol | None = None) -> None:
self.app: TextualApp | AppProtocol | None = app
self.theme_history: list[ThemeType] = [ThemeType.DARK]
def switch_theme(self, theme_type: ThemeType) -> str:
"""Switch to a new theme and apply it to the app if available."""
@@ -1157,7 +1168,7 @@ class ThemeSwitcher:
next_theme = themes[(current_index + 1) % len(themes)]
return self.switch_theme(next_theme)
def get_theme_info(self) -> dict[str, Any]:
def get_theme_info(self) -> dict[str, str | list[str] | dict[str, str]]:
"""Get information about the current theme."""
palette = get_theme_palette()
return {

View File

@@ -86,49 +86,18 @@ async def run_textual_tui() -> None:
LOGGER.info("Initializing collection management TUI")
LOGGER.info("Scanning available storage backends")
# Initialize storage manager
# Create storage manager without initialization - let TUI handle it asynchronously
storage_manager = StorageManager(settings)
backend_status = await storage_manager.initialize_all_backends()
# Report initialization results
for backend, success in backend_status.items():
if success:
LOGGER.info("%s connected successfully", backend.value)
else:
LOGGER.warning("%s connection failed", backend.value)
available_backends = storage_manager.get_available_backends()
if not available_backends:
LOGGER.error("Could not connect to any storage backend")
LOGGER.info("Please check your configuration and try again")
LOGGER.info("Supported backends: Weaviate, OpenWebUI, R2R")
return
LOGGER.info(
"Launching TUI with %d backend(s): %s",
len(available_backends),
", ".join(backend.value for backend in available_backends),
)
# Get individual storage instances for backward compatibility
from ....storage.openwebui import OpenWebUIStorage
from ....storage.weaviate import WeaviateStorage
weaviate_backend = storage_manager.get_backend(StorageBackend.WEAVIATE)
openwebui_backend = storage_manager.get_backend(StorageBackend.OPEN_WEBUI)
r2r_backend = storage_manager.get_backend(StorageBackend.R2R)
# Type-safe casting to specific storage types
weaviate = weaviate_backend if isinstance(weaviate_backend, WeaviateStorage) else None
openwebui = openwebui_backend if isinstance(openwebui_backend, OpenWebUIStorage) else None
LOGGER.info("Launching TUI - storage backends will initialize in background")
# Import here to avoid circular import
from ..app import CollectionManagementApp
app = CollectionManagementApp(
storage_manager,
weaviate,
openwebui,
r2r_backend,
None, # weaviate - will be available after initialization
None, # openwebui - will be available after initialization
None, # r2r_backend - will be available after initialization
log_queue=logging_context.queue,
log_formatter=logging_context.formatter,
log_file=logging_context.log_file,

View File

@@ -4,9 +4,11 @@
from __future__ import annotations
import asyncio
from collections.abc import AsyncGenerator, Sequence
from collections.abc import AsyncGenerator, Coroutine, Sequence
from typing import TYPE_CHECKING, Protocol
from pydantic import SecretStr
from ....core.exceptions import StorageError
from ....core.models import Document, StorageBackend, StorageConfig
from ..models import CollectionInfo, StorageCapabilities
@@ -54,8 +56,8 @@ class MultiStorageAdapter(BaseStorage):
seen_ids.add(storage_id)
unique.append(storage)
self._storages = unique
self._primary = unique[0]
self._storages: list[BaseStorage] = unique
self._primary: BaseStorage = unique[0]
super().__init__(self._primary.config)
async def initialize(self) -> None:
@@ -226,10 +228,10 @@ class StorageManager:
def __init__(self, settings: Settings) -> None:
"""Initialize storage manager with application settings."""
self.settings = settings
self.settings: Settings = settings
self.backends: dict[StorageBackend, BaseStorage] = {}
self.capabilities: dict[StorageBackend, StorageCapabilities] = {}
self._initialized = False
self._initialized: bool = False
async def initialize_all_backends(self) -> dict[StorageBackend, bool]:
"""Initialize all available storage backends with timeout protection."""
@@ -252,14 +254,14 @@ class StorageManager:
return False
# Initialize backends concurrently with timeout protection
tasks = []
tasks: list[tuple[StorageBackend, Coroutine[None, None, bool]]] = []
# Try Weaviate
if self.settings.weaviate_endpoint:
config = StorageConfig(
backend=StorageBackend.WEAVIATE,
endpoint=self.settings.weaviate_endpoint,
api_key=self.settings.weaviate_api_key,
api_key=SecretStr(self.settings.weaviate_api_key) if self.settings.weaviate_api_key else None,
collection_name="default",
)
tasks.append((StorageBackend.WEAVIATE, init_backend(StorageBackend.WEAVIATE, config, WeaviateStorage)))
@@ -271,7 +273,7 @@ class StorageManager:
config = StorageConfig(
backend=StorageBackend.OPEN_WEBUI,
endpoint=self.settings.openwebui_endpoint,
api_key=self.settings.openwebui_api_key,
api_key=SecretStr(self.settings.openwebui_api_key) if self.settings.openwebui_api_key else None,
collection_name="default",
)
tasks.append((StorageBackend.OPEN_WEBUI, init_backend(StorageBackend.OPEN_WEBUI, config, OpenWebUIStorage)))
@@ -283,7 +285,7 @@ class StorageManager:
config = StorageConfig(
backend=StorageBackend.R2R,
endpoint=self.settings.r2r_endpoint,
api_key=self.settings.r2r_api_key,
api_key=SecretStr(self.settings.r2r_api_key) if self.settings.r2r_api_key else None,
collection_name="default",
)
tasks.append((StorageBackend.R2R, init_backend(StorageBackend.R2R, config, R2RStorage)))
@@ -293,7 +295,7 @@ class StorageManager:
# Execute initialization tasks concurrently
if tasks:
backend_types, task_coroutines = zip(*tasks, strict=False)
task_results = await asyncio.gather(*task_coroutines, return_exceptions=True)
task_results: Sequence[bool | BaseException] = await asyncio.gather(*task_coroutines, return_exceptions=True)
for backend_type, task_result in zip(backend_types, task_results, strict=False):
results[backend_type] = task_result if isinstance(task_result, bool) else False
@@ -426,7 +428,7 @@ class StorageManager:
storage = self.backends.get(backend_type)
if storage:
try:
documents = []
documents: list[Document] = []
async for doc in storage.search(query, limit=limit):
documents.append(doc)
results[backend_type] = documents
@@ -455,7 +457,7 @@ class StorageManager:
for collection in collections:
total_docs += await storage.count(collection_name=collection)
backend_status = {
backend_status: dict[str, str | int | bool | StorageCapabilities] = {
"available": True,
"collections": len(collections),
"total_documents": total_docs,

View File

@@ -1,7 +1,7 @@
"""Application settings and configuration."""
from functools import lru_cache
from typing import Annotated, ClassVar, Literal
from typing import Annotated, ClassVar, Final, Literal
from prefect.variables import Variable
from pydantic import Field, HttpUrl, model_validator
@@ -20,6 +20,8 @@ class Settings(BaseSettings):
# API Keys
firecrawl_api_key: str | None = None
llm_api_key: str | None = None
openai_api_key: str | None = None
openwebui_api_key: str | None = None
weaviate_api_key: str | None = None
r2r_api_key: str | None = None
@@ -33,6 +35,7 @@ class Settings(BaseSettings):
# Model Configuration
embedding_model: str = "ollama/bge-m3:latest"
metadata_model: str = "fireworks/glm-4p5-air"
embedding_dimension: int = 1024
# Ingestion Settings
@@ -100,14 +103,20 @@ class Settings(BaseSettings):
Returns:
API key or None
"""
service_map = {
service_map: Final[dict[str, str | None]] = {
"firecrawl": self.firecrawl_api_key,
"openwebui": self.openwebui_api_key,
"weaviate": self.weaviate_api_key,
"r2r": self.r2r_api_key,
"llm": self.get_llm_api_key(),
"openai": self.openai_api_key,
}
return service_map.get(service)
def get_llm_api_key(self) -> str | None:
"""Get API key for LLM services with OpenAI fallback."""
return self.llm_api_key or self.openai_api_key
@model_validator(mode="after")
def validate_backend_configuration(self) -> "Settings":
"""Validate that required configuration is present for the default backend."""

View File

@@ -8,6 +8,40 @@ from uuid import UUID, uuid4
from prefect.blocks.core import Block
from pydantic import BaseModel, Field, HttpUrl, SecretStr
from ..config import get_settings
def _default_embedding_model() -> str:
return get_settings().embedding_model
def _default_embedding_endpoint() -> HttpUrl:
return get_settings().llm_endpoint
def _default_embedding_dimension() -> int:
return get_settings().embedding_dimension
def _default_batch_size() -> int:
return get_settings().default_batch_size
def _default_collection_name() -> str:
return get_settings().default_collection_prefix
def _default_max_crawl_depth() -> int:
return get_settings().max_crawl_depth
def _default_max_crawl_pages() -> int:
return get_settings().max_crawl_pages
def _default_max_file_size() -> int:
return get_settings().max_file_size
class IngestionStatus(str, Enum):
"""Status of an ingestion job."""
@@ -39,36 +73,36 @@ class IngestionSource(str, Enum):
class VectorConfig(BaseModel):
"""Configuration for vectorization."""
model: str = Field(default="ollama/bge-m3:latest")
embedding_endpoint: HttpUrl = Field(default=HttpUrl("http://llm.lab"))
dimension: int = Field(default=1024)
batch_size: Annotated[int, Field(gt=0, le=1000)] = 100
model: str = Field(default_factory=_default_embedding_model)
embedding_endpoint: HttpUrl = Field(default_factory=_default_embedding_endpoint)
dimension: int = Field(default_factory=_default_embedding_dimension)
batch_size: Annotated[int, Field(gt=0, le=1000)] = Field(default_factory=_default_batch_size)
class StorageConfig(Block):
"""Configuration for storage backend."""
_block_type_name: ClassVar[str] = "Storage Configuration"
_block_type_slug: ClassVar[str] = "storage-config"
_description: ClassVar[str] = "Configures storage backend connections and settings for document ingestion"
_block_type_name: ClassVar[str | None] = "Storage Configuration"
_block_type_slug: ClassVar[str | None] = "storage-config"
_description: ClassVar[str | None] = "Configures storage backend connections and settings for document ingestion"
backend: StorageBackend
endpoint: HttpUrl
api_key: SecretStr | None = Field(default=None)
collection_name: str = Field(default="documents")
batch_size: Annotated[int, Field(gt=0, le=1000)] = 100
collection_name: str = Field(default_factory=_default_collection_name)
batch_size: Annotated[int, Field(gt=0, le=1000)] = Field(default_factory=_default_batch_size)
class FirecrawlConfig(Block):
"""Configuration for Firecrawl ingestion (operational parameters only)."""
_block_type_name: ClassVar[str] = "Firecrawl Configuration"
_block_type_slug: ClassVar[str] = "firecrawl-config"
_description: ClassVar[str] = "Configures Firecrawl web scraping and crawling parameters"
_block_type_name: ClassVar[str | None] = "Firecrawl Configuration"
_block_type_slug: ClassVar[str | None] = "firecrawl-config"
_description: ClassVar[str | None] = "Configures Firecrawl web scraping and crawling parameters"
formats: list[str] = Field(default_factory=lambda: ["markdown", "html"])
max_depth: Annotated[int, Field(ge=1, le=20)] = 5
limit: Annotated[int, Field(ge=1, le=1000)] = 100
max_depth: Annotated[int, Field(ge=1, le=20)] = Field(default_factory=_default_max_crawl_depth)
limit: Annotated[int, Field(ge=1, le=1000)] = Field(default_factory=_default_max_crawl_pages)
only_main_content: bool = Field(default=True)
include_subdomains: bool = Field(default=False)
@@ -76,9 +110,9 @@ class FirecrawlConfig(Block):
class RepomixConfig(Block):
"""Configuration for Repomix ingestion."""
_block_type_name: ClassVar[str] = "Repomix Configuration"
_block_type_slug: ClassVar[str] = "repomix-config"
_description: ClassVar[str] = "Configures repository ingestion patterns and file processing settings"
_block_type_name: ClassVar[str | None] = "Repomix Configuration"
_block_type_slug: ClassVar[str | None] = "repomix-config"
_description: ClassVar[str | None] = "Configures repository ingestion patterns and file processing settings"
include_patterns: list[str] = Field(
default_factory=lambda: ["*.py", "*.js", "*.ts", "*.md", "*.yaml", "*.json"]
@@ -86,16 +120,16 @@ class RepomixConfig(Block):
exclude_patterns: list[str] = Field(
default_factory=lambda: ["**/node_modules/**", "**/__pycache__/**", "**/.git/**"]
)
max_file_size: int = Field(default=1_000_000) # 1MB
max_file_size: int = Field(default_factory=_default_max_file_size) # 1MB
respect_gitignore: bool = Field(default=True)
class R2RConfig(Block):
"""Configuration for R2R ingestion."""
_block_type_name: ClassVar[str] = "R2R Configuration"
_block_type_slug: ClassVar[str] = "r2r-config"
_description: ClassVar[str] = "Configures R2R-specific ingestion settings including chunking and graph enrichment"
_block_type_name: ClassVar[str | None] = "R2R Configuration"
_block_type_slug: ClassVar[str | None] = "r2r-config"
_description: ClassVar[str | None] = "Configures R2R-specific ingestion settings including chunking and graph enrichment"
chunk_size: Annotated[int, Field(ge=100, le=8192)] = 1000
chunk_overlap: Annotated[int, Field(ge=0, le=1000)] = 200
@@ -168,7 +202,7 @@ class Document(BaseModel):
vector: list[float] | None = Field(default=None)
score: float | None = Field(default=None)
source: IngestionSource
collection: str = Field(default="documents")
collection: str = Field(default_factory=_default_collection_name)
class IngestionJob(BaseModel):

View File

@@ -3,8 +3,8 @@
from datetime import timedelta
from typing import Literal, Protocol, cast
from prefect import serve
from prefect.deployments.runner import RunnerDeployment
from prefect.flows import serve as prefect_serve
from prefect.schedules import Cron, Interval
from prefect.variables import Variable
@@ -82,7 +82,7 @@ def create_scheduled_deployment(
tags = [source_enum.value, backend_enum.value]
# Create deployment parameters with block support
parameters = {
parameters: dict[str, str | bool] = {
"source_url": source_url,
"source_type": source_enum.value,
"storage_backend": backend_enum.value,
@@ -97,8 +97,8 @@ def create_scheduled_deployment(
# Create deployment
# The flow decorator adds the to_deployment method at runtime
to_deployment = create_ingestion_flow.to_deployment
deployment = to_deployment(
flow_with_deployment = cast(FlowWithDeployment, create_ingestion_flow)
return flow_with_deployment.to_deployment(
name=name,
schedule=schedule,
parameters=parameters,
@@ -106,8 +106,6 @@ def create_scheduled_deployment(
description=f"Scheduled ingestion from {source_url}",
)
return cast("RunnerDeployment", deployment)
def serve_deployments(deployments: list[RunnerDeployment]) -> None:
"""
@@ -116,4 +114,4 @@ def serve_deployments(deployments: list[RunnerDeployment]) -> None:
Args:
deployments: List of deployment configurations
"""
serve(*deployments, limit=10)
prefect_serve(*deployments, limit=10)

View File

@@ -6,7 +6,7 @@ import re
from collections.abc import AsyncGenerator, Awaitable, Callable
from dataclasses import dataclass
from datetime import UTC, datetime
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Protocol, cast
from urllib.parse import urlparse
from uuid import NAMESPACE_URL, UUID, uuid5
@@ -28,9 +28,70 @@ if TYPE_CHECKING:
from ..storage.base import BaseStorage
class FirecrawlMetadata(Protocol):
"""Protocol for Firecrawl metadata objects."""
title: str | None
description: str | None
author: str | None
language: str | None
sitemap_last_modified: str | None
sourceURL: str | None
keywords: str | list[str] | None
robots: str | None
ogTitle: str | None
ogDescription: str | None
ogUrl: str | None
ogImage: str | None
twitterCard: str | None
twitterSite: str | None
twitterCreator: str | None
favicon: str | None
statusCode: int | None
class FirecrawlResult(Protocol):
"""Protocol for Firecrawl scrape result objects."""
metadata: FirecrawlMetadata | None
markdown: str | None
class FirecrawlMapLink(Protocol):
"""Protocol for Firecrawl map link objects."""
url: str
class FirecrawlMapResult(Protocol):
"""Protocol for Firecrawl map result objects."""
links: list[FirecrawlMapLink] | None
class AsyncFirecrawlSession(Protocol):
"""Protocol for AsyncFirecrawl session objects."""
async def close(self) -> None: ...
class AsyncFirecrawlClient(Protocol):
"""Protocol for AsyncFirecrawl client objects."""
_session: AsyncFirecrawlSession | None
async def close(self) -> None: ...
async def scrape(self, url: str, formats: list[str]) -> FirecrawlResult: ...
async def map(self, url: str, limit: int | None = None) -> "FirecrawlMapResult": ...
class FirecrawlError(IngestionError):
"""Base exception for Firecrawl-related errors."""
status_code: int | None
def __init__(self, message: str, status_code: int | None = None) -> None:
super().__init__(message)
self.status_code = status_code
@@ -64,7 +125,7 @@ async def retry_with_backoff(
except Exception as e:
if attempt == max_retries - 1:
raise e
delay = 1.0 * (2**attempt)
delay: float = 1.0 * (2**attempt)
logging.warning(
f"Firecrawl operation failed (attempt {attempt + 1}/{max_retries}): {e}. Retrying in {delay:.1f}s..."
)
@@ -104,7 +165,7 @@ class FirecrawlIngestor(BaseIngestor):
"""Ingestor for web and documentation sites using Firecrawl."""
config: FirecrawlConfig
client: AsyncFirecrawl
client: AsyncFirecrawlClient
def __init__(self, config: FirecrawlConfig | None = None):
"""
@@ -130,15 +191,15 @@ class FirecrawlIngestor(BaseIngestor):
"http://localhost"
):
# Self-hosted instance - try with api_url if supported
self.client = AsyncFirecrawl(
self.client = cast(AsyncFirecrawlClient, AsyncFirecrawl(
api_key=api_key, api_url=str(settings.firecrawl_endpoint)
)
))
else:
# Cloud instance - use standard initialization
self.client = AsyncFirecrawl(api_key=api_key)
self.client = cast(AsyncFirecrawlClient, AsyncFirecrawl(api_key=api_key))
except Exception:
# Fallback to standard initialization
self.client = AsyncFirecrawl(api_key=api_key)
self.client = cast(AsyncFirecrawlClient, AsyncFirecrawl(api_key=api_key))
@override
async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
@@ -277,11 +338,11 @@ class FirecrawlIngestor(BaseIngestor):
"""
try:
# Use SDK v2 map endpoint following official pattern
result = await self.client.map(url=url, limit=self.config.limit)
result: FirecrawlMapResult = await self.client.map(url=url, limit=self.config.limit)
if result and getattr(result, "links", None):
if result and result.links:
# Extract URLs from the result following official pattern
return [getattr(link, "url", str(link)) for link in result.links]
return [link.url for link in result.links]
return []
except Exception as e:
# If map fails (might not be available in all versions), fall back to single URL
@@ -324,43 +385,43 @@ class FirecrawlIngestor(BaseIngestor):
try:
# Use SDK v2 scrape endpoint following official pattern with retry
async def scrape_operation() -> FirecrawlPage | None:
result = await self.client.scrape(url, formats=self.config.formats)
result: FirecrawlResult = await self.client.scrape(url, formats=self.config.formats)
# Extract data from the result following official response handling
if result:
# The SDK returns a ScrapeData object with typed metadata
metadata = getattr(result, "metadata", None)
metadata: FirecrawlMetadata | None = getattr(result, "metadata", None)
# Extract basic metadata
title = getattr(metadata, "title", None) if metadata else None
description = getattr(metadata, "description", None) if metadata else None
title: str | None = getattr(metadata, "title", None) if metadata else None
description: str | None = getattr(metadata, "description", None) if metadata else None
# Extract enhanced metadata if available
author = getattr(metadata, "author", None) if metadata else None
language = getattr(metadata, "language", None) if metadata else None
sitemap_last_modified = (
author: str | None = getattr(metadata, "author", None) if metadata else None
language: str | None = getattr(metadata, "language", None) if metadata else None
sitemap_last_modified: str | None = (
getattr(metadata, "sitemap_last_modified", None) if metadata else None
)
source_url = getattr(metadata, "sourceURL", None) if metadata else None
keywords = getattr(metadata, "keywords", None) if metadata else None
robots = getattr(metadata, "robots", None) if metadata else None
source_url: str | None = getattr(metadata, "sourceURL", None) if metadata else None
keywords: str | list[str] | None = getattr(metadata, "keywords", None) if metadata else None
robots: str | None = getattr(metadata, "robots", None) if metadata else None
# Open Graph metadata
og_title = getattr(metadata, "ogTitle", None) if metadata else None
og_description = getattr(metadata, "ogDescription", None) if metadata else None
og_url = getattr(metadata, "ogUrl", None) if metadata else None
og_image = getattr(metadata, "ogImage", None) if metadata else None
og_title: str | None = getattr(metadata, "ogTitle", None) if metadata else None
og_description: str | None = getattr(metadata, "ogDescription", None) if metadata else None
og_url: str | None = getattr(metadata, "ogUrl", None) if metadata else None
og_image: str | None = getattr(metadata, "ogImage", None) if metadata else None
# Twitter metadata
twitter_card = getattr(metadata, "twitterCard", None) if metadata else None
twitter_site = getattr(metadata, "twitterSite", None) if metadata else None
twitter_creator = (
twitter_card: str | None = getattr(metadata, "twitterCard", None) if metadata else None
twitter_site: str | None = getattr(metadata, "twitterSite", None) if metadata else None
twitter_creator: str | None = (
getattr(metadata, "twitterCreator", None) if metadata else None
)
# Additional metadata
favicon = getattr(metadata, "favicon", None) if metadata else None
status_code = getattr(metadata, "statusCode", None) if metadata else None
favicon: str | None = getattr(metadata, "favicon", None) if metadata else None
status_code: int | None = getattr(metadata, "statusCode", None) if metadata else None
return FirecrawlPage(
url=url,
@@ -373,7 +434,7 @@ class FirecrawlIngestor(BaseIngestor):
source_url=source_url,
keywords=keywords.split(",")
if keywords and isinstance(keywords, str)
else keywords,
else (keywords if isinstance(keywords, list) else None),
robots=robots,
og_title=og_title,
og_description=og_description,
@@ -399,11 +460,11 @@ class FirecrawlIngestor(BaseIngestor):
return uuid5(NAMESPACE_URL, source_url)
@staticmethod
def _analyze_content_structure(content: str) -> dict[str, object]:
def _analyze_content_structure(content: str) -> dict[str, str | int | bool | list[str]]:
"""Analyze markdown content to extract structural information."""
# Extract heading hierarchy
heading_pattern = r"^(#{1,6})\s+(.+)$"
headings = []
headings: list[str] = []
for match in re.finditer(heading_pattern, content, re.MULTILINE):
level = len(match.group(1))
text = match.group(2).strip()
@@ -418,7 +479,8 @@ class FirecrawlIngestor(BaseIngestor):
max_depth = 0
if headings:
for heading in headings:
depth = (len(heading) - len(heading.lstrip())) // 2 + 1
heading_str: str = str(heading)
depth = (len(heading_str) - len(heading_str.lstrip())) // 2 + 1
max_depth = max(max_depth, depth)
return {
@@ -570,7 +632,7 @@ class FirecrawlIngestor(BaseIngestor):
await self.client.close()
except Exception as e:
logging.debug(f"Error closing Firecrawl client: {e}")
elif hasattr(self.client, "_session") and hasattr(self.client._session, "close"):
elif hasattr(self.client, "_session") and self.client._session and hasattr(self.client._session, "close"):
try:
await self.client._session.close()
except Exception as e:
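For reference, a minimal sketch of the map-then-scrape flow the ingestor implements, assuming the same `AsyncFirecrawl` client and v2 `map`/`scrape` calls shown above; the `markdown` attribute on the scrape result, the limit, and the format list are illustrative assumptions, not confirmed SDK details:

```python
# Sketch of FirecrawlIngestor's discovery flow: map the site for URLs,
# then scrape each one. Mirrors the client.map()/client.scrape() calls
# in the diff; scraped.markdown is an assumed response attribute.
import asyncio

from firecrawl import AsyncFirecrawl


async def discover_and_scrape(api_key: str, root_url: str) -> dict[str, str]:
    client = AsyncFirecrawl(api_key=api_key)
    pages: dict[str, str] = {}

    # Step 1: map the site to enumerate candidate URLs, falling back to
    # the root URL when the map endpoint fails, as the ingestor does.
    try:
        result = await client.map(url=root_url, limit=10)
        urls = [link.url for link in result.links] if result and result.links else [root_url]
    except Exception:
        urls = [root_url]

    # Step 2: scrape each discovered URL as markdown.
    for url in urls:
        scraped = await client.scrape(url, formats=["markdown"])
        markdown = getattr(scraped, "markdown", None)
        if markdown:
            pages[url] = markdown
    return pages


if __name__ == "__main__":
    asyncio.run(discover_and_scrape("fc-...", "https://example.com"))
```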

View File

@@ -1,9 +1,136 @@
"""Base storage interface."""
import logging
from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator
from typing import Final
from types import TracebackType
import httpx
from pydantic import SecretStr
from ..core.exceptions import StorageError
from ..core.models import Document, StorageConfig
from .types import CollectionSummary, DocumentInfo
LOGGER: Final[logging.Logger] = logging.getLogger(__name__)
class TypedHttpClient:
"""
A properly typed HTTP client wrapper for HTTPX.
Provides consistent exception handling and type annotations
for storage adapters that use HTTP APIs.
Note: Some type checkers (Pylance) may report warnings about HTTPX types
due to library compatibility issues. The code functions correctly at runtime.
"""
client: httpx.AsyncClient
_base_url: str
def __init__(
self,
base_url: str,
*,
api_key: SecretStr | None = None,
timeout: float = 30.0,
headers: dict[str, str] | None = None,
):
"""
Initialize the typed HTTP client.
Args:
base_url: Base URL for all requests
api_key: Optional API key for authentication
timeout: Request timeout in seconds
headers: Additional headers to include with requests
"""
self._base_url = base_url
# Build headers with optional authentication
client_headers: dict[str, str] = headers or {}
if api_key:
client_headers["Authorization"] = f"Bearer {api_key.get_secret_value()}"
# Note: Pylance incorrectly reports "No parameter named 'base_url'"
# but base_url is a valid AsyncClient parameter (see HTTPX docs)
client_kwargs: dict[str, str | dict[str, str] | float] = {
"base_url": base_url,
"headers": client_headers,
"timeout": timeout,
}
self.client = httpx.AsyncClient(**client_kwargs) # type: ignore
async def request(
self,
method: str,
path: str,
*,
allow_404: bool = False,
json: dict[str, object] | None = None,
data: dict[str, object] | None = None,
files: dict[str, tuple[str, bytes, str]] | None = None,
params: dict[str, str | bool] | None = None,
) -> httpx.Response | None:
"""
Perform an HTTP request with consistent error handling.
Args:
method: HTTP method (GET, POST, DELETE, etc.)
path: URL path relative to base_url
allow_404: If True, return None for 404 responses instead of raising
json: Optional JSON body forwarded to the underlying httpx request
data: Optional form data forwarded to the underlying httpx request
files: Optional multipart file payload forwarded to the underlying httpx request
params: Optional query parameters forwarded to the underlying httpx request
Returns:
HTTP response object, or None if allow_404=True and status is 404
Raises:
StorageError: If request fails
"""
try:
response = await self.client.request( # type: ignore
method, path, json=json, data=data, files=files, params=params
)
response.raise_for_status() # type: ignore
return response # type: ignore
except Exception as e:
# Handle 404 as special case if requested
if allow_404 and hasattr(e, 'response') and getattr(e.response, 'status_code', None) == 404: # type: ignore
LOGGER.debug("Resource not found (404): %s %s", method, path)
return None
# Convert all HTTP-related exceptions to StorageError
error_name = e.__class__.__name__
if 'HTTP' in error_name or 'Connect' in error_name or 'Request' in error_name:
if hasattr(e, 'response') and hasattr(e.response, 'status_code'): # type: ignore
status_code = getattr(e.response, 'status_code', 'unknown') # type: ignore
raise StorageError(f"HTTP {status_code} error from {self._base_url}: {e}") from e
else:
raise StorageError(f"Request failed to {self._base_url}: {e}") from e
# Re-raise non-HTTP exceptions
raise
async def close(self) -> None:
"""Close the HTTP client and cleanup resources."""
try:
await self.client.aclose()
except Exception as e:
LOGGER.warning("Error closing HTTP client: %s", e)
async def __aenter__(self) -> "TypedHttpClient":
"""Async context manager entry."""
return self
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: TracebackType | None
) -> None:
"""Async context manager exit."""
await self.close()
class BaseStorage(ABC):
@@ -164,12 +291,12 @@ class BaseStorage(ABC):
"""
return []
async def describe_collections(self) -> list[dict[str, object]]:
async def describe_collections(self) -> list[CollectionSummary]:
"""
Describe available collections with metadata (if supported by backend).
Returns:
List of collection metadata dictionaries, empty list if not supported
List of collection metadata, empty list if not supported
"""
return []
@@ -206,7 +333,7 @@ class BaseStorage(ABC):
offset: int = 0,
*,
collection_name: str | None = None,
) -> list[dict[str, object]]:
) -> list[DocumentInfo]:
"""
List documents in the storage backend (if supported).
@@ -216,7 +343,7 @@ class BaseStorage(ABC):
collection_name: Collection to list documents from
Returns:
List of document dictionaries with metadata
List of document information with metadata
Raises:
NotImplementedError: If backend doesn't support document listing
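A minimal usage sketch for the new `TypedHttpClient`, assuming it is importable from `ingest_pipeline.storage.base`; the endpoint, path, and API key are placeholders:

```python
# TypedHttpClient as an async context manager, using the request()
# signature introduced above.
import asyncio

from pydantic import SecretStr

from ingest_pipeline.storage.base import TypedHttpClient


async def fetch_collections() -> list[object]:
    async with TypedHttpClient(
        "http://storage.local",
        api_key=SecretStr("example-key"),
        timeout=10.0,
    ) as http:
        # allow_404=True turns a 404 into None instead of a StorageError,
        # which is how adapters probe for optional resources.
        response = await http.request("GET", "/api/v1/knowledge/list", allow_404=True)
        return response.json() if response is not None else []


if __name__ == "__main__":
    print(asyncio.run(fetch_collections()))
```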

View File

@@ -1,33 +1,49 @@
"""Open WebUI storage adapter."""
import asyncio
import contextlib
import logging
from typing import TYPE_CHECKING, Final, TypedDict, cast
from typing import Final, TypedDict, cast
import httpx
from typing_extensions import override
if TYPE_CHECKING:
# Type checking imports - these will be ignored at runtime
from httpx import AsyncClient, ConnectError, HTTPStatusError, RequestError
else:
# Runtime imports that work properly
AsyncClient = httpx.AsyncClient
ConnectError = httpx.ConnectError
HTTPStatusError = httpx.HTTPStatusError
RequestError = httpx.RequestError
from ..core.exceptions import StorageError
from ..core.models import Document, StorageConfig
from .base import BaseStorage
from .base import BaseStorage, TypedHttpClient
from .types import CollectionSummary, DocumentInfo
LOGGER: Final[logging.Logger] = logging.getLogger(__name__)
class OpenWebUIFileResponse(TypedDict, total=False):
"""OpenWebUI API file response structure."""
id: str
filename: str
name: str
content_type: str
size: int
created_at: str
meta: dict[str, str | int]
class OpenWebUIKnowledgeBase(TypedDict, total=False):
"""OpenWebUI knowledge base response structure."""
id: str
name: str
description: str
files: list[OpenWebUIFileResponse]
data: dict[str, str]
created_at: str
updated_at: str
class OpenWebUIStorage(BaseStorage):
"""Storage adapter for Open WebUI knowledge endpoints."""
client: AsyncClient
http_client: TypedHttpClient
_knowledge_cache: dict[str, str]
def __init__(self, config: StorageConfig):
@@ -39,13 +55,9 @@ class OpenWebUIStorage(BaseStorage):
"""
super().__init__(config)
headers: dict[str, str] = {}
if config.api_key:
headers["Authorization"] = f"Bearer {config.api_key}"
self.client = AsyncClient(
self.http_client = TypedHttpClient(
base_url=str(config.endpoint),
headers=headers,
api_key=config.api_key,
timeout=30.0,
)
self._knowledge_cache = {}
@@ -59,60 +71,45 @@ class OpenWebUIStorage(BaseStorage):
self.config.collection_name,
create=True,
)
except ConnectError as e:
raise StorageError(f"Connection to OpenWebUI failed: {e}") from e
except HTTPStatusError as e:
raise StorageError(f"OpenWebUI returned error {e.response.status_code}: {e}") from e
except RequestError as e:
raise StorageError(f"Request to OpenWebUI failed: {e}") from e
except Exception as e:
raise StorageError(f"Failed to initialize Open WebUI: {e}") from e
async def _create_collection(self, name: str) -> str:
"""Create knowledge base in Open WebUI."""
try:
response = await self.client.post(
"/api/v1/knowledge/create",
json={
"name": name,
"description": "Documents ingested from various sources",
"data": {},
"access_control": None,
},
)
response.raise_for_status()
result = response.json()
knowledge_id = result.get("id")
response = await self.http_client.request(
"POST",
"/api/v1/knowledge/create",
json={
"name": name,
"description": "Documents ingested from various sources",
"data": {},
"access_control": None,
},
)
if response is None:
raise StorageError("Unexpected None response from knowledge base creation")
result = response.json()
knowledge_id = result.get("id")
if not knowledge_id or not isinstance(knowledge_id, str):
raise StorageError("Knowledge base creation failed: no ID returned")
if not knowledge_id or not isinstance(knowledge_id, str):
raise StorageError("Knowledge base creation failed: no ID returned")
return str(knowledge_id)
return str(knowledge_id)
except ConnectError as e:
raise StorageError(f"Connection to OpenWebUI failed during creation: {e}") from e
except HTTPStatusError as e:
raise StorageError(
f"OpenWebUI returned error {e.response.status_code} during creation: {e}"
) from e
except RequestError as e:
raise StorageError(f"Request to OpenWebUI failed during creation: {e}") from e
except Exception as e:
raise StorageError(f"Failed to create knowledge base: {e}") from e
async def _fetch_knowledge_bases(self) -> list[dict[str, object]]:
async def _fetch_knowledge_bases(self) -> list[OpenWebUIKnowledgeBase]:
"""Return the list of knowledge bases from the API."""
response = await self.client.get("/api/v1/knowledge/list")
response.raise_for_status()
response = await self.http_client.request("GET", "/api/v1/knowledge/list")
if response is None:
return []
data = response.json()
if not isinstance(data, list):
return []
normalized: list[dict[str, object]] = []
normalized: list[OpenWebUIKnowledgeBase] = []
for item in data:
if isinstance(item, dict):
item_dict: dict[str, object] = item
normalized.append({str(k): v for k, v in item_dict.items()})
# Cast to our expected structure
kb_item = cast(OpenWebUIKnowledgeBase, item)
normalized.append(kb_item)
return normalized
async def _get_knowledge_id(
@@ -171,12 +168,14 @@ class OpenWebUIStorage(BaseStorage):
if not filename.endswith(('.txt', '.md', '.pdf', '.doc', '.docx')):
filename = f"{filename}.txt"
files = {"file": (filename, document.content.encode(), "text/plain")}
response = await self.client.post(
response = await self.http_client.request(
"POST",
"/api/v1/files/",
files=files,
params={"process": True, "process_in_background": False},
)
response.raise_for_status()
if response is None:
raise StorageError("Unexpected None response from file upload")
file_data = response.json()
file_id = file_data.get("id")
@@ -185,19 +184,14 @@ class OpenWebUIStorage(BaseStorage):
raise StorageError("File upload failed: no file ID returned")
# Step 2: Add file to knowledge base
response = await self.client.post(
f"/api/v1/knowledge/{knowledge_id}/file/add", json={"file_id": file_id}
response = await self.http_client.request(
"POST",
f"/api/v1/knowledge/{knowledge_id}/file/add",
json={"file_id": file_id}
)
response.raise_for_status()
return str(file_id)
except ConnectError as e:
raise StorageError(f"Connection to OpenWebUI failed: {e}") from e
except HTTPStatusError as e:
raise StorageError(f"OpenWebUI returned error {e.response.status_code}: {e}") from e
except RequestError as e:
raise StorageError(f"Request to OpenWebUI failed: {e}") from e
except Exception as e:
raise StorageError(f"Failed to store document: {e}") from e
@@ -229,12 +223,14 @@ class OpenWebUIStorage(BaseStorage):
if not filename.endswith(('.txt', '.md', '.pdf', '.doc', '.docx')):
filename = f"{filename}.txt"
files = {"file": (filename, doc.content.encode(), "text/plain")}
upload_response = await self.client.post(
upload_response = await self.http_client.request(
"POST",
"/api/v1/files/",
files=files,
params={"process": True, "process_in_background": False},
)
upload_response.raise_for_status()
if upload_response is None:
raise StorageError(f"Unexpected None response from file upload for document {doc.id}")
file_data = upload_response.json()
file_id = file_data.get("id")
@@ -244,10 +240,11 @@ class OpenWebUIStorage(BaseStorage):
f"File upload failed for document {doc.id}: no file ID returned"
)
attach_response = await self.client.post(
f"/api/v1/knowledge/{knowledge_id}/file/add", json={"file_id": file_id}
await self.http_client.request(
"POST",
f"/api/v1/knowledge/{knowledge_id}/file/add",
json={"file_id": file_id}
)
attach_response.raise_for_status()
return str(file_id)
@@ -273,14 +270,6 @@ class OpenWebUIStorage(BaseStorage):
return file_ids
except ConnectError as e:
raise StorageError(f"Connection to OpenWebUI failed during batch: {e}") from e
except HTTPStatusError as e:
raise StorageError(
f"OpenWebUI returned error {e.response.status_code} during batch: {e}"
) from e
except RequestError as e:
raise StorageError(f"Request to OpenWebUI failed during batch: {e}") from e
except Exception as e:
raise StorageError(f"Failed to store batch: {e}") from e
@@ -298,6 +287,7 @@ class OpenWebUIStorage(BaseStorage):
Returns:
Always None - retrieval not supported
"""
_ = document_id, collection_name # Mark as used
# OpenWebUI uses file-based storage without direct document retrieval
# This will cause the base check_exists method to return False,
# which means documents will always be re-scraped for OpenWebUI
@@ -323,35 +313,20 @@ class OpenWebUIStorage(BaseStorage):
return False
# Remove file from knowledge base
response = await self.client.post(
f"/api/v1/knowledge/{knowledge_id}/file/remove", json={"file_id": document_id}
await self.http_client.request(
"POST",
f"/api/v1/knowledge/{knowledge_id}/file/remove",
json={"file_id": document_id}
)
response.raise_for_status()
delete_response = await self.client.delete(f"/api/v1/files/{document_id}")
if delete_response.status_code == 404:
return True
delete_response.raise_for_status()
await self.http_client.request(
"DELETE",
f"/api/v1/files/{document_id}",
allow_404=True
)
return True
except ConnectError as exc:
LOGGER.error(
"Failed to reach OpenWebUI when deleting file %s", document_id, exc_info=exc
)
return False
except HTTPStatusError as exc:
LOGGER.error(
"OpenWebUI returned status error %s when deleting file %s",
exc.response.status_code if exc.response else "unknown",
document_id,
exc_info=exc,
)
return False
except RequestError as exc:
LOGGER.error("Request error deleting file %s from OpenWebUI", document_id, exc_info=exc)
return False
except Exception as exc:
LOGGER.error("Unexpected error deleting file %s", document_id, exc_info=exc)
LOGGER.error("Error deleting file %s from OpenWebUI", document_id, exc_info=exc)
return False
async def list_collections(self) -> list[str]:
@@ -370,12 +345,6 @@ class OpenWebUIStorage(BaseStorage):
for kb in knowledge_bases
]
except ConnectError as e:
raise StorageError(f"Connection to OpenWebUI failed: {e}") from e
except HTTPStatusError as e:
raise StorageError(f"OpenWebUI returned error {e.response.status_code}: {e}") from e
except RequestError as e:
raise StorageError(f"Request to OpenWebUI failed: {e}") from e
except Exception as e:
raise StorageError(f"Failed to list knowledge bases: {e}") from e
@@ -396,8 +365,11 @@ class OpenWebUIStorage(BaseStorage):
return True
# Delete the knowledge base using the OpenWebUI API
response = await self.client.delete(f"/api/v1/knowledge/{knowledge_id}/delete")
response.raise_for_status()
await self.http_client.request(
"DELETE",
f"/api/v1/knowledge/{knowledge_id}/delete",
allow_404=True
)
# Remove from cache if it exists
if collection_name in self._knowledge_cache:
@@ -406,45 +378,25 @@ class OpenWebUIStorage(BaseStorage):
LOGGER.info("Successfully deleted knowledge base: %s", collection_name)
return True
except HTTPStatusError as e:
# Handle 404 as success (already deleted)
if e.response.status_code == 404:
LOGGER.info("Knowledge base %s was already deleted or not found", collection_name)
return True
LOGGER.error(
"OpenWebUI returned error %s when deleting knowledge base %s",
e.response.status_code,
collection_name,
exc_info=e,
)
return False
except ConnectError as e:
LOGGER.error(
"Failed to reach OpenWebUI when deleting knowledge base %s",
collection_name,
exc_info=e,
)
return False
except RequestError as e:
LOGGER.error(
"Request error deleting knowledge base %s from OpenWebUI",
collection_name,
exc_info=e,
)
return False
except Exception as e:
LOGGER.error("Unexpected error deleting knowledge base %s", collection_name, exc_info=e)
if hasattr(e, 'response'):
response_attr = getattr(e, 'response', None)
if response_attr is not None and hasattr(response_attr, 'status_code'):
with contextlib.suppress(Exception):
status_code = response_attr.status_code # type: ignore[attr-defined]
if status_code == 404:
LOGGER.info("Knowledge base %s was already deleted or not found", collection_name)
return True
LOGGER.error(
"Error deleting knowledge base %s from OpenWebUI",
collection_name,
exc_info=e,
)
return False
class CollectionSummary(TypedDict):
"""Structure describing a knowledge base summary."""
name: str
count: int
size_mb: float
async def _get_knowledge_base_count(self, kb: dict[str, object]) -> int:
async def _get_knowledge_base_count(self, kb: OpenWebUIKnowledgeBase) -> int:
"""Get the file count for a knowledge base."""
kb_id = kb.get("id")
name = kb.get("name", "Unknown")
@@ -454,17 +406,22 @@ class OpenWebUIStorage(BaseStorage):
return await self._count_files_from_detailed_info(str(kb_id), str(name), kb)
def _count_files_from_basic_info(self, kb: dict[str, object]) -> int:
def _count_files_from_basic_info(self, kb: OpenWebUIKnowledgeBase) -> int:
"""Count files from basic knowledge base info."""
files = kb.get("files", [])
return len(files) if isinstance(files, list) and files is not None else 0
async def _count_files_from_detailed_info(self, kb_id: str, name: str, kb: dict[str, object]) -> int:
async def _count_files_from_detailed_info(self, kb_id: str, name: str, kb: OpenWebUIKnowledgeBase) -> int:
"""Count files by fetching detailed knowledge base info."""
try:
LOGGER.debug(f"Fetching detailed info for KB '{name}' from /api/v1/knowledge/{kb_id}")
detail_response = await self.client.get(f"/api/v1/knowledge/{kb_id}")
detail_response.raise_for_status()
detail_response = await self.http_client.request(
"GET",
f"/api/v1/knowledge/{kb_id}"
)
if detail_response is None:
LOGGER.warning(f"Knowledge base '{name}' (ID: {kb_id}) not found")
return self._count_files_from_basic_info(kb)
detailed_kb = detail_response.json()
files = detailed_kb.get("files", [])
@@ -477,21 +434,18 @@ class OpenWebUIStorage(BaseStorage):
LOGGER.warning(f"Failed to get detailed info for KB '{name}' (ID: {kb_id}): {e}")
return self._count_files_from_basic_info(kb)
async def describe_collections(self) -> list[dict[str, object]]:
async def describe_collections(self) -> list[CollectionSummary]:
"""Return metadata about each knowledge base."""
try:
knowledge_bases = await self._fetch_knowledge_bases()
collections: list[dict[str, object]] = []
collections: list[CollectionSummary] = []
for kb in knowledge_bases:
if not isinstance(kb, dict):
continue
count = await self._get_knowledge_base_count(kb)
name = kb.get("name", "Unknown")
size_mb = count * 0.5 # rough heuristic
summary: dict[str, object] = {
summary: CollectionSummary = {
"name": str(name),
"count": count,
"size_mb": float(size_mb),
@@ -535,8 +489,13 @@ class OpenWebUIStorage(BaseStorage):
return 0
# Get detailed knowledge base information to get accurate file count
detail_response = await self.client.get(f"/api/v1/knowledge/{kb_id}")
detail_response.raise_for_status()
detail_response = await self.http_client.request(
"GET",
f"/api/v1/knowledge/{kb_id}"
)
if detail_response is None:
LOGGER.warning(f"Knowledge base '{collection_name}' (ID: {kb_id}) not found")
return self._count_files_from_basic_info(kb)
detailed_kb = detail_response.json()
files = detailed_kb.get("files", [])
@@ -549,7 +508,7 @@ class OpenWebUIStorage(BaseStorage):
LOGGER.warning(f"Failed to get count for collection '{collection_name}': {e}")
return 0
async def get_knowledge_by_name(self, name: str) -> dict[str, object] | None:
async def get_knowledge_by_name(self, name: str) -> OpenWebUIKnowledgeBase | None:
"""
Get knowledge base details by name.
@@ -560,13 +519,14 @@ class OpenWebUIStorage(BaseStorage):
Knowledge base details or None if not found
"""
try:
response = await self.client.get("/api/v1/knowledge/list")
response.raise_for_status()
response = await self.http_client.request("GET", "/api/v1/knowledge/list")
if response is None:
return None
knowledge_bases = response.json()
return next(
(
{str(k): v for k, v in kb.items()}
cast(OpenWebUIKnowledgeBase, kb)
for kb in knowledge_bases
if isinstance(kb, dict) and kb.get("name") == name
),
@@ -587,6 +547,7 @@ class OpenWebUIStorage(BaseStorage):
exc_tb: object | None,
) -> None:
"""Async context manager exit."""
_ = exc_type, exc_val, exc_tb # Mark as used
await self.close()
async def list_documents(
@@ -595,7 +556,7 @@ class OpenWebUIStorage(BaseStorage):
offset: int = 0,
*,
collection_name: str | None = None,
) -> list[dict[str, object]]:
) -> list[DocumentInfo]:
"""
List documents (files) in a knowledge base.
@@ -645,11 +606,8 @@ class OpenWebUIStorage(BaseStorage):
paginated_files = files[offset : offset + limit]
# Convert to document format with safe field access
documents: list[dict[str, object]] = []
documents: list[DocumentInfo] = []
for i, file_info in enumerate(paginated_files):
if not isinstance(file_info, dict):
continue
# Safely extract fields with fallbacks
doc_id = str(file_info.get("id", f"file_{i}"))
@@ -663,7 +621,9 @@ class OpenWebUIStorage(BaseStorage):
filename = file_info["name"]
# Check meta.name (from FileModelResponse schema)
elif isinstance(file_info.get("meta"), dict):
filename = file_info["meta"].get("name")
meta = file_info.get("meta")
if isinstance(meta, dict):
filename = meta.get("name")
# Final fallback
if not filename:
@@ -673,28 +633,28 @@ class OpenWebUIStorage(BaseStorage):
# Extract size from meta if available
size = 0
if isinstance(file_info.get("meta"), dict):
size = file_info["meta"].get("size", 0)
meta = file_info.get("meta")
if isinstance(meta, dict):
size = meta.get("size", 0)
else:
size = file_info.get("size", 0)
# Estimate word count from file size (very rough approximation)
word_count = max(1, int(size / 6)) if isinstance(size, (int, float)) else 0
documents.append(
{
"id": doc_id,
"title": filename,
"source_url": "", # OpenWebUI files don't typically have source URLs
"description": f"File: {filename}",
"content_type": str(file_info.get("content_type", "text/plain")),
"content_preview": f"File uploaded to OpenWebUI: {filename}",
"word_count": word_count,
"timestamp": str(
file_info.get("created_at") or file_info.get("timestamp", "")
),
}
)
doc_info: DocumentInfo = {
"id": doc_id,
"title": filename,
"source_url": "", # OpenWebUI files don't typically have source URLs
"description": f"File: {filename}",
"content_type": str(file_info.get("content_type", "text/plain")),
"content_preview": f"File uploaded to OpenWebUI: {filename}",
"word_count": word_count,
"timestamp": str(
file_info.get("created_at") or file_info.get("timestamp", "")
),
}
documents.append(doc_info)
return documents
@@ -721,10 +681,5 @@ class OpenWebUIStorage(BaseStorage):
async def close(self) -> None:
"""Close client connection."""
if hasattr(self, "client") and self.client:
try:
await self.client.aclose()
except Exception as e:
import logging
logging.warning(f"Error closing OpenWebUI client: {e}")
if hasattr(self, "http_client"):
await self.http_client.close()
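The store path above is a two-step flow: upload the file, then attach it to the knowledge base. A condensed sketch of that flow using the endpoints from the diff (the knowledge base ID and payload are placeholders, and the import paths are assumed):

```python
# OpenWebUIStorage's two-step store flow: POST the file to /api/v1/files/
# with synchronous processing, then attach the returned file ID to the
# target knowledge base.
from ingest_pipeline.core.exceptions import StorageError
from ingest_pipeline.storage.base import TypedHttpClient


async def upload_and_attach(
    http: TypedHttpClient, knowledge_id: str, filename: str, content: str
) -> str:
    # Step 1: upload the file and let OpenWebUI process it in the foreground.
    files = {"file": (filename, content.encode(), "text/plain")}
    response = await http.request(
        "POST",
        "/api/v1/files/",
        files=files,
        params={"process": True, "process_in_background": False},
    )
    if response is None:
        raise StorageError("Unexpected None response from file upload")
    file_id = response.json().get("id")
    if not file_id:
        raise StorageError("File upload failed: no file ID returned")

    # Step 2: attach the uploaded file to the knowledge base.
    await http.request(
        "POST",
        f"/api/v1/knowledge/{knowledge_id}/file/add",
        json={"file_id": file_id},
    )
    return str(file_id)
```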

View File

@@ -10,15 +10,14 @@ from typing import Self, TypeVar, cast
from uuid import UUID, uuid4
# Direct imports for runtime and type checking
# Note: Some type checkers (basedpyright/Pyrefly) may report import issues
# but these work correctly at runtime and with mypy
from httpx import AsyncClient, HTTPStatusError
from r2r import R2RAsyncClient, R2RException
from httpx import AsyncClient, HTTPStatusError # type: ignore
from r2r import R2RAsyncClient, R2RException # type: ignore
from typing_extensions import override
from ...core.exceptions import StorageError
from ...core.models import Document, DocumentMetadata, IngestionSource, StorageConfig
from ..base import BaseStorage
from ..types import DocumentInfo
T = TypeVar("T")
@@ -80,6 +79,24 @@ class R2RStorage(BaseStorage):
self.client: R2RAsyncClient = R2RAsyncClient(self.endpoint)
self.default_collection_id: str | None = None
def _get_http_client_headers(self) -> dict[str, str]:
"""Get consistent HTTP headers for direct API calls."""
headers = {"Content-Type": "application/json"}
# Add authentication headers if available
# Note: R2R SDK may handle auth internally, so we extract it if possible
if hasattr(self.client, "_get_headers"):
with contextlib.suppress(Exception):
sdk_headers = self.client._get_headers() # type: ignore[attr-defined]
if isinstance(sdk_headers, dict):
headers |= sdk_headers
return headers
def _create_http_client(self) -> AsyncClient:
"""Create a properly configured HTTP client for direct API calls."""
headers = self._get_http_client_headers()
return AsyncClient(headers=headers, timeout=30.0)
@override
async def initialize(self) -> None:
"""Initialize R2R connection and ensure default collection exists."""
@@ -96,7 +113,7 @@ class R2RStorage(BaseStorage):
# Test connection using direct HTTP call to v3 API
endpoint = self.endpoint
client = AsyncClient()
client = self._create_http_client()
try:
response = await client.get(f"{endpoint}/v3/collections")
response.raise_for_status()
@@ -109,7 +126,7 @@ class R2RStorage(BaseStorage):
async def _ensure_collection(self, collection_name: str) -> str:
"""Get or create collection by name."""
endpoint = self.endpoint
client = AsyncClient()
client = self._create_http_client()
try:
# List collections and find by name
response = await client.get(f"{endpoint}/v3/collections")
@@ -152,6 +169,9 @@ class R2RStorage(BaseStorage):
finally:
await client.aclose()
# This should never be reached, but satisfies static analyzer
raise StorageError(f"Unexpected code path in _ensure_collection for '{collection_name}'")
@override
async def store(self, document: Document, *, collection_name: str | None = None) -> str:
"""Store a single document."""
@@ -161,20 +181,44 @@ class R2RStorage(BaseStorage):
async def store_batch(
self, documents: list[Document], *, collection_name: str | None = None
) -> list[str]:
"""Store multiple documents."""
"""Store multiple documents efficiently with connection reuse."""
collection_id = await self._resolve_collection_id(collection_name)
print(
f"Using collection ID: {collection_id} for collection: {collection_name or self.config.collection_name}"
)
stored_ids: list[str] = []
for document in documents:
if not self._is_document_valid(document):
continue
# Filter valid documents upfront
valid_documents = [doc for doc in documents if self._is_document_valid(doc)]
if not valid_documents:
return []
stored_id = await self._store_single_document(document, collection_id)
if stored_id:
stored_ids.append(stored_id)
stored_ids: list[str] = []
# Use a single HTTP client for all requests
http_client = AsyncClient()
async with http_client: # type: ignore
# Process documents with controlled concurrency
import asyncio
semaphore = asyncio.Semaphore(5) # Limit concurrent uploads
async def store_single_with_client(document: Document) -> str | None:
async with semaphore:
return await self._store_single_document_with_client(
document, collection_id, http_client
)
# Execute all uploads concurrently
results = await asyncio.gather(
*[store_single_with_client(doc) for doc in valid_documents], return_exceptions=True
)
# Collect successful IDs
for result in results:
if isinstance(result, str):
stored_ids.append(result)
elif isinstance(result, Exception):
print(f"Document upload failed: {result}")
return stored_ids
@@ -208,6 +252,16 @@ class R2RStorage(BaseStorage):
async def _store_single_document(self, document: Document, collection_id: str) -> str | None:
"""Store a single document with retry logic."""
http_client = AsyncClient()
async with http_client: # type: ignore
return await self._store_single_document_with_client(
document, collection_id, http_client
)
async def _store_single_document_with_client(
self, document: Document, collection_id: str, http_client: AsyncClient
) -> str | None:
"""Store a single document with retry logic using provided HTTP client."""
requested_id = str(document.id)
print(f"Creating document with ID: {requested_id}")
@@ -216,15 +270,23 @@ class R2RStorage(BaseStorage):
for attempt in range(max_retries):
try:
doc_response = await self._attempt_document_creation(document, collection_id)
doc_response = await self._attempt_document_creation_with_client(
document, collection_id, http_client
)
if doc_response:
return self._process_document_response(doc_response, requested_id, collection_id)
return self._process_document_response(
doc_response, requested_id, collection_id
)
except (TimeoutError, OSError) as e:
if not await self._should_retry_timeout(e, attempt, max_retries, requested_id, retry_delay):
if not await self._should_retry_timeout(
e, attempt, max_retries, requested_id, retry_delay
):
break
retry_delay *= 2
except HTTPStatusError as e:
if not await self._should_retry_http_error(e, attempt, max_retries, requested_id, retry_delay):
if not await self._should_retry_http_error(
e, attempt, max_retries, requested_id, retry_delay
):
break
retry_delay *= 2
except Exception as exc:
@@ -233,8 +295,20 @@ class R2RStorage(BaseStorage):
return None
async def _attempt_document_creation(self, document: Document, collection_id: str) -> dict[str, object] | None:
async def _attempt_document_creation(
self, document: Document, collection_id: str
) -> dict[str, object] | None:
"""Attempt to create a document via HTTP API."""
http_client = AsyncClient()
async with http_client: # type: ignore
return await self._attempt_document_creation_with_client(
document, collection_id, http_client
)
async def _attempt_document_creation_with_client(
self, document: Document, collection_id: str, http_client: AsyncClient
) -> dict[str, object] | None:
"""Attempt to create a document via HTTP API using provided client."""
import json
requested_id = str(document.id)
@@ -255,29 +329,36 @@ class R2RStorage(BaseStorage):
print(f"Sending to R2R - files keys: {list(files.keys())}")
print(f"Metadata JSON: {files['metadata'][1]}")
async with AsyncClient() as http_client:
response = await http_client.post(f"{self.endpoint}/v3/documents", files=files)
response = await http_client.post(f"{self.endpoint}/v3/documents", files=files) # type: ignore[call-arg]
if response.status_code == 422:
self._handle_validation_error(response, requested_id, metadata)
return None
if response.status_code == 422:
self._handle_validation_error(response, requested_id, metadata)
return None
response.raise_for_status()
return response.json()
response.raise_for_status()
return response.json()
def _handle_validation_error(self, response: object, requested_id: str, metadata: dict[str, object]) -> None:
def _handle_validation_error(
self, response: object, requested_id: str, metadata: dict[str, object]
) -> None:
"""Handle validation errors from R2R API."""
try:
error_detail = getattr(response, 'json', lambda: {})() if hasattr(response, 'json') else {}
error_detail = (
getattr(response, "json", lambda: {})() if hasattr(response, "json") else {}
)
print(f"R2R validation error for document {requested_id}: {error_detail}")
print(f"Document metadata sent: {metadata}")
print(f"Response status: {getattr(response, 'status_code', 'unknown')}")
print(f"Response headers: {dict(getattr(response, 'headers', {}))}")
except Exception:
print(f"R2R validation error for document {requested_id}: {getattr(response, 'text', 'unknown error')}")
print(
f"R2R validation error for document {requested_id}: {getattr(response, 'text', 'unknown error')}"
)
print(f"Document metadata sent: {metadata}")
def _process_document_response(self, doc_response: dict[str, object], requested_id: str, collection_id: str) -> str:
def _process_document_response(
self, doc_response: dict[str, object], requested_id: str, collection_id: str
) -> str:
"""Process successful document creation response."""
response_payload = doc_response.get("results", doc_response)
doc_id = _extract_id(response_payload, requested_id)
@@ -288,11 +369,20 @@ class R2RStorage(BaseStorage):
print(f"Warning: Requested ID {requested_id} but got {doc_id}")
if collection_id:
print(f"Document {doc_id} should be assigned to collection {collection_id} via creation API")
print(
f"Document {doc_id} should be assigned to collection {collection_id} via creation API"
)
return doc_id
async def _should_retry_timeout(self, error: Exception, attempt: int, max_retries: int, requested_id: str, retry_delay: float) -> bool:
async def _should_retry_timeout(
self,
error: Exception,
attempt: int,
max_retries: int,
requested_id: str,
retry_delay: float,
) -> bool:
"""Determine if timeout error should be retried."""
if attempt >= max_retries - 1:
return False
@@ -301,12 +391,22 @@ class R2RStorage(BaseStorage):
await asyncio.sleep(retry_delay)
return True
async def _should_retry_http_error(self, error: HTTPStatusError, attempt: int, max_retries: int, requested_id: str, retry_delay: float) -> bool:
async def _should_retry_http_error(
self,
error: HTTPStatusError,
attempt: int,
max_retries: int,
requested_id: str,
retry_delay: float,
) -> bool:
"""Determine if HTTP error should be retried."""
if error.response.status_code < 500 or attempt >= max_retries - 1:
status_code = error.response.status_code
if status_code < 500 or attempt >= max_retries - 1:
return False
print(f"Server error {error.response.status_code} for document {requested_id}, retrying in {retry_delay}s...")
print(
f"Server error {status_code} for document {requested_id}, retrying in {retry_delay}s..."
)
await asyncio.sleep(retry_delay)
return True
@@ -323,13 +423,13 @@ class R2RStorage(BaseStorage):
print(" → Server error - R2R internal issue")
else:
import traceback
traceback.print_exc()
def _build_metadata(self, document: Document) -> dict[str, object]:
"""Convert document metadata to enriched R2R format."""
metadata = document.metadata
# Core required fields
result: dict[str, object] = {
"source_url": metadata["source_url"],
@@ -465,7 +565,9 @@ class R2RStorage(BaseStorage):
except ValueError:
return uuid4()
def _build_core_metadata(self, metadata_map: dict[str, object], timestamp: datetime) -> DocumentMetadata:
def _build_core_metadata(
self, metadata_map: dict[str, object], timestamp: datetime
) -> DocumentMetadata:
"""Build core required metadata fields."""
return {
"source_url": str(metadata_map.get("source_url", "")),
@@ -475,7 +577,12 @@ class R2RStorage(BaseStorage):
"char_count": _as_int(metadata_map.get("char_count")),
}
def _add_optional_metadata_fields(self, metadata: DocumentMetadata, doc_map: dict[str, object], metadata_map: dict[str, object]) -> None:
def _add_optional_metadata_fields(
self,
metadata: DocumentMetadata,
doc_map: dict[str, object],
metadata_map: dict[str, object],
) -> None:
"""Add optional metadata fields if present."""
self._add_title_and_description(metadata, doc_map, metadata_map)
self._add_content_categorization(metadata, metadata_map)
@@ -484,7 +591,12 @@ class R2RStorage(BaseStorage):
self._add_processing_fields(metadata, metadata_map)
self._add_quality_scores(metadata, metadata_map)
def _add_title_and_description(self, metadata: DocumentMetadata, doc_map: dict[str, object], metadata_map: dict[str, object]) -> None:
def _add_title_and_description(
self,
metadata: DocumentMetadata,
doc_map: dict[str, object],
metadata_map: dict[str, object],
) -> None:
"""Add title and description fields."""
if title := (doc_map.get("title") or metadata_map.get("title")):
metadata["title"] = cast(str | None, title)
@@ -494,7 +606,9 @@ class R2RStorage(BaseStorage):
elif description := metadata_map.get("description"):
metadata["description"] = cast(str | None, description)
def _add_content_categorization(self, metadata: DocumentMetadata, metadata_map: dict[str, object]) -> None:
def _add_content_categorization(
self, metadata: DocumentMetadata, metadata_map: dict[str, object]
) -> None:
"""Add content categorization fields."""
if tags := metadata_map.get("tags"):
metadata["tags"] = [str(tag) for tag in tags] if isinstance(tags, list) else []
@@ -505,7 +619,9 @@ class R2RStorage(BaseStorage):
if language := metadata_map.get("language"):
metadata["language"] = str(language)
def _add_authorship_fields(self, metadata: DocumentMetadata, metadata_map: dict[str, object]) -> None:
def _add_authorship_fields(
self, metadata: DocumentMetadata, metadata_map: dict[str, object]
) -> None:
"""Add authorship and source information fields."""
if author := metadata_map.get("author"):
metadata["author"] = str(author)
@@ -514,7 +630,9 @@ class R2RStorage(BaseStorage):
if site_name := metadata_map.get("site_name"):
metadata["site_name"] = str(site_name)
def _add_structure_fields(self, metadata: DocumentMetadata, metadata_map: dict[str, object]) -> None:
def _add_structure_fields(
self, metadata: DocumentMetadata, metadata_map: dict[str, object]
) -> None:
"""Add document structure fields."""
if heading_hierarchy := metadata_map.get("heading_hierarchy"):
metadata["heading_hierarchy"] = (
@@ -529,7 +647,9 @@ class R2RStorage(BaseStorage):
if has_links := metadata_map.get("has_links"):
metadata["has_links"] = bool(has_links)
def _add_processing_fields(self, metadata: DocumentMetadata, metadata_map: dict[str, object]) -> None:
def _add_processing_fields(
self, metadata: DocumentMetadata, metadata_map: dict[str, object]
) -> None:
"""Add processing-related metadata fields."""
if extraction_method := metadata_map.get("extraction_method"):
metadata["extraction_method"] = str(extraction_method)
@@ -538,7 +658,9 @@ class R2RStorage(BaseStorage):
if last_modified := metadata_map.get("last_modified"):
metadata["last_modified"] = _as_datetime(last_modified)
def _add_quality_scores(self, metadata: DocumentMetadata, metadata_map: dict[str, object]) -> None:
def _add_quality_scores(
self, metadata: DocumentMetadata, metadata_map: dict[str, object]
) -> None:
"""Add quality score fields with safe float conversion."""
if readability_score := metadata_map.get("readability_score"):
try:
@@ -641,7 +763,7 @@ class R2RStorage(BaseStorage):
async def count(self, *, collection_name: str | None = None) -> int:
"""Get document count in collection."""
endpoint = self.endpoint
client = AsyncClient()
client = self._create_http_client()
try:
# Get collections and find the count for the specific collection
response = await client.get(f"{endpoint}/v3/collections")
@@ -662,6 +784,9 @@ class R2RStorage(BaseStorage):
finally:
await client.aclose()
# This should never be reached, but satisfies static analyzer
return 0
@override
async def close(self) -> None:
"""Close R2R client."""
@@ -709,7 +834,7 @@ class R2RStorage(BaseStorage):
async def list_collections(self) -> list[str]:
"""List all available collections."""
endpoint = self.endpoint
client = AsyncClient()
client = self._create_http_client()
try:
response = await client.get(f"{endpoint}/v3/collections")
response.raise_for_status()
@@ -726,6 +851,9 @@ class R2RStorage(BaseStorage):
finally:
await client.aclose()
# This should never be reached, but satisfies static analyzer
return []
async def list_collections_detailed(self) -> list[dict[str, object]]:
"""List all available collections with detailed information."""
try:
@@ -789,7 +917,7 @@ class R2RStorage(BaseStorage):
offset: int = 0,
*,
collection_name: str | None = None,
) -> list[dict[str, object]]:
) -> list[DocumentInfo]:
"""
List documents in R2R with pagination.
@@ -802,14 +930,14 @@ class R2RStorage(BaseStorage):
List of document dictionaries with metadata
"""
try:
documents: list[dict[str, object]] = []
documents: list[DocumentInfo] = []
if collection_name:
# Get collection ID first
collection_id = await self._ensure_collection(collection_name)
# Use the collections API to list documents in a specific collection
endpoint = self.endpoint
client = AsyncClient()
client = self._create_http_client()
try:
params = {"offset": offset, "limit": limit}
response = await client.get(
@@ -842,20 +970,19 @@ class R2RStorage(BaseStorage):
title = str(doc_map.get("title", "Untitled"))
metadata = _as_mapping(doc_map.get("metadata", {}))
documents.append(
{
"id": doc_id,
"title": title,
"source_url": str(metadata.get("source_url", "")),
"description": str(metadata.get("description", "")),
"content_type": str(metadata.get("content_type", "text/plain")),
"content_preview": str(doc_map.get("content", ""))[:200] + "..."
if doc_map.get("content")
else "",
"word_count": _as_int(metadata.get("word_count", 0)),
"timestamp": str(doc_map.get("created_at", "")),
}
)
document_info: DocumentInfo = {
"id": doc_id,
"title": title,
"source_url": str(metadata.get("source_url", "")),
"description": str(metadata.get("description", "")),
"content_type": str(metadata.get("content_type", "text/plain")),
"content_preview": str(doc_map.get("content", ""))[:200] + "..."
if doc_map.get("content")
else "",
"word_count": _as_int(metadata.get("word_count", 0)),
"timestamp": str(doc_map.get("created_at", "")),
}
documents.append(document_info)
return documents
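The `store_batch` rework above bounds concurrency with a semaphore and collects per-document failures via `gather(return_exceptions=True)`. A minimal standalone sketch of that pattern, with `upload()` standing in for `_store_single_document_with_client()`:

```python
# Bounded-concurrency batch upload: at most max_concurrency coroutines
# perform I/O at once, and individual failures don't abort the batch.
import asyncio


async def upload(doc_id: str) -> str:
    await asyncio.sleep(0.01)  # stand-in for the HTTP round trip
    return doc_id


async def store_batch(doc_ids: list[str], max_concurrency: int = 5) -> list[str]:
    semaphore = asyncio.Semaphore(max_concurrency)

    async def bounded_upload(doc_id: str) -> str:
        async with semaphore:  # limit concurrent uploads
            return await upload(doc_id)

    results = await asyncio.gather(
        *(bounded_upload(doc_id) for doc_id in doc_ids),
        return_exceptions=True,
    )
    # Keep successful IDs; exceptions mark individual failed documents.
    return [r for r in results if isinstance(r, str)]


if __name__ == "__main__":
    print(asyncio.run(store_batch([f"doc-{i}" for i in range(12)])))
```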

View File

@@ -0,0 +1,22 @@
"""Shared types for storage adapters."""
from typing import TypedDict
class CollectionSummary(TypedDict):
"""Collection metadata for describe_collections."""
name: str
count: int
size_mb: float
class DocumentInfo(TypedDict):
"""Document information for list_documents."""
id: str
title: str
source_url: str
description: str
content_type: str
content_preview: str
word_count: int
timestamp: str
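These shared TypedDicts give every backend the same `describe_collections()`/`list_documents()` shape. A hypothetical helper pair showing how an adapter fills them in (the `0.01` size heuristic echoes the Weaviate adapter above):

```python
# Building the shared storage types; all keys are required by the
# TypedDict definitions, so type checkers flag missing fields.
from ingest_pipeline.storage.types import CollectionSummary, DocumentInfo


def summarize(name: str, count: int) -> CollectionSummary:
    return {"name": name, "count": count, "size_mb": count * 0.01}


def to_document_info(doc_id: str, title: str) -> DocumentInfo:
    return {
        "id": doc_id,
        "title": title,
        "source_url": "",
        "description": f"File: {title}",
        "content_type": "text/plain",
        "content_preview": "",
        "word_count": 0,
        "timestamp": "",
    }
```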

View File

@@ -21,6 +21,7 @@ from ..core.exceptions import StorageError
from ..core.models import Document, DocumentMetadata, IngestionSource, StorageConfig
from ..utils.vectorizer import Vectorizer
from .base import BaseStorage
from .types import CollectionSummary, DocumentInfo
VectorContainer: TypeAlias = Mapping[str, object] | Sequence[object] | None
@@ -594,14 +595,14 @@ class WeaviateStorage(BaseStorage):
except Exception as e:
raise StorageError(f"Failed to list collections: {e}") from e
async def describe_collections(self) -> list[dict[str, object]]:
async def describe_collections(self) -> list[CollectionSummary]:
"""Return metadata for each Weaviate collection."""
if not self.client:
raise StorageError("Weaviate client not initialized")
try:
client = cast(weaviate.WeaviateClient, self.client)
collections: list[dict[str, object]] = []
collections: list[CollectionSummary] = []
for name in client.collections.list_all():
collection_obj = client.collections.get(name)
if not collection_obj:
@@ -609,13 +610,12 @@ class WeaviateStorage(BaseStorage):
count = collection_obj.aggregate.over_all(total_count=True).total_count or 0
size_mb = count * 0.01
collections.append(
{
"name": name,
"count": count,
"size_mb": size_mb,
}
)
collection_summary: CollectionSummary = {
"name": name,
"count": count,
"size_mb": size_mb,
}
collections.append(collection_summary)
return collections
except Exception as e:
@@ -812,7 +812,7 @@ class WeaviateStorage(BaseStorage):
offset: int = 0,
*,
collection_name: str | None = None,
) -> list[dict[str, object]]:
) -> list[DocumentInfo]:
"""
List documents in the collection with pagination.
@@ -834,7 +834,7 @@ class WeaviateStorage(BaseStorage):
limit=limit, offset=offset, return_metadata=["creation_time"]
)
documents: list[dict[str, object]] = []
documents: list[DocumentInfo] = []
for obj in response.objects:
props = self._coerce_properties(
obj.properties,
@@ -853,7 +853,7 @@ class WeaviateStorage(BaseStorage):
else:
word_count = 0
doc_info: dict[str, object] = {
doc_info: DocumentInfo = {
"id": str(obj.uuid),
"title": str(props.get("title", "Untitled")),
"source_url": str(props.get("source_url", "")),

View File

@@ -2,13 +2,17 @@
import json
from datetime import UTC, datetime
from typing import Protocol, TypedDict, cast
from typing import Final, Protocol, TypedDict, cast
import httpx
from ..core.exceptions import IngestionError
from ..core.models import Document
JSON_CONTENT_TYPE: Final[str] = "application/json"
AUTHORIZATION_HEADER: Final[str] = "Authorization"
from ..config import get_settings
class HttpResponse(Protocol):
"""Protocol for HTTP response."""
@@ -29,6 +33,15 @@ class AsyncHttpClient(Protocol):
async def aclose(self) -> None: ...
async def __aenter__(self) -> "AsyncHttpClient": ...
async def __aexit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: object | None,
) -> None: ...
class LlmResponse(TypedDict):
"""Type for LLM API response structure."""
@@ -66,8 +79,11 @@ class MetadataTagger:
def __init__(
self,
llm_endpoint: str = "http://llm.lab",
model: str = "fireworks/glm-4p5-air",
llm_endpoint: str | None = None,
model: str | None = None,
api_key: str | None = None,
*,
timeout: float | None = None,
):
"""
Initialize metadata tagger.
@@ -75,30 +91,26 @@ class MetadataTagger:
Args:
llm_endpoint: LLM API endpoint
model: Model to use for tagging
api_key: Explicit API key override
timeout: Optional request timeout override in seconds
"""
self.endpoint = llm_endpoint.rstrip('/')
self.model = model
settings = get_settings()
endpoint_value = llm_endpoint or str(settings.llm_endpoint)
self.endpoint = endpoint_value.rstrip('/')
self.model = model or settings.metadata_model
# Get API key from environment
import os
from pathlib import Path
resolved_timeout = timeout if timeout is not None else float(settings.request_timeout)
resolved_api_key = api_key or settings.get_llm_api_key() or ""
from dotenv import load_dotenv
# Load .env from the project root
env_path = Path(__file__).parent.parent.parent / ".env"
_ = load_dotenv(env_path)
api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") or ""
headers = {"Content-Type": "application/json"}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
headers: dict[str, str] = {"Content-Type": JSON_CONTENT_TYPE}
if resolved_api_key:
headers[AUTHORIZATION_HEADER] = f"Bearer {resolved_api_key}"
# Create client with proper typing - httpx.AsyncClient implements AsyncHttpClient protocol
AsyncClientClass = getattr(httpx, "AsyncClient")
raw_client = AsyncClientClass(timeout=60.0, headers=headers)
self.client = cast(AsyncHttpClient, raw_client)
self.client = cast(
AsyncHttpClient,
httpx.AsyncClient(timeout=resolved_timeout, headers=headers),
)
async def tag_document(
self, document: Document, custom_instructions: str | None = None
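A sketch of the settings-driven construction introduced above: with no arguments, endpoint, model, API key, and timeout now come from `get_settings()`, and explicit arguments override them. The import path and argument values are assumptions:

```python
# MetadataTagger construction after the refactor; the zero-argument form
# resolves LLM_ENDPOINT, METADATA_MODEL, LLM_API_KEY/OPENAI_API_KEY, and
# the request timeout from settings.
from ingest_pipeline.utils.metadata_tagger import MetadataTagger

# Defaults resolved entirely from settings:
tagger = MetadataTagger()

# Explicit overrides for a one-off run against a different endpoint:
custom = MetadataTagger(
    llm_endpoint="http://llm.lab",
    model="fireworks/glm-4p5-air",
    api_key="sk-example",
    timeout=120.0,
)
```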

View File

@@ -1,7 +1,7 @@
"""Vectorizer utility for generating embeddings."""
from types import TracebackType
from typing import Self, cast
from typing import Final, Self, cast
import httpx
@@ -9,6 +9,10 @@ from typings import EmbeddingResponse
from ..core.exceptions import VectorizationError
from ..core.models import StorageConfig, VectorConfig
from ..config import get_settings
JSON_CONTENT_TYPE: Final[str] = "application/json"
AUTHORIZATION_HEADER: Final[str] = "Authorization"
class Vectorizer:
@@ -25,33 +29,24 @@ class Vectorizer:
Args:
config: Configuration with embedding details
"""
settings = get_settings()
if isinstance(config, StorageConfig):
# Extract vector config from storage config
self.endpoint = "http://llm.lab"
self.model = "ollama/bge-m3"
self.dimension = 1024
# Extract vector config from global settings when storage config is provided
self.endpoint = str(settings.llm_endpoint).rstrip("/")
self.model = settings.embedding_model
self.dimension = settings.embedding_dimension
else:
self.endpoint = str(config.embedding_endpoint)
self.endpoint = str(config.embedding_endpoint).rstrip("/")
self.model = config.model
self.dimension = config.dimension
# Get API key from environment
import os
from pathlib import Path
resolved_api_key = settings.get_llm_api_key() or ""
headers: dict[str, str] = {"Content-Type": JSON_CONTENT_TYPE}
if resolved_api_key:
headers[AUTHORIZATION_HEADER] = f"Bearer {resolved_api_key}"
from dotenv import load_dotenv
# Load .env from the project root
env_path = Path(__file__).parent.parent.parent / ".env"
_ = load_dotenv(env_path)
api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") or ""
headers = {"Content-Type": "application/json"}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
self.client: httpx.AsyncClient = httpx.AsyncClient(timeout=60.0, headers=headers)
timeout_seconds = float(settings.request_timeout)
self.client = httpx.AsyncClient(timeout=timeout_seconds, headers=headers)
async def vectorize(self, text: str) -> list[float]:
"""

View File

@@ -38,9 +38,9 @@ dev-dependencies = [
"pytest-cov>=4.1.0",
"mypy>=1.7.0",
"ruff>=0.1.0",
"basedpyright>=1.31.4",
"pyrefly>=0.33.0",
"sourcery>=1.37.0",
"pylance>=0.36.0",
]
[tool.ruff]

File diff suppressed because it is too large

View File

@@ -218,6 +218,46 @@ class AsyncClientStub:
files=None,
)
async def request(
self,
method: str,
url: str,
*,
json: dict[str, object] | None = None,
data: dict[str, object] | None = None,
files: dict[str, tuple[str, bytes, str]] | None = None,
params: dict[str, str | bool] | None = None,
) -> StubbedResponse:
"""Generic request method that delegates to specific HTTP methods."""
# Convert params to the format expected by other methods
converted_params: dict[str, object] | None = None
if params:
converted_params = {k: v for k, v in params.items()}
method_upper = method.upper()
if method_upper == "GET":
return await self.get(url, params=converted_params)
elif method_upper == "POST":
return await self.post(url, json=json, files=files, params=converted_params)
elif method_upper == "DELETE":
return await self.delete(url, json=json, params=converted_params)
else:
# For other methods, use the consume/record pattern directly
normalized = self._record(
method=method_upper,
url=url,
json=json or data,
params=converted_params,
files=files,
)
return self._consume(
method=method_upper,
url=normalized,
json=json or data,
params=converted_params,
files=files,
)
async def aclose(self) -> None:
return None

View File

@@ -11,10 +11,9 @@ from ingest_pipeline.flows import scheduler
def test_create_scheduled_deployment_cron(monkeypatch: pytest.MonkeyPatch) -> None:
captured: dict[str, object] = {}
class DummyFlow:
def to_deployment(self, **kwargs: object) -> SimpleNamespace:
nonlocal captured
captured |= kwargs
return SimpleNamespace(**kwargs)
@@ -37,10 +36,9 @@ def test_create_scheduled_deployment_interval(monkeypatch: pytest.MonkeyPatch) -> None:
def test_create_scheduled_deployment_interval(monkeypatch: pytest.MonkeyPatch) -> None:
captured: dict[str, object] = {}
class DummyFlow:
def to_deployment(self, **kwargs: object) -> SimpleNamespace:
nonlocal captured
captured |= kwargs
return SimpleNamespace(**kwargs)
@@ -69,7 +67,7 @@ def test_serve_deployments_invokes_prefect(monkeypatch: pytest.MonkeyPatch) -> None:
called["deployments"] = deployments
called["limit"] = limit
monkeypatch.setattr(scheduler, "serve", fake_serve)
monkeypatch.setattr(scheduler, "prefect_serve", fake_serve)
deployment = SimpleNamespace(name="only")
scheduler.serve_deployments([deployment])

View File

@@ -31,7 +31,7 @@ async def test_get_knowledge_id_returns_existing(
assert knowledge_id == "kb-123"
urls = [request["url"] for request in httpx_stub.requests]
assert "http://storage.local/api/v1/knowledge/list" in urls
await storage.client.aclose()
await storage.http_client.client.aclose()
@pytest.mark.asyncio
@@ -54,7 +54,7 @@ async def test_get_knowledge_id_creates_when_missing(
url.startswith("http://storage.local/api/v1/knowledge/") and url.endswith("/create")
for url in urls
)
await storage.client.aclose()
await storage.http_client.client.aclose()
@pytest.mark.asyncio
@@ -80,7 +80,7 @@ async def test_store_uploads_and_attaches_document(
_, knowledge = knowledge_entry
assert len(knowledge.get("files", [])) == 1
assert knowledge["files"][0]["id"] == file_id
await storage.client.aclose()
await storage.http_client.client.aclose()
@pytest.mark.asyncio
@@ -105,7 +105,7 @@ async def test_store_batch_handles_multiple_documents(
assert knowledge_entry is not None
_, knowledge = knowledge_entry
assert {meta["id"] for meta in knowledge.get("files", [])} == set(file_ids)
await storage.client.aclose()
await storage.http_client.client.aclose()
@pytest.mark.asyncio
@@ -133,4 +133,4 @@ async def test_delete_removes_file(
knowledge = openwebui_service.get_knowledge("kb-55")
assert knowledge is not None
assert knowledge.get("files", []) == []
await storage.client.aclose()
await storage.http_client.client.aclose()

View File

@@ -207,7 +207,7 @@ def r2r_client_stub(
mock_async_client = MockAsyncClient(r2r_service)
monkeypatch.setattr(
"ingest_pipeline.storage.r2r.storage.AsyncClient",
lambda: mock_async_client,
lambda **kwargs: mock_async_client,
)
client = DummyClient(r2r_service)

View File

@@ -48,7 +48,7 @@ async def test_vectorizer_storage_config_uses_defaults(
vector = await vectorizer.vectorize("repo content")
assert len(vector) == 1024
assert httpx_stub.requests[0]["json_body"]["model"] == "ollama/bge-m3"
assert httpx_stub.requests[0]["json_body"]["model"] == "ollama/bge-m3:latest"
assert httpx_stub.requests[0]["url"] == "http://llm.lab/v1/embeddings"

uv.lock generated
View File

@@ -236,18 +236,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/25/2f/efa9d26dbb612b774990741fd8f13c7cf4cfd085b870e4a5af5c82eaf5f1/authlib-1.6.3-py2.py3-none-any.whl", hash = "sha256:7ea0f082edd95a03b7b72edac65ec7f8f68d703017d7e37573aee4fc603f2a48", size = 240105, upload-time = "2025-08-26T12:13:23.889Z" },
]
[[package]]
name = "basedpyright"
version = "1.31.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "nodejs-wheel-binaries" },
]
sdist = { url = "https://files.pythonhosted.org/packages/0b/53/570b03ec0445a9b2cc69788482c1d12902a9b88a9b159e449c4c537c4e3a/basedpyright-1.31.4.tar.gz", hash = "sha256:2450deb16530f7c88c1a7da04530a079f9b0b18ae1c71cb6f812825b3b82d0b1", size = 22494467, upload-time = "2025-09-03T13:05:55.817Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e5/40/d1047a5addcade9291685d06ef42a63c1347517018bafd82747af9da0294/basedpyright-1.31.4-py3-none-any.whl", hash = "sha256:055e4a38024bd653be12d6216c1cfdbee49a1096d342b4d5f5b4560f7714b6fc", size = 11731440, upload-time = "2025-09-03T13:05:52.308Z" },
]
[[package]]
name = "cachetools"
version = "6.2.0"
@@ -989,8 +977,8 @@ dependencies = [
[package.dev-dependencies]
dev = [
{ name = "basedpyright" },
{ name = "mypy" },
{ name = "pylance" },
{ name = "pyrefly" },
{ name = "pytest" },
{ name = "pytest-asyncio" },
@@ -1019,8 +1007,8 @@ requires-dist = [
[package.metadata.requires-dev]
dev = [
{ name = "basedpyright", specifier = ">=1.31.4" },
{ name = "mypy", specifier = ">=1.7.0" },
{ name = "pylance", specifier = ">=0.36.0" },
{ name = "pyrefly", specifier = ">=0.33.0" },
{ name = "pytest", specifier = ">=7.4.0" },
{ name = "pytest-asyncio", specifier = ">=0.21.0" },
@@ -1432,19 +1420,84 @@ wheels = [
]
[[package]]
name = "nodejs-wheel-binaries"
version = "22.19.0"
name = "numpy"
version = "2.3.3"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/bd/ca/6033f80b7aebc23cb31ed8b09608b6308c5273c3522aedd043e8a0644d83/nodejs_wheel_binaries-22.19.0.tar.gz", hash = "sha256:e69b97ef443d36a72602f7ed356c6a36323873230f894799f4270a853932fdb3", size = 8060, upload-time = "2025-09-12T10:33:46.935Z" }
sdist = { url = "https://files.pythonhosted.org/packages/d0/19/95b3d357407220ed24c139018d2518fab0a61a948e68286a25f1a4d049ff/numpy-2.3.3.tar.gz", hash = "sha256:ddc7c39727ba62b80dfdbedf400d1c10ddfa8eefbd7ec8dcb118be8b56d31029", size = 20576648, upload-time = "2025-09-09T16:54:12.543Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/93/a2/0d055fd1d8c9a7a971c4db10cf42f3bba57c964beb6cf383ca053f2cdd20/nodejs_wheel_binaries-22.19.0-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:43eca1526455a1fb4cb777095198f7ebe5111a4444749c87f5c2b84645aaa72a", size = 50902454, upload-time = "2025-09-12T10:33:18.3Z" },
{ url = "https://files.pythonhosted.org/packages/b5/f5/446f7b3c5be1d2f5145ffa3c9aac3496e06cdf0f436adeb21a1f95dd79a7/nodejs_wheel_binaries-22.19.0-py2.py3-none-macosx_11_0_x86_64.whl", hash = "sha256:feb06709e1320790d34babdf71d841ec7f28e4c73217d733e7f5023060a86bfc", size = 51837860, upload-time = "2025-09-12T10:33:21.599Z" },
{ url = "https://files.pythonhosted.org/packages/1e/4e/d0a036f04fd0f5dc3ae505430657044b8d9853c33be6b2d122bb171aaca3/nodejs_wheel_binaries-22.19.0-py2.py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db9f5777292491430457c99228d3a267decf12a09d31246f0692391e3513285e", size = 57841528, upload-time = "2025-09-12T10:33:25.433Z" },
{ url = "https://files.pythonhosted.org/packages/e2/11/4811d27819f229cc129925c170db20c12d4f01ad366a0066f06d6eb833cf/nodejs_wheel_binaries-22.19.0-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1392896f1a05a88a8a89b26e182d90fdf3020b4598a047807b91b65731e24c00", size = 58368815, upload-time = "2025-09-12T10:33:29.083Z" },
{ url = "https://files.pythonhosted.org/packages/6e/94/df41416856b980e38a7ff280cfb59f142a77955ccdbec7cc4260d8ab2e78/nodejs_wheel_binaries-22.19.0-py2.py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:9164c876644f949cad665e3ada00f75023e18f381e78a1d7b60ccbbfb4086e73", size = 59690937, upload-time = "2025-09-12T10:33:32.771Z" },
{ url = "https://files.pythonhosted.org/packages/d1/39/8d0d5f84b7616bdc4eca725f5d64a1cfcac3d90cf3f30cae17d12f8e987f/nodejs_wheel_binaries-22.19.0-py2.py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6b4b75166134010bc9cfebd30dc57047796a27049fef3fc22316216d76bc0af7", size = 60751996, upload-time = "2025-09-12T10:33:36.962Z" },
{ url = "https://files.pythonhosted.org/packages/41/93/2d66b5b60055dd1de6e37e35bef563c15e4cafa5cfe3a6990e0ab358e515/nodejs_wheel_binaries-22.19.0-py2.py3-none-win_amd64.whl", hash = "sha256:3f271f5abfc71b052a6b074225eca8c1223a0f7216863439b86feaca814f6e5a", size = 40026140, upload-time = "2025-09-12T10:33:40.33Z" },
{ url = "https://files.pythonhosted.org/packages/a3/46/c9cf7ff7e3c71f07ca8331c939afd09b6e59fc85a2944ea9411e8b29ce50/nodejs_wheel_binaries-22.19.0-py2.py3-none-win_arm64.whl", hash = "sha256:666a355fe0c9bde44a9221cd543599b029045643c8196b8eedb44f28dc192e06", size = 38804500, upload-time = "2025-09-12T10:33:43.302Z" },
{ url = "https://files.pythonhosted.org/packages/7a/45/e80d203ef6b267aa29b22714fb558930b27960a0c5ce3c19c999232bb3eb/numpy-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0ffc4f5caba7dfcbe944ed674b7eef683c7e94874046454bb79ed7ee0236f59d", size = 21259253, upload-time = "2025-09-09T15:56:02.094Z" },
{ url = "https://files.pythonhosted.org/packages/52/18/cf2c648fccf339e59302e00e5f2bc87725a3ce1992f30f3f78c9044d7c43/numpy-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e7e946c7170858a0295f79a60214424caac2ffdb0063d4d79cb681f9aa0aa569", size = 14450980, upload-time = "2025-09-09T15:56:05.926Z" },
{ url = "https://files.pythonhosted.org/packages/93/fb/9af1082bec870188c42a1c239839915b74a5099c392389ff04215dcee812/numpy-2.3.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:cd4260f64bc794c3390a63bf0728220dd1a68170c169088a1e0dfa2fde1be12f", size = 5379709, upload-time = "2025-09-09T15:56:07.95Z" },
{ url = "https://files.pythonhosted.org/packages/75/0f/bfd7abca52bcbf9a4a65abc83fe18ef01ccdeb37bfb28bbd6ad613447c79/numpy-2.3.3-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:f0ddb4b96a87b6728df9362135e764eac3cfa674499943ebc44ce96c478ab125", size = 6913923, upload-time = "2025-09-09T15:56:09.443Z" },
{ url = "https://files.pythonhosted.org/packages/79/55/d69adad255e87ab7afda1caf93ca997859092afeb697703e2f010f7c2e55/numpy-2.3.3-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:afd07d377f478344ec6ca2b8d4ca08ae8bd44706763d1efb56397de606393f48", size = 14589591, upload-time = "2025-09-09T15:56:11.234Z" },
{ url = "https://files.pythonhosted.org/packages/10/a2/010b0e27ddeacab7839957d7a8f00e91206e0c2c47abbb5f35a2630e5387/numpy-2.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bc92a5dedcc53857249ca51ef29f5e5f2f8c513e22cfb90faeb20343b8c6f7a6", size = 16938714, upload-time = "2025-09-09T15:56:14.637Z" },
{ url = "https://files.pythonhosted.org/packages/1c/6b/12ce8ede632c7126eb2762b9e15e18e204b81725b81f35176eac14dc5b82/numpy-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7af05ed4dc19f308e1d9fc759f36f21921eb7bbfc82843eeec6b2a2863a0aefa", size = 16370592, upload-time = "2025-09-09T15:56:17.285Z" },
{ url = "https://files.pythonhosted.org/packages/b4/35/aba8568b2593067bb6a8fe4c52babb23b4c3b9c80e1b49dff03a09925e4a/numpy-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:433bf137e338677cebdd5beac0199ac84712ad9d630b74eceeb759eaa45ddf30", size = 18884474, upload-time = "2025-09-09T15:56:20.943Z" },
{ url = "https://files.pythonhosted.org/packages/45/fa/7f43ba10c77575e8be7b0138d107e4f44ca4a1ef322cd16980ea3e8b8222/numpy-2.3.3-cp311-cp311-win32.whl", hash = "sha256:eb63d443d7b4ffd1e873f8155260d7f58e7e4b095961b01c91062935c2491e57", size = 6599794, upload-time = "2025-09-09T15:56:23.258Z" },
{ url = "https://files.pythonhosted.org/packages/0a/a2/a4f78cb2241fe5664a22a10332f2be886dcdea8784c9f6a01c272da9b426/numpy-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:ec9d249840f6a565f58d8f913bccac2444235025bbb13e9a4681783572ee3caa", size = 13088104, upload-time = "2025-09-09T15:56:25.476Z" },
{ url = "https://files.pythonhosted.org/packages/79/64/e424e975adbd38282ebcd4891661965b78783de893b381cbc4832fb9beb2/numpy-2.3.3-cp311-cp311-win_arm64.whl", hash = "sha256:74c2a948d02f88c11a3c075d9733f1ae67d97c6bdb97f2bb542f980458b257e7", size = 10460772, upload-time = "2025-09-09T15:56:27.679Z" },
{ url = "https://files.pythonhosted.org/packages/51/5d/bb7fc075b762c96329147799e1bcc9176ab07ca6375ea976c475482ad5b3/numpy-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cfdd09f9c84a1a934cde1eec2267f0a43a7cd44b2cca4ff95b7c0d14d144b0bf", size = 20957014, upload-time = "2025-09-09T15:56:29.966Z" },
{ url = "https://files.pythonhosted.org/packages/6b/0e/c6211bb92af26517acd52125a237a92afe9c3124c6a68d3b9f81b62a0568/numpy-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cb32e3cf0f762aee47ad1ddc6672988f7f27045b0783c887190545baba73aa25", size = 14185220, upload-time = "2025-09-09T15:56:32.175Z" },
{ url = "https://files.pythonhosted.org/packages/22/f2/07bb754eb2ede9073f4054f7c0286b0d9d2e23982e090a80d478b26d35ca/numpy-2.3.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:396b254daeb0a57b1fe0ecb5e3cff6fa79a380fa97c8f7781a6d08cd429418fe", size = 5113918, upload-time = "2025-09-09T15:56:34.175Z" },
{ url = "https://files.pythonhosted.org/packages/81/0a/afa51697e9fb74642f231ea36aca80fa17c8fb89f7a82abd5174023c3960/numpy-2.3.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:067e3d7159a5d8f8a0b46ee11148fc35ca9b21f61e3c49fbd0a027450e65a33b", size = 6647922, upload-time = "2025-09-09T15:56:36.149Z" },
{ url = "https://files.pythonhosted.org/packages/5d/f5/122d9cdb3f51c520d150fef6e87df9279e33d19a9611a87c0d2cf78a89f4/numpy-2.3.3-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1c02d0629d25d426585fb2e45a66154081b9fa677bc92a881ff1d216bc9919a8", size = 14281991, upload-time = "2025-09-09T15:56:40.548Z" },
{ url = "https://files.pythonhosted.org/packages/51/64/7de3c91e821a2debf77c92962ea3fe6ac2bc45d0778c1cbe15d4fce2fd94/numpy-2.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d9192da52b9745f7f0766531dcfa978b7763916f158bb63bdb8a1eca0068ab20", size = 16641643, upload-time = "2025-09-09T15:56:43.343Z" },
{ url = "https://files.pythonhosted.org/packages/30/e4/961a5fa681502cd0d68907818b69f67542695b74e3ceaa513918103b7e80/numpy-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:cd7de500a5b66319db419dc3c345244404a164beae0d0937283b907d8152e6ea", size = 16056787, upload-time = "2025-09-09T15:56:46.141Z" },
{ url = "https://files.pythonhosted.org/packages/99/26/92c912b966e47fbbdf2ad556cb17e3a3088e2e1292b9833be1dfa5361a1a/numpy-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:93d4962d8f82af58f0b2eb85daaf1b3ca23fe0a85d0be8f1f2b7bb46034e56d7", size = 18579598, upload-time = "2025-09-09T15:56:49.844Z" },
{ url = "https://files.pythonhosted.org/packages/17/b6/fc8f82cb3520768718834f310c37d96380d9dc61bfdaf05fe5c0b7653e01/numpy-2.3.3-cp312-cp312-win32.whl", hash = "sha256:5534ed6b92f9b7dca6c0a19d6df12d41c68b991cef051d108f6dbff3babc4ebf", size = 6320800, upload-time = "2025-09-09T15:56:52.499Z" },
{ url = "https://files.pythonhosted.org/packages/32/ee/de999f2625b80d043d6d2d628c07d0d5555a677a3cf78fdf868d409b8766/numpy-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:497d7cad08e7092dba36e3d296fe4c97708c93daf26643a1ae4b03f6294d30eb", size = 12786615, upload-time = "2025-09-09T15:56:54.422Z" },
{ url = "https://files.pythonhosted.org/packages/49/6e/b479032f8a43559c383acb20816644f5f91c88f633d9271ee84f3b3a996c/numpy-2.3.3-cp312-cp312-win_arm64.whl", hash = "sha256:ca0309a18d4dfea6fc6262a66d06c26cfe4640c3926ceec90e57791a82b6eee5", size = 10195936, upload-time = "2025-09-09T15:56:56.541Z" },
{ url = "https://files.pythonhosted.org/packages/7d/b9/984c2b1ee61a8b803bf63582b4ac4242cf76e2dbd663efeafcb620cc0ccb/numpy-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f5415fb78995644253370985342cd03572ef8620b934da27d77377a2285955bf", size = 20949588, upload-time = "2025-09-09T15:56:59.087Z" },
{ url = "https://files.pythonhosted.org/packages/a6/e4/07970e3bed0b1384d22af1e9912527ecbeb47d3b26e9b6a3bced068b3bea/numpy-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d00de139a3324e26ed5b95870ce63be7ec7352171bc69a4cf1f157a48e3eb6b7", size = 14177802, upload-time = "2025-09-09T15:57:01.73Z" },
{ url = "https://files.pythonhosted.org/packages/35/c7/477a83887f9de61f1203bad89cf208b7c19cc9fef0cebef65d5a1a0619f2/numpy-2.3.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:9dc13c6a5829610cc07422bc74d3ac083bd8323f14e2827d992f9e52e22cd6a6", size = 5106537, upload-time = "2025-09-09T15:57:03.765Z" },
{ url = "https://files.pythonhosted.org/packages/52/47/93b953bd5866a6f6986344d045a207d3f1cfbad99db29f534ea9cee5108c/numpy-2.3.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:d79715d95f1894771eb4e60fb23f065663b2298f7d22945d66877aadf33d00c7", size = 6640743, upload-time = "2025-09-09T15:57:07.921Z" },
{ url = "https://files.pythonhosted.org/packages/23/83/377f84aaeb800b64c0ef4de58b08769e782edcefa4fea712910b6f0afd3c/numpy-2.3.3-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:952cfd0748514ea7c3afc729a0fc639e61655ce4c55ab9acfab14bda4f402b4c", size = 14278881, upload-time = "2025-09-09T15:57:11.349Z" },
{ url = "https://files.pythonhosted.org/packages/9a/a5/bf3db6e66c4b160d6ea10b534c381a1955dfab34cb1017ea93aa33c70ed3/numpy-2.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5b83648633d46f77039c29078751f80da65aa64d5622a3cd62aaef9d835b6c93", size = 16636301, upload-time = "2025-09-09T15:57:14.245Z" },
{ url = "https://files.pythonhosted.org/packages/a2/59/1287924242eb4fa3f9b3a2c30400f2e17eb2707020d1c5e3086fe7330717/numpy-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b001bae8cea1c7dfdb2ae2b017ed0a6f2102d7a70059df1e338e307a4c78a8ae", size = 16053645, upload-time = "2025-09-09T15:57:16.534Z" },
{ url = "https://files.pythonhosted.org/packages/e6/93/b3d47ed882027c35e94ac2320c37e452a549f582a5e801f2d34b56973c97/numpy-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8e9aced64054739037d42fb84c54dd38b81ee238816c948c8f3ed134665dcd86", size = 18578179, upload-time = "2025-09-09T15:57:18.883Z" },
{ url = "https://files.pythonhosted.org/packages/20/d9/487a2bccbf7cc9d4bfc5f0f197761a5ef27ba870f1e3bbb9afc4bbe3fcc2/numpy-2.3.3-cp313-cp313-win32.whl", hash = "sha256:9591e1221db3f37751e6442850429b3aabf7026d3b05542d102944ca7f00c8a8", size = 6312250, upload-time = "2025-09-09T15:57:21.296Z" },
{ url = "https://files.pythonhosted.org/packages/1b/b5/263ebbbbcede85028f30047eab3d58028d7ebe389d6493fc95ae66c636ab/numpy-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f0dadeb302887f07431910f67a14d57209ed91130be0adea2f9793f1a4f817cf", size = 12783269, upload-time = "2025-09-09T15:57:23.034Z" },
{ url = "https://files.pythonhosted.org/packages/fa/75/67b8ca554bbeaaeb3fac2e8bce46967a5a06544c9108ec0cf5cece559b6c/numpy-2.3.3-cp313-cp313-win_arm64.whl", hash = "sha256:3c7cf302ac6e0b76a64c4aecf1a09e51abd9b01fc7feee80f6c43e3ab1b1dbc5", size = 10195314, upload-time = "2025-09-09T15:57:25.045Z" },
{ url = "https://files.pythonhosted.org/packages/11/d0/0d1ddec56b162042ddfafeeb293bac672de9b0cfd688383590090963720a/numpy-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:eda59e44957d272846bb407aad19f89dc6f58fecf3504bd144f4c5cf81a7eacc", size = 21048025, upload-time = "2025-09-09T15:57:27.257Z" },
{ url = "https://files.pythonhosted.org/packages/36/9e/1996ca6b6d00415b6acbdd3c42f7f03ea256e2c3f158f80bd7436a8a19f3/numpy-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:823d04112bc85ef5c4fda73ba24e6096c8f869931405a80aa8b0e604510a26bc", size = 14301053, upload-time = "2025-09-09T15:57:30.077Z" },
{ url = "https://files.pythonhosted.org/packages/05/24/43da09aa764c68694b76e84b3d3f0c44cb7c18cdc1ba80e48b0ac1d2cd39/numpy-2.3.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:40051003e03db4041aa325da2a0971ba41cf65714e65d296397cc0e32de6018b", size = 5229444, upload-time = "2025-09-09T15:57:32.733Z" },
{ url = "https://files.pythonhosted.org/packages/bc/14/50ffb0f22f7218ef8af28dd089f79f68289a7a05a208db9a2c5dcbe123c1/numpy-2.3.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:6ee9086235dd6ab7ae75aba5662f582a81ced49f0f1c6de4260a78d8f2d91a19", size = 6738039, upload-time = "2025-09-09T15:57:34.328Z" },
{ url = "https://files.pythonhosted.org/packages/55/52/af46ac0795e09657d45a7f4db961917314377edecf66db0e39fa7ab5c3d3/numpy-2.3.3-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:94fcaa68757c3e2e668ddadeaa86ab05499a70725811e582b6a9858dd472fb30", size = 14352314, upload-time = "2025-09-09T15:57:36.255Z" },
{ url = "https://files.pythonhosted.org/packages/a7/b1/dc226b4c90eb9f07a3fff95c2f0db3268e2e54e5cce97c4ac91518aee71b/numpy-2.3.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da1a74b90e7483d6ce5244053399a614b1d6b7bc30a60d2f570e5071f8959d3e", size = 16701722, upload-time = "2025-09-09T15:57:38.622Z" },
{ url = "https://files.pythonhosted.org/packages/9d/9d/9d8d358f2eb5eced14dba99f110d83b5cd9a4460895230f3b396ad19a323/numpy-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:2990adf06d1ecee3b3dcbb4977dfab6e9f09807598d647f04d385d29e7a3c3d3", size = 16132755, upload-time = "2025-09-09T15:57:41.16Z" },
{ url = "https://files.pythonhosted.org/packages/b6/27/b3922660c45513f9377b3fb42240bec63f203c71416093476ec9aa0719dc/numpy-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ed635ff692483b8e3f0fcaa8e7eb8a75ee71aa6d975388224f70821421800cea", size = 18651560, upload-time = "2025-09-09T15:57:43.459Z" },
{ url = "https://files.pythonhosted.org/packages/5b/8e/3ab61a730bdbbc201bb245a71102aa609f0008b9ed15255500a99cd7f780/numpy-2.3.3-cp313-cp313t-win32.whl", hash = "sha256:a333b4ed33d8dc2b373cc955ca57babc00cd6f9009991d9edc5ddbc1bac36bcd", size = 6442776, upload-time = "2025-09-09T15:57:45.793Z" },
{ url = "https://files.pythonhosted.org/packages/1c/3a/e22b766b11f6030dc2decdeff5c2fb1610768055603f9f3be88b6d192fb2/numpy-2.3.3-cp313-cp313t-win_amd64.whl", hash = "sha256:4384a169c4d8f97195980815d6fcad04933a7e1ab3b530921c3fef7a1c63426d", size = 12927281, upload-time = "2025-09-09T15:57:47.492Z" },
{ url = "https://files.pythonhosted.org/packages/7b/42/c2e2bc48c5e9b2a83423f99733950fbefd86f165b468a3d85d52b30bf782/numpy-2.3.3-cp313-cp313t-win_arm64.whl", hash = "sha256:75370986cc0bc66f4ce5110ad35aae6d182cc4ce6433c40ad151f53690130bf1", size = 10265275, upload-time = "2025-09-09T15:57:49.647Z" },
{ url = "https://files.pythonhosted.org/packages/6b/01/342ad585ad82419b99bcf7cebe99e61da6bedb89e213c5fd71acc467faee/numpy-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cd052f1fa6a78dee696b58a914b7229ecfa41f0a6d96dc663c1220a55e137593", size = 20951527, upload-time = "2025-09-09T15:57:52.006Z" },
{ url = "https://files.pythonhosted.org/packages/ef/d8/204e0d73fc1b7a9ee80ab1fe1983dd33a4d64a4e30a05364b0208e9a241a/numpy-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:414a97499480067d305fcac9716c29cf4d0d76db6ebf0bf3cbce666677f12652", size = 14186159, upload-time = "2025-09-09T15:57:54.407Z" },
{ url = "https://files.pythonhosted.org/packages/22/af/f11c916d08f3a18fb8ba81ab72b5b74a6e42ead4c2846d270eb19845bf74/numpy-2.3.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:50a5fe69f135f88a2be9b6ca0481a68a136f6febe1916e4920e12f1a34e708a7", size = 5114624, upload-time = "2025-09-09T15:57:56.5Z" },
{ url = "https://files.pythonhosted.org/packages/fb/11/0ed919c8381ac9d2ffacd63fd1f0c34d27e99cab650f0eb6f110e6ae4858/numpy-2.3.3-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:b912f2ed2b67a129e6a601e9d93d4fa37bef67e54cac442a2f588a54afe5c67a", size = 6642627, upload-time = "2025-09-09T15:57:58.206Z" },
{ url = "https://files.pythonhosted.org/packages/ee/83/deb5f77cb0f7ba6cb52b91ed388b47f8f3c2e9930d4665c600408d9b90b9/numpy-2.3.3-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9e318ee0596d76d4cb3d78535dc005fa60e5ea348cd131a51e99d0bdbe0b54fe", size = 14296926, upload-time = "2025-09-09T15:58:00.035Z" },
{ url = "https://files.pythonhosted.org/packages/77/cc/70e59dcb84f2b005d4f306310ff0a892518cc0c8000a33d0e6faf7ca8d80/numpy-2.3.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ce020080e4a52426202bdb6f7691c65bb55e49f261f31a8f506c9f6bc7450421", size = 16638958, upload-time = "2025-09-09T15:58:02.738Z" },
{ url = "https://files.pythonhosted.org/packages/b6/5a/b2ab6c18b4257e099587d5b7f903317bd7115333ad8d4ec4874278eafa61/numpy-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e6687dc183aa55dae4a705b35f9c0f8cb178bcaa2f029b241ac5356221d5c021", size = 16071920, upload-time = "2025-09-09T15:58:05.029Z" },
{ url = "https://files.pythonhosted.org/packages/b8/f1/8b3fdc44324a259298520dd82147ff648979bed085feeacc1250ef1656c0/numpy-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d8f3b1080782469fdc1718c4ed1d22549b5fb12af0d57d35e992158a772a37cf", size = 18577076, upload-time = "2025-09-09T15:58:07.745Z" },
{ url = "https://files.pythonhosted.org/packages/f0/a1/b87a284fb15a42e9274e7fcea0dad259d12ddbf07c1595b26883151ca3b4/numpy-2.3.3-cp314-cp314-win32.whl", hash = "sha256:cb248499b0bc3be66ebd6578b83e5acacf1d6cb2a77f2248ce0e40fbec5a76d0", size = 6366952, upload-time = "2025-09-09T15:58:10.096Z" },
{ url = "https://files.pythonhosted.org/packages/70/5f/1816f4d08f3b8f66576d8433a66f8fa35a5acfb3bbd0bf6c31183b003f3d/numpy-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:691808c2b26b0f002a032c73255d0bd89751425f379f7bcd22d140db593a96e8", size = 12919322, upload-time = "2025-09-09T15:58:12.138Z" },
{ url = "https://files.pythonhosted.org/packages/8c/de/072420342e46a8ea41c324a555fa90fcc11637583fb8df722936aed1736d/numpy-2.3.3-cp314-cp314-win_arm64.whl", hash = "sha256:9ad12e976ca7b10f1774b03615a2a4bab8addce37ecc77394d8e986927dc0dfe", size = 10478630, upload-time = "2025-09-09T15:58:14.64Z" },
{ url = "https://files.pythonhosted.org/packages/d5/df/ee2f1c0a9de7347f14da5dd3cd3c3b034d1b8607ccb6883d7dd5c035d631/numpy-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9cc48e09feb11e1db00b320e9d30a4151f7369afb96bd0e48d942d09da3a0d00", size = 21047987, upload-time = "2025-09-09T15:58:16.889Z" },
{ url = "https://files.pythonhosted.org/packages/d6/92/9453bdc5a4e9e69cf4358463f25e8260e2ffc126d52e10038b9077815989/numpy-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:901bf6123879b7f251d3631967fd574690734236075082078e0571977c6a8e6a", size = 14301076, upload-time = "2025-09-09T15:58:20.343Z" },
{ url = "https://files.pythonhosted.org/packages/13/77/1447b9eb500f028bb44253105bd67534af60499588a5149a94f18f2ca917/numpy-2.3.3-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:7f025652034199c301049296b59fa7d52c7e625017cae4c75d8662e377bf487d", size = 5229491, upload-time = "2025-09-09T15:58:22.481Z" },
{ url = "https://files.pythonhosted.org/packages/3d/f9/d72221b6ca205f9736cb4b2ce3b002f6e45cd67cd6a6d1c8af11a2f0b649/numpy-2.3.3-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:533ca5f6d325c80b6007d4d7fb1984c303553534191024ec6a524a4c92a5935a", size = 6737913, upload-time = "2025-09-09T15:58:24.569Z" },
{ url = "https://files.pythonhosted.org/packages/3c/5f/d12834711962ad9c46af72f79bb31e73e416ee49d17f4c797f72c96b6ca5/numpy-2.3.3-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0edd58682a399824633b66885d699d7de982800053acf20be1eaa46d92009c54", size = 14352811, upload-time = "2025-09-09T15:58:26.416Z" },
{ url = "https://files.pythonhosted.org/packages/a1/0d/fdbec6629d97fd1bebed56cd742884e4eead593611bbe1abc3eb40d304b2/numpy-2.3.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:367ad5d8fbec5d9296d18478804a530f1191e24ab4d75ab408346ae88045d25e", size = 16702689, upload-time = "2025-09-09T15:58:28.831Z" },
{ url = "https://files.pythonhosted.org/packages/9b/09/0a35196dc5575adde1eb97ddfbc3e1687a814f905377621d18ca9bc2b7dd/numpy-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8f6ac61a217437946a1fa48d24c47c91a0c4f725237871117dea264982128097", size = 16133855, upload-time = "2025-09-09T15:58:31.349Z" },
{ url = "https://files.pythonhosted.org/packages/7a/ca/c9de3ea397d576f1b6753eaa906d4cdef1bf97589a6d9825a349b4729cc2/numpy-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:179a42101b845a816d464b6fe9a845dfaf308fdfc7925387195570789bb2c970", size = 18652520, upload-time = "2025-09-09T15:58:33.762Z" },
{ url = "https://files.pythonhosted.org/packages/fd/c2/e5ed830e08cd0196351db55db82f65bc0ab05da6ef2b72a836dcf1936d2f/numpy-2.3.3-cp314-cp314t-win32.whl", hash = "sha256:1250c5d3d2562ec4174bce2e3a1523041595f9b651065e4a4473f5f48a6bc8a5", size = 6515371, upload-time = "2025-09-09T15:58:36.04Z" },
{ url = "https://files.pythonhosted.org/packages/47/c7/b0f6b5b67f6788a0725f744496badbb604d226bf233ba716683ebb47b570/numpy-2.3.3-cp314-cp314t-win_amd64.whl", hash = "sha256:b37a0b2e5935409daebe82c1e42274d30d9dd355852529eab91dab8dcca7419f", size = 13112576, upload-time = "2025-09-09T15:58:37.927Z" },
{ url = "https://files.pythonhosted.org/packages/06/b9/33bba5ff6fb679aa0b1f8a07e853f002a6b04b9394db3069a1270a7784ca/numpy-2.3.3-cp314-cp314t-win_arm64.whl", hash = "sha256:78c9f6560dc7e6b3990e32df7ea1a50bbd0e2a111e05209963f5ddcab7073b0b", size = 10545953, upload-time = "2025-09-09T15:58:40.576Z" },
{ url = "https://files.pythonhosted.org/packages/b8/f2/7e0a37cfced2644c9563c529f29fa28acbd0960dde32ece683aafa6f4949/numpy-2.3.3-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1e02c7159791cd481e1e6d5ddd766b62a4d5acf8df4d4d1afe35ee9c5c33a41e", size = 21131019, upload-time = "2025-09-09T15:58:42.838Z" },
{ url = "https://files.pythonhosted.org/packages/1a/7e/3291f505297ed63831135a6cc0f474da0c868a1f31b0dd9a9f03a7a0d2ed/numpy-2.3.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:dca2d0fc80b3893ae72197b39f69d55a3cd8b17ea1b50aa4c62de82419936150", size = 14376288, upload-time = "2025-09-09T15:58:45.425Z" },
{ url = "https://files.pythonhosted.org/packages/bf/4b/ae02e985bdeee73d7b5abdefeb98aef1207e96d4c0621ee0cf228ddfac3c/numpy-2.3.3-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:99683cbe0658f8271b333a1b1b4bb3173750ad59c0c61f5bbdc5b318918fffe3", size = 5305425, upload-time = "2025-09-09T15:58:48.6Z" },
{ url = "https://files.pythonhosted.org/packages/8b/eb/9df215d6d7250db32007941500dc51c48190be25f2401d5b2b564e467247/numpy-2.3.3-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:d9d537a39cc9de668e5cd0e25affb17aec17b577c6b3ae8a3d866b479fbe88d0", size = 6819053, upload-time = "2025-09-09T15:58:50.401Z" },
{ url = "https://files.pythonhosted.org/packages/57/62/208293d7d6b2a8998a4a1f23ac758648c3c32182d4ce4346062018362e29/numpy-2.3.3-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8596ba2f8af5f93b01d97563832686d20206d303024777f6dfc2e7c7c3f1850e", size = 14420354, upload-time = "2025-09-09T15:58:52.704Z" },
{ url = "https://files.pythonhosted.org/packages/ed/0c/8e86e0ff7072e14a71b4c6af63175e40d1e7e933ce9b9e9f765a95b4e0c3/numpy-2.3.3-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e1ec5615b05369925bd1125f27df33f3b6c8bc10d788d5999ecd8769a1fa04db", size = 16760413, upload-time = "2025-09-09T15:58:55.027Z" },
{ url = "https://files.pythonhosted.org/packages/af/11/0cc63f9f321ccf63886ac203336777140011fb669e739da36d8db3c53b98/numpy-2.3.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:2e267c7da5bf7309670523896df97f93f6e469fb931161f483cd6882b3b1a5dc", size = 12971844, upload-time = "2025-09-09T15:58:57.359Z" },
]
[[package]]
@@ -1835,6 +1888,42 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/5a/dd/464bd739bacb3b745a1c93bc15f20f0b1e27f0a64ec693367794b398673b/psycopg_binary-3.2.10-cp314-cp314-win_amd64.whl", hash = "sha256:d5c6a66a76022af41970bf19f51bc6bf87bd10165783dd1d40484bfd87d6b382", size = 2973554, upload-time = "2025-09-08T09:12:05.884Z" },
]
[[package]]
name = "pyarrow"
version = "21.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ef/c2/ea068b8f00905c06329a3dfcd40d0fcc2b7d0f2e355bdb25b65e0a0e4cd4/pyarrow-21.0.0.tar.gz", hash = "sha256:5051f2dccf0e283ff56335760cbc8622cf52264d67e359d5569541ac11b6d5bc", size = 1133487, upload-time = "2025-07-18T00:57:31.761Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/94/dc/80564a3071a57c20b7c32575e4a0120e8a330ef487c319b122942d665960/pyarrow-21.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c077f48aab61738c237802836fc3844f85409a46015635198761b0d6a688f87b", size = 31243234, upload-time = "2025-07-18T00:55:03.812Z" },
{ url = "https://files.pythonhosted.org/packages/ea/cc/3b51cb2db26fe535d14f74cab4c79b191ed9a8cd4cbba45e2379b5ca2746/pyarrow-21.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:689f448066781856237eca8d1975b98cace19b8dd2ab6145bf49475478bcaa10", size = 32714370, upload-time = "2025-07-18T00:55:07.495Z" },
{ url = "https://files.pythonhosted.org/packages/24/11/a4431f36d5ad7d83b87146f515c063e4d07ef0b7240876ddb885e6b44f2e/pyarrow-21.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:479ee41399fcddc46159a551705b89c05f11e8b8cb8e968f7fec64f62d91985e", size = 41135424, upload-time = "2025-07-18T00:55:11.461Z" },
{ url = "https://files.pythonhosted.org/packages/74/dc/035d54638fc5d2971cbf1e987ccd45f1091c83bcf747281cf6cc25e72c88/pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:40ebfcb54a4f11bcde86bc586cbd0272bac0d516cfa539c799c2453768477569", size = 42823810, upload-time = "2025-07-18T00:55:16.301Z" },
{ url = "https://files.pythonhosted.org/packages/2e/3b/89fced102448a9e3e0d4dded1f37fa3ce4700f02cdb8665457fcc8015f5b/pyarrow-21.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8d58d8497814274d3d20214fbb24abcad2f7e351474357d552a8d53bce70c70e", size = 43391538, upload-time = "2025-07-18T00:55:23.82Z" },
{ url = "https://files.pythonhosted.org/packages/fb/bb/ea7f1bd08978d39debd3b23611c293f64a642557e8141c80635d501e6d53/pyarrow-21.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:585e7224f21124dd57836b1530ac8f2df2afc43c861d7bf3d58a4870c42ae36c", size = 45120056, upload-time = "2025-07-18T00:55:28.231Z" },
{ url = "https://files.pythonhosted.org/packages/6e/0b/77ea0600009842b30ceebc3337639a7380cd946061b620ac1a2f3cb541e2/pyarrow-21.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:555ca6935b2cbca2c0e932bedd853e9bc523098c39636de9ad4693b5b1df86d6", size = 26220568, upload-time = "2025-07-18T00:55:32.122Z" },
{ url = "https://files.pythonhosted.org/packages/ca/d4/d4f817b21aacc30195cf6a46ba041dd1be827efa4a623cc8bf39a1c2a0c0/pyarrow-21.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3a302f0e0963db37e0a24a70c56cf91a4faa0bca51c23812279ca2e23481fccd", size = 31160305, upload-time = "2025-07-18T00:55:35.373Z" },
{ url = "https://files.pythonhosted.org/packages/a2/9c/dcd38ce6e4b4d9a19e1d36914cb8e2b1da4e6003dd075474c4cfcdfe0601/pyarrow-21.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:b6b27cf01e243871390474a211a7922bfbe3bda21e39bc9160daf0da3fe48876", size = 32684264, upload-time = "2025-07-18T00:55:39.303Z" },
{ url = "https://files.pythonhosted.org/packages/4f/74/2a2d9f8d7a59b639523454bec12dba35ae3d0a07d8ab529dc0809f74b23c/pyarrow-21.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e72a8ec6b868e258a2cd2672d91f2860ad532d590ce94cdf7d5e7ec674ccf03d", size = 41108099, upload-time = "2025-07-18T00:55:42.889Z" },
{ url = "https://files.pythonhosted.org/packages/ad/90/2660332eeb31303c13b653ea566a9918484b6e4d6b9d2d46879a33ab0622/pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b7ae0bbdc8c6674259b25bef5d2a1d6af5d39d7200c819cf99e07f7dfef1c51e", size = 42829529, upload-time = "2025-07-18T00:55:47.069Z" },
{ url = "https://files.pythonhosted.org/packages/33/27/1a93a25c92717f6aa0fca06eb4700860577d016cd3ae51aad0e0488ac899/pyarrow-21.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:58c30a1729f82d201627c173d91bd431db88ea74dcaa3885855bc6203e433b82", size = 43367883, upload-time = "2025-07-18T00:55:53.069Z" },
{ url = "https://files.pythonhosted.org/packages/05/d9/4d09d919f35d599bc05c6950095e358c3e15148ead26292dfca1fb659b0c/pyarrow-21.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:072116f65604b822a7f22945a7a6e581cfa28e3454fdcc6939d4ff6090126623", size = 45133802, upload-time = "2025-07-18T00:55:57.714Z" },
{ url = "https://files.pythonhosted.org/packages/71/30/f3795b6e192c3ab881325ffe172e526499eb3780e306a15103a2764916a2/pyarrow-21.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:cf56ec8b0a5c8c9d7021d6fd754e688104f9ebebf1bf4449613c9531f5346a18", size = 26203175, upload-time = "2025-07-18T00:56:01.364Z" },
{ url = "https://files.pythonhosted.org/packages/16/ca/c7eaa8e62db8fb37ce942b1ea0c6d7abfe3786ca193957afa25e71b81b66/pyarrow-21.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e99310a4ebd4479bcd1964dff9e14af33746300cb014aa4a3781738ac63baf4a", size = 31154306, upload-time = "2025-07-18T00:56:04.42Z" },
{ url = "https://files.pythonhosted.org/packages/ce/e8/e87d9e3b2489302b3a1aea709aaca4b781c5252fcb812a17ab6275a9a484/pyarrow-21.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:d2fe8e7f3ce329a71b7ddd7498b3cfac0eeb200c2789bd840234f0dc271a8efe", size = 32680622, upload-time = "2025-07-18T00:56:07.505Z" },
{ url = "https://files.pythonhosted.org/packages/84/52/79095d73a742aa0aba370c7942b1b655f598069489ab387fe47261a849e1/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:f522e5709379d72fb3da7785aa489ff0bb87448a9dc5a75f45763a795a089ebd", size = 41104094, upload-time = "2025-07-18T00:56:10.994Z" },
{ url = "https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:69cbbdf0631396e9925e048cfa5bce4e8c3d3b41562bbd70c685a8eb53a91e61", size = 42825576, upload-time = "2025-07-18T00:56:15.569Z" },
{ url = "https://files.pythonhosted.org/packages/b3/62/0f29de6e0a1e33518dec92c65be0351d32d7ca351e51ec5f4f837a9aab91/pyarrow-21.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:731c7022587006b755d0bdb27626a1a3bb004bb56b11fb30d98b6c1b4718579d", size = 43368342, upload-time = "2025-07-18T00:56:19.531Z" },
{ url = "https://files.pythonhosted.org/packages/90/c7/0fa1f3f29cf75f339768cc698c8ad4ddd2481c1742e9741459911c9ac477/pyarrow-21.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:dc56bc708f2d8ac71bd1dcb927e458c93cec10b98eb4120206a4091db7b67b99", size = 45131218, upload-time = "2025-07-18T00:56:23.347Z" },
{ url = "https://files.pythonhosted.org/packages/01/63/581f2076465e67b23bc5a37d4a2abff8362d389d29d8105832e82c9c811c/pyarrow-21.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:186aa00bca62139f75b7de8420f745f2af12941595bbbfa7ed3870ff63e25636", size = 26087551, upload-time = "2025-07-18T00:56:26.758Z" },
{ url = "https://files.pythonhosted.org/packages/c9/ab/357d0d9648bb8241ee7348e564f2479d206ebe6e1c47ac5027c2e31ecd39/pyarrow-21.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:a7a102574faa3f421141a64c10216e078df467ab9576684d5cd696952546e2da", size = 31290064, upload-time = "2025-07-18T00:56:30.214Z" },
{ url = "https://files.pythonhosted.org/packages/3f/8a/5685d62a990e4cac2043fc76b4661bf38d06efed55cf45a334b455bd2759/pyarrow-21.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:1e005378c4a2c6db3ada3ad4c217b381f6c886f0a80d6a316fe586b90f77efd7", size = 32727837, upload-time = "2025-07-18T00:56:33.935Z" },
{ url = "https://files.pythonhosted.org/packages/fc/de/c0828ee09525c2bafefd3e736a248ebe764d07d0fd762d4f0929dbc516c9/pyarrow-21.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:65f8e85f79031449ec8706b74504a316805217b35b6099155dd7e227eef0d4b6", size = 41014158, upload-time = "2025-07-18T00:56:37.528Z" },
{ url = "https://files.pythonhosted.org/packages/6e/26/a2865c420c50b7a3748320b614f3484bfcde8347b2639b2b903b21ce6a72/pyarrow-21.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:3a81486adc665c7eb1a2bde0224cfca6ceaba344a82a971ef059678417880eb8", size = 42667885, upload-time = "2025-07-18T00:56:41.483Z" },
{ url = "https://files.pythonhosted.org/packages/0a/f9/4ee798dc902533159250fb4321267730bc0a107d8c6889e07c3add4fe3a5/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fc0d2f88b81dcf3ccf9a6ae17f89183762c8a94a5bdcfa09e05cfe413acf0503", size = 43276625, upload-time = "2025-07-18T00:56:48.002Z" },
{ url = "https://files.pythonhosted.org/packages/5a/da/e02544d6997037a4b0d22d8e5f66bc9315c3671371a8b18c79ade1cefe14/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6299449adf89df38537837487a4f8d3bd91ec94354fdd2a7d30bc11c48ef6e79", size = 44951890, upload-time = "2025-07-18T00:56:52.568Z" },
{ url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" },
]
[[package]]
name = "pycparser"
version = "2.23"
@@ -1960,6 +2049,24 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" },
]
[[package]]
name = "pylance"
version = "0.36.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy" },
{ name = "pyarrow" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/09/13/f7f029d12a3dfdc9f3059d77b3999d40f9cc064ba85fef885a08bf65dcb2/pylance-0.36.0-cp39-abi3-macosx_10_15_x86_64.whl", hash = "sha256:160ed088dc5fb63a71c8c96640d43ea58464f64bca8aa23b0337b1a96fd47b79", size = 43403867, upload-time = "2025-09-12T20:29:25.507Z" },
{ url = "https://files.pythonhosted.org/packages/95/95/defad18786260653b33d5ef8223736c0e481861c8d33311756bd471468ad/pylance-0.36.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:ce43ad002b4e67ffb1a33925d05d472bbde77c57a5e84aca1728faa9ace0c086", size = 39777498, upload-time = "2025-09-12T20:27:02.906Z" },
{ url = "https://files.pythonhosted.org/packages/19/33/7080ed4e45648d8c803a49cd5a206eb95176ef9dc06bff26748ec2109c65/pylance-0.36.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ad7b168b0d4b7864be6040bebaf6d9a3959e76a190ff401a84b165b75eade96", size = 41819489, upload-time = "2025-09-12T20:17:06.37Z" },
{ url = "https://files.pythonhosted.org/packages/29/9a/0c572994d96e03e70481dafb2b062033a9ce24beb5ac6045f00f013ca57c/pylance-0.36.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:353deeb7b19be505db490258b5f2fc897efd4a45255fa0d51455662e01ad59ab", size = 45366480, upload-time = "2025-09-12T20:19:53.924Z" },
{ url = "https://files.pythonhosted.org/packages/fe/82/a74f0436b6a983c2798d1f44699352cd98c42bc335781ece98a878cf63fb/pylance-0.36.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:9cd963fc22257591d1daf281fa2369e05299d78950cb11980aa099d7cbacdf00", size = 41833322, upload-time = "2025-09-12T20:17:40.784Z" },
{ url = "https://files.pythonhosted.org/packages/a8/f2/d28fa3487992c3bd46af6838da13cf9a00be24fcf4cf928f77feec52d8d6/pylance-0.36.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:40117569a87379e08ed12eccac658999158f81df946f2ed02693b77776b57597", size = 45347065, upload-time = "2025-09-12T20:19:26.435Z" },
{ url = "https://files.pythonhosted.org/packages/ff/ab/e7fc302950f1c6815a6e832d052d0860130374bfe4bd482b075299dc8384/pylance-0.36.0-cp39-abi3-win_amd64.whl", hash = "sha256:a2930738192e5075220bc38c8a58ff4e48a71d53b3ca2a577ffce0318609cac0", size = 46348996, upload-time = "2025-09-12T20:36:04.663Z" },
]
[[package]]
name = "pyrefly"
version = "0.33.0"