@@ -3,7 +3,11 @@
   "allow": [
     "mcp__context7__resolve-library-id",
     "mcp__context7__get-library-docs",
-    "mcp__sequential-thinking__sequentialthinking"
+    "mcp__sequential-thinking__sequentialthinking",
+    "WebSearch",
+    "Bash(cat:*)",
+    "mcp__firecrawl__firecrawl_search",
+    "mcp__firecrawl__firecrawl_scrape"
   ],
   "deny": [],
   "ask": []

.env (2 changed lines)

@@ -8,6 +8,8 @@ WCD_URL=http://weaviate.yo # or http://localhost:8080
# No API key required for local unless you enabled local auth
# WCD_API_KEY=
# API Keys (only if not using local/self-hosted services)
R2R_API_URL=http://r2r.lab
R2R_API_KEY=
FIRECRAWL_API_KEY=dummy-key
OPENWEBUI_API_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6IjlmNjEwODg2LWRhM2MtNDQ4YS05OWE0LTYyZGEyZjIyZjJiNiJ9.W-dqabcE4F-LQ--k2yrJM_KEBDB-wi1CmoahlN1tQbY
OPENWEBUI_API_URL=http://chat.lab

.repomixignore (new file, 9 lines)

@@ -0,0 +1,9 @@
# Add patterns to ignore here, one per line
# Example:
# *.log
# tmp/
.claude/
.venv/
docs/
CLAUDE.md
repomix**

.vscode/settings.json (new file, vendored, 7 lines)

@@ -0,0 +1,7 @@
{
  "chatgpt.openOnStartup": true,
  "python.languageServer": "None",
  "python.analysis.typeCheckingMode": "off",
  "python.defaultInterpreterPath": "./.venv/bin/python",
  "python.terminal.activateEnvironment": true
}

AGENTS.md (new file, 135 lines)

@@ -0,0 +1,135 @@
# Claude Code Instructions - Python Development Standards

## Environment Setup
- **Virtual Environment**: Always activate the virtual environment at `.venv/bin/activate` before any operations
- **Package Management**: Use `uv` exclusively for package management (not pip, poetry, or conda)
- **Pre-execution**: Always activate the venv before running make targets or any Python commands

## Type Safety & Linting Standards

### Forbidden Practices
- **Type Suppression**: Using `# type: ignore` is strictly forbidden
- **Generic Types**: Using `Any` as a type is forbidden unless absolutely no other option exists
- **Lint Circumvention**: Never skip, circumvent, bypass, avoid, ignore, or sidestep pre-commit checks and lint errors
- **Pre-commit Bypass**: Using `--no-verify` to skip pre-commit checks is absolutely forbidden

### Type Annotation Requirements
- Use modern typing with specific, descriptive types
- Prefer union types (`str | None`) over `Optional[str]` (Python 3.10+)
- Use generic type parameters for collections: `list[str]`, `dict[str, int]`
- Leverage `typing.Protocol` for structural typing when appropriate
- Use `typing.Final` for constants
- Apply `@overload` for functions with multiple valid signatures
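
A minimal sketch of these conventions in one place (the names are illustrative, not taken from this codebase):

```python
from typing import Final, Protocol

DEFAULT_BATCH_SIZE: Final[int] = 100  # Final marks a module-level constant


class SupportsStore(Protocol):
    """Structural type: anything with a matching store() method satisfies it."""

    def store(self, documents: list[dict[str, str]]) -> list[str]: ...


def first_id(ids: list[str]) -> str | None:
    """Return the first stored id, or None when nothing was stored."""
    return ids[0] if ids else None
```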

### Linting Tools
- **Primary Linter**: Use Pyright (Pylance) for type linting
- **Code Quality**: Address all refactoring lint suggestions without exception
- **Error Handling**: Never skip linter errors or failed tests unless explicitly instructed otherwise

## Code Architecture & Design

### File Organization
- **Size Limit**: If a file exceeds 700 lines, convert it into a package in its own self-contained directory
  - Example: `foo.py` becomes `foo/` with `__init__.py` and sub-modules
- **Module Creation**: Creating new files and modules is forbidden without valid justification and explicit approval
- **Code Reuse**: Introducing redundancy is strictly forbidden
  - Recursively inspect neighboring/adjacent/tangential files and folders
  - Ensure reuse of existing code instead of creating new implementations

### Implementation Standards
- **No Shortcuts**: Taking shortcuts or circumventing the implementation design is forbidden
- **Complete Implementation**: If task 1 requires subtask C, then subtask C becomes part of the requirements
- **No TODOs**: Do not leave TODO comments; build the functionality instead
- **Model Names**: Do not change language model names (training data may not reflect current resources)

## Code Style Conventions

### Function Design
- **Inline Returns**: Return expressions directly instead of assigning them to a variable that is immediately returned
- **Single Responsibility**: Functions should have one clear purpose
- **Pure Functions**: Prefer pure functions where possible
- **Named Expressions**: Use the walrus operator (`:=`) for complex conditions (addressing `use-named-expression`)

### Control Flow Patterns
- **Simplify Comparisons**: Use truthiness checks instead of length comparisons (`if items:` not `if len(items) > 0`)
- **Boolean Logic**: Simplify boolean comparisons (`if flag:` not `if flag is True`)
- **Conditional Merging**: Merge nested if statements when logical
- **Early Returns**: Use guard clauses and early returns to reduce nesting
- **Loop Optimization**: Hoist invariant statements out of loops when possible
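
As a small illustration of the guard-clause, truthiness, and named-expression rules together (purely illustrative names):

```python
def pick_active(collections: list[dict[str, int]]) -> dict[str, int] | None:
    """Return the largest non-empty collection, or None."""
    if not collections:  # guard clause instead of nesting everything in an else
        return None
    # walrus operator: bind and test the filtered list in one expression
    if non_empty := [c for c in collections if c]:
        return max(non_empty, key=len)
    return None
```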

### Data Structures
- **Collections**: Use appropriate collection types (`set()` for uniqueness, `dict` for mappings)
- **Comprehensions**: Prefer list/dict/set comprehensions over loops when readable
- **Constant Sums**: Use `sum()` for adding sequences of numbers
- **Set Operations**: Convert collections to sets for membership testing when appropriate

### Error Handling
- **Specific Exceptions**: Raise specific errors instead of generic exceptions
- **Custom Exceptions**: Create domain-specific exception classes when needed
- **Exception Chaining**: Use `raise ... from ...` for exception chaining
- **Context Managers**: Use context managers for resource management
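
A short sketch combining these error-handling rules; the exception class and path are made up for illustration:

```python
from pathlib import Path


class ConfigError(Exception):
    """Domain-specific error for configuration problems."""


def load_config(path: Path) -> str:
    """Read a config file, wrapping low-level errors in a domain exception."""
    try:
        with path.open(encoding="utf-8") as handle:  # context manager handles cleanup
            return handle.read()
    except OSError as exc:
        # chain the original cause so the traceback keeps both errors
        raise ConfigError(f"cannot read config at {path}") from exc
```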

## Testing Standards

### Test Structure
- **No Loops**: Never use loops in tests (addressing `no-loop-in-tests`)
- **No Conditionals**: Never use conditional statements in tests (addressing `no-conditionals-in-tests`)
- **Assertion Clarity**: Use specific assertions (`assert x == y` not `assert True`)
- **Test Isolation**: Each test should be independent and atomic
- **Module Imports**: Do not import test modules in production code (addressing `dont-import-test-modules`)

### Test Patterns
- **Arrange-Act-Assert**: Follow the AAA pattern consistently
- **Parameterization**: Use `pytest.mark.parametrize` for multiple test cases
- **Fixtures**: Create reusable fixtures for common test setup
- **Mocking**: Use appropriate mocking for external dependencies
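
Parametrization is how multiple cases are covered without loops; a minimal sketch with a hypothetical helper under test:

```python
import pytest


def normalize_tag(tag: str) -> str:
    """Hypothetical helper, included only to make the test runnable."""
    return tag.strip().lower()


@pytest.mark.parametrize(
    ("raw", "expected"),
    [("  Policy ", "policy"), ("HR", "hr"), ("2022", "2022")],
)
def test_normalize_tag(raw: str, expected: str) -> None:
    # one specific assertion per case, no loops or conditionals in the test body
    assert normalize_tag(raw) == expected
```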

## Documentation Standards

### Docstring Requirements
- **Imperative Style**: Write docstrings imperatively with proper punctuation
- **Completeness**: Document all public functions, classes, and modules
- **Examples**: Include usage examples in docstrings when helpful
- **Type Information**: Docstrings should complement, not duplicate, type annotations

### Code Comments
- **Explain Why**: Comments should explain why, not what
- **Business Logic**: Document complex business rules and algorithms
- **Edge Cases**: Explain handling of edge cases and special conditions

## Development Workflow

### Required Tools Integration
- **Sequential Thinking**: Use the sequential-thinking MCP to evaluate design and technical decisions
- **Context7**: Use the context7 MCP religiously for modern code patterns and libraries
- **Documentation Research**: Leverage the latest documentation for current best practices

### Quality Gates
- **Pre-commit Hooks**: All pre-commit hooks must pass
- **Type Checking**: Code must pass type checking without warnings
- **Test Coverage**: Maintain appropriate test coverage
- **Linting**: All linting rules must be satisfied

### Refactoring Priorities
Based on common issues identified, prioritize fixing:
1. Test module imports in production code
2. Length comparisons that can be simplified
3. Constant sum calculations
4. Loop and conditional usage in tests
5. Generic assertion statements
6. Opportunities for named expressions
7. Code hoisting opportunities
8. List comprehension improvements

## Code Review Checklist
- [ ] Virtual environment activated
- [ ] All type annotations specific (no `Any`)
- [ ] No `# type: ignore` suppressions
- [ ] Docstrings written imperatively
- [ ] No TODOs remaining
- [ ] Tests contain no loops or conditionals
- [ ] Specific exceptions used
- [ ] File under 700 lines or properly modularized
- [ ] Code reuse verified (no redundancy)
- [ ] All linting rules satisfied
- [ ] Pre-commit hooks passing

@@ -8,6 +8,9 @@
"**/node_modules",
".venv"
],
"pythonPath": "./.venv/bin/python",
"venvPath": ".",
"venv": ".venv",
"reportCallInDefaultInitializer": "none",
"reportUnknownVariableType": "warning",
"reportUnknownMemberType": "warning",
@@ -16,9 +19,11 @@
"reportUnknownParameterType": "warning",
"reportMissingParameterType": "warning",
"reportUnannotatedClassAttribute": "warning",
"reportMissingTypeStubs": "none",
"reportMissingModuleSource": "none",
"reportAny": "warning",
"reportUnusedCallResult": "none",
"reportUnnecessaryIsInstance": "none",
"reportImplicitOverride": "none",
"reportDeprecated": "warning"
}
}

docs/elysia.md (deleted file, 248 lines)

@@ -1,248 +0,0 @@
The removed file was a captured Elysia API server log: Rich-rendered tracebacks from APScheduler jobs plus uvicorn access lines. The same `check_timeouts` traceback recurred at [10:08:31], [10:51:32], and [10:52:01]; it is reproduced once below, followed by the other distinct events.

[10:08:31] ERROR  Job "check_timeouts (trigger: interval[0:00:29], next run at: 2025-09-15 10:09:00 EDT)" raised an exception  (base.py:195)
  /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/apscheduler/executors/base.py:181 in run_coroutine_job
      retval = await job.func(*job.args, **job.kwargs)
  /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/app.py:35 in check_timeouts
      await user_manager.check_all_trees_timeout()
  /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/services/user.py:223 in check_all_trees_timeout
      self.users[user_id]["tree_manager"].check_all_trees_timeout()
  KeyError: 'tree_manager'

APScheduler missed-run warnings (base.py:176):
[10:26:25] check_restart_clients missed by 0:00:23.029499; check_timeouts missed by 0:00:01.030848; output_resources missed by 0:11:04.063842
[10:41:41] check_restart_clients missed by 0:00:09.036380; check_timeouts missed by 0:00:18.037363; output_resources missed by 0:07:57.071763
[10:51:25] check_timeouts missed by 0:00:21.808772; check_restart_clients missed by 0:00:03.810823

[10:51:43] ERROR  Unexpected error: 'client_manager'  (error_handlers.py:32)
INFO: 127.0.0.1:50043 - "GET /feedback/metadata/b6c0f65db8197395b453a7777a5e4c44 HTTP/1.1" 500 Internal Server Error
ERROR: Exception in ASGI application (uvicorn/FastAPI/Starlette middleware and routing frames, ending in):
  File "/Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/routes/feedback.py", line 81, in run_feedback_metadata
    client_manager: ClientManager = user["client_manager"]
KeyError: 'client_manager'

ERROR  HTTP error occurred: Not Found  (error_handlers.py:14)
INFO: 127.0.0.1:50045 - "GET /icon.svg?d6c34577c7161f78 HTTP/1.1" 404 Not Found
INFO: 127.0.0.1:50045 - "GET /user/config/models HTTP/1.1" 200 OK
INFO: 127.0.0.1:50054 - "GET /user/config/models HTTP/1.1" 200 OK

[10:52:07] ERROR  Job "output_resources (trigger: interval[0:18:23], next run at: 2025-09-15 11:10:30 EDT)" raised an exception  (base.py:195)
  /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/apscheduler/executors/base.py:181 in run_coroutine_job
      retval = await job.func(*job.args, **job.kwargs)
  /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/app.py:40 in output_resources
      await print_resources(user_manager, save_to_file=True)
  /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/utils/resources.py:59 in print_resources
      avg_user_memory, avg_tree_memory = await get_average_user_memory(us…
  /Users/trav/dev/elysia/.venv/lib/python3.12/site-packages/elysia/api/utils/resources.py:37 in get_average_user_memory
      for tree in user["tree_manager"].trees.values():
  KeyError: 'tree_manager'

docs/feeds.md (new file, 106 lines)

@@ -0,0 +1,106 @@
# TUI Feeds

This guide explains how the terminal dashboard surfaces collection activity and status signals so new backends can plug in without duplicating UI logic.

***

## Activity Feed

- **Primary surface:** `#activity_feed` widget inside `DashboardScreen` (`ingest_pipeline/cli/tui/screens/dashboard.py`).
- **Data source:** `self.collections`, populated by `refresh_collections()` after gathering payloads from Weaviate and OpenWebUI via `describe_collections()`.
- **Selection logic:** `_generate_activity_text()` formats the three most recent `CollectionInfo` entries and appends an aggregate line when additional collections exist.
- **Empty state:** Presents the call-to-action _"🚀 No collections found…"_ encouraging the user to launch an ingestion run.
- **Icons:** `_get_content_type_icon()` maps collection names containing `web`, `doc`, or `repo` to 🌐/📖/📦 respectively, and falls back to 📄. Update this helper when introducing new naming conventions.

### When it refreshes

1. `refresh_collections()` loads data for each connected backend and caches it in `self.collections`.
2. `_update_activity_feed()` is triggered from `update_metrics()` immediately after the metrics cards recompute.
3. The Static widget updates with a newline-delimited summary, keeping the dashboard reactive without rerendering the entire layout.

To surface a new backend, extend either `list_weaviate_collections()` or `list_openwebui_collections()` with the additional source (or introduce a new list helper) and ensure the resulting dictionaries match the `CollectionInfo` contract.

***

## Status Ticker

- **Widget:** `#status_text` Static component under the metrics card cluster.
- **Lifecycle:** `refresh_collections()` pushes human-readable messages as each backend initializes, succeeds, or fails, ending with a ready state.
- **Problem reporting:** Failures bubble into rich notifications via `self.notify` and remain visible in the ticker until the next refresh attempt.
- **System health badge:** `_update_status_card()` converts backend counts into 🟢/🟡/🔴 badges so operators can judge connectivity at a glance.

When adding a backend integration, hook into the progress text updates inside `refresh_collections()` so the ticker narrates each stage consistently.

***

## Notifications & Progress

- **Toast notifications:** All feed-relevant exceptions use `self.notify` with severity hints, keeping the activity feed focused on successful runs.
- **Ingestion progress:** `IngestionScreen.perform_ingestion()` drives the animated progress bar and sends celebratory/failure messages that complement the dashboard feed once control returns to the main screen.

***

## Extending the Feed System

1. Return a fully populated `CollectionInfo` (name, type, backend label, status, last_updated, size_mb, count); see the sketch after this list.
2. Call `update_metrics()` after mutating `self.collections` so both metrics cards and the activity feed stay in sync.
3. Adjust `_get_content_type_icon()` or `_format_collection_item()` if the new source warrants distinct labeling.
4. Update end-to-end tests or manual runbooks to verify the ticker, notifications, and activity feed stay coherent after integration.
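
The canonical `CollectionInfo` contract lives in the codebase; as a rough sketch of the fields listed in step 1 (field names and types here are assumptions, not the real definition):

```python
from dataclasses import dataclass
from datetime import datetime


@dataclass
class CollectionInfo:  # illustrative only; match the real contract in the codebase
    name: str
    type: str            # e.g. "web", "doc", "repo"
    backend: str         # backend label shown in the feed
    status: str
    last_updated: datetime
    size_mb: float
    count: int
```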

***

## Implementation Status (September 17, 2025)

| Component | Responsibility | Location |
| --- | --- | --- |
| Activity feed rendering | `_update_activity_feed`, `_generate_activity_text`, `_format_collection_item` | `ingest_pipeline/cli/tui/screens/dashboard.py` |
| Backend loaders | `list_weaviate_collections`, `list_openwebui_collections` | `ingest_pipeline/cli/tui/screens/dashboard.py` |
| Status ticker & health badge | `_update_status_card`, `refresh_collections` progress updates | `ingest_pipeline/cli/tui/screens/dashboard.py` |
| Ingestion progress hand-off | `perform_ingestion` success/error notifications | `ingest_pipeline/cli/tui/screens/ingestion.py` |

***

## Multi-Storage Ingestion Refactor Plan

### 0. Guardrails and Baseline
- Activate the virtual environment (`source .venv/bin/activate`) before running any tooling.
- Capture the current lint, type, and test status (`uv run basedpyright`, `uv run ruff check`, `uv run pytest`) to compare after the refactor.
- Record the existing ingestion modal behaviour (screenshots or a short `textual run --dev ingest_pipeline/cli/tui` demo) to verify UX parity later.

### 1. Storage Layer Enhancements
- Graduate `MultiStorageAdapter` into `ingest_pipeline/storage/` so it can be reused outside the TUI package.
- Extend `BaseStorage` with a descriptive `display_name` property that downstream UIs can show without hard-coding labels.
- Harden the adapter: aggregate per-backend failures, short-circuit `close()` safely, and surface a structured result containing `success_ids` and `failed_targets` (sketched below).
- Add `StorageManager.build_multi_adapter(backends: Sequence[StorageBackend])` that returns an initialised adapter (invokes `initialize()` on each child) and memoises singletons for reuse inside the session.
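
A rough sketch of the hardened fan-out described above; the protocol and result type are placeholders, not the existing `BaseStorage` or adapter API:

```python
from dataclasses import dataclass, field
from typing import Protocol


class _Backend(Protocol):  # placeholder protocol; the real base class is BaseStorage
    async def store(self, documents: list[dict[str, str]]) -> list[str]: ...
    async def close(self) -> None: ...


@dataclass
class FanOutResult:  # placeholder for the structured result described above
    success_ids: list[str] = field(default_factory=list)
    failed_targets: dict[str, str] = field(default_factory=dict)  # backend name -> error


class MultiStorageAdapter:
    """Sketch: fan a store() call out to several backends and aggregate failures."""

    def __init__(self, backends: dict[str, _Backend]) -> None:
        self._backends = backends

    async def store(self, documents: list[dict[str, str]]) -> FanOutResult:
        result = FanOutResult()
        for name, backend in self._backends.items():
            try:
                result.success_ids.extend(await backend.store(documents))
            except Exception as exc:  # aggregate per-backend failures instead of aborting
                result.failed_targets[name] = str(exc)
        return result

    async def close(self) -> None:
        for backend in self._backends.values():
            try:
                await backend.close()  # one failing close() must not block the rest
            except Exception:
                continue
```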

### 2. Application Wiring
- Refactor `CollectionManagementApp` to accept a `StorageManager` plus optional cached clients, removing direct constructor parameters for Weaviate/OpenWebUI.
- Update all screens (`dashboard.py`, `documents.py`, `search.py`, dialogs) to pull storages through the shared manager instead of owning bespoke references.
- Expose a capability flag (e.g., `StorageCapabilities.REPLICATION`) so the dashboard can badge backends that support multi-target ingestion.

### 3. Ingestion Modal UX
- Replace the single-backend select with a checkbox group generated from `StorageManager.get_available_backends()`; preserve keyboard shortcuts (`1`, `2`, `3`, plus `ctrl+shift+<n>` for toggling if feasible).
- Default the selection to the collection's current backend but allow "Select All"/"Clear" convenience buttons.
- Persist the latest selection inside a lightweight config file (for example `~/.config/rag-manager/tui.json`) to improve repeated runs.

### 4. Flow Integration
- Update `IngestionScreen.perform_ingestion()` to build the multi-adapter, pass it to `ingest_documents_task`, and capture per-backend success/failure counts for feed reporting.
- Teach `ingest_pipeline/flows/ingestion.py` helpers to recognise the adapter (inspect for `fanout_targets`) and log progress per backend, while keeping the Firecrawl→R2R flow single-target until replication lands there.
- Ensure partial failures propagate as `IngestionStatus.PARTIAL` with an error message enumerating the failing targets.

### 5. Feeds, Ticker, and Notifications
- Extend `_generate_activity_text()` to append the backend list (e.g., `→ weaviate + open_webui`) when a multi-target run finishes.
- Add per-backend status lines to the progress ticker so operators know which replication stage is executing.
- Emit granular toast notifications: a success summary plus warning toasts for any backend that failed to store documents.

### 6. Validation
- Add unit coverage for `MultiStorageAdapter` (full success, partial failure, close semantics) under `ingest_pipeline/tests/storage/`.
- Create a focused TUI smoke test that opens the ingestion modal, toggles multiple checkboxes, and asserts the resulting progress copy.
- Re-run `uv run basedpyright`, `uv run ruff check`, and the targeted pytest suite before and after changes; address new diagnostics immediately.
- Optionally script a headless `textual run` that simulates ingestion across two mock storages to guard against regressions.

### 7. Documentation and Rollout
- Update this document and `README.md` with refreshed screenshots/GIFs demonstrating multi-backend ingestion.
- Draft release notes covering required configuration (API keys for every backend) and outline rollback instructions (git tag + revert steps).
- Brief support/playbook owners on interpreting the enriched feed/ticker signals so incidents can be triaged quickly.

docs/prefect.md (new file, 126 lines)

@@ -0,0 +1,126 @@
To break your described workflow down into Prefect tasks, blocks, and flows: flows should manage the orchestration of scraping and ingestion, while tasks encapsulate each atomic operation (e.g., fetching the sitemap, checking the database, scraping, inserting records). Prefect blocks can securely hold configuration and secrets for external services and database storage. Asynchronous monitoring can be achieved with subflows/concurrent tasks plus state handlers or notification integrations.

***

### Workflow Breakdown into Prefect Concepts

#### Flows
- **Main Flow:** Orchestrates the entire end-to-end job, handling each site sequentially or in parallel.
- **Scrape Subflow (optional):** Manages the crawling/scraping of a single site, which can be spawned concurrently.

#### Tasks
Each of the workflow steps becomes an individual Prefect task:

1. **Crawl Sitemap Task:**
   - Calls Firecrawl's `/map` API to retrieve all URLs and pages for a given site.

2. **Check Existing Data Task:**
   - Checks your destination storage (database, vectorstore, etc.) for the URL's existence and retrieves the last scraped date.

3. **Metadata Extraction Task:**
   - For eligible pages (not scraped or updated in the last 30 days), iterates through the sitemap, extracting structured metadata and clerical attributes.

4. **Queue for Scraping Task:**
   - Sends pages or batches to Firecrawl's scrape or batch-scrape API for processing.

5. **Insert to Storage Task:**
   - Inserts the returned scraped content and metadata into the designated storage backend.

#### Blocks
- **API Credentials Block:** securely stores Firecrawl API keys and connection params.
- **Database Connection Block:** stores credentials/URI/config for your storage destination.
- **Notification/State Blocks (optional):** Slack/email notifications or custom state reporting.
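
For instance, credentials can be registered once as a block and loaded inside tasks instead of hard-coding keys; a minimal sketch using Prefect's built-in `Secret` block (the block name `firecrawl-api-key` is just an example):

```python
from prefect.blocks.system import Secret

# One-time registration (e.g. from a setup script or the Prefect UI):
Secret(value="fc-your-api-key").save(name="firecrawl-api-key", overwrite=True)

# Inside a task or flow, load the block instead of reading a hard-coded value:
firecrawl_api_key = Secret.load("firecrawl-api-key").get()
```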

#### Asynchronous Monitoring
- Run page scraping and ingestion as concurrent Prefect tasks (using `task_runner=ConcurrentTaskRunner()` or subflows).
- Register custom state handlers on tasks or use Prefect's built-in notification integrations to asynchronously update job/page status.
- Flow runs and task runs expose state logs and real-time UI feedback for user-facing transparency.

***

### Example High-Level Prefect Blueprint

```python
from prefect import flow, task, get_run_logger
from prefect.task_runners import ConcurrentTaskRunner


@task
def crawl_sitemap(site_url): ...

@task
def check_url_in_db(url): ...

@task
def extract_metadata(page): ...

@task
def queue_scrape(payload): ...

@task
def insert_scraped_data(data): ...


@flow(task_runner=ConcurrentTaskRunner())
def process_site(site_url):
    sitemap = crawl_sitemap(site_url)
    for url in sitemap:
        if not check_url_in_db(url):
            meta = extract_metadata(url)
            scraped = queue_scrape(meta)
            insert_scraped_data(scraped)
            # monitoring could happen here per-page


@flow
def main_flow(sites: list):
    for site in sites:
        process_site(site)
        # could monitor process_site subflow statuses for UI
```

***

### Key Points

- Each atomic action becomes a task.
- Use concurrent task runners or async subflows for parallelism and monitoring.
- Use blocks for credentials and external config.
- Store scrape statuses in the destination storage or use Prefect notifications for detailed monitoring.

***

### Reference
These patterns are standard in modern Prefect 2.x orchestration for web scraping and ETL.

***

| Step | Prefect Concept | Comments |
|------------------------------------|------------------------|-----------------------------------------------|
| Map site with Firecrawl | Task | API call; parallelize by site |
| Check/update status in storage | Task | DB lookup; can batch or async |
| Metadata extraction | Task | Loop per-page or batch |
| Trigger Firecrawl scrape | Task | Async tasks/subflows for parallel scraping |
| Insert pages/metadata to storage | Task | Bulk insert recommended |
| Monitor ingestion & scrape status | State, notification | Prefect UI + optional state handler/alerts |

***

### Implementation Status (September 17, 2025)

| Workflow Stage | Prefect Artifact | Location |
| --- | --- | --- |
| Map site with Firecrawl | `map_firecrawl_site_task` | `ingest_pipeline/flows/ingestion.py` |
| Filter existing R2R entries | `filter_existing_documents_task` | `ingest_pipeline/flows/ingestion.py` |
| Queue Firecrawl scrape batches | `scrape_firecrawl_batch_task` | `ingest_pipeline/flows/ingestion.py` |
| Annotate metadata | `annotate_firecrawl_metadata_task` | `ingest_pipeline/flows/ingestion.py` |
| Upsert annotated documents | `upsert_r2r_documents_task` | `ingest_pipeline/flows/ingestion.py` |
| Specialized orchestration | `firecrawl_to_r2r_flow` subflow | `ingest_pipeline/flows/ingestion.py` |

Supporting implementations:

- Deterministic Firecrawl document IDs via `FirecrawlIngestor.compute_document_id`
- Metadata enrichment with `MetadataTagger` while preserving required fields
- R2R upsert logic in `ingest_pipeline/storage/r2r/storage.py` with description-aware metadata payloads

***

For a real implementation, Prefect code can be tailored to your stack and to how you want to batch/parallelize jobs. This general layout is robust for scalable, observable, and manageable workflows.

docs/tagging.md (deleted file, 108 lines)

@@ -1,108 +0,0 @@
Here are clear written examples of **metadata tagging** in both Open WebUI and Weaviate, showing how you can associate tags and structured metadata with knowledge objects for RAG and semantic search.

***

### Example: Metadata Tagging in Open WebUI

You send a document to the Open WebUI API endpoint, attaching metadata and tags in the content field as a JSON string:

```json
POST http://localhost/api/v1/documents/create
Content-Type: application/json

{
  "name": "policy_doc_2022",
  "title": "2022 Policy Handbook",
  "collection_name": "company_handbooks",
  "filename": "policy_2022.pdf",
  "content": "{\"tags\": [\"policy\", \"2022\", \"hr\"], \"source_url\": \"https://example.com/policy_2022.pdf\", \"author\": \"Jane Doe\"}"
}
```
- The `"tags"` field is a list of labels for classification (policy, 2022, hr).
- The `"source_url"` and `"author"` fields provide additional metadata useful for retrieval, audit, and filtering.[1][2]

For pipeline-based ingestion, you might design a function to extract and append metadata before vectorization:

```python
metadata = {
    "tags": ["policy", "2022"],
    "source_url": document_url,
    "author": document_author
}
embed_with_metadata(chunk, metadata)
```
This metadata becomes part of your retrieval context in RAG workflows.[1]

***

### Example: Metadata Tagging in Weaviate

In Weaviate, metadata and tags are defined directly in the schema and attached to each object when added:

**Schema definition:**

```json
{
  "class": "Document",
  "properties": [
    {"name": "title", "dataType": ["text"]},
    {"name": "tags", "dataType": ["text[]"]},
    {"name": "source_url", "dataType": ["text"]},
    {"name": "author", "dataType": ["text"]}
  ]
}
```

**Object creation example:**

```python
client.data_object.create(
    data_object={
        "title": "2022 Policy Handbook",
        "tags": ["policy", "2022", "hr"],
        "source_url": "https://example.com/policy_2022.pdf",
        "author": "Jane Doe"
    },
    class_name="Document"
)
```
- The `"tags"` field is a text array, ideal for semantic filtering and faceting.
- Other fields store provenance metadata, supporting advanced queries and data governance.[3][4][5]

**Query with metadata filtering:**

```python
result = (
    client.query
    .get("Document", ["title", "tags", "author"])
    .with_filter({"path": ["tags"], "operator": "ContainsAny", "value": ["policy", "hr"]})
    .do()
)
```
This retrieves documents classified with either "policy" or "hr" tags.[4][3]

***

Both platforms support **metadata tagging** for documents, which enables powerful RAG scenarios, detailed filtering, and context-rich retrievals.[5][2][3][4][1]

[1](https://www.reddit.com/r/OpenWebUI/comments/1hmmg9a/how_to_handle_metadata_during_vectorization/)
[2](https://github.com/open-webui/open-webui/discussions/4692)
[3](https://stackoverflow.com/questions/75006703/query-large-list-of-metadate-in-weaviate)
[4](https://weaviate.io/blog/enterprise-workflow-langchain-weaviate)
[5](https://docs.weaviate.io/academy/py/zero_to_mvp/schema_and_imports/schema)
[6](https://docs.weaviate.io/weaviate/api/graphql/additional-properties)
[7](https://weaviate.io/blog/sycamore-and-weaviate)
[8](https://docs.llamaindex.ai/en/stable/examples/vector_stores/WeaviateIndex_auto_retriever/)
[9](https://forum.weaviate.io/t/recommendations-for-metadata-or-knowledge-graphs/960)
[10](https://weaviate.io/blog/agent-workflow-automation-n8n-weaviate)
[11](https://github.com/open-webui/open-webui/discussions/9804)
[12](https://docs.quarkiverse.io/quarkus-langchain4j/dev/rag-weaviate.html)
[13](https://github.com/weaviate/weaviate-examples)
[14](https://docs.openwebui.com/getting-started/api-endpoints/)
[15](https://weaviate.io/blog/hybrid-search-for-web-developers)
[16](https://dev.to/stephenc222/how-to-use-weaviate-to-store-and-query-vector-embeddings-4b9b)
[17](https://helpdesk.egnyte.com/hc/en-us/articles/360035813612-Using-Metadata-in-the-WebUI)
[18](https://docs.datadoghq.com/integrations/weaviate/)
[19](https://docs.openwebui.com/features/)
[20](https://documentation.suse.com/suse-ai/1.0/html/openwebui-configuring/index.html)
[21](https://docs.openwebui.com/getting-started/env-configuration/)
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,7 +1,7 @@
|
||||
"""CLI interface for ingestion pipeline."""
|
||||
|
||||
import asyncio
|
||||
from enum import Enum
|
||||
from typing import Annotated
|
||||
|
||||
import typer
|
||||
from rich.console import Console
|
||||
@@ -9,27 +9,11 @@ from rich.panel import Panel
|
||||
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn
|
||||
from rich.table import Table
|
||||
|
||||
from ..config import get_settings
|
||||
from ..core.models import IngestionResult
|
||||
from ..config import configure_prefect, get_settings
|
||||
from ..core.models import IngestionResult, IngestionSource, StorageBackend
|
||||
from ..flows.ingestion import create_ingestion_flow
|
||||
from ..flows.scheduler import create_scheduled_deployment, serve_deployments
|
||||
|
||||
|
||||
class SourceType(str, Enum):
|
||||
"""Source types for ingestion."""
|
||||
|
||||
web = "web"
|
||||
repository = "repository"
|
||||
documentation = "documentation"
|
||||
|
||||
|
||||
class StorageBackend(str, Enum):
|
||||
"""Storage backend options."""
|
||||
|
||||
weaviate = "weaviate"
|
||||
open_webui = "open_webui"
|
||||
|
||||
|
||||
app = typer.Typer(
|
||||
name="ingest",
|
||||
help="🚀 Modern Document Ingestion Pipeline - Advanced web and repository processing",
|
||||
@@ -41,7 +25,9 @@ console = Console()
|
||||
|
||||
@app.callback()
|
||||
def main(
|
||||
version: bool = typer.Option(False, "--version", "-v", help="Show version information"),
|
||||
version: Annotated[
|
||||
bool, typer.Option("--version", "-v", help="Show version information")
|
||||
] = False,
|
||||
) -> None:
|
||||
"""
|
||||
🚀 Modern Document Ingestion Pipeline
|
||||
@@ -51,18 +37,23 @@ def main(
|
||||
Features:
|
||||
• 🌐 Web scraping and crawling with Firecrawl
|
||||
• 📦 Repository ingestion with Repomix
|
||||
• 🗄️ Multiple storage backends (Weaviate, OpenWebUI)
|
||||
• 🗄️ Multiple storage backends (Weaviate, OpenWebUI, R2R)
|
||||
• 📊 Modern TUI for collection management
|
||||
• ⚡ Async processing with Prefect orchestration
|
||||
• 🎨 Rich CLI with enhanced visuals
|
||||
"""
|
||||
settings = get_settings()
|
||||
configure_prefect(settings)
|
||||
|
||||
if version:
|
||||
console.print(
|
||||
Panel(
|
||||
"[bold magenta]Ingest Pipeline v0.1.0[/bold magenta]\n"
|
||||
"[dim]Modern Document Ingestion & Management System[/dim]",
|
||||
(
|
||||
"[bold magenta]Ingest Pipeline v0.1.0[/bold magenta]\n"
|
||||
"[dim]Modern Document Ingestion & Management System[/dim]"
|
||||
),
|
||||
title="🚀 Version Info",
|
||||
border_style="magenta"
|
||||
border_style="magenta",
|
||||
)
|
||||
)
|
||||
raise typer.Exit()
|
||||
@@ -70,17 +61,22 @@ def main(
|
||||
|
||||
@app.command()
|
||||
def ingest(
|
||||
source_url: str = typer.Argument(..., help="URL or path to ingest from"),
|
||||
source_type: SourceType = typer.Option(SourceType.web, "--type", "-t", help="Type of source"),
|
||||
storage: StorageBackend = typer.Option(
|
||||
StorageBackend.weaviate, "--storage", "-s", help="Storage backend"
|
||||
),
|
||||
collection: str = typer.Option(
|
||||
None, "--collection", "-c", help="Target collection name (auto-generated if not specified)"
|
||||
),
|
||||
validate: bool = typer.Option(
|
||||
True, "--validate/--no-validate", help="Validate source before ingesting"
|
||||
),
|
||||
source_url: Annotated[str, typer.Argument(help="URL or path to ingest from")],
|
||||
source_type: Annotated[
|
||||
IngestionSource, typer.Option("--type", "-t", help="Type of source")
|
||||
] = IngestionSource.WEB,
|
||||
storage: Annotated[
|
||||
StorageBackend, typer.Option("--storage", "-s", help="Storage backend")
|
||||
] = StorageBackend.WEAVIATE,
|
||||
collection: Annotated[
|
||||
str | None,
|
||||
typer.Option(
|
||||
"--collection", "-c", help="Target collection name (auto-generated if not specified)"
|
||||
),
|
||||
] = None,
|
||||
validate: Annotated[
|
||||
bool, typer.Option("--validate/--no-validate", help="Validate source before ingesting")
|
||||
] = True,
|
||||
) -> None:
|
||||
"""
|
||||
🚀 Run a one-time ingestion job with enhanced progress tracking.
|
||||
@@ -91,13 +87,15 @@ def ingest(
|
||||
# Enhanced startup message
|
||||
console.print(
|
||||
Panel(
|
||||
f"[bold cyan]🚀 Starting Modern Ingestion[/bold cyan]\n\n"
|
||||
f"[yellow]Source:[/yellow] {source_url}\n"
|
||||
f"[yellow]Type:[/yellow] {source_type.value.title()}\n"
|
||||
f"[yellow]Storage:[/yellow] {storage.value.replace('_', ' ').title()}\n"
|
||||
f"[yellow]Collection:[/yellow] {collection or '[dim]Auto-generated[/dim]'}",
|
||||
(
|
||||
f"[bold cyan]🚀 Starting Modern Ingestion[/bold cyan]\n\n"
|
||||
f"[yellow]Source:[/yellow] {source_url}\n"
|
||||
f"[yellow]Type:[/yellow] {source_type.value.title()}\n"
|
||||
f"[yellow]Storage:[/yellow] {storage.value.replace('_', ' ').title()}\n"
|
||||
f"[yellow]Collection:[/yellow] {collection or '[dim]Auto-generated[/dim]'}"
|
||||
),
|
||||
title="🔥 Ingestion Configuration",
|
||||
border_style="cyan"
|
||||
border_style="cyan",
|
||||
)
|
||||
)
|
||||
|
||||
@@ -118,8 +116,8 @@ def ingest(
|
||||
progress.update(task, advance=30, description="📄 Fetching documents...")
|
||||
result = await run_ingestion(
|
||||
url=source_url,
|
||||
source_type=source_type.value,
|
||||
storage_backend=storage.value,
|
||||
source_type=source_type,
|
||||
storage_backend=storage,
|
||||
collection_name=collection,
|
||||
validate_first=validate,
|
||||
)
|
||||
@@ -127,7 +125,22 @@ def ingest(
|
||||
progress.update(task, advance=50, description="✅ Ingestion complete!")
|
||||
return result
|
||||
|
||||
result = asyncio.run(run_with_progress())
|
||||
# Use asyncio.run() with proper event loop handling
|
||||
try:
|
||||
result = asyncio.run(run_with_progress())
|
||||
except RuntimeError as e:
|
||||
if "asyncio.run() cannot be called from a running event loop" in str(e):
|
||||
# If we're already in an event loop (e.g., in Jupyter), use nest_asyncio
|
||||
try:
|
||||
import nest_asyncio
|
||||
nest_asyncio.apply()
|
||||
result = asyncio.run(run_with_progress())
|
||||
except ImportError:
|
||||
# Fallback: get the current loop and run the coroutine
|
||||
loop = asyncio.get_event_loop()
|
||||
result = loop.run_until_complete(run_with_progress())
|
||||
else:
|
||||
raise
|
||||
|
||||
# Enhanced results display
|
||||
status_color = "green" if result.status.value == "completed" else "red"
|
||||
@@ -137,7 +150,7 @@ def ingest(
|
||||
title="📊 Ingestion Results",
|
||||
title_style="bold magenta",
|
||||
border_style="cyan",
|
||||
header_style="bold blue"
|
||||
header_style="bold blue",
|
||||
)
|
||||
table.add_column("📋 Metric", style="cyan", no_wrap=True)
|
||||
table.add_column("📈 Value", style=status_color, justify="right")
|
||||
@@ -162,34 +175,44 @@ def ingest(
|
||||
if result.status.value == "completed" and result.documents_processed > 0:
|
||||
console.print(
|
||||
Panel(
|
||||
f"🎉 [bold green]Success![/bold green] {result.documents_processed} documents ingested\n\n"
|
||||
f"💡 [dim]Try '[bold cyan]ingest modern[/bold cyan]' to explore your collections![/dim]",
|
||||
(
|
||||
f"🎉 [bold green]Success![/bold green] {result.documents_processed} documents ingested\n\n"
|
||||
f"💡 [dim]Try '[bold cyan]ingest modern[/bold cyan]' to explore your collections![/dim]"
|
||||
),
|
||||
title="✨ Ingestion Complete",
|
||||
border_style="green"
|
||||
border_style="green",
|
||||
)
|
||||
)
|
||||
elif result.error_messages:
|
||||
console.print(
|
||||
Panel(
|
||||
"❌ [bold red]Ingestion encountered errors[/bold red]\n\n"
|
||||
"💡 [dim]Check your configuration and try again[/dim]",
|
||||
(
|
||||
"❌ [bold red]Ingestion encountered errors[/bold red]\n\n"
|
||||
"💡 [dim]Check your configuration and try again[/dim]"
|
||||
),
|
||||
title="⚠️ Issues Detected",
|
||||
border_style="red"
|
||||
border_style="red",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@app.command()
|
||||
def schedule(
|
||||
name: str = typer.Argument(..., help="Deployment name"),
|
||||
source_url: str = typer.Argument(..., help="URL or path to ingest from"),
|
||||
source_type: SourceType = typer.Option(SourceType.web, "--type", "-t", help="Type of source"),
|
||||
storage: StorageBackend = typer.Option(
|
||||
StorageBackend.weaviate, "--storage", "-s", help="Storage backend"
|
||||
),
|
||||
cron: str | None = typer.Option(None, "--cron", "-c", help="Cron expression for scheduling"),
|
||||
interval: int = typer.Option(60, "--interval", "-i", help="Interval in minutes"),
|
||||
serve_now: bool = typer.Option(False, "--serve/--no-serve", help="Start serving immediately"),
|
||||
name: Annotated[str, typer.Argument(help="Deployment name")],
|
||||
source_url: Annotated[str, typer.Argument(help="URL or path to ingest from")],
|
||||
source_type: Annotated[
|
||||
IngestionSource, typer.Option("--type", "-t", help="Type of source")
|
||||
] = IngestionSource.WEB,
|
||||
storage: Annotated[
|
||||
StorageBackend, typer.Option("--storage", "-s", help="Storage backend")
|
||||
] = StorageBackend.WEAVIATE,
|
||||
cron: Annotated[
|
||||
str | None, typer.Option("--cron", "-c", help="Cron expression for scheduling")
|
||||
] = None,
|
||||
interval: Annotated[int, typer.Option("--interval", "-i", help="Interval in minutes")] = 60,
|
||||
serve_now: Annotated[
|
||||
bool, typer.Option("--serve/--no-serve", help="Start serving immediately")
|
||||
] = False,
|
||||
) -> None:
|
||||
"""
|
||||
Create a scheduled deployment for recurring ingestion.
|
||||
@@ -199,8 +222,8 @@ def schedule(
|
||||
deployment = create_scheduled_deployment(
|
||||
name=name,
|
||||
source_url=source_url,
|
||||
source_type=source_type.value,
|
||||
storage_backend=storage.value,
|
||||
source_type=source_type,
|
||||
storage_backend=storage,
|
||||
schedule_type="cron" if cron else "interval",
|
||||
cron_expression=cron,
|
||||
interval_minutes=interval,
|
||||
@@ -215,12 +238,12 @@ def schedule(
|
||||
|
||||
@app.command()
|
||||
def serve(
|
||||
config_file: str | None = typer.Option(
|
||||
None, "--config", "-c", help="Path to deployments config file"
|
||||
),
|
||||
ui: str | None = typer.Option(
|
||||
None, "--ui", help="Launch user interface (options: tui, web)"
|
||||
),
|
||||
config_file: Annotated[
|
||||
str | None, typer.Option("--config", "-c", help="Path to deployments config file")
|
||||
] = None,
|
||||
ui: Annotated[
|
||||
str | None, typer.Option("--ui", help="Launch user interface (options: tui, web)")
|
||||
] = None,
|
||||
) -> None:
|
||||
"""
|
||||
🚀 Serve configured deployments with optional UI interface.
|
||||
@@ -232,18 +255,21 @@ def serve(
|
||||
if ui == "tui":
|
||||
console.print(
|
||||
Panel(
|
||||
"[bold cyan]🚀 Launching Enhanced TUI[/bold cyan]\n\n"
|
||||
"[yellow]Features:[/yellow]\n"
|
||||
"• 📊 Interactive collection management\n"
|
||||
"• ⌨️ Enhanced keyboard navigation\n"
|
||||
"• 🎨 Modern design with focus indicators\n"
|
||||
"• 📄 Document browsing and search\n"
|
||||
"• 🔄 Real-time status updates",
|
||||
(
|
||||
"[bold cyan]🚀 Launching Enhanced TUI[/bold cyan]\n\n"
|
||||
"[yellow]Features:[/yellow]\n"
|
||||
"• 📊 Interactive collection management\n"
|
||||
"• ⌨️ Enhanced keyboard navigation\n"
|
||||
"• 🎨 Modern design with focus indicators\n"
|
||||
"• 📄 Document browsing and search\n"
|
||||
"• 🔄 Real-time status updates"
|
||||
),
|
||||
title="🎉 TUI Mode",
|
||||
border_style="cyan"
|
||||
border_style="cyan",
|
||||
)
|
||||
)
|
||||
from .tui import dashboard
|
||||
|
||||
dashboard()
|
||||
return
|
||||
elif ui == "web":
|
||||
@@ -296,18 +322,21 @@ def tui() -> None:
|
||||
"""
|
||||
console.print(
|
||||
Panel(
|
||||
"[bold cyan]🚀 Launching Enhanced TUI[/bold cyan]\n\n"
|
||||
"[yellow]Features:[/yellow]\n"
|
||||
"• 📊 Interactive collection management\n"
|
||||
"• ⌨️ Enhanced keyboard navigation\n"
|
||||
"• 🎨 Modern design with focus indicators\n"
|
||||
"• 📄 Document browsing and search\n"
|
||||
"• 🔄 Real-time status updates",
|
||||
(
|
||||
"[bold cyan]🚀 Launching Enhanced TUI[/bold cyan]\n\n"
|
||||
"[yellow]Features:[/yellow]\n"
|
||||
"• 📊 Interactive collection management\n"
|
||||
"• ⌨️ Enhanced keyboard navigation\n"
|
||||
"• 🎨 Modern design with focus indicators\n"
|
||||
"• 📄 Document browsing and search\n"
|
||||
"• 🔄 Real-time status updates"
|
||||
),
|
||||
title="🎉 TUI Mode",
|
||||
border_style="cyan"
|
||||
border_style="cyan",
|
||||
)
|
||||
)
|
||||
from .tui import dashboard
|
||||
|
||||
dashboard()
|
||||
|
||||
|
||||
@@ -323,10 +352,12 @@ def config() -> None:
|
||||
|
||||
console.print(
|
||||
Panel(
|
||||
"[bold cyan]⚙️ System Configuration[/bold cyan]\n"
|
||||
"[dim]Current pipeline settings and endpoints[/dim]",
|
||||
(
|
||||
"[bold cyan]⚙️ System Configuration[/bold cyan]\n"
|
||||
"[dim]Current pipeline settings and endpoints[/dim]"
|
||||
),
|
||||
title="🔧 Configuration",
|
||||
border_style="cyan"
|
||||
border_style="cyan",
|
||||
)
|
||||
)
|
||||
|
||||
@@ -336,7 +367,7 @@ def config() -> None:
|
||||
title_style="bold magenta",
|
||||
border_style="blue",
|
||||
header_style="bold cyan",
|
||||
show_lines=True
|
||||
show_lines=True,
|
||||
)
|
||||
table.add_column("🏷️ Setting", style="cyan", no_wrap=True, width=25)
|
||||
table.add_column("🎯 Value", style="yellow", overflow="fold")
|
||||
@@ -346,59 +377,37 @@ def config() -> None:
|
||||
def get_status_indicator(value: str | None) -> str:
|
||||
return "✅ Set" if value else "❌ Missing"
|
||||
|
||||
table.add_row(
|
||||
"🤖 LLM Endpoint",
|
||||
str(settings.llm_endpoint),
|
||||
"✅ Active"
|
||||
)
|
||||
table.add_row(
|
||||
"🔥 Firecrawl Endpoint",
|
||||
str(settings.firecrawl_endpoint),
|
||||
"✅ Active"
|
||||
)
|
||||
table.add_row("🤖 LLM Endpoint", str(settings.llm_endpoint), "✅ Active")
|
||||
table.add_row("🔥 Firecrawl Endpoint", str(settings.firecrawl_endpoint), "✅ Active")
|
||||
table.add_row(
|
||||
"🗄️ Weaviate Endpoint",
|
||||
str(settings.weaviate_endpoint),
|
||||
get_status_indicator(str(settings.weaviate_api_key) if settings.weaviate_api_key else None)
|
||||
get_status_indicator(str(settings.weaviate_api_key) if settings.weaviate_api_key else None),
|
||||
)
|
||||
table.add_row(
|
||||
"🌐 OpenWebUI Endpoint",
|
||||
str(settings.openwebui_endpoint),
|
||||
get_status_indicator(settings.openwebui_api_key)
|
||||
)
|
||||
table.add_row(
|
||||
"🧠 Embedding Model",
|
||||
settings.embedding_model,
|
||||
"✅ Set"
|
||||
)
|
||||
table.add_row(
|
||||
"💾 Default Storage",
|
||||
settings.default_storage_backend.title(),
|
||||
"✅ Set"
|
||||
)
|
||||
table.add_row(
|
||||
"📦 Default Batch Size",
|
||||
f"{settings.default_batch_size:,}",
|
||||
"✅ Set"
|
||||
)
|
||||
table.add_row(
|
||||
"⚡ Max Concurrent Tasks",
|
||||
f"{settings.max_concurrent_tasks}",
|
||||
"✅ Set"
|
||||
get_status_indicator(settings.openwebui_api_key),
|
||||
)
|
||||
table.add_row("🧠 Embedding Model", settings.embedding_model, "✅ Set")
|
||||
table.add_row("💾 Default Storage", settings.default_storage_backend.title(), "✅ Set")
|
||||
table.add_row("📦 Default Batch Size", f"{settings.default_batch_size:,}", "✅ Set")
|
||||
table.add_row("⚡ Max Concurrent Tasks", f"{settings.max_concurrent_tasks}", "✅ Set")
|
||||
|
||||
console.print(table)
|
||||
|
||||
# Additional helpful information
|
||||
console.print(
|
||||
Panel(
|
||||
"💡 [bold cyan]Quick Tips[/bold cyan]\n\n"
|
||||
"• Use '[bold]ingest list-collections[/bold]' to view all collections\n"
|
||||
"• Use '[bold]ingest search[/bold]' to search content\n"
|
||||
"• Configure API keys in your [yellow].env[/yellow] file\n"
|
||||
"• Default collection names are auto-generated from URLs",
|
||||
(
|
||||
"💡 [bold cyan]Quick Tips[/bold cyan]\n\n"
|
||||
"• Use '[bold]ingest list-collections[/bold]' to view all collections\n"
|
||||
"• Use '[bold]ingest search[/bold]' to search content\n"
|
||||
"• Configure API keys in your [yellow].env[/yellow] file\n"
|
||||
"• Default collection names are auto-generated from URLs"
|
||||
),
|
||||
title="🚀 Usage Tips",
|
||||
border_style="green"
|
||||
border_style="green",
|
||||
)
|
||||
)
|
||||
|
||||
@@ -414,10 +423,14 @@ def list_collections() -> None:
|
||||
|
||||
@app.command()
|
||||
def search(
|
||||
query: str = typer.Argument(..., help="Search query"),
|
||||
collection: str = typer.Option(None, "--collection", "-c", help="Target collection"),
|
||||
backend: StorageBackend = typer.Option(StorageBackend.weaviate, "--backend", "-b", help="Storage backend"),
|
||||
limit: int = typer.Option(10, "--limit", "-l", help="Result limit"),
|
||||
query: Annotated[str, typer.Argument(help="Search query")],
|
||||
collection: Annotated[
|
||||
str | None, typer.Option("--collection", "-c", help="Target collection")
|
||||
] = None,
|
||||
backend: Annotated[
|
||||
StorageBackend, typer.Option("--backend", "-b", help="Storage backend")
|
||||
] = StorageBackend.WEAVIATE,
|
||||
limit: Annotated[int, typer.Option("--limit", "-l", help="Result limit")] = 10,
|
||||
) -> None:
|
||||
"""
|
||||
🔍 Search across collections.
|
||||
@@ -428,10 +441,10 @@ def search(
|
||||
|
||||
async def run_ingestion(
|
||||
url: str,
|
||||
source_type: str,
|
||||
storage_backend: str,
|
||||
source_type: IngestionSource,
|
||||
storage_backend: StorageBackend,
|
||||
collection_name: str | None = None,
|
||||
validate_first: bool = True
|
||||
validate_first: bool = True,
|
||||
) -> IngestionResult:
|
||||
"""
|
||||
Run ingestion with support for targeted collections.
|
||||
@@ -439,25 +452,25 @@ async def run_ingestion(
|
||||
# Auto-generate collection name if not provided
|
||||
if not collection_name:
|
||||
from urllib.parse import urlparse
|
||||
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc.replace(".", "_").replace("-", "_")
|
||||
collection_name = f"{domain}_{source_type}"
|
||||
collection_name = f"{domain}_{source_type.value}"
|
||||
|
||||
result = await create_ingestion_flow(
|
||||
return await create_ingestion_flow(
|
||||
source_url=url,
|
||||
source_type=source_type,
|
||||
storage_backend=storage_backend,
|
||||
collection_name=collection_name,
|
||||
validate_first=validate_first,
|
||||
)
|
||||
return result
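# Example (not part of this commit): the auto-generated collection name above is the
# URL's host with dots and dashes replaced, joined with the source type's enum value.
# The URL below is illustrative only:
from urllib.parse import urlparse

url = "https://docs.example.com/guide"
domain = urlparse(url).netloc.replace(".", "_").replace("-", "_")
collection_name = f"{domain}_documentation"  # -> "docs_example_com_documentation"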
|
||||
|
||||
|
||||
async def run_list_collections() -> None:
|
||||
"""
|
||||
List collections across storage backends.
|
||||
"""
|
||||
from ..config import get_settings
|
||||
from ..config import configure_prefect, get_settings
|
||||
from ..core.models import StorageBackend, StorageConfig
|
||||
from ..storage.openwebui import OpenWebUIStorage
|
||||
from ..storage.weaviate import WeaviateStorage
|
||||
@@ -467,7 +480,7 @@ async def run_list_collections() -> None:
|
||||
console.print("🔍 [bold cyan]Scanning storage backends...[/bold cyan]")
|
||||
|
||||
# Try to connect to Weaviate
|
||||
weaviate_collections = []
|
||||
weaviate_collections: list[tuple[str, int]] = []
|
||||
try:
|
||||
weaviate_config = StorageConfig(
|
||||
backend=StorageBackend.WEAVIATE,
|
||||
@@ -478,17 +491,16 @@ async def run_list_collections() -> None:
|
||||
weaviate = WeaviateStorage(weaviate_config)
|
||||
await weaviate.initialize()
|
||||
|
||||
collections_list = weaviate.client.collections.list_all() if weaviate.client else []
|
||||
for collection in collections_list:
|
||||
collection_obj = weaviate.client.collections.get(collection) if weaviate.client else None
|
||||
if collection_obj:
|
||||
count = collection_obj.aggregate.over_all(total_count=True).total_count or 0
|
||||
weaviate_collections.append((collection, count))
|
||||
overview = await weaviate.describe_collections()
|
||||
for item in overview:
|
||||
name = str(item.get("name", "Unknown"))
|
||||
count = int(item.get("count", 0))
|
||||
weaviate_collections.append((name, count))
|
||||
except Exception as e:
|
||||
console.print(f"❌ [red]Weaviate connection failed: {e}[/red]")
|
||||
|
||||
# Try to connect to OpenWebUI
|
||||
openwebui_collections = []
|
||||
openwebui_collections: list[tuple[str, int]] = []
|
||||
try:
|
||||
openwebui_config = StorageConfig(
|
||||
backend=StorageBackend.OPEN_WEBUI,
|
||||
@@ -499,14 +511,11 @@ async def run_list_collections() -> None:
|
||||
openwebui = OpenWebUIStorage(openwebui_config)
|
||||
await openwebui.initialize()
|
||||
|
||||
response = await openwebui.client.get("/api/v1/knowledge/")
|
||||
response.raise_for_status()
|
||||
knowledge_bases = response.json()
|
||||
|
||||
for kb in knowledge_bases:
|
||||
name = kb.get("name", "Unknown")
|
||||
file_count = len(kb.get("files", []))
|
||||
openwebui_collections.append((name, file_count))
|
||||
overview = await openwebui.describe_collections()
|
||||
for item in overview:
|
||||
name = str(item.get("name", "Unknown"))
|
||||
count = int(item.get("count", 0))
|
||||
openwebui_collections.append((name, count))
|
||||
except Exception as e:
|
||||
console.print(f"❌ [red]OpenWebUI connection failed: {e}[/red]")
|
||||
|
||||
@@ -514,11 +523,12 @@ async def run_list_collections() -> None:
|
||||
if weaviate_collections or openwebui_collections:
|
||||
# Create results table
|
||||
from rich.table import Table
|
||||
|
||||
table = Table(
|
||||
title="📚 Collection Overview",
|
||||
title_style="bold magenta",
|
||||
border_style="cyan",
|
||||
header_style="bold blue"
|
||||
header_style="bold blue",
|
||||
)
|
||||
table.add_column("🏷️ Collection", style="cyan", no_wrap=True)
|
||||
table.add_column("📊 Backend", style="yellow")
|
||||
@@ -541,7 +551,7 @@ async def run_search(query: str, collection: str | None, backend: str, limit: in
|
||||
"""
|
||||
Search across collections.
|
||||
"""
|
||||
from ..config import get_settings
|
||||
from ..config import configure_prefect, get_settings
|
||||
from ..core.models import StorageBackend, StorageConfig
|
||||
from ..storage.weaviate import WeaviateStorage
|
||||
|
||||
@@ -567,12 +577,14 @@ async def run_search(query: str, collection: str | None, backend: str, limit: in
|
||||
|
||||
results_generator = weaviate.search(query, limit=limit)
|
||||
async for doc in results_generator:
|
||||
results.append({
|
||||
"title": getattr(doc, "title", "Untitled"),
|
||||
"content": getattr(doc, "content", ""),
|
||||
"score": getattr(doc, "score", 0.0),
|
||||
"backend": "🗄️ Weaviate"
|
||||
})
|
||||
results.append(
|
||||
{
|
||||
"title": getattr(doc, "title", "Untitled"),
|
||||
"content": getattr(doc, "content", ""),
|
||||
"score": getattr(doc, "score", 0.0),
|
||||
"backend": "🗄️ Weaviate",
|
||||
}
|
||||
)
|
||||
|
||||
elif backend == "open_webui":
|
||||
console.print("❌ [red]OpenWebUI search not yet implemented[/red]")
|
||||
@@ -585,11 +597,12 @@ async def run_search(query: str, collection: str | None, backend: str, limit: in
|
||||
# Display results
|
||||
if results:
|
||||
from rich.table import Table
|
||||
|
||||
table = Table(
|
||||
title=f"🔍 Search Results for '{query}'",
|
||||
title_style="bold magenta",
|
||||
border_style="green",
|
||||
header_style="bold blue"
|
||||
header_style="bold blue",
|
||||
)
|
||||
table.add_column("📄 Title", style="cyan", max_width=40)
|
||||
table.add_column("📝 Preview", style="white", max_width=60)
|
||||
|
||||

@@ -1,21 +1,51 @@
|
||||
"""Main TUI application with enhanced keyboard navigation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from collections import deque
|
||||
from pathlib import Path
|
||||
from queue import Empty, Queue
|
||||
from typing import TYPE_CHECKING, ClassVar, Literal
|
||||
|
||||
from textual import events
|
||||
from textual.app import App
|
||||
from textual.binding import Binding
|
||||
from textual.binding import Binding, BindingType
|
||||
from textual.timer import Timer
|
||||
|
||||
from ...storage.base import BaseStorage
|
||||
from ...storage.openwebui import OpenWebUIStorage
|
||||
from ...storage.weaviate import WeaviateStorage
|
||||
from .screens import CollectionOverviewScreen, HelpScreen
|
||||
from .styles import TUI_CSS
|
||||
from .utils.storage_manager import StorageManager
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from logging import Formatter, LogRecord
|
||||
|
||||
from ...storage.r2r.storage import R2RStorage
|
||||
from .screens.dialogs import LogViewerScreen
|
||||
else: # pragma: no cover - optional dependency fallback
|
||||
R2RStorage = BaseStorage
|
||||
|
||||
|
||||
|
||||
class CollectionManagementApp(App[None]):
|
||||
"""Enhanced modern Textual application with comprehensive keyboard navigation."""
|
||||
|
||||
CSS = TUI_CSS
|
||||
CSS: ClassVar[str] = TUI_CSS
|
||||
|
||||
BINDINGS = [
|
||||
def safe_notify(
|
||||
self,
|
||||
message: str,
|
||||
*,
|
||||
severity: Literal["information", "warning", "error"] = "information",
|
||||
) -> None:
|
||||
"""Safely notify with markup disabled to prevent parsing errors."""
|
||||
self.notify(message, severity=severity, markup=False)
|
||||
|
||||
BINDINGS: ClassVar[list[BindingType]] = [
|
||||
Binding("q", "quit", "Quit"),
|
||||
Binding("ctrl+c", "quit", "Quit"),
|
||||
Binding("ctrl+q", "quit", "Quit"),
|
||||
@@ -25,27 +55,123 @@ class CollectionManagementApp(App[None]):
|
||||
# Global navigation shortcuts
|
||||
Binding("ctrl+r", "refresh_current", "Refresh Current Screen"),
|
||||
Binding("ctrl+w", "close_current", "Close Current Screen"),
|
||||
Binding("ctrl+l", "toggle_logs", "Logs"),
|
||||
# Tab navigation shortcuts
|
||||
Binding("ctrl+1", "dashboard_tab", "Dashboard", show=False),
|
||||
Binding("ctrl+2", "collections_tab", "Collections", show=False),
|
||||
Binding("ctrl+3", "analytics_tab", "Analytics", show=False),
|
||||
]
|
||||
|
||||
storage_manager: StorageManager
|
||||
weaviate: WeaviateStorage | None
|
||||
openwebui: OpenWebUIStorage | None
|
||||
r2r: R2RStorage | BaseStorage | None
|
||||
log_queue: Queue[LogRecord] | None
|
||||
_log_formatter: Formatter
|
||||
_log_buffer: deque[str]
|
||||
_log_viewer: LogViewerScreen | None
|
||||
_log_file: Path | None
|
||||
_log_timer: Timer | None
|
||||
|
||||
def __init__(
|
||||
self, weaviate: WeaviateStorage | None = None, openwebui: OpenWebUIStorage | None = None
|
||||
):
|
||||
self,
|
||||
storage_manager: StorageManager,
|
||||
weaviate: WeaviateStorage | None = None,
|
||||
openwebui: OpenWebUIStorage | None = None,
|
||||
r2r: R2RStorage | BaseStorage | None = None,
|
||||
*,
|
||||
log_queue: Queue[LogRecord] | None = None,
|
||||
log_formatter: Formatter | None = None,
|
||||
log_file: Path | None = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.storage_manager = storage_manager
|
||||
self.weaviate = weaviate
|
||||
self.openwebui = openwebui
|
||||
self.r2r = r2r
|
||||
self.title: str = ""
|
||||
self.sub_title: str = ""
|
||||
self.log_queue = log_queue
|
||||
self._log_formatter = log_formatter or logging.Formatter(
|
||||
fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
self._log_buffer = deque(maxlen=500)
|
||||
self._log_viewer = None
|
||||
self._log_file = log_file
|
||||
self._log_timer = None
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Initialize the enhanced app with better branding."""
|
||||
self.title = "🚀 Enhanced Collection Management System"
|
||||
self.sub_title = "Advanced Document Ingestion & Management Platform with Keyboard Navigation"
|
||||
self.push_screen(CollectionOverviewScreen(self.weaviate, self.openwebui))
|
||||
self.sub_title = (
|
||||
"Advanced Document Ingestion & Management Platform with Keyboard Navigation"
|
||||
)
|
||||
reduced_motion_env = os.getenv("TEXTUAL_REDUCED_MOTION") or os.getenv(
|
||||
"PREFER_REDUCED_MOTION"
|
||||
)
|
||||
if reduced_motion_env is not None:
|
||||
normalized = reduced_motion_env.strip().lower()
|
||||
reduced_motion_enabled = normalized in {"1", "true", "yes", "on"}
|
||||
else:
|
||||
reduced_motion_enabled = False
|
||||
_ = self.set_class(reduced_motion_enabled, "reduced-motion")
|
||||
_ = self.push_screen(
|
||||
CollectionOverviewScreen(
|
||||
self.storage_manager,
|
||||
self.weaviate,
|
||||
self.openwebui,
|
||||
self.r2r,
|
||||
)
|
||||
)
|
||||
if self.log_queue is not None and self._log_timer is None:
|
||||
# Poll the queue so log output is captured without blocking the UI loop
|
||||
self._log_timer = self.set_interval(0.25, self._drain_log_queue)
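# Example (not part of this commit): the log_queue drained above is expected to be fed
# by a standard-library QueueHandler installed before the app starts; the storage_manager
# argument is assumed to be constructed elsewhere:
import logging
from logging.handlers import QueueHandler
from queue import Queue

log_queue: Queue[logging.LogRecord] = Queue()
logging.getLogger().addHandler(QueueHandler(log_queue))

app = CollectionManagementApp(storage_manager, log_queue=log_queue)
app.run()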
|
||||
|
||||
def _drain_log_queue(self) -> None:
|
||||
"""Drain queued log records and route them to the active log viewer."""
|
||||
if self.log_queue is None:
|
||||
return
|
||||
|
||||
drained: list[str] = []
|
||||
while True:
|
||||
try:
|
||||
record = self.log_queue.get_nowait()
|
||||
except Empty:
|
||||
break
|
||||
message = self._log_formatter.format(record)
|
||||
self._log_buffer.append(message)
|
||||
drained.append(message)
|
||||
|
||||
if drained and self._log_viewer is not None:
|
||||
self._log_viewer.append_logs(drained)
|
||||
|
||||
def attach_log_viewer(self, viewer: "LogViewerScreen") -> None:
|
||||
"""Register an active log viewer and hydrate it with existing entries."""
|
||||
self._log_viewer = viewer
|
||||
viewer.replace_logs(list(self._log_buffer))
|
||||
viewer.update_log_file(self._log_file)
|
||||
# Drain once more to deliver any entries gathered between instantiation and mount
|
||||
self._drain_log_queue()
|
||||
|
||||
def detach_log_viewer(self, viewer: "LogViewerScreen") -> None:
|
||||
"""Remove the current log viewer when it is dismissed."""
|
||||
if self._log_viewer is viewer:
|
||||
self._log_viewer = None
|
||||
|
||||
def get_log_file_path(self) -> Path | None:
|
||||
"""Return the active log file path if configured."""
|
||||
return self._log_file
|
||||
|
||||
def action_toggle_logs(self) -> None:
|
||||
"""Toggle the log viewer modal screen."""
|
||||
if self._log_viewer is not None:
|
||||
_ = self.pop_screen()
|
||||
return
|
||||
|
||||
from .screens.dialogs import LogViewerScreen # Local import to avoid cycle
|
||||
|
||||
_ = self.push_screen(LogViewerScreen())
|
||||
|
||||
def action_help(self) -> None:
|
||||
"""Show comprehensive help information with all keyboard shortcuts."""
|
||||
@@ -131,40 +257,44 @@ class CollectionManagementApp(App[None]):
|
||||
|
||||
*Press Escape, Enter, or Q to close this help.*
|
||||
"""
|
||||
self.push_screen(HelpScreen(help_md))
|
||||
_ = self.push_screen(HelpScreen(help_md))
|
||||
|
||||
def action_refresh_current(self) -> None:
|
||||
"""Refresh the current screen if it supports it."""
|
||||
current_screen = self.screen
|
||||
if hasattr(current_screen, "action_refresh"):
|
||||
current_screen.action_refresh()
|
||||
else:
|
||||
self.notify("Current screen doesn't support refresh", severity="information")
|
||||
handler = getattr(current_screen, "action_refresh", None)
|
||||
if callable(handler):
|
||||
_ = handler()
|
||||
return
|
||||
self.notify("Current screen doesn't support refresh", severity="information")
|
||||
|
||||
def action_close_current(self) -> None:
|
||||
"""Close current screen/dialog."""
|
||||
if len(self.screen_stack) > 1: # Don't close the main screen
|
||||
self.pop_screen()
|
||||
_ = self.pop_screen()
|
||||
else:
|
||||
self.notify("Cannot close main screen. Use Q to quit.", severity="warning")
|
||||
_ = self.notify("Cannot close main screen. Use Q to quit.", severity="warning")
|
||||
|
||||
def action_dashboard_tab(self) -> None:
|
||||
"""Switch to dashboard tab in current screen."""
|
||||
current_screen = self.screen
|
||||
if hasattr(current_screen, "action_tab_dashboard"):
|
||||
current_screen.action_tab_dashboard()
|
||||
handler = getattr(current_screen, "action_tab_dashboard", None)
|
||||
if callable(handler):
|
||||
_ = handler()
|
||||
|
||||
def action_collections_tab(self) -> None:
|
||||
"""Switch to collections tab in current screen."""
|
||||
current_screen = self.screen
|
||||
if hasattr(current_screen, "action_tab_collections"):
|
||||
current_screen.action_tab_collections()
|
||||
handler = getattr(current_screen, "action_tab_collections", None)
|
||||
if callable(handler):
|
||||
_ = handler()
|
||||
|
||||
def action_analytics_tab(self) -> None:
|
||||
"""Switch to analytics tab in current screen."""
|
||||
current_screen = self.screen
|
||||
if hasattr(current_screen, "action_tab_analytics"):
|
||||
current_screen.action_tab_analytics()
|
||||
handler = getattr(current_screen, "action_tab_analytics", None)
|
||||
if callable(handler):
|
||||
_ = handler()
|
||||
|
||||
def on_key(self, event: events.Key) -> None:
|
||||
"""Handle global keyboard shortcuts."""
|
||||
@@ -172,10 +302,10 @@ class CollectionManagementApp(App[None]):
|
||||
if event.key == "ctrl+shift+?":
|
||||
# Alternative help shortcut
|
||||
self.action_help()
|
||||
event.prevent_default()
|
||||
_ = event.prevent_default()
|
||||
elif event.key == "ctrl+alt+r":
|
||||
# Force refresh all connections
|
||||
self.notify("🔄 Refreshing all connections...", severity="information")
|
||||
_ = self.notify("🔄 Refreshing all connections...", severity="information")
|
||||
# This could trigger a full reinit if needed
|
||||
event.prevent_default()
|
||||
_ = event.prevent_default()
|
||||
# No else clause needed - just handle our events
|
||||
|
||||
380
ingest_pipeline/cli/tui/layouts.py
Normal file
@@ -0,0 +1,380 @@
|
||||
"""Responsive layout system for TUI applications."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from textual.app import ComposeResult
|
||||
from textual.containers import Container, VerticalScroll
|
||||
from textual.widgets import Static
|
||||
from typing_extensions import override
|
||||
|
||||
|
||||
class ResponsiveGrid(Container):
|
||||
"""Grid that auto-adjusts based on terminal size."""
|
||||
|
||||
DEFAULT_CSS = """
|
||||
ResponsiveGrid {
|
||||
layout: grid;
|
||||
grid-size: 1;
|
||||
grid-columns: 1fr;
|
||||
grid-rows: auto;
|
||||
grid-gutter: 1;
|
||||
padding: 1;
|
||||
}
|
||||
|
||||
ResponsiveGrid.two-column {
|
||||
grid-size: 2;
|
||||
grid-columns: 1fr 1fr;
|
||||
}
|
||||
|
||||
ResponsiveGrid.three-column {
|
||||
grid-size: 3;
|
||||
grid-columns: 1fr 1fr 1fr;
|
||||
}
|
||||
|
||||
ResponsiveGrid.auto-fit {
|
||||
grid-columns: repeat(auto-fit, minmax(20, 1fr));
|
||||
}
|
||||
|
||||
ResponsiveGrid.compact {
|
||||
grid-gutter: 0;
|
||||
padding: 0;
|
||||
}
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*children: Any,
|
||||
columns: int = 1,
|
||||
auto_fit: bool = False,
|
||||
compact: bool = False,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Initialize responsive grid."""
|
||||
super().__init__(*children, **kwargs)
|
||||
self.columns = columns
|
||||
self.auto_fit = auto_fit
|
||||
self.compact = compact
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Apply responsive classes based on configuration."""
|
||||
if self.auto_fit:
|
||||
_ = self.add_class("auto-fit")
|
||||
elif self.columns == 2:
|
||||
_ = self.add_class("two-column")
|
||||
elif self.columns == 3:
|
||||
_ = self.add_class("three-column")
|
||||
|
||||
if self.compact:
|
||||
_ = self.add_class("compact")
|
||||
|
||||
def on_resize(self) -> None:
|
||||
"""Adjust layout based on terminal size."""
|
||||
if self.auto_fit:
|
||||
# Let CSS handle auto-fit
|
||||
return
|
||||
|
||||
terminal_width = self.size.width
|
||||
if terminal_width < 60:
|
||||
# Force single column on narrow terminals
|
||||
_ = self.remove_class("two-column", "three-column")
|
||||
self.styles.grid_size_columns = 1
|
||||
self.styles.grid_columns = "1fr"
|
||||
elif terminal_width < 100 and self.columns > 2:
|
||||
# Force two columns on medium terminals
|
||||
_ = self.remove_class("three-column")
|
||||
_ = self.add_class("two-column")
|
||||
self.styles.grid_size_columns = 2
|
||||
self.styles.grid_columns = "1fr 1fr"
|
||||
elif self.columns == 2:
|
||||
_ = self.add_class("two-column")
|
||||
elif self.columns == 3:
|
||||
_ = self.add_class("three-column")
|
||||
|
||||
|
||||
class CollapsibleSidebar(Container):
|
||||
"""Sidebar that can be collapsed to save space."""
|
||||
|
||||
DEFAULT_CSS = """
|
||||
CollapsibleSidebar {
|
||||
dock: left;
|
||||
width: 25%;
|
||||
min-width: 20;
|
||||
max-width: 40;
|
||||
background: $surface;
|
||||
border-right: solid $border;
|
||||
padding: 1;
|
||||
transition: width 300ms;
|
||||
}
|
||||
|
||||
CollapsibleSidebar.collapsed {
|
||||
width: 3;
|
||||
min-width: 3;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
CollapsibleSidebar.collapsed > * {
|
||||
display: none;
|
||||
}
|
||||
|
||||
CollapsibleSidebar .sidebar-toggle {
|
||||
dock: top;
|
||||
height: 1;
|
||||
background: $primary;
|
||||
color: $text;
|
||||
text-align: center;
|
||||
margin-bottom: 1;
|
||||
}
|
||||
|
||||
CollapsibleSidebar .sidebar-content {
|
||||
height: 1fr;
|
||||
overflow-y: auto;
|
||||
}
|
||||
"""
|
||||
|
||||
def __init__(self, *children: Any, collapsed: bool = False, **kwargs: Any) -> None:
|
||||
"""Initialize collapsible sidebar."""
|
||||
super().__init__(**kwargs)
|
||||
self.collapsed = collapsed
|
||||
self._children = children
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
"""Compose sidebar with toggle and content."""
|
||||
yield Static("☰", classes="sidebar-toggle")
|
||||
with VerticalScroll(classes="sidebar-content"):
|
||||
yield from self._children
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Apply initial collapsed state."""
|
||||
if self.collapsed:
|
||||
_ = self.add_class("collapsed")
|
||||
|
||||
def on_click(self) -> None:
|
||||
"""Toggle sidebar when clicked."""
|
||||
self.toggle()
|
||||
|
||||
def toggle(self) -> None:
|
||||
"""Toggle sidebar collapsed state."""
|
||||
self.collapsed = not self.collapsed
|
||||
if self.collapsed:
|
||||
_ = self.add_class("collapsed")
|
||||
else:
|
||||
_ = self.remove_class("collapsed")
|
||||
|
||||
def expand(self) -> None:
|
||||
"""Expand sidebar."""
|
||||
if self.collapsed:
|
||||
self.toggle()
|
||||
|
||||
def collapse(self) -> None:
|
||||
"""Collapse sidebar."""
|
||||
if not self.collapsed:
|
||||
self.toggle()
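# Example (not part of this commit): a sidebar that starts collapsed and is expanded
# programmatically; clicking the ☰ header toggles it as well:
from textual.widgets import Static

sidebar = CollapsibleSidebar(Static("Filters"), Static("Saved searches"), collapsed=True)
sidebar.expand()  # or sidebar.collapse() / sidebar.toggle()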
|
||||
|
||||
|
||||
class TabularLayout(Container):
|
||||
"""Optimized layout for data tables with optional sidebar."""
|
||||
|
||||
DEFAULT_CSS = """
|
||||
TabularLayout {
|
||||
layout: horizontal;
|
||||
height: 100%;
|
||||
}
|
||||
|
||||
TabularLayout .main-content {
|
||||
width: 1fr;
|
||||
height: 100%;
|
||||
layout: vertical;
|
||||
}
|
||||
|
||||
TabularLayout .table-container {
|
||||
height: 1fr;
|
||||
overflow: auto;
|
||||
border: solid $border;
|
||||
background: $surface;
|
||||
}
|
||||
|
||||
TabularLayout .table-header {
|
||||
dock: top;
|
||||
height: 3;
|
||||
background: $primary;
|
||||
color: $text;
|
||||
padding: 1;
|
||||
}
|
||||
|
||||
TabularLayout .table-footer {
|
||||
dock: bottom;
|
||||
height: 3;
|
||||
background: $surface-lighten-1;
|
||||
padding: 1;
|
||||
border-top: solid $border;
|
||||
}
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
table_widget: Any,
|
||||
header_content: Any | None = None,
|
||||
footer_content: Any | None = None,
|
||||
sidebar_content: Any | None = None,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Initialize tabular layout."""
|
||||
super().__init__(**kwargs)
|
||||
self.table_widget = table_widget
|
||||
self.header_content = header_content
|
||||
self.footer_content = footer_content
|
||||
self.sidebar_content = sidebar_content
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
"""Compose layout with optional sidebar."""
|
||||
if self.sidebar_content:
|
||||
yield CollapsibleSidebar(self.sidebar_content)
|
||||
|
||||
with Container(classes="main-content"):
|
||||
if self.header_content:
|
||||
yield Container(self.header_content, classes="table-header")
|
||||
|
||||
yield Container(self.table_widget, classes="table-container")
|
||||
|
||||
if self.footer_content:
|
||||
yield Container(self.footer_content, classes="table-footer")
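# Example (not part of this commit): wiring a DataTable into TabularLayout with the
# optional header, footer and sidebar slots; widget contents are placeholders:
from textual.widgets import DataTable, Static

layout = TabularLayout(
    table_widget=DataTable(),
    header_content=Static("Collections"),
    footer_content=Static("0 selected"),
    sidebar_content=Static("Backends"),
)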
|
||||
|
||||
|
||||
class CardLayout(ResponsiveGrid):
|
||||
"""Grid layout optimized for card-based content."""
|
||||
|
||||
DEFAULT_CSS = """
|
||||
CardLayout {
|
||||
grid-gutter: 2;
|
||||
padding: 2;
|
||||
}
|
||||
|
||||
CardLayout .card {
|
||||
background: $surface;
|
||||
border: solid $border;
|
||||
border-radius: 1;
|
||||
padding: 2;
|
||||
height: auto;
|
||||
min-height: 10;
|
||||
}
|
||||
|
||||
CardLayout .card:hover {
|
||||
border: solid $accent;
|
||||
background: $surface-lighten-1;
|
||||
}
|
||||
|
||||
CardLayout .card:focus {
|
||||
border: solid $primary;
|
||||
}
|
||||
|
||||
CardLayout .card-header {
|
||||
dock: top;
|
||||
height: 3;
|
||||
background: $primary-lighten-1;
|
||||
color: $text;
|
||||
padding: 1;
|
||||
margin: -2 -2 1 -2;
|
||||
border-radius: 1 1 0 0;
|
||||
}
|
||||
|
||||
CardLayout .card-content {
|
||||
height: 1fr;
|
||||
overflow: auto;
|
||||
}
|
||||
|
||||
CardLayout .card-footer {
|
||||
dock: bottom;
|
||||
height: 3;
|
||||
background: $surface-darken-1;
|
||||
padding: 1;
|
||||
margin: 1 -2 -2 -2;
|
||||
border-radius: 0 0 1 1;
|
||||
}
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs: Any) -> None:
|
||||
"""Initialize card layout with default settings for cards."""
|
||||
# Default to auto-fit cards with minimum width
|
||||
super().__init__(auto_fit=True, **kwargs)
|
||||
|
||||
|
||||
class SplitPane(Container):
|
||||
"""Resizable split pane layout."""
|
||||
|
||||
DEFAULT_CSS = """
|
||||
SplitPane {
|
||||
layout: horizontal;
|
||||
height: 100%;
|
||||
}
|
||||
|
||||
SplitPane.vertical {
|
||||
layout: vertical;
|
||||
}
|
||||
|
||||
SplitPane .left-pane,
|
||||
SplitPane .top-pane {
|
||||
width: 50%;
|
||||
height: 50%;
|
||||
background: $surface;
|
||||
border-right: solid $border;
|
||||
border-bottom: solid $border;
|
||||
}
|
||||
|
||||
SplitPane .right-pane,
|
||||
SplitPane .bottom-pane {
|
||||
width: 50%;
|
||||
height: 50%;
|
||||
background: $surface;
|
||||
}
|
||||
|
||||
SplitPane .splitter {
|
||||
width: 1;
|
||||
height: 1;
|
||||
background: $border;
|
||||
}
|
||||
|
||||
SplitPane.vertical .splitter {
|
||||
width: 100%;
|
||||
height: 1;
|
||||
}
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
left_content: Any,
|
||||
right_content: Any,
|
||||
vertical: bool = False,
|
||||
split_ratio: float = 0.5,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Initialize split pane."""
|
||||
super().__init__(**kwargs)
|
||||
self.left_content = left_content
|
||||
self.right_content = right_content
|
||||
self.vertical = vertical
|
||||
self.split_ratio = split_ratio
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
"""Compose split pane layout."""
|
||||
if self.vertical:
|
||||
_ = self.add_class("vertical")
|
||||
|
||||
pane_classes = ("top-pane", "bottom-pane") if self.vertical else ("left-pane", "right-pane")
|
||||
|
||||
yield Container(self.left_content, classes=pane_classes[0])
|
||||
yield Static("", classes="splitter")
|
||||
yield Container(self.right_content, classes=pane_classes[1])
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Apply split ratio."""
|
||||
if self.vertical:
|
||||
self.query_one(f".{self.__class__.__name__} .top-pane").styles.height = f"{self.split_ratio * 100}%"
|
||||
self.query_one(f".{self.__class__.__name__} .bottom-pane").styles.height = f"{(1 - self.split_ratio) * 100}%"
|
||||
else:
|
||||
self.query_one(f".{self.__class__.__name__} .left-pane").styles.width = f"{self.split_ratio * 100}%"
|
||||
self.query_one(f".{self.__class__.__name__} .right-pane").styles.width = f"{(1 - self.split_ratio) * 100}%"
|
||||
@@ -1,6 +1,17 @@
|
||||
"""Data models and TypedDict definitions for the TUI."""
|
||||
|
||||
from typing import TypedDict
|
||||
from enum import IntEnum
|
||||
from typing import Any, TypedDict
|
||||
|
||||
|
||||
class StorageCapabilities(IntEnum):
|
||||
"""Storage backend capabilities (ordered by feature completeness)."""
|
||||
|
||||
NONE = 0
|
||||
BASIC = 1 # Basic CRUD operations
|
||||
VECTOR_SEARCH = 2 # Vector search capabilities
|
||||
KNOWLEDGE_BASE = 3 # Knowledge base features
|
||||
FULL_FEATURED = 4 # All features including chunks and entities
|
||||
|
||||
|
||||
class CollectionInfo(TypedDict):
|
||||
@@ -9,7 +20,7 @@ class CollectionInfo(TypedDict):
|
||||
name: str
|
||||
type: str
|
||||
count: int
|
||||
backend: str
|
||||
backend: str | list[str] # Support both single backend and multi-backend
|
||||
status: str
|
||||
last_updated: str
|
||||
size_mb: float
|
||||
@@ -21,6 +32,86 @@ class DocumentInfo(TypedDict):
|
||||
id: str
|
||||
title: str
|
||||
source_url: str
|
||||
description: str
|
||||
content_type: str
|
||||
content_preview: str
|
||||
word_count: int
|
||||
timestamp: str
|
||||
|
||||
|
||||
class ChunkInfo(TypedDict):
|
||||
"""Information about a document chunk (R2R specific)."""
|
||||
|
||||
id: str
|
||||
document_id: str
|
||||
content: str
|
||||
start_index: int
|
||||
end_index: int
|
||||
metadata: dict[str, Any]
|
||||
|
||||
|
||||
class EntityInfo(TypedDict):
|
||||
"""Information about an extracted entity (R2R specific)."""
|
||||
|
||||
id: str
|
||||
name: str
|
||||
type: str
|
||||
confidence: float
|
||||
metadata: dict[str, Any]
|
||||
|
||||
|
||||
class FirecrawlOptions(TypedDict, total=False):
|
||||
"""Advanced Firecrawl scraping options."""
|
||||
|
||||
# Scraping options
|
||||
formats: list[str] # ["markdown", "html", "screenshot"]
|
||||
only_main_content: bool
|
||||
include_tags: list[str]
|
||||
exclude_tags: list[str]
|
||||
wait_for: int # milliseconds
|
||||
|
||||
# Mapping options
|
||||
search: str | None
|
||||
include_subdomains: bool
|
||||
limit: int
|
||||
max_depth: int
|
||||
|
||||
# Extraction options
|
||||
extract_schema: dict[str, Any] | None
|
||||
extract_prompt: str | None
|
||||
|
||||
|
||||
class IngestionConfig(TypedDict):
|
||||
"""Configuration for ingestion operations."""
|
||||
|
||||
source_url: str
|
||||
source_type: str # "web", "repository", "documentation"
|
||||
target_collection: str
|
||||
storage_backend: str
|
||||
firecrawl_options: FirecrawlOptions
|
||||
batch_size: int
|
||||
max_concurrent: int
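# Example (not part of this commit): populating the TypedDicts defined above; all values
# are illustrative and both dicts type-check against these definitions:
options: FirecrawlOptions = {
    "formats": ["markdown"],
    "only_main_content": True,
    "max_depth": 2,
}
config: IngestionConfig = {
    "source_url": "https://docs.example.com",
    "source_type": "documentation",
    "target_collection": "docs_example_com_documentation",
    "storage_backend": "weaviate",
    "firecrawl_options": options,
    "batch_size": 50,
    "max_concurrent": 4,
}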
|
||||
|
||||
|
||||
class SearchFilter(TypedDict, total=False):
|
||||
"""Search filtering options."""
|
||||
|
||||
backends: list[str]
|
||||
collections: list[str]
|
||||
content_types: list[str]
|
||||
date_range: tuple[str, str] | None
|
||||
word_count_range: tuple[int, int] | None
|
||||
similarity_threshold: float
|
||||
|
||||
|
||||
class IngestionProgress(TypedDict):
|
||||
"""Real-time ingestion progress information."""
|
||||
|
||||
total_urls: int
|
||||
processed_urls: int
|
||||
successful_ingestions: int
|
||||
failed_ingestions: int
|
||||
current_url: str
|
||||
elapsed_time: float
|
||||
estimated_remaining: float
|
||||
errors: list[str]
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
"""Screen components for the TUI application."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .dashboard import CollectionOverviewScreen
|
||||
from .dialogs import ConfirmDeleteScreen, ConfirmDocumentDeleteScreen
|
||||
from .documents import DocumentManagementScreen
|
||||
|
||||
370
ingest_pipeline/cli/tui/screens/base.py
Normal file
@@ -0,0 +1,370 @@
|
||||
"""Base screen classes for common CRUD patterns."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import TYPE_CHECKING, Any, Generic, TypeVar
|
||||
|
||||
from textual import work
|
||||
from textual.app import ComposeResult
|
||||
from textual.binding import Binding
|
||||
from textual.containers import Container
|
||||
from textual.screen import ModalScreen, Screen
|
||||
from textual.widgets import Button, DataTable, LoadingIndicator, Static
|
||||
from typing_extensions import override
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..utils.storage_manager import StorageManager
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
class BaseScreen(Screen, ABC):
|
||||
"""Base screen with common functionality."""
|
||||
|
||||
def __init__(self, storage_manager: StorageManager, **kwargs: Any) -> None:
|
||||
"""Initialize base screen."""
|
||||
super().__init__(**kwargs)
|
||||
self.storage_manager = storage_manager
|
||||
|
||||
|
||||
class CRUDScreen(BaseScreen, Generic[T], ABC):
|
||||
"""Base class for Create/Read/Update/Delete operations."""
|
||||
|
||||
BINDINGS = [
|
||||
Binding("ctrl+n", "create_item", "New"),
|
||||
Binding("ctrl+e", "edit_item", "Edit"),
|
||||
Binding("ctrl+d", "delete_item", "Delete"),
|
||||
Binding("f5", "refresh", "Refresh"),
|
||||
Binding("escape", "app.pop_screen", "Back"),
|
||||
]
|
||||
|
||||
def __init__(self, storage_manager: StorageManager, **kwargs: Any) -> None:
|
||||
"""Initialize CRUD screen."""
|
||||
super().__init__(storage_manager, **kwargs)
|
||||
self.items: list[T] = []
|
||||
self.selected_item: T | None = None
|
||||
self.loading = False
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
"""Compose CRUD screen layout."""
|
||||
yield Container(
|
||||
Static(self.get_title(), classes="screen-title"),
|
||||
self.create_toolbar(),
|
||||
self.create_list_view(),
|
||||
LoadingIndicator(id="loading"),
|
||||
classes="crud-container",
|
||||
)
|
||||
|
||||
def get_title(self) -> str:
|
||||
"""Get screen title."""
|
||||
return "CRUD Operations"
|
||||
|
||||
def create_toolbar(self) -> Container:
|
||||
"""Create action toolbar."""
|
||||
return Container(
|
||||
Button("📝 New", id="new_btn", variant="primary"),
|
||||
Button("✏️ Edit", id="edit_btn", variant="default"),
|
||||
Button("🗑️ Delete", id="delete_btn", variant="error"),
|
||||
Button("🔄 Refresh", id="refresh_btn", variant="default"),
|
||||
classes="toolbar",
|
||||
)
|
||||
|
||||
def create_list_view(self) -> DataTable[str]:
|
||||
"""Create list view widget."""
|
||||
table = DataTable[str](id="items_table")
|
||||
table.add_columns(*self.get_table_columns())
|
||||
return table
|
||||
|
||||
@abstractmethod
|
||||
def get_table_columns(self) -> list[str]:
|
||||
"""Get table column headers."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def load_items(self) -> list[T]:
|
||||
"""Load items from storage."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def item_to_row(self, item: T) -> list[str]:
|
||||
"""Convert item to table row."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def create_item_dialog(self) -> T | None:
|
||||
"""Show create item dialog."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def edit_item_dialog(self, item: T) -> T | None:
|
||||
"""Show edit item dialog."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def delete_item(self, item: T) -> bool:
|
||||
"""Delete item."""
|
||||
pass
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Initialize screen."""
|
||||
self.query_one("#loading").display = False
|
||||
self.refresh_items()
|
||||
|
||||
@work(exclusive=True)
|
||||
async def refresh_items(self) -> None:
|
||||
"""Refresh items list."""
|
||||
self.set_loading(True)
|
||||
try:
|
||||
self.items = await self.load_items()
|
||||
await self.update_table()
|
||||
finally:
|
||||
self.set_loading(False)
|
||||
|
||||
async def update_table(self) -> None:
|
||||
"""Update table with current items."""
|
||||
table = self.query_one("#items_table", DataTable)
|
||||
table.clear()
|
||||
|
||||
for item in self.items:
|
||||
row_data = self.item_to_row(item)
|
||||
table.add_row(*row_data)
|
||||
|
||||
def set_loading(self, loading: bool) -> None:
|
||||
"""Set loading state."""
|
||||
self.loading = loading
|
||||
loading_widget = self.query_one("#loading")
|
||||
loading_widget.display = loading
|
||||
|
||||
def action_create_item(self) -> None:
|
||||
"""Create new item."""
|
||||
self.run_worker(self._create_item_worker())
|
||||
|
||||
def action_edit_item(self) -> None:
|
||||
"""Edit selected item."""
|
||||
if self.selected_item:
|
||||
self.run_worker(self._edit_item_worker())
|
||||
|
||||
def action_delete_item(self) -> None:
|
||||
"""Delete selected item."""
|
||||
if self.selected_item:
|
||||
self.run_worker(self._delete_item_worker())
|
||||
|
||||
def action_refresh(self) -> None:
|
||||
"""Refresh items."""
|
||||
self.refresh_items()
|
||||
|
||||
async def _create_item_worker(self) -> None:
|
||||
"""Worker for creating items."""
|
||||
item = await self.create_item_dialog()
|
||||
if item:
|
||||
self.refresh_items()
|
||||
|
||||
async def _edit_item_worker(self) -> None:
|
||||
"""Worker for editing items."""
|
||||
if self.selected_item:
|
||||
item = await self.edit_item_dialog(self.selected_item)
|
||||
if item:
|
||||
self.refresh_items()
|
||||
|
||||
async def _delete_item_worker(self) -> None:
|
||||
"""Worker for deleting items."""
|
||||
if self.selected_item:
|
||||
success = await self.delete_item(self.selected_item)
|
||||
if success:
|
||||
self.refresh_items()
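# Example (not part of this commit): a concrete subclass sketch of CRUDScreen.
# CollectionInfo and the storage-manager call are assumptions here, and the three
# abstract dialog hooks are omitted for brevity:
class CollectionCrudScreen(CRUDScreen[CollectionInfo]):
    def get_table_columns(self) -> list[str]:
        return ["Name", "Backend", "Documents"]

    async def load_items(self) -> list[CollectionInfo]:
        return await self.storage_manager.get_all_collections()

    def item_to_row(self, item: CollectionInfo) -> list[str]:
        return [item["name"], str(item["backend"]), f"{item['count']:,}"]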
|
||||
|
||||
|
||||
class ListScreen(BaseScreen, Generic[T], ABC):
|
||||
"""Base for paginated list views."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
storage_manager: StorageManager,
|
||||
page_size: int = 20,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Initialize list screen."""
|
||||
super().__init__(storage_manager, **kwargs)
|
||||
self.page_size = page_size
|
||||
self.current_page = 0
|
||||
self.total_items = 0
|
||||
self.items: list[T] = []
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
"""Compose list screen layout."""
|
||||
yield Container(
|
||||
Static(self.get_title(), classes="screen-title"),
|
||||
self.create_filters(),
|
||||
self.create_list_view(),
|
||||
self.create_pagination(),
|
||||
LoadingIndicator(id="loading"),
|
||||
classes="list-container",
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def get_title(self) -> str:
|
||||
"""Get screen title."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def create_filters(self) -> Container:
|
||||
"""Create filter widgets."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def create_list_view(self) -> Any:
|
||||
"""Create list view widget."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def load_page(self, page: int, page_size: int) -> tuple[list[T], int]:
|
||||
"""Load page of items."""
|
||||
pass
|
||||
|
||||
def create_pagination(self) -> Container:
|
||||
"""Create pagination controls."""
|
||||
return Container(
|
||||
Button("◀ Previous", id="prev_btn", variant="default"),
|
||||
Static("Page 1 of 1", id="page_info"),
|
||||
Button("Next ▶", id="next_btn", variant="default"),
|
||||
classes="pagination",
|
||||
)
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Initialize screen."""
|
||||
self.query_one("#loading").display = False
|
||||
self.load_current_page()
|
||||
|
||||
@work(exclusive=True)
|
||||
async def load_current_page(self) -> None:
|
||||
"""Load current page."""
|
||||
self.set_loading(True)
|
||||
try:
|
||||
self.items, self.total_items = await self.load_page(self.current_page, self.page_size)
|
||||
await self.update_list_view()
|
||||
self.update_pagination_info()
|
||||
finally:
|
||||
self.set_loading(False)
|
||||
|
||||
@abstractmethod
|
||||
async def update_list_view(self) -> None:
|
||||
"""Update list view with current items."""
|
||||
pass
|
||||
|
||||
def update_pagination_info(self) -> None:
|
||||
"""Update pagination information."""
|
||||
total_pages = max(1, (self.total_items + self.page_size - 1) // self.page_size)
|
||||
current_page_display = self.current_page + 1
|
||||
|
||||
page_info = self.query_one("#page_info", Static)
|
||||
page_info.update(f"Page {current_page_display} of {total_pages}")
|
||||
|
||||
prev_btn = self.query_one("#prev_btn", Button)
|
||||
next_btn = self.query_one("#next_btn", Button)
|
||||
|
||||
prev_btn.disabled = self.current_page == 0
|
||||
next_btn.disabled = self.current_page >= total_pages - 1
|
||||
|
||||
def set_loading(self, loading: bool) -> None:
|
||||
"""Set loading state."""
|
||||
loading_widget = self.query_one("#loading")
|
||||
loading_widget.display = loading
|
||||
|
||||
def on_button_pressed(self, event: Button.Pressed) -> None:
|
||||
"""Handle button presses."""
|
||||
if event.button.id == "prev_btn" and self.current_page > 0:
|
||||
self.current_page -= 1
|
||||
self.load_current_page()
|
||||
elif event.button.id == "next_btn":
|
||||
total_pages = (self.total_items + self.page_size - 1) // self.page_size
|
||||
if self.current_page < total_pages - 1:
|
||||
self.current_page += 1
|
||||
self.load_current_page()
|
||||
|
||||
|
||||
class FormScreen(ModalScreen[T], Generic[T], ABC):
|
||||
"""Base for input forms with validation."""
|
||||
|
||||
BINDINGS = [
|
||||
Binding("escape", "app.pop_screen", "Cancel"),
|
||||
Binding("ctrl+s", "save", "Save"),
|
||||
Binding("enter", "save", "Save"),
|
||||
]
|
||||
|
||||
def __init__(self, item: T | None = None, **kwargs: Any) -> None:
|
||||
"""Initialize form screen."""
|
||||
super().__init__(**kwargs)
|
||||
self.item = item
|
||||
self.is_edit_mode = item is not None
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
"""Compose form layout."""
|
||||
title = "Edit" if self.is_edit_mode else "Create"
|
||||
yield Container(
|
||||
Static(f"{title} {self.get_item_type()}", classes="form-title"),
|
||||
self.create_form_fields(),
|
||||
Container(
|
||||
Button("💾 Save", id="save_btn", variant="success"),
|
||||
Button("❌ Cancel", id="cancel_btn", variant="default"),
|
||||
classes="form-actions",
|
||||
),
|
||||
classes="form-container",
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def get_item_type(self) -> str:
|
||||
"""Get item type name for title."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def create_form_fields(self) -> Container:
|
||||
"""Create form input fields."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def validate_form(self) -> tuple[bool, list[str]]:
|
||||
"""Validate form data."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_form_data(self) -> T:
|
||||
"""Get item from form data."""
|
||||
pass
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Initialize form."""
|
||||
if self.is_edit_mode and self.item:
|
||||
self.populate_form(self.item)
|
||||
|
||||
@abstractmethod
|
||||
def populate_form(self, item: T) -> None:
|
||||
"""Populate form with item data."""
|
||||
pass
|
||||
|
||||
def action_save(self) -> None:
|
||||
"""Save form data."""
|
||||
is_valid, errors = self.validate_form()
|
||||
if is_valid:
|
||||
try:
|
||||
item = self.get_form_data()
|
||||
self.dismiss(item)
|
||||
except Exception as e:
|
||||
self.show_validation_errors([str(e)])
|
||||
else:
|
||||
self.show_validation_errors(errors)
|
||||
|
||||
def show_validation_errors(self, errors: list[str]) -> None:
|
||||
"""Show validation errors to user."""
|
||||
# This would typically show a notification or update error display
|
||||
pass
|
||||
|
||||
def on_button_pressed(self, event: Button.Pressed) -> None:
|
||||
"""Handle button presses."""
|
||||
if event.button.id == "save_btn":
|
||||
self.action_save()
|
||||
elif event.button.id == "cancel_btn":
|
||||
self.dismiss(None)
|
||||
@@ -1,11 +1,14 @@
|
||||
"""Main dashboard screen with collections overview."""
|
||||
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING, Final
|
||||
|
||||
from textual import work
|
||||
from textual.app import ComposeResult
|
||||
from textual.binding import Binding
|
||||
from textual.containers import Container, Grid, Horizontal
|
||||
from textual.css.query import NoMatches
|
||||
from textual.reactive import reactive, var
|
||||
from textual.screen import Screen
|
||||
from textual.widgets import (
|
||||
@@ -20,11 +23,22 @@ from textual.widgets import (
|
||||
)
|
||||
from typing_extensions import override
|
||||
|
||||
from ....core.models import StorageBackend
|
||||
from ....storage.base import BaseStorage
|
||||
from ....storage.openwebui import OpenWebUIStorage
|
||||
from ....storage.weaviate import WeaviateStorage
|
||||
from ..models import CollectionInfo
|
||||
from ..utils.storage_manager import StorageManager
|
||||
from ..widgets import EnhancedDataTable, MetricsCard, StatusIndicator
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ....storage.r2r.storage import R2RStorage
|
||||
else: # pragma: no cover - optional dependency fallback
|
||||
R2RStorage = BaseStorage
|
||||
|
||||
|
||||
LOGGER: Final[logging.Logger] = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CollectionOverviewScreen(Screen[None]):
|
||||
"""Enhanced dashboard with modern design and metrics."""
|
||||
@@ -51,11 +65,23 @@ class CollectionOverviewScreen(Screen[None]):
|
||||
collections: var[list[CollectionInfo]] = var([])
|
||||
is_loading: var[bool] = var(False)
|
||||
selected_collection: reactive[CollectionInfo | None] = reactive(None)
|
||||
storage_manager: StorageManager
|
||||
weaviate: WeaviateStorage | None
|
||||
openwebui: OpenWebUIStorage | None
|
||||
r2r: R2RStorage | BaseStorage | None
|
||||
|
||||
def __init__(self, weaviate: WeaviateStorage | None, openwebui: OpenWebUIStorage | None):
|
||||
def __init__(
|
||||
self,
|
||||
storage_manager: StorageManager,
|
||||
weaviate: WeaviateStorage | None,
|
||||
openwebui: OpenWebUIStorage | None,
|
||||
r2r: R2RStorage | BaseStorage | None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.storage_manager = storage_manager
|
||||
self.weaviate = weaviate
|
||||
self.openwebui = openwebui
|
||||
self.r2r = r2r
|
||||
self.total_documents = 0
|
||||
self.total_collections = 0
|
||||
self.active_backends = 0
|
||||
@@ -64,7 +90,7 @@ class CollectionOverviewScreen(Screen[None]):
|
||||
def compose(self) -> ComposeResult:
|
||||
yield Header(show_clock=True)
|
||||
|
||||
with TabbedContent("Dashboard", "Collections", "Analytics"):
|
||||
with TabbedContent():
|
||||
# Dashboard Tab
|
||||
with TabPane("Dashboard", id="dashboard"):
|
||||
yield Container(
|
||||
@@ -72,14 +98,19 @@ class CollectionOverviewScreen(Screen[None]):
|
||||
Static("Modern document ingestion and management platform", classes="subtitle"),
|
||||
Rule(line_style="heavy"),
|
||||
# Metrics Grid
|
||||
Grid(
|
||||
MetricsCard(
|
||||
"Collections", str(self.total_collections), "Active collections"
|
||||
Container(
|
||||
Grid(
|
||||
MetricsCard(
|
||||
"Collections", str(self.total_collections), "Active collections"
|
||||
),
|
||||
MetricsCard("Documents", str(self.total_documents), "Total indexed"),
|
||||
MetricsCard(
|
||||
"Backends", str(self.active_backends), "Connected services"
|
||||
),
|
||||
MetricsCard("Status", "Online", "System health"),
|
||||
classes="responsive-grid metrics-grid",
|
||||
),
|
||||
MetricsCard("Documents", str(self.total_documents), "Total indexed"),
|
||||
MetricsCard("Backends", str(self.active_backends), "Connected services"),
|
||||
MetricsCard("Status", "Online", "System health"),
|
||||
classes="responsive-grid metrics-grid",
|
||||
classes="center",
|
||||
),
|
||||
Rule(line_style="dashed"),
|
||||
# Quick Actions
|
||||
@@ -167,72 +198,111 @@ class CollectionOverviewScreen(Screen[None]):
|
||||
|
||||
def update_metrics(self) -> None:
|
||||
"""Update dashboard metrics with enhanced calculations."""
|
||||
self._calculate_metrics()
|
||||
self._update_metrics_cards()
|
||||
self._update_activity_feed()
|
||||
|
||||
def _calculate_metrics(self) -> None:
|
||||
"""Calculate basic metrics from collections."""
|
||||
self.total_collections = len(self.collections)
|
||||
self.total_documents = sum(col["count"] for col in self.collections)
|
||||
self.active_backends = sum([bool(self.weaviate), bool(self.openwebui), bool(self.r2r)])
|
||||
|
||||
# Count active backends
|
||||
self.active_backends = 0
|
||||
if self.weaviate:
|
||||
self.active_backends += 1
|
||||
if self.openwebui:
|
||||
self.active_backends += 1
|
||||
|
||||
# Update metrics cards if they exist
|
||||
def _update_metrics_cards(self) -> None:
|
||||
"""Update the metrics cards display."""
|
||||
try:
|
||||
dashboard_tab = self.query_one("#dashboard")
|
||||
metrics_cards = dashboard_tab.query(MetricsCard)
|
||||
if len(metrics_cards) >= 4:
|
||||
# Update existing cards with formatted values
|
||||
metrics_cards[0].query_one(".metrics-value", Static).update(
|
||||
f"{self.total_collections:,}"
|
||||
)
|
||||
metrics_cards[1].query_one(".metrics-value", Static).update(
|
||||
f"{self.total_documents:,}"
|
||||
)
|
||||
metrics_cards[2].query_one(".metrics-value", Static).update(
|
||||
str(self.active_backends)
|
||||
)
|
||||
self._update_card_values(metrics_cards)
|
||||
self._update_status_card(metrics_cards[3])
|
||||
except NoMatches:
|
||||
return
|
||||
except Exception as exc:
|
||||
LOGGER.exception("Failed to update dashboard metrics", exc_info=exc)
|
||||
|
||||
# Update status card based on system health
|
||||
if self.active_backends > 0 and self.total_collections > 0:
|
||||
status_text = "🟢 Healthy"
|
||||
status_class = "status-active"
|
||||
elif self.active_backends > 0:
|
||||
status_text = "🟡 Ready"
|
||||
status_class = "status-warning"
|
||||
else:
|
||||
status_text = "🔴 Offline"
|
||||
status_class = "status-error"
|
||||
def _update_card_values(self, metrics_cards: list) -> None:
|
||||
"""Update individual metric card values."""
|
||||
metrics_cards[0].query_one(".metrics-value", Static).update(f"{self.total_collections:,}")
|
||||
metrics_cards[1].query_one(".metrics-value", Static).update(f"{self.total_documents:,}")
|
||||
metrics_cards[2].query_one(".metrics-value", Static).update(str(self.active_backends))
|
||||
|
||||
metrics_cards[3].query_one(".metrics-value", Static).update(status_text)
|
||||
metrics_cards[3].add_class(status_class)
|
||||
def _update_status_card(self, status_card: object) -> None:
|
||||
"""Update the system status card."""
|
||||
if self.active_backends > 0 and self.total_collections > 0:
|
||||
status_text, status_class = "🟢 Healthy", "status-active"
|
||||
elif self.active_backends > 0:
|
||||
status_text, status_class = "🟡 Ready", "status-warning"
|
||||
else:
|
||||
status_text, status_class = "🔴 Offline", "status-error"
|
||||
|
||||
except Exception:
|
||||
pass # Cards might not be rendered yet
|
||||
status_card.query_one(".metrics-value", Static).update(status_text)
|
||||
status_card.add_class(status_class)
|
||||
|
||||
# Update activity feed with real data
|
||||
def _update_activity_feed(self) -> None:
|
||||
"""Update the activity feed with collection data."""
|
||||
try:
|
||||
dashboard_tab = self.query_one("#dashboard")
|
||||
activity_feed = dashboard_tab.query_one("#activity_feed", Static)
|
||||
if self.collections:
|
||||
recent_activity = []
|
||||
for col in self.collections[:3]: # Show top 3 collections
|
||||
recent_activity.append(
|
||||
f"📚 {col['name']}: {col['count']:,} docs ({col.get('size_mb', 0):.1f} MB)"
|
||||
)
|
||||
activity_text = "\\n".join(recent_activity)
|
||||
if len(self.collections) > 3:
|
||||
activity_text += f"\\n... and {len(self.collections) - 3} more collections"
|
||||
else:
|
||||
activity_text = "No collections found. Start by creating your first ingestion!"
|
||||
|
||||
activity_text = self._generate_activity_text()
|
||||
activity_feed.update(activity_text)
|
||||
except Exception:
|
||||
pass
|
||||
except NoMatches:
|
||||
return
|
||||
except Exception as exc:
|
||||
LOGGER.exception("Failed to update dashboard activity feed", exc_info=exc)
|
||||
|
||||
def _generate_activity_text(self) -> str:
|
||||
"""Generate activity feed text from collections."""
|
||||
if not self.collections:
|
||||
return "🚀 No collections found. Start by creating your first ingestion!\n💡 Press 'I' to begin or use the Quick Actions above."
|
||||
|
||||
recent_activity = [self._format_collection_item(col) for col in self.collections[:3]]
|
||||
activity_text = "\n".join(recent_activity)
|
||||
|
||||
if len(self.collections) > 3:
|
||||
total_docs = sum(c["count"] for c in self.collections)
|
||||
activity_text += (
|
||||
f"\n📊 Total: {len(self.collections)} collections with {total_docs:,} documents"
|
||||
)
|
||||
|
||||
return activity_text
|
||||
|
||||
    def _format_collection_item(self, col: CollectionInfo) -> str:
        """Format a single collection item for the activity feed."""
        content_type = self._get_content_type_icon(col["name"])
        size_mb = col["size_mb"]
        backend_info = col["backend"]

        # Check if this represents a multi-backend ingestion result
        if isinstance(backend_info, list):
            if len(backend_info) > 1:
                # Ensure all elements are strings for safe joining
                backend_strings = [str(b) for b in backend_info if b is not None]
                backend_list = " + ".join(backend_strings) if backend_strings else "unknown"
                return f"{content_type} {col['name']}: {col['count']:,} docs ({size_mb:.1f} MB) → {backend_list}"
            elif len(backend_info) == 1:
                backend_name = str(backend_info[0]) if backend_info[0] is not None else "unknown"
                return f"{content_type} {col['name']}: {col['count']:,} docs ({size_mb:.1f} MB) - {backend_name}"
            else:
                return f"{content_type} {col['name']}: {col['count']:,} docs ({size_mb:.1f} MB) - unknown"
        else:
            backend_display = str(backend_info) if backend_info is not None else "unknown"
            return f"{content_type} {col['name']}: {col['count']:,} docs ({size_mb:.1f} MB) - {backend_display}"

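As a quick illustration of the formatting above, a minimal standalone sketch with hypothetical collection data (the values below are invented for the example, not taken from a real backend):

# Illustrative only: a simplified stand-in for _format_collection_item.
from typing import TypedDict

class SampleCollection(TypedDict):
    name: str
    count: int
    size_mb: float
    backend: str | list[str]

def format_item(col: SampleCollection) -> str:
    backend = col["backend"]
    label = " + ".join(str(b) for b in backend) if isinstance(backend, list) else str(backend)
    return f"📚 {col['name']}: {col['count']:,} docs ({col['size_mb']:.1f} MB) → {label}"

print(format_item({"name": "web_docs", "count": 1234, "size_mb": 12.3,
                   "backend": ["🗄️ Weaviate", "🌐 OpenWebUI"]}))
# 📚 web_docs: 1,234 docs (12.3 MB) → 🗄️ Weaviate + 🌐 OpenWebUI
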
    def _get_content_type_icon(self, name: str) -> str:
        """Get appropriate icon for collection content type."""
        name_lower = name.lower()
        if "web" in name_lower:
            return "🌐"
        elif "doc" in name_lower:
            return "📖"
        elif "repo" in name_lower:
            return "📦"
        return "📄"

    @work(exclusive=True)
    async def refresh_collections(self) -> None:
        """Refresh collection data with enhanced multi-backend loading feedback."""
        self.is_loading = True
        loading_indicator = self.query_one("#loading")
        status_text = self.query_one("#status_text", Static)
@@ -241,47 +311,43 @@ class CollectionOverviewScreen(Screen[None]):
        status_text.update("🔄 Refreshing collections...")

        try:
            # Use storage manager for unified backend handling
            if not self.storage_manager.is_initialized:
                status_text.update("🔗 Initializing storage backends...")
                backend_results = await self.storage_manager.initialize_all_backends()

                # Report per-backend initialization status
                success_count = sum(backend_results.values())
                total_count = len(backend_results)
                status_text.update(f"✅ Initialized {success_count}/{total_count} backends")

            # Get collections from all backends via storage manager
            status_text.update("📚 Loading collections from all backends...")
            collections = await self.storage_manager.get_all_collections()

            # Update metrics calculation for multi-backend support
            self.active_backends = len(self.storage_manager.get_available_backends())

            self.collections = collections
            await self.update_collections_table()
            self.update_metrics()

            # Enhanced status reporting for multi-backend
            backend_names = ", ".join(
                backend.value for backend in self.storage_manager.get_available_backends()
            )
            status_text.update(f"✨ Ready - {len(collections)} collections from {backend_names}")

            # Update connection status with multi-backend awareness
            connection_status = self.query_one("#connection_status", StatusIndicator)
            if collections and self.active_backends > 0:
                connection_status.update_status(f"✓ {self.active_backends} Active")
            else:
                connection_status.update_status("No Data")

        except Exception as e:
            status_text.update(f"❌ Error: {e}")
            self.notify(f"Failed to refresh: {e}", severity="error", markup=False)
        finally:
            self.is_loading = False
            loading_indicator.display = False
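The refresh path above treats `StorageManager.get_all_collections()` as a fan-out over every initialized backend. A minimal sketch of that aggregation is shown below; it is an assumption based on how the screen uses the manager, not the project's actual implementation, and the internal names (`_backends`, `describe`) are invented for the example.

# Hedged sketch: concurrent collection listing across backends.
import asyncio
from typing import Any

class StorageManagerSketch:
    def __init__(self, backends: dict[str, Any]) -> None:
        self._backends = backends  # e.g. {"weaviate": ..., "open_webui": ..., "r2r": ...}

    async def get_all_collections(self) -> list[dict[str, Any]]:
        async def describe(backend: Any) -> list[dict[str, Any]]:
            try:
                return await backend.describe_collections()
            except Exception:
                return []  # one failing backend should not break the whole refresh

        results = await asyncio.gather(*(describe(b) for b in self._backends.values()))
        return [item for sublist in results for item in sublist]
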
@@ -292,79 +358,68 @@ class CollectionOverviewScreen(Screen[None]):
            return []

        try:
            overview = await self.weaviate.describe_collections()
            collections: list[CollectionInfo] = []

            for item in overview:
                count_val = int(item.get("count", 0))
                size_mb_val = float(item.get("size_mb", 0.0))
                collections.append(
                    CollectionInfo(
                        name=str(item.get("name", "Unknown")),
                        type="weaviate",
                        count=count_val,
                        backend="🗄️ Weaviate",
                        status="✓ Active",
                        last_updated=datetime.now().strftime("%Y-%m-%d %H:%M"),
                        size_mb=size_mb_val,
                    )
                )

            return collections
        except Exception as e:
            self.notify(f"Error listing Weaviate collections: {e}", severity="error", markup=False)
            return []

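The listing code above only relies on `describe_collections()` returning `name`, `count`, and `size_mb` per entry. A small sketch of that assumed contract, mirroring the defensive coercion used by the screen (the TypedDict name is illustrative):

# Assumed shape of one describe_collections() entry, based only on the keys read above.
from typing import TypedDict

class CollectionOverviewItem(TypedDict, total=False):
    name: str
    count: int
    size_mb: float

def to_row(item: CollectionOverviewItem) -> tuple[str, int, float]:
    # Missing or malformed fields fall back to safe defaults, like the screen does.
    return (
        str(item.get("name", "Unknown")),
        int(item.get("count", 0)),
        float(item.get("size_mb", 0.0)),
    )
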
    async def list_openwebui_collections(self) -> list[CollectionInfo]:
        """List OpenWebUI collections with enhanced metadata."""
        # Try to get OpenWebUI backend from storage manager if direct instance not available
        openwebui_backend = self.openwebui or self.storage_manager.get_backend(StorageBackend.OPEN_WEBUI)
        if not openwebui_backend:
            return []

        try:
            overview = await openwebui_backend.describe_collections()
            collections: list[CollectionInfo] = []

            for item in overview:
                count_val = int(item.get("count", 0))
                size_mb_val = float(item.get("size_mb", 0.0))
                collection_name = str(item.get("name", "Unknown"))
                collections.append(
                    CollectionInfo(
                        name=collection_name,
                        type="openwebui",
                        count=count_val,
                        backend="🌐 OpenWebUI",
                        status="✓ Active",
                        last_updated=datetime.now().strftime("%Y-%m-%d %H:%M"),
                        size_mb=size_mb_val,
                    )
                )

            return collections
        except Exception as e:
            self.notify(f"Error listing OpenWebUI collections: {e}", severity="error", markup=False)
            return []

    async def update_collections_table(self) -> None:
        """Update the collections table with enhanced formatting."""
        table = self.query_one("#collections_table", EnhancedDataTable)
        table.clear(columns=True)

        # Add enhanced columns with more metadata
        table.add_columns("Collection", "Backend", "Documents", "Size", "Type", "Status", "Updated")

        # Add rows with enhanced formatting
        for collection in self.collections:
@@ -376,23 +431,60 @@ class CollectionOverviewScreen(Screen[None]):
            # Format document count
            doc_count = f"{collection['count']:,}"

            # Determine content type based on collection name or other metadata
            content_type = "📄 Mixed"
            if "web" in collection["name"].lower():
                content_type = "🌐 Web"
            elif "doc" in collection["name"].lower():
                content_type = "📖 Docs"
            elif "repo" in collection["name"].lower():
                content_type = "📦 Code"

            table.add_row(
                collection["name"],
                collection["backend"],
                doc_count,
                size_str,
                content_type,
                collection["status"],
                collection["last_updated"],
            )

        if self.collections:
            table.move_cursor(row=0)

        self.get_selected_collection()

    def update_search_controls(self, collection: CollectionInfo | None) -> None:
        """Enable or disable search controls based on backend support."""
        try:
            search_button = self.query_one("#search_btn", Button)
            quick_search_button = self.query_one("#quick_search", Button)
        except Exception:
            return

        is_weaviate = bool(collection and collection.get("type") == "weaviate")
        search_button.disabled = not is_weaviate
        quick_search_button.disabled = not is_weaviate

    def get_selected_collection(self) -> CollectionInfo | None:
        """Get the currently selected collection."""
        table = self.query_one("#collections_table", EnhancedDataTable)
        try:
            row_index = table.cursor_coordinate.row
        except (AttributeError, IndexError):
            self.selected_collection = None
            self.update_search_controls(None)
            return None

        if 0 <= row_index < len(self.collections):
            collection = self.collections[row_index]
            self.selected_collection = collection
            self.update_search_controls(collection)
            return collection

        self.selected_collection = None
        self.update_search_controls(None)
        return None

# Action methods
|
||||
@@ -402,41 +494,81 @@ class CollectionOverviewScreen(Screen[None]):
|
||||
|
||||
    def action_ingest(self) -> None:
        """Show enhanced ingestion dialog."""
        if selected := self.get_selected_collection():
            from .ingestion import IngestionScreen

            self.app.push_screen(IngestionScreen(selected, self.storage_manager))
        else:
            self.notify("🔍 Please select a collection first", severity="warning")

    def action_manage(self) -> None:
        """Manage documents in selected collection."""
        if selected := self.get_selected_collection():
            # Get the appropriate storage backend for the collection
            storage_backend = self._get_storage_for_collection(selected)
            if storage_backend:
                from .documents import DocumentManagementScreen

                self.app.push_screen(DocumentManagementScreen(selected, storage_backend))
            else:
                self.notify(
                    "🚧 No storage backend available for this collection", severity="warning"
                )
        else:
            self.notify("🔍 Please select a collection first", severity="warning")

    def _get_storage_for_collection(self, collection: CollectionInfo) -> BaseStorage | None:
        """Get the appropriate storage backend for a collection."""
        collection_type = collection.get("type", "")

        # Map collection types to storage backends (try direct instances first)
        if collection_type == "weaviate" and self.weaviate:
            return self.weaviate
        elif collection_type == "openwebui" and self.openwebui:
            return self.openwebui
        elif collection_type == "r2r" and self.r2r:
            return self.r2r

        # Fall back to storage manager if direct instances not available
        if collection_type == "weaviate":
            return self.storage_manager.get_backend(StorageBackend.WEAVIATE)
        elif collection_type == "openwebui":
            return self.storage_manager.get_backend(StorageBackend.OPEN_WEBUI)
        elif collection_type == "r2r":
            return self.storage_manager.get_backend(StorageBackend.R2R)

        # Fall back to checking available backends by backend name
        backend_name = collection.get("backend", "")
        if isinstance(backend_name, str):
            if "weaviate" in backend_name.lower():
                return self.weaviate or self.storage_manager.get_backend(StorageBackend.WEAVIATE)
            elif "openwebui" in backend_name.lower():
                return self.openwebui or self.storage_manager.get_backend(StorageBackend.OPEN_WEBUI)
            elif "r2r" in backend_name.lower():
                return self.r2r or self.storage_manager.get_backend(StorageBackend.R2R)

        return None

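The same resolution could be expressed table-driven instead of as an if/elif chain; a purely illustrative alternative sketch is below. It reuses the module's existing names (StorageBackend, BaseStorage, CollectionInfo) and is not part of the commit.

# Hypothetical table-driven variant of the lookup above.
_TYPE_TO_BACKEND: dict[str, StorageBackend] = {
    "weaviate": StorageBackend.WEAVIATE,
    "openwebui": StorageBackend.OPEN_WEBUI,
    "r2r": StorageBackend.R2R,
}

def resolve_backend(screen: "CollectionOverviewScreen", collection: CollectionInfo) -> BaseStorage | None:
    backend = _TYPE_TO_BACKEND.get(str(collection.get("type", "")).lower())
    if backend is None:
        return None
    direct = {
        StorageBackend.WEAVIATE: screen.weaviate,
        StorageBackend.OPEN_WEBUI: screen.openwebui,
        StorageBackend.R2R: screen.r2r,
    }
    # Prefer a direct instance, otherwise fall back to the storage manager.
    return direct.get(backend) or screen.storage_manager.get_backend(backend)
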
def action_search(self) -> None:
|
||||
"""Search in selected collection."""
|
||||
selected = self.get_selected_collection()
|
||||
if selected:
|
||||
if selected := self.get_selected_collection():
|
||||
if selected["type"] != "weaviate":
|
||||
self.notify(
|
||||
"🔐 Search is currently available only for Weaviate collections",
|
||||
severity="warning",
|
||||
)
|
||||
return
|
||||
from .search import SearchScreen
|
||||
|
||||
self.app.push_screen(SearchScreen(selected, self.weaviate, self.openwebui))
|
||||
else:
|
||||
self.notify("🔍 Please select a collection first", severity="warning")
|
||||
|
||||
def action_delete(self) -> None:
|
||||
"""Delete selected collection."""
|
||||
selected = self.get_selected_collection()
|
||||
if selected:
|
||||
if selected := self.get_selected_collection():
|
||||
from .dialogs import ConfirmDeleteScreen
|
||||
|
||||
self.app.push_screen(ConfirmDeleteScreen(selected, self))
|
||||
else:
|
||||
self.notify("🔍 Please select a collection first", severity="warning")
|
||||
@@ -483,6 +615,7 @@ class CollectionOverviewScreen(Screen[None]):
|
||||
def action_help(self) -> None:
|
||||
"""Show help screen."""
|
||||
from .help import HelpScreen
|
||||
|
||||
help_md = """
|
||||
# 🚀 Modern Collection Management System
|
||||
|
||||
@@ -524,15 +657,22 @@ Enjoy the enhanced interface! 🎉
|
||||
event.button.add_class("pressed")
|
||||
self.call_later(self.remove_pressed_class, event.button)
|
||||
|
||||
if button_id == "refresh_btn" or button_id == "quick_refresh":
|
||||
if getattr(event.button, "disabled", False):
|
||||
self.notify(
|
||||
"🔐 Search is currently limited to Weaviate collections",
|
||||
severity="warning",
|
||||
)
|
||||
return
|
||||
|
||||
if button_id in ["refresh_btn", "quick_refresh"]:
|
||||
self.action_refresh()
|
||||
elif button_id == "ingest_btn" or button_id == "quick_ingest":
|
||||
elif button_id in ["ingest_btn", "quick_ingest"]:
|
||||
self.action_ingest()
|
||||
elif button_id == "manage_btn":
|
||||
self.action_manage()
|
||||
elif button_id == "delete_btn":
|
||||
self.action_delete()
|
||||
elif button_id == "search_btn" or button_id == "quick_search":
|
||||
elif button_id in ["search_btn", "quick_search"]:
|
||||
self.action_search()
|
||||
elif button_id == "quick_settings":
|
||||
self.notify("⚙️ Settings panel coming soon!", severity="information")
|
||||
|
||||
@@ -1,12 +1,13 @@
"""Dialog screens for confirmations and user interactions."""

from pathlib import Path
from typing import TYPE_CHECKING, ClassVar

from textual.app import ComposeResult
from textual.binding import Binding, BindingType
from textual.containers import Container, Horizontal
from textual.screen import ModalScreen, Screen
from textual.widgets import Button, Footer, Header, LoadingIndicator, RichLog, Static
from typing_extensions import override

from ..models import CollectionInfo
@@ -22,7 +23,7 @@ class ConfirmDeleteScreen(Screen[None]):
|
||||
collection: CollectionInfo
|
||||
parent_screen: "CollectionOverviewScreen"
|
||||
|
||||
BINDINGS = [
|
||||
BINDINGS: ClassVar[list[BindingType]] = [
|
||||
Binding("escape", "app.pop_screen", "Cancel"),
|
||||
Binding("y", "confirm_delete", "Yes"),
|
||||
Binding("n", "app.pop_screen", "No"),
|
||||
@@ -73,29 +74,68 @@ class ConfirmDeleteScreen(Screen[None]):
|
||||
try:
|
||||
if self.collection["type"] == "weaviate" and self.parent_screen.weaviate:
|
||||
# Delete Weaviate collection
|
||||
if self.parent_screen.weaviate and self.parent_screen.weaviate.client:
|
||||
if self.parent_screen.weaviate.client:
|
||||
self.parent_screen.weaviate.client.collections.delete(self.collection["name"])
|
||||
self.notify(
|
||||
f"Deleted Weaviate collection: {self.collection['name']}",
|
||||
severity="information",
|
||||
)
|
||||
elif self.collection["type"] == "openwebui" and self.parent_screen.openwebui:
|
||||
# Delete OpenWebUI knowledge base
|
||||
response = await self.parent_screen.openwebui.client.delete(
|
||||
f"/api/v1/knowledge/{self.collection['name']}"
|
||||
)
|
||||
response.raise_for_status()
|
||||
self.notify(
|
||||
f"Deleted OpenWebUI collection: {self.collection['name']}",
|
||||
severity="information",
|
||||
)
|
||||
else:
|
||||
# Use the dashboard's method to get the appropriate storage backend
|
||||
storage_backend = self.parent_screen._get_storage_for_collection(self.collection)
|
||||
if not storage_backend:
|
||||
self.notify(
|
||||
f"❌ No storage backend available for {self.collection['type']} collection: {self.collection['name']}",
|
||||
severity="error",
|
||||
)
|
||||
self.app.pop_screen()
|
||||
return
|
||||
|
||||
# Refresh parent screen
|
||||
self.parent_screen.refresh_collections() # Don't await, let it run as a worker
|
||||
# Check if the storage backend supports collection deletion
|
||||
if not hasattr(storage_backend, 'delete_collection'):
|
||||
self.notify(
|
||||
f"❌ Collection deletion not supported for {self.collection['type']} backend",
|
||||
severity="error",
|
||||
)
|
||||
self.app.pop_screen()
|
||||
return
|
||||
|
||||
# Delete the collection using the appropriate backend
|
||||
# Ensure we use the exact collection name, not any default from storage config
|
||||
collection_name = str(self.collection["name"])
|
||||
collection_type = str(self.collection["type"])
|
||||
|
||||
self.notify(f"Deleting {collection_type} collection: {collection_name}...", severity="information")
|
||||
|
||||
# Use the standard delete_collection method for all backends
|
||||
if hasattr(storage_backend, 'delete_collection'):
|
||||
success = await storage_backend.delete_collection(collection_name)
|
||||
else:
|
||||
self.notify("❌ Backend does not support collection deletion", severity="error")
|
||||
self.app.pop_screen()
|
||||
return
|
||||
if success:
|
||||
self.notify(
|
||||
f"✅ Successfully deleted {self.collection['type']} collection: {self.collection['name']}",
|
||||
severity="information",
|
||||
timeout=3.0,
|
||||
)
|
||||
else:
|
||||
self.notify(
|
||||
f"❌ Failed to delete {self.collection['type']} collection: {self.collection['name']}",
|
||||
severity="error",
|
||||
)
|
||||
# Don't refresh if deletion failed
|
||||
self.app.pop_screen()
|
||||
return
|
||||
|
||||
# Refresh parent screen after a short delay to ensure deletion is processed
|
||||
self.call_later(lambda _: self.parent_screen.refresh_collections(), 0.5) # 500ms delay
|
||||
self.app.pop_screen()
|
||||
|
||||
except Exception as e:
|
||||
self.notify(f"Failed to delete collection: {e}", severity="error")
|
||||
self.notify(f"Failed to delete collection: {e}", severity="error", markup=False)
|
||||
|
||||
|
||||
|
||||
class ConfirmDocumentDeleteScreen(Screen[None]):
|
||||
@@ -105,7 +145,7 @@ class ConfirmDocumentDeleteScreen(Screen[None]):
|
||||
collection: CollectionInfo
|
||||
parent_screen: "DocumentManagementScreen"
|
||||
|
||||
BINDINGS = [
|
||||
BINDINGS: ClassVar[list[BindingType]] = [
|
||||
Binding("escape", "app.pop_screen", "Cancel"),
|
||||
Binding("y", "confirm_delete", "Yes"),
|
||||
Binding("n", "app.pop_screen", "No"),
|
||||
@@ -167,10 +207,13 @@ class ConfirmDocumentDeleteScreen(Screen[None]):
|
||||
try:
|
||||
if self.parent_screen.weaviate:
|
||||
# Delete documents
|
||||
results = await self.parent_screen.weaviate.delete_documents(self.doc_ids)
|
||||
results = await self.parent_screen.weaviate.delete_documents(
|
||||
self.doc_ids,
|
||||
collection_name=self.collection["name"],
|
||||
)
|
||||
|
||||
# Count successful deletions
|
||||
successful = sum(1 for success in results.values() if success)
|
||||
successful = sum(bool(success) for success in results.values())
|
||||
failed = len(results) - successful
|
||||
|
||||
if successful > 0:
|
||||
@@ -184,6 +227,91 @@ class ConfirmDocumentDeleteScreen(Screen[None]):
|
||||
self.app.pop_screen()
|
||||
|
||||
except Exception as e:
|
||||
self.notify(f"Failed to delete documents: {e}", severity="error")
|
||||
self.notify(f"Failed to delete documents: {e}", severity="error", markup=False)
|
||||
finally:
|
||||
loading.display = False
|
||||
|
||||
|
||||
class LogViewerScreen(ModalScreen[None]):
|
||||
"""Display live log output without disrupting the TUI."""
|
||||
|
||||
_log_widget: RichLog | None
|
||||
_log_file: Path | None
|
||||
|
||||
BINDINGS: ClassVar[list[BindingType]] = [
|
||||
Binding("escape", "close", "Close"),
|
||||
Binding("ctrl+l", "close", "Close"),
|
||||
Binding("s", "show_path", "Log File"),
|
||||
]
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._log_widget = None
|
||||
self._log_file = None
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
yield Header(show_clock=True)
|
||||
yield Container(
|
||||
Static("📜 Live Application Logs", classes="title"),
|
||||
Static("Logs update in real time. Press S to reveal the log file path.", classes="subtitle"),
|
||||
RichLog(id="log_stream", classes="log-stream", wrap=True, highlight=False),
|
||||
Static("", id="log_file_path", classes="subtitle"),
|
||||
classes="main_container log-viewer-container",
|
||||
)
|
||||
yield Footer()
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Attach this viewer to the parent application once mounted."""
|
||||
self._log_widget = self.query_one(RichLog)
|
||||
from ..app import CollectionManagementApp
|
||||
|
||||
if isinstance(self.app, CollectionManagementApp):
|
||||
self.app.attach_log_viewer(self)
|
||||
|
||||
def on_unmount(self) -> None:
|
||||
"""Detach from the parent application when closed."""
|
||||
from ..app import CollectionManagementApp
|
||||
|
||||
if isinstance(self.app, CollectionManagementApp):
|
||||
self.app.detach_log_viewer(self)
|
||||
|
||||
def _get_log_widget(self) -> RichLog:
|
||||
if self._log_widget is None:
|
||||
self._log_widget = self.query_one(RichLog)
|
||||
return self._log_widget
|
||||
|
||||
def replace_logs(self, lines: list[str]) -> None:
|
||||
"""Replace rendered logs with the provided history."""
|
||||
log_widget = self._get_log_widget()
|
||||
log_widget.clear()
|
||||
for line in lines:
|
||||
log_widget.write(line)
|
||||
log_widget.scroll_end(animate=False)
|
||||
|
||||
def append_logs(self, lines: list[str]) -> None:
|
||||
"""Append new log lines to the viewer."""
|
||||
log_widget = self._get_log_widget()
|
||||
for line in lines:
|
||||
log_widget.write(line)
|
||||
log_widget.scroll_end(animate=False)
|
||||
|
||||
def update_log_file(self, log_file: Path | None) -> None:
|
||||
"""Update the displayed log file path."""
|
||||
self._log_file = log_file
|
||||
label = self.query_one("#log_file_path", Static)
|
||||
if log_file is None:
|
||||
label.update("Logs are not currently being persisted to disk.")
|
||||
else:
|
||||
label.update(f"Log file: {log_file}")
|
||||
|
||||
def action_close(self) -> None:
|
||||
"""Close the log viewer."""
|
||||
self.app.pop_screen()
|
||||
|
||||
def action_show_path(self) -> None:
|
||||
"""Reveal the log file location in a notification."""
|
||||
if self._log_file is None:
|
||||
self.notify("File logging is disabled for this session.", severity="warning")
|
||||
else:
|
||||
self.notify(f"Log file available at: {self._log_file}", severity="information", markup=False)
|
||||
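LogViewerScreen only calls `attach_log_viewer`/`detach_log_viewer` on the app; how the app feeds it is not shown here. A hedged sketch of one possible app-side bridge (the handler wiring and all names besides the two attach methods are assumptions for illustration):

# Minimal sketch of an app-side log bridge; illustrative only.
import logging

class LogBridgeSketch:
    def __init__(self) -> None:
        self._viewer = None             # LogViewerScreen | None
        self._history: list[str] = []   # lines replayed when a viewer attaches

    def attach_log_viewer(self, viewer) -> None:
        self._viewer = viewer
        viewer.replace_logs(self._history)

    def detach_log_viewer(self, viewer) -> None:
        if self._viewer is viewer:
            self._viewer = None

    def emit_line(self, record: logging.LogRecord) -> None:
        line = f"{record.levelname}: {record.getMessage()}"
        self._history.append(line)
        if self._viewer is not None:
            self._viewer.append_logs([line])
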
|
||||
@@ -1,5 +1,7 @@
|
||||
"""Document management screen with enhanced navigation."""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from textual.app import ComposeResult
|
||||
from textual.binding import Binding
|
||||
from textual.containers import Container, Horizontal
|
||||
@@ -7,6 +9,7 @@ from textual.screen import Screen
|
||||
from textual.widgets import Button, Footer, Header, Label, LoadingIndicator, Static
|
||||
from typing_extensions import override
|
||||
|
||||
from ....storage.base import BaseStorage
|
||||
from ....storage.weaviate import WeaviateStorage
|
||||
from ..models import CollectionInfo, DocumentInfo
|
||||
from ..widgets import EnhancedDataTable
|
||||
@@ -16,7 +19,7 @@ class DocumentManagementScreen(Screen[None]):
|
||||
"""Screen for managing documents within a collection with enhanced keyboard navigation."""
|
||||
|
||||
collection: CollectionInfo
|
||||
weaviate: WeaviateStorage | None
|
||||
storage: BaseStorage | None
|
||||
documents: list[DocumentInfo]
|
||||
selected_docs: set[str]
|
||||
current_offset: int
|
||||
@@ -38,10 +41,10 @@ class DocumentManagementScreen(Screen[None]):
|
||||
Binding("end", "last_page", "Last Page"),
|
||||
]
|
||||
|
||||
def __init__(self, collection: CollectionInfo, weaviate: WeaviateStorage | None):
|
||||
def __init__(self, collection: CollectionInfo, storage: BaseStorage | None):
|
||||
super().__init__()
|
||||
self.collection = collection
|
||||
self.weaviate = weaviate
|
||||
self.storage = storage
|
||||
self.documents: list[DocumentInfo] = []
|
||||
self.selected_docs: set[str] = set()
|
||||
self.current_offset = 0
|
||||
@@ -54,7 +57,7 @@ class DocumentManagementScreen(Screen[None]):
|
||||
Static(f"📄 Document Management: {self.collection['name']}", classes="title"),
|
||||
Static(
|
||||
f"Total Documents: {self.collection['count']:,} | Use Space to select, Delete to remove",
|
||||
classes="subtitle"
|
||||
classes="subtitle",
|
||||
),
|
||||
Label(f"Page size: {self.page_size} documents"),
|
||||
EnhancedDataTable(id="documents_table", classes="enhanced-table"),
|
||||
@@ -78,9 +81,11 @@ class DocumentManagementScreen(Screen[None]):
|
||||
"""Initialize the screen."""
|
||||
self.query_one("#loading").display = False
|
||||
|
||||
# Setup documents table
|
||||
# Setup documents table with enhanced columns
|
||||
table = self.query_one("#documents_table", EnhancedDataTable)
|
||||
table.add_columns("✓", "Title", "Source URL", "Words", "ID")
|
||||
table.add_columns(
|
||||
"✓", "Title", "Source URL", "Description", "Type", "Words", "Timestamp", "ID"
|
||||
)
|
||||
|
||||
# Set up message handling for table events
|
||||
table.can_focus = True
|
||||
@@ -93,55 +98,96 @@ class DocumentManagementScreen(Screen[None]):
|
||||
loading.display = True
|
||||
|
||||
try:
|
||||
if self.weaviate:
|
||||
# Set the collection name
|
||||
self.weaviate.config.collection_name = self.collection["name"]
|
||||
|
||||
# Load documents with pagination
|
||||
raw_docs = await self.weaviate.list_documents(
|
||||
limit=self.page_size, offset=self.current_offset
|
||||
)
|
||||
# Cast to proper type with type checking
|
||||
self.documents = [
|
||||
DocumentInfo(
|
||||
id=str(doc["id"]),
|
||||
title=str(doc["title"]),
|
||||
source_url=str(doc["source_url"]),
|
||||
content_preview=str(doc["content_preview"]),
|
||||
word_count=int(doc["word_count"])
|
||||
if isinstance(doc["word_count"], (int, str))
|
||||
and str(doc["word_count"]).isdigit()
|
||||
else 0,
|
||||
timestamp=str(doc["timestamp"]),
|
||||
if self.storage:
|
||||
# Try to load documents using the storage backend
|
||||
try:
|
||||
raw_docs = await self.storage.list_documents(
|
||||
limit=self.page_size,
|
||||
offset=self.current_offset,
|
||||
collection_name=self.collection["name"],
|
||||
)
|
||||
for doc in raw_docs
|
||||
]
|
||||
# Cast to proper type with type checking
|
||||
self.documents = [
|
||||
DocumentInfo(
|
||||
id=str(doc.get("id", f"doc_{i}")),
|
||||
title=str(doc.get("title", "Untitled Document")),
|
||||
source_url=str(doc.get("source_url", "")),
|
||||
description=str(doc.get("description", "")),
|
||||
content_type=str(doc.get("content_type", "text/plain")),
|
||||
content_preview=str(doc.get("content_preview", "")),
|
||||
word_count=int(doc.get("word_count", 0))
|
||||
if str(doc.get("word_count", 0)).isdigit()
|
||||
else 0,
|
||||
timestamp=str(doc.get("timestamp", "")),
|
||||
)
|
||||
for i, doc in enumerate(raw_docs)
|
||||
]
|
||||
except NotImplementedError:
|
||||
# For storage backends that don't support document listing, show a message
|
||||
self.notify(
|
||||
f"Document listing not supported for {self.storage.__class__.__name__}",
|
||||
severity="information"
|
||||
)
|
||||
self.documents = []
|
||||
|
||||
await self.update_table()
|
||||
self.update_selection_status()
|
||||
self.update_page_info()
|
||||
|
||||
except Exception as e:
|
||||
self.notify(f"Error loading documents: {e}", severity="error")
|
||||
self.notify(f"Error loading documents: {e}", severity="error", markup=False)
|
||||
finally:
|
||||
loading.display = False
|
||||
|
||||
    async def update_table(self) -> None:
        """Update the documents table with enhanced metadata display."""
        table = self.query_one("#documents_table", EnhancedDataTable)
        table.clear(columns=True)

        # Add enhanced columns with more metadata
        table.add_columns(
            "✓", "Title", "Source URL", "Description", "Type", "Words", "Timestamp", "ID"
        )

        # Add rows with enhanced metadata
for doc in self.documents:
|
||||
selected = "✓" if doc["id"] in self.selected_docs else ""
|
||||
|
||||
# Get additional metadata from the raw docs
|
||||
description = str(doc.get("description") or "").strip()[:40]
|
||||
if not description:
|
||||
description = "[dim]No description[/dim]"
|
||||
elif len(str(doc.get("description") or "")) > 40:
|
||||
description += "..."
|
||||
|
||||
# Format content type with appropriate icon
|
||||
content_type = doc.get("content_type", "text/plain")
|
||||
if "markdown" in content_type.lower():
|
||||
type_display = "📝 md"
|
||||
elif "html" in content_type.lower():
|
||||
type_display = "🌐 html"
|
||||
elif "text" in content_type.lower():
|
||||
type_display = "📄 txt"
|
||||
else:
|
||||
type_display = f"📄 {content_type.split('/')[-1][:5]}"
|
||||
|
||||
# Format timestamp to be more readable
|
||||
timestamp = doc.get("timestamp", "")
|
||||
if timestamp:
|
||||
try:
|
||||
# Parse ISO format timestamp
|
||||
dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
|
||||
timestamp = dt.strftime("%m/%d %H:%M")
|
||||
except Exception:
|
||||
timestamp = str(timestamp)[:16] # Fallback
|
||||
table.add_row(
|
||||
selected,
|
||||
doc.get("title", "Untitled")[:50],
|
||||
doc.get("source_url", "")[:50],
|
||||
doc.get("title", "Untitled")[:40],
|
||||
doc.get("source_url", "")[:35],
|
||||
description,
|
||||
type_display,
|
||||
str(doc.get("word_count", 0)),
|
||||
timestamp,
|
||||
doc["id"][:8] + "...", # Show truncated ID
|
||||
)
|
||||
|
||||
@@ -181,8 +227,7 @@ class DocumentManagementScreen(Screen[None]):
|
||||
|
||||
def action_toggle_selection(self) -> None:
|
||||
"""Toggle selection of current row."""
|
||||
doc = self.get_current_document()
|
||||
if doc:
|
||||
if doc := self.get_current_document():
|
||||
doc_id = doc["id"]
|
||||
if doc_id in self.selected_docs:
|
||||
self.selected_docs.remove(doc_id)
|
||||
@@ -209,6 +254,7 @@ class DocumentManagementScreen(Screen[None]):
|
||||
"""Delete selected documents."""
|
||||
if self.selected_docs:
|
||||
from .dialogs import ConfirmDocumentDeleteScreen
|
||||
|
||||
self.app.push_screen(
|
||||
ConfirmDocumentDeleteScreen(list(self.selected_docs), self.collection, self)
|
||||
)
|
||||
@@ -274,6 +320,8 @@ class DocumentManagementScreen(Screen[None]):
|
||||
"""Handle select all from enhanced table."""
|
||||
self.action_select_all()
|
||||
|
||||
def on_enhanced_data_table_clear_selection(self, event: EnhancedDataTable.ClearSelection) -> None:
|
||||
def on_enhanced_data_table_clear_selection(
|
||||
self, event: EnhancedDataTable.ClearSelection
|
||||
) -> None:
|
||||
"""Handle clear selection from enhanced table."""
|
||||
self.action_select_none()
|
||||
|
||||
@@ -1,27 +1,49 @@
"""Enhanced ingestion screen with multi-storage support."""

from __future__ import annotations

from typing import TYPE_CHECKING

from textual import work
from textual.app import ComposeResult
from textual.binding import Binding
from textual.containers import Container, Horizontal
from textual.screen import ModalScreen
from textual.widgets import Button, Checkbox, Input, Label, LoadingIndicator, Rule, Static
from typing_extensions import override

from ....core.models import IngestionResult, IngestionSource, StorageBackend
from ....flows.ingestion import create_ingestion_flow
from ..models import CollectionInfo
from ..utils.storage_manager import StorageManager
from ..widgets import EnhancedProgressBar

if TYPE_CHECKING:
    from ..app import CollectionManagementApp


BACKEND_ORDER: tuple[StorageBackend, ...] = (
    StorageBackend.WEAVIATE,
    StorageBackend.OPEN_WEBUI,
    StorageBackend.R2R,
)

BACKEND_LABELS: dict[StorageBackend, str] = {
    StorageBackend.WEAVIATE: "🗄️ Weaviate",
    StorageBackend.OPEN_WEBUI: "🌐 OpenWebUI",
    StorageBackend.R2R: "🧠 R2R",
}

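These two constants give the screen a stable ordering and display labels for its backend checkboxes; the widget ids follow the `backend_<value>` pattern used later in this file. A small sketch of how they combine (the helper name is illustrative):

# Illustrative only: ordered labels -> checkbox ids, independent of Textual.
def checkbox_specs(available: set[StorageBackend]) -> list[tuple[str, str]]:
    # Returns (widget_id, label) pairs in the canonical BACKEND_ORDER.
    return [
        (f"backend_{backend.value}", BACKEND_LABELS.get(backend, backend.value))
        for backend in BACKEND_ORDER
        if backend in available
    ]
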
class IngestionScreen(ModalScreen[None]):
|
||||
"""Enhanced ingestion screen with better UX and keyboard navigation."""
|
||||
"""Modern ingestion screen with multi-backend fan-out."""
|
||||
|
||||
collection: CollectionInfo
|
||||
storage_manager: StorageManager
|
||||
selected_type: IngestionSource
|
||||
progress_value: int
|
||||
available_backends: list[StorageBackend]
|
||||
selected_backends: list[StorageBackend]
|
||||
|
||||
BINDINGS = [
|
||||
Binding("escape", "app.pop_screen", "Cancel"),
|
||||
@@ -34,23 +56,38 @@ class IngestionScreen(ModalScreen[None]):
|
||||
Binding("shift+tab", "focus_previous", "Previous Field"),
|
||||
]
|
||||
|
||||
def __init__(self, collection: CollectionInfo):
|
||||
def __init__(self, collection: CollectionInfo, storage_manager: StorageManager) -> None:
|
||||
super().__init__()
|
||||
self.collection = collection
|
||||
self.storage_manager = storage_manager
|
||||
self.selected_type = IngestionSource.WEB
|
||||
self.progress_value = 0
|
||||
self.available_backends = list(storage_manager.get_available_backends())
|
||||
if not self.available_backends:
|
||||
raise ValueError("No storage backends are available for ingestion")
|
||||
self.selected_backends = self._derive_initial_backends()
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
target_name = self.collection["name"]
|
||||
backend_info = self.collection["backend"]
|
||||
|
||||
# Format backend label for display
|
||||
if isinstance(backend_info, list):
|
||||
# Ensure all elements are strings for safe joining
|
||||
backend_strings = [str(b) for b in backend_info if b is not None]
|
||||
target_backend_label = " + ".join(backend_strings) if backend_strings else "unknown"
|
||||
else:
|
||||
target_backend_label = str(backend_info) if backend_info is not None else "unknown"
|
||||
|
||||
with Container(classes="modal-container"):
|
||||
yield Static("📥 Modern Ingestion Interface", classes="title")
|
||||
yield Static(
|
||||
f"Target: {self.collection['name']} ({self.collection['backend']})",
|
||||
f"Target: {target_name} ({target_backend_label})",
|
||||
classes="subtitle",
|
||||
)
|
||||
yield Rule()
|
||||
|
||||
# Enhanced input section
|
||||
yield Container(
|
||||
Label("🌐 Source URL:", classes="input-label"),
|
||||
Input(
|
||||
@@ -58,22 +95,35 @@ class IngestionScreen(ModalScreen[None]):
|
||||
id="url_input",
|
||||
classes="modern-input",
|
||||
),
|
||||
Label("📝 Collection Name:", classes="input-label"),
|
||||
Input(
|
||||
placeholder="Enter collection name (or leave empty to auto-generate)",
|
||||
id="collection_input",
|
||||
classes="modern-input",
|
||||
value=self.collection.get("name", ""),
|
||||
),
|
||||
Label("📋 Source Type (Press 1/2/3):", classes="input-label"),
|
||||
Horizontal(
|
||||
Button("🌐 Web (1)", id="web_btn", variant="primary", classes="type-button"),
|
||||
Button(
|
||||
"📦 Repository (2)", id="repo_btn", variant="default", classes="type-button"
|
||||
),
|
||||
Button(
|
||||
"📖 Documentation (3)", id="docs_btn", variant="default", classes="type-button"
|
||||
),
|
||||
Button("📦 Repository (2)", id="repo_btn", variant="default", classes="type-button"),
|
||||
Button("📖 Documentation (3)", id="docs_btn", variant="default", classes="type-button"),
|
||||
classes="type_buttons",
|
||||
),
|
||||
Rule(line_style="dashed"),
|
||||
Label(f"🗄️ Target Storages ({len(self.available_backends)} available):", classes="input-label", id="backend_label"),
|
||||
Container(
|
||||
*self._create_backend_checkbox_widgets(),
|
||||
classes="backend-selection",
|
||||
),
|
||||
Container(
|
||||
Button("Select All Storages", id="select_all_backends", variant="default"),
|
||||
Button("Clear Selection", id="clear_backends", variant="default"),
|
||||
classes="backend-actions",
|
||||
),
|
||||
Static("📋 Selected: None", id="selection_status", classes="selection-status"),
|
||||
classes="input-section card",
|
||||
)
|
||||
|
||||
# Enhanced Progress section
|
||||
yield Container(
|
||||
Label("🔄 Progress:", classes="progress-label"),
|
||||
EnhancedProgressBar(id="enhanced_progress", total=100),
|
||||
@@ -81,7 +131,6 @@ class IngestionScreen(ModalScreen[None]):
|
||||
classes="progress-section card",
|
||||
)
|
||||
|
||||
# Action buttons
|
||||
yield Horizontal(
|
||||
Button("🚀 Start Ingestion", id="start_btn", variant="success"),
|
||||
Button("❌ Cancel", id="cancel_btn", variant="error"),
|
||||
@@ -90,164 +139,326 @@ class IngestionScreen(ModalScreen[None]):
|
||||
|
||||
yield LoadingIndicator(id="loading", classes="pulse")
|
||||
|
||||
|
||||
def _create_backend_checkbox_widgets(self) -> list[Checkbox]:
|
||||
"""Create checkbox widgets for each available backend."""
|
||||
checkboxes: list[Checkbox] = [
|
||||
Checkbox(
|
||||
BACKEND_LABELS.get(backend, backend.value),
|
||||
value=backend in self.selected_backends,
|
||||
id=f"backend_{backend.value}",
|
||||
)
|
||||
for backend in BACKEND_ORDER
|
||||
if backend in self.available_backends
|
||||
]
|
||||
return checkboxes
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Initialize the screen."""
|
||||
"""Initialize the screen state once widgets exist."""
|
||||
self.query_one("#loading").display = False
|
||||
self.selected_type = IngestionSource.WEB
|
||||
# Focus the URL input field by default
|
||||
self.query_one("#url_input").focus()
|
||||
self.query_one("#url_input", Input).focus()
|
||||
self._set_backend_selection(self.selected_backends)
|
||||
self._update_selection_status()
|
||||
|
||||
def action_select_web(self) -> None:
|
||||
"""Select web ingestion type."""
|
||||
self.selected_type = IngestionSource.WEB
|
||||
self.update_type_buttons("web")
|
||||
self._update_type_buttons("web")
|
||||
|
||||
def action_select_repo(self) -> None:
|
||||
"""Select repository ingestion type."""
|
||||
self.selected_type = IngestionSource.REPOSITORY
|
||||
self.update_type_buttons("repo")
|
||||
self._update_type_buttons("repo")
|
||||
|
||||
def action_select_docs(self) -> None:
|
||||
"""Select documentation ingestion type."""
|
||||
self.selected_type = IngestionSource.DOCUMENTATION
|
||||
self.update_type_buttons("docs")
|
||||
self._update_type_buttons("docs")
|
||||
|
||||
def _update_type_buttons(self, selected: str) -> None:
|
||||
buttons = {
|
||||
"web": self.query_one("#web_btn", Button),
|
||||
"repo": self.query_one("#repo_btn", Button),
|
||||
"docs": self.query_one("#docs_btn", Button),
|
||||
}
|
||||
for kind, button in buttons.items():
|
||||
button.variant = "primary" if kind == selected else "default"
|
||||
|
||||
def on_button_pressed(self, event: Button.Pressed) -> None:
|
||||
"""Handle button presses with enhanced feedback."""
|
||||
button_id = event.button.id
|
||||
|
||||
if button_id == "web_btn":
|
||||
self.action_select_web()
|
||||
elif button_id == "repo_btn":
|
||||
self.action_select_repo()
|
||||
elif button_id == "docs_btn":
|
||||
self.action_select_docs()
|
||||
elif button_id == "select_all_backends":
|
||||
self._set_backend_selection(self.available_backends)
|
||||
self._update_selection_status()
|
||||
elif button_id == "clear_backends":
|
||||
self._set_backend_selection([])
|
||||
self._update_selection_status()
|
||||
elif button_id == "start_btn":
|
||||
self.action_start_ingestion()
|
||||
elif button_id == "cancel_btn":
|
||||
self.app.pop_screen()
|
||||
|
||||
def update_type_buttons(self, selected: str) -> None:
|
||||
"""Update type button visual states."""
|
||||
buttons = {
|
||||
"web": self.query_one("#web_btn", Button),
|
||||
"repo": self.query_one("#repo_btn", Button),
|
||||
"docs": self.query_one("#docs_btn", Button),
|
||||
}
|
||||
|
||||
for btn_type, button in buttons.items():
|
||||
if btn_type == selected:
|
||||
button.variant = "primary"
|
||||
else:
|
||||
button.variant = "default"
|
||||
def on_checkbox_changed(self, event: Checkbox.Changed) -> None:
|
||||
"""Handle checkbox state changes for backend selection."""
|
||||
if event.checkbox.id and event.checkbox.id.startswith("backend_"):
|
||||
# Update the selected backends list based on current checkbox states
|
||||
self.selected_backends = self._resolve_selected_backends()
|
||||
self._update_selection_status()
|
||||
|
||||
def on_input_submitted(self, event: Input.Submitted) -> None:
|
||||
"""Handle URL input submission."""
|
||||
if event.input.id == "url_input":
|
||||
if event.input.id in ("url_input", "collection_input"):
|
||||
self.action_start_ingestion()
|
||||
|
||||
def action_start_ingestion(self) -> None:
|
||||
"""Start the enhanced ingestion process."""
|
||||
url_input = self.query_one("#url_input", Input)
|
||||
if not url_input.value.strip():
|
||||
collection_input = self.query_one("#collection_input", Input)
|
||||
|
||||
source_url = url_input.value.strip()
|
||||
collection_name = collection_input.value.strip()
|
||||
|
||||
if not source_url:
|
||||
self.notify("🔍 Please enter a source URL", severity="error")
|
||||
url_input.focus()
|
||||
return
|
||||
|
||||
self.perform_ingestion(url_input.value.strip())
|
||||
# Validate URL format
|
||||
if not self._validate_url(source_url):
|
||||
self.notify("❌ Invalid URL format. Please enter a valid HTTP/HTTPS URL or file:// path", severity="error")
|
||||
url_input.focus()
|
||||
return
|
||||
|
||||
@work(exclusive=True)
|
||||
async def perform_ingestion(self, source_url: str) -> None:
|
||||
"""Perform ingestion with enhanced progress tracking and better UX."""
|
||||
loading = self.query_one("#loading")
|
||||
enhanced_progress = self.query_one("#enhanced_progress", EnhancedProgressBar)
|
||||
progress_text = self.query_one("#progress_text", Static)
|
||||
resolved_backends = self._resolve_selected_backends()
|
||||
if not resolved_backends:
|
||||
self.notify("⚠️ Select at least one storage backend", severity="warning")
|
||||
return
|
||||
|
||||
self.selected_backends = resolved_backends
|
||||
self.perform_ingestion(source_url, collection_name)
|
||||
|
||||
    def _validate_url(self, url: str) -> bool:
        """Validate URL format for security."""
        if not url:
            return False

        # Basic URL validation
        url_lower = url.lower()

        # Allow HTTP/HTTPS URLs
        if url_lower.startswith(('http://', 'https://')):
            # Additional validation could be added here
            return True

        # Allow file:// URLs for repository paths
        if url_lower.startswith('file://'):
            return True

        # Allow local file paths that look like repositories
        return '/' in url and not url_lower.startswith(
            ('javascript:', 'data:', 'vbscript:')
        )

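A slightly stricter variant using only the standard library is sketched below as an optional hardening idea; it is not part of this change and the function name is invented for the example.

# Optional stricter validation sketch using urllib.parse (not used by the screen).
from urllib.parse import urlparse

def looks_like_valid_source(url: str) -> bool:
    parsed = urlparse(url)
    if parsed.scheme in ("http", "https"):
        return bool(parsed.netloc)              # require a host for web URLs
    if parsed.scheme == "file":
        return bool(parsed.path)                # require a path for file URLs
    return parsed.scheme == "" and "/" in url   # bare local repository paths
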
def _resolve_selected_backends(self) -> list[StorageBackend]:
|
||||
selected: list[StorageBackend] = []
|
||||
for backend in BACKEND_ORDER:
|
||||
if backend not in self.available_backends:
|
||||
continue
|
||||
checkbox_id = f"#backend_{backend.value}"
|
||||
checkbox = self.query_one(checkbox_id, Checkbox)
|
||||
if checkbox.value:
|
||||
selected.append(backend)
|
||||
return selected
|
||||
|
||||
def _set_backend_selection(self, backends: list[StorageBackend]) -> None:
|
||||
normalized = [backend for backend in BACKEND_ORDER if backend in backends]
|
||||
for backend in BACKEND_ORDER:
|
||||
if backend not in self.available_backends:
|
||||
continue
|
||||
checkbox_id = f"#backend_{backend.value}"
|
||||
checkbox = self.query_one(checkbox_id, Checkbox)
|
||||
checkbox.value = backend in normalized
|
||||
self.selected_backends = normalized
|
||||
|
||||
def _update_selection_status(self) -> None:
|
||||
"""Update the visual indicator showing current storage selection."""
|
||||
try:
|
||||
status_widget = self.query_one("#selection_status", Static)
|
||||
|
||||
if not self.selected_backends:
|
||||
status_widget.update("📋 Selected: None")
|
||||
elif len(self.selected_backends) == 1:
|
||||
backend_name = BACKEND_LABELS.get(self.selected_backends[0], self.selected_backends[0].value)
|
||||
status_widget.update(f"📋 Selected: {backend_name}")
|
||||
else:
|
||||
# Multiple backends selected
|
||||
backend_names = [
|
||||
BACKEND_LABELS.get(backend, backend.value)
|
||||
for backend in self.selected_backends
|
||||
]
|
||||
if len(backend_names) <= 3:
|
||||
# Show all names if 3 or fewer
|
||||
names_str = ", ".join(backend_names)
|
||||
status_widget.update(f"📋 Selected: {names_str}")
|
||||
else:
|
||||
# Show count if more than 3
|
||||
status_widget.update(f"📋 Selected: {len(self.selected_backends)} backends")
|
||||
except Exception:
|
||||
# Widget might not exist yet during initialization
|
||||
pass
|
||||
|
||||
def _derive_initial_backends(self) -> list[StorageBackend]:
|
||||
backend_info = self.collection.get("backend", "")
|
||||
|
||||
# Handle both single backend (str) and multi-backend (list[str])
|
||||
if isinstance(backend_info, list):
|
||||
# Multi-backend: try to match all backends
|
||||
matched_backends = []
|
||||
for backend_name in backend_info:
|
||||
backend_name_lower = backend_name.lower()
|
||||
for backend in BACKEND_ORDER:
|
||||
if backend not in self.available_backends:
|
||||
continue
|
||||
if backend.value.lower() == backend_name_lower or backend.name.lower() == backend_name_lower:
|
||||
matched_backends.append(backend)
|
||||
break
|
||||
return matched_backends or [self.available_backends[0]]
|
||||
else:
|
||||
# Single backend: original logic
|
||||
backend_label = str(backend_info).lower()
|
||||
for backend in BACKEND_ORDER:
|
||||
if backend not in self.available_backends:
|
||||
continue
|
||||
if backend.value in backend_label or backend.name.lower() in backend_label:
|
||||
return [backend]
|
||||
return [self.available_backends[0]]
|
||||
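For clarity, the matching above accepts either the enum value or the enum name, case-insensitively and by substring for the single-backend case; e.g. (hypothetical label, assuming StorageBackend.WEAVIATE.value == "weaviate"):

# Illustrative matching only.
label = "🗄️ Weaviate".lower()
matches = [b for b in BACKEND_ORDER if b.value in label or b.name.lower() in label]
# -> [StorageBackend.WEAVIATE]
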
|
||||
@work(exclusive=True, thread=True)
|
||||
def perform_ingestion(self, source_url: str, collection_name: str = "") -> None:
|
||||
import asyncio
|
||||
from typing import cast
|
||||
|
||||
backends = self._resolve_selected_backends()
|
||||
self.selected_backends = backends
|
||||
|
||||
def update_ui(action: str) -> None:
|
||||
def _update() -> None:
|
||||
try:
|
||||
loading = self.query_one("#loading")
|
||||
if action == "show_loading":
|
||||
loading.display = True
|
||||
elif action == "hide_loading":
|
||||
loading.display = False
|
||||
except Exception:
|
||||
pass
|
||||
cast("CollectionManagementApp", self.app).call_from_thread(_update)
|
||||
|
||||
def progress_reporter(percent: int, message: str) -> None:
|
||||
def _update_progress() -> None:
|
||||
try:
|
||||
progress = self.query_one("#enhanced_progress", EnhancedProgressBar)
|
||||
progress_text = self.query_one("#progress_text", Static)
|
||||
progress.update_progress(percent, message)
|
||||
progress_text.update(message)
|
||||
except Exception:
|
||||
pass
|
||||
cast("CollectionManagementApp", self.app).call_from_thread(_update_progress)
|
||||
|
||||
try:
|
||||
loading.display = True
|
||||
update_ui("show_loading")
|
||||
progress_reporter(5, "🚀 Starting Prefect flows...")
|
||||
|
||||
# Enhanced progress tracking with better visual feedback
|
||||
enhanced_progress.update_progress(5, "Initializing ingestion pipeline...")
|
||||
progress_text.update("🚀 Starting modern ingestion process...")
|
||||
await asyncio.sleep(0.3)
|
||||
# Use user-provided collection name or fall back to default
|
||||
final_collection_name = collection_name or self.collection.get("name")
|
||||
|
||||
# Determine storage backend
|
||||
storage_backend = (
|
||||
StorageBackend.WEAVIATE
|
||||
if self.collection["type"] == "weaviate"
|
||||
else StorageBackend.OPEN_WEBUI
|
||||
)
|
||||
total_successful = 0
|
||||
total_failed = 0
|
||||
flow_errors: list[str] = []
|
||||
|
||||
enhanced_progress.update_progress(15, "Creating ingestion job...")
|
||||
progress_text.update("📋 Configuring job parameters...")
|
||||
await asyncio.sleep(0.4)
|
||||
for i, backend in enumerate(backends):
|
||||
progress_percent = 20 + (60 * i) // len(backends)
|
||||
progress_reporter(progress_percent, f"🔗 Processing {backend.value} backend ({i+1}/{len(backends)})...")
|
||||
|
||||
# Create ingestion job
|
||||
job = IngestionJob(
|
||||
source_url=source_url,
|
||||
source_type=self.selected_type,
|
||||
storage_backend=storage_backend,
|
||||
created_at=datetime.now(),
|
||||
)
|
||||
try:
|
||||
# Run the Prefect flow for this backend using asyncio.run with timeout
|
||||
import asyncio
|
||||
|
||||
enhanced_progress.update_progress(25, "Loading ingestion modules...")
|
||||
progress_text.update("⚡ Importing processing components...")
|
||||
await asyncio.sleep(0.4)
|
||||
async def run_flow_with_timeout() -> IngestionResult:
|
||||
return await asyncio.wait_for(
|
||||
create_ingestion_flow(
|
||||
source_url=source_url,
|
||||
source_type=self.selected_type,
|
||||
storage_backend=backend,
|
||||
collection_name=final_collection_name,
|
||||
progress_callback=progress_reporter,
|
||||
),
|
||||
timeout=600.0 # 10 minute timeout
|
||||
)
|
||||
|
||||
from ....flows.ingestion import ingest_documents_task
|
||||
result = asyncio.run(run_flow_with_timeout())
|
||||
|
||||
enhanced_progress.update_progress(35, "Connecting to services...")
|
||||
progress_text.update(f"🔗 Establishing connection to {storage_backend.value}...")
|
||||
await asyncio.sleep(0.5)
|
||||
total_successful += result.documents_processed
|
||||
total_failed += result.documents_failed
|
||||
|
||||
enhanced_progress.update_progress(45, "Fetching source content...")
|
||||
progress_text.update("📄 Retrieving documents from source...")
|
||||
await asyncio.sleep(0.6)
|
||||
if result.error_messages:
|
||||
flow_errors.extend([f"{backend.value}: {err}" for err in result.error_messages])
|
||||
|
||||
# Simulate realistic progress steps
|
||||
progress_steps = [
|
||||
(55, "Parsing document structure...", "🔍 Analyzing content structure..."),
|
||||
(65, "Extracting text content...", "📝 Processing text and metadata..."),
|
||||
(75, "Generating embeddings...", "🧠 Creating vector embeddings..."),
|
||||
(85, "Storing in database...", "💾 Persisting to storage backend..."),
|
||||
(95, "Finalizing operation...", "🎯 Completing ingestion process..."),
|
||||
]
|
||||
except asyncio.TimeoutError:
|
||||
error_msg = f"{backend.value}: Timeout after 10 minutes"
|
||||
flow_errors.append(error_msg)
|
||||
progress_reporter(0, f"❌ {backend.value} timed out")
|
||||
def notify_timeout(msg: str = f"⏰ {backend.value} flow timed out after 10 minutes") -> None:
|
||||
try:
|
||||
self.notify(msg, severity="error", markup=False)
|
||||
except Exception:
|
||||
pass
|
||||
cast("CollectionManagementApp", self.app).call_from_thread(notify_timeout)
|
||||
except Exception as exc:
|
||||
flow_errors.append(f"{backend.value}: {exc}")
|
||||
def notify_error(msg: str = f"❌ {backend.value} flow failed: {exc}") -> None:
|
||||
try:
|
||||
self.notify(msg, severity="error", markup=False)
|
||||
except Exception:
|
||||
pass
|
||||
cast("CollectionManagementApp", self.app).call_from_thread(notify_error)
|
||||
|
||||
for progress, status, text in progress_steps:
|
||||
enhanced_progress.update_progress(progress, status)
|
||||
progress_text.update(text)
|
||||
await asyncio.sleep(0.7)
|
||||
successful = total_successful
|
||||
failed = total_failed
|
||||
|
||||
# Perform actual ingestion
|
||||
successful, failed = await ingest_documents_task(
|
||||
job, collection_name=self.collection["name"]
|
||||
)
|
||||
progress_reporter(100, "🎉 Completed successfully!")
|
||||
|
||||
# Success handling with celebratory feedback
|
||||
enhanced_progress.update_progress(100, "Completed successfully!")
|
||||
progress_text.update(
|
||||
f"🎉 Ingestion complete: {successful} documents added, {failed} failed"
|
||||
)
|
||||
def notify_results() -> None:
|
||||
try:
|
||||
if successful > 0:
|
||||
self.notify(
|
||||
f"🎉 Successfully ingested {successful} documents across {len(backends)} backend(s) via Prefect!",
|
||||
severity="information",
|
||||
)
|
||||
if failed > 0:
|
||||
self.notify(f"⚠️ {failed} documents failed to process", severity="warning")
|
||||
|
||||
# Show enhanced success notification
|
||||
if successful > 0:
|
||||
self.notify(
|
||||
f"🎉 Successfully ingested {successful} documents!",
|
||||
severity="information"
|
||||
)
|
||||
if failed > 0:
|
||||
self.notify(f"⚠️ {failed} documents failed to process", severity="warning")
|
||||
else:
|
||||
self.notify("❌ No documents were successfully processed", severity="error")
|
||||
if flow_errors:
|
||||
for error in flow_errors:
|
||||
self.notify(f"⚠️ {error}", severity="warning", markup=False)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Keep results visible before closing
|
||||
await asyncio.sleep(3)
|
||||
self.app.pop_screen()
|
||||
cast("CollectionManagementApp", self.app).call_from_thread(notify_results)
|
||||
|
||||
except Exception as e:
|
||||
enhanced_progress.update_progress(0, "Ingestion failed")
|
||||
progress_text.update(f"❌ Error occurred: {str(e)[:100]}")
|
||||
self.notify(f"❌ Ingestion failed: {e}", severity="error")
|
||||
await asyncio.sleep(2) # Show error before allowing interaction
|
||||
import time
|
||||
time.sleep(2)
|
||||
cast("CollectionManagementApp", self.app).pop_screen()
|
||||
|
||||
except Exception as exc: # pragma: no cover - defensive
|
||||
progress_reporter(0, f"❌ Prefect flows error: {exc}")
|
||||
def notify_error(msg: str = f"❌ Prefect flows failed: {exc}") -> None:
|
||||
try:
|
||||
self.notify(msg, severity="error")
|
||||
except Exception:
|
||||
pass
|
||||
cast("CollectionManagementApp", self.app).call_from_thread(notify_error)
|
||||
import time
|
||||
time.sleep(2)
|
||||
finally:
|
||||
loading.display = False
|
||||
update_ui("hide_loading")
|
||||
|
||||
|
||||
@@ -53,7 +53,11 @@ class SearchScreen(Screen[None]):
|
||||
Button("🔍 Search", id="search_btn", variant="primary"),
|
||||
Button("🗑️ Clear Results", id="clear_btn", variant="default"),
|
||||
EnhancedDataTable(id="results_table"),
|
||||
Static("Enter your search query to find relevant documents.", id="search_status", classes="status-text"),
|
||||
Static(
|
||||
"Enter your search query to find relevant documents.",
|
||||
id="search_status",
|
||||
classes="status-text",
|
||||
),
|
||||
LoadingIndicator(id="loading"),
|
||||
classes="main_container",
|
||||
)
|
||||
@@ -63,9 +67,9 @@ class SearchScreen(Screen[None]):
|
||||
"""Initialize the screen."""
|
||||
self.query_one("#loading").display = False
|
||||
|
||||
# Setup results table
|
||||
# Setup results table with enhanced metadata
|
||||
table = self.query_one("#results_table", EnhancedDataTable)
|
||||
table.add_columns("Title", "Content Preview", "Score")
|
||||
table.add_columns("Title", "Source URL", "Type", "Content Preview", "Words", "Score")
|
||||
|
||||
# Focus search input
|
||||
self.query_one("#search_input").focus()
|
||||
@@ -79,7 +83,7 @@ class SearchScreen(Screen[None]):
|
||||
"""Clear search results."""
|
||||
table = self.query_one("#results_table", EnhancedDataTable)
|
||||
table.clear()
|
||||
table.add_columns("Title", "Content Preview", "Score")
|
||||
table.add_columns("Title", "Source URL", "Type", "Content Preview", "Words", "Score")
|
||||
|
||||
status = self.query_one("#search_status", Static)
|
||||
status.update("Search results cleared. Enter a new query to search.")
|
||||
@@ -113,46 +117,95 @@ class SearchScreen(Screen[None]):
|
||||
status = self.query_one("#search_status", Static)
|
||||
|
||||
try:
|
||||
loading.display = True
|
||||
status.update(f"🔍 Searching for '{query}'...")
|
||||
table.clear()
|
||||
table.add_columns("Title", "Content Preview", "Score")
|
||||
|
||||
results = []
|
||||
|
||||
if self.collection["type"] == "weaviate" and self.weaviate:
|
||||
results = await self.search_weaviate(query)
|
||||
elif self.collection["type"] == "openwebui" and self.openwebui:
|
||||
results = await self.search_openwebui(query)
|
||||
|
||||
# Add results to table
|
||||
for result in results:
|
||||
title = result.get("title", "Untitled")
|
||||
content = result.get("content", "")
|
||||
score = result.get("score", 0)
|
||||
table.add_row(
|
||||
title[:50] if isinstance(title, str) else str(title)[:50],
|
||||
(content[:100] + "...")
|
||||
if isinstance(content, str)
|
||||
else str(content)[:100] + "...",
|
||||
f"{score:.3f}" if isinstance(score, (int, float)) else str(score),
|
||||
)
|
||||
|
||||
if not results:
|
||||
status.update(f"No results found for '{query}'. Try different keywords.")
|
||||
self.notify("No results found", severity="information")
|
||||
else:
|
||||
status.update(f"Found {len(results)} results for '{query}'. Use arrow keys to navigate.")
|
||||
self.notify(f"Found {len(results)} results", severity="information")
|
||||
# Focus the table for navigation
|
||||
table.focus()
|
||||
|
||||
self._setup_search_ui(loading, table, status, query)
|
||||
results = await self._execute_search(query)
|
||||
self._populate_results_table(table, results)
|
||||
self._update_search_status(status, query, results, table)
|
||||
except Exception as e:
|
||||
status.update(f"Search error: {e}")
|
||||
self.notify(f"Search error: {e}", severity="error")
|
||||
self.notify(f"Search error: {e}", severity="error", markup=False)
|
||||
finally:
|
||||
loading.display = False
|
||||
|
||||
def _setup_search_ui(self, loading, table, status, query: str) -> None:
|
||||
"""Setup the search UI elements."""
|
||||
loading.display = True
|
||||
status.update(f"🔍 Searching for '{query}'...")
|
||||
table.clear()
|
||||
table.add_columns("Title", "Source URL", "Type", "Content Preview", "Words", "Score")
|
||||
|
||||
async def _execute_search(self, query: str) -> list[dict[str, str | float]]:
|
||||
"""Execute the search based on collection type."""
|
||||
if self.collection["type"] == "weaviate" and self.weaviate:
|
||||
return await self.search_weaviate(query)
|
||||
elif self.collection["type"] == "openwebui" and self.openwebui:
|
||||
return await self.search_openwebui(query)
|
||||
return []
|
||||
|
||||
def _populate_results_table(self, table, results: list[dict[str, str | float]]) -> None:
|
||||
"""Populate the results table with search results."""
|
||||
for result in results:
|
||||
row_data = self._format_result_row(result)
|
||||
table.add_row(*row_data)
|
||||
|
||||
def _format_result_row(self, result: dict[str, str | float]) -> tuple[str, ...]:
|
||||
"""Format a single result row for the table."""
|
||||
title = self._truncate_text(result.get("title", "Untitled"), 30)
|
||||
source_url = self._truncate_text(result.get("source_url", ""), 40)
|
||||
type_display = self._format_content_type(result.get("content_type", "text/plain"))
|
||||
content_preview = self._format_content_preview(result.get("content", ""))
|
||||
word_count = str(result.get("word_count", 0))
|
||||
score_display = self._format_score(result.get("score"))
|
||||
|
||||
return (title, source_url, type_display, content_preview, word_count, score_display)
|
||||
|
||||
def _truncate_text(self, text: str | float | None, max_length: int) -> str:
|
||||
"""Truncate text to specified length."""
|
||||
if not isinstance(text, str):
|
||||
text = str(text) if text is not None else ""
|
||||
return text[:max_length]
|
||||
|
||||
def _format_content_type(self, content_type: str | float) -> str:
|
||||
"""Format content type with appropriate icon."""
|
||||
content_type = str(content_type).lower()
|
||||
if "markdown" in content_type:
|
||||
return "📝 md"
|
||||
elif "html" in content_type:
|
||||
return "🌐 html"
|
||||
elif "text" in content_type:
|
||||
return "📄 txt"
|
||||
else:
|
||||
return f"📄 {content_type.split('/')[-1][:5]}"
|
||||
|
||||
def _format_content_preview(self, content: str | float) -> str:
|
||||
"""Format content preview with truncation."""
|
||||
if not isinstance(content, str):
|
||||
content = str(content) if content is not None else ""
|
||||
return f"{content[:60]}..." if len(content) > 60 else content
|
||||
|
||||
def _format_score(self, score) -> str:
|
||||
"""Format search score for display."""
|
||||
if isinstance(score, (int, float)):
|
||||
return f"{score:.3f}"
|
||||
elif score is None:
|
||||
return "-"
|
||||
else:
|
||||
return str(score)
|
||||
|
||||
def _update_search_status(
|
||||
self, status, query: str, results: list[dict[str, str | float]], table
|
||||
) -> None:
|
||||
"""Update search status and notifications based on results."""
|
||||
if not results:
|
||||
status.update(f"No results found for '{query}'. Try different keywords.")
|
||||
self.notify("No results found", severity="information")
|
||||
else:
|
||||
status.update(
|
||||
f"Found {len(results)} results for '{query}'. Use arrow keys to navigate."
|
||||
)
|
||||
self.notify(f"Found {len(results)} results", severity="information")
|
||||
table.focus()
|
||||
|
||||
async def search_weaviate(self, query: str) -> list[dict[str, str | float]]:
|
||||
"""Search Weaviate collection."""
|
||||
if not self.weaviate:
|
||||
@@ -160,19 +213,39 @@ class SearchScreen(Screen[None]):
|
||||
|
||||
try:
|
||||
await self.weaviate.initialize()
|
||||
results_generator = self.weaviate.search(query, limit=20)
|
||||
results = [doc async for doc in results_generator]
|
||||
# Use the search_documents method which returns more metadata
|
||||
results = await self.weaviate.search_documents(
|
||||
query,
|
||||
limit=20,
|
||||
collection_name=self.collection["name"],
|
||||
)
|
||||
|
||||
# Convert Document objects to dict format expected by the UI
|
||||
return [
|
||||
{
|
||||
"title": getattr(doc, "title", "Untitled"),
|
||||
"content": getattr(doc, "content", ""),
|
||||
"score": getattr(doc, "score", 0.0),
|
||||
}
|
||||
for doc in results
|
||||
]
|
||||
formatted_results = []
|
||||
for doc in results:
|
||||
metadata = getattr(doc, "metadata", {})
|
||||
|
||||
score_value: float | None = None
|
||||
raw_score = getattr(doc, "score", None)
|
||||
if raw_score is not None:
|
||||
try:
|
||||
score_value = float(raw_score)
|
||||
except (TypeError, ValueError):
|
||||
score_value = None
|
||||
|
||||
formatted_results.append(
|
||||
{
|
||||
"title": metadata.get("title", "Untitled"),
|
||||
"source_url": metadata.get("source_url", ""),
|
||||
"content_type": metadata.get("content_type", "text/plain"),
|
||||
"content": getattr(doc, "content", ""),
|
||||
"word_count": metadata.get("word_count", 0),
|
||||
"score": score_value if score_value is not None else 0.0,
|
||||
}
|
||||
)
|
||||
return formatted_results
|
||||
except Exception as e:
|
||||
self.notify(f"Weaviate search error: {e}", severity="error")
|
||||
self.notify(f"Weaviate search error: {e}", severity="error", markup=False)
|
||||
return []
|
||||
|
||||
async def search_openwebui(self, query: str) -> list[dict[str, str | float]]:
|
||||
@@ -186,5 +259,5 @@ class SearchScreen(Screen[None]):
|
||||
self.notify("OpenWebUI search not yet implemented", severity="warning")
|
||||
return []
|
||||
except Exception as e:
|
||||
self.notify(f"OpenWebUI search error: {e}", severity="error")
|
||||
self.notify(f"OpenWebUI search error: {e}", severity="error", markup=False)
|
||||
return []
|
||||
|
||||
File diff suppressed because it is too large
Binary files not shown.
@@ -1,62 +1,141 @@
"""TUI runner functions and initialization."""

import asyncio
from __future__ import annotations

from ....core.models import StorageBackend, StorageConfig
import asyncio
import logging
from logging import Logger
from logging.handlers import QueueHandler, RotatingFileHandler
from pathlib import Path
from queue import Queue
from typing import NamedTuple, cast

from ....config import configure_prefect, get_settings
from ....core.models import StorageBackend
from ....storage.openwebui import OpenWebUIStorage
from ....storage.weaviate import WeaviateStorage
from ..app import CollectionManagementApp
from .storage_manager import StorageManager


class _TuiLoggingContext(NamedTuple):
|
||||
"""Container describing configured logging outputs for the TUI."""
|
||||
|
||||
queue: Queue[logging.LogRecord]
|
||||
formatter: logging.Formatter
|
||||
log_file: Path | None
|
||||
|
||||
|
||||
_logging_context: _TuiLoggingContext | None = None
|
||||
|
||||
|
||||
def _configure_tui_logging(*, log_level: str) -> _TuiLoggingContext:
|
||||
"""Configure logging so that messages do not break the TUI output."""
|
||||
|
||||
global _logging_context
|
||||
if _logging_context is not None:
|
||||
return _logging_context
|
||||
|
||||
resolved_level = getattr(logging, log_level.upper(), logging.INFO)
|
||||
log_queue: Queue[logging.LogRecord] = Queue()
|
||||
formatter = logging.Formatter(
|
||||
fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
)
|
||||
|
||||
root_logger = logging.getLogger()
|
||||
root_logger.setLevel(resolved_level)
|
||||
|
||||
# Remove existing stream handlers to prevent console flicker inside the TUI
|
||||
for handler in list(root_logger.handlers):
|
||||
root_logger.removeHandler(handler)
|
||||
|
||||
queue_handler = QueueHandler(log_queue)
|
||||
queue_handler.setLevel(resolved_level)
|
||||
root_logger.addHandler(queue_handler)
|
||||
|
||||
log_file: Path | None = None
|
||||
try:
|
||||
log_dir = Path.cwd() / "logs"
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
log_file = log_dir / "tui.log"
|
||||
file_handler = RotatingFileHandler(
|
||||
log_file,
|
||||
maxBytes=2_000_000,
|
||||
backupCount=5,
|
||||
encoding="utf-8",
|
||||
)
|
||||
file_handler.setLevel(resolved_level)
|
||||
file_handler.setFormatter(formatter)
|
||||
root_logger.addHandler(file_handler)
|
||||
except OSError as exc: # pragma: no cover - filesystem specific
|
||||
fallback = logging.getLogger(__name__)
|
||||
fallback.warning("Failed to configure file logging for TUI: %s", exc)
|
||||
|
||||
_logging_context = _TuiLoggingContext(log_queue, formatter, log_file)
return _logging_context
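Because every stream handler is removed and replaced with a QueueHandler, nothing is printed over the Textual display; the queue and formatter are later handed to CollectionManagementApp, which is expected to drain them. A minimal sketch of one way such a consumer could look, using the stdlib QueueListener (the file sink is purely illustrative and not part of this commit):

from logging.handlers import QueueListener

context = _configure_tui_logging(log_level="INFO")
# Illustrative sink; the real app forwards records into its log panel instead.
sink = RotatingFileHandler("logs/tui-console.log", maxBytes=1_000_000, backupCount=2, encoding="utf-8")
sink.setFormatter(context.formatter)
listener = QueueListener(context.queue, sink, respect_handler_level=True)
listener.start()
try:
    ...  # run the TUI
finally:
    listener.stop()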
|
||||
|
||||
LOGGER: Logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def run_textual_tui() -> None:
|
||||
"""Run the enhanced modern TUI with better error handling and initialization."""
|
||||
from ....config.settings import get_settings
|
||||
|
||||
settings = get_settings()
|
||||
configure_prefect(settings)
|
||||
|
||||
# Initialize storage backends with enhanced error handling
|
||||
weaviate = None
|
||||
openwebui = None
|
||||
logging_context = _configure_tui_logging(log_level=settings.log_level)
|
||||
|
||||
print("🚀 Initializing Modern Collection Management System...")
|
||||
LOGGER.info("Initializing collection management TUI")
|
||||
LOGGER.info("Scanning available storage backends")
|
||||
|
||||
try:
|
||||
print("🔗 Connecting to Weaviate...")
|
||||
weaviate_config = StorageConfig(
|
||||
backend=StorageBackend.WEAVIATE,
|
||||
endpoint=settings.weaviate_endpoint,
|
||||
api_key=settings.weaviate_api_key,
|
||||
collection_name="default",
|
||||
)
|
||||
weaviate = WeaviateStorage(weaviate_config)
|
||||
await weaviate.initialize()
|
||||
print("✅ Weaviate connected successfully!")
|
||||
except Exception as e:
|
||||
print(f"⚠️ Weaviate connection failed: {e}")
|
||||
# Initialize storage manager
|
||||
storage_manager = StorageManager(settings)
|
||||
backend_status = await storage_manager.initialize_all_backends()
|
||||
|
||||
try:
|
||||
print("🔗 Connecting to OpenWebUI...")
|
||||
openwebui_config = StorageConfig(
|
||||
backend=StorageBackend.OPEN_WEBUI,
|
||||
endpoint=settings.openwebui_endpoint,
|
||||
api_key=settings.openwebui_api_key,
|
||||
collection_name="default",
|
||||
)
|
||||
openwebui = OpenWebUIStorage(openwebui_config)
|
||||
await openwebui.initialize()
|
||||
print("✅ OpenWebUI connected successfully!")
|
||||
except Exception as e:
|
||||
print(f"⚠️ OpenWebUI connection failed: {e}")
|
||||
# Report initialization results
|
||||
for backend, success in backend_status.items():
|
||||
if success:
|
||||
LOGGER.info("%s connected successfully", backend.value)
|
||||
else:
|
||||
LOGGER.warning("%s connection failed", backend.value)
|
||||
|
||||
if not weaviate and not openwebui:
|
||||
print("❌ Error: Could not connect to any storage backend")
|
||||
print("Please check your configuration and try again.")
|
||||
available_backends = storage_manager.get_available_backends()
|
||||
if not available_backends:
|
||||
LOGGER.error("Could not connect to any storage backend")
|
||||
LOGGER.info("Please check your configuration and try again")
|
||||
LOGGER.info("Supported backends: Weaviate, OpenWebUI, R2R")
|
||||
return
|
||||
|
||||
print("🎉 Launching Enhanced TUI with Keyboard Navigation...")
|
||||
LOGGER.info(
|
||||
"Launching TUI with %d backend(s): %s",
|
||||
len(available_backends),
|
||||
", ".join(backend.value for backend in available_backends),
|
||||
)
|
||||
|
||||
app = CollectionManagementApp(weaviate, openwebui)
|
||||
await app.run_async()
|
||||
# Get individual storage instances for backward compatibility
|
||||
weaviate = cast(WeaviateStorage | None, storage_manager.get_backend(StorageBackend.WEAVIATE))
|
||||
openwebui = cast(
|
||||
OpenWebUIStorage | None, storage_manager.get_backend(StorageBackend.OPEN_WEBUI)
|
||||
)
|
||||
r2r = storage_manager.get_backend(StorageBackend.R2R)
|
||||
|
||||
# Import here to avoid circular import
|
||||
from ..app import CollectionManagementApp
|
||||
app = CollectionManagementApp(
|
||||
storage_manager,
|
||||
weaviate,
|
||||
openwebui,
|
||||
r2r,
|
||||
log_queue=logging_context.queue,
|
||||
log_formatter=logging_context.formatter,
|
||||
log_file=logging_context.log_file,
|
||||
)
|
||||
try:
|
||||
await app.run_async()
|
||||
finally:
|
||||
LOGGER.info("Shutting down storage connections")
|
||||
await storage_manager.close_all()
|
||||
LOGGER.info("All storage connections closed gracefully")
|
||||
|
||||
|
||||
def dashboard() -> None:
|
||||
|
||||
ingest_pipeline/cli/tui/utils/storage_manager.py (new file, 493 lines)
@@ -0,0 +1,493 @@
"""Storage management utilities for TUI applications."""


from __future__ import annotations

import asyncio
from collections.abc import AsyncGenerator, Sequence
from typing import TYPE_CHECKING, Protocol

from ....core.exceptions import StorageError
from ....core.models import Document, StorageBackend, StorageConfig
from ..models import CollectionInfo, StorageCapabilities

if TYPE_CHECKING:
from ....config.settings import Settings
from ....storage.weaviate import WeaviateStorage
from ....storage.r2r.storage import R2RStorage
from ....storage.openwebui import OpenWebUIStorage
from ....storage.base import BaseStorage


class StorageBackendProtocol(Protocol):
|
||||
"""Protocol defining storage backend interface."""
|
||||
|
||||
async def initialize(self) -> None: ...
|
||||
async def count(self, *, collection_name: str | None = None) -> int: ...
|
||||
async def list_collections(self) -> list[str]: ...
|
||||
async def search(
|
||||
self,
|
||||
query: str,
|
||||
limit: int = 10,
|
||||
threshold: float = 0.7,
|
||||
*,
|
||||
collection_name: str | None = None,
|
||||
) -> AsyncGenerator[Document, None]: ...
|
||||
async def close(self) -> None: ...
|
||||
|
||||
|
||||
|
||||
class MultiStorageAdapter(BaseStorage):
|
||||
"""Mirror writes to multiple storage backends."""
|
||||
|
||||
def __init__(self, storages: Sequence[BaseStorage]) -> None:
|
||||
if not storages:
|
||||
raise ValueError("MultiStorageAdapter requires at least one storage backend")
|
||||
|
||||
unique: list[BaseStorage] = []
|
||||
seen_ids: set[int] = set()
|
||||
for storage in storages:
|
||||
storage_id = id(storage)
|
||||
if storage_id in seen_ids:
|
||||
continue
|
||||
seen_ids.add(storage_id)
|
||||
unique.append(storage)
|
||||
|
||||
self._storages = unique
|
||||
self._primary = unique[0]
|
||||
super().__init__(self._primary.config)
|
||||
|
||||
async def initialize(self) -> None:
|
||||
for storage in self._storages:
|
||||
await storage.initialize()
|
||||
|
||||
async def store(self, document: Document, *, collection_name: str | None = None) -> str:
|
||||
# Store in primary backend first
|
||||
primary_id: str = await self._primary.store(document, collection_name=collection_name)
|
||||
|
||||
# Replicate to secondary backends concurrently
|
||||
if len(self._storages) > 1:
|
||||
async def replicate_to_backend(storage: BaseStorage) -> tuple[BaseStorage, bool, Exception | None]:
|
||||
try:
|
||||
await storage.store(document, collection_name=collection_name)
|
||||
return storage, True, None
|
||||
except Exception as exc:
|
||||
return storage, False, exc
|
||||
|
||||
tasks = [replicate_to_backend(storage) for storage in self._storages[1:]]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
failures: list[str] = []
|
||||
errors: list[Exception] = []
|
||||
|
||||
for result in results:
|
||||
if isinstance(result, tuple):
|
||||
storage, success, error = result
|
||||
if not success and error is not None:
|
||||
failures.append(self._format_backend_label(storage))
|
||||
errors.append(error)
|
||||
elif isinstance(result, Exception):
|
||||
failures.append("unknown")
|
||||
errors.append(result)
|
||||
|
||||
if failures:
|
||||
backends = ", ".join(failures)
|
||||
primary_error = errors[0] if errors else Exception("Unknown replication error")
|
||||
raise StorageError(
|
||||
f"Document stored in primary backend but replication failed for: {backends}"
|
||||
) from primary_error
|
||||
|
||||
return primary_id
|
||||
|
||||
async def store_batch(
|
||||
self, documents: list[Document], *, collection_name: str | None = None
|
||||
) -> list[str]:
|
||||
# Store in primary backend first
|
||||
primary_ids: list[str] = await self._primary.store_batch(documents, collection_name=collection_name)
|
||||
|
||||
# Replicate to secondary backends concurrently
|
||||
if len(self._storages) > 1:
|
||||
async def replicate_batch_to_backend(storage: BaseStorage) -> tuple[BaseStorage, bool, Exception | None]:
|
||||
try:
|
||||
await storage.store_batch(documents, collection_name=collection_name)
|
||||
return storage, True, None
|
||||
except Exception as exc:
|
||||
return storage, False, exc
|
||||
|
||||
tasks = [replicate_batch_to_backend(storage) for storage in self._storages[1:]]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
failures: list[str] = []
|
||||
errors: list[Exception] = []
|
||||
|
||||
for result in results:
|
||||
if isinstance(result, tuple):
|
||||
storage, success, error = result
|
||||
if not success and error is not None:
|
||||
failures.append(self._format_backend_label(storage))
|
||||
errors.append(error)
|
||||
elif isinstance(result, Exception):
|
||||
failures.append("unknown")
|
||||
errors.append(result)
|
||||
|
||||
if failures:
|
||||
backends = ", ".join(failures)
|
||||
primary_error = errors[0] if errors else Exception("Unknown batch replication error")
|
||||
raise StorageError(
|
||||
f"Batch stored in primary backend but replication failed for: {backends}"
|
||||
) from primary_error
|
||||
|
||||
return primary_ids
|
||||
|
||||
async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool:
|
||||
# Delete from primary backend first
|
||||
primary_deleted: bool = await self._primary.delete(document_id, collection_name=collection_name)
|
||||
|
||||
# Delete from secondary backends concurrently
|
||||
if len(self._storages) > 1:
|
||||
async def delete_from_backend(storage: BaseStorage) -> tuple[BaseStorage, bool, Exception | None]:
|
||||
try:
|
||||
await storage.delete(document_id, collection_name=collection_name)
|
||||
return storage, True, None
|
||||
except Exception as exc:
|
||||
return storage, False, exc
|
||||
|
||||
tasks = [delete_from_backend(storage) for storage in self._storages[1:]]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
failures: list[str] = []
|
||||
errors: list[Exception] = []
|
||||
|
||||
for result in results:
|
||||
if isinstance(result, tuple):
|
||||
storage, success, error = result
|
||||
if not success and error is not None:
|
||||
failures.append(self._format_backend_label(storage))
|
||||
errors.append(error)
|
||||
elif isinstance(result, Exception):
|
||||
failures.append("unknown")
|
||||
errors.append(result)
|
||||
|
||||
if failures:
|
||||
backends = ", ".join(failures)
|
||||
primary_error = errors[0] if errors else Exception("Unknown deletion error")
|
||||
raise StorageError(
|
||||
f"Document deleted from primary backend but failed for: {backends}"
|
||||
) from primary_error
|
||||
|
||||
return primary_deleted
|
||||
|
||||
async def count(self, *, collection_name: str | None = None) -> int:
|
||||
count_result: int = await self._primary.count(collection_name=collection_name)
|
||||
return count_result
|
||||
|
||||
async def list_collections(self) -> list[str]:
|
||||
list_fn = getattr(self._primary, "list_collections", None)
|
||||
if list_fn is None:
|
||||
return []
|
||||
collections_result: list[str] = await list_fn()
|
||||
return collections_result
|
||||
|
||||
async def search(
|
||||
self,
|
||||
query: str,
|
||||
limit: int = 10,
|
||||
threshold: float = 0.7,
|
||||
*,
|
||||
collection_name: str | None = None,
|
||||
) -> AsyncGenerator[Document, None]:
|
||||
async for item in self._primary.search(
|
||||
query,
|
||||
limit=limit,
|
||||
threshold=threshold,
|
||||
collection_name=collection_name,
|
||||
):
|
||||
yield item
|
||||
|
||||
async def close(self) -> None:
|
||||
for storage in self._storages:
|
||||
close_fn = getattr(storage, "close", None)
|
||||
if close_fn is not None:
|
||||
await close_fn()
|
||||
|
||||
def _format_backend_label(self, storage: BaseStorage) -> str:
backend = getattr(storage.config, "backend", None)
if isinstance(backend, StorageBackend):
backend_value: str = backend.value
return backend_value
class_name: str = storage.__class__.__name__
return class_name
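A minimal usage sketch for MultiStorageAdapter, assuming two backends that have already been constructed and a Document instance named document (the variable names are illustrative): writes go to the first storage, then fan out to the rest, and a partial replication failure surfaces as StorageError while the primary copy is kept.

adapter = MultiStorageAdapter([weaviate_storage, openwebui_storage])
await adapter.initialize()
try:
    doc_id = await adapter.store(document, collection_name="default")
except StorageError as exc:
    # The primary write succeeded; one or more replicas did not.
    print(f"Replication incomplete: {exc}")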
|
||||
|
||||
|
||||
class StorageManager:
|
||||
"""Centralized manager for all storage backend operations."""
|
||||
|
||||
def __init__(self, settings: Settings) -> None:
|
||||
"""Initialize storage manager with application settings."""
|
||||
self.settings = settings
|
||||
self.backends: dict[StorageBackend, BaseStorage] = {}
|
||||
self.capabilities: dict[StorageBackend, StorageCapabilities] = {}
|
||||
self._initialized = False
|
||||
|
||||
async def initialize_all_backends(self) -> dict[StorageBackend, bool]:
|
||||
"""Initialize all available storage backends with timeout protection."""
|
||||
results: dict[StorageBackend, bool] = {}
|
||||
|
||||
async def init_backend(backend_type: StorageBackend, config: StorageConfig, storage_class: type[BaseStorage]) -> bool:
|
||||
"""Initialize a single backend with timeout."""
|
||||
try:
|
||||
storage = storage_class(config)
|
||||
await asyncio.wait_for(storage.initialize(), timeout=30.0)
|
||||
self.backends[backend_type] = storage
|
||||
if backend_type == StorageBackend.WEAVIATE:
|
||||
self.capabilities[backend_type] = StorageCapabilities.VECTOR_SEARCH
|
||||
elif backend_type == StorageBackend.OPEN_WEBUI:
|
||||
self.capabilities[backend_type] = StorageCapabilities.KNOWLEDGE_BASE
|
||||
elif backend_type == StorageBackend.R2R:
|
||||
self.capabilities[backend_type] = StorageCapabilities.FULL_FEATURED
|
||||
return True
|
||||
except (TimeoutError, Exception):
|
||||
return False
|
||||
|
||||
# Initialize backends concurrently with timeout protection
|
||||
tasks = []
|
||||
|
||||
# Try Weaviate
|
||||
if self.settings.weaviate_endpoint:
|
||||
config = StorageConfig(
|
||||
backend=StorageBackend.WEAVIATE,
|
||||
endpoint=self.settings.weaviate_endpoint,
|
||||
api_key=self.settings.weaviate_api_key,
|
||||
collection_name="default",
|
||||
)
|
||||
tasks.append((StorageBackend.WEAVIATE, init_backend(StorageBackend.WEAVIATE, config, WeaviateStorage)))
|
||||
else:
|
||||
results[StorageBackend.WEAVIATE] = False
|
||||
|
||||
# Try OpenWebUI
|
||||
if self.settings.openwebui_endpoint and self.settings.openwebui_api_key:
|
||||
config = StorageConfig(
|
||||
backend=StorageBackend.OPEN_WEBUI,
|
||||
endpoint=self.settings.openwebui_endpoint,
|
||||
api_key=self.settings.openwebui_api_key,
|
||||
collection_name="default",
|
||||
)
|
||||
tasks.append((StorageBackend.OPEN_WEBUI, init_backend(StorageBackend.OPEN_WEBUI, config, OpenWebUIStorage)))
|
||||
else:
|
||||
results[StorageBackend.OPEN_WEBUI] = False
|
||||
|
||||
# Try R2R
|
||||
if self.settings.r2r_endpoint:
|
||||
config = StorageConfig(
|
||||
backend=StorageBackend.R2R,
|
||||
endpoint=self.settings.r2r_endpoint,
|
||||
api_key=self.settings.r2r_api_key,
|
||||
collection_name="default",
|
||||
)
|
||||
tasks.append((StorageBackend.R2R, init_backend(StorageBackend.R2R, config, R2RStorage)))
|
||||
else:
|
||||
results[StorageBackend.R2R] = False
|
||||
|
||||
# Execute initialization tasks concurrently
|
||||
if tasks:
|
||||
backend_types, task_coroutines = zip(*tasks, strict=False)
|
||||
task_results = await asyncio.gather(*task_coroutines, return_exceptions=True)
|
||||
|
||||
for backend_type, task_result in zip(backend_types, task_results, strict=False):
|
||||
results[backend_type] = task_result if isinstance(task_result, bool) else False
|
||||
self._initialized = True
|
||||
return results
|
||||
|
||||
def get_backend(self, backend_type: StorageBackend) -> BaseStorage | None:
|
||||
"""Get storage backend by type."""
|
||||
return self.backends.get(backend_type)
|
||||
|
||||
def build_multi_storage_adapter(
|
||||
self, backends: Sequence[StorageBackend]
|
||||
) -> MultiStorageAdapter:
|
||||
storages: list[BaseStorage] = []
|
||||
seen: set[StorageBackend] = set()
|
||||
for backend in backends:
|
||||
backend_enum = backend if isinstance(backend, StorageBackend) else StorageBackend(backend)
|
||||
if backend_enum in seen:
|
||||
continue
|
||||
seen.add(backend_enum)
|
||||
storage = self.backends.get(backend_enum)
|
||||
if storage is None:
|
||||
raise ValueError(f"Storage backend {backend_enum.value} is not initialized")
|
||||
storages.append(storage)
|
||||
return MultiStorageAdapter(storages)
|
||||
|
||||
def get_available_backends(self) -> list[StorageBackend]:
|
||||
"""Get list of successfully initialized backends."""
|
||||
return list(self.backends.keys())
|
||||
|
||||
def has_capability(self, backend: StorageBackend, capability: StorageCapabilities) -> bool:
|
||||
"""Check if backend has specific capability."""
|
||||
backend_caps = self.capabilities.get(backend, StorageCapabilities.BASIC)
|
||||
return capability.value <= backend_caps.value
|
||||
|
||||
async def get_all_collections(self) -> list[CollectionInfo]:
|
||||
"""Get collections from all available backends, merging collections with same name."""
|
||||
collection_map: dict[str, CollectionInfo] = {}
|
||||
|
||||
for backend_type, storage in self.backends.items():
|
||||
try:
|
||||
backend_collections = await storage.list_collections()
|
||||
for collection_name in backend_collections:
|
||||
# Validate collection name
|
||||
if not collection_name or not isinstance(collection_name, str):
|
||||
continue
|
||||
|
||||
try:
|
||||
count = await storage.count(collection_name=collection_name)
|
||||
# Validate count is non-negative
|
||||
count = max(count, 0)
|
||||
except StorageError as e:
|
||||
# Storage-specific errors - log and use 0 count
|
||||
import logging
|
||||
logging.warning(f"Failed to get count for {collection_name} on {backend_type.value}: {e}")
|
||||
count = 0
|
||||
except Exception as e:
|
||||
# Unexpected errors - log and skip this collection from this backend
|
||||
import logging
|
||||
logging.warning(f"Unexpected error counting {collection_name} on {backend_type.value}: {e}")
|
||||
continue
|
||||
|
||||
size_mb = count * 0.01 # Rough estimate: 10KB per document
|
||||
|
||||
if collection_name in collection_map:
|
||||
# Merge with existing collection
|
||||
existing = collection_map[collection_name]
|
||||
existing_backends = existing["backend"]
|
||||
backend_value = backend_type.value
|
||||
|
||||
if isinstance(existing_backends, str):
|
||||
existing["backend"] = [existing_backends, backend_value]
|
||||
elif isinstance(existing_backends, list):
|
||||
# Prevent duplicates
|
||||
if backend_value not in existing_backends:
|
||||
existing_backends.append(backend_value)
|
||||
|
||||
# Aggregate counts and sizes
|
||||
existing["count"] += count
|
||||
existing["size_mb"] += size_mb
|
||||
else:
|
||||
# Create new collection entry
|
||||
collection_info: CollectionInfo = {
|
||||
"name": collection_name,
|
||||
"type": self._get_collection_type(collection_name, backend_type),
|
||||
"count": count,
|
||||
"backend": backend_type.value,
|
||||
"status": "active",
|
||||
"last_updated": "2024-01-01T00:00:00Z",
|
||||
"size_mb": size_mb,
|
||||
}
|
||||
collection_map[collection_name] = collection_info
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
return list(collection_map.values())
|
||||
|
||||
def _get_collection_type(self, collection_name: str, backend: StorageBackend) -> str:
|
||||
"""Determine collection type based on name and backend."""
|
||||
# Prioritize definitive backend type first
|
||||
if backend == StorageBackend.R2R:
|
||||
return "r2r"
|
||||
elif backend == StorageBackend.WEAVIATE:
|
||||
return "weaviate"
|
||||
elif backend == StorageBackend.OPEN_WEBUI:
|
||||
return "openwebui"
|
||||
|
||||
# Fallback to name-based guessing if backend is not specific
|
||||
name_lower = collection_name.lower()
|
||||
if "web" in name_lower or "doc" in name_lower:
|
||||
return "documentation"
|
||||
elif "repo" in name_lower or "code" in name_lower:
|
||||
return "repository"
|
||||
else:
|
||||
return "general"
|
||||
|
||||
async def search_across_backends(
|
||||
self,
|
||||
query: str,
|
||||
limit: int = 10,
|
||||
backends: list[StorageBackend] | None = None,
|
||||
) -> dict[StorageBackend, list[Document]]:
|
||||
"""Search across multiple backends and return grouped results."""
|
||||
if backends is None:
|
||||
backends = self.get_available_backends()
|
||||
|
||||
results: dict[StorageBackend, list[Document]] = {}
|
||||
|
||||
async def search_backend(backend_type: StorageBackend) -> None:
|
||||
storage = self.backends.get(backend_type)
|
||||
if storage:
|
||||
try:
|
||||
documents = []
|
||||
async for doc in storage.search(query, limit=limit):
|
||||
documents.append(doc)
|
||||
results[backend_type] = documents
|
||||
except Exception:
|
||||
results[backend_type] = []
|
||||
|
||||
# Run searches in parallel
|
||||
tasks = [search_backend(backend) for backend in backends]
|
||||
await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
return results
|
||||
|
||||
def get_r2r_storage(self) -> R2RStorage | None:
|
||||
"""Get R2R storage instance if available."""
|
||||
storage = self.backends.get(StorageBackend.R2R)
|
||||
return storage if isinstance(storage, R2RStorage) else None
|
||||
|
||||
async def get_backend_status(self) -> dict[StorageBackend, dict[str, str | int | bool | StorageCapabilities]]:
|
||||
"""Get comprehensive status for all backends."""
|
||||
status: dict[StorageBackend, dict[str, str | int | bool | StorageCapabilities]] = {}
|
||||
|
||||
for backend_type, storage in self.backends.items():
|
||||
try:
|
||||
collections = await storage.list_collections()
|
||||
total_docs = 0
|
||||
for collection in collections:
|
||||
total_docs += await storage.count(collection_name=collection)
|
||||
|
||||
backend_status = {
|
||||
"available": True,
|
||||
"collections": len(collections),
|
||||
"total_documents": total_docs,
|
||||
"capabilities": self.capabilities.get(backend_type, StorageCapabilities.BASIC),
|
||||
"endpoint": getattr(storage.config, "endpoint", "unknown"),
|
||||
}
|
||||
status[backend_type] = backend_status
|
||||
except Exception as e:
|
||||
status[backend_type] = {
|
||||
"available": False,
|
||||
"error": str(e),
|
||||
"capabilities": StorageCapabilities.NONE,
|
||||
}
|
||||
|
||||
return status
|
||||
|
||||
async def close_all(self) -> None:
|
||||
"""Close all storage connections."""
|
||||
for storage in self.backends.values():
|
||||
try:
|
||||
await storage.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
self.backends.clear()
|
||||
self.capabilities.clear()
|
||||
self._initialized = False
|
||||
|
||||
@property
|
||||
def is_initialized(self) -> bool:
|
||||
"""Check if storage manager is initialized."""
|
||||
return self._initialized
|
||||
|
||||
def supports_advanced_features(self, backend: StorageBackend) -> bool:
"""Check if backend supports advanced features like chunks and entities."""
return self.has_capability(backend, StorageCapabilities.FULL_FEATURED)
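Putting the pieces together, a typical StorageManager lifecycle is: initialize whatever backends the settings describe, inspect the results, optionally mirror writes through build_multi_storage_adapter, and close everything on shutdown. A rough sketch, assuming settings comes from get_settings() as in the runner above and documents is a prepared list of Document objects:

manager = StorageManager(settings)
status = await manager.initialize_all_backends()
for backend, ok in status.items():
    print(f"{backend.value}: {'up' if ok else 'unavailable'}")

available = manager.get_available_backends()
if available:
    adapter = manager.build_multi_storage_adapter(available)
    await adapter.store_batch(documents, collection_name="default")

await manager.close_all()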
Binary files not shown.
ingest_pipeline/cli/tui/widgets/firecrawl_config.py (new file, 638 lines)
@@ -0,0 +1,638 @@
|
||||
"""Firecrawl configuration widgets for advanced scraping options."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, cast
|
||||
|
||||
from textual.app import ComposeResult
|
||||
from textual.containers import Container, Horizontal
|
||||
from textual.validation import Integer
|
||||
from textual.widgets import Button, Checkbox, Input, Label, Switch, TextArea
|
||||
from typing_extensions import override
|
||||
|
||||
from ..models import FirecrawlOptions
|
||||
|
||||
|
||||
class ScrapeOptionsForm(Container):
|
||||
"""Form for configuring Firecrawl scraping options."""
|
||||
|
||||
DEFAULT_CSS = """
|
||||
ScrapeOptionsForm {
|
||||
border: solid $border;
|
||||
background: $surface;
|
||||
padding: 1;
|
||||
height: auto;
|
||||
}
|
||||
|
||||
ScrapeOptionsForm .form-section {
|
||||
margin-bottom: 2;
|
||||
padding: 1;
|
||||
border: solid $border-lighten-1;
|
||||
background: $surface-lighten-1;
|
||||
}
|
||||
|
||||
ScrapeOptionsForm .form-row {
|
||||
layout: horizontal;
|
||||
align-items: center;
|
||||
height: auto;
|
||||
margin-bottom: 1;
|
||||
}
|
||||
|
||||
ScrapeOptionsForm .form-label {
|
||||
width: 30%;
|
||||
min-width: 15;
|
||||
text-align: right;
|
||||
padding-right: 2;
|
||||
}
|
||||
|
||||
ScrapeOptionsForm .form-input {
|
||||
width: 70%;
|
||||
}
|
||||
|
||||
ScrapeOptionsForm .checkbox-row {
|
||||
layout: horizontal;
|
||||
align-items: center;
|
||||
height: 3;
|
||||
margin-bottom: 1;
|
||||
}
|
||||
|
||||
ScrapeOptionsForm .checkbox-label {
|
||||
margin-left: 2;
|
||||
}
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs: Any) -> None:
|
||||
"""Initialize scrape options form."""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
"""Compose scrape options form."""
|
||||
yield Label("🔧 Scraping Configuration", classes="form-title")
|
||||
|
||||
# Output formats section
|
||||
yield Container(
|
||||
Label("Output Formats", classes="section-title"),
|
||||
Horizontal(
|
||||
Checkbox("Markdown", id="format_markdown", value=True, classes="checkbox"),
|
||||
Label("Markdown", classes="checkbox-label"),
|
||||
classes="checkbox-row",
|
||||
),
|
||||
Horizontal(
|
||||
Checkbox("HTML", id="format_html", value=False, classes="checkbox"),
|
||||
Label("HTML", classes="checkbox-label"),
|
||||
classes="checkbox-row",
|
||||
),
|
||||
Horizontal(
|
||||
Checkbox("Screenshot", id="format_screenshot", value=False, classes="checkbox"),
|
||||
Label("Screenshot", classes="checkbox-label"),
|
||||
classes="checkbox-row",
|
||||
),
|
||||
classes="form-section",
|
||||
)
|
||||
|
||||
# Content filtering section
|
||||
yield Container(
|
||||
Label("Content Filtering", classes="section-title"),
|
||||
Horizontal(
|
||||
Label("Only Main Content:", classes="form-label"),
|
||||
Switch(id="only_main_content", value=True, classes="form-input"),
|
||||
classes="form-row",
|
||||
),
|
||||
Horizontal(
|
||||
Label("Include Tags:", classes="form-label"),
|
||||
Input(
|
||||
placeholder="p, div, article (comma-separated)",
|
||||
id="include_tags",
|
||||
classes="form-input",
|
||||
),
|
||||
classes="form-row",
|
||||
),
|
||||
Horizontal(
|
||||
Label("Exclude Tags:", classes="form-label"),
|
||||
Input(
|
||||
placeholder="nav, footer, script (comma-separated)",
|
||||
id="exclude_tags",
|
||||
classes="form-input",
|
||||
),
|
||||
classes="form-row",
|
||||
),
|
||||
classes="form-section",
|
||||
)
|
||||
|
||||
# Performance settings section
|
||||
yield Container(
|
||||
Label("Performance Settings", classes="section-title"),
|
||||
Horizontal(
|
||||
Label("Wait Time (ms):", classes="form-label"),
|
||||
Input(
|
||||
placeholder="0",
|
||||
id="wait_for",
|
||||
validators=[Integer(minimum=0, maximum=30000)],
|
||||
classes="form-input",
|
||||
),
|
||||
classes="form-row",
|
||||
),
|
||||
classes="form-section",
|
||||
)
|
||||
|
||||
def get_scrape_options(self) -> dict[str, Any]:
|
||||
"""Get scraping options from form."""
|
||||
options: dict[str, Any] = {}
|
||||
|
||||
# Collect formats
|
||||
formats = []
|
||||
if self.query_one("#format_markdown", Checkbox).value:
|
||||
formats.append("markdown")
|
||||
if self.query_one("#format_html", Checkbox).value:
|
||||
formats.append("html")
|
||||
if self.query_one("#format_screenshot", Checkbox).value:
|
||||
formats.append("screenshot")
|
||||
options["formats"] = formats
|
||||
|
||||
# Content filtering
|
||||
options["only_main_content"] = self.query_one("#only_main_content", Switch).value
|
||||
|
||||
include_tags_input = self.query_one("#include_tags", Input).value
|
||||
if include_tags_input.strip():
|
||||
options["include_tags"] = [tag.strip() for tag in include_tags_input.split(",")]
|
||||
|
||||
exclude_tags_input = self.query_one("#exclude_tags", Input).value
|
||||
if exclude_tags_input.strip():
|
||||
options["exclude_tags"] = [tag.strip() for tag in exclude_tags_input.split(",")]
|
||||
|
||||
# Performance
|
||||
wait_for_input = self.query_one("#wait_for", Input).value
|
||||
if wait_for_input.strip():
|
||||
try:
|
||||
options["wait_for"] = int(wait_for_input)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return options
|
||||
|
||||
def set_scrape_options(self, options: dict[str, Any]) -> None:
|
||||
"""Set form values from options."""
|
||||
# Set formats
|
||||
formats = options.get("formats", ["markdown"])
|
||||
self.query_one("#format_markdown", Checkbox).value = "markdown" in formats
|
||||
self.query_one("#format_html", Checkbox).value = "html" in formats
|
||||
self.query_one("#format_screenshot", Checkbox).value = "screenshot" in formats
|
||||
|
||||
# Set content filtering
|
||||
self.query_one("#only_main_content", Switch).value = options.get("only_main_content", True)
|
||||
|
||||
if include_tags := options.get("include_tags", []):
|
||||
self.query_one("#include_tags", Input).value = ", ".join(include_tags)
|
||||
|
||||
if exclude_tags := options.get("exclude_tags", []):
|
||||
self.query_one("#exclude_tags", Input).value = ", ".join(exclude_tags)
|
||||
|
||||
# Set performance
wait_for = options.get("wait_for")
if wait_for is not None:
self.query_one("#wait_for", Input).value = str(wait_for)
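Since ScrapeOptionsForm round-trips its state through plain dictionaries, presets are easy to persist and restore. A small sketch of that round trip, assuming the form is mounted with id "scrape_form" (as FirecrawlConfigWidget below does); the preset values are illustrative:

preset = {
    "formats": ["markdown", "html"],
    "only_main_content": True,
    "exclude_tags": ["nav", "footer", "script"],
    "wait_for": 1500,
}
form = self.query_one("#scrape_form", ScrapeOptionsForm)
form.set_scrape_options(preset)
assert form.get_scrape_options()["formats"] == ["markdown", "html"]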
|
||||
|
||||
class MapOptionsForm(Container):
|
||||
"""Form for configuring site mapping options."""
|
||||
|
||||
DEFAULT_CSS = """
|
||||
MapOptionsForm {
|
||||
border: solid $border;
|
||||
background: $surface;
|
||||
padding: 1;
|
||||
height: auto;
|
||||
}
|
||||
|
||||
MapOptionsForm .form-section {
|
||||
margin-bottom: 2;
|
||||
padding: 1;
|
||||
border: solid $border-lighten-1;
|
||||
background: $surface-lighten-1;
|
||||
}
|
||||
|
||||
MapOptionsForm .form-row {
|
||||
layout: horizontal;
|
||||
align-items: center;
|
||||
height: auto;
|
||||
margin-bottom: 1;
|
||||
}
|
||||
|
||||
MapOptionsForm .form-label {
|
||||
width: 30%;
|
||||
min-width: 15;
|
||||
text-align: right;
|
||||
padding-right: 2;
|
||||
}
|
||||
|
||||
MapOptionsForm .form-input {
|
||||
width: 70%;
|
||||
}
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs: Any) -> None:
|
||||
"""Initialize map options form."""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
"""Compose map options form."""
|
||||
yield Label("🗺️ Site Mapping Configuration", classes="form-title")
|
||||
|
||||
# Discovery settings section
|
||||
yield Container(
|
||||
Label("Discovery Settings", classes="section-title"),
|
||||
Horizontal(
|
||||
Label("Search Pattern:", classes="form-label"),
|
||||
Input(
|
||||
placeholder="docs, api, guide (optional)",
|
||||
id="search_pattern",
|
||||
classes="form-input",
|
||||
),
|
||||
classes="form-row",
|
||||
),
|
||||
Horizontal(
|
||||
Label("Include Subdomains:", classes="form-label"),
|
||||
Switch(id="include_subdomains", value=False, classes="form-input"),
|
||||
classes="form-row",
|
||||
),
|
||||
classes="form-section",
|
||||
)
|
||||
|
||||
# Limits section
|
||||
yield Container(
|
||||
Label("Crawling Limits", classes="section-title"),
|
||||
Horizontal(
|
||||
Label("Max Pages:", classes="form-label"),
|
||||
Input(
|
||||
placeholder="100",
|
||||
id="max_pages",
|
||||
validators=[Integer(minimum=1, maximum=1000)],
|
||||
classes="form-input",
|
||||
),
|
||||
classes="form-row",
|
||||
),
|
||||
Horizontal(
|
||||
Label("Max Depth:", classes="form-label"),
|
||||
Input(
|
||||
placeholder="5",
|
||||
id="max_depth",
|
||||
validators=[Integer(minimum=1, maximum=20)],
|
||||
classes="form-input",
|
||||
),
|
||||
classes="form-row",
|
||||
),
|
||||
classes="form-section",
|
||||
)
|
||||
|
||||
def get_map_options(self) -> dict[str, Any]:
|
||||
"""Get mapping options from form."""
|
||||
options: dict[str, Any] = {}
|
||||
|
||||
# Discovery settings
|
||||
search_pattern = self.query_one("#search_pattern", Input).value
|
||||
if search_pattern.strip():
|
||||
options["search"] = search_pattern.strip()
|
||||
|
||||
options["include_subdomains"] = self.query_one("#include_subdomains", Switch).value
|
||||
|
||||
# Limits
|
||||
max_pages_input = self.query_one("#max_pages", Input).value
|
||||
if max_pages_input.strip():
|
||||
try:
|
||||
options["limit"] = int(max_pages_input)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
max_depth_input = self.query_one("#max_depth", Input).value
|
||||
if max_depth_input.strip():
|
||||
try:
|
||||
options["max_depth"] = int(max_depth_input)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return options
|
||||
|
||||
def set_map_options(self, options: dict[str, Any]) -> None:
|
||||
"""Set form values from options."""
|
||||
if search := options.get("search"):
|
||||
self.query_one("#search_pattern", Input).value = str(search)
|
||||
|
||||
self.query_one("#include_subdomains", Switch).value = options.get("include_subdomains", False)
|
||||
|
||||
# Set limits
|
||||
limit = options.get("limit")
|
||||
if limit is not None:
|
||||
self.query_one("#max_pages", Input).value = str(limit)
|
||||
|
||||
max_depth = options.get("max_depth")
|
||||
if max_depth is not None:
|
||||
self.query_one("#max_depth", Input).value = str(max_depth)
|
||||
|
||||
|
||||
class ExtractOptionsForm(Container):
|
||||
"""Form for configuring data extraction options."""
|
||||
|
||||
DEFAULT_CSS = """
|
||||
ExtractOptionsForm {
|
||||
border: solid $border;
|
||||
background: $surface;
|
||||
padding: 1;
|
||||
height: auto;
|
||||
}
|
||||
|
||||
ExtractOptionsForm .form-section {
|
||||
margin-bottom: 2;
|
||||
padding: 1;
|
||||
border: solid $border-lighten-1;
|
||||
background: $surface-lighten-1;
|
||||
}
|
||||
|
||||
ExtractOptionsForm .form-row {
|
||||
layout: horizontal;
|
||||
align-items: start;
|
||||
height: auto;
|
||||
margin-bottom: 1;
|
||||
}
|
||||
|
||||
ExtractOptionsForm .form-label {
|
||||
width: 30%;
|
||||
min-width: 15;
|
||||
text-align: right;
|
||||
padding-right: 2;
|
||||
padding-top: 1;
|
||||
}
|
||||
|
||||
ExtractOptionsForm .form-input {
|
||||
width: 70%;
|
||||
}
|
||||
|
||||
ExtractOptionsForm .text-area {
|
||||
height: 6;
|
||||
}
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs: Any) -> None:
|
||||
"""Initialize extract options form."""
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
"""Compose extract options form."""
|
||||
yield Label("🎯 Data Extraction Configuration", classes="form-title")
|
||||
|
||||
# Extraction prompt section
|
||||
yield Container(
|
||||
Label("AI-Powered Extraction", classes="section-title"),
|
||||
Horizontal(
|
||||
Label("Custom Prompt:", classes="form-label"),
|
||||
TextArea(
|
||||
placeholder="Extract product names, prices, and descriptions...",
|
||||
id="extract_prompt",
|
||||
classes="form-input text-area",
|
||||
),
|
||||
classes="form-row",
|
||||
),
|
||||
classes="form-section",
|
||||
)
|
||||
|
||||
# Schema definition section
|
||||
yield Container(
|
||||
Label("Structured Schema (JSON)", classes="section-title"),
|
||||
Horizontal(
|
||||
Label("Schema Definition:", classes="form-label"),
|
||||
TextArea(
|
||||
placeholder='{"product_name": "string", "price": "number", "description": "string"}',
|
||||
id="extract_schema",
|
||||
classes="form-input text-area",
|
||||
),
|
||||
classes="form-row",
|
||||
),
|
||||
Container(
|
||||
Label("💡 Tip: Define the structure of data you want to extract"),
|
||||
classes="help-text",
|
||||
),
|
||||
classes="form-section",
|
||||
)
|
||||
|
||||
# Schema presets
|
||||
yield Container(
|
||||
Label("Quick Presets", classes="section-title"),
|
||||
Horizontal(
|
||||
Button("📄 Article", id="preset_article", variant="default"),
|
||||
Button("🛍️ Product", id="preset_product", variant="default"),
|
||||
Button("👤 Contact", id="preset_contact", variant="default"),
|
||||
Button("📊 Data", id="preset_data", variant="default"),
|
||||
classes="preset-buttons",
|
||||
),
|
||||
classes="form-section",
|
||||
)
|
||||
|
||||
def get_extract_options(self) -> dict[str, Any]:
|
||||
"""Get extraction options from form."""
|
||||
options: dict[str, Any] = {}
|
||||
|
||||
# Extract prompt
|
||||
prompt = self.query_one("#extract_prompt", TextArea).text
|
||||
if prompt.strip():
|
||||
options["extract_prompt"] = prompt.strip()
|
||||
|
||||
# Extract schema
|
||||
schema_text = self.query_one("#extract_schema", TextArea).text
|
||||
if schema_text.strip():
|
||||
try:
|
||||
import json
|
||||
|
||||
schema = json.loads(schema_text)
|
||||
options["extract_schema"] = schema
|
||||
except json.JSONDecodeError:
|
||||
# Invalid JSON, skip schema
|
||||
pass
|
||||
|
||||
return options
|
||||
|
||||
def set_extract_options(self, options: dict[str, Any]) -> None:
|
||||
"""Set form values from options."""
|
||||
if prompt := options.get("extract_prompt"):
|
||||
self.query_one("#extract_prompt", TextArea).text = str(prompt)
|
||||
|
||||
if schema := options.get("extract_schema"):
|
||||
import json
|
||||
|
||||
self.query_one("#extract_schema", TextArea).text = json.dumps(schema, indent=2)
|
||||
|
||||
def on_button_pressed(self, event: Button.Pressed) -> None:
|
||||
"""Handle preset button presses."""
|
||||
schema_widget = self.query_one("#extract_schema", TextArea)
|
||||
prompt_widget = self.query_one("#extract_prompt", TextArea)
|
||||
|
||||
if event.button.id == "preset_article":
|
||||
schema_widget.text = """{
|
||||
"title": "string",
|
||||
"author": "string",
|
||||
"date": "string",
|
||||
"content": "string",
|
||||
"tags": ["string"]
|
||||
}"""
|
||||
prompt_widget.text = "Extract article title, author, publication date, main content, and associated tags"
|
||||
|
||||
elif event.button.id == "preset_product":
|
||||
schema_widget.text = """{
|
||||
"name": "string",
|
||||
"price": "number",
|
||||
"description": "string",
|
||||
"category": "string",
|
||||
"availability": "string"
|
||||
}"""
|
||||
prompt_widget.text = "Extract product name, price, description, category, and availability status"
|
||||
|
||||
elif event.button.id == "preset_contact":
|
||||
schema_widget.text = """{
|
||||
"name": "string",
|
||||
"email": "string",
|
||||
"phone": "string",
|
||||
"company": "string",
|
||||
"position": "string"
|
||||
}"""
|
||||
prompt_widget.text = "Extract contact information including name, email, phone, company, and position"
|
||||
|
||||
elif event.button.id == "preset_data":
|
||||
schema_widget.text = """{
|
||||
"metrics": [{"name": "string", "value": "number", "unit": "string"}],
|
||||
"tables": [{"headers": ["string"], "rows": [["string"]]}]
|
||||
}"""
|
||||
prompt_widget.text = "Extract numerical data, metrics, and tabular information"
|
||||
|
||||
|
||||
class FirecrawlConfigWidget(Container):
|
||||
"""Complete Firecrawl configuration widget with tabbed interface."""
|
||||
|
||||
DEFAULT_CSS = """
|
||||
FirecrawlConfigWidget {
|
||||
border: solid $border;
|
||||
background: $surface;
|
||||
height: 100%;
|
||||
padding: 1;
|
||||
}
|
||||
|
||||
FirecrawlConfigWidget .config-header {
|
||||
dock: top;
|
||||
height: 3;
|
||||
background: $primary;
|
||||
color: $text;
|
||||
padding: 1;
|
||||
margin: -1 -1 1 -1;
|
||||
}
|
||||
|
||||
FirecrawlConfigWidget .tab-buttons {
|
||||
dock: top;
|
||||
height: 3;
|
||||
layout: horizontal;
|
||||
margin-bottom: 1;
|
||||
}
|
||||
|
||||
FirecrawlConfigWidget .tab-button {
|
||||
width: 1fr;
|
||||
margin-right: 1;
|
||||
}
|
||||
|
||||
FirecrawlConfigWidget .tab-content {
|
||||
height: 1fr;
|
||||
overflow: auto;
|
||||
}
|
||||
|
||||
FirecrawlConfigWidget .actions {
|
||||
dock: bottom;
|
||||
height: 3;
|
||||
layout: horizontal;
|
||||
align: center;
|
||||
margin-top: 1;
|
||||
}
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs: Any) -> None:
|
||||
"""Initialize Firecrawl config widget."""
|
||||
super().__init__(**kwargs)
|
||||
self.current_tab = "scrape"
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
"""Compose config widget layout."""
|
||||
yield Container(
|
||||
Label("🔥 Firecrawl Configuration", classes="config-header"),
|
||||
Horizontal(
|
||||
Button("🔧 Scraping", id="tab_scrape", variant="primary", classes="tab-button"),
|
||||
Button("🗺️ Mapping", id="tab_map", variant="default", classes="tab-button"),
|
||||
Button("🎯 Extraction", id="tab_extract", variant="default", classes="tab-button"),
|
||||
classes="tab-buttons",
|
||||
),
|
||||
Container(
|
||||
ScrapeOptionsForm(id="scrape_form"),
|
||||
classes="tab-content",
|
||||
),
|
||||
Horizontal(
|
||||
Button("📋 Load Preset", id="load_preset", variant="default"),
|
||||
Button("💾 Save Preset", id="save_preset", variant="default"),
|
||||
Button("🔄 Reset", id="reset_config", variant="default"),
|
||||
classes="actions",
|
||||
),
|
||||
)
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Initialize widget."""
|
||||
self.show_tab("scrape")
|
||||
|
||||
def show_tab(self, tab_name: str) -> None:
|
||||
"""Show specific configuration tab."""
|
||||
self.current_tab = tab_name
|
||||
|
||||
# Update button states
|
||||
for tab in ["scrape", "map", "extract"]:
|
||||
button = self.query_one(f"#tab_{tab}", Button)
|
||||
button.variant = "primary" if tab == tab_name else "default"
|
||||
# Update tab content
|
||||
content_container = self.query_one(".tab-content", Container)
|
||||
content_container.remove_children()
|
||||
|
||||
if tab_name == "extract":
|
||||
content_container.mount(ExtractOptionsForm(id="extract_form"))
|
||||
elif tab_name == "map":
|
||||
content_container.mount(MapOptionsForm(id="map_form"))
|
||||
elif tab_name == "scrape":
|
||||
content_container.mount(ScrapeOptionsForm(id="scrape_form"))
|
||||
|
||||
def on_button_pressed(self, event: Button.Pressed) -> None:
|
||||
"""Handle button presses."""
|
||||
if event.button.id.startswith("tab_"):
|
||||
tab_name = event.button.id[4:] # Remove "tab_" prefix
|
||||
self.show_tab(tab_name)
|
||||
|
||||
def get_all_options(self) -> FirecrawlOptions:
|
||||
"""Get all configuration options."""
|
||||
options: FirecrawlOptions = {}
|
||||
|
||||
# Try to get options from currently mounted form
|
||||
if self.current_tab == "scrape":
|
||||
try:
|
||||
form = self.query_one("#scrape_form", ScrapeOptionsForm)
|
||||
scrape_opts = form.get_scrape_options()
|
||||
options.update(cast(FirecrawlOptions, scrape_opts))
|
||||
except Exception:
|
||||
pass
|
||||
elif self.current_tab == "map":
|
||||
try:
|
||||
form = self.query_one("#map_form", MapOptionsForm)
|
||||
map_opts = form.get_map_options()
|
||||
options.update(cast(FirecrawlOptions, map_opts))
|
||||
except Exception:
|
||||
pass
|
||||
elif self.current_tab == "extract":
|
||||
try:
|
||||
form = self.query_one("#extract_form", ExtractOptionsForm)
|
||||
extract_opts = form.get_extract_options()
|
||||
options.update(cast(FirecrawlOptions, extract_opts))
|
||||
except Exception:
pass

return options
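get_all_options only reads the form that is currently mounted, so a caller normally selects the tab it cares about right before collecting options. A minimal sketch from a hypothetical parent screen:

config = self.query_one(FirecrawlConfigWidget)
config.show_tab("scrape")
options: FirecrawlOptions = config.get_all_options()
formats = options.get("formats", ["markdown"])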
@@ -24,23 +24,26 @@ class StatusIndicator(Static):
|
||||
# Remove previous status classes
|
||||
self.remove_class("status-active", "status-error", "status-warning", "pulse", "glow")
|
||||
|
||||
if status.lower() in ["active", "online", "connected", "✓ active"]:
|
||||
status_lower = status.lower()
|
||||
|
||||
if (status_lower in {"active", "online", "connected", "✓ active"} or
|
||||
status_lower.endswith("active") or "✓" in status_lower and "active" in status_lower):
|
||||
self.add_class("status-active")
|
||||
self.add_class("glow")
|
||||
self.update("🟢 " + status)
|
||||
elif status.lower() in ["error", "failed", "offline", "disconnected"]:
|
||||
self.update(f"🟢 {status}")
|
||||
elif status_lower in {"error", "failed", "offline", "disconnected"}:
|
||||
self.add_class("status-error")
|
||||
self.add_class("pulse")
|
||||
self.update("🔴 " + status)
|
||||
elif status.lower() in ["warning", "pending", "in_progress"]:
|
||||
self.update(f"🔴 {status}")
|
||||
elif status_lower in {"warning", "pending", "in_progress"}:
|
||||
self.add_class("status-warning")
|
||||
self.add_class("pulse")
|
||||
self.update("🟡 " + status)
|
||||
elif status.lower() in ["loading", "connecting"]:
|
||||
self.update(f"🟡 {status}")
|
||||
elif status_lower in {"loading", "connecting"}:
|
||||
self.add_class("shimmer")
|
||||
self.update("🔄 " + status)
|
||||
self.update(f"🔄 {status}")
|
||||
else:
|
||||
self.update("⚪ " + status)
|
||||
self.update(f"⚪ {status}")
|
||||
|
||||
|
||||
class EnhancedProgressBar(Static):
|
||||
|
||||
509
ingest_pipeline/cli/tui/widgets/r2r_widgets.py
Normal file
@@ -0,0 +1,509 @@
|
||||
"""R2R-specific widgets for chunk viewing and entity visualization."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from textual import work
|
||||
from textual.app import ComposeResult
|
||||
from textual.containers import Container, Horizontal, Vertical, VerticalScroll
|
||||
from textual.widgets import Button, DataTable, Label, Markdown, ProgressBar, Static, Tree
|
||||
from typing_extensions import override
|
||||
|
||||
from ....storage.r2r.storage import R2RStorage
|
||||
from ..models import ChunkInfo, EntityInfo
|
||||
|
||||
|
||||
class ChunkViewer(Container):
|
||||
"""Widget for viewing document chunks with navigation."""
|
||||
|
||||
DEFAULT_CSS = """
|
||||
ChunkViewer {
|
||||
border: solid $border;
|
||||
background: $surface;
|
||||
height: 100%;
|
||||
}
|
||||
|
||||
ChunkViewer .chunk-header {
|
||||
dock: top;
|
||||
height: 3;
|
||||
background: $primary;
|
||||
color: $text;
|
||||
padding: 1;
|
||||
}
|
||||
|
||||
ChunkViewer .chunk-navigation {
|
||||
dock: top;
|
||||
height: 3;
|
||||
background: $surface-lighten-1;
|
||||
padding: 1;
|
||||
}
|
||||
|
||||
ChunkViewer .chunk-content {
|
||||
height: 1fr;
|
||||
padding: 1;
|
||||
overflow: auto;
|
||||
}
|
||||
|
||||
ChunkViewer .chunk-footer {
|
||||
dock: bottom;
|
||||
height: 3;
|
||||
background: $surface-darken-1;
|
||||
padding: 1;
|
||||
}
|
||||
"""
|
||||
|
||||
def __init__(self, r2r_storage: R2RStorage, document_id: str, **kwargs: Any) -> None:
|
||||
"""Initialize chunk viewer."""
|
||||
super().__init__(**kwargs)
|
||||
self.r2r_storage = r2r_storage
|
||||
self.document_id = document_id
|
||||
self.chunks: list[ChunkInfo] = []
|
||||
self.current_chunk_index = 0
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
"""Compose chunk viewer layout."""
|
||||
yield Container(
|
||||
Static("📄 Document Chunks", classes="chunk-header"),
|
||||
Horizontal(
|
||||
Button("◀ Previous", id="prev_chunk", variant="default"),
|
||||
Static("Chunk 1 of 1", id="chunk_info"),
|
||||
Button("Next ▶", id="next_chunk", variant="default"),
|
||||
classes="chunk-navigation",
|
||||
),
|
||||
VerticalScroll(
|
||||
Markdown("", id="chunk_content"),
|
||||
classes="chunk-content",
|
||||
),
|
||||
Container(
|
||||
Static("Loading chunks...", id="chunk_status"),
|
||||
classes="chunk-footer",
|
||||
),
|
||||
)
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Initialize chunk viewer."""
|
||||
self.load_chunks()
|
||||
|
||||
@work(exclusive=True)
|
||||
async def load_chunks(self) -> None:
|
||||
"""Load document chunks."""
|
||||
try:
|
||||
chunks_data = await self.r2r_storage.get_document_chunks(self.document_id)
|
||||
self.chunks = []
|
||||
|
||||
for chunk_data in chunks_data:
|
||||
chunk_info: ChunkInfo = {
|
||||
"id": str(chunk_data.get("id", "")),
|
||||
"document_id": self.document_id,
|
||||
"content": str(chunk_data.get("text", "")),
|
||||
"start_index": int(chunk_data.get("start_index", 0)),
|
||||
"end_index": int(chunk_data.get("end_index", 0)),
|
||||
"metadata": dict(chunk_data.get("metadata", {})),
|
||||
}
|
||||
self.chunks.append(chunk_info)
|
||||
|
||||
if self.chunks:
|
||||
self.current_chunk_index = 0
|
||||
self.update_chunk_display()
|
||||
else:
|
||||
self.query_one("#chunk_status", Static).update("No chunks found")
|
||||
|
||||
except Exception as e:
|
||||
self.query_one("#chunk_status", Static).update(f"Error loading chunks: {e}")
|
||||
|
||||
def update_chunk_display(self) -> None:
|
||||
"""Update chunk display with current chunk."""
|
||||
if not self.chunks:
|
||||
return
|
||||
|
||||
chunk = self.chunks[self.current_chunk_index]
|
||||
|
||||
# Update content
|
||||
content_widget = self.query_one("#chunk_content", Markdown)
|
||||
content_widget.update(chunk["content"])
|
||||
|
||||
# Update navigation info
|
||||
chunk_info = self.query_one("#chunk_info", Static)
|
||||
chunk_info.update(f"Chunk {self.current_chunk_index + 1} of {len(self.chunks)}")
|
||||
|
||||
# Update status
|
||||
status_widget = self.query_one("#chunk_status", Static)
|
||||
status_widget.update(
|
||||
f"Chunk {chunk['id']} | "
|
||||
f"Range: {chunk['start_index']}-{chunk['end_index']} | "
|
||||
f"Length: {len(chunk['content'])} chars"
|
||||
)
|
||||
|
||||
# Update button states
|
||||
prev_btn = self.query_one("#prev_chunk", Button)
|
||||
next_btn = self.query_one("#next_chunk", Button)
|
||||
prev_btn.disabled = self.current_chunk_index == 0
|
||||
next_btn.disabled = self.current_chunk_index >= len(self.chunks) - 1
|
||||
|
||||
def on_button_pressed(self, event: Button.Pressed) -> None:
|
||||
"""Handle button presses."""
|
||||
if event.button.id == "prev_chunk" and self.current_chunk_index > 0:
|
||||
self.current_chunk_index -= 1
|
||||
self.update_chunk_display()
|
||||
elif event.button.id == "next_chunk" and self.current_chunk_index < len(self.chunks) - 1:
|
||||
self.current_chunk_index += 1
|
||||
self.update_chunk_display()
|
||||
|
||||
|
||||
class EntityGraph(Container):
|
||||
"""Widget for visualizing extracted entities and relationships."""
|
||||
|
||||
DEFAULT_CSS = """
|
||||
EntityGraph {
|
||||
border: solid $border;
|
||||
background: $surface;
|
||||
height: 100%;
|
||||
}
|
||||
|
||||
EntityGraph .entity-header {
|
||||
dock: top;
|
||||
height: 3;
|
||||
background: $primary;
|
||||
color: $text;
|
||||
padding: 1;
|
||||
}
|
||||
|
||||
EntityGraph .entity-tree {
|
||||
height: 1fr;
|
||||
overflow: auto;
|
||||
}
|
||||
|
||||
EntityGraph .entity-details {
|
||||
dock: bottom;
|
||||
height: 8;
|
||||
background: $surface-lighten-1;
|
||||
padding: 1;
|
||||
border-top: solid $border;
|
||||
}
|
||||
"""
|
||||
|
||||
def __init__(self, r2r_storage: R2RStorage, document_id: str, **kwargs: Any) -> None:
|
||||
"""Initialize entity graph."""
|
||||
super().__init__(**kwargs)
|
||||
self.r2r_storage = r2r_storage
|
||||
self.document_id = document_id
|
||||
self.entities: list[EntityInfo] = []
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
"""Compose entity graph layout."""
|
||||
yield Container(
|
||||
Static("🕸️ Entity Graph", classes="entity-header"),
|
||||
Tree("Entities", id="entity_tree", classes="entity-tree"),
|
||||
VerticalScroll(
|
||||
Label("Entity Details"),
|
||||
Static("Select an entity to view details", id="entity_details"),
|
||||
classes="entity-details",
|
||||
),
|
||||
)
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Initialize entity graph."""
|
||||
self.load_entities()
|
||||
|
||||
@work(exclusive=True)
|
||||
async def load_entities(self) -> None:
|
||||
"""Load entities from document."""
|
||||
try:
|
||||
entities_data = await self.r2r_storage.extract_entities(self.document_id)
|
||||
self.entities = []
|
||||
|
||||
# Parse entities from R2R response
|
||||
entities_list = entities_data.get("entities", [])
|
||||
for entity_data in entities_list:
|
||||
entity_info: EntityInfo = {
|
||||
"id": str(entity_data.get("id", "")),
|
||||
"name": str(entity_data.get("name", "")),
|
||||
"type": str(entity_data.get("type", "unknown")),
|
||||
"confidence": float(entity_data.get("confidence", 0.0)),
|
||||
"metadata": dict(entity_data.get("metadata", {})),
|
||||
}
|
||||
self.entities.append(entity_info)
|
||||
|
||||
self.populate_entity_tree()
|
||||
|
||||
except Exception as e:
|
||||
details_widget = self.query_one("#entity_details", Static)
|
||||
details_widget.update(f"Error loading entities: {e}")
|
||||
|
||||
def populate_entity_tree(self) -> None:
|
||||
"""Populate the entity tree."""
|
||||
tree = self.query_one("#entity_tree", Tree)
|
||||
tree.clear()
|
||||
|
||||
if not self.entities:
|
||||
tree.root.add_leaf("No entities found")
|
||||
return
|
||||
|
||||
# Group entities by type
|
||||
entities_by_type: dict[str, list[EntityInfo]] = {}
|
||||
for entity in self.entities:
|
||||
entity_type = entity["type"]
|
||||
if entity_type not in entities_by_type:
|
||||
entities_by_type[entity_type] = []
|
||||
entities_by_type[entity_type].append(entity)
|
||||
|
||||
# Add entities to tree grouped by type
|
||||
for entity_type, type_entities in entities_by_type.items():
|
||||
type_node = tree.root.add(f"{entity_type.title()} ({len(type_entities)})")
|
||||
for entity in type_entities:
|
||||
confidence_pct = int(entity["confidence"] * 100)
|
||||
entity_node = type_node.add_leaf(f"{entity['name']} ({confidence_pct}%)")
|
||||
entity_node.data = entity
|
||||
|
||||
tree.root.expand()
|
||||
|
||||
def on_tree_node_selected(self, event: Tree.NodeSelected) -> None:
|
||||
"""Handle entity selection."""
|
||||
if hasattr(event.node, "data") and event.node.data:
|
||||
entity = event.node.data
|
||||
self.show_entity_details(entity)
|
||||
|
||||
def show_entity_details(self, entity: EntityInfo) -> None:
|
||||
"""Show detailed information about an entity."""
|
||||
details_widget = self.query_one("#entity_details", Static)
|
||||
|
||||
details_text = f"""**Entity:** {entity['name']}
|
||||
**Type:** {entity['type']}
|
||||
**Confidence:** {entity['confidence']:.2%}
|
||||
**ID:** {entity['id']}
|
||||
|
||||
**Metadata:**
|
||||
"""
|
||||
for key, value in entity["metadata"].items():
|
||||
details_text += f"- **{key}:** {value}\n"
|
||||
|
||||
details_widget.update(details_text)
|
||||
|
||||
|
||||
class CollectionStats(Container):
|
||||
"""Widget for showing R2R-specific collection statistics."""
|
||||
|
||||
DEFAULT_CSS = """
|
||||
CollectionStats {
|
||||
border: solid $border;
|
||||
background: $surface;
|
||||
height: 100%;
|
||||
padding: 1;
|
||||
}
|
||||
|
||||
CollectionStats .stats-header {
|
||||
dock: top;
|
||||
height: 3;
|
||||
background: $primary;
|
||||
color: $text;
|
||||
padding: 1;
|
||||
margin: -1 -1 1 -1;
|
||||
}
|
||||
|
||||
CollectionStats .stats-grid {
|
||||
layout: grid;
|
||||
grid-size: 2;
|
||||
grid-columns: 1fr 1fr;
|
||||
grid-gutter: 1;
|
||||
height: auto;
|
||||
}
|
||||
|
||||
CollectionStats .stat-card {
|
||||
background: $surface-lighten-1;
|
||||
border: solid $border;
|
||||
padding: 1;
|
||||
height: auto;
|
||||
}
|
||||
|
||||
CollectionStats .stat-value {
|
||||
color: $primary;
|
||||
text-style: bold;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
CollectionStats .stat-label {
|
||||
color: $text-muted;
|
||||
text-align: center;
|
||||
margin-top: 1;
|
||||
}
|
||||
|
||||
CollectionStats .progress-section {
|
||||
margin-top: 2;
|
||||
}
|
||||
"""
|
||||
|
||||
def __init__(self, r2r_storage: R2RStorage, collection_name: str, **kwargs: Any) -> None:
|
||||
"""Initialize collection stats."""
|
||||
super().__init__(**kwargs)
|
||||
self.r2r_storage = r2r_storage
|
||||
self.collection_name = collection_name
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
"""Compose stats layout."""
|
||||
yield Container(
|
||||
Static(f"📊 {self.collection_name} Statistics", classes="stats-header"),
|
||||
Container(
|
||||
Container(
|
||||
Static("0", id="document_count", classes="stat-value"),
|
||||
Static("Documents", classes="stat-label"),
|
||||
classes="stat-card",
|
||||
),
|
||||
Container(
|
||||
Static("0", id="chunk_count", classes="stat-value"),
|
||||
Static("Chunks", classes="stat-label"),
|
||||
classes="stat-card",
|
||||
),
|
||||
Container(
|
||||
Static("0", id="entity_count", classes="stat-value"),
|
||||
Static("Entities", classes="stat-label"),
|
||||
classes="stat-card",
|
||||
),
|
||||
Container(
|
||||
Static("0 MB", id="storage_size", classes="stat-value"),
|
||||
Static("Storage Used", classes="stat-label"),
|
||||
classes="stat-card",
|
||||
),
|
||||
classes="stats-grid",
|
||||
),
|
||||
Container(
|
||||
Label("Processing Progress"),
|
||||
ProgressBar(id="processing_progress", total=100, show_eta=False),
|
||||
Static("Idle", id="processing_status"),
|
||||
classes="progress-section",
|
||||
),
|
||||
)
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Initialize stats display."""
|
||||
self.refresh_stats()
|
||||
|
||||
@work(exclusive=True)
|
||||
async def refresh_stats(self) -> None:
|
||||
"""Refresh collection statistics."""
|
||||
try:
|
||||
# Get basic document count
|
||||
doc_count = await self.r2r_storage.count(collection_name=self.collection_name)
|
||||
self.query_one("#document_count", Static).update(str(doc_count))
|
||||
|
||||
# Estimate other stats (these would need real implementation)
|
||||
estimated_chunks = doc_count * 5 # Rough estimate
|
||||
estimated_entities = doc_count * 10 # Rough estimate
|
||||
estimated_size_mb = doc_count * 0.05 # Rough estimate
|
||||
|
||||
self.query_one("#chunk_count", Static).update(str(estimated_chunks))
|
||||
self.query_one("#entity_count", Static).update(str(estimated_entities))
|
||||
self.query_one("#storage_size", Static).update(f"{estimated_size_mb:.1f} MB")
|
||||
|
||||
# Update progress (would be real-time in actual implementation)
|
||||
progress_bar = self.query_one("#processing_progress", ProgressBar)
|
||||
progress_bar.progress = 100 # Assume complete for now
|
||||
|
||||
status_widget = self.query_one("#processing_status", Static)
|
||||
status_widget.update("All documents processed")
|
||||
|
||||
except Exception as e:
|
||||
self.query_one("#processing_status", Static).update(f"Error: {e}")
|
||||
|
||||
|
||||
class DocumentOverview(Container):
|
||||
"""Widget for comprehensive document overview and statistics."""
|
||||
|
||||
DEFAULT_CSS = """
|
||||
DocumentOverview {
|
||||
layout: vertical;
|
||||
height: 100%;
|
||||
}
|
||||
|
||||
DocumentOverview .overview-header {
|
||||
dock: top;
|
||||
height: 3;
|
||||
background: $primary;
|
||||
color: $text;
|
||||
padding: 1;
|
||||
}
|
||||
|
||||
DocumentOverview .overview-content {
|
||||
height: 1fr;
|
||||
layout: horizontal;
|
||||
}
|
||||
|
||||
DocumentOverview .overview-left {
|
||||
width: 50%;
|
||||
padding: 1;
|
||||
}
|
||||
|
||||
DocumentOverview .overview-right {
|
||||
width: 50%;
|
||||
padding: 1;
|
||||
}
|
||||
|
||||
DocumentOverview .info-table {
|
||||
height: auto;
|
||||
margin-bottom: 2;
|
||||
}
|
||||
"""
|
||||
|
||||
def __init__(self, r2r_storage: R2RStorage, document_id: str, **kwargs: Any) -> None:
|
||||
"""Initialize document overview."""
|
||||
super().__init__(**kwargs)
|
||||
self.r2r_storage = r2r_storage
|
||||
self.document_id = document_id
|
||||
|
||||
@override
|
||||
def compose(self) -> ComposeResult:
|
||||
"""Compose overview layout."""
|
||||
yield Container(
|
||||
Static("📋 Document Overview", classes="overview-header"),
|
||||
Horizontal(
|
||||
Vertical(
|
||||
Label("Document Information"),
|
||||
DataTable[str](id="doc_info_table", classes="info-table"),
|
||||
Label("Processing Statistics"),
|
||||
DataTable[str](id="stats_table", classes="info-table"),
|
||||
classes="overview-left",
|
||||
),
|
||||
Vertical(
|
||||
ChunkViewer(self.r2r_storage, self.document_id),
|
||||
classes="overview-right",
|
||||
),
|
||||
classes="overview-content",
|
||||
),
|
||||
)
|
||||
|
||||
def on_mount(self) -> None:
|
||||
"""Initialize overview."""
|
||||
self.load_overview()
|
||||
|
||||
@work(exclusive=True)
|
||||
async def load_overview(self) -> None:
|
||||
"""Load comprehensive document overview."""
|
||||
try:
|
||||
overview_data = await self.r2r_storage.get_document_overview(self.document_id)
|
||||
|
||||
# Populate document info table
|
||||
doc_table = self.query_one("#doc_info_table", DataTable)
|
||||
doc_table.add_columns("Property", "Value")
|
||||
|
||||
document_info = overview_data.get("document", {})
|
||||
doc_table.add_row("ID", str(document_info.get("id", "N/A")))
|
||||
doc_table.add_row("Title", str(document_info.get("title", "N/A")))
|
||||
doc_table.add_row("Created", str(document_info.get("created_at", "N/A")))
|
||||
doc_table.add_row("Modified", str(document_info.get("updated_at", "N/A")))
|
||||
|
||||
# Populate stats table
|
||||
stats_table = self.query_one("#stats_table", DataTable)
|
||||
stats_table.add_columns("Metric", "Count")
|
||||
|
||||
chunk_count = overview_data.get("chunk_count", 0)
|
||||
stats_table.add_row("Chunks", str(chunk_count))
|
||||
stats_table.add_row("Characters", str(len(str(document_info.get("content", "")))))
|
||||
|
||||
except Exception as e:
|
||||
# Handle error by showing minimal info
|
||||
doc_table = self.query_one("#doc_info_table", DataTable)
|
||||
doc_table.add_columns("Property", "Value")
|
||||
doc_table.add_row("Error", str(e))
|
||||
@@ -1,5 +1,53 @@
|
||||
"""Configuration management."""
|
||||
"""Configuration management utilities."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from contextlib import ExitStack
|
||||
|
||||
from prefect.settings import (
|
||||
PREFECT_API_KEY,
|
||||
PREFECT_API_URL,
|
||||
PREFECT_DEFAULT_WORK_POOL_NAME,
|
||||
Setting,
|
||||
temporary_settings,
|
||||
)
|
||||
|
||||
from .settings import Settings, get_settings
|
||||
|
||||
__all__ = ["Settings", "get_settings"]
|
||||
__all__ = ["Settings", "get_settings", "configure_prefect"]
|
||||
|
||||
_prefect_settings_stack: ExitStack | None = None
|
||||
|
||||
|
||||
def configure_prefect(settings: Settings) -> None:
|
||||
"""Apply Prefect settings from the application configuration."""
|
||||
global _prefect_settings_stack
|
||||
|
||||
overrides: dict[Setting, str] = {}
|
||||
|
||||
if settings.prefect_api_url is not None:
|
||||
overrides[PREFECT_API_URL] = str(settings.prefect_api_url)
|
||||
if settings.prefect_api_key:
|
||||
overrides[PREFECT_API_KEY] = settings.prefect_api_key
|
||||
if settings.prefect_work_pool:
|
||||
overrides[PREFECT_DEFAULT_WORK_POOL_NAME] = settings.prefect_work_pool
|
||||
|
||||
if not overrides:
|
||||
return
|
||||
|
||||
filtered_overrides = {
|
||||
setting: value
|
||||
for setting, value in overrides.items()
|
||||
if setting.value() != value
|
||||
}
|
||||
|
||||
if not filtered_overrides:
|
||||
return
|
||||
|
||||
new_stack = ExitStack()
|
||||
new_stack.enter_context(temporary_settings(updates=filtered_overrides))
|
||||
|
||||
if _prefect_settings_stack is not None:
|
||||
_prefect_settings_stack.close()
|
||||
|
||||
_prefect_settings_stack = new_stack
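A minimal call site for configure_prefect might look like the following; the import path is assumed from the package layout referenced elsewhere in this commit and is not part of the diff itself:

# Hypothetical startup hook: applies only the Prefect overrides that differ from
# the current values, and swaps out any previously applied settings stack.
from ingest_pipeline.config import configure_prefect, get_settings

settings = get_settings()
configure_prefect(settings)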
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -1,9 +1,9 @@
|
||||
"""Application settings and configuration."""
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import Literal
|
||||
from typing import Annotated, Literal
|
||||
|
||||
from pydantic import Field, HttpUrl
|
||||
from pydantic import Field, HttpUrl, model_validator
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
@@ -21,25 +21,27 @@ class Settings(BaseSettings):
|
||||
firecrawl_api_key: str | None = None
|
||||
openwebui_api_key: str | None = None
|
||||
weaviate_api_key: str | None = None
|
||||
r2r_api_key: str | None = None
|
||||
|
||||
# Endpoints
|
||||
llm_endpoint: HttpUrl = HttpUrl("http://llm.lab")
|
||||
weaviate_endpoint: HttpUrl = HttpUrl("http://weaviate.yo")
|
||||
openwebui_endpoint: HttpUrl = HttpUrl("http://chat.lab") # This will be the API URL
|
||||
firecrawl_endpoint: HttpUrl = HttpUrl("http://crawl.lab:30002")
|
||||
r2r_endpoint: HttpUrl | None = Field(default=None, alias="r2r_api_url")
|
||||
|
||||
# Model Configuration
|
||||
embedding_model: str = "ollama/bge-m3:latest"
|
||||
embedding_dimension: int = 1024
|
||||
|
||||
# Ingestion Settings
|
||||
default_batch_size: int = Field(default=50, gt=0, le=500)
|
||||
default_batch_size: Annotated[int, Field(gt=0, le=500)] = 50
|
||||
max_file_size: int = 1_000_000
|
||||
max_crawl_depth: int = Field(default=5, ge=1, le=20)
|
||||
max_crawl_pages: int = Field(default=100, ge=1, le=1000)
|
||||
max_crawl_depth: Annotated[int, Field(ge=1, le=20)] = 5
|
||||
max_crawl_pages: Annotated[int, Field(ge=1, le=1000)] = 100
|
||||
|
||||
# Storage Settings
|
||||
default_storage_backend: Literal["weaviate", "open_webui"] = "weaviate"
|
||||
default_storage_backend: Literal["weaviate", "open_webui", "r2r"] = "weaviate"
|
||||
default_collection_prefix: str = "docs"
|
||||
|
||||
# Prefect Settings
|
||||
@@ -48,11 +50,11 @@ class Settings(BaseSettings):
|
||||
prefect_work_pool: str = "default"
|
||||
|
||||
# Scheduling Defaults
|
||||
default_schedule_interval: int = Field(default=60, ge=1, le=10080) # Max 1 week
|
||||
default_schedule_interval: Annotated[int, Field(ge=1, le=10080)] = 60 # Max 1 week
|
||||
|
||||
# Performance Settings
|
||||
max_concurrent_tasks: int = Field(default=5, ge=1, le=20)
|
||||
request_timeout: int = Field(default=60, ge=10, le=300)
|
||||
max_concurrent_tasks: Annotated[int, Field(ge=1, le=20)] = 5
|
||||
request_timeout: Annotated[int, Field(ge=10, le=300)] = 60
|
||||
|
||||
# Logging
|
||||
log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR"] = "INFO"
|
||||
@@ -66,13 +68,26 @@ class Settings(BaseSettings):
|
||||
|
||||
Returns:
|
||||
Endpoint URL
|
||||
|
||||
Raises:
|
||||
ValueError: If backend is unknown or R2R endpoint not configured
|
||||
"""
|
||||
if backend == "weaviate":
|
||||
return self.weaviate_endpoint
|
||||
elif backend == "open_webui":
|
||||
return self.openwebui_endpoint
|
||||
endpoints = {
|
||||
"weaviate": self.weaviate_endpoint,
|
||||
"open_webui": self.openwebui_endpoint,
|
||||
}
|
||||
|
||||
if backend in endpoints:
|
||||
return endpoints[backend]
|
||||
elif backend == "r2r":
|
||||
if not self.r2r_endpoint:
|
||||
raise ValueError(
|
||||
"R2R_API_URL must be set in environment variables. "
|
||||
"This should have been caught during settings validation."
|
||||
)
|
||||
return self.r2r_endpoint
|
||||
else:
|
||||
raise ValueError(f"Unknown backend: {backend}")
|
||||
raise ValueError(f"Unknown backend: {backend}. Supported: weaviate, open_webui, r2r")
|
||||
|
||||
def get_api_key(self, service: str) -> str | None:
|
||||
"""
|
||||
@@ -88,9 +103,40 @@ class Settings(BaseSettings):
|
||||
"firecrawl": self.firecrawl_api_key,
|
||||
"openwebui": self.openwebui_api_key,
|
||||
"weaviate": self.weaviate_api_key,
|
||||
"r2r": self.r2r_api_key,
|
||||
}
|
||||
return service_map.get(service)
|
||||
|
||||
@model_validator(mode="after")
|
||||
def validate_backend_configuration(self) -> "Settings":
|
||||
"""Validate that required configuration is present for the default backend."""
|
||||
backend = self.default_storage_backend
|
||||
|
||||
# Validate R2R backend configuration
|
||||
if backend == "r2r" and not self.r2r_endpoint:
|
||||
raise ValueError(
|
||||
"R2R_API_URL must be set in environment variables when using R2R as default backend"
|
||||
)
|
||||
|
||||
# Validate API key requirements (optional warning for missing keys)
|
||||
required_keys = {
|
||||
"weaviate": ("WEAVIATE_API_KEY", self.weaviate_api_key),
|
||||
"open_webui": ("OPENWEBUI_API_KEY", self.openwebui_api_key),
|
||||
"r2r": ("R2R_API_KEY", self.r2r_api_key),
|
||||
}
|
||||
|
||||
if backend in required_keys:
|
||||
key_name, key_value = required_keys[backend]
|
||||
if not key_value:
|
||||
import warnings
|
||||
warnings.warn(
|
||||
f"{key_name} not set - authentication may fail for {backend} backend",
|
||||
UserWarning,
|
||||
stacklevel=2
|
||||
)
|
||||
|
||||
return self
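The Annotated[int, Field(...)] style used for the constrained fields above behaves the same as the older Field(default=..., ...) form; a standalone sketch with a toy model (not part of this commit):

# Constrained field with a plain default; out-of-range values are rejected at construction.
from typing import Annotated

from pydantic import BaseModel, Field, ValidationError


class BatchLimits(BaseModel):
    batch_size: Annotated[int, Field(gt=0, le=500)] = 50


print(BatchLimits().batch_size)                 # 50
print(BatchLimits(batch_size=200).batch_size)   # 200
try:
    BatchLimits(batch_size=0)                   # violates gt=0
except ValidationError as exc:
    print(exc.errors()[0]["msg"])               # e.g. "Input should be greater than 0"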
|
||||
|
||||
|
||||
@lru_cache
|
||||
def get_settings() -> Settings:
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,9 +1,8 @@
|
||||
"""Core data models with strict typing."""
|
||||
|
||||
from collections.abc import Callable
|
||||
from datetime import UTC, datetime
|
||||
from enum import Enum
|
||||
from typing import TypedDict
|
||||
from typing import Annotated, TypedDict
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from pydantic import BaseModel, Field, HttpUrl
|
||||
@@ -25,6 +24,7 @@ class StorageBackend(str, Enum):
|
||||
|
||||
WEAVIATE = "weaviate"
|
||||
OPEN_WEBUI = "open_webui"
|
||||
R2R = "r2r"
|
||||
|
||||
|
||||
class IngestionSource(str, Enum):
|
||||
@@ -41,7 +41,7 @@ class VectorConfig(BaseModel):
|
||||
model: str = Field(default="ollama/bge-m3:latest")
|
||||
embedding_endpoint: HttpUrl = Field(default=HttpUrl("http://llm.lab"))
|
||||
dimension: int = Field(default=1024)
|
||||
batch_size: int = Field(default=100, gt=0, le=1000)
|
||||
batch_size: Annotated[int, Field(gt=0, le=1000)] = 100
|
||||
|
||||
|
||||
class StorageConfig(BaseModel):
|
||||
@@ -51,15 +51,15 @@ class StorageConfig(BaseModel):
|
||||
endpoint: HttpUrl
|
||||
api_key: str | None = Field(default=None)
|
||||
collection_name: str = Field(default="documents")
|
||||
batch_size: int = Field(default=100, gt=0, le=1000)
|
||||
batch_size: Annotated[int, Field(gt=0, le=1000)] = 100
|
||||
|
||||
|
||||
class FirecrawlConfig(BaseModel):
|
||||
"""Configuration for Firecrawl ingestion (operational parameters only)."""
|
||||
|
||||
formats: list[str] = Field(default_factory=lambda: ["markdown", "html"])
|
||||
max_depth: int = Field(default=5, ge=1, le=20)
|
||||
limit: int = Field(default=100, ge=1, le=1000)
|
||||
max_depth: Annotated[int, Field(ge=1, le=20)] = 5
|
||||
limit: Annotated[int, Field(ge=1, le=1000)] = 100
|
||||
only_main_content: bool = Field(default=True)
|
||||
include_subdomains: bool = Field(default=False)
|
||||
|
||||
@@ -77,18 +77,71 @@ class RepomixConfig(BaseModel):
|
||||
respect_gitignore: bool = Field(default=True)
|
||||
|
||||
|
||||
class DocumentMetadata(TypedDict):
|
||||
"""Metadata for a document."""
|
||||
class R2RConfig(BaseModel):
|
||||
"""Configuration for R2R ingestion."""
|
||||
|
||||
chunk_size: Annotated[int, Field(ge=100, le=8192)] = 1000
|
||||
chunk_overlap: Annotated[int, Field(ge=0, le=1000)] = 200
|
||||
enable_graph_enrichment: bool = Field(default=False)
|
||||
graph_creation_settings: dict[str, object] | None = Field(default=None)
|
||||
|
||||
|
||||
class DocumentMetadataRequired(TypedDict):
|
||||
"""Required metadata fields for a document."""
|
||||
source_url: str
|
||||
title: str | None
|
||||
description: str | None
|
||||
timestamp: datetime
|
||||
content_type: str
|
||||
word_count: int
|
||||
char_count: int
|
||||
|
||||
|
||||
class DocumentMetadata(DocumentMetadataRequired, total=False):
|
||||
"""Rich metadata for a document with R2R-compatible fields."""
|
||||
|
||||
# Basic optional fields
|
||||
title: str | None
|
||||
description: str | None
|
||||
|
||||
# Content categorization
|
||||
tags: list[str]
|
||||
category: str
|
||||
section: str
|
||||
language: str
|
||||
|
||||
# Authorship and source info
|
||||
author: str
|
||||
domain: str
|
||||
site_name: str
|
||||
|
||||
# Document structure
|
||||
heading_hierarchy: list[str]
|
||||
section_depth: int
|
||||
has_code_blocks: bool
|
||||
has_images: bool
|
||||
has_links: bool
|
||||
|
||||
# Processing metadata
|
||||
extraction_method: str
|
||||
crawl_depth: int
|
||||
last_modified: datetime | None
|
||||
|
||||
# Content quality indicators
|
||||
readability_score: float | None
|
||||
completeness_score: float | None
|
||||
|
||||
# Repository-specific fields
|
||||
file_path: str | None
|
||||
repository_name: str | None
|
||||
branch_name: str | None
|
||||
commit_hash: str | None
|
||||
programming_language: str | None
|
||||
|
||||
# Custom business metadata
|
||||
importance_score: float | None
|
||||
review_status: str | None
|
||||
assigned_team: str | None
|
||||
|
||||
|
||||
class Document(BaseModel):
|
||||
"""Represents a single document."""
|
||||
|
||||
@@ -96,17 +149,10 @@ class Document(BaseModel):
|
||||
content: str
|
||||
metadata: DocumentMetadata
|
||||
vector: list[float] | None = Field(default=None)
|
||||
score: float | None = Field(default=None)
|
||||
source: IngestionSource
|
||||
collection: str = Field(default="documents")
|
||||
|
||||
class Config:
|
||||
"""Pydantic configuration."""
|
||||
|
||||
json_encoders: dict[type, Callable[[UUID | datetime], str]] = {
|
||||
UUID: lambda v: str(v) if isinstance(v, UUID) else str(v),
|
||||
datetime: lambda v: v.isoformat() if isinstance(v, datetime) else str(v),
|
||||
}
|
||||
|
||||
|
||||
class IngestionJob(BaseModel):
|
||||
"""Represents an ingestion job."""
|
||||
@@ -122,14 +168,6 @@ class IngestionJob(BaseModel):
|
||||
document_count: int = Field(default=0)
|
||||
storage_backend: StorageBackend
|
||||
|
||||
class Config:
|
||||
"""Pydantic configuration."""
|
||||
|
||||
json_encoders: dict[type, Callable[[UUID | datetime], str]] = {
|
||||
UUID: lambda v: str(v) if isinstance(v, UUID) else str(v),
|
||||
datetime: lambda v: v.isoformat() if isinstance(v, datetime) else str(v),
|
||||
}
|
||||
|
||||
|
||||
class IngestionResult(BaseModel):
|
||||
"""Result of an ingestion operation."""
|
||||
@@ -140,10 +178,3 @@ class IngestionResult(BaseModel):
|
||||
documents_failed: int
|
||||
duration_seconds: float
|
||||
error_messages: list[str] = Field(default_factory=list)
|
||||
|
||||
class Config:
|
||||
"""Pydantic configuration."""
|
||||
|
||||
json_encoders: dict[type, Callable[[UUID], str]] = {
|
||||
UUID: lambda v: str(v),
|
||||
}
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,12 +1,19 @@
|
||||
"""Prefect flow for ingestion pipeline."""
|
||||
|
||||
from datetime import UTC, datetime
|
||||
from typing import Literal
|
||||
from __future__ import annotations
|
||||
|
||||
from prefect import flow, task
|
||||
from collections.abc import Callable
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from typing import TYPE_CHECKING, Literal, assert_never, cast
|
||||
|
||||
from prefect import flow, get_run_logger, task
|
||||
from prefect.cache_policies import NO_CACHE
|
||||
from prefect.futures import wait
|
||||
|
||||
from ..config.settings import Settings
|
||||
from ..core.exceptions import IngestionError
|
||||
from ..core.models import (
|
||||
Document,
|
||||
FirecrawlConfig,
|
||||
IngestionJob,
|
||||
IngestionResult,
|
||||
@@ -16,9 +23,22 @@ from ..core.models import (
|
||||
StorageBackend,
|
||||
StorageConfig,
|
||||
)
|
||||
from ..ingestors import FirecrawlIngestor, RepomixIngestor
|
||||
from ..ingestors import BaseIngestor, FirecrawlIngestor, FirecrawlPage, RepomixIngestor
|
||||
from ..storage import OpenWebUIStorage, WeaviateStorage
|
||||
from ..storage import R2RStorage as RuntimeR2RStorage
|
||||
from ..storage.base import BaseStorage
|
||||
from ..utils.metadata_tagger import MetadataTagger
|
||||
|
||||
SourceTypeLiteral = Literal["web", "repository", "documentation"]
|
||||
StorageBackendLiteral = Literal["weaviate", "open_webui", "r2r"]
|
||||
SourceTypeLike = IngestionSource | SourceTypeLiteral
|
||||
StorageBackendLike = StorageBackend | StorageBackendLiteral
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..storage.r2r.storage import R2RStorage as R2RStorageType
|
||||
else:
|
||||
R2RStorageType = BaseStorage
|
||||
|
||||
|
||||
@task(name="validate_source", retries=2, retry_delay_seconds=10, tags=["validation"])
|
||||
@@ -59,6 +79,10 @@ async def initialize_storage_task(config: StorageConfig) -> BaseStorage:
|
||||
storage = WeaviateStorage(config)
|
||||
elif config.backend == StorageBackend.OPEN_WEBUI:
|
||||
storage = OpenWebUIStorage(config)
|
||||
elif config.backend == StorageBackend.R2R:
|
||||
if RuntimeR2RStorage is None:
|
||||
raise ValueError("R2R storage not available. Check dependencies.")
|
||||
storage = RuntimeR2RStorage(config)
|
||||
else:
|
||||
raise ValueError(f"Unsupported backend: {config.backend}")
|
||||
|
||||
@@ -66,89 +90,402 @@ async def initialize_storage_task(config: StorageConfig) -> BaseStorage:
|
||||
return storage
|
||||
|
||||
|
||||
@task(name="ingest_documents", retries=2, retry_delay_seconds=30, tags=["ingestion"])
|
||||
async def ingest_documents_task(job: IngestionJob, collection_name: str | None = None, batch_size: int = 50) -> tuple[int, int]:
|
||||
@task(name="map_firecrawl_site", retries=2, retry_delay_seconds=15, tags=["firecrawl", "map"])
|
||||
async def map_firecrawl_site_task(source_url: str, config: FirecrawlConfig) -> list[str]:
|
||||
"""Map a site using Firecrawl and return discovered URLs."""
|
||||
ingestor = FirecrawlIngestor(config)
|
||||
mapped = await ingestor.map_site(source_url)
|
||||
return mapped or [source_url]
|
||||
|
||||
|
||||
@task(name="filter_existing_documents", retries=1, retry_delay_seconds=5, tags=["r2r", "dedup"], cache_policy=NO_CACHE)
|
||||
async def filter_existing_documents_task(
|
||||
urls: list[str],
|
||||
storage_client: R2RStorageType,
|
||||
stale_after_days: int = 30,
|
||||
) -> list[str]:
|
||||
"""Filter URLs whose documents are missing or stale in R2R."""
|
||||
logger = get_run_logger()
|
||||
cutoff = datetime.now(UTC) - timedelta(days=stale_after_days)
|
||||
eligible: list[str] = []
|
||||
|
||||
for url in urls:
|
||||
document_id = str(FirecrawlIngestor.compute_document_id(url))
|
||||
existing: Document | None = await storage_client.retrieve(document_id)
|
||||
if existing is None:
|
||||
eligible.append(url)
|
||||
continue
|
||||
|
||||
timestamp = existing.metadata["timestamp"]
|
||||
if timestamp < cutoff:
|
||||
eligible.append(url)
|
||||
|
||||
if skipped := len(urls) - len(eligible):
|
||||
logger.info("Skipping %s up-to-date pages", skipped)
|
||||
|
||||
return eligible
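The dedup check keys on FirecrawlIngestor.compute_document_id; given the NAMESPACE_URL and uuid5 imports added later in this commit, that ID is presumably a deterministic UUIDv5 of the page URL, roughly:

# Assumed shape of the URL-derived ID (its implementation is not shown in this hunk):
# uuid5 is deterministic, so re-crawling the same URL resolves to the same stored document.
from uuid import NAMESPACE_URL, uuid5

doc_id = uuid5(NAMESPACE_URL, "https://docs.example/page")
assert doc_id == uuid5(NAMESPACE_URL, "https://docs.example/page")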
|
||||
|
||||
|
||||
@task(
|
||||
name="scrape_firecrawl_batch", retries=2, retry_delay_seconds=20, tags=["firecrawl", "scrape"]
|
||||
)
|
||||
async def scrape_firecrawl_batch_task(
|
||||
batch_urls: list[str], config: FirecrawlConfig
|
||||
) -> list[FirecrawlPage]:
|
||||
"""Scrape a batch of URLs via Firecrawl."""
|
||||
ingestor = FirecrawlIngestor(config)
|
||||
return await ingestor.scrape_pages(batch_urls)
|
||||
|
||||
|
||||
@task(name="annotate_firecrawl_metadata", retries=1, retry_delay_seconds=10, tags=["metadata"])
|
||||
async def annotate_firecrawl_metadata_task(
|
||||
pages: list[FirecrawlPage], job: IngestionJob
|
||||
) -> list[Document]:
|
||||
"""Annotate scraped pages with standardized metadata."""
|
||||
if not pages:
|
||||
return []
|
||||
|
||||
ingestor = FirecrawlIngestor()
|
||||
documents = [ingestor.create_document(page, job) for page in pages]
|
||||
|
||||
try:
|
||||
from ..config import get_settings
|
||||
|
||||
settings = get_settings()
|
||||
async with MetadataTagger(llm_endpoint=str(settings.llm_endpoint)) as tagger:
|
||||
return await tagger.tag_batch(documents)
|
||||
except IngestionError as exc: # pragma: no cover - logging side effect
|
||||
logger = get_run_logger()
|
||||
logger.warning("Metadata tagging failed: %s", exc)
|
||||
return documents
|
||||
except Exception as exc: # pragma: no cover - defensive
|
||||
logger = get_run_logger()
|
||||
logger.warning("Metadata tagging unavailable, using base metadata: %s", exc)
|
||||
return documents
|
||||
|
||||
|
||||
@task(name="upsert_r2r_documents", retries=2, retry_delay_seconds=20, tags=["storage", "r2r"], cache_policy=NO_CACHE)
|
||||
async def upsert_r2r_documents_task(
|
||||
storage_client: R2RStorageType,
|
||||
documents: list[Document],
|
||||
collection_name: str | None,
|
||||
) -> tuple[int, int]:
|
||||
"""Upsert documents into R2R storage."""
|
||||
if not documents:
|
||||
return 0, 0
|
||||
|
||||
stored_ids: list[str] = await storage_client.store_batch(
|
||||
documents, collection_name=collection_name
|
||||
)
|
||||
processed = len(stored_ids)
|
||||
failed = len(documents) - processed
|
||||
|
||||
if failed:
|
||||
logger = get_run_logger()
|
||||
logger.warning("Failed to upsert %s documents to R2R", failed)
|
||||
|
||||
return processed, failed
|
||||
|
||||
|
||||
@task(name="ingest_documents", retries=2, retry_delay_seconds=30, tags=["ingestion"], cache_policy=NO_CACHE)
|
||||
async def ingest_documents_task(
|
||||
job: IngestionJob,
|
||||
collection_name: str | None = None,
|
||||
batch_size: int = 50,
|
||||
storage_client: BaseStorage | None = None,
|
||||
progress_callback: Callable[[int, str], None] | None = None,
|
||||
) -> tuple[int, int]:
|
||||
"""
|
||||
Ingest documents from source.
|
||||
Ingest documents from source with optional pre-initialized storage client.
|
||||
|
||||
Args:
|
||||
job: Ingestion job configuration
|
||||
collection_name: Target collection name
|
||||
batch_size: Number of documents per batch
|
||||
storage_client: Optional pre-initialized storage client
|
||||
progress_callback: Optional callback for progress updates
|
||||
|
||||
Returns:
|
||||
Tuple of (processed_count, failed_count)
|
||||
"""
|
||||
# Select ingestor
|
||||
if progress_callback:
|
||||
progress_callback(35, "Creating ingestor and storage clients...")
|
||||
|
||||
ingestor = _create_ingestor(job)
|
||||
storage = storage_client or await _create_storage(job, collection_name)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(40, "Starting document processing...")
|
||||
|
||||
return await _process_documents(ingestor, storage, job, batch_size, collection_name, progress_callback)
|
||||
|
||||
|
||||
def _create_ingestor(job: IngestionJob) -> BaseIngestor:
|
||||
"""Create appropriate ingestor based on job source type."""
|
||||
if job.source_type == IngestionSource.WEB:
|
||||
config = FirecrawlConfig()
|
||||
ingestor = FirecrawlIngestor(config)
|
||||
return FirecrawlIngestor(config)
|
||||
elif job.source_type == IngestionSource.REPOSITORY:
|
||||
config = RepomixConfig()
|
||||
ingestor = RepomixIngestor(config)
|
||||
return RepomixIngestor(config)
|
||||
else:
|
||||
raise ValueError(f"Unsupported source: {job.source_type}")
|
||||
|
||||
processed = 0
|
||||
failed = 0
|
||||
batch = []
|
||||
|
||||
# Initialize storage
|
||||
from pydantic import HttpUrl
|
||||
|
||||
# Use provided collection name or generate default
|
||||
async def _create_storage(job: IngestionJob, collection_name: str | None) -> BaseStorage:
|
||||
"""Create and initialize storage client."""
|
||||
if collection_name is None:
|
||||
collection_name = f"docs_{job.source_type.value}"
|
||||
|
||||
storage_config = StorageConfig(
|
||||
from ..config import get_settings
|
||||
|
||||
settings = get_settings()
|
||||
|
||||
storage_config = _build_storage_config(job, settings, collection_name)
|
||||
storage = _instantiate_storage(job.storage_backend, storage_config)
|
||||
|
||||
await storage.initialize()
|
||||
return storage
|
||||
|
||||
|
||||
def _build_storage_config(
|
||||
job: IngestionJob, settings: Settings, collection_name: str
|
||||
) -> StorageConfig:
|
||||
"""Build storage configuration from job and settings."""
|
||||
storage_endpoints = {
|
||||
StorageBackend.WEAVIATE: settings.weaviate_endpoint,
|
||||
StorageBackend.OPEN_WEBUI: settings.openwebui_endpoint,
|
||||
StorageBackend.R2R: settings.get_storage_endpoint("r2r"),
|
||||
}
|
||||
storage_api_keys = {
|
||||
StorageBackend.WEAVIATE: settings.get_api_key("weaviate"),
|
||||
StorageBackend.OPEN_WEBUI: settings.get_api_key("openwebui"),
|
||||
StorageBackend.R2R: None, # R2R is self-hosted, no API key needed
|
||||
}
|
||||
|
||||
return StorageConfig(
|
||||
backend=job.storage_backend,
|
||||
endpoint=HttpUrl("http://weaviate.yo")
|
||||
if job.storage_backend == StorageBackend.WEAVIATE
|
||||
else HttpUrl("http://chat.lab"),
|
||||
endpoint=storage_endpoints[job.storage_backend],
|
||||
api_key=storage_api_keys[job.storage_backend],
|
||||
collection_name=collection_name,
|
||||
)
|
||||
|
||||
if job.storage_backend == StorageBackend.WEAVIATE:
|
||||
storage = WeaviateStorage(storage_config)
|
||||
else:
|
||||
storage = OpenWebUIStorage(storage_config)
|
||||
|
||||
await storage.initialize()
|
||||
def _instantiate_storage(backend: StorageBackend, config: StorageConfig) -> BaseStorage:
|
||||
"""Instantiate storage based on backend type."""
|
||||
if backend == StorageBackend.WEAVIATE:
|
||||
return WeaviateStorage(config)
|
||||
elif backend == StorageBackend.OPEN_WEBUI:
|
||||
return OpenWebUIStorage(config)
|
||||
elif backend == StorageBackend.R2R:
|
||||
if RuntimeR2RStorage is None:
|
||||
raise ValueError("R2R storage not available. Check dependencies.")
|
||||
return RuntimeR2RStorage(config)
|
||||
|
||||
assert_never(backend)
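assert_never gives the type checker a static exhaustiveness guarantee over StorageBackend; a toy illustration of the pattern (independent of this module, Python 3.11+):

from enum import Enum
from typing import assert_never


class Mode(str, Enum):
    FAST = "fast"
    SAFE = "safe"


def describe(mode: Mode) -> str:
    if mode is Mode.FAST:
        return "optimized for speed"
    if mode is Mode.SAFE:
        return "optimized for correctness"
    assert_never(mode)  # flagged by Pyright if a Mode member is left unhandled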
|
||||
|
||||
|
||||
def _chunk_urls(urls: list[str], chunk_size: int) -> list[list[str]]:
|
||||
"""Group URLs into fixed-size chunks for batch processing."""
|
||||
|
||||
if chunk_size <= 0:
|
||||
raise ValueError("chunk_size must be greater than zero")
|
||||
|
||||
return [urls[i : i + chunk_size] for i in range(0, len(urls), chunk_size)]
|
||||
|
||||
|
||||
def _deduplicate_urls(urls: list[str]) -> list[str]:
|
||||
"""Return the URLs with order preserved and duplicates removed."""
|
||||
|
||||
seen: set[str] = set()
|
||||
unique: list[str] = []
|
||||
for url in urls:
|
||||
if url not in seen:
|
||||
seen.add(url)
|
||||
unique.append(url)
|
||||
return unique
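A quick sanity check of the two helpers above (URLs are placeholders):

# Order-preserving dedup followed by fixed-size chunking, as implemented above.
pages = ["https://docs.example/a", "https://docs.example/b", "https://docs.example/a"]
assert _deduplicate_urls(pages) == ["https://docs.example/a", "https://docs.example/b"]
assert _chunk_urls(["u1", "u2", "u3"], 2) == [["u1", "u2"], ["u3"]]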
|
||||
|
||||
|
||||
async def _process_documents(
|
||||
ingestor: BaseIngestor,
|
||||
storage: BaseStorage,
|
||||
job: IngestionJob,
|
||||
batch_size: int,
|
||||
collection_name: str | None,
|
||||
progress_callback: Callable[[int, str], None] | None = None,
|
||||
) -> tuple[int, int]:
|
||||
"""Process documents in batches."""
|
||||
processed = 0
|
||||
failed = 0
|
||||
batch: list[Document] = []
|
||||
total_documents = 0
|
||||
batch_count = 0
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(45, "Ingesting documents from source...")
|
||||
|
||||
# Process documents
|
||||
async for document in ingestor.ingest(job):
|
||||
batch.append(document)
|
||||
total_documents += 1
|
||||
|
||||
if len(batch) >= batch_size:
|
||||
try:
|
||||
stored_ids = await storage.store_batch(batch)
|
||||
print(f"Successfully stored {len(stored_ids)} documents in batch")
|
||||
processed += len(stored_ids)
|
||||
failed += len(batch) - len(stored_ids)
|
||||
except Exception as e:
|
||||
print(f"Batch storage failed: {e}")
|
||||
failed += len(batch)
|
||||
batch_count += 1
|
||||
if progress_callback:
|
||||
progress_callback(
|
||||
45 + min(35, (batch_count * 10)),
|
||||
f"Processing batch {batch_count} ({total_documents} documents so far)..."
|
||||
)
|
||||
|
||||
batch_processed, batch_failed = await _store_batch(storage, batch, collection_name)
|
||||
processed += batch_processed
|
||||
failed += batch_failed
|
||||
batch = []
|
||||
|
||||
# Process remaining batch
|
||||
if batch:
|
||||
try:
|
||||
stored_ids = await storage.store_batch(batch)
|
||||
print(f"Successfully stored {len(stored_ids)} documents in final batch")
|
||||
processed += len(stored_ids)
|
||||
failed += len(batch) - len(stored_ids)
|
||||
except Exception as e:
|
||||
print(f"Final batch storage failed: {e}")
|
||||
failed += len(batch)
|
||||
batch_count += 1
|
||||
if progress_callback:
|
||||
progress_callback(80, f"Processing final batch ({total_documents} total documents)...")
|
||||
|
||||
batch_processed, batch_failed = await _store_batch(storage, batch, collection_name)
|
||||
processed += batch_processed
|
||||
failed += batch_failed
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(85, f"Completed processing {total_documents} documents")
|
||||
|
||||
return processed, failed
|
||||
|
||||
|
||||
async def _store_batch(
|
||||
storage: BaseStorage,
|
||||
batch: list[Document],
|
||||
collection_name: str | None,
|
||||
) -> tuple[int, int]:
|
||||
"""Store a batch of documents and return processed/failed counts."""
|
||||
try:
|
||||
# Apply metadata tagging for backends that benefit from it
|
||||
processed_batch = batch
|
||||
if hasattr(storage, "config") and storage.config.backend in (
|
||||
StorageBackend.R2R,
|
||||
StorageBackend.WEAVIATE,
|
||||
):
|
||||
try:
|
||||
from ..config import get_settings
|
||||
|
||||
settings = get_settings()
|
||||
async with MetadataTagger(llm_endpoint=str(settings.llm_endpoint)) as tagger:
|
||||
processed_batch = await tagger.tag_batch(batch)
|
||||
except Exception as exc:
|
||||
print(f"Metadata tagging failed, using original documents: {exc}")
|
||||
processed_batch = batch
|
||||
|
||||
stored_ids = await storage.store_batch(processed_batch, collection_name=collection_name)
|
||||
processed_count = len(stored_ids)
|
||||
failed_count = len(processed_batch) - processed_count
|
||||
|
||||
batch_type = (
|
||||
"final" if len(processed_batch) < 50 else ""
|
||||
) # Assume standard batch size is 50
|
||||
print(f"Successfully stored {processed_count} documents in {batch_type} batch".strip())
|
||||
|
||||
return processed_count, failed_count
|
||||
except Exception as e:
|
||||
batch_type = "Final" if len(batch) < 50 else "Batch"
|
||||
print(f"{batch_type} storage failed: {e}")
|
||||
return 0, len(batch)
|
||||
|
||||
|
||||
@flow(
|
||||
name="firecrawl_to_r2r",
|
||||
description="Ingest Firecrawl pages into R2R with metadata annotation",
|
||||
persist_result=False,
|
||||
log_prints=True,
|
||||
)
|
||||
async def firecrawl_to_r2r_flow(
|
||||
job: IngestionJob, collection_name: str | None = None, progress_callback: Callable[[int, str], None] | None = None
|
||||
) -> tuple[int, int]:
|
||||
"""Specialized flow for Firecrawl ingestion into R2R."""
|
||||
logger = get_run_logger()
|
||||
from ..config import get_settings
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(35, "Initializing Firecrawl and R2R storage...")
|
||||
|
||||
settings = get_settings()
|
||||
firecrawl_config = FirecrawlConfig()
|
||||
resolved_collection = collection_name or f"docs_{job.source_type.value}"
|
||||
|
||||
storage_config = _build_storage_config(job, settings, resolved_collection)
|
||||
storage_client = await initialize_storage_task(storage_config)
|
||||
|
||||
if RuntimeR2RStorage is None or not isinstance(storage_client, RuntimeR2RStorage):
|
||||
raise IngestionError("Firecrawl to R2R flow requires an R2R storage backend")
|
||||
|
||||
r2r_storage = cast("R2RStorageType", storage_client)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(45, "Discovering pages with Firecrawl...")
|
||||
|
||||
discovered_urls = await map_firecrawl_site_task(str(job.source_url), firecrawl_config)
|
||||
unique_urls = _deduplicate_urls(discovered_urls)
|
||||
logger.info("Discovered %s unique URLs from Firecrawl map", len(unique_urls))
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(55, f"Found {len(unique_urls)} pages, filtering existing content...")
|
||||
|
||||
eligible_urls = await filter_existing_documents_task(unique_urls, r2r_storage)
|
||||
|
||||
if not eligible_urls:
|
||||
logger.info("All Firecrawl pages are up to date for %s", job.source_url)
|
||||
if progress_callback:
|
||||
progress_callback(100, "All pages are up to date, no processing needed")
|
||||
return 0, 0
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(65, f"Scraping {len(eligible_urls)} new/updated pages...")
|
||||
|
||||
batch_size = min(settings.default_batch_size, firecrawl_config.limit)
|
||||
url_batches = _chunk_urls(eligible_urls, batch_size)
|
||||
logger.info("Scraping %s batches of Firecrawl pages", len(url_batches))
|
||||
|
||||
# Use asyncio.gather for concurrent scraping
|
||||
import asyncio
|
||||
scrape_tasks = [
|
||||
scrape_firecrawl_batch_task(batch, firecrawl_config)
|
||||
for batch in url_batches
|
||||
]
|
||||
batch_results = await asyncio.gather(*scrape_tasks)
|
||||
|
||||
scraped_pages: list[FirecrawlPage] = []
|
||||
for batch_pages in batch_results:
|
||||
scraped_pages.extend(batch_pages)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(75, f"Processing {len(scraped_pages)} scraped pages...")
|
||||
|
||||
documents = await annotate_firecrawl_metadata_task(scraped_pages, job)
|
||||
|
||||
if not documents:
|
||||
logger.warning("No documents produced after scraping for %s", job.source_url)
|
||||
return 0, len(eligible_urls)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(85, f"Storing {len(documents)} documents in R2R...")
|
||||
|
||||
processed, failed = await upsert_r2r_documents_task(r2r_storage, documents, resolved_collection)
|
||||
|
||||
logger.info("Upserted %s documents into R2R (%s failed)", processed, failed)
|
||||
|
||||
return processed, failed
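Driving the pipeline end to end could look roughly like this; the module path, URL, and collection name are placeholders rather than values taken from this commit:

# Illustrative invocation only.
import asyncio

from ingest_pipeline.flows.ingestion import create_ingestion_flow  # assumed module path


async def main() -> None:
    result = await create_ingestion_flow(
        source_url="https://docs.example",
        source_type="web",
        storage_backend="r2r",
        collection_name="docs_web",
    )
    print(result.documents_failed, result.duration_seconds)


asyncio.run(main())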
|
||||
|
||||
|
||||
@task(name="update_job_status", tags=["tracking"])
|
||||
async def update_job_status_task(
|
||||
def update_job_status_task(
|
||||
job: IngestionJob,
|
||||
status: IngestionStatus,
|
||||
processed: int = 0,
|
||||
failed: int = 0,
|
||||
_failed: int = 0,
|
||||
error: str | None = None,
|
||||
) -> IngestionJob:
|
||||
"""
|
||||
@@ -158,7 +495,7 @@ async def update_job_status_task(
|
||||
job: Ingestion job
|
||||
status: New status
|
||||
processed: Documents processed
|
||||
failed: Documents failed
|
||||
_failed: Documents failed (currently unused)
|
||||
error: Error message if any
|
||||
|
||||
Returns:
|
||||
@@ -187,10 +524,11 @@ async def update_job_status_task(
|
||||
)
|
||||
async def create_ingestion_flow(
|
||||
source_url: str,
|
||||
source_type: Literal["web", "repository", "documentation"],
|
||||
storage_backend: Literal["weaviate", "open_webui"] = "weaviate",
|
||||
source_type: SourceTypeLike,
|
||||
storage_backend: StorageBackendLike = StorageBackend.WEAVIATE,
|
||||
collection_name: str | None = None,
|
||||
validate_first: bool = True,
|
||||
progress_callback: Callable[[int, str], None] | None = None,
|
||||
) -> IngestionResult:
|
||||
"""
|
||||
Main ingestion flow.
|
||||
@@ -200,28 +538,34 @@ async def create_ingestion_flow(
|
||||
source_type: Type of source
|
||||
storage_backend: Storage backend to use
|
||||
validate_first: Whether to validate source first
|
||||
progress_callback: Optional callback for progress updates
|
||||
|
||||
Returns:
|
||||
Ingestion result
|
||||
"""
|
||||
print(f"Starting ingestion from {source_url}")
|
||||
|
||||
source_enum = IngestionSource(source_type)
|
||||
backend_enum = StorageBackend(storage_backend)
|
||||
|
||||
# Create job
|
||||
job = IngestionJob(
|
||||
source_url=source_url,
|
||||
source_type=IngestionSource(source_type),
|
||||
storage_backend=StorageBackend(storage_backend),
|
||||
source_type=source_enum,
|
||||
storage_backend=backend_enum,
|
||||
status=IngestionStatus.PENDING,
|
||||
)
|
||||
|
||||
start_time = datetime.now(UTC)
|
||||
error_messages = []
|
||||
error_messages: list[str] = []
|
||||
processed = 0
|
||||
failed = 0
|
||||
|
||||
try:
|
||||
# Validate source if requested
|
||||
if validate_first:
|
||||
if progress_callback:
|
||||
progress_callback(10, "Validating source...")
|
||||
print("Validating source...")
|
||||
is_valid = await validate_source_task(source_url, job.source_type)
|
||||
|
||||
@@ -229,11 +573,21 @@ async def create_ingestion_flow(
|
||||
raise IngestionError(f"Source validation failed: {source_url}")
|
||||
|
||||
# Update status to in progress
|
||||
job = await update_job_status_task(job, IngestionStatus.IN_PROGRESS)
|
||||
if progress_callback:
|
||||
progress_callback(20, "Initializing storage...")
|
||||
job = update_job_status_task(job, IngestionStatus.IN_PROGRESS)
|
||||
|
||||
# Run ingestion
|
||||
if progress_callback:
|
||||
progress_callback(30, "Starting document ingestion...")
|
||||
print("Ingesting documents...")
|
||||
processed, failed = await ingest_documents_task(job, collection_name)
|
||||
if job.source_type == IngestionSource.WEB and job.storage_backend == StorageBackend.R2R:
|
||||
processed, failed = await firecrawl_to_r2r_flow(job, collection_name, progress_callback=progress_callback)
|
||||
else:
|
||||
processed, failed = await ingest_documents_task(job, collection_name, progress_callback=progress_callback)
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(90, "Finalizing ingestion...")
|
||||
|
||||
# Update final status
|
||||
if failed > 0:
|
||||
@@ -247,7 +601,7 @@ async def create_ingestion_flow(
|
||||
else:
|
||||
final_status = IngestionStatus.COMPLETED
|
||||
|
||||
job = await update_job_status_task(job, final_status, processed=processed, failed=failed)
|
||||
job = update_job_status_task(job, final_status, processed=processed, _failed=failed)
|
||||
|
||||
print(f"Ingestion completed: {processed} processed, {failed} failed")
|
||||
|
||||
@@ -256,10 +610,9 @@ async def create_ingestion_flow(
|
||||
error_messages.append(str(e))
|
||||
|
||||
# Don't reset counts - keep whatever was processed before the error
|
||||
job = await update_job_status_task(job, IngestionStatus.FAILED,
|
||||
processed=processed,
|
||||
failed=failed,
|
||||
error=str(e))
|
||||
job = update_job_status_task(
|
||||
job, IngestionStatus.FAILED, processed=processed, _failed=failed, error=str(e)
|
||||
)
|
||||
|
||||
# Calculate duration
|
||||
duration = (datetime.now(UTC) - start_time).total_seconds()
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
"""Scheduler for Prefect deployments."""
|
||||
|
||||
from datetime import timedelta
|
||||
from typing import TYPE_CHECKING, Literal, Protocol
|
||||
from typing import Literal, Protocol, cast
|
||||
|
||||
from prefect import serve
|
||||
from prefect.deployments.runner import RunnerDeployment
|
||||
from prefect.schedules import Cron, Interval
|
||||
|
||||
from .ingestion import create_ingestion_flow
|
||||
from ..core.models import IngestionSource, StorageBackend
|
||||
from .ingestion import SourceTypeLike, StorageBackendLike, create_ingestion_flow
|
||||
|
||||
|
||||
class FlowWithDeployment(Protocol):
|
||||
@@ -25,8 +26,8 @@ class FlowWithDeployment(Protocol):
|
||||
def create_scheduled_deployment(
|
||||
name: str,
|
||||
source_url: str,
|
||||
source_type: Literal["web", "repository", "documentation"],
|
||||
storage_backend: Literal["weaviate", "open_webui"] = "weaviate",
|
||||
source_type: SourceTypeLike,
|
||||
storage_backend: StorageBackendLike = StorageBackend.WEAVIATE,
|
||||
schedule_type: Literal["cron", "interval"] = "interval",
|
||||
cron_expression: str | None = None,
|
||||
interval_minutes: int = 60,
|
||||
@@ -55,8 +56,11 @@ def create_scheduled_deployment(
|
||||
schedule = Interval(timedelta(minutes=interval_minutes), timezone="UTC")
|
||||
|
||||
# Default tags
|
||||
source_enum = IngestionSource(source_type)
|
||||
backend_enum = StorageBackend(storage_backend)
|
||||
|
||||
if tags is None:
|
||||
tags = [source_type, storage_backend]
|
||||
tags = [source_enum.value, backend_enum.value]
|
||||
|
||||
# Create deployment
|
||||
# The flow decorator adds the to_deployment method at runtime
|
||||
@@ -66,16 +70,14 @@ def create_scheduled_deployment(
|
||||
schedule=schedule,
|
||||
parameters={
|
||||
"source_url": source_url,
|
||||
"source_type": source_type,
|
||||
"storage_backend": storage_backend,
|
||||
"source_type": source_enum.value,
|
||||
"storage_backend": backend_enum.value,
|
||||
"validate_first": True,
|
||||
},
|
||||
tags=tags,
|
||||
description=f"Scheduled ingestion from {source_url}",
|
||||
)
|
||||
|
||||
from typing import cast
|
||||
|
||||
return cast("RunnerDeployment", deployment)
|
||||
|
||||
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
"""Ingestors module for different data sources."""
|
||||
|
||||
from .base import BaseIngestor
|
||||
from .firecrawl import FirecrawlIngestor
|
||||
from .firecrawl import FirecrawlIngestor, FirecrawlPage
|
||||
from .repomix import RepomixIngestor
|
||||
|
||||
__all__ = [
|
||||
"BaseIngestor",
|
||||
"FirecrawlIngestor",
|
||||
"FirecrawlPage",
|
||||
"RepomixIngestor",
|
||||
]
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -10,7 +10,7 @@ class BaseIngestor(ABC):
|
||||
"""Abstract base class for all ingestors."""
|
||||
|
||||
@abstractmethod
|
||||
async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
|
||||
def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
|
||||
"""
|
||||
Ingest data from a source.
|
||||
|
||||
@@ -20,8 +20,7 @@ class BaseIngestor(ABC):
|
||||
Yields:
|
||||
Documents from the source
|
||||
"""
|
||||
return # type: ignore # pragma: no cover
|
||||
yield # pragma: no cover
|
||||
... # pragma: no cover
|
||||
|
||||
@abstractmethod
|
||||
async def validate_source(self, source_url: str) -> bool:
|
||||
|
||||
@@ -1,15 +1,19 @@
"""Firecrawl ingestor for web and documentation sites."""

import asyncio
from collections.abc import AsyncGenerator
import logging
import re
from collections.abc import AsyncGenerator, Awaitable, Callable
from dataclasses import dataclass
from datetime import UTC, datetime
from typing import Any
from uuid import uuid4
from urllib.parse import urlparse
from uuid import NAMESPACE_URL, UUID, uuid5

from firecrawl import AsyncFirecrawl
from typing_extensions import override

from ..config import get_settings
from ..core.exceptions import IngestionError
from ..core.models import (
    Document,
    DocumentMetadata,
@@ -20,11 +24,83 @@ from ..core.models import (
from .base import BaseIngestor
class FirecrawlError(IngestionError):  # type: ignore[misc]
    """Base exception for Firecrawl-related errors."""

    def __init__(self, message: str, status_code: int | None = None) -> None:
        super().__init__(message)
        self.status_code = status_code


class FirecrawlConnectionError(FirecrawlError):
    """Connection error with Firecrawl service."""

    pass


class FirecrawlRateLimitError(FirecrawlError):
    """Rate limit exceeded error."""

    pass


class FirecrawlUnauthorizedError(FirecrawlError):
    """Unauthorized access error."""

    pass


async def retry_with_backoff(
    operation: Callable[[], Awaitable[object]], max_retries: int = 3
) -> object:
    """Retry operation with exponential backoff following Firecrawl best practices."""
    for attempt in range(max_retries):
        try:
            return await operation()
        except Exception as e:
            if attempt == max_retries - 1:
                raise e
            delay = 1.0 * (2**attempt)
            logging.warning(
                f"Firecrawl operation failed (attempt {attempt + 1}/{max_retries}): {e}. Retrying in {delay:.1f}s..."
            )
            await asyncio.sleep(delay)

    # This should never be reached due to the exception handling above,
    # but mypy requires a return statement for all code paths
    raise RuntimeError("Retry loop completed without return or exception")
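A short sketch of how retry_with_backoff is meant to wrap an SDK call; the client and URL here are placeholders, and the zero-argument closure is what satisfies the Callable[[], Awaitable[object]] parameter:

# Hedged usage sketch: `client` and the URL are illustrative, not part of this diff.
async def fetch_with_retries(client: AsyncFirecrawl, url: str) -> object:
    async def operation() -> object:
        return await client.scrape(url, formats=["markdown"])

    # Retries up to 3 times with exponential delays of 1s, 2s, 4s before re-raising.
    return await retry_with_backoff(operation, max_retries=3)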
@dataclass(slots=True)
class FirecrawlPage:
    """Structured representation of a scraped Firecrawl page."""

    url: str
    content: str
    title: str | None
    description: str | None
    author: str | None = None
    language: str | None = None
    sitemap_last_modified: str | None = None
    source_url: str | None = None
    keywords: list[str] | None = None
    robots: str | None = None
    og_title: str | None = None
    og_description: str | None = None
    og_url: str | None = None
    og_image: str | None = None
    twitter_card: str | None = None
    twitter_site: str | None = None
    twitter_creator: str | None = None
    favicon: str | None = None
    status_code: int | None = None
||||
|
||||
|
||||
class FirecrawlIngestor(BaseIngestor):
|
||||
"""Ingestor for web and documentation sites using Firecrawl."""
|
||||
|
||||
config: FirecrawlConfig
|
||||
client: Any # AsyncFirecrawl client instance
|
||||
client: AsyncFirecrawl
|
||||
|
||||
def __init__(self, config: FirecrawlConfig | None = None):
|
||||
"""
|
||||
@@ -41,8 +117,24 @@ class FirecrawlIngestor(BaseIngestor):
|
||||
# The SDK requires an API key even for self-hosted instances
|
||||
api_key = settings.firecrawl_api_key or "no-key-required"
|
||||
|
||||
# AsyncFirecrawl automatically uses v2 endpoints
|
||||
self.client = AsyncFirecrawl(api_key=api_key, api_url=str(settings.firecrawl_endpoint))
|
||||
# Initialize AsyncFirecrawl following official pattern
|
||||
# Note: api_url parameter may not be supported in all versions
|
||||
# Default to standard initialization for cloud instances
|
||||
try:
|
||||
endpoint_str = str(settings.firecrawl_endpoint).rstrip("/")
|
||||
if endpoint_str.startswith("http://crawl.lab") or endpoint_str.startswith(
|
||||
"http://localhost"
|
||||
):
|
||||
# Self-hosted instance - try with api_url if supported
|
||||
self.client = AsyncFirecrawl(
|
||||
api_key=api_key, api_url=str(settings.firecrawl_endpoint)
|
||||
)
|
||||
else:
|
||||
# Cloud instance - use standard initialization
|
||||
self.client = AsyncFirecrawl(api_key=api_key)
|
||||
except Exception:
|
||||
# Fallback to standard initialization
|
||||
self.client = AsyncFirecrawl(api_key=api_key)
|
||||
|
||||
@override
|
||||
async def ingest(self, job: IngestionJob) -> AsyncGenerator[Document, None]:
|
||||
@@ -58,20 +150,26 @@ class FirecrawlIngestor(BaseIngestor):
|
||||
url = str(job.source_url)
|
||||
|
||||
# First, map the site to understand its structure
|
||||
site_map = await self._map_site(url)
|
||||
|
||||
# If map returns empty, just use the main URL
|
||||
if not site_map:
|
||||
site_map = [url]
|
||||
site_map = await self.map_site(url) or [url]
|
||||
|
||||
# Process pages in batches
|
||||
batch_size = 10
|
||||
for i in range(0, len(site_map), batch_size):
|
||||
batch_urls = site_map[i : i + batch_size]
|
||||
documents = await self._scrape_batch(batch_urls)
|
||||
pages = await self.scrape_pages(batch_urls)
|
||||
|
||||
for doc_data in documents:
|
||||
yield self._create_document(doc_data, job)
|
||||
for page in pages:
|
||||
yield self.create_document(page, job)
|
||||
|
||||
async def map_site(self, url: str) -> list[str]:
|
||||
"""Public wrapper for mapping a site."""
|
||||
|
||||
return await self._map_site(url)
|
||||
|
||||
async def scrape_pages(self, urls: list[str]) -> list[FirecrawlPage]:
|
||||
"""Scrape a batch of URLs and return structured page data."""
|
||||
|
||||
return await self._scrape_batch(urls)
|
||||
|
||||
@override
|
||||
async def validate_source(self, source_url: str) -> bool:
|
||||
@@ -85,10 +183,15 @@ class FirecrawlIngestor(BaseIngestor):
|
||||
True if source is accessible
|
||||
"""
|
||||
try:
|
||||
# Use SDK v2 endpoints for both self-hosted and cloud
|
||||
result = await self.client.scrape(source_url, formats=["markdown"])
|
||||
return result is not None and hasattr(result, "markdown")
|
||||
except Exception:
|
||||
# Use SDK v2 endpoints following official pattern with retry
|
||||
async def validate_operation() -> bool:
|
||||
result = await self.client.scrape(source_url, formats=["markdown"])
|
||||
return result is not None and getattr(result, "markdown", None) is not None
|
||||
|
||||
result = await retry_with_backoff(validate_operation)
|
||||
return bool(result)
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to validate source {source_url}: {e}")
|
||||
return False
|
||||
|
||||
@override
|
||||
@@ -105,7 +208,8 @@ class FirecrawlIngestor(BaseIngestor):
|
||||
try:
|
||||
site_map = await self._map_site(source_url)
|
||||
return len(site_map) if site_map else 0
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
logging.warning(f"Failed to estimate size for {source_url}: {e}")
|
||||
return 0
|
||||
|
||||
async def _map_site(self, url: str) -> list[str]:
|
||||
@@ -119,24 +223,19 @@ class FirecrawlIngestor(BaseIngestor):
|
||||
List of URLs found
|
||||
"""
|
||||
try:
|
||||
# Use SDK v2 map endpoint
|
||||
# Use SDK v2 map endpoint following official pattern
|
||||
result = await self.client.map(url=url, limit=self.config.limit)
|
||||
|
||||
if result and hasattr(result, "links"):
|
||||
# Extract URLs from the result
|
||||
return [
|
||||
link if isinstance(link, str) else getattr(link, "url", str(link))
|
||||
for link in result.links
|
||||
]
|
||||
if result and getattr(result, "links", None):
|
||||
# Extract URLs from the result following official pattern
|
||||
return [getattr(link, "url", str(link)) for link in result.links]
|
||||
return []
|
||||
except Exception as e:
|
||||
# If map fails (might not be available in all versions), fall back to single URL
|
||||
import logging
|
||||
|
||||
logging.warning(f"Map endpoint not available or failed: {e}. Using single URL.")
|
||||
return [url]
|
||||
|
||||
async def _scrape_batch(self, urls: list[str]) -> list[dict[str, str]]:
|
||||
async def _scrape_batch(self, urls: list[str]) -> list[FirecrawlPage]:
|
||||
"""
|
||||
Scrape a batch of URLs.
|
||||
|
||||
@@ -146,84 +245,289 @@ class FirecrawlIngestor(BaseIngestor):
|
||||
Returns:
|
||||
List of scraped documents
|
||||
"""
|
||||
tasks = []
|
||||
for url in urls:
|
||||
task = self._scrape_single(url)
|
||||
tasks.append(task)
|
||||
tasks = [self._scrape_single(url) for url in urls]
|
||||
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
documents = []
|
||||
pages: list[FirecrawlPage] = []
|
||||
for result in results:
|
||||
if isinstance(result, Exception):
|
||||
if isinstance(result, FirecrawlPage):
|
||||
pages.append(result)
|
||||
elif isinstance(result, BaseException):
|
||||
continue
|
||||
if result and isinstance(result, dict) and "markdown" in result:
|
||||
documents.append(result)
|
||||
|
||||
return documents
|
||||
return pages
|
||||
|
||||
async def _scrape_single(self, url: str) -> dict[str, str]:
|
||||
async def _scrape_single(self, url: str) -> FirecrawlPage | None:
|
||||
"""
|
||||
Scrape a single URL.
|
||||
Scrape a single URL and extract rich metadata.
|
||||
|
||||
Args:
|
||||
url: URL to scrape
|
||||
|
||||
Returns:
|
||||
Scraped document data
|
||||
Scraped document data with enhanced metadata
|
||||
"""
|
||||
try:
|
||||
# Use SDK v2 scrape endpoint
|
||||
result = await self.client.scrape(url, formats=self.config.formats)
|
||||
# Use SDK v2 scrape endpoint following official pattern with retry
|
||||
async def scrape_operation() -> FirecrawlPage | None:
|
||||
result = await self.client.scrape(url, formats=self.config.formats)
|
||||
|
||||
# Extract data from the result
|
||||
if result:
|
||||
# The SDK returns a ScrapeResult object with markdown and metadata
|
||||
metadata = getattr(result, "metadata", {})
|
||||
return {
|
||||
"markdown": getattr(result, "markdown", ""),
|
||||
"sourceURL": url,
|
||||
"title": metadata.get("title", "")
|
||||
if isinstance(metadata, dict)
|
||||
else getattr(metadata, "title", ""),
|
||||
"description": metadata.get("description", "")
|
||||
if isinstance(metadata, dict)
|
||||
else getattr(metadata, "description", ""),
|
||||
}
|
||||
return {}
|
||||
# Extract data from the result following official response handling
|
||||
if result:
|
||||
# The SDK returns a ScrapeData object with typed metadata
|
||||
metadata = getattr(result, "metadata", None)
|
||||
|
||||
# Extract basic metadata
|
||||
title = getattr(metadata, "title", None) if metadata else None
|
||||
description = getattr(metadata, "description", None) if metadata else None
|
||||
|
||||
# Extract enhanced metadata if available
|
||||
author = getattr(metadata, "author", None) if metadata else None
|
||||
language = getattr(metadata, "language", None) if metadata else None
|
||||
sitemap_last_modified = (
|
||||
getattr(metadata, "sitemap_last_modified", None) if metadata else None
|
||||
)
|
||||
source_url = getattr(metadata, "sourceURL", None) if metadata else None
|
||||
keywords = getattr(metadata, "keywords", None) if metadata else None
|
||||
robots = getattr(metadata, "robots", None) if metadata else None
|
||||
|
||||
# Open Graph metadata
|
||||
og_title = getattr(metadata, "ogTitle", None) if metadata else None
|
||||
og_description = getattr(metadata, "ogDescription", None) if metadata else None
|
||||
og_url = getattr(metadata, "ogUrl", None) if metadata else None
|
||||
og_image = getattr(metadata, "ogImage", None) if metadata else None
|
||||
|
||||
# Twitter metadata
|
||||
twitter_card = getattr(metadata, "twitterCard", None) if metadata else None
|
||||
twitter_site = getattr(metadata, "twitterSite", None) if metadata else None
|
||||
twitter_creator = (
|
||||
getattr(metadata, "twitterCreator", None) if metadata else None
|
||||
)
|
||||
|
||||
# Additional metadata
|
||||
favicon = getattr(metadata, "favicon", None) if metadata else None
|
||||
status_code = getattr(metadata, "statusCode", None) if metadata else None
|
||||
|
||||
return FirecrawlPage(
|
||||
url=url,
|
||||
content=getattr(result, "markdown", "") or "",
|
||||
title=title,
|
||||
description=description,
|
||||
author=author,
|
||||
language=language,
|
||||
sitemap_last_modified=sitemap_last_modified,
|
||||
source_url=source_url,
|
||||
keywords=keywords.split(",")
|
||||
if keywords and isinstance(keywords, str)
|
||||
else keywords,
|
||||
robots=robots,
|
||||
og_title=og_title,
|
||||
og_description=og_description,
|
||||
og_url=og_url,
|
||||
og_image=og_image,
|
||||
twitter_card=twitter_card,
|
||||
twitter_site=twitter_site,
|
||||
twitter_creator=twitter_creator,
|
||||
favicon=favicon,
|
||||
status_code=status_code,
|
||||
)
|
||||
return None
|
||||
|
||||
result = await retry_with_backoff(scrape_operation)
|
||||
return result if isinstance(result, FirecrawlPage) else None
|
||||
except Exception as e:
|
||||
import logging
|
||||
|
||||
logging.debug(f"Failed to scrape {url}: {e}")
|
||||
return {}
|
||||
return None
|
||||
|
||||
    def _create_document(self, doc_data: dict[str, str], job: IngestionJob) -> Document:
    @staticmethod
    def compute_document_id(source_url: str) -> UUID:
        """Derive a deterministic UUID for a document based on its source URL."""
        return uuid5(NAMESPACE_URL, source_url)

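A worked example of the deterministic ID above: the same source URL always maps to the same UUID, so re-ingesting a page can overwrite rather than duplicate it (the URL is illustrative only):

from uuid import NAMESPACE_URL, uuid5

first = uuid5(NAMESPACE_URL, "https://docs.example.com/page")
second = uuid5(NAMESPACE_URL, "https://docs.example.com/page")
assert first == second  # identical input URL -> identical document ID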
@staticmethod
|
||||
def _analyze_content_structure(content: str) -> dict[str, object]:
|
||||
"""Analyze markdown content to extract structural information."""
|
||||
# Extract heading hierarchy
|
||||
heading_pattern = r"^(#{1,6})\s+(.+)$"
|
||||
headings = []
|
||||
for match in re.finditer(heading_pattern, content, re.MULTILINE):
|
||||
level = len(match.group(1))
|
||||
text = match.group(2).strip()
|
||||
headings.append(f"{' ' * (level - 1)}{text}")
|
||||
|
||||
# Check for various content types
|
||||
has_code_blocks = bool(re.search(r"```[\s\S]*?```", content))
|
||||
has_images = bool(re.search(r"!\[.*?\]\(.*?\)", content))
|
||||
has_links = bool(re.search(r"\[.*?\]\(.*?\)", content))
|
||||
|
||||
# Calculate section depth
|
||||
max_depth = 0
|
||||
if headings:
|
||||
for heading in headings:
|
||||
depth = (len(heading) - len(heading.lstrip())) // 2 + 1
|
||||
max_depth = max(max_depth, depth)
|
||||
|
||||
return {
|
||||
"heading_hierarchy": headings,
|
||||
"section_depth": max_depth,
|
||||
"has_code_blocks": has_code_blocks,
|
||||
"has_images": has_images,
|
||||
"has_links": has_links,
|
||||
}
|
||||
|
||||
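A quick sketch of what _analyze_content_structure reports for a small markdown snippet; the sample text is invented purely for illustration:

# Hedged example: the markdown sample is illustrative.
sample = "# Guide\n\n## Install\n\n```bash\npip install demo\n```\n\nSee [docs](https://example.com)."
info = FirecrawlIngestor._analyze_content_structure(sample)
# info["heading_hierarchy"] lists the headings with indentation encoding their level
# info["has_code_blocks"] and info["has_links"] are True; info["has_images"] is False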
@staticmethod
|
||||
def _calculate_content_quality(content: str, title: str | None) -> dict[str, float | None]:
|
||||
"""Calculate basic content quality metrics."""
|
||||
if not content:
|
||||
return {"readability_score": None, "completeness_score": None}
|
||||
|
||||
# Simple readability approximation (Flesch-like)
|
||||
sentences = len(re.findall(r"[.!?]+", content))
|
||||
words = len(content.split())
|
||||
|
||||
if sentences == 0 or words == 0:
|
||||
readability_score = None
|
||||
else:
|
||||
avg_sentence_length = words / sentences
|
||||
# Simplified readability score (0-100, higher is more readable)
|
||||
readability_score = max(0, min(100, 100 - (avg_sentence_length - 15) * 2))
|
||||
|
||||
# Completeness score based on structure
|
||||
completeness_factors = 0
|
||||
total_factors = 5
|
||||
|
||||
if title:
|
||||
completeness_factors += 1
|
||||
if len(content) > 500:
|
||||
completeness_factors += 1
|
||||
if re.search(r"^#{1,6}\s+", content, re.MULTILINE):
|
||||
completeness_factors += 1
|
||||
if len(content.split()) > 100:
|
||||
completeness_factors += 1
|
||||
if not re.search(r"(error|404|not found|page not found)", content, re.IGNORECASE):
|
||||
completeness_factors += 1
|
||||
|
||||
completeness_score = (completeness_factors / total_factors) * 100
|
||||
|
||||
return {
|
||||
"readability_score": readability_score,
|
||||
"completeness_score": completeness_score,
|
||||
}
|
||||
|
||||
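A worked example of the heuristic above: with 200 words across 10 sentences the average sentence length is 20, so readability = max(0, min(100, 100 - (20 - 15) * 2)) = 90. Completeness awards 20 points per satisfied factor (title present, more than 500 characters, at least one heading, more than 100 words, no error-page markers), so 4 of 5 factors yields a completeness_score of 80.0.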
@staticmethod
|
||||
def _extract_domain_info(url: str) -> dict[str, str]:
|
||||
"""Extract domain and site information from URL."""
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc.lower()
|
||||
|
||||
# Remove www. prefix
|
||||
if domain.startswith("www."):
|
||||
domain = domain[4:]
|
||||
|
||||
# Extract site name from domain
|
||||
domain_parts = domain.split(".")
|
||||
site_name = domain_parts[0].replace("-", " ").replace("_", " ").title()
|
||||
|
||||
return {
|
||||
"domain": domain,
|
||||
"site_name": site_name,
|
||||
}
|
||||
|
||||
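For illustration, "https://www.my-docs.example.org/guide" parses to domain "my-docs.example.org" (the "www." prefix is stripped) and site_name "My Docs" (the first domain label with "-" and "_" replaced by spaces and title-cased).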
def create_document(self, page: FirecrawlPage, job: IngestionJob) -> Document:
|
||||
"""
|
||||
Create a Document from scraped data.
|
||||
Create a Document from scraped data with enriched metadata.
|
||||
|
||||
Args:
|
||||
doc_data: Scraped document data
|
||||
page: Scraped document data
|
||||
job: The ingestion job
|
||||
|
||||
Returns:
|
||||
Document instance
|
||||
Document instance with rich metadata
|
||||
"""
|
||||
content = doc_data.get("markdown", "")
|
||||
content = page.content
|
||||
source_url = page.url
|
||||
|
||||
# Analyze content structure
|
||||
structure_info = self._analyze_content_structure(content)
|
||||
|
||||
# Calculate quality metrics
|
||||
quality_info = self._calculate_content_quality(content, page.title)
|
||||
|
||||
# Extract domain information
|
||||
domain_info = self._extract_domain_info(source_url)
|
||||
|
||||
# Build rich metadata
|
||||
metadata: DocumentMetadata = {
|
||||
"source_url": doc_data.get("sourceURL", str(job.source_url)),
|
||||
"title": doc_data.get("title"),
|
||||
"description": doc_data.get("description"),
|
||||
# Core required fields
|
||||
"source_url": source_url,
|
||||
"timestamp": datetime.now(UTC),
|
||||
"content_type": "text/markdown",
|
||||
"word_count": len(content.split()),
|
||||
"char_count": len(content),
|
||||
# Basic optional fields
|
||||
"title": page.title or f"Page from {source_url}",
|
||||
"description": page.description
|
||||
or page.og_description
|
||||
or f"Content scraped from {source_url}",
|
||||
# Content categorization
|
||||
"tags": page.keywords or [],
|
||||
"language": page.language or "en",
|
||||
# Authorship and source info
|
||||
"author": page.author or page.twitter_creator or "Unknown",
|
||||
"domain": domain_info["domain"],
|
||||
"site_name": domain_info["site_name"],
|
||||
# Document structure
|
||||
"heading_hierarchy": structure_info["heading_hierarchy"],
|
||||
"section_depth": structure_info["section_depth"],
|
||||
"has_code_blocks": structure_info["has_code_blocks"],
|
||||
"has_images": structure_info["has_images"],
|
||||
"has_links": structure_info["has_links"],
|
||||
# Processing metadata
|
||||
"extraction_method": "firecrawl",
|
||||
"last_modified": datetime.fromisoformat(page.sitemap_last_modified)
|
||||
if page.sitemap_last_modified
|
||||
else None,
|
||||
# Content quality indicators
|
||||
"readability_score": quality_info["readability_score"],
|
||||
"completeness_score": quality_info["completeness_score"],
|
||||
}
|
||||
|
||||
# Note: Additional web-specific metadata like og_title, twitter_card etc.
|
||||
# would need to be added to DocumentMetadata TypedDict if needed
|
||||
|
||||
return Document(
|
||||
id=uuid4(),
|
||||
id=self.compute_document_id(source_url),
|
||||
content=content,
|
||||
metadata=metadata,
|
||||
source=IngestionSource.WEB,
|
||||
collection=job.storage_backend.value,
|
||||
)
|
||||
|
||||
async def close(self) -> None:
|
||||
"""Close the Firecrawl client and cleanup resources."""
|
||||
# AsyncFirecrawl may not have explicit close method in all versions
|
||||
# This is defensive cleanup following best practices
|
||||
if hasattr(self.client, "close"):
|
||||
try:
|
||||
await self.client.close()
|
||||
except Exception as e:
|
||||
logging.debug(f"Error closing Firecrawl client: {e}")
|
||||
elif hasattr(self.client, "_session") and hasattr(self.client._session, "close"):
|
||||
try:
|
||||
await self.client._session.close()
|
||||
except Exception as e:
|
||||
logging.debug(f"Error closing Firecrawl session: {e}")
|
||||
|
||||
async def __aenter__(self) -> "FirecrawlIngestor":
|
||||
"""Async context manager entry."""
|
||||
return self
|
||||
|
||||
async def __aexit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_val: BaseException | None,
|
||||
exc_tb: object | None,
|
||||
) -> None:
|
||||
"""Async context manager exit with cleanup."""
|
||||
await self.close()
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
import asyncio
|
||||
import subprocess
|
||||
import tempfile
|
||||
import re
|
||||
from collections.abc import AsyncGenerator
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
@@ -24,7 +25,7 @@ from .base import BaseIngestor
|
||||
class RepomixIngestor(BaseIngestor):
|
||||
"""Ingestor for Git repositories using Repomix."""
|
||||
|
||||
config: RepomixConfig
|
||||
config: RepomixConfig # Explicit type annotation
|
||||
|
||||
def __init__(self, config: RepomixConfig | None = None):
|
||||
"""
|
||||
@@ -77,7 +78,9 @@ class RepomixIngestor(BaseIngestor):
|
||||
["git", "ls-remote", "--heads", source_url], timeout=10
|
||||
)
|
||||
return result.returncode == 0
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
import logging
|
||||
logging.warning(f"Failed to validate repository {source_url}: {e}")
|
||||
return False
|
||||
|
||||
@override
|
||||
@@ -94,16 +97,21 @@ class RepomixIngestor(BaseIngestor):
|
||||
try:
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
# Shallow clone to get file count
|
||||
repo_path = await self._clone_repository(source_url, temp_dir, shallow=True)
|
||||
repo_path = await self._clone_repository(
|
||||
source_url, temp_dir, shallow=True
|
||||
)
|
||||
|
||||
# Count files matching patterns
|
||||
file_count = 0
|
||||
# Type checker now knows self.config is RepomixConfig, and include_patterns is list[str]
|
||||
for pattern in self.config.include_patterns:
|
||||
files = list(Path(repo_path).rglob(pattern))
|
||||
file_count += len(files)
|
||||
|
||||
return file_count
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
import logging
|
||||
logging.warning(f"Failed to estimate size for repository {source_url}: {e}")
|
||||
return 0
|
||||
|
||||
async def _clone_repository(
|
||||
@@ -166,14 +174,14 @@ class RepomixIngestor(BaseIngestor):
|
||||
result = await self._run_command(cmd, cwd=str(repo_path), timeout=120)
|
||||
|
||||
if result.returncode != 0:
|
||||
stderr_text = (
|
||||
result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
|
||||
)
|
||||
stderr_text = result.stderr.decode()
|
||||
raise IngestionError(f"Repomix failed: {stderr_text}")
|
||||
|
||||
return output_file
|
||||
|
||||
async def _parse_repomix_output(self, output_file: Path, job: IngestionJob) -> list[Document]:
|
||||
async def _parse_repomix_output(
|
||||
self, output_file: Path, job: IngestionJob
|
||||
) -> list[Document]:
|
||||
"""
|
||||
Parse repomix output into documents.
|
||||
|
||||
@@ -184,11 +192,15 @@ class RepomixIngestor(BaseIngestor):
|
||||
Returns:
|
||||
List of documents
|
||||
"""
|
||||
documents = []
|
||||
documents: list[Document] = []
|
||||
|
||||
try:
|
||||
content = output_file.read_text()
|
||||
# Get repository metadata
|
||||
repo_path = output_file.parent
|
||||
git_metadata = await self._get_git_metadata(repo_path)
|
||||
repo_info = self._extract_repository_info(str(job.source_url))
|
||||
|
||||
content = output_file.read_text()
|
||||
# Split by file markers (repomix uses specific delimiters)
|
||||
file_sections = self._split_by_files(content)
|
||||
|
||||
@@ -197,10 +209,16 @@ class RepomixIngestor(BaseIngestor):
|
||||
# Split large files into chunks
|
||||
chunks = self._chunk_content(file_content)
|
||||
for i, chunk in enumerate(chunks):
|
||||
doc = self._create_document(file_path, chunk, job, chunk_index=i)
|
||||
doc = self._create_document(
|
||||
file_path, chunk, job, chunk_index=i,
|
||||
git_metadata=git_metadata, repo_info=repo_info
|
||||
)
|
||||
documents.append(doc)
|
||||
else:
|
||||
doc = self._create_document(file_path, file_content, job)
|
||||
doc = self._create_document(
|
||||
file_path, file_content, job,
|
||||
git_metadata=git_metadata, repo_info=repo_info
|
||||
)
|
||||
documents.append(doc)
|
||||
|
||||
except Exception as e:
|
||||
@@ -274,31 +292,243 @@ class RepomixIngestor(BaseIngestor):
|
||||
|
||||
return chunks
|
||||
|
||||
@staticmethod
|
||||
def _detect_programming_language(file_path: str, content: str) -> str | None:
|
||||
"""Detect programming language from file extension and content."""
|
||||
path = Path(file_path)
|
||||
extension = path.suffix.lower()
|
||||
|
||||
# Map common extensions to languages
|
||||
ext_map = {
|
||||
'.py': 'python',
|
||||
'.js': 'javascript',
|
||||
'.ts': 'typescript',
|
||||
'.jsx': 'javascript',
|
||||
'.tsx': 'typescript',
|
||||
'.java': 'java',
|
||||
'.c': 'c',
|
||||
'.cpp': 'cpp',
|
||||
'.cc': 'cpp',
|
||||
'.cxx': 'cpp',
|
||||
'.h': 'c',
|
||||
'.hpp': 'cpp',
|
||||
'.cs': 'csharp',
|
||||
'.go': 'go',
|
||||
'.rs': 'rust',
|
||||
'.php': 'php',
|
||||
'.rb': 'ruby',
|
||||
'.swift': 'swift',
|
||||
'.kt': 'kotlin',
|
||||
'.scala': 'scala',
|
||||
'.sh': 'shell',
|
||||
'.bash': 'shell',
|
||||
'.zsh': 'shell',
|
||||
'.sql': 'sql',
|
||||
'.html': 'html',
|
||||
'.css': 'css',
|
||||
'.scss': 'scss',
|
||||
'.less': 'less',
|
||||
'.yaml': 'yaml',
|
||||
'.yml': 'yaml',
|
||||
'.json': 'json',
|
||||
'.xml': 'xml',
|
||||
'.md': 'markdown',
|
||||
'.txt': 'text',
|
||||
'.cfg': 'config',
|
||||
'.ini': 'config',
|
||||
'.toml': 'toml',
|
||||
}
|
||||
|
||||
if extension in ext_map:
|
||||
return ext_map[extension]
|
||||
|
||||
# Try to detect from shebang
|
||||
if content.startswith('#!'):
|
||||
first_line = content.split('\n')[0]
|
||||
if 'python' in first_line:
|
||||
return 'python'
|
||||
elif 'node' in first_line or 'javascript' in first_line:
|
||||
return 'javascript'
|
||||
elif 'bash' in first_line or 'sh' in first_line:
|
||||
return 'shell'
|
||||
|
||||
return None
|
||||
|
||||
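A small sketch of the detection order above: the extension map wins when it matches, and the shebang check is only a fallback for extensionless files (the file names and contents are invented):

# Hedged example: file names and contents are illustrative.
assert RepomixIngestor._detect_programming_language("src/app.ts", "") == "typescript"
assert RepomixIngestor._detect_programming_language("bin/run", "#!/usr/bin/env python\nprint('hi')") == "python"
assert RepomixIngestor._detect_programming_language("LICENSE", "plain text") is None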
@staticmethod
|
||||
def _analyze_code_structure(content: str, language: str | None) -> dict[str, object]:
|
||||
"""Analyze code structure and extract metadata."""
|
||||
lines = content.split('\n')
|
||||
|
||||
# Basic metrics
|
||||
has_functions = False
|
||||
has_classes = False
|
||||
has_imports = False
|
||||
has_comments = False
|
||||
|
||||
# Language-specific patterns
|
||||
if language == 'python':
|
||||
has_functions = bool(re.search(r'^\s*def\s+\w+', content, re.MULTILINE))
|
||||
has_classes = bool(re.search(r'^\s*class\s+\w+', content, re.MULTILINE))
|
||||
has_imports = bool(re.search(r'^\s*(import|from)\s+', content, re.MULTILINE))
|
||||
has_comments = bool(re.search(r'^\s*#', content, re.MULTILINE))
|
||||
elif language in ['javascript', 'typescript']:
|
||||
has_functions = bool(re.search(r'(function\s+\w+|^\s*\w+\s*:\s*function|\w+\s*=>\s*)', content, re.MULTILINE))
|
||||
has_classes = bool(re.search(r'^\s*class\s+\w+', content, re.MULTILINE))
|
||||
has_imports = bool(re.search(r'^\s*(import|require)', content, re.MULTILINE))
|
||||
has_comments = bool(re.search(r'//|/\*', content))
|
||||
elif language == 'java':
|
||||
has_functions = bool(re.search(r'(public|private|protected).*\w+\s*\(', content, re.MULTILINE))
|
||||
has_classes = bool(re.search(r'(public|private)?\s*class\s+\w+', content, re.MULTILINE))
|
||||
has_imports = bool(re.search(r'^\s*import\s+', content, re.MULTILINE))
|
||||
has_comments = bool(re.search(r'//|/\*', content))
|
||||
|
||||
return {
|
||||
'has_functions': has_functions,
|
||||
'has_classes': has_classes,
|
||||
'has_imports': has_imports,
|
||||
'has_comments': has_comments,
|
||||
'line_count': len(lines),
|
||||
'non_empty_lines': len([line for line in lines if line.strip()]),
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _extract_repository_info(repo_url: str) -> dict[str, str]:
|
||||
"""Extract repository information from URL."""
|
||||
# Parse GitHub/GitLab style URLs
|
||||
repo_name = None
|
||||
org_name = None
|
||||
|
||||
# Handle different URL formats
|
||||
if 'github.com' in repo_url or 'gitlab.com' in repo_url:
|
||||
# Extract from URLs like https://github.com/org/repo.git
|
||||
path_match = re.search(r'/([^/]+)/([^/]+?)(?:\.git)?/?$', repo_url)
|
||||
if path_match:
|
||||
org_name = path_match.group(1)
|
||||
repo_name = path_match.group(2)
|
||||
else:
|
||||
# Try to extract from generic git URLs
|
||||
path_match = re.search(r'/([^/]+?)(?:\.git)?/?$', repo_url)
|
||||
if path_match:
|
||||
repo_name = path_match.group(1)
|
||||
|
||||
return {
|
||||
'repository_name': repo_name or 'unknown',
|
||||
'organization': org_name or 'unknown',
|
||||
}
|
||||
|
||||
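For example, "https://github.com/acme/widgets.git" yields organization "acme" and repository_name "widgets", while a self-hosted URL such as "https://git.example.com/tools.git" falls through to the generic pattern and recovers only repository_name "tools", leaving organization as "unknown".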
async def _get_git_metadata(self, repo_path: Path) -> dict[str, str | None]:
|
||||
"""Get Git metadata from repository."""
|
||||
try:
|
||||
# Get current branch
|
||||
branch_result = await self._run_command(
|
||||
['git', 'rev-parse', '--abbrev-ref', 'HEAD'],
|
||||
cwd=str(repo_path),
|
||||
timeout=5
|
||||
)
|
||||
branch_name = branch_result.stdout.decode().strip() if branch_result.returncode == 0 else None
|
||||
|
||||
# Get current commit hash
|
||||
commit_result = await self._run_command(
|
||||
['git', 'rev-parse', 'HEAD'],
|
||||
cwd=str(repo_path),
|
||||
timeout=5
|
||||
)
|
||||
commit_hash = commit_result.stdout.decode().strip() if commit_result.returncode == 0 else None
|
||||
|
||||
return {
|
||||
'branch_name': branch_name,
|
||||
'commit_hash': commit_hash[:8] if commit_hash else None, # Short hash
|
||||
}
|
||||
except Exception:
|
||||
return {'branch_name': None, 'commit_hash': None}
|
||||
|
||||
def _create_document(
|
||||
self, file_path: str, content: str, job: IngestionJob, chunk_index: int = 0
|
||||
self, file_path: str, content: str, job: IngestionJob, chunk_index: int = 0,
|
||||
git_metadata: dict[str, str | None] | None = None,
|
||||
repo_info: dict[str, str] | None = None
|
||||
) -> Document:
|
||||
"""
|
||||
Create a Document from repository content.
|
||||
Create a Document from repository content with enriched metadata.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file in repository
|
||||
content: File content
|
||||
job: The ingestion job
|
||||
chunk_index: Index if content is chunked
|
||||
git_metadata: Git repository metadata
|
||||
repo_info: Repository information
|
||||
|
||||
Returns:
|
||||
Document instance
|
||||
Document instance with rich metadata
|
||||
"""
|
||||
# Detect programming language
|
||||
programming_language = self._detect_programming_language(file_path, content)
|
||||
|
||||
# Analyze code structure
|
||||
structure_info = self._analyze_code_structure(content, programming_language)
|
||||
|
||||
# Determine content type based on language
|
||||
content_type_map = {
|
||||
'python': 'text/x-python',
|
||||
'javascript': 'text/javascript',
|
||||
'typescript': 'text/typescript',
|
||||
'java': 'text/x-java-source',
|
||||
'html': 'text/html',
|
||||
'css': 'text/css',
|
||||
'json': 'application/json',
|
||||
'yaml': 'text/yaml',
|
||||
'markdown': 'text/markdown',
|
||||
'shell': 'text/x-shellscript',
|
||||
'sql': 'text/x-sql',
|
||||
}
|
||||
content_type = content_type_map.get(programming_language or 'text', 'text/plain')
|
||||
|
||||
# Build rich metadata
|
||||
metadata: DocumentMetadata = {
|
||||
# Core required fields
|
||||
"source_url": str(job.source_url),
|
||||
"title": f"{file_path}" + (f" (chunk {chunk_index})" if chunk_index > 0 else ""),
|
||||
"description": f"Repository file: {file_path}",
|
||||
"timestamp": datetime.now(UTC),
|
||||
"content_type": "text/plain",
|
||||
"content_type": content_type,
|
||||
"word_count": len(content.split()),
|
||||
"char_count": len(content),
|
||||
|
||||
# Basic fields
|
||||
"title": f"{file_path}" + (f" (chunk {chunk_index})" if chunk_index > 0 else ""),
|
||||
"description": f"Repository file: {file_path}",
|
||||
|
||||
# Content categorization
|
||||
"category": "source_code" if programming_language else "documentation",
|
||||
"language": programming_language or "text",
|
||||
|
||||
# Document structure from code analysis
|
||||
"has_code_blocks": True if programming_language else False,
|
||||
|
||||
# Processing metadata
|
||||
"extraction_method": "repomix",
|
||||
|
||||
# Repository-specific fields
|
||||
"file_path": file_path,
|
||||
"programming_language": programming_language,
|
||||
}
|
||||
|
||||
# Add repository info if available
|
||||
if repo_info:
|
||||
metadata["repository_name"] = repo_info.get('repository_name')
|
||||
|
||||
# Add git metadata if available
|
||||
if git_metadata:
|
||||
metadata["branch_name"] = git_metadata.get('branch_name')
|
||||
metadata["commit_hash"] = git_metadata.get('commit_hash')
|
||||
|
||||
# Add code-specific metadata for programming files
|
||||
if programming_language and structure_info:
|
||||
# Calculate code quality score
|
||||
total_lines = structure_info.get('line_count', 1)
|
||||
non_empty_lines = structure_info.get('non_empty_lines', 0)
|
||||
if isinstance(total_lines, int) and isinstance(non_empty_lines, int) and total_lines > 0:
|
||||
completeness_score = (non_empty_lines / total_lines) * 100
|
||||
metadata["completeness_score"] = completeness_score
|
||||
|
||||
return Document(
|
||||
id=uuid4(),
|
||||
content=content,
|
||||
@@ -322,7 +552,10 @@ class RepomixIngestor(BaseIngestor):
|
||||
Completed process result
|
||||
"""
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, cwd=cwd
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
cwd=cwd,
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -336,4 +569,15 @@ class RepomixIngestor(BaseIngestor):
|
||||
)
|
||||
except TimeoutError as e:
|
||||
proc.kill()
|
||||
# Wait for the process to actually terminate
|
||||
try:
|
||||
await asyncio.wait_for(proc.wait(), timeout=5.0)
|
||||
except TimeoutError:
|
||||
# Force terminate if it doesn't respond to kill
|
||||
proc.terminate()
|
||||
try:
|
||||
await asyncio.wait_for(proc.wait(), timeout=2.0)
|
||||
except TimeoutError:
|
||||
import logging
|
||||
logging.warning(f"Process {proc.pid} did not terminate cleanly")
|
||||
raise IngestionError(f"Command timed out: {' '.join(cmd)}") from e
|
||||
|
||||
@@ -1,11 +1,24 @@
"""Storage adapters for different backends."""

from typing import TYPE_CHECKING

from .base import BaseStorage
from .openwebui import OpenWebUIStorage
from .weaviate import WeaviateStorage

if TYPE_CHECKING:
    from .r2r import R2RStorage as _R2RStorage

try:
    from .r2r.storage import R2RStorage as _RuntimeR2RStorage
    R2RStorage: type[BaseStorage] | None = _RuntimeR2RStorage
except ImportError:
    R2RStorage = None

__all__ = [
    "BaseStorage",
    "WeaviateStorage",
    "OpenWebUIStorage",
    "R2RStorage",
    "_R2RStorage",
]
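A consumer-side sketch of the optional backend registered above; the None guard on R2RStorage comes from this diff, while the package path and config construction are placeholders:

# Hedged sketch: `build_r2r_config()` stands in for however StorageConfig is built.
from ingest_pipeline.storage import R2RStorage

if R2RStorage is None:
    raise RuntimeError("R2R backend requested but the r2r extra is not installed")
storage = R2RStorage(build_r2r_config())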
@@ -20,13 +20,18 @@ class BaseStorage(ABC):
|
||||
"""
|
||||
self.config = config
|
||||
|
||||
@property
|
||||
def display_name(self) -> str:
|
||||
"""Human-readable name for UI display."""
|
||||
return self.__class__.__name__.replace("Storage", "")
|
||||
|
||||
@abstractmethod
|
||||
async def initialize(self) -> None:
|
||||
"""Initialize the storage backend and create collections if needed."""
|
||||
pass # pragma: no cover
|
||||
|
||||
@abstractmethod
|
||||
async def store(self, document: Document) -> str:
|
||||
async def store(self, document: Document, *, collection_name: str | None = None) -> str:
|
||||
"""
|
||||
Store a single document.
|
||||
|
||||
@@ -39,7 +44,9 @@ class BaseStorage(ABC):
|
||||
pass # pragma: no cover
|
||||
|
||||
@abstractmethod
|
||||
async def store_batch(self, documents: list[Document]) -> list[str]:
|
||||
async def store_batch(
|
||||
self, documents: list[Document], *, collection_name: str | None = None
|
||||
) -> list[str]:
|
||||
"""
|
||||
Store multiple documents in batch.
|
||||
|
||||
@@ -51,25 +58,33 @@ class BaseStorage(ABC):
|
||||
"""
|
||||
pass # pragma: no cover
|
||||
|
||||
@abstractmethod
|
||||
async def retrieve(self, document_id: str) -> Document | None:
|
||||
async def retrieve(
|
||||
self, document_id: str, *, collection_name: str | None = None
|
||||
) -> Document | None:
|
||||
"""
|
||||
Retrieve a document by ID.
|
||||
Retrieve a document by ID (if supported by backend).
|
||||
|
||||
Args:
|
||||
document_id: Document ID
|
||||
|
||||
Returns:
|
||||
Document or None if not found
|
||||
"""
|
||||
pass # pragma: no cover
|
||||
|
||||
@abstractmethod
|
||||
async def search(
|
||||
self, query: str, limit: int = 10, threshold: float = 0.7
|
||||
Raises:
|
||||
NotImplementedError: If backend doesn't support retrieval
|
||||
"""
|
||||
raise NotImplementedError(f"{self.__class__.__name__} doesn't support document retrieval")
|
||||
|
||||
def search(
|
||||
self,
|
||||
query: str,
|
||||
limit: int = 10,
|
||||
threshold: float = 0.7,
|
||||
*,
|
||||
collection_name: str | None = None,
|
||||
) -> AsyncGenerator[Document, None]:
|
||||
"""
|
||||
Search for documents.
|
||||
Search for documents (if supported by backend).
|
||||
|
||||
Args:
|
||||
query: Search query
|
||||
@@ -78,12 +93,14 @@ class BaseStorage(ABC):
|
||||
|
||||
Yields:
|
||||
Matching documents
|
||||
|
||||
Raises:
|
||||
NotImplementedError: If backend doesn't support search
|
||||
"""
|
||||
return # type: ignore # pragma: no cover
|
||||
yield # pragma: no cover
|
||||
raise NotImplementedError(f"{self.__class__.__name__} doesn't support search")
|
||||
|
||||
@abstractmethod
|
||||
async def delete(self, document_id: str) -> bool:
|
||||
async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool:
|
||||
"""
|
||||
Delete a document.
|
||||
|
||||
@@ -95,12 +112,51 @@ class BaseStorage(ABC):
|
||||
"""
|
||||
pass # pragma: no cover
|
||||
|
||||
@abstractmethod
|
||||
async def count(self) -> int:
|
||||
async def count(self, *, collection_name: str | None = None) -> int:
|
||||
"""
|
||||
Get total document count.
|
||||
Get total document count (if supported by backend).
|
||||
|
||||
Returns:
|
||||
Number of documents
|
||||
Number of documents, 0 if not supported
|
||||
"""
|
||||
pass # pragma: no cover
|
||||
return 0
|
||||
|
||||
async def list_collections(self) -> list[str]:
|
||||
"""
|
||||
List available collections (if supported by backend).
|
||||
|
||||
Returns:
|
||||
List of collection names, empty list if not supported
|
||||
"""
|
||||
return []
|
||||
|
||||
async def list_documents(
|
||||
self,
|
||||
limit: int = 100,
|
||||
offset: int = 0,
|
||||
*,
|
||||
collection_name: str | None = None,
|
||||
) -> list[dict[str, object]]:
|
||||
"""
|
||||
List documents in the storage backend (if supported).
|
||||
|
||||
Args:
|
||||
limit: Maximum number of documents to return
|
||||
offset: Number of documents to skip
|
||||
collection_name: Collection to list documents from
|
||||
|
||||
Returns:
|
||||
List of document dictionaries with metadata
|
||||
|
||||
Raises:
|
||||
NotImplementedError: If backend doesn't support document listing
|
||||
"""
|
||||
raise NotImplementedError(f"{self.__class__.__name__} doesn't support document listing")
|
||||
|
||||
async def close(self) -> None:
|
||||
"""
|
||||
Close storage connections and cleanup resources.
|
||||
|
||||
Default implementation does nothing.
|
||||
"""
|
||||
pass
|
||||
|
||||
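A minimal in-memory sketch of a backend written against the revised base class above; it implements only the methods that still appear under @abstractmethod in this hunk (initialize, store, store_batch, delete), and the dict-based storage plus the import paths are assumptions for illustration:

# Hedged sketch: an illustrative backend, not one of the adapters in this commit.
from ingest_pipeline.core.models import Document, StorageConfig
from ingest_pipeline.storage import BaseStorage


class InMemoryStorage(BaseStorage):
    """Toy adapter demonstrating the keyword-only collection_name parameters."""

    def __init__(self, config: StorageConfig):
        super().__init__(config)
        self._docs: dict[str, Document] = {}

    async def initialize(self) -> None:
        self._docs = {}

    async def store(self, document: Document, *, collection_name: str | None = None) -> str:
        self._docs[str(document.id)] = document
        return str(document.id)

    async def store_batch(
        self, documents: list[Document], *, collection_name: str | None = None
    ) -> list[str]:
        return [await self.store(doc, collection_name=collection_name) for doc in documents]

    async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool:
        return self._docs.pop(document_id, None) is not None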
@@ -1,22 +1,24 @@
|
||||
"""Open WebUI storage adapter."""
|
||||
|
||||
from collections.abc import AsyncGenerator
|
||||
from uuid import UUID
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Final, TypedDict, cast
|
||||
|
||||
import httpx
|
||||
from typing_extensions import override
|
||||
|
||||
from ..core.exceptions import StorageError
|
||||
from ..core.models import Document, StorageConfig
|
||||
from ..utils.vectorizer import Vectorizer
|
||||
from .base import BaseStorage
|
||||
|
||||
LOGGER: Final[logging.Logger] = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OpenWebUIStorage(BaseStorage):
|
||||
"""Storage adapter for Open WebUI knowledge endpoints."""
|
||||
|
||||
client: httpx.AsyncClient
|
||||
vectorizer: Vectorizer
|
||||
_knowledge_cache: dict[str, str]
|
||||
|
||||
def __init__(self, config: StorageConfig):
|
||||
"""
|
||||
@@ -27,260 +29,532 @@ class OpenWebUIStorage(BaseStorage):
|
||||
"""
|
||||
super().__init__(config)
|
||||
|
||||
headers: dict[str, str] = {}
|
||||
if config.api_key:
|
||||
headers["Authorization"] = f"Bearer {config.api_key}"
|
||||
|
||||
self.client = httpx.AsyncClient(
|
||||
base_url=str(config.endpoint),
|
||||
headers={
|
||||
"Authorization": f"Bearer {config.api_key}" if config.api_key else "",
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
headers=headers,
|
||||
timeout=30.0,
|
||||
)
|
||||
self.vectorizer = Vectorizer(config)
|
||||
self._knowledge_cache = {}
|
||||
|
||||
@override
|
||||
async def initialize(self) -> None:
|
||||
"""Initialize Open WebUI connection."""
|
||||
try:
|
||||
# Test connection with OpenWebUI knowledge API
|
||||
response = await self.client.get("/api/v1/knowledge/")
|
||||
response.raise_for_status()
|
||||
|
||||
# Check if collection (knowledge base) exists, create if not
|
||||
knowledge_bases = response.json()
|
||||
collection_exists = any(
|
||||
kb.get("name") == self.config.collection_name for kb in knowledge_bases
|
||||
)
|
||||
|
||||
if not collection_exists:
|
||||
await self._create_collection()
|
||||
if self.config.collection_name:
|
||||
await self._get_knowledge_id(
|
||||
self.config.collection_name,
|
||||
create=True,
|
||||
)
|
||||
|
||||
except httpx.ConnectError as e:
|
||||
raise StorageError(f"Connection to OpenWebUI failed: {e}") from e
|
||||
except httpx.HTTPStatusError as e:
|
||||
raise StorageError(f"OpenWebUI returned error {e.response.status_code}: {e}") from e
|
||||
except httpx.RequestError as e:
|
||||
raise StorageError(f"Request to OpenWebUI failed: {e}") from e
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to initialize Open WebUI: {e}") from e
|
||||
|
||||
async def _create_collection(self) -> None:
|
||||
async def _create_collection(self, name: str) -> str:
|
||||
"""Create knowledge base in Open WebUI."""
|
||||
try:
|
||||
response = await self.client.post(
|
||||
"/api/v1/knowledge/create",
|
||||
json={
|
||||
"name": self.config.collection_name,
|
||||
"description": "Documents ingested from various sources"
|
||||
"name": name,
|
||||
"description": "Documents ingested from various sources",
|
||||
"data": {},
|
||||
"access_control": None,
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
result = response.json()
|
||||
knowledge_id = result.get("id")
|
||||
|
||||
if not knowledge_id or not isinstance(knowledge_id, str):
|
||||
raise StorageError("Knowledge base creation failed: no ID returned")
|
||||
|
||||
return str(knowledge_id)
|
||||
|
||||
except httpx.ConnectError as e:
|
||||
raise StorageError(f"Connection to OpenWebUI failed during creation: {e}") from e
|
||||
except httpx.HTTPStatusError as e:
|
||||
raise StorageError(
|
||||
f"OpenWebUI returned error {e.response.status_code} during creation: {e}"
|
||||
) from e
|
||||
except httpx.RequestError as e:
|
||||
raise StorageError(f"Request to OpenWebUI failed during creation: {e}") from e
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to create knowledge base: {e}") from e
|
||||
|
||||
async def _fetch_knowledge_bases(self) -> list[dict[str, object]]:
|
||||
"""Return the list of knowledge bases from the API."""
|
||||
response = await self.client.get("/api/v1/knowledge/list")
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
if not isinstance(data, list):
|
||||
return []
|
||||
normalized: list[dict[str, object]] = []
|
||||
for item in data:
|
||||
if isinstance(item, dict):
|
||||
item_dict: dict[str, object] = item
|
||||
normalized.append({str(k): v for k, v in item_dict.items()})
|
||||
return normalized
|
||||
|
||||
async def _get_knowledge_id(
|
||||
self,
|
||||
name: str | None,
|
||||
*,
|
||||
create: bool,
|
||||
) -> str | None:
|
||||
"""Retrieve (and optionally create) a knowledge base identifier."""
|
||||
target_raw = name or self.config.collection_name
|
||||
target = str(target_raw) if target_raw else ""
|
||||
if not target:
|
||||
raise StorageError("Knowledge base name is required")
|
||||
|
||||
if cached := self._knowledge_cache.get(target):
|
||||
return cached
|
||||
|
||||
knowledge_bases = await self._fetch_knowledge_bases()
|
||||
for kb in knowledge_bases:
|
||||
if kb.get("name") == target:
|
||||
kb_id = kb.get("id")
|
||||
if isinstance(kb_id, str):
|
||||
self._knowledge_cache[target] = kb_id
|
||||
return kb_id
|
||||
|
||||
if not create:
|
||||
return None
|
||||
|
||||
knowledge_id = await self._create_collection(target)
|
||||
self._knowledge_cache[target] = knowledge_id
|
||||
return knowledge_id
|
||||
|
||||
@override
|
||||
async def store(self, document: Document) -> str:
|
||||
async def store(self, document: Document, *, collection_name: str | None = None) -> str:
|
||||
"""
|
||||
Store a document in Open WebUI.
|
||||
Store a document in Open WebUI as a file.
|
||||
|
||||
Args:
|
||||
document: Document to store
|
||||
|
||||
Returns:
|
||||
Document ID
|
||||
File ID
|
||||
"""
|
||||
try:
|
||||
# Vectorize if needed
|
||||
if document.vector is None:
|
||||
document.vector = await self.vectorizer.vectorize(document.content)
|
||||
knowledge_id = await self._get_knowledge_id(
|
||||
collection_name,
|
||||
create=True,
|
||||
)
|
||||
if not knowledge_id:
|
||||
raise StorageError("Knowledge base not initialized")
|
||||
|
||||
# Prepare document data
|
||||
doc_data = {
|
||||
"id": str(document.id),
|
||||
"collection": self.config.collection_name,
|
||||
"content": document.content,
|
||||
"metadata": {
|
||||
**document.metadata,
|
||||
"timestamp": document.metadata["timestamp"].isoformat(),
|
||||
"source": document.source.value,
|
||||
},
|
||||
"embedding": document.vector,
|
||||
}
|
||||
|
||||
# Store document
|
||||
# Step 1: Upload document as file
|
||||
# Use document title from metadata if available, otherwise fall back to ID
|
||||
filename = document.metadata.get("title") or f"doc_{document.id}"
|
||||
# Ensure filename has proper extension
|
||||
if not filename.endswith(('.txt', '.md', '.pdf', '.doc', '.docx')):
|
||||
filename = f"{filename}.txt"
|
||||
files = {"file": (filename, document.content.encode(), "text/plain")}
|
||||
response = await self.client.post(
|
||||
f"/api/knowledge/collections/{self.config.collection_name}/documents", json=doc_data
|
||||
"/api/v1/files/",
|
||||
files=files,
|
||||
params={"process": True, "process_in_background": False},
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
result = response.json()
|
||||
document_id = result.get("id") if isinstance(result, dict) else None
|
||||
return str(document_id) if document_id else str(document.id)
|
||||
file_data = response.json()
|
||||
file_id = file_data.get("id")
|
||||
|
||||
if not file_id or not isinstance(file_id, str):
|
||||
raise StorageError("File upload failed: no file ID returned")
|
||||
|
||||
# Step 2: Add file to knowledge base
|
||||
response = await self.client.post(
|
||||
f"/api/v1/knowledge/{knowledge_id}/file/add", json={"file_id": file_id}
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
return str(file_id)
|
||||
|
||||
except httpx.ConnectError as e:
|
||||
raise StorageError(f"Connection to OpenWebUI failed: {e}") from e
|
||||
except httpx.HTTPStatusError as e:
|
||||
raise StorageError(f"OpenWebUI returned error {e.response.status_code}: {e}") from e
|
||||
except httpx.RequestError as e:
|
||||
raise StorageError(f"Request to OpenWebUI failed: {e}") from e
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to store document: {e}") from e
|
||||
|
||||
@override
|
||||
async def store_batch(self, documents: list[Document]) -> list[str]:
|
||||
async def store_batch(
|
||||
self, documents: list[Document], *, collection_name: str | None = None
|
||||
) -> list[str]:
|
||||
"""
|
||||
Store multiple documents in batch.
|
||||
Store multiple documents as files in batch.
|
||||
|
||||
Args:
|
||||
documents: List of documents
|
||||
|
||||
Returns:
|
||||
List of document IDs
|
||||
List of file IDs
|
||||
"""
|
||||
try:
|
||||
# Vectorize documents without vectors
|
||||
for doc in documents:
|
||||
if doc.vector is None:
|
||||
doc.vector = await self.vectorizer.vectorize(doc.content)
|
||||
knowledge_id = await self._get_knowledge_id(
|
||||
collection_name,
|
||||
create=True,
|
||||
)
|
||||
if not knowledge_id:
|
||||
raise StorageError("Knowledge base not initialized")
|
||||
|
||||
# Prepare batch data
|
||||
batch_data = []
|
||||
for doc in documents:
|
||||
batch_data.append(
|
||||
{
|
||||
"id": str(doc.id),
|
||||
"content": doc.content,
|
||||
"metadata": {
|
||||
**doc.metadata,
|
||||
"timestamp": doc.metadata["timestamp"].isoformat(),
|
||||
"source": doc.source.value,
|
||||
},
|
||||
"embedding": doc.vector,
|
||||
}
|
||||
async def upload_and_attach(doc: Document) -> str:
|
||||
# Use document title from metadata if available, otherwise fall back to ID
|
||||
filename = doc.metadata.get("title") or f"doc_{doc.id}"
|
||||
# Ensure filename has proper extension
|
||||
if not filename.endswith(('.txt', '.md', '.pdf', '.doc', '.docx')):
|
||||
filename = f"{filename}.txt"
|
||||
files = {"file": (filename, doc.content.encode(), "text/plain")}
|
||||
upload_response = await self.client.post(
|
||||
"/api/v1/files/",
|
||||
files=files,
|
||||
params={"process": True, "process_in_background": False},
|
||||
)
|
||||
upload_response.raise_for_status()
|
||||
|
||||
file_data = upload_response.json()
|
||||
file_id = file_data.get("id")
|
||||
|
||||
if not file_id or not isinstance(file_id, str):
|
||||
raise StorageError(
|
||||
f"File upload failed for document {doc.id}: no file ID returned"
|
||||
)
|
||||
|
||||
attach_response = await self.client.post(
|
||||
f"/api/v1/knowledge/{knowledge_id}/file/add", json={"file_id": file_id}
|
||||
)
|
||||
attach_response.raise_for_status()
|
||||
|
||||
return str(file_id)
|
||||
|
||||
tasks = [upload_and_attach(doc) for doc in documents]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
file_ids: list[str] = []
|
||||
failures: list[str] = []
|
||||
|
||||
for index, result in enumerate(results):
|
||||
doc = documents[index]
|
||||
if isinstance(result, Exception):
|
||||
failures.append(f"{doc.id}: {result}")
|
||||
else:
|
||||
file_ids.append(cast(str, result))
|
||||
|
||||
if failures:
|
||||
LOGGER.warning(
|
||||
"OpenWebUI partial batch failure for knowledge base %s: %s",
|
||||
self.config.collection_name,
|
||||
", ".join(failures),
|
||||
)
|
||||
|
||||
# Store batch
|
||||
response = await self.client.post(
|
||||
f"/api/knowledge/collections/{self.config.collection_name}/documents/batch",
|
||||
json={"documents": batch_data},
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
result = response.json()
|
||||
ids = result.get("ids") if isinstance(result, dict) else None
|
||||
return ids if isinstance(ids, list) else [str(doc.id) for doc in documents]
|
||||
return file_ids
|
||||
|
||||
except httpx.ConnectError as e:
|
||||
raise StorageError(f"Connection to OpenWebUI failed during batch: {e}") from e
|
||||
except httpx.HTTPStatusError as e:
|
||||
raise StorageError(
|
||||
f"OpenWebUI returned error {e.response.status_code} during batch: {e}"
|
||||
) from e
|
||||
except httpx.RequestError as e:
|
||||
raise StorageError(f"Request to OpenWebUI failed during batch: {e}") from e
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to store batch: {e}") from e
|
||||
|
||||
@override
|
||||
async def retrieve(self, document_id: str) -> Document | None:
|
||||
async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool:
|
||||
"""
|
||||
Retrieve a document from Open WebUI.
|
||||
Remove a file from Open WebUI knowledge base.
|
||||
|
||||
Args:
|
||||
document_id: Document ID
|
||||
document_id: File ID to remove
|
||||
|
||||
Returns:
|
||||
Document or None
|
||||
True if removed successfully
|
||||
"""
|
||||
try:
|
||||
response = await self.client.get(
|
||||
f"/api/knowledge/collections/{self.config.collection_name}/documents/{document_id}"
|
||||
knowledge_id = await self._get_knowledge_id(
|
||||
collection_name,
|
||||
create=False,
|
||||
)
|
||||
if not knowledge_id:
|
||||
return False
|
||||
|
||||
if response.status_code == 404:
|
||||
return None
|
||||
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
# Reconstruct document
|
||||
metadata = data.get("metadata", {})
|
||||
return Document(
|
||||
id=UUID(document_id),
|
||||
content=data["content"],
|
||||
metadata=metadata,
|
||||
vector=data.get("embedding"),
|
||||
source=metadata.get("source", "unknown"),
|
||||
collection=self.config.collection_name,
|
||||
)
|
||||
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
@override
|
||||
async def search(
|
||||
self, query: str, limit: int = 10, threshold: float = 0.7
|
||||
) -> AsyncGenerator[Document, None]:
|
||||
"""
|
||||
Search for documents in Open WebUI.
|
||||
|
||||
Args:
|
||||
query: Search query
|
||||
limit: Maximum results
|
||||
threshold: Similarity threshold
|
||||
|
||||
Yields:
|
||||
Matching documents
|
||||
"""
|
||||
try:
|
||||
# Vectorize query
|
||||
query_vector = await self.vectorizer.vectorize(query)
|
||||
|
||||
# Perform search
|
||||
# Remove file from knowledge base
|
||||
response = await self.client.post(
|
||||
f"/api/knowledge/collections/{self.config.collection_name}/search",
|
||||
json={
|
||||
"query": query,
|
||||
"embedding": query_vector,
|
||||
"limit": limit,
|
||||
"threshold": threshold,
|
||||
},
|
||||
f"/api/v1/knowledge/{knowledge_id}/file/remove", json={"file_id": document_id}
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
results = response.json()
|
||||
delete_response = await self.client.delete(f"/api/v1/files/{document_id}")
|
||||
if delete_response.status_code == 404:
|
||||
return True
|
||||
delete_response.raise_for_status()
|
||||
return True
|
||||
|
||||
for result in results.get("documents", []):
|
||||
metadata = result.get("metadata", {})
|
||||
doc = Document(
|
||||
id=result["id"],
|
||||
content=result["content"],
|
||||
metadata=metadata,
|
||||
vector=result.get("embedding"),
|
||||
source=metadata.get("source", "unknown"),
|
||||
collection=self.config.collection_name,
|
||||
)
|
||||
yield doc
|
||||
|
||||
except Exception as e:
|
||||
raise StorageError(f"Search failed: {e}") from e
|
||||
|
||||
async def delete(self, document_id: str) -> bool:
|
||||
"""
|
||||
Delete a document from Open WebUI.
|
||||
|
||||
Args:
|
||||
document_id: Document ID
|
||||
|
||||
Returns:
|
||||
True if deleted
|
||||
"""
|
||||
try:
|
||||
response = await self.client.delete(
|
||||
f"/api/knowledge/collections/{self.config.collection_name}/documents/{document_id}"
|
||||
except httpx.ConnectError as exc:
|
||||
LOGGER.error(
|
||||
"Failed to reach OpenWebUI when deleting file %s", document_id, exc_info=exc
|
||||
)
|
||||
return response.status_code in [200, 204]
|
||||
except Exception:
|
||||
return False
|
||||
except httpx.HTTPStatusError as exc:
|
||||
LOGGER.error(
|
||||
"OpenWebUI returned status error %s when deleting file %s",
|
||||
exc.response.status_code if exc.response else "unknown",
|
||||
document_id,
|
||||
exc_info=exc,
|
||||
)
|
||||
return False
|
||||
except httpx.RequestError as exc:
|
||||
LOGGER.error("Request error deleting file %s from OpenWebUI", document_id, exc_info=exc)
|
||||
return False
|
||||
except Exception as exc:
|
||||
LOGGER.error("Unexpected error deleting file %s", document_id, exc_info=exc)
|
||||
return False
|
||||
|
||||
    async def list_collections(self) -> list[str]:
        """
        List all available knowledge bases.

        Returns:
            List of knowledge base names
        """
        try:
            knowledge_bases = await self._fetch_knowledge_bases()

            # Extract names from knowledge bases
            return [
                str(kb.get("name", f"knowledge_{kb.get('id', 'unknown')}") or "")
                for kb in knowledge_bases
            ]

        except httpx.ConnectError as e:
            raise StorageError(f"Connection to OpenWebUI failed: {e}") from e
        except httpx.HTTPStatusError as e:
            raise StorageError(f"OpenWebUI returned error {e.response.status_code}: {e}") from e
        except httpx.RequestError as e:
            raise StorageError(f"Request to OpenWebUI failed: {e}") from e
        except Exception as e:
            raise StorageError(f"Failed to list knowledge bases: {e}") from e
|
||||
|
||||
async def delete_collection(self, collection_name: str) -> bool:
|
||||
"""
|
||||
Delete a knowledge base by name.
|
||||
|
||||
Args:
|
||||
collection_name: Name of the knowledge base to delete
|
||||
|
||||
Returns:
|
||||
True if deleted successfully, False otherwise
|
||||
"""
|
||||
try:
|
||||
knowledge_id = await self._get_knowledge_id(collection_name, create=False)
|
||||
if not knowledge_id:
|
||||
# Collection doesn't exist, consider it already deleted
|
||||
return True
|
||||
|
||||
# Delete the knowledge base using the OpenWebUI API
|
||||
response = await self.client.delete(f"/api/v1/knowledge/{knowledge_id}/delete")
|
||||
response.raise_for_status()
|
||||
|
||||
# Remove from cache if it exists
|
||||
if collection_name in self._knowledge_cache:
|
||||
del self._knowledge_cache[collection_name]
|
||||
|
||||
LOGGER.info("Successfully deleted knowledge base: %s", collection_name)
|
||||
return True
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
# Handle 404 as success (already deleted)
|
||||
if e.response.status_code == 404:
|
||||
LOGGER.info("Knowledge base %s was already deleted or not found", collection_name)
|
||||
return True
|
||||
LOGGER.error(
|
||||
"OpenWebUI returned error %s when deleting knowledge base %s",
|
||||
e.response.status_code,
|
||||
collection_name,
|
||||
exc_info=e,
|
||||
)
|
||||
return False
|
||||
except httpx.ConnectError as e:
|
||||
LOGGER.error(
|
||||
"Failed to reach OpenWebUI when deleting knowledge base %s",
|
||||
collection_name,
|
||||
exc_info=e,
|
||||
)
|
||||
return False
|
||||
except httpx.RequestError as e:
|
||||
LOGGER.error(
|
||||
"Request error deleting knowledge base %s from OpenWebUI",
|
||||
collection_name,
|
||||
exc_info=e,
|
||||
)
|
||||
return False
|
||||
except Exception as e:
|
||||
LOGGER.error("Unexpected error deleting knowledge base %s", collection_name, exc_info=e)
|
||||
return False
|
||||
|
||||
class CollectionSummary(TypedDict):
|
||||
"""Structure describing a knowledge base summary."""
|
||||
|
||||
name: str
|
||||
count: int
|
||||
size_mb: float
|
||||
|
||||
async def describe_collections(self) -> list[CollectionSummary]:
|
||||
"""Return metadata about each knowledge base."""
|
||||
try:
|
||||
# First get the list of knowledge bases
|
||||
response = await self.client.get("/api/v1/knowledge/")
|
||||
response.raise_for_status()
|
||||
knowledge_bases = response.json()
|
||||
|
||||
LOGGER.info(f"OpenWebUI returned {len(knowledge_bases)} knowledge bases")
|
||||
LOGGER.debug(f"Knowledge bases structure: {knowledge_bases}")
|
||||
|
||||
collections: list[OpenWebUIStorage.CollectionSummary] = []
|
||||
for kb in knowledge_bases:
|
||||
if not isinstance(kb, dict):
|
||||
continue
|
||||
|
||||
kb_id = kb.get("id")
|
||||
name = kb.get("name", "Unknown")
|
||||
|
||||
LOGGER.info(f"Processing knowledge base: '{name}' (ID: {kb_id})")
|
||||
LOGGER.debug(f"KB structure: {kb}")
|
||||
|
||||
if not kb_id:
|
||||
# If no ID, fall back to basic count from list response
|
||||
files = kb.get("files", [])
|
||||
if files is None:
|
||||
files = []
|
||||
count = len(files) if isinstance(files, list) else 0
|
||||
else:
|
||||
# Get detailed knowledge base information using the correct endpoint
|
||||
try:
|
||||
LOGGER.debug(f"Fetching detailed info for KB '{name}' from /api/v1/knowledge/{kb_id}")
|
||||
detail_response = await self.client.get(f"/api/v1/knowledge/{kb_id}")
|
||||
detail_response.raise_for_status()
|
||||
detailed_kb = detail_response.json()
|
||||
|
||||
LOGGER.debug(f"Detailed KB response: {detailed_kb}")
|
||||
|
||||
files = detailed_kb.get("files", [])
|
||||
if files is None:
|
||||
files = []
|
||||
count = len(files) if isinstance(files, list) else 0
|
||||
|
||||
# Debug logging
|
||||
LOGGER.info(f"Knowledge base '{name}' (ID: {kb_id}): found {count} files")
|
||||
if count > 0 and len(files) > 0:
|
||||
LOGGER.debug(f"First file structure: {files[0] if files else 'No files'}")
|
||||
elif count == 0:
|
||||
LOGGER.warning(f"Knowledge base '{name}' has 0 files. Files field type: {type(files)}, value: {files}")
|
||||
|
||||
except Exception as e:
|
||||
LOGGER.warning(f"Failed to get detailed info for KB '{name}' (ID: {kb_id}): {e}")
|
||||
# Fallback to basic files list if detailed fetch fails
|
||||
files = kb.get("files", [])
|
||||
if files is None:
|
||||
files = []
|
||||
count = len(files) if isinstance(files, list) else 0
|
||||
LOGGER.info(f"Fallback count for KB '{name}': {count}")
|
||||
|
||||
size_mb = count * 0.5 # rough heuristic
|
||||
summary: OpenWebUIStorage.CollectionSummary = {
|
||||
"name": str(name),
|
||||
"count": int(count),
|
||||
"size_mb": float(size_mb),
|
||||
}
|
||||
collections.append(summary)
|
||||
|
||||
return collections
|
||||
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to describe knowledge bases: {e}") from e
|
||||
|
||||
async def count(self, *, collection_name: str | None = None) -> int:
|
||||
"""
|
||||
Get document count for a specific collection (knowledge base).
|
||||
|
||||
Args:
|
||||
collection_name: Name of the knowledge base to count documents for
|
||||
|
||||
Returns:
|
||||
Number of documents in the collection, 0 if collection not found
|
||||
"""
|
||||
if not collection_name:
|
||||
# If no collection name provided, return total across all collections
|
||||
try:
|
||||
collections = await self.describe_collections()
|
||||
return sum(collection["count"] for collection in collections)
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
try:
|
||||
# Get knowledge base by name and return its file count
|
||||
kb = await self.get_knowledge_by_name(collection_name)
|
||||
if not kb:
|
||||
return 0
|
||||
|
||||
kb_id = kb.get("id")
|
||||
if not kb_id:
|
||||
return 0
|
||||
|
||||
# Get detailed knowledge base information to get accurate file count
|
||||
detail_response = await self.client.get(f"/api/v1/knowledge/{kb_id}")
|
||||
detail_response.raise_for_status()
|
||||
detailed_kb = detail_response.json()
|
||||
|
||||
files = detailed_kb.get("files", [])
|
||||
count = len(files) if isinstance(files, list) else 0
|
||||
|
||||
LOGGER.debug(f"Count for collection '{collection_name}': {count} files")
|
||||
return count
|
||||
|
||||
except Exception as e:
|
||||
LOGGER.warning(f"Failed to get count for collection '{collection_name}': {e}")
|
||||
return 0
|
||||
|
||||
async def get_knowledge_by_name(self, name: str) -> dict[str, object] | None:
|
||||
"""
|
||||
Get knowledge base details by name.
|
||||
|
||||
Args:
|
||||
name: Knowledge base name
|
||||
|
||||
Returns:
|
||||
Knowledge base details or None if not found
|
||||
"""
|
||||
try:
|
||||
response = await self.client.get("/api/v1/knowledge/list")
|
||||
response.raise_for_status()
|
||||
knowledge_bases = response.json()
|
||||
|
||||
return next(
|
||||
(
|
||||
{str(k): v for k, v in kb.items()}
|
||||
for kb in knowledge_bases
|
||||
if isinstance(kb, dict) and kb.get("name") == name
|
||||
),
|
||||
None,
|
||||
)
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to get knowledge base by name: {e}") from e
|
||||
|
||||
async def __aenter__(self) -> "OpenWebUIStorage":
|
||||
"""Async context manager entry."""
|
||||
await self.initialize()
|
||||
@@ -293,4 +567,144 @@ class OpenWebUIStorage(BaseStorage):
|
||||
exc_tb: object | None,
|
||||
) -> None:
|
||||
"""Async context manager exit."""
|
||||
await self.close()
|
||||
|
||||
async def list_documents(
|
||||
self,
|
||||
limit: int = 100,
|
||||
offset: int = 0,
|
||||
*,
|
||||
collection_name: str | None = None,
|
||||
) -> list[dict[str, object]]:
|
||||
"""
|
||||
List documents (files) in a knowledge base.
|
||||
|
||||
NOTE: This is a basic implementation that attempts to extract file information
|
||||
from OpenWebUI knowledge bases. The actual file listing capabilities depend
|
||||
on the OpenWebUI API version and may not include detailed file metadata.
|
||||
|
||||
Args:
|
||||
limit: Maximum number of documents to return
|
||||
offset: Number of documents to skip
|
||||
collection_name: Knowledge base name
|
||||
|
||||
Returns:
|
||||
List of document dictionaries with available metadata
|
||||
"""
|
||||
try:
|
||||
# Use the knowledge base name or fall back to default
|
||||
kb_name = collection_name or self.config.collection_name or "default"
|
||||
|
||||
# Try to get knowledge base details
|
||||
knowledge_base = await self.get_knowledge_by_name(kb_name)
|
||||
if not knowledge_base:
|
||||
# If specific KB not found, return empty list with a note
|
||||
return []
|
||||
|
||||
# Extract files if available (API structure may vary)
|
||||
files = knowledge_base.get("files", [])
|
||||
|
||||
# Handle different possible API response structures
|
||||
if not isinstance(files, list):
|
||||
# Some API versions might structure this differently
|
||||
# Try to handle gracefully
|
||||
return [
|
||||
{
|
||||
"id": "unknown",
|
||||
"title": f"Knowledge Base: {kb_name}",
|
||||
"source_url": "",
|
||||
"description": "OpenWebUI knowledge base (file details not available)",
|
||||
"content_type": "text/plain",
|
||||
"content_preview": "Document listing not fully supported for OpenWebUI",
|
||||
"word_count": 0,
|
||||
"timestamp": "",
|
||||
}
|
||||
]
|
||||
|
||||
# Apply pagination
|
||||
paginated_files = files[offset : offset + limit]
|
||||
|
||||
# Convert to document format with safe field access
|
||||
documents: list[dict[str, object]] = []
|
||||
for i, file_info in enumerate(paginated_files):
|
||||
if not isinstance(file_info, dict):
|
||||
continue
|
||||
|
||||
# Safely extract fields with fallbacks
|
||||
doc_id = str(file_info.get("id", f"file_{i}"))
|
||||
|
||||
# Try multiple ways to get filename from OpenWebUI API response
|
||||
filename = None
|
||||
# Check direct filename field
|
||||
if "filename" in file_info:
|
||||
filename = file_info["filename"]
|
||||
# Check name field
|
||||
elif "name" in file_info:
|
||||
filename = file_info["name"]
|
||||
# Check meta.name (from FileModelResponse schema)
|
||||
elif isinstance(file_info.get("meta"), dict):
|
||||
filename = file_info["meta"].get("name")
|
||||
|
||||
# Final fallback
|
||||
if not filename:
|
||||
filename = f"file_{i}"
|
||||
|
||||
filename = str(filename)
|
||||
|
||||
# Extract size from meta if available
|
||||
size = 0
|
||||
if isinstance(file_info.get("meta"), dict):
|
||||
size = file_info["meta"].get("size", 0)
|
||||
else:
|
||||
size = file_info.get("size", 0)
|
||||
|
||||
# Estimate word count from file size (very rough approximation)
|
||||
word_count = max(1, int(size / 6)) if isinstance(size, (int, float)) else 0
|
||||
|
||||
documents.append(
|
||||
{
|
||||
"id": doc_id,
|
||||
"title": filename,
|
||||
"source_url": "", # OpenWebUI files don't typically have source URLs
|
||||
"description": f"File: {filename}",
|
||||
"content_type": str(file_info.get("content_type", "text/plain")),
|
||||
"content_preview": f"File uploaded to OpenWebUI: {filename}",
|
||||
"word_count": word_count,
|
||||
"timestamp": str(
|
||||
file_info.get("created_at") or file_info.get("timestamp", "")
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
return documents
|
||||
|
||||
except Exception as e:
|
||||
# Since OpenWebUI file listing API structure is not guaranteed,
|
||||
# we gracefully fall back rather than raise an error
|
||||
import logging
|
||||
|
||||
logging.warning(f"OpenWebUI document listing failed: {e}")
|
||||
|
||||
# Return a placeholder entry indicating limited support
|
||||
return [
|
||||
{
|
||||
"id": "api_error",
|
||||
"title": f"Knowledge Base: {collection_name or 'default'}",
|
||||
"source_url": "",
|
||||
"description": "Document listing encountered an error - API compatibility issue",
|
||||
"content_type": "text/plain",
|
||||
"content_preview": f"Error: {str(e)[:100]}...",
|
||||
"word_count": 0,
|
||||
"timestamp": "",
|
||||
}
|
||||
]
|
||||
|
||||
async def close(self) -> None:
|
||||
"""Close client connection."""
|
||||
if hasattr(self, "client") and self.client:
|
||||
try:
|
||||
await self.client.aclose()
|
||||
except Exception as e:
|
||||
import logging
|
||||
|
||||
logging.warning(f"Error closing OpenWebUI client: {e}")
|
||||
|
||||
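The storage backend above follows an async-context-manager pattern: enter, operate, close. A minimal usage sketch under stated assumptions — the module path ingest_pipeline.storage.openwebui, the StorageConfig keyword arguments, the endpoint URL, and the collection name are placeholders inferred from this diff, not confirmed by it:

import asyncio

from ingest_pipeline.core.models import StorageConfig
from ingest_pipeline.storage.openwebui import OpenWebUIStorage  # assumed module path


async def main() -> None:
    # Placeholder endpoint and collection name.
    config = StorageConfig(endpoint="http://chat.lab", collection_name="docs")
    async with OpenWebUIStorage(config) as storage:
        # List knowledge bases, then run a similarity search against one collection.
        names = await storage.list_collections()
        print(f"Knowledge bases: {names}")
        async for doc in storage.search("ingestion pipeline", limit=5):
            print(doc.id, doc.metadata.get("title"))


asyncio.run(main())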
5
ingest_pipeline/storage/r2r/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""R2R storage package providing comprehensive R2R integration."""
|
||||
|
||||
from .storage import R2RStorage
|
||||
|
||||
__all__ = ["R2RStorage"]
|
||||
BIN
ingest_pipeline/storage/r2r/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
ingest_pipeline/storage/r2r/__pycache__/storage.cpython-312.pyc
Normal file
461
ingest_pipeline/storage/r2r/collections.py
Normal file
@@ -0,0 +1,461 @@
|
||||
"""Comprehensive collection CRUD operations for R2R."""
|
||||
|
||||
from typing import TypedDict, cast
|
||||
from uuid import UUID
|
||||
|
||||
from r2r import R2RAsyncClient
|
||||
|
||||
from ...core.exceptions import StorageError
|
||||
|
||||
# JSON serializable type for API responses
|
||||
JsonData = dict[str, str | int | bool | None]
|
||||
|
||||
|
||||
class DocumentAddResult(TypedDict, total=False):
|
||||
"""Result of adding a document to a collection."""
|
||||
|
||||
document_id: str
|
||||
added: bool
|
||||
result: JsonData
|
||||
error: str
|
||||
|
||||
|
||||
class DocumentRemoveResult(TypedDict, total=False):
|
||||
"""Result of removing a document from a collection."""
|
||||
|
||||
document_id: str
|
||||
removed: bool
|
||||
error: str
|
||||
|
||||
|
||||
class ExportResult(TypedDict):
|
||||
"""Result of a CSV export operation."""
|
||||
|
||||
exported: int
|
||||
path: str
|
||||
|
||||
|
||||
class R2RCollections:
|
||||
"""Comprehensive collection management for R2R."""
|
||||
|
||||
client: R2RAsyncClient
|
||||
|
||||
def __init__(self, client: R2RAsyncClient) -> None:
|
||||
"""Initialize collections manager with R2R client."""
|
||||
self.client = client
|
||||
|
||||
async def create(self, name: str, description: str | None = None) -> JsonData:
|
||||
"""Create a new collection in R2R.
|
||||
|
||||
Args:
|
||||
name: Collection name
|
||||
description: Optional collection description
|
||||
|
||||
Returns:
|
||||
Created collection information
|
||||
|
||||
Raises:
|
||||
StorageError: If collection creation fails
|
||||
"""
|
||||
try:
|
||||
response = await self.client.collections.create(
|
||||
name=name,
|
||||
description=description,
|
||||
)
|
||||
return cast(JsonData, response.results.model_dump())
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to create collection '{name}': {e}") from e
|
||||
|
||||
async def retrieve(self, collection_id: str | UUID) -> JsonData:
|
||||
"""Retrieve a collection by ID.
|
||||
|
||||
Args:
|
||||
collection_id: Collection ID to retrieve
|
||||
|
||||
Returns:
|
||||
Collection information
|
||||
|
||||
Raises:
|
||||
StorageError: If collection retrieval fails
|
||||
"""
|
||||
try:
|
||||
response = await self.client.collections.retrieve(str(collection_id))
|
||||
return cast(JsonData, response.results.model_dump())
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to retrieve collection {collection_id}: {e}") from e
|
||||
|
||||
async def update(
|
||||
self,
|
||||
collection_id: str | UUID,
|
||||
name: str | None = None,
|
||||
description: str | None = None,
|
||||
) -> JsonData:
|
||||
"""Update collection metadata.
|
||||
|
||||
Args:
|
||||
collection_id: Collection ID to update
|
||||
name: New name (optional)
|
||||
description: New description (optional)
|
||||
|
||||
Returns:
|
||||
Updated collection information
|
||||
|
||||
Raises:
|
||||
StorageError: If collection update fails
|
||||
"""
|
||||
try:
|
||||
response = await self.client.collections.update(
|
||||
id=str(collection_id),
|
||||
name=name,
|
||||
description=description,
|
||||
)
|
||||
return cast(JsonData, response.results.model_dump())
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to update collection {collection_id}: {e}") from e
|
||||
|
||||
async def delete(self, collection_id: str | UUID) -> bool:
|
||||
"""Delete a collection by ID.
|
||||
|
||||
Args:
|
||||
collection_id: Collection ID to delete
|
||||
|
||||
Returns:
|
||||
True if deletion was successful
|
||||
|
||||
Raises:
|
||||
StorageError: If collection deletion fails
|
||||
"""
|
||||
try:
|
||||
_ = await self.client.collections.delete(str(collection_id))
|
||||
return True
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to delete collection {collection_id}: {e}") from e
|
||||
|
||||
async def list_all(
|
||||
self, offset: int = 0, limit: int = 100, owner_only: bool = False
|
||||
) -> JsonData:
|
||||
"""List collections with pagination support.
|
||||
|
||||
Args:
|
||||
offset: Starting offset for pagination
|
||||
limit: Maximum number of collections to return
|
||||
owner_only: Only return collections owned by current user
|
||||
|
||||
Returns:
|
||||
Paginated list of collections
|
||||
|
||||
Raises:
|
||||
StorageError: If collection listing fails
|
||||
"""
|
||||
try:
|
||||
response = await self.client.collections.list(
|
||||
offset=offset,
|
||||
limit=limit,
|
||||
owner_only=owner_only,
|
||||
)
|
||||
return cast(JsonData, response.results.model_dump())
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to list collections: {e}") from e
|
||||
|
||||
async def get_by_name(
|
||||
self, collection_name: str, owner_id: str | UUID | None = None
|
||||
) -> JsonData:
|
||||
"""Get collection by name with optional owner filter.
|
||||
|
||||
Args:
|
||||
collection_name: Name of the collection
|
||||
owner_id: Optional owner ID filter
|
||||
|
||||
Returns:
|
||||
Collection information
|
||||
|
||||
Raises:
|
||||
StorageError: If collection retrieval fails
|
||||
"""
|
||||
try:
|
||||
# List all collections and find by name
|
||||
collections_response = await self.client.collections.list()
|
||||
for collection in collections_response.results:
|
||||
if (
|
||||
owner_id is None or str(collection.owner_id) == str(owner_id)
|
||||
) and collection.name == collection_name:
|
||||
return cast(JsonData, collection.model_dump())
|
||||
raise StorageError(f"Collection '{collection_name}' not found")
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to get collection by name '{collection_name}': {e}") from e
|
||||
|
||||
async def add_document(self, collection_id: str | UUID, document_id: str | UUID) -> JsonData:
|
||||
"""Associate a document with a collection.
|
||||
|
||||
Args:
|
||||
collection_id: Collection ID
|
||||
document_id: Document ID to add
|
||||
|
||||
Returns:
|
||||
Association result
|
||||
|
||||
Raises:
|
||||
StorageError: If document association fails
|
||||
"""
|
||||
try:
|
||||
response = await self.client.collections.add_document(
|
||||
id=str(collection_id),
|
||||
document_id=str(document_id),
|
||||
)
|
||||
return cast(JsonData, response.results.model_dump())
|
||||
except Exception as e:
|
||||
raise StorageError(
|
||||
f"Failed to add document {document_id} to collection {collection_id}: {e}"
|
||||
) from e
|
||||
|
||||
async def remove_document(self, collection_id: str | UUID, document_id: str | UUID) -> bool:
|
||||
"""Remove document association from collection.
|
||||
|
||||
Args:
|
||||
collection_id: Collection ID
|
||||
document_id: Document ID to remove
|
||||
|
||||
Returns:
|
||||
True if removal was successful
|
||||
|
||||
Raises:
|
||||
StorageError: If document removal fails
|
||||
"""
|
||||
try:
|
||||
await self.client.collections.remove_document(
|
||||
id=str(collection_id),
|
||||
document_id=str(document_id),
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
raise StorageError(
|
||||
f"Failed to remove document {document_id} from collection {collection_id}: {e}"
|
||||
) from e
|
||||
|
||||
async def list_documents(
|
||||
self, collection_id: str | UUID, offset: int = 0, limit: int = 100
|
||||
) -> JsonData:
|
||||
"""List all documents in a collection with pagination.
|
||||
|
||||
Args:
|
||||
collection_id: Collection ID
|
||||
offset: Starting offset for pagination
|
||||
limit: Maximum number of documents to return
|
||||
|
||||
Returns:
|
||||
Paginated list of documents in collection
|
||||
|
||||
Raises:
|
||||
StorageError: If document listing fails
|
||||
"""
|
||||
try:
|
||||
response = await self.client.collections.list_documents(
|
||||
id=str(collection_id),
|
||||
offset=offset,
|
||||
limit=limit,
|
||||
)
|
||||
return cast(JsonData, response.results.model_dump())
|
||||
except Exception as e:
|
||||
raise StorageError(
|
||||
f"Failed to list documents in collection {collection_id}: {e}"
|
||||
) from e
|
||||
|
||||
async def add_user(self, collection_id: str | UUID, user_id: str | UUID) -> JsonData:
|
||||
"""Grant user access to a collection.
|
||||
|
||||
Args:
|
||||
collection_id: Collection ID
|
||||
user_id: User ID to grant access
|
||||
|
||||
Returns:
|
||||
Access grant result
|
||||
|
||||
Raises:
|
||||
StorageError: If user access grant fails
|
||||
"""
|
||||
try:
|
||||
response = await self.client.collections.add_user(
|
||||
id=str(collection_id),
|
||||
user_id=str(user_id),
|
||||
)
|
||||
return cast(JsonData, response.results.model_dump())
|
||||
except Exception as e:
|
||||
raise StorageError(
|
||||
f"Failed to add user {user_id} to collection {collection_id}: {e}"
|
||||
) from e
|
||||
|
||||
async def remove_user(self, collection_id: str | UUID, user_id: str | UUID) -> bool:
|
||||
"""Revoke user access from a collection.
|
||||
|
||||
Args:
|
||||
collection_id: Collection ID
|
||||
user_id: User ID to revoke access
|
||||
|
||||
Returns:
|
||||
True if revocation was successful
|
||||
|
||||
Raises:
|
||||
StorageError: If user access revocation fails
|
||||
"""
|
||||
try:
|
||||
await self.client.collections.remove_user(
|
||||
id=str(collection_id),
|
||||
user_id=str(user_id),
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
raise StorageError(
|
||||
f"Failed to remove user {user_id} from collection {collection_id}: {e}"
|
||||
) from e
|
||||
|
||||
async def list_users(
|
||||
self, collection_id: str | UUID, offset: int = 0, limit: int = 100
|
||||
) -> JsonData:
|
||||
"""List all users with access to a collection.
|
||||
|
||||
Args:
|
||||
collection_id: Collection ID
|
||||
offset: Starting offset for pagination
|
||||
limit: Maximum number of users to return
|
||||
|
||||
Returns:
|
||||
Paginated list of users with collection access
|
||||
|
||||
Raises:
|
||||
StorageError: If user listing fails
|
||||
"""
|
||||
try:
|
||||
response = await self.client.collections.list_users(
|
||||
id=str(collection_id),
|
||||
offset=offset,
|
||||
limit=limit,
|
||||
)
|
||||
return cast(JsonData, response.results.model_dump())
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to list users for collection {collection_id}: {e}") from e
|
||||
|
||||
async def extract_entities(
|
||||
self,
|
||||
collection_id: str | UUID,
|
||||
run_with_orchestration: bool = True,
|
||||
settings: JsonData | None = None,
|
||||
) -> JsonData:
|
||||
"""Extract entities and relationships from collection documents.
|
||||
|
||||
Args:
|
||||
collection_id: Collection ID
|
||||
run_with_orchestration: Whether to run with orchestration
|
||||
settings: Extraction configuration settings
|
||||
|
||||
Returns:
|
||||
Extraction results
|
||||
|
||||
Raises:
|
||||
StorageError: If entity extraction fails
|
||||
"""
|
||||
try:
|
||||
response = await self.client.collections.extract(
|
||||
id=str(collection_id),
|
||||
run_with_orchestration=run_with_orchestration,
|
||||
settings=cast(dict[str, object], settings or {}),
|
||||
)
|
||||
return cast(JsonData, response.results.model_dump())
|
||||
except Exception as e:
|
||||
raise StorageError(
|
||||
f"Failed to extract entities from collection {collection_id}: {e}"
|
||||
) from e
|
||||
|
||||
async def export_to_csv(
|
||||
self, output_path: str, columns: list[str] | None = None, include_header: bool = True
|
||||
) -> ExportResult:
|
||||
"""Export collections to CSV format.
|
||||
|
||||
Args:
|
||||
output_path: Path for the exported CSV file
|
||||
columns: Specific columns to export (optional)
|
||||
include_header: Whether to include header row
|
||||
|
||||
Returns:
|
||||
Export result information
|
||||
|
||||
Raises:
|
||||
StorageError: If export fails
|
||||
"""
|
||||
# R2R SDK doesn't currently support collection export
|
||||
# Implement a basic CSV export using list()
|
||||
try:
|
||||
import csv
|
||||
from pathlib import Path
|
||||
|
||||
collections_response = await self.client.collections.list()
|
||||
collections_data = [
|
||||
{
|
||||
"id": str(c.id),
|
||||
"name": c.name,
|
||||
"description": c.description or "",
|
||||
"owner_id": str(c.owner_id) if hasattr(c, "owner_id") else "",
|
||||
}
|
||||
for c in collections_response.results
|
||||
]
|
||||
|
||||
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(output_path, "w", newline="", encoding="utf-8") as csvfile:
|
||||
if not collections_data:
|
||||
return {"exported": 0, "path": output_path}
|
||||
|
||||
fieldnames = columns or list(collections_data[0].keys())
|
||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
||||
|
||||
if include_header:
|
||||
writer.writeheader()
|
||||
|
||||
for collection in collections_data:
|
||||
filtered_collection = {k: v for k, v in collection.items() if k in fieldnames}
|
||||
writer.writerow(filtered_collection)
|
||||
|
||||
return {"exported": len(collections_data), "path": output_path}
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to export collections: {e}") from e
|
||||
|
||||
async def batch_add_documents(
|
||||
self, collection_id: str | UUID, document_ids: list[str | UUID]
|
||||
) -> list[DocumentAddResult]:
|
||||
"""Add multiple documents to a collection efficiently.
|
||||
|
||||
Args:
|
||||
collection_id: Collection ID
|
||||
document_ids: List of document IDs to add
|
||||
|
||||
Returns:
|
||||
List of addition results
|
||||
"""
|
||||
results: list[DocumentAddResult] = []
|
||||
for doc_id in document_ids:
|
||||
try:
|
||||
result = await self.add_document(collection_id, doc_id)
|
||||
results.append({"document_id": str(doc_id), "added": True, "result": result})
|
||||
except StorageError as e:
|
||||
results.append({"document_id": str(doc_id), "added": False, "error": str(e)})
|
||||
return results
|
||||
|
||||
async def batch_remove_documents(
|
||||
self, collection_id: str | UUID, document_ids: list[str | UUID]
|
||||
) -> list[DocumentRemoveResult]:
|
||||
"""Remove multiple documents from a collection efficiently.
|
||||
|
||||
Args:
|
||||
collection_id: Collection ID
|
||||
document_ids: List of document IDs to remove
|
||||
|
||||
Returns:
|
||||
List of removal results
|
||||
"""
|
||||
results: list[DocumentRemoveResult] = []
|
||||
for doc_id in document_ids:
|
||||
try:
|
||||
success = await self.remove_document(collection_id, doc_id)
|
||||
results.append({"document_id": str(doc_id), "removed": success})
|
||||
except StorageError as e:
|
||||
results.append({"document_id": str(doc_id), "removed": False, "error": str(e)})
|
||||
return results
|
||||
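The R2RCollections helper above wraps the SDK's collection endpoints (create, retrieve, membership, batch operations). A minimal sketch of the create-then-batch-add flow it enables; the server URL, collection name, document IDs, and the import path for R2RCollections are assumptions, not values from this commit:

import asyncio

from r2r import R2RAsyncClient

from ingest_pipeline.storage.r2r.collections import R2RCollections  # assumed module path


async def main() -> None:
    # Placeholder endpoint; batch_add_documents reports per-document success/error.
    client = R2RAsyncClient("http://r2r.lab")
    collections = R2RCollections(client)

    created = await collections.create("docs", description="Ingested documentation")
    collection_id = str(created["id"])

    results = await collections.batch_add_documents(
        collection_id,
        ["00000000-0000-0000-0000-000000000001", "00000000-0000-0000-0000-000000000002"],
    )
    for entry in results:
        print(entry["document_id"], entry.get("added", False))


asyncio.run(main())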
831
ingest_pipeline/storage/r2r/storage.py
Normal file
@@ -0,0 +1,831 @@
|
||||
"""R2R storage implementation using the official R2R SDK."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
from collections.abc import AsyncGenerator, Iterable, Mapping, Sequence
|
||||
from datetime import UTC, datetime
|
||||
from typing import Self, TypeVar, cast
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
import httpx
|
||||
from r2r import R2RAsyncClient, R2RException
|
||||
from typing_extensions import override
|
||||
|
||||
from ...core.exceptions import StorageError
|
||||
from ...core.models import Document, DocumentMetadata, IngestionSource, StorageConfig
|
||||
from ..base import BaseStorage
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
def _as_mapping(value: object) -> dict[str, object]:
|
||||
if isinstance(value, Mapping):
|
||||
return dict(cast(Mapping[str, object], value))
|
||||
if hasattr(value, "__dict__"):
|
||||
return dict(cast(Mapping[str, object], value.__dict__))
|
||||
return {}
|
||||
|
||||
|
||||
def _as_sequence(value: object) -> tuple[object, ...]:
|
||||
"""Convert value to a tuple of objects."""
|
||||
if isinstance(value, Sequence):
|
||||
return tuple(value)
|
||||
return tuple(value) if isinstance(value, Iterable) else ()
|
||||
|
||||
|
||||
def _extract_id(source: object, fallback: str) -> str:
|
||||
mapping = _as_mapping(source)
|
||||
identifier = mapping.get("id") if mapping else None
|
||||
if identifier is None and hasattr(source, "id"):
|
||||
identifier = getattr(source, "id", None)
|
||||
return fallback if identifier is None else str(identifier)
|
||||
|
||||
|
||||
def _as_datetime(value: object) -> datetime:
|
||||
if isinstance(value, datetime):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
with contextlib.suppress(ValueError):
|
||||
return datetime.fromisoformat(value)
|
||||
return datetime.now(UTC)
|
||||
|
||||
|
||||
def _as_int(value: object, default: int = 0) -> int:
|
||||
if isinstance(value, bool):
|
||||
return int(value)
|
||||
if isinstance(value, int):
|
||||
return value
|
||||
if isinstance(value, float):
|
||||
return int(value)
|
||||
if isinstance(value, str):
|
||||
try:
|
||||
return int(float(value)) if "." in value else int(value)
|
||||
except ValueError:
|
||||
return default
|
||||
return default
|
||||
|
||||
|
||||
class R2RStorage(BaseStorage):
|
||||
"""R2R storage implementation using the official R2R SDK."""
|
||||
|
||||
def __init__(self, config: StorageConfig) -> None:
|
||||
"""Initialize R2R storage with SDK client."""
|
||||
super().__init__(config)
|
||||
self.endpoint: str = str(config.endpoint).rstrip("/")
|
||||
self.client: R2RAsyncClient = R2RAsyncClient(self.endpoint)
|
||||
self.default_collection_id: str | None = None
|
||||
|
||||
@override
|
||||
async def initialize(self) -> None:
|
||||
"""Initialize R2R connection and ensure default collection exists."""
|
||||
try:
|
||||
# Ensure we have an event loop
|
||||
try:
|
||||
asyncio.get_running_loop()
|
||||
except RuntimeError:
|
||||
# No event loop running, this should not happen in async context
|
||||
# but let's be defensive
|
||||
import logging
|
||||
|
||||
logging.warning("No event loop found during R2R initialization")
|
||||
|
||||
# Test connection using direct HTTP call to v3 API
|
||||
endpoint = self.endpoint
|
||||
client = httpx.AsyncClient()
|
||||
try:
|
||||
response = await client.get(f"{endpoint}/v3/collections")
|
||||
response.raise_for_status()
|
||||
finally:
|
||||
await client.aclose()
|
||||
_ = await self._ensure_collection(self.config.collection_name)
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to initialize R2R: {e}") from e
|
||||
|
||||
async def _ensure_collection(self, collection_name: str) -> str:
|
||||
"""Get or create collection by name."""
|
||||
try:
|
||||
endpoint = self.endpoint
|
||||
client = httpx.AsyncClient()
|
||||
try:
|
||||
# List collections and find by name
|
||||
response = await client.get(f"{endpoint}/v3/collections")
|
||||
response.raise_for_status()
|
||||
data: dict[str, object] = response.json()
|
||||
|
||||
results = cast(list[dict[str, object]], data.get("results", []))
|
||||
for collection in results:
|
||||
if collection.get("name") == collection_name:
|
||||
collection_id = str(collection.get("id"))
|
||||
if collection_name == self.config.collection_name:
|
||||
self.default_collection_id = collection_id
|
||||
return collection_id
|
||||
|
||||
# Create if not found
|
||||
create_response = await client.post(
|
||||
f"{endpoint}/v3/collections",
|
||||
json={
|
||||
"name": collection_name,
|
||||
"description": f"Auto-created collection: {collection_name}",
|
||||
},
|
||||
)
|
||||
create_response.raise_for_status()
|
||||
created: dict[str, object] = create_response.json()
|
||||
created_results = cast(dict[str, object], created.get("results", {}))
|
||||
collection_id = str(created_results.get("id"))
|
||||
|
||||
if collection_name == self.config.collection_name:
|
||||
self.default_collection_id = collection_id
|
||||
|
||||
return collection_id
|
||||
finally:
|
||||
await client.aclose()
|
||||
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to ensure collection '{collection_name}': {e}") from e
|
||||
|
||||
@override
|
||||
async def store(self, document: Document, *, collection_name: str | None = None) -> str:
|
||||
"""Store a single document."""
|
||||
return (await self.store_batch([document], collection_name=collection_name))[0]
|
||||
|
||||
@override
|
||||
async def store_batch(
|
||||
self, documents: list[Document], *, collection_name: str | None = None
|
||||
) -> list[str]:
|
||||
"""Store multiple documents."""
|
||||
# Fix: Always ensure we have the correct collection ID
|
||||
if collection_name:
|
||||
# If a specific collection is requested, get its ID
|
||||
collection_id = await self._ensure_collection(collection_name)
|
||||
else:
|
||||
# If no collection specified, use the default one from config
|
||||
if self.default_collection_id:
|
||||
collection_id = self.default_collection_id
|
||||
else:
|
||||
# Fallback: ensure the default collection exists
|
||||
collection_id = await self._ensure_collection(self.config.collection_name)
|
||||
self.default_collection_id = collection_id
|
||||
|
||||
print(
|
||||
f"Using collection ID: {collection_id} for collection: {collection_name or self.config.collection_name}"
|
||||
)
|
||||
|
||||
stored_ids: list[str] = []
|
||||
failed_documents: list[Document] = []
|
||||
for document in documents:
|
||||
try:
|
||||
# Create document with explicit ID using direct HTTP call
|
||||
requested_id = str(document.id)
|
||||
print(f"Creating document with ID: {requested_id}")
|
||||
|
||||
# Validate document before sending to R2R
|
||||
if not document.content or not document.content.strip():
|
||||
print(f"Skipping document {requested_id}: empty content")
|
||||
failed_documents.append(document)
|
||||
continue
|
||||
|
||||
if len(document.content) > 1_000_000: # 1MB limit
|
||||
print(
|
||||
f"Skipping document {requested_id}: content too large ({len(document.content)} chars)"
|
||||
)
|
||||
failed_documents.append(document)
|
||||
continue
|
||||
|
||||
# Use direct HTTP call with proper multipart form-data format
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
max_retries = 3
|
||||
retry_delay = 1.0
|
||||
doc_response = None # Initialize variable to avoid UnboundLocalError
|
||||
|
||||
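# Retry strategy: timeouts and 5xx responses are retried with exponential backoff
# (retry_delay doubles after each attempt); 422 validation errors break out
# immediately, since resubmitting the same payload cannot succeed.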
for attempt in range(max_retries):
|
||||
try:
|
||||
async with httpx.AsyncClient() as http_client:
|
||||
# Use files parameter but with string values for multipart/form-data
|
||||
# This matches the cURL -F behavior more closely
|
||||
metadata = self._build_metadata(document)
|
||||
print(f"Built metadata for document {requested_id}: {metadata}")
|
||||
|
||||
files = {
|
||||
"raw_text": (None, document.content),
|
||||
"metadata": (None, json.dumps(metadata)),
|
||||
"id": (None, requested_id),
|
||||
"ingestion_mode": (None, "hi-res"), # Enable R2R enrichment
|
||||
}
|
||||
|
||||
# Add collection_ids if we have a collection to assign to
|
||||
if collection_id:
|
||||
files["collection_ids"] = (None, json.dumps([collection_id]))
|
||||
print(
|
||||
f"Creating document {requested_id} with collection_ids: [{collection_id}]"
|
||||
)
|
||||
|
||||
print(f"Sending to R2R - files keys: {list(files.keys())}")
|
||||
print(f"Metadata JSON: {files['metadata'][1]}")
|
||||
|
||||
response = await http_client.post(
|
||||
f"{self.endpoint}/v3/documents",
|
||||
files=files,
|
||||
)
|
||||
|
||||
if response.status_code == 422:
|
||||
# Get detailed error information for 422 responses
|
||||
try:
|
||||
error_detail = response.json()
|
||||
print(
|
||||
f"R2R validation error for document {requested_id}: {error_detail}"
|
||||
)
|
||||
print(f"Document content length: {len(document.content)}")
|
||||
print(f"Document metadata sent: {metadata}")
|
||||
print(f"Response status: {response.status_code}")
|
||||
print(f"Response headers: {dict(response.headers)}")
|
||||
except Exception:
|
||||
print(
|
||||
f"R2R validation error for document {requested_id}: {response.text}"
|
||||
)
|
||||
print(f"Document metadata sent: {metadata}")
|
||||
# Don't retry validation errors
|
||||
break
|
||||
|
||||
if response.status_code >= 500:
|
||||
# Server error - retry
|
||||
if attempt < max_retries - 1:
|
||||
print(
|
||||
f"Server error {response.status_code} for document {requested_id}, retrying in {retry_delay}s..."
|
||||
)
|
||||
await asyncio.sleep(retry_delay)
|
||||
retry_delay *= 2 # Exponential backoff
|
||||
continue
|
||||
|
||||
response.raise_for_status()
|
||||
doc_response = response.json()
|
||||
break # Success - exit retry loop
|
||||
|
||||
except httpx.TimeoutException:
|
||||
if attempt < max_retries - 1:
|
||||
print(
|
||||
f"Timeout for document {requested_id}, retrying in {retry_delay}s..."
|
||||
)
|
||||
await asyncio.sleep(retry_delay)
|
||||
retry_delay *= 2
|
||||
continue
|
||||
else:
|
||||
raise
|
||||
except httpx.HTTPStatusError as e:
|
||||
if e.response.status_code >= 500 and attempt < max_retries - 1:
|
||||
print(
|
||||
f"Server error {e.response.status_code} for document {requested_id}, retrying in {retry_delay}s..."
|
||||
)
|
||||
await asyncio.sleep(retry_delay)
|
||||
retry_delay *= 2
|
||||
continue
|
||||
else:
|
||||
raise
|
||||
|
||||
# Only process response if we have a successful doc_response
|
||||
if doc_response is not None:
|
||||
response_payload = doc_response.get("results", doc_response)
|
||||
doc_id = _extract_id(response_payload, requested_id)
|
||||
|
||||
print(f"R2R returned document ID: {doc_id}")
|
||||
|
||||
# Verify the ID matches what we requested
|
||||
if doc_id != requested_id:
|
||||
print(f"Warning: Requested ID {requested_id} but got {doc_id}")
|
||||
|
||||
# Collection assignment is now handled during document creation
|
||||
# No need to add to collection afterward if collection_ids was provided
|
||||
if collection_id:
|
||||
print(
|
||||
f"Document {doc_id} should be assigned to collection {collection_id} via creation API"
|
||||
)
|
||||
|
||||
stored_ids.append(doc_id)
|
||||
else:
|
||||
print(f"No successful response received for document {requested_id}")
|
||||
failed_documents.append(document)
|
||||
|
||||
except Exception as exc:
|
||||
print(f"Failed to store document {document.id}: {exc}")
|
||||
failed_documents.append(document)
|
||||
|
||||
# Log specific error types for debugging
|
||||
if "422" in str(exc):
|
||||
print(" → Data validation issue - check document content and metadata format")
|
||||
elif "timeout" in str(exc).lower():
|
||||
print(" → Network timeout - R2R may be overloaded")
|
||||
elif "500" in str(exc):
|
||||
print(" → Server error - R2R internal issue")
|
||||
else:
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
continue
|
||||
|
||||
return stored_ids
|
||||
|
||||
def _build_metadata(self, document: Document) -> dict[str, object]:
|
||||
"""Convert document metadata to enriched R2R format."""
|
||||
metadata = document.metadata
|
||||
|
||||
|
||||
# Core required fields
|
||||
result: dict[str, object] = {
|
||||
"source_url": metadata["source_url"],
|
||||
"content_type": metadata["content_type"],
|
||||
"word_count": metadata["word_count"],
|
||||
"char_count": metadata["char_count"],
|
||||
"timestamp": metadata["timestamp"].isoformat(),
|
||||
"ingestion_source": document.source.value,
|
||||
}
|
||||
|
||||
# Basic optional fields
|
||||
if title := metadata.get("title"):
|
||||
result["title"] = title
|
||||
if description := metadata.get("description"):
|
||||
result["description"] = description
|
||||
|
||||
# Content categorization
|
||||
if tags := metadata.get("tags"):
|
||||
result["tags"] = tags
|
||||
if category := metadata.get("category"):
|
||||
result["category"] = category
|
||||
if section := metadata.get("section"):
|
||||
result["section"] = section
|
||||
if language := metadata.get("language"):
|
||||
result["language"] = language
|
||||
|
||||
# Authorship and source info
|
||||
if author := metadata.get("author"):
|
||||
result["author"] = author
|
||||
if domain := metadata.get("domain"):
|
||||
result["domain"] = domain
|
||||
if site_name := metadata.get("site_name"):
|
||||
result["site_name"] = site_name
|
||||
|
||||
# Document structure
|
||||
if heading_hierarchy := metadata.get("heading_hierarchy"):
|
||||
result["heading_hierarchy"] = heading_hierarchy
|
||||
if section_depth := metadata.get("section_depth"):
|
||||
result["section_depth"] = section_depth
|
||||
if has_code_blocks := metadata.get("has_code_blocks"):
|
||||
result["has_code_blocks"] = has_code_blocks
|
||||
if has_images := metadata.get("has_images"):
|
||||
result["has_images"] = has_images
|
||||
if has_links := metadata.get("has_links"):
|
||||
result["has_links"] = has_links
|
||||
|
||||
# Processing metadata
|
||||
if extraction_method := metadata.get("extraction_method"):
|
||||
result["extraction_method"] = extraction_method
|
||||
if crawl_depth := metadata.get("crawl_depth"):
|
||||
result["crawl_depth"] = crawl_depth
|
||||
if last_modified := metadata.get("last_modified"):
|
||||
result["last_modified"] = last_modified.isoformat() if last_modified else None
|
||||
|
||||
# Content quality indicators
|
||||
if readability_score := metadata.get("readability_score"):
|
||||
result["readability_score"] = readability_score
|
||||
if completeness_score := metadata.get("completeness_score"):
|
||||
result["completeness_score"] = completeness_score
|
||||
|
||||
# Repository-specific fields
|
||||
if file_path := metadata.get("file_path"):
|
||||
result["file_path"] = file_path
|
||||
if repository_name := metadata.get("repository_name"):
|
||||
result["repository_name"] = repository_name
|
||||
if branch_name := metadata.get("branch_name"):
|
||||
result["branch_name"] = branch_name
|
||||
if commit_hash := metadata.get("commit_hash"):
|
||||
result["commit_hash"] = commit_hash
|
||||
if programming_language := metadata.get("programming_language"):
|
||||
result["programming_language"] = programming_language
|
||||
|
||||
# Custom business metadata
|
||||
if importance_score := metadata.get("importance_score"):
|
||||
result["importance_score"] = importance_score
|
||||
if review_status := metadata.get("review_status"):
|
||||
result["review_status"] = review_status
|
||||
if assigned_team := metadata.get("assigned_team"):
|
||||
result["assigned_team"] = assigned_team
|
||||
|
||||
return result
|
||||
|
||||
@override
|
||||
async def retrieve(
|
||||
self, document_id: str, *, collection_name: str | None = None
|
||||
) -> Document | None:
|
||||
"""Retrieve a document by ID."""
|
||||
try:
|
||||
response = await self.client.documents.retrieve(document_id)
|
||||
except R2RException as exc:
|
||||
status_code = getattr(exc, "status_code", None)
|
||||
if status_code == 404:
|
||||
return None
|
||||
import logging
|
||||
|
||||
logging.warning(f"Unexpected error retrieving document {document_id}: {exc}")
|
||||
return None
|
||||
except Exception as error:
|
||||
import logging
|
||||
|
||||
logging.warning(f"Unexpected error retrieving document {document_id}: {error}")
|
||||
return None
|
||||
payload = getattr(response, "results", response)
|
||||
return self._convert_to_document(payload, collection_name)
|
||||
|
||||
def _convert_to_document(self, r2r_doc: object, collection_name: str | None = None) -> Document:
|
||||
"""Convert R2R document payload to our Document model."""
|
||||
doc_map = _as_mapping(r2r_doc)
|
||||
metadata_map = _as_mapping(doc_map.get("metadata", {}))
|
||||
|
||||
|
||||
doc_id_str = _extract_id(r2r_doc, str(uuid4()))
|
||||
try:
|
||||
doc_uuid = UUID(doc_id_str)
|
||||
except ValueError:
|
||||
doc_uuid = uuid4()
|
||||
|
||||
timestamp = _as_datetime(doc_map.get("created_at", metadata_map.get("timestamp")))
|
||||
|
||||
metadata: DocumentMetadata = {
|
||||
# Core required fields
|
||||
"source_url": str(metadata_map.get("source_url", "")),
|
||||
"timestamp": timestamp,
|
||||
"content_type": str(metadata_map.get("content_type", "text/plain")),
|
||||
"word_count": _as_int(metadata_map.get("word_count")),
|
||||
"char_count": _as_int(metadata_map.get("char_count")),
|
||||
}
|
||||
|
||||
# Add optional fields if present
|
||||
# Check for title in both top-level and metadata (R2R schema has title as top-level field)
|
||||
if title := (doc_map.get("title") or metadata_map.get("title")):
|
||||
metadata["title"] = cast(str | None, title)
|
||||
# Check for summary in top-level R2R field (R2R schema has summary as top-level field)
|
||||
if summary := (doc_map.get("summary") or metadata_map.get("summary")):
|
||||
metadata["description"] = cast(str | None, summary)
|
||||
elif description := metadata_map.get("description"):
|
||||
metadata["description"] = cast(str | None, description)
|
||||
if tags := metadata_map.get("tags"):
|
||||
metadata["tags"] = _as_sequence(tags) if isinstance(tags, list) else []
|
||||
if category := metadata_map.get("category"):
|
||||
metadata["category"] = str(category)
|
||||
if section := metadata_map.get("section"):
|
||||
metadata["section"] = str(section)
|
||||
if language := metadata_map.get("language"):
|
||||
metadata["language"] = str(language)
|
||||
if author := metadata_map.get("author"):
|
||||
metadata["author"] = str(author)
|
||||
if domain := metadata_map.get("domain"):
|
||||
metadata["domain"] = str(domain)
|
||||
if site_name := metadata_map.get("site_name"):
|
||||
metadata["site_name"] = str(site_name)
|
||||
if heading_hierarchy := metadata_map.get("heading_hierarchy"):
|
||||
metadata["heading_hierarchy"] = (
|
||||
list(heading_hierarchy) if isinstance(heading_hierarchy, list) else []
|
||||
)
|
||||
if section_depth := metadata_map.get("section_depth"):
|
||||
metadata["section_depth"] = _as_int(section_depth)
|
||||
if has_code_blocks := metadata_map.get("has_code_blocks"):
|
||||
metadata["has_code_blocks"] = bool(has_code_blocks)
|
||||
if has_images := metadata_map.get("has_images"):
|
||||
metadata["has_images"] = bool(has_images)
|
||||
if has_links := metadata_map.get("has_links"):
|
||||
metadata["has_links"] = bool(has_links)
|
||||
if extraction_method := metadata_map.get("extraction_method"):
|
||||
metadata["extraction_method"] = str(extraction_method)
|
||||
if crawl_depth := metadata_map.get("crawl_depth"):
|
||||
metadata["crawl_depth"] = _as_int(crawl_depth)
|
||||
if last_modified := metadata_map.get("last_modified"):
|
||||
metadata["last_modified"] = _as_datetime(last_modified)
|
||||
if readability_score := metadata_map.get("readability_score"):
|
||||
metadata["readability_score"] = (
|
||||
float(readability_score) if readability_score is not None else None
|
||||
)
|
||||
if completeness_score := metadata_map.get("completeness_score"):
|
||||
metadata["completeness_score"] = (
|
||||
float(completeness_score) if completeness_score is not None else None
|
||||
)
|
||||
|
||||
source_value = str(metadata_map.get("ingestion_source", IngestionSource.WEB.value))
|
||||
try:
|
||||
source_enum = IngestionSource(source_value)
|
||||
except ValueError:
|
||||
source_enum = IngestionSource.WEB
|
||||
|
||||
content_value = doc_map.get("content", getattr(r2r_doc, "content", ""))
|
||||
|
||||
return Document(
|
||||
id=doc_uuid,
|
||||
content=str(content_value),
|
||||
metadata=metadata,
|
||||
source=source_enum,
|
||||
collection=collection_name or self.config.collection_name,
|
||||
)
|
||||
|
||||
@override
|
||||
async def search(
|
||||
self,
|
||||
query: str,
|
||||
limit: int = 10,
|
||||
threshold: float = 0.7,
|
||||
*,
|
||||
collection_name: str | None = None,
|
||||
) -> AsyncGenerator[Document, None]:
|
||||
"""Search documents using R2R."""
|
||||
try:
|
||||
search_settings: dict[str, object] = {
|
||||
"limit": limit,
|
||||
"similarity_threshold": threshold,
|
||||
}
|
||||
|
||||
if collection_name:
|
||||
collection_id = await self._ensure_collection(collection_name)
|
||||
search_settings["collection_ids"] = [collection_id]
|
||||
|
||||
search_response = await self.client.retrieval.search(
|
||||
query=query,
|
||||
search_settings=search_settings,
|
||||
)
|
||||
|
||||
for result in _as_sequence(getattr(search_response, "results", ())):
|
||||
result_map = _as_mapping(result)
|
||||
document_id_value = result_map.get(
|
||||
"document_id", getattr(result, "document_id", None)
|
||||
)
|
||||
if document_id_value is None:
|
||||
continue
|
||||
document_id = str(document_id_value)
|
||||
|
||||
try:
|
||||
doc_response = await self.client.documents.retrieve(document_id)
|
||||
except R2RException as exc:
|
||||
import logging
|
||||
|
||||
logging.warning(
|
||||
f"Failed to retrieve document {document_id} during search: {exc}"
|
||||
)
|
||||
continue
|
||||
|
||||
document_payload = getattr(doc_response, "results", doc_response)
|
||||
document = self._convert_to_document(document_payload, collection_name)
|
||||
|
||||
score_value = result_map.get("score", getattr(result, "score", None))
|
||||
if score_value is not None:
|
||||
try:
|
||||
# Handle various score value types safely
|
||||
if isinstance(score_value, (int, float, str)):
|
||||
document.score = float(score_value)
|
||||
else:
|
||||
# For unknown types, try string conversion first
|
||||
document.score = float(str(score_value))
|
||||
except (TypeError, ValueError) as e:
|
||||
import logging
|
||||
|
||||
logging.debug(
|
||||
f"Invalid score value {score_value} for document {document_id}: {e}"
|
||||
)
|
||||
document.score = None
|
||||
|
||||
yield document
|
||||
|
||||
except R2RException as exc:
|
||||
raise StorageError(f"Search failed: {exc}") from exc
|
||||
|
||||
@override
|
||||
async def delete(self, document_id: str, *, collection_name: str | None = None) -> bool:
|
||||
"""Delete a document."""
|
||||
try:
|
||||
_ = await self.client.documents.delete(document_id)
|
||||
return True
|
||||
except R2RException:
|
||||
return False
|
||||
|
||||
@override
|
||||
async def count(self, *, collection_name: str | None = None) -> int:
|
||||
"""Get document count in collection."""
|
||||
try:
|
||||
endpoint = self.endpoint
|
||||
client = httpx.AsyncClient()
|
||||
try:
|
||||
# Get collections and find the count for the specific collection
|
||||
response = await client.get(f"{endpoint}/v3/collections")
|
||||
response.raise_for_status()
|
||||
data: dict[str, object] = response.json()
|
||||
|
||||
target_collection = collection_name or self.config.collection_name
|
||||
results = cast(list[dict[str, object]], data.get("results", []))
|
||||
for collection in results:
|
||||
if collection.get("name") == target_collection:
|
||||
doc_count = collection.get("document_count", 0)
|
||||
return _as_int(doc_count)
|
||||
|
||||
return 0
|
||||
finally:
|
||||
await client.aclose()
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
@override
|
||||
async def close(self) -> None:
|
||||
"""Close R2R client."""
|
||||
try:
|
||||
await self.client.close()
|
||||
except Exception as e:
|
||||
import logging
|
||||
|
||||
logging.warning(f"Error closing R2R client: {e}")
|
||||
|
||||
async def __aenter__(self) -> Self:
|
||||
"""Async context manager entry."""
|
||||
return self
|
||||
|
||||
async def __aexit__(
|
||||
self,
|
||||
exc_type: type[BaseException] | None,
|
||||
exc_val: BaseException | None,
|
||||
exc_tb: object | None,
|
||||
) -> None:
|
||||
"""Async context manager exit with proper cleanup."""
|
||||
await self.close()
|
||||
|
||||
# Additional R2R-specific comprehensive management methods
|
||||
|
||||
async def create_collection(self, name: str, description: str | None = None) -> str:
|
||||
"""Create a new collection."""
|
||||
try:
|
||||
response = await self.client.collections.create(name=name, description=description)
|
||||
created = _as_mapping(getattr(response, "results", {}))
|
||||
return str(created.get("id", name))
|
||||
except R2RException as exc:
|
||||
raise StorageError(f"Failed to create collection {name}: {exc}") from exc
|
||||
|
||||
async def delete_collection(self, collection_name: str) -> bool:
|
||||
"""Delete a collection."""
|
||||
try:
|
||||
collection_id = await self._ensure_collection(collection_name)
|
||||
_ = await self.client.collections.delete(collection_id)
|
||||
return True
|
||||
except R2RException:
|
||||
return False
|
||||
|
||||
@override
|
||||
async def list_collections(self) -> list[str]:
|
||||
"""List all available collections."""
|
||||
try:
|
||||
endpoint = self.endpoint
|
||||
client = httpx.AsyncClient()
|
||||
try:
|
||||
response = await client.get(f"{endpoint}/v3/collections")
|
||||
response.raise_for_status()
|
||||
data: dict[str, object] = response.json()
|
||||
|
||||
collection_names: list[str] = []
|
||||
results = cast(list[dict[str, object]], data.get("results", []))
|
||||
for entry in results:
|
||||
if name := entry.get("name"):
|
||||
collection_names.append(str(name))
|
||||
return collection_names
|
||||
finally:
|
||||
await client.aclose()
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to list collections: {e}") from e
|
||||
|
||||
async def list_collections_detailed(self) -> list[dict[str, object]]:
|
||||
"""List all available collections with detailed information."""
|
||||
try:
|
||||
response = await self.client.collections.list()
|
||||
collections: list[dict[str, object]] = []
|
||||
for entry in _as_sequence(getattr(response, "results", ())):
|
||||
entry_map = _as_mapping(entry)
|
||||
collections.append(
|
||||
{
|
||||
"id": str(entry_map.get("id", "")),
|
||||
"name": str(entry_map.get("name", "")),
|
||||
"description": entry_map.get("description"),
|
||||
}
|
||||
)
|
||||
return collections
|
||||
except R2RException as exc:
|
||||
raise StorageError(f"Failed to list collections: {exc}") from exc
|
||||
|
||||
async def get_document_chunks(self, document_id: str) -> list[dict[str, object]]:
|
||||
"""Get all chunks for a specific document."""
|
||||
try:
|
||||
response = await self.client.chunks.list(filters={"document_id": document_id})
|
||||
return [
|
||||
dict(_as_mapping(chunk)) for chunk in _as_sequence(getattr(response, "results", ()))
|
||||
]
|
||||
except R2RException as exc:
|
||||
raise StorageError(f"Failed to get chunks for document {document_id}: {exc}") from exc
|
||||
|
||||
async def extract_entities(self, document_id: str) -> dict[str, object]:
|
||||
"""Extract entities and relationships from a document."""
|
||||
try:
|
||||
response = await self.client.documents.extract(id=document_id)
|
||||
return dict(_as_mapping(getattr(response, "results", {})))
|
||||
except R2RException as exc:
|
||||
raise StorageError(
|
||||
f"Failed to extract entities from document {document_id}: {exc}"
|
||||
) from exc
|
||||
|
||||
async def get_document_overview(self, document_id: str) -> dict[str, object]:
|
||||
"""Get comprehensive document overview and statistics."""
|
||||
try:
|
||||
doc_response = await self.client.documents.retrieve(document_id)
|
||||
chunks_response = await self.client.chunks.list(filters={"document_id": document_id})
|
||||
document_payload = dict(_as_mapping(getattr(doc_response, "results", {})))
|
||||
chunk_payload = [
|
||||
dict(_as_mapping(chunk))
|
||||
for chunk in _as_sequence(getattr(chunks_response, "results", ()))
|
||||
]
|
||||
return {
|
||||
"document": document_payload,
|
||||
"chunk_count": len(chunk_payload),
|
||||
"chunks": chunk_payload,
|
||||
}
|
||||
except R2RException as exc:
|
||||
raise StorageError(f"Failed to get overview for document {document_id}: {exc}") from exc
|
||||
|
||||
@override
|
||||
async def list_documents(
|
||||
self,
|
||||
limit: int = 100,
|
||||
offset: int = 0,
|
||||
*,
|
||||
collection_name: str | None = None,
|
||||
) -> list[dict[str, object]]:
|
||||
"""
|
||||
List documents in R2R with pagination.
|
||||
|
||||
Args:
|
||||
limit: Maximum number of documents to return
|
||||
offset: Number of documents to skip
|
||||
collection_name: Collection name (optional)
|
||||
|
||||
Returns:
|
||||
List of document dictionaries with metadata
|
||||
"""
|
||||
try:
|
||||
documents: list[dict[str, object]] = []
|
||||
|
||||
if collection_name:
|
||||
# Get collection ID first
|
||||
collection_id = await self._ensure_collection(collection_name)
|
||||
# Use the collections API to list documents in a specific collection
|
||||
endpoint = self.endpoint
|
||||
client = httpx.AsyncClient()
|
||||
try:
|
||||
params = {"offset": offset, "limit": limit}
|
||||
response = await client.get(
|
||||
f"{endpoint}/v3/collections/{collection_id}/documents", params=params
|
||||
)
|
||||
response.raise_for_status()
|
||||
data: dict[str, object] = response.json()
|
||||
finally:
|
||||
await client.aclose()
|
||||
|
||||
doc_sequence = _as_sequence(data.get("results", []))
|
||||
else:
|
||||
# List all documents
|
||||
r2r_response = await self.client.documents.list(offset=offset, limit=limit)
|
||||
documents_data: list[object] | dict[str, object] = getattr(
|
||||
r2r_response, "results", []
|
||||
)
|
||||
|
||||
doc_sequence = _as_sequence(
|
||||
documents_data.get("results", [])
|
||||
if isinstance(documents_data, dict)
|
||||
else documents_data
|
||||
)
|
||||
|
||||
for doc_data in doc_sequence:
|
||||
doc_map = _as_mapping(doc_data)
|
||||
|
||||
# Extract standard document fields
|
||||
doc_id = str(doc_map.get("id", ""))
|
||||
title = str(doc_map.get("title", "Untitled"))
|
||||
metadata = _as_mapping(doc_map.get("metadata", {}))
|
||||
|
||||
documents.append(
|
||||
{
|
||||
"id": doc_id,
|
||||
"title": title,
|
||||
"source_url": str(metadata.get("source_url", "")),
|
||||
"description": str(metadata.get("description", "")),
|
||||
"content_type": str(metadata.get("content_type", "text/plain")),
|
||||
"content_preview": str(doc_map.get("content", ""))[:200] + "..."
|
||||
if doc_map.get("content")
|
||||
else "",
|
||||
"word_count": _as_int(metadata.get("word_count", 0)),
|
||||
"timestamp": str(doc_map.get("created_at", "")),
|
||||
}
|
||||
)
|
||||
|
||||
return documents
|
||||
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to list documents: {e}") from e
|
||||
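For orientation, a minimal usage sketch of the management methods above, assuming the surrounding class is exposed as R2RStorage with an endpoint keyword (the class name and constructor shape are assumptions, not part of this diff):

import asyncio

async def main() -> None:
    # `R2RStorage` is a placeholder name for the backend class these methods belong to.
    async with R2RStorage(endpoint="http://r2r.lab") as storage:  # assumed constructor
        names = await storage.list_collections()
        print("collections:", names)

        # Paginated listing scoped to a single collection
        docs = await storage.list_documents(limit=10, collection_name="r2r")
        for doc in docs:
            print(doc["id"], doc["title"], doc["source_url"])

asyncio.run(main())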
File diff suppressed because it is too large
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -2,7 +2,7 @@

import json
from datetime import UTC, datetime
from typing import TypedDict
from typing import TypedDict, cast

import httpx

@@ -32,7 +32,7 @@ class MetadataTagger:
    def __init__(
        self,
        llm_endpoint: str = "http://llm.lab",
        model: str = "openai/gpt-4o-mini",
        model: str = "fireworks/glm-4p5-air",
    ):
        """
        Initialize metadata tagger.
@@ -41,7 +41,7 @@ class MetadataTagger:
            llm_endpoint: LLM API endpoint
            model: Model to use for tagging
        """
        self.endpoint = llm_endpoint
        self.endpoint = llm_endpoint.rstrip('/')
        self.model = model

        # Get API key from environment
@@ -52,7 +52,7 @@ class MetadataTagger:

        # Load .env from the project root
        env_path = Path(__file__).parent.parent.parent / ".env"
        load_dotenv(env_path)
        _ = load_dotenv(env_path)

        api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") or ""

@@ -83,28 +83,33 @@ class MetadataTagger:
        metadata = await self._generate_metadata(
            document.content,
            document.metadata.get("title") if document.metadata else None,
            custom_instructions
            custom_instructions,
        )

        # Merge with existing metadata - preserve required fields
        # Merge with existing metadata - preserve ALL existing fields and add LLM-generated ones
        from typing import cast

        from ..core.models import DocumentMetadata as CoreDocumentMetadata

        updated_metadata: CoreDocumentMetadata = {
            "source_url": document.metadata.get("source_url", ""),
            "title": metadata.get("title") or document.metadata.get("title"),
            "description": metadata.get("summary") or document.metadata.get("description"),
            "timestamp": document.metadata.get("timestamp", datetime.now(UTC)),
            "content_type": document.metadata.get("content_type", "text/plain"),
            "word_count": document.metadata.get("word_count", len(document.content.split())),
            "char_count": document.metadata.get("char_count", len(document.content)),
        }
        # Start with a copy of existing metadata to preserve all fields
        # Cast to avoid TypedDict key errors during manipulation
        updated_metadata = cast(dict[str, object], dict(document.metadata))

        # Store additional metadata as extra fields in the document's metadata
        # Note: Since DocumentMetadata is a TypedDict, we can only include the defined fields
        # Additional metadata like tags, category, etc. would need to be stored separately
        # or the DocumentMetadata model would need to be extended
        # Update/enhance with LLM-generated metadata, preserving existing values when new ones are empty
        if metadata.get("title") and not updated_metadata.get("title"):
            updated_metadata["title"] = str(metadata["title"])  # type: ignore[typeddict-item]
        if metadata.get("summary") and not updated_metadata.get("description"):
            updated_metadata["description"] = str(metadata["summary"])

        document.metadata = updated_metadata
        # Ensure required fields have values
        updated_metadata.setdefault("source_url", "")
        updated_metadata.setdefault("timestamp", datetime.now(UTC))
        updated_metadata.setdefault("content_type", "text/plain")
        updated_metadata.setdefault("word_count", len(document.content.split()))
        updated_metadata.setdefault("char_count", len(document.content))

        # Cast to the expected type since we're preserving all fields from the original metadata
        document.metadata = cast(CoreDocumentMetadata, updated_metadata)

        return document

@@ -190,25 +195,40 @@ Return a JSON object with the following structure:
        )
        response.raise_for_status()

        result = response.json()
        if not isinstance(result, dict):
        result_raw = response.json()
        if not isinstance(result_raw, dict):
            raise IngestionError("Invalid response format from LLM")

        result = cast(dict[str, object], result_raw)

        # Extract content from response
        choices = result.get("choices", [])
        if not choices:
        if not choices or not isinstance(choices, list):
            raise IngestionError("No response from LLM")

        message = choices[0].get("message", {})
        content_str = message.get("content", "{}")
        first_choice_raw = cast(object, choices[0])
        if not isinstance(first_choice_raw, dict):
            raise IngestionError("Invalid choice format")

        first_choice = cast(dict[str, object], first_choice_raw)
        message_raw = first_choice.get("message", {})
        if not isinstance(message_raw, dict):
            raise IngestionError("Invalid message format")

        message = cast(dict[str, object], message_raw)
        content_str = str(message.get("content", "{}"))

        try:
            metadata = json.loads(content_str)
            raw_metadata = json.loads(content_str)
        except json.JSONDecodeError as e:
            raise IngestionError(f"Failed to parse LLM response: {e}") from e

        # Ensure it's a dict before processing
        if not isinstance(raw_metadata, dict):
            raise IngestionError("LLM response is not a valid JSON object")

        # Validate and sanitize metadata
        return self._sanitize_metadata(metadata)
        return self._sanitize_metadata(raw_metadata)

    def _sanitize_metadata(self, metadata: dict[str, object]) -> DocumentMetadata:
        """
@@ -224,7 +244,9 @@ Return a JSON object with the following structure:

        # Tags
        if "tags" in metadata and isinstance(metadata["tags"], list):
            tags = [str(tag).lower().strip() for tag in metadata["tags"][:10]]
            tags_list = cast(list[object], metadata["tags"])
            tags_raw = tags_list[:10] if len(tags_list) > 10 else tags_list
            tags = [str(tag).lower().strip() for tag in tags_raw]
            sanitized["tags"] = [tag for tag in tags if tag]

        # Category
@@ -233,13 +255,14 @@ Return a JSON object with the following structure:

        # Summary
        if "summary" in metadata:
            summary = str(metadata["summary"]).strip()
            if summary:
            if summary := str(metadata["summary"]).strip():
                sanitized["summary"] = summary[:500]  # Limit length

        # Key topics
        if "key_topics" in metadata and isinstance(metadata["key_topics"], list):
            topics = [str(topic).strip() for topic in metadata["key_topics"][:10]]
            topics_list = cast(list[object], metadata["key_topics"])
            topics_raw = topics_list[:10] if len(topics_list) > 10 else topics_list
            topics = [str(topic).strip() for topic in topics_raw]
            sanitized["key_topics"] = [topic for topic in topics if topic]

        # Document type
@@ -255,7 +278,7 @@ Return a JSON object with the following structure:
        # Technical level
        if "technical_level" in metadata:
            level = str(metadata["technical_level"]).strip().lower()
            if level in ["beginner", "intermediate", "advanced"]:
            if level in {"beginner", "intermediate", "advanced"}:
                sanitized["technical_level"] = level

        return sanitized

|
||||
"""Vectorizer utility for generating embeddings."""
|
||||
|
||||
from types import TracebackType
|
||||
from typing import Self
|
||||
from typing import Self, cast
|
||||
|
||||
import httpx
|
||||
|
||||
from typings import EmbeddingResponse
|
||||
|
||||
from ..core.exceptions import VectorizationError
|
||||
from ..core.models import StorageConfig, VectorConfig
|
||||
|
||||
@@ -15,7 +17,6 @@ class Vectorizer:
|
||||
endpoint: str
|
||||
model: str
|
||||
dimension: int
|
||||
client: httpx.AsyncClient
|
||||
|
||||
def __init__(self, config: StorageConfig | VectorConfig):
|
||||
"""
|
||||
@@ -27,7 +28,7 @@ class Vectorizer:
|
||||
if isinstance(config, StorageConfig):
|
||||
# Extract vector config from storage config
|
||||
self.endpoint = "http://llm.lab"
|
||||
self.model = "ollama/bge-m3:latest"
|
||||
self.model = "ollama/bge-m3"
|
||||
self.dimension = 1024
|
||||
else:
|
||||
self.endpoint = str(config.embedding_endpoint)
|
||||
@@ -36,12 +37,13 @@ class Vectorizer:
|
||||
|
||||
# Get API key from environment
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load .env from the project root
|
||||
env_path = Path(__file__).parent.parent.parent / ".env"
|
||||
load_dotenv(env_path)
|
||||
_ = load_dotenv(env_path)
|
||||
|
||||
api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") or ""
|
||||
|
||||
@@ -49,7 +51,7 @@ class Vectorizer:
|
||||
if api_key:
|
||||
headers["Authorization"] = f"Bearer {api_key}"
|
||||
|
||||
self.client = httpx.AsyncClient(timeout=60.0, headers=headers)
|
||||
self.client = httpx.AsyncClient(timeout=60.0, headers=headers) # type: ignore[attr-defined]
|
||||
|
||||
async def vectorize(self, text: str) -> list[float]:
|
||||
"""
|
||||
@@ -65,14 +67,11 @@ class Vectorizer:
|
||||
raise VectorizationError("Cannot vectorize empty text")
|
||||
|
||||
try:
|
||||
# Prepare request based on model type
|
||||
if "ollama" in self.model:
|
||||
response = await self._ollama_embed(text)
|
||||
else:
|
||||
response = await self._openai_embed(text)
|
||||
|
||||
return response
|
||||
|
||||
return (
|
||||
await self._ollama_embed(text)
|
||||
if "ollama" in self.model
|
||||
else await self._openai_embed(text)
|
||||
)
|
||||
except Exception as e:
|
||||
raise VectorizationError(f"Vectorization failed: {e}") from e
|
||||
|
||||
@@ -104,7 +103,7 @@ class Vectorizer:
|
||||
Returns:
|
||||
Embedding vector
|
||||
"""
|
||||
# Keep the full model name for OpenAI-compatible endpoints
|
||||
# Use the full model name as it appears in the API
|
||||
model_name = self.model
|
||||
|
||||
# Use OpenAI-compatible endpoint for ollama models
|
||||
@@ -117,38 +116,29 @@ class Vectorizer:
|
||||
)
|
||||
_ = response.raise_for_status()
|
||||
|
||||
response_data = response.json()
|
||||
if not isinstance(response_data, dict):
|
||||
raise VectorizationError("Invalid response format from OpenAI-compatible API")
|
||||
response_json = response.json()
|
||||
# Response is expected to be dict[str, object] from our type stub
|
||||
|
||||
response_data = cast(EmbeddingResponse, cast(object, response_json))
|
||||
|
||||
# Parse OpenAI-compatible response format
|
||||
embeddings_raw = response_data.get("data", [])
|
||||
if not isinstance(embeddings_raw, list) or not embeddings_raw:
|
||||
embeddings_list = response_data.get("data", [])
|
||||
if not embeddings_list:
|
||||
raise VectorizationError("No embeddings returned")
|
||||
|
||||
first_embedding_data = embeddings_raw[0]
|
||||
if not isinstance(first_embedding_data, dict):
|
||||
raise VectorizationError("Invalid embedding data format")
|
||||
|
||||
embedding_raw = first_embedding_data.get("embedding")
|
||||
if not isinstance(embedding_raw, list):
|
||||
first_embedding = embeddings_list[0]
|
||||
embedding_raw = first_embedding.get("embedding")
|
||||
if not embedding_raw:
|
||||
raise VectorizationError("Invalid embedding format")
|
||||
|
||||
# Convert to float list and validate
|
||||
embedding: list[float] = []
|
||||
for item in embedding_raw:
|
||||
if isinstance(item, (int, float)):
|
||||
embedding.append(float(item))
|
||||
else:
|
||||
raise VectorizationError(f"Invalid embedding value type: {type(item)}")
|
||||
|
||||
embedding.extend(float(item) for item in embedding_raw)
|
||||
# Ensure correct dimension
|
||||
if len(embedding) != self.dimension:
|
||||
# Truncate or pad as needed
|
||||
if len(embedding) > self.dimension:
|
||||
embedding = embedding[: self.dimension]
|
||||
else:
|
||||
embedding.extend([0.0] * (self.dimension - len(embedding)))
|
||||
raise VectorizationError(
|
||||
f"Embedding dimension mismatch: expected {self.dimension}, received {len(embedding)}"
|
||||
)
|
||||
|
||||
return embedding
|
||||
|
||||
@@ -171,38 +161,28 @@ class Vectorizer:
|
||||
)
|
||||
_ = response.raise_for_status()
|
||||
|
||||
response_data = response.json()
|
||||
if not isinstance(response_data, dict):
|
||||
raise VectorizationError("Invalid response format from OpenAI API")
|
||||
response_json = response.json()
|
||||
# Response is expected to be dict[str, object] from our type stub
|
||||
|
||||
data: dict[str, list[dict[str, list[float]]]] = response_data
|
||||
response_data = cast(EmbeddingResponse, cast(object, response_json))
|
||||
|
||||
embeddings_raw = data.get("data", [])
|
||||
if not isinstance(embeddings_raw, list) or not embeddings_raw:
|
||||
embeddings_list = response_data.get("data", [])
|
||||
if not embeddings_list:
|
||||
raise VectorizationError("No embeddings returned")
|
||||
|
||||
first_embedding_data = embeddings_raw[0]
|
||||
if not isinstance(first_embedding_data, dict):
|
||||
raise VectorizationError("Invalid embedding data format")
|
||||
|
||||
embedding_raw = first_embedding_data.get("embedding")
|
||||
if not isinstance(embedding_raw, list):
|
||||
first_embedding = embeddings_list[0]
|
||||
embedding_raw = first_embedding.get("embedding")
|
||||
if not embedding_raw:
|
||||
raise VectorizationError("Invalid embedding format")
|
||||
|
||||
# Convert to float list and validate
|
||||
embedding: list[float] = []
|
||||
for item in embedding_raw:
|
||||
if isinstance(item, (int, float)):
|
||||
embedding.append(float(item))
|
||||
else:
|
||||
raise VectorizationError(f"Invalid embedding value type: {type(item)}")
|
||||
|
||||
embedding.extend(float(item) for item in embedding_raw)
|
||||
# Ensure correct dimension
|
||||
if len(embedding) != self.dimension:
|
||||
if len(embedding) > self.dimension:
|
||||
embedding = embedding[: self.dimension]
|
||||
else:
|
||||
embedding.extend([0.0] * (self.dimension - len(embedding)))
|
||||
raise VectorizationError(
|
||||
f"Embedding dimension mismatch: expected {self.dimension}, received {len(embedding)}"
|
||||
)
|
||||
|
||||
return embedding
|
||||
|
||||
|
||||
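Both embed paths now treat a dimension mismatch as a hard failure instead of silently padding or truncating the vector. A minimal sketch of that strict check, using a generic ValueError in place of the project's VectorizationError (the helper name is illustrative):

def validate_dimension(embedding: list[float], expected: int) -> list[float]:
    # Raise instead of padding/truncating so a misconfigured embedding model is caught early.
    if len(embedding) != expected:
        raise ValueError(
            f"Embedding dimension mismatch: expected {expected}, received {len(embedding)}"
        )
    return embedding

validate_dimension([0.1] * 1024, 1024)  # passes; a 768-dim vector would raise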
848
logs/tui.log
Normal file
@@ -0,0 +1,848 @@
2025-09-18 07:54:36 | INFO | ingest_pipeline.cli.tui.utils.runners | Initializing collection management TUI
|
||||
2025-09-18 07:54:36 | INFO | ingest_pipeline.cli.tui.utils.runners | Scanning available storage backends
|
||||
2025-09-18 07:54:36 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 07:54:36 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/meta "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:36 | INFO | httpx | HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:36 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/schema "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:36 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:36 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:36 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:36 | INFO | ingest_pipeline.cli.tui.utils.runners | weaviate connected successfully
|
||||
2025-09-18 07:54:36 | INFO | ingest_pipeline.cli.tui.utils.runners | open_webui connected successfully
|
||||
2025-09-18 07:54:36 | INFO | ingest_pipeline.cli.tui.utils.runners | r2r connected successfully
|
||||
2025-09-18 07:54:36 | INFO | ingest_pipeline.cli.tui.utils.runners | Launching TUI with 3 backend(s): weaviate, open_webui, r2r
|
||||
2025-09-18 07:54:36 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/schema "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:36 | INFO | httpx | HTTP Request: POST http://weaviate.yo/v1/graphql "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:36 | INFO | httpx | HTTP Request: POST http://weaviate.yo/v1/graphql "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:36 | INFO | httpx | HTTP Request: POST http://weaviate.yo/v1/graphql "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/d06bd108-ae7f-44f4-92fb-2ac556784920 "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/139c04d5-7d38-4595-8e12-79a67fd731e7 "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/96832710-8146-4e3b-88f3-4b3929f67dbf "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/dade78d9-9893-4966-bd4b-31f1c1635cfa "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/721c1517-b2cd-482d-bd1c-f99571f0f31f "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/f867530b-5eea-43bf-8257-d3da497cb10b "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/cbd4ae82-6fdd-4a4e-a4d5-d0b97ae988fd "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:54:37 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:00 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:00 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=fa923688-217a-41ce-a381-ae4bb8e4d40c "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 07:55:00 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flows/ "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:01 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 07:55:01 | INFO | prefect.engine | View at http://prefect.lab/runs/flow-run/ffa24f3c-fb6a-4fb5-b929-225eac154755
|
||||
2025-09-18 07:55:01 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/ffa24f3c-fb6a-4fb5-b929-225eac154755/set_state "HTTP/1.1 201 Created"
|
||||
2025-09-18 07:55:01 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flow_runs/ffa24f3c-fb6a-4fb5-b929-225eac154755 "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:01 | INFO | prefect.flow_runs | Beginning flow run 'colossal-swan' for flow 'ingestion_pipeline'
|
||||
2025-09-18 07:55:01 | INFO | prefect.flow_runs | View at http://prefect.lab/runs/flow-run/ffa24f3c-fb6a-4fb5-b929-225eac154755
|
||||
2025-09-18 07:55:01 | INFO | prefect.flow_runs | Starting ingestion from https://r2r-docs.sciphi.ai/introduction
|
||||
2025-09-18 07:55:01 | INFO | prefect.flow_runs | Validating source...
|
||||
2025-09-18 07:55:01 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=067c7220-07bc-4299-acbb-ebb65e47b26f "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 07:55:01 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:01 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flows/78f3cfb6-1339-49c6-89f4-c38effea29e4 "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:03 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=6975811f-b28f-4a9a-8e7e-40a210313c82 "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 07:55:03 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:05 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=5f81018b-d377-438b-bdaa-dd7f02a1b29f "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=2ece8111-a752-4a05-95e9-58a000f64d68 "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:05 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 07:55:05 | INFO | prefect.flow_runs | Ingesting documents...
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/task_runs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flows/ "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 07:55:05 | INFO | prefect.engine | View at http://prefect.lab/runs/flow-run/092dd5b6-0f86-4e27-94ae-28c7638e7c40
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/092dd5b6-0f86-4e27-94ae-28c7638e7c40/set_state "HTTP/1.1 201 Created"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flow_runs/092dd5b6-0f86-4e27-94ae-28c7638e7c40 "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:05 | INFO | prefect.flow_runs | Beginning subflow run 'amiable-marmoset' for flow 'firecrawl_to_r2r'
|
||||
2025-09-18 07:55:05 | INFO | prefect.flow_runs | View at http://prefect.lab/runs/flow-run/092dd5b6-0f86-4e27-94ae-28c7638e7c40
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flows/bac48c85-e6dc-4da0-99d5-6f26e027cabb "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=022cacb0-b4d4-4989-aade-55300219df5e "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:05 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=f2bf95d5-c836-4266-8148-336bb7c622fc "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 07:55:05 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:06 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/map "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:06 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:06 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 07:55:06 | INFO | prefect.flow_runs | Discovered 5 unique URLs from Firecrawl map
|
||||
2025-09-18 07:55:06 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=6fdc6a20-d4d7-4b74-8567-745ae21ed80e "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 07:55:06 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:06 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/c5c726b4-805a-5e22-ad13-323750b25efa "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 07:55:06 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/a534965a-9da2-566e-a9ad-3e0da59bd3ae "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 07:55:06 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/8af54b00-fe82-55c5-a1a5-fd0544139b62 "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 07:55:06 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/9a2d0156-602f-5e4a-a8e1-22edd4c987e6 "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 07:55:06 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/c01a1979-1dba-5731-bc71-39daff2e6ca2 "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 07:55:06 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:06 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 07:55:06 | INFO | prefect.flow_runs | Scraping 1 batches of Firecrawl pages
|
||||
2025-09-18 07:55:06 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=2f238b05-eae3-44e7-8fe9-8a43fad6a505 "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 07:55:06 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:07 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 07:55:11 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:12 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:14 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:14 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:16 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:16 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:16 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 07:55:16 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=b09316a7-665d-4ef4-9d9d-8e4fcdb17aa2 "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 07:55:16 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:16 | INFO | prefect.task_runs | Task run failed with exception: 1 validation error for Document
|
||||
metadata.author
|
||||
Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
|
||||
For further information visit https://errors.pydantic.dev/2.11/v/string_type - Retry 1/1 will start 10 second(s) from now
|
||||
2025-09-18 07:55:17 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 07:55:26 | ERROR | prefect.task_runs | Task run failed with exception: 1 validation error for Document
|
||||
metadata.author
|
||||
Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
|
||||
For further information visit https://errors.pydantic.dev/2.11/v/string_type - Retries are exhausted
|
||||
Traceback (most recent call last):
|
||||
File "/home/vasceannie/projects/rag-manager/.venv/lib/python3.12/site-packages/prefect/task_engine.py", line 1459, in run_context
|
||||
yield self
|
||||
File "/home/vasceannie/projects/rag-manager/.venv/lib/python3.12/site-packages/prefect/task_engine.py", line 1538, in run_task_async
|
||||
await engine.call_task_fn(txn)
|
||||
File "/home/vasceannie/projects/rag-manager/.venv/lib/python3.12/site-packages/prefect/task_engine.py", line 1476, in call_task_fn
|
||||
result = await call_with_parameters(self.task.fn, parameters)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/home/vasceannie/projects/rag-manager/ingest_pipeline/flows/ingestion.py", line 149, in annotate_firecrawl_metadata_task
|
||||
documents = [ingestor.create_document(page, job) for page in pages]
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/home/vasceannie/projects/rag-manager/ingest_pipeline/ingestors/firecrawl.py", line 489, in create_document
|
||||
return Document(
|
||||
^^^^^^^^^
|
||||
File "/home/vasceannie/projects/rag-manager/.venv/lib/python3.12/site-packages/pydantic/main.py", line 253, in __init__
|
||||
validated_self = self.__pydantic_validator__.validate_python(data, self_instance=self)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
pydantic_core._pydantic_core.ValidationError: 1 validation error for Document
|
||||
metadata.author
|
||||
Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
|
||||
For further information visit https://errors.pydantic.dev/2.11/v/string_type
|
||||
2025-09-18 07:55:26 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:26 | ERROR | prefect.task_runs | Finished in state Failed('Task run encountered an exception ValidationError: 1 validation error for Document\nmetadata.author\n Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]\n For further information visit https://errors.pydantic.dev/2.11/v/string_type')
|
||||
2025-09-18 07:55:26 | ERROR | prefect.flow_runs | Encountered exception during execution: 1 validation error for Document
|
||||
metadata.author
|
||||
Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
|
||||
For further information visit https://errors.pydantic.dev/2.11/v/string_type
|
||||
Traceback (most recent call last):
|
||||
File "/home/vasceannie/projects/rag-manager/.venv/lib/python3.12/site-packages/prefect/flow_engine.py", line 1357, in run_context
|
||||
yield self
|
||||
File "/home/vasceannie/projects/rag-manager/.venv/lib/python3.12/site-packages/prefect/flow_engine.py", line 1419, in run_flow_async
|
||||
await engine.call_flow_fn()
|
||||
File "/home/vasceannie/projects/rag-manager/.venv/lib/python3.12/site-packages/prefect/flow_engine.py", line 1371, in call_flow_fn
|
||||
result = await call_with_parameters(self.flow.fn, self.parameters)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/home/vasceannie/projects/rag-manager/ingest_pipeline/flows/ingestion.py", line 467, in firecrawl_to_r2r_flow
|
||||
documents = await annotate_firecrawl_metadata_task(scraped_pages, job)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/home/vasceannie/projects/rag-manager/.venv/lib/python3.12/site-packages/prefect/task_engine.py", line 1540, in run_task_async
|
||||
return engine.state if return_type == "state" else await engine.result()
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/home/vasceannie/projects/rag-manager/.venv/lib/python3.12/site-packages/prefect/task_engine.py", line 1087, in result
|
||||
raise self._raised
|
||||
File "/home/vasceannie/projects/rag-manager/.venv/lib/python3.12/site-packages/prefect/task_engine.py", line 1459, in run_context
|
||||
yield self
|
||||
File "/home/vasceannie/projects/rag-manager/.venv/lib/python3.12/site-packages/prefect/task_engine.py", line 1538, in run_task_async
|
||||
await engine.call_task_fn(txn)
|
||||
File "/home/vasceannie/projects/rag-manager/.venv/lib/python3.12/site-packages/prefect/task_engine.py", line 1476, in call_task_fn
|
||||
result = await call_with_parameters(self.task.fn, parameters)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/home/vasceannie/projects/rag-manager/ingest_pipeline/flows/ingestion.py", line 149, in annotate_firecrawl_metadata_task
|
||||
documents = [ingestor.create_document(page, job) for page in pages]
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/home/vasceannie/projects/rag-manager/ingest_pipeline/ingestors/firecrawl.py", line 489, in create_document
|
||||
return Document(
|
||||
^^^^^^^^^
|
||||
File "/home/vasceannie/projects/rag-manager/.venv/lib/python3.12/site-packages/pydantic/main.py", line 253, in __init__
|
||||
validated_self = self.__pydantic_validator__.validate_python(data, self_instance=self)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
pydantic_core._pydantic_core.ValidationError: 1 validation error for Document
|
||||
metadata.author
|
||||
Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
|
||||
For further information visit https://errors.pydantic.dev/2.11/v/string_type
|
||||
2025-09-18 07:55:26 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/092dd5b6-0f86-4e27-94ae-28c7638e7c40/set_state "HTTP/1.1 201 Created"
|
||||
2025-09-18 07:55:26 | ERROR | prefect.flow_runs | Finished in state Failed('Flow run encountered an exception: ValidationError: 1 validation error for Document\nmetadata.author\n Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]\n For further information visit https://errors.pydantic.dev/2.11/v/string_type')
|
||||
2025-09-18 07:55:26 | INFO | prefect.flow_runs | Ingestion failed: 1 validation error for Document
|
||||
metadata.author
|
||||
Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
|
||||
For further information visit https://errors.pydantic.dev/2.11/v/string_type
|
||||
2025-09-18 07:55:26 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:26 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:26 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=2ac58f3e-8c81-4c9e-9a68-b5c84902da18 "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 07:55:26 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 07:55:26 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 07:55:26 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/ffa24f3c-fb6a-4fb5-b929-225eac154755/set_state "HTTP/1.1 201 Created"
|
||||
2025-09-18 07:55:26 | INFO | prefect.flow_runs | Finished in state Completed()
|
||||
2025-09-18 07:55:27 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:00:45 | INFO | ingest_pipeline.cli.tui.utils.runners | Shutting down storage connections
|
||||
2025-09-18 08:00:45 | INFO | ingest_pipeline.cli.tui.utils.runners | All storage connections closed gracefully
|
||||
2025-09-18 08:00:49 | INFO | ingest_pipeline.cli.tui.utils.runners | Initializing collection management TUI
|
||||
2025-09-18 08:00:49 | INFO | ingest_pipeline.cli.tui.utils.runners | Scanning available storage backends
|
||||
2025-09-18 08:00:49 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 08:00:50 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/meta "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:50 | INFO | httpx | HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:50 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/schema "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:50 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:50 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:50 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:50 | INFO | ingest_pipeline.cli.tui.utils.runners | weaviate connected successfully
|
||||
2025-09-18 08:00:50 | INFO | ingest_pipeline.cli.tui.utils.runners | open_webui connected successfully
|
||||
2025-09-18 08:00:50 | INFO | ingest_pipeline.cli.tui.utils.runners | r2r connected successfully
|
||||
2025-09-18 08:00:50 | INFO | ingest_pipeline.cli.tui.utils.runners | Launching TUI with 3 backend(s): weaviate, open_webui, r2r
|
||||
2025-09-18 08:00:50 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/schema "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:50 | INFO | httpx | HTTP Request: POST http://weaviate.yo/v1/graphql "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:50 | INFO | httpx | HTTP Request: POST http://weaviate.yo/v1/graphql "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:50 | INFO | httpx | HTTP Request: POST http://weaviate.yo/v1/graphql "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/d06bd108-ae7f-44f4-92fb-2ac556784920 "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/139c04d5-7d38-4595-8e12-79a67fd731e7 "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/96832710-8146-4e3b-88f3-4b3929f67dbf "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/dade78d9-9893-4966-bd4b-31f1c1635cfa "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/721c1517-b2cd-482d-bd1c-f99571f0f31f "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/f867530b-5eea-43bf-8257-d3da497cb10b "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/cbd4ae82-6fdd-4a4e-a4d5-d0b97ae988fd "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:00:51 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:14 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:14 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=4e57b817-47a8-4468-b030-48bcb4a52c2f "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:01:14 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flows/ "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:14 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:01:14 | INFO | prefect.engine | View at http://prefect.lab/runs/flow-run/063d86dc-a190-4be9-a56a-de1d1257478f
|
||||
2025-09-18 08:01:14 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/063d86dc-a190-4be9-a56a-de1d1257478f/set_state "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:01:14 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flow_runs/063d86dc-a190-4be9-a56a-de1d1257478f "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:14 | INFO | prefect.flow_runs | Beginning flow run 'ingenious-meerkat' for flow 'ingestion_pipeline'
|
||||
2025-09-18 08:01:14 | INFO | prefect.flow_runs | View at http://prefect.lab/runs/flow-run/063d86dc-a190-4be9-a56a-de1d1257478f
|
||||
2025-09-18 08:01:14 | INFO | prefect.flow_runs | Starting ingestion from https://r2r-docs.sciphi.ai/introduction
|
||||
2025-09-18 08:01:14 | INFO | prefect.flow_runs | Validating source...
|
||||
2025-09-18 08:01:15 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=e5c33d4d-db9c-4a37-8a41-3d553a584909 "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:01:15 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:15 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flows/78f3cfb6-1339-49c6-89f4-c38effea29e4 "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:16 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=d85fd4d3-4dd6-451f-ac17-8d3bdc4bf6ab "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:01:16 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:19 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=128eff67-fb5c-49f0-a29b-633de309b56a "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=61d9033b-b8e2-4740-bf02-ad5534afc98e "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:19 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:01:19 | INFO | prefect.flow_runs | Ingesting documents...
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/task_runs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flows/ "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:01:19 | INFO | prefect.engine | View at http://prefect.lab/runs/flow-run/f79e3c88-5696-47b7-804f-52e63e119d4f
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/f79e3c88-5696-47b7-804f-52e63e119d4f/set_state "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flow_runs/f79e3c88-5696-47b7-804f-52e63e119d4f "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:19 | INFO | prefect.flow_runs | Beginning subflow run 'scarlet-kestrel' for flow 'firecrawl_to_r2r'
|
||||
2025-09-18 08:01:19 | INFO | prefect.flow_runs | View at http://prefect.lab/runs/flow-run/f79e3c88-5696-47b7-804f-52e63e119d4f
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flows/bac48c85-e6dc-4da0-99d5-6f26e027cabb "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=a4940a61-03b2-42fd-8be5-c56fb41c5fbb "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:19 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=73eee395-facc-4155-8860-f0bb18505773 "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:01:19 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:20 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/map "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:20 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:20 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:01:20 | INFO | prefect.flow_runs | Discovered 5 unique URLs from Firecrawl map
|
||||
2025-09-18 08:01:20 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=a15d4179-100a-4e62-9c29-96d5dc33f25b "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:01:20 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:20 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/c5c726b4-805a-5e22-ad13-323750b25efa "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 08:01:20 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/a534965a-9da2-566e-a9ad-3e0da59bd3ae "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 08:01:20 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/8af54b00-fe82-55c5-a1a5-fd0544139b62 "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 08:01:20 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/9a2d0156-602f-5e4a-a8e1-22edd4c987e6 "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 08:01:20 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/c01a1979-1dba-5731-bc71-39daff2e6ca2 "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 08:01:20 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:20 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:01:20 | INFO | prefect.flow_runs | Scraping 1 batches of Firecrawl pages
|
||||
2025-09-18 08:01:20 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=ab85073c-948a-4dfd-a69a-88908220a6d7 "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:01:20 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:21 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:01:24 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:25 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:26 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:28 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:29 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:29 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:29 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:01:29 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=cd0f7f38-7c7d-4c70-8b71-96747cc6f47d "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:01:29 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:31 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:01:31 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:33 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:35 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:36 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:38 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:38 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:38 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:01:38 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=8a7fc241-e552-41df-a026-f108da848756 "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:01:38 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:38 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:01:38 | INFO | prefect.task_runs | Using collection ID: 866022d4-9a5d-4ff2-9609-1412502d44a1 for collection: r2r
|
||||
2025-09-18 08:01:38 | INFO | prefect.task_runs | Creating document with ID: c5c726b4-805a-5e22-ad13-323750b25efa
|
||||
2025-09-18 08:01:38 | INFO | prefect.task_runs | Built metadata for document c5c726b4-805a-5e22-ad13-323750b25efa: {'source_url': 'https://r2r-docs.sciphi.ai/introduction', 'content_type': 'text/markdown', 'word_count': 296, 'char_count': 3000, 'timestamp': '2025-09-18T08:01:29.271720+00:00', 'ingestion_source': 'web', 'title': 'Introduction | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'R2R is an advanced all-in-one AI Retrieval-Augmented Generation (RAG) solution with multimodal content ingestion, hybrid search, configurable GraphRAG, and a Deep Research API for complex queries.'}
|
||||
2025-09-18 08:01:38 | INFO | prefect.task_runs | Creating document c5c726b4-805a-5e22-ad13-323750b25efa with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
|
||||
2025-09-18 08:01:38 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
|
||||
2025-09-18 08:01:38 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction", "content_type": "text/markdown", "word_count": 296, "char_count": 3000, "timestamp": "2025-09-18T08:01:29.271720+00:00", "ingestion_source": "web", "title": "Introduction | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "R2R is an advanced all-in-one AI Retrieval-Augmented Generation (RAG) solution with multimodal content ingestion, hybrid search, configurable GraphRAG, and a Deep Research API for complex queries."}
|
||||
2025-09-18 08:01:38 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
|
||||
2025-09-18 08:01:38 | INFO | prefect.task_runs | R2R returned document ID: c5c726b4-805a-5e22-ad13-323750b25efa
|
||||
2025-09-18 08:01:38 | INFO | prefect.task_runs | Document c5c726b4-805a-5e22-ad13-323750b25efa should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
2025-09-18 08:01:38 | INFO | prefect.task_runs | Creating document with ID: a534965a-9da2-566e-a9ad-3e0da59bd3ae
2025-09-18 08:01:38 | INFO | prefect.task_runs | Built metadata for document a534965a-9da2-566e-a9ad-3e0da59bd3ae: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/system', 'content_type': 'text/markdown', 'word_count': 146, 'char_count': 1275, 'timestamp': '2025-09-18T08:01:29.271879+00:00', 'ingestion_source': 'web', 'title': 'System | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'Learn about the R2R system architecture'}
2025-09-18 08:01:38 | INFO | prefect.task_runs | Creating document a534965a-9da2-566e-a9ad-3e0da59bd3ae with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
2025-09-18 08:01:38 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
2025-09-18 08:01:38 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/system", "content_type": "text/markdown", "word_count": 146, "char_count": 1275, "timestamp": "2025-09-18T08:01:29.271879+00:00", "ingestion_source": "web", "title": "System | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "Learn about the R2R system architecture"}
2025-09-18 08:01:38 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
2025-09-18 08:01:38 | INFO | prefect.task_runs | R2R returned document ID: a534965a-9da2-566e-a9ad-3e0da59bd3ae
2025-09-18 08:01:38 | INFO | prefect.task_runs | Document a534965a-9da2-566e-a9ad-3e0da59bd3ae should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
2025-09-18 08:01:38 | INFO | prefect.task_runs | Creating document with ID: 8af54b00-fe82-55c5-a1a5-fd0544139b62
2025-09-18 08:01:38 | INFO | prefect.task_runs | Built metadata for document 8af54b00-fe82-55c5-a1a5-fd0544139b62: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/whats-new', 'content_type': 'text/markdown', 'word_count': 42, 'char_count': 350, 'timestamp': '2025-09-18T08:01:29.271949+00:00', 'ingestion_source': 'web', 'title': "What's New | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", 'description': 'Release notes for version 0.3.5 of an advanced AI retrieval system featuring Agentic RAG with improved API, SSE streaming output, and enhanced citations.'}
2025-09-18 08:01:38 | INFO | prefect.task_runs | Creating document 8af54b00-fe82-55c5-a1a5-fd0544139b62 with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
2025-09-18 08:01:38 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
2025-09-18 08:01:38 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/whats-new", "content_type": "text/markdown", "word_count": 42, "char_count": 350, "timestamp": "2025-09-18T08:01:29.271949+00:00", "ingestion_source": "web", "title": "What's New | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "Release notes for version 0.3.5 of an advanced AI retrieval system featuring Agentic RAG with improved API, SSE streaming output, and enhanced citations."}
2025-09-18 08:01:38 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
2025-09-18 08:01:38 | INFO | prefect.task_runs | R2R returned document ID: 8af54b00-fe82-55c5-a1a5-fd0544139b62
2025-09-18 08:01:38 | INFO | prefect.task_runs | Document 8af54b00-fe82-55c5-a1a5-fd0544139b62 should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
2025-09-18 08:01:38 | INFO | prefect.task_runs | Creating document with ID: 9a2d0156-602f-5e4a-a8e1-22edd4c987e6
2025-09-18 08:01:38 | INFO | prefect.task_runs | Built metadata for document 9a2d0156-602f-5e4a-a8e1-22edd4c987e6: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/what-is-r2r', 'content_type': 'text/markdown', 'word_count': 444, 'char_count': 3541, 'timestamp': '2025-09-18T08:01:29.272137+00:00', 'ingestion_source': 'web', 'title': 'What is R2R? | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'R2R is an advanced AI retrieval system that provides infrastructure and tools for implementing efficient, scalable, and reliable AI-powered document understanding in applications through Retrieval-Augmented Generation (RAG) with a RESTful API.'}
2025-09-18 08:01:38 | INFO | prefect.task_runs | Creating document 9a2d0156-602f-5e4a-a8e1-22edd4c987e6 with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
2025-09-18 08:01:38 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
2025-09-18 08:01:38 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/what-is-r2r", "content_type": "text/markdown", "word_count": 444, "char_count": 3541, "timestamp": "2025-09-18T08:01:29.272137+00:00", "ingestion_source": "web", "title": "What is R2R? | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "R2R is an advanced AI retrieval system that provides infrastructure and tools for implementing efficient, scalable, and reliable AI-powered document understanding in applications through Retrieval-Augmented Generation (RAG) with a RESTful API."}
2025-09-18 08:01:39 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
2025-09-18 08:01:39 | INFO | prefect.task_runs | R2R returned document ID: 9a2d0156-602f-5e4a-a8e1-22edd4c987e6
2025-09-18 08:01:39 | INFO | prefect.task_runs | Document 9a2d0156-602f-5e4a-a8e1-22edd4c987e6 should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
2025-09-18 08:01:39 | INFO | prefect.task_runs | Creating document with ID: c01a1979-1dba-5731-bc71-39daff2e6ca2
2025-09-18 08:01:39 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
2025-09-18 08:01:39 | INFO | prefect.task_runs | Built metadata for document c01a1979-1dba-5731-bc71-39daff2e6ca2: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/rag', 'content_type': 'text/markdown', 'word_count': 632, 'char_count': 4228, 'timestamp': '2025-09-18T08:01:29.272373+00:00', 'ingestion_source': 'web', 'title': 'More about RAG | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'This document provides a comprehensive guide to implementing and configuring Retrieval-Augmented Generation (RAG) using the R2R system, covering setup, configuration, and operational details.'}
2025-09-18 08:01:39 | INFO | prefect.task_runs | Creating document c01a1979-1dba-5731-bc71-39daff2e6ca2 with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
2025-09-18 08:01:39 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
2025-09-18 08:01:39 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/rag", "content_type": "text/markdown", "word_count": 632, "char_count": 4228, "timestamp": "2025-09-18T08:01:29.272373+00:00", "ingestion_source": "web", "title": "More about RAG | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "This document provides a comprehensive guide to implementing and configuring Retrieval-Augmented Generation (RAG) using the R2R system, covering setup, configuration, and operational details."}
2025-09-18 08:01:39 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
2025-09-18 08:01:39 | INFO | prefect.task_runs | R2R returned document ID: c01a1979-1dba-5731-bc71-39daff2e6ca2
2025-09-18 08:01:39 | INFO | prefect.task_runs | Document c01a1979-1dba-5731-bc71-39daff2e6ca2 should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
2025-09-18 08:01:39 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:01:39 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:01:39 | INFO | prefect.flow_runs | Upserted 5 documents into R2R (0 failed)
2025-09-18 08:01:39 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/f79e3c88-5696-47b7-804f-52e63e119d4f/set_state "HTTP/1.1 201 Created"
2025-09-18 08:01:39 | INFO | prefect.flow_runs | Finished in state Completed()
2025-09-18 08:01:39 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
2025-09-18 08:01:39 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:01:39 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=3aa36664-8d4e-4c80-9482-0dee2f2b003f "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:01:39 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:01:39 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:01:39 | INFO | prefect.flow_runs | Ingestion completed: 5 processed, 0 failed
2025-09-18 08:01:39 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/063d86dc-a190-4be9-a56a-de1d1257478f/set_state "HTTP/1.1 201 Created"
2025-09-18 08:01:39 | INFO | prefect.flow_runs | Finished in state Completed()
2025-09-18 08:01:41 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
2025-09-18 08:07:27 | INFO | ingest_pipeline.cli.tui.utils.runners | Shutting down storage connections
2025-09-18 08:07:27 | INFO | ingest_pipeline.cli.tui.utils.runners | All storage connections closed gracefully
2025-09-18 08:07:31 | INFO | ingest_pipeline.cli.tui.utils.runners | Initializing collection management TUI
2025-09-18 08:07:31 | INFO | ingest_pipeline.cli.tui.utils.runners | Scanning available storage backends
2025-09-18 08:07:31 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
2025-09-18 08:07:31 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/meta "HTTP/1.1 200 OK"
2025-09-18 08:07:31 | INFO | httpx | HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
2025-09-18 08:07:31 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/schema "HTTP/1.1 200 OK"
2025-09-18 08:07:31 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:07:31 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:07:31 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:07:31 | INFO | ingest_pipeline.cli.tui.utils.runners | weaviate connected successfully
2025-09-18 08:07:31 | INFO | ingest_pipeline.cli.tui.utils.runners | open_webui connected successfully
2025-09-18 08:07:31 | INFO | ingest_pipeline.cli.tui.utils.runners | r2r connected successfully
2025-09-18 08:07:31 | INFO | ingest_pipeline.cli.tui.utils.runners | Launching TUI with 3 backend(s): weaviate, open_webui, r2r
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/schema "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: POST http://weaviate.yo/v1/graphql "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: POST http://weaviate.yo/v1/graphql "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: POST http://weaviate.yo/v1/graphql "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/d06bd108-ae7f-44f4-92fb-2ac556784920 "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/139c04d5-7d38-4595-8e12-79a67fd731e7 "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/96832710-8146-4e3b-88f3-4b3929f67dbf "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/dade78d9-9893-4966-bd4b-31f1c1635cfa "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/721c1517-b2cd-482d-bd1c-f99571f0f31f "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/f867530b-5eea-43bf-8257-d3da497cb10b "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/cbd4ae82-6fdd-4a4e-a4d5-d0b97ae988fd "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:07:32 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:08:01 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
2025-09-18 08:08:01 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=92c2d296-3666-4e3d-894d-76aa2ba37134 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:08:01 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flows/ "HTTP/1.1 200 OK"
2025-09-18 08:08:01 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/ "HTTP/1.1 201 Created"
2025-09-18 08:08:01 | INFO | prefect.engine | View at http://prefect.lab/runs/flow-run/34fad9cf-0b69-46da-88a8-755ea10237a1
2025-09-18 08:08:01 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/34fad9cf-0b69-46da-88a8-755ea10237a1/set_state "HTTP/1.1 201 Created"
2025-09-18 08:08:01 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flow_runs/34fad9cf-0b69-46da-88a8-755ea10237a1 "HTTP/1.1 200 OK"
2025-09-18 08:08:01 | INFO | prefect.flow_runs | Beginning flow run 'uppish-cow' for flow 'ingestion_pipeline'
2025-09-18 08:08:01 | INFO | prefect.flow_runs | View at http://prefect.lab/runs/flow-run/34fad9cf-0b69-46da-88a8-755ea10237a1
2025-09-18 08:08:01 | INFO | prefect.flow_runs | Starting ingestion from https://r2r-docs.sciphi.ai/introduction
2025-09-18 08:08:01 | INFO | prefect.flow_runs | Validating source...
2025-09-18 08:08:01 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=933e7349-accd-47ba-a443-ad004dc80204 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:08:01 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:08:01 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flows/78f3cfb6-1339-49c6-89f4-c38effea29e4 "HTTP/1.1 200 OK"
2025-09-18 08:08:03 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=816af358-1638-4ced-97b0-e06686fc1a81 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:08:03 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
2025-09-18 08:08:06 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:08:07 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=89f5424f-d39b-4d5a-9d2c-795ff23f7ac9 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=11dfd0a9-209c-432e-b1fe-973b00a779d4 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:08:07 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:08:07 | INFO | prefect.flow_runs | Ingesting documents...
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/task_runs/ "HTTP/1.1 201 Created"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flows/ "HTTP/1.1 200 OK"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/ "HTTP/1.1 201 Created"
2025-09-18 08:08:07 | INFO | prefect.engine | View at http://prefect.lab/runs/flow-run/3208cd24-4801-43da-886f-d4bcde98c727
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/3208cd24-4801-43da-886f-d4bcde98c727/set_state "HTTP/1.1 201 Created"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flow_runs/3208cd24-4801-43da-886f-d4bcde98c727 "HTTP/1.1 200 OK"
2025-09-18 08:08:07 | INFO | prefect.flow_runs | Beginning subflow run 'magnificent-mouflon' for flow 'firecrawl_to_r2r'
2025-09-18 08:08:07 | INFO | prefect.flow_runs | View at http://prefect.lab/runs/flow-run/3208cd24-4801-43da-886f-d4bcde98c727
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flows/bac48c85-e6dc-4da0-99d5-6f26e027cabb "HTTP/1.1 200 OK"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=b85fa79b-e50a-43ab-9cbc-6d356f9d9c02 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:08:07 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=23f93c72-be3b-4e0e-99a9-6aa26dcb687d "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
2025-09-18 08:08:07 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:08:08 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/map "HTTP/1.1 200 OK"
2025-09-18 08:08:08 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:08:08 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:08:08 | INFO | prefect.flow_runs | Discovered 5 unique URLs from Firecrawl map
2025-09-18 08:08:08 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=942907ec-99f4-4122-b494-aac403d7a710 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:08:08 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:08:08 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/c5c726b4-805a-5e22-ad13-323750b25efa "HTTP/1.1 404 Not Found"
2025-09-18 08:08:08 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/a534965a-9da2-566e-a9ad-3e0da59bd3ae "HTTP/1.1 404 Not Found"
2025-09-18 08:08:08 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/8af54b00-fe82-55c5-a1a5-fd0544139b62 "HTTP/1.1 404 Not Found"
2025-09-18 08:08:08 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/9a2d0156-602f-5e4a-a8e1-22edd4c987e6 "HTTP/1.1 404 Not Found"
2025-09-18 08:08:08 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/c01a1979-1dba-5731-bc71-39daff2e6ca2 "HTTP/1.1 404 Not Found"
2025-09-18 08:08:08 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:08:08 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:08:08 | INFO | prefect.flow_runs | Scraping 1 batches of Firecrawl pages
2025-09-18 08:08:08 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=9dc7984e-4dec-4eb3-95b1-35b24c8fb708 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:08:08 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:08:09 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
2025-09-18 08:08:12 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
2025-09-18 08:08:13 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
2025-09-18 08:08:15 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
2025-09-18 08:08:15 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
2025-09-18 08:08:17 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
2025-09-18 08:08:17 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:08:17 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:08:17 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=e67effe9-bc4d-4eb3-9621-ed8ed19057e3 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:08:17 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:08:18 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-18 08:08:19 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
2025-09-18 08:08:19 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-18 08:08:20 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-18 08:08:21 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-18 08:08:23 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-18 08:08:23 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:08:23 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:08:23 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=a58ca065-e7eb-4180-bda4-1b1b31068c37 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:08:23 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:08:23 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:08:23 | INFO | prefect.task_runs | Using collection ID: 866022d4-9a5d-4ff2-9609-1412502d44a1 for collection: r2r
2025-09-18 08:08:23 | INFO | prefect.task_runs | Creating document with ID: c5c726b4-805a-5e22-ad13-323750b25efa
2025-09-18 08:08:23 | INFO | prefect.task_runs | Built metadata for document c5c726b4-805a-5e22-ad13-323750b25efa: {'source_url': 'https://r2r-docs.sciphi.ai/introduction', 'content_type': 'text/markdown', 'word_count': 296, 'char_count': 3000, 'timestamp': '2025-09-18T08:08:17.530458+00:00', 'ingestion_source': 'web', 'title': 'Introduction | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'R2R is an advanced AI retrieval-augmented generation system with multimodal content ingestion, hybrid search, configurable GraphRAG, and a Deep Research API for complex queries.'}
2025-09-18 08:08:23 | INFO | prefect.task_runs | Creating document c5c726b4-805a-5e22-ad13-323750b25efa with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
2025-09-18 08:08:23 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
2025-09-18 08:08:23 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction", "content_type": "text/markdown", "word_count": 296, "char_count": 3000, "timestamp": "2025-09-18T08:08:17.530458+00:00", "ingestion_source": "web", "title": "Introduction | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "R2R is an advanced AI retrieval-augmented generation system with multimodal content ingestion, hybrid search, configurable GraphRAG, and a Deep Research API for complex queries."}
2025-09-18 08:08:23 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
2025-09-18 08:08:23 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
2025-09-18 08:08:23 | INFO | prefect.task_runs | R2R returned document ID: c5c726b4-805a-5e22-ad13-323750b25efa
2025-09-18 08:08:23 | INFO | prefect.task_runs | Document c5c726b4-805a-5e22-ad13-323750b25efa should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
2025-09-18 08:08:23 | INFO | prefect.task_runs | Creating document with ID: a534965a-9da2-566e-a9ad-3e0da59bd3ae
2025-09-18 08:08:23 | INFO | prefect.task_runs | Built metadata for document a534965a-9da2-566e-a9ad-3e0da59bd3ae: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/system', 'content_type': 'text/markdown', 'word_count': 146, 'char_count': 1275, 'timestamp': '2025-09-18T08:08:17.530646+00:00', 'ingestion_source': 'web', 'title': 'System | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'R2R is an advanced AI retrieval system with agentic RAG capabilities, built on a modular service-oriented architecture with RESTful API, vector storage, and support for hybrid search and GraphRAG.'}
2025-09-18 08:08:23 | INFO | prefect.task_runs | Creating document a534965a-9da2-566e-a9ad-3e0da59bd3ae with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
2025-09-18 08:08:23 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
2025-09-18 08:08:23 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/system", "content_type": "text/markdown", "word_count": 146, "char_count": 1275, "timestamp": "2025-09-18T08:08:17.530646+00:00", "ingestion_source": "web", "title": "System | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "R2R is an advanced AI retrieval system with agentic RAG capabilities, built on a modular service-oriented architecture with RESTful API, vector storage, and support for hybrid search and GraphRAG."}
2025-09-18 08:08:23 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
2025-09-18 08:08:23 | INFO | prefect.task_runs | R2R returned document ID: a534965a-9da2-566e-a9ad-3e0da59bd3ae
2025-09-18 08:08:23 | INFO | prefect.task_runs | Document a534965a-9da2-566e-a9ad-3e0da59bd3ae should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
2025-09-18 08:08:23 | INFO | prefect.task_runs | Creating document with ID: 8af54b00-fe82-55c5-a1a5-fd0544139b62
2025-09-18 08:08:23 | INFO | prefect.task_runs | Built metadata for document 8af54b00-fe82-55c5-a1a5-fd0544139b62: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/whats-new', 'content_type': 'text/markdown', 'word_count': 42, 'char_count': 350, 'timestamp': '2025-09-18T08:08:17.530703+00:00', 'ingestion_source': 'web', 'title': "What's New | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", 'description': 'Version 0.3.5 release notes for an advanced AI retrieval system featuring Agentic RAG with RESTful API, SSE streaming, and improved citations.'}
2025-09-18 08:08:23 | INFO | prefect.task_runs | Creating document 8af54b00-fe82-55c5-a1a5-fd0544139b62 with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
2025-09-18 08:08:23 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
2025-09-18 08:08:23 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/whats-new", "content_type": "text/markdown", "word_count": 42, "char_count": 350, "timestamp": "2025-09-18T08:08:17.530703+00:00", "ingestion_source": "web", "title": "What's New | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "Version 0.3.5 release notes for an advanced AI retrieval system featuring Agentic RAG with RESTful API, SSE streaming, and improved citations."}
2025-09-18 08:08:23 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
2025-09-18 08:08:23 | INFO | prefect.task_runs | R2R returned document ID: 8af54b00-fe82-55c5-a1a5-fd0544139b62
2025-09-18 08:08:23 | INFO | prefect.task_runs | Document 8af54b00-fe82-55c5-a1a5-fd0544139b62 should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
2025-09-18 08:08:23 | INFO | prefect.task_runs | Creating document with ID: 9a2d0156-602f-5e4a-a8e1-22edd4c987e6
2025-09-18 08:08:23 | INFO | prefect.task_runs | Built metadata for document 9a2d0156-602f-5e4a-a8e1-22edd4c987e6: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/what-is-r2r', 'content_type': 'text/markdown', 'word_count': 444, 'char_count': 3541, 'timestamp': '2025-09-18T08:08:17.530910+00:00', 'ingestion_source': 'web', 'title': 'What is R2R? | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'R2R is an advanced AI retrieval system that provides infrastructure and tools for implementing efficient, scalable, and reliable AI-powered document understanding in applications.'}
2025-09-18 08:08:23 | INFO | prefect.task_runs | Creating document 9a2d0156-602f-5e4a-a8e1-22edd4c987e6 with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
2025-09-18 08:08:23 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
2025-09-18 08:08:23 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/what-is-r2r", "content_type": "text/markdown", "word_count": 444, "char_count": 3541, "timestamp": "2025-09-18T08:08:17.530910+00:00", "ingestion_source": "web", "title": "What is R2R? | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "R2R is an advanced AI retrieval system that provides infrastructure and tools for implementing efficient, scalable, and reliable AI-powered document understanding in applications."}
2025-09-18 08:08:24 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
2025-09-18 08:08:24 | INFO | prefect.task_runs | R2R returned document ID: 9a2d0156-602f-5e4a-a8e1-22edd4c987e6
2025-09-18 08:08:24 | INFO | prefect.task_runs | Document 9a2d0156-602f-5e4a-a8e1-22edd4c987e6 should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
2025-09-18 08:08:24 | INFO | prefect.task_runs | Creating document with ID: c01a1979-1dba-5731-bc71-39daff2e6ca2
2025-09-18 08:08:24 | INFO | prefect.task_runs | Built metadata for document c01a1979-1dba-5731-bc71-39daff2e6ca2: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/rag', 'content_type': 'text/markdown', 'word_count': 632, 'char_count': 4228, 'timestamp': '2025-09-18T08:08:17.531237+00:00', 'ingestion_source': 'web', 'title': 'More about RAG | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'This document provides a comprehensive guide to implementing and configuring Retrieval-Augmented Generation (RAG) with the R2R system, covering setup, configuration, and operational details.'}
2025-09-18 08:08:24 | INFO | prefect.task_runs | Creating document c01a1979-1dba-5731-bc71-39daff2e6ca2 with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
2025-09-18 08:08:24 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
2025-09-18 08:08:24 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/rag", "content_type": "text/markdown", "word_count": 632, "char_count": 4228, "timestamp": "2025-09-18T08:08:17.531237+00:00", "ingestion_source": "web", "title": "More about RAG | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "This document provides a comprehensive guide to implementing and configuring Retrieval-Augmented Generation (RAG) with the R2R system, covering setup, configuration, and operational details."}
2025-09-18 08:08:24 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
2025-09-18 08:08:24 | INFO | prefect.task_runs | R2R returned document ID: c01a1979-1dba-5731-bc71-39daff2e6ca2
2025-09-18 08:08:24 | INFO | prefect.task_runs | Document c01a1979-1dba-5731-bc71-39daff2e6ca2 should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
2025-09-18 08:08:24 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:08:24 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:08:24 | INFO | prefect.flow_runs | Upserted 5 documents into R2R (0 failed)
2025-09-18 08:08:24 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/3208cd24-4801-43da-886f-d4bcde98c727/set_state "HTTP/1.1 201 Created"
2025-09-18 08:08:24 | INFO | prefect.flow_runs | Finished in state Completed()
2025-09-18 08:08:24 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
2025-09-18 08:08:24 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:08:24 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=58784f1e-c888-42c3-83f3-d212c901c316 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:08:24 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:08:24 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:08:24 | INFO | prefect.flow_runs | Ingestion completed: 5 processed, 0 failed
2025-09-18 08:08:24 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/34fad9cf-0b69-46da-88a8-755ea10237a1/set_state "HTTP/1.1 201 Created"
2025-09-18 08:08:24 | INFO | prefect.flow_runs | Finished in state Completed()
2025-09-18 08:08:25 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
2025-09-18 08:13:05 | INFO | ingest_pipeline.cli.tui.utils.runners | Shutting down storage connections
2025-09-18 08:13:05 | INFO | ingest_pipeline.cli.tui.utils.runners | All storage connections closed gracefully
2025-09-18 08:13:09 | INFO | ingest_pipeline.cli.tui.utils.runners | Initializing collection management TUI
2025-09-18 08:13:09 | INFO | ingest_pipeline.cli.tui.utils.runners | Scanning available storage backends
2025-09-18 08:13:09 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/.well-known/openid-configuration "HTTP/1.1 404 Not Found"
2025-09-18 08:13:09 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/meta "HTTP/1.1 200 OK"
2025-09-18 08:13:09 | INFO | httpx | HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
2025-09-18 08:13:09 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/schema "HTTP/1.1 200 OK"
2025-09-18 08:13:09 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:13:09 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:13:09 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:13:10 | INFO | ingest_pipeline.cli.tui.utils.runners | weaviate connected successfully
2025-09-18 08:13:10 | INFO | ingest_pipeline.cli.tui.utils.runners | open_webui connected successfully
2025-09-18 08:13:10 | INFO | ingest_pipeline.cli.tui.utils.runners | r2r connected successfully
2025-09-18 08:13:10 | INFO | ingest_pipeline.cli.tui.utils.runners | Launching TUI with 3 backend(s): weaviate, open_webui, r2r
2025-09-18 08:13:10 | INFO | httpx | HTTP Request: GET http://weaviate.yo/v1/schema "HTTP/1.1 200 OK"
2025-09-18 08:13:10 | INFO | httpx | HTTP Request: POST http://weaviate.yo/v1/graphql "HTTP/1.1 200 OK"
2025-09-18 08:13:10 | INFO | httpx | HTTP Request: POST http://weaviate.yo/v1/graphql "HTTP/1.1 200 OK"
2025-09-18 08:13:10 | INFO | httpx | HTTP Request: POST http://weaviate.yo/v1/graphql "HTTP/1.1 200 OK"
2025-09-18 08:13:10 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/d06bd108-ae7f-44f4-92fb-2ac556784920 "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/139c04d5-7d38-4595-8e12-79a67fd731e7 "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/96832710-8146-4e3b-88f3-4b3929f67dbf "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/dade78d9-9893-4966-bd4b-31f1c1635cfa "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/721c1517-b2cd-482d-bd1c-f99571f0f31f "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/f867530b-5eea-43bf-8257-d3da497cb10b "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/list "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://chat.lab/api/v1/knowledge/cbd4ae82-6fdd-4a4e-a4d5-d0b97ae988fd "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:13:11 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:13:12 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:13:12 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:13:12 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:13:31 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
2025-09-18 08:13:31 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=d782cd1e-1eea-48fe-8e20-cc6b73e7352b "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:13:31 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flows/ "HTTP/1.1 200 OK"
2025-09-18 08:13:31 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/ "HTTP/1.1 201 Created"
2025-09-18 08:13:31 | INFO | prefect.engine | View at http://prefect.lab/runs/flow-run/813e6876-948a-4833-a855-a88ef455dcf8
2025-09-18 08:13:31 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/813e6876-948a-4833-a855-a88ef455dcf8/set_state "HTTP/1.1 201 Created"
2025-09-18 08:13:31 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flow_runs/813e6876-948a-4833-a855-a88ef455dcf8 "HTTP/1.1 200 OK"
2025-09-18 08:13:31 | INFO | prefect.flow_runs | Beginning flow run 'jade-skylark' for flow 'ingestion_pipeline'
2025-09-18 08:13:31 | INFO | prefect.flow_runs | View at http://prefect.lab/runs/flow-run/813e6876-948a-4833-a855-a88ef455dcf8
2025-09-18 08:13:31 | INFO | prefect.flow_runs | Starting ingestion from https://r2r-docs.sciphi.ai/introduction
2025-09-18 08:13:31 | INFO | prefect.flow_runs | Validating source...
2025-09-18 08:13:32 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=30fd7b21-d90e-49b1-bdb9-306689a9ad01 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:13:32 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:13:32 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flows/78f3cfb6-1339-49c6-89f4-c38effea29e4 "HTTP/1.1 200 OK"
2025-09-18 08:13:33 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=afdcdbd5-ed46-442d-9311-d9c83fc14a13 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:13:33 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
2025-09-18 08:13:35 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
2025-09-18 08:13:35 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:13:35 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:13:35 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
2025-09-18 08:13:35 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=ff9eea92-2967-42da-8673-fdf63dd27b57 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:13:35 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:13:35 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=15f58d95-731b-4c7c-b42e-86054b548ef2 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:13:35 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:13:35 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:13:35 | INFO | prefect.flow_runs | Ingesting documents...
2025-09-18 08:13:35 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/task_runs/ "HTTP/1.1 201 Created"
2025-09-18 08:13:35 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flows/ "HTTP/1.1 200 OK"
2025-09-18 08:13:36 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/ "HTTP/1.1 201 Created"
2025-09-18 08:13:36 | INFO | prefect.engine | View at http://prefect.lab/runs/flow-run/c962cb9b-f332-4862-a9ff-5e57b06c49ed
2025-09-18 08:13:36 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
2025-09-18 08:13:36 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/c962cb9b-f332-4862-a9ff-5e57b06c49ed/set_state "HTTP/1.1 201 Created"
2025-09-18 08:13:36 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flow_runs/c962cb9b-f332-4862-a9ff-5e57b06c49ed "HTTP/1.1 200 OK"
2025-09-18 08:13:36 | INFO | prefect.flow_runs | Beginning subflow run 'winged-wren' for flow 'firecrawl_to_r2r'
2025-09-18 08:13:36 | INFO | prefect.flow_runs | View at http://prefect.lab/runs/flow-run/c962cb9b-f332-4862-a9ff-5e57b06c49ed
2025-09-18 08:13:36 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flows/bac48c85-e6dc-4da0-99d5-6f26e027cabb "HTTP/1.1 200 OK"
2025-09-18 08:13:36 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=23d7e58d-c5e9-464b-9d53-fd89a06f7198 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:13:36 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:13:36 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:13:36 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:13:36 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:13:36 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:13:36 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=47567ea9-ee2d-4c18-991b-a5bf46b10fd8 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:13:36 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:13:37 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/map "HTTP/1.1 200 OK"
2025-09-18 08:13:37 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:13:37 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:13:37 | INFO | prefect.flow_runs | Discovered 5 unique URLs from Firecrawl map
2025-09-18 08:13:37 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=5a6c2608-7305-45c9-a9c2-7908e76dbc4d "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:13:37 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:13:37 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/c5c726b4-805a-5e22-ad13-323750b25efa "HTTP/1.1 404 Not Found"
2025-09-18 08:13:37 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/a534965a-9da2-566e-a9ad-3e0da59bd3ae "HTTP/1.1 404 Not Found"
2025-09-18 08:13:37 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/8af54b00-fe82-55c5-a1a5-fd0544139b62 "HTTP/1.1 404 Not Found"
2025-09-18 08:13:37 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/9a2d0156-602f-5e4a-a8e1-22edd4c987e6 "HTTP/1.1 404 Not Found"
2025-09-18 08:13:37 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/c01a1979-1dba-5731-bc71-39daff2e6ca2 "HTTP/1.1 404 Not Found"
2025-09-18 08:13:37 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:13:37 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:13:37 | INFO | prefect.flow_runs | Scraping 1 batches of Firecrawl pages
2025-09-18 08:13:37 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=3735d66c-52e4-4085-97f9-e0b381fccf96 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:13:37 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:13:38 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
2025-09-18 08:13:39 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
2025-09-18 08:13:40 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
2025-09-18 08:13:40 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
2025-09-18 08:13:42 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
2025-09-18 08:13:43 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
2025-09-18 08:13:43 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:13:43 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:13:43 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=7f757009-47c5-460c-b7b8-a865c295f794 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:13:43 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:13:44 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
2025-09-18 08:13:44 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-18 08:13:45 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-18 08:13:46 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-18 08:13:47 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-18 08:13:48 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-18 08:13:48 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
2025-09-18 08:13:48 | INFO | prefect.task_runs | Finished in state Completed()
2025-09-18 08:13:48 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=bb287f37-852d-4b01-9eef-b1a1d23eafa2 "HTTP/1.1 422 Unprocessable Entity"
2025-09-18 08:13:48 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
2025-09-18 08:13:48 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
2025-09-18 08:13:48 | INFO | prefect.task_runs | Using collection ID: 866022d4-9a5d-4ff2-9609-1412502d44a1 for collection: r2r
2025-09-18 08:13:48 | INFO | prefect.task_runs | Creating document with ID: c5c726b4-805a-5e22-ad13-323750b25efa
2025-09-18 08:13:48 | INFO | prefect.task_runs | Built metadata for document c5c726b4-805a-5e22-ad13-323750b25efa: {'source_url': 'https://r2r-docs.sciphi.ai/introduction', 'content_type': 'text/markdown', 'word_count': 296, 'char_count': 3000, 'timestamp': '2025-09-18T08:13:43.092808+00:00', 'ingestion_source': 'web', 'title': 'Introduction | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'R2R is an advanced AI retrieval system that provides Retrieval-Augmented Generation capabilities with a RESTful API, featuring multimodal content ingestion, hybrid search, configurable GraphRAG, and a Deep Research API for complex queries.'}
2025-09-18 08:13:48 | INFO | prefect.task_runs | Creating document c5c726b4-805a-5e22-ad13-323750b25efa with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
2025-09-18 08:13:48 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
2025-09-18 08:13:48 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction", "content_type": "text/markdown", "word_count": 296, "char_count": 3000, "timestamp": "2025-09-18T08:13:43.092808+00:00", "ingestion_source": "web", "title": "Introduction | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "R2R is an advanced AI retrieval system that provides Retrieval-Augmented Generation capabilities with a RESTful API, featuring multimodal content ingestion, hybrid search, configurable GraphRAG, and a Deep Research API for complex queries."}
2025-09-18 08:13:48 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
2025-09-18 08:13:48 | INFO | prefect.task_runs | R2R returned document ID: c5c726b4-805a-5e22-ad13-323750b25efa
2025-09-18 08:13:48 | INFO | prefect.task_runs | Document c5c726b4-805a-5e22-ad13-323750b25efa should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
2025-09-18 08:13:48 | INFO | prefect.task_runs | Creating document with ID: a534965a-9da2-566e-a9ad-3e0da59bd3ae
2025-09-18 08:13:48 | INFO | prefect.task_runs | Built metadata for document a534965a-9da2-566e-a9ad-3e0da59bd3ae: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/system', 'content_type': 'text/markdown', 'word_count': 146, 'char_count': 1275, 'timestamp': '2025-09-18T08:13:43.092979+00:00', 'ingestion_source': 'web', 'title': 'System | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'R2R is an advanced AI retrieval system with a modular architecture supporting both simple RAG applications and complex production-grade systems with features like hybrid search and GraphRAG.'}
2025-09-18 08:13:48 | INFO | prefect.task_runs | Creating document a534965a-9da2-566e-a9ad-3e0da59bd3ae with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
2025-09-18 08:13:48 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
2025-09-18 08:13:48 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/system", "content_type": "text/markdown", "word_count": 146, "char_count": 1275, "timestamp": "2025-09-18T08:13:43.092979+00:00", "ingestion_source": "web", "title": "System | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "R2R is an advanced AI retrieval system with a modular architecture supporting both simple RAG applications and complex production-grade systems with features like hybrid search and GraphRAG."}
|
||||
2025-09-18 08:13:48 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
|
||||
2025-09-18 08:13:48 | INFO | prefect.task_runs | R2R returned document ID: a534965a-9da2-566e-a9ad-3e0da59bd3ae
|
||||
2025-09-18 08:13:48 | INFO | prefect.task_runs | Document a534965a-9da2-566e-a9ad-3e0da59bd3ae should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
|
||||
2025-09-18 08:13:48 | INFO | prefect.task_runs | Creating document with ID: 8af54b00-fe82-55c5-a1a5-fd0544139b62
|
||||
2025-09-18 08:13:48 | INFO | prefect.task_runs | Built metadata for document 8af54b00-fe82-55c5-a1a5-fd0544139b62: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/whats-new', 'content_type': 'text/markdown', 'word_count': 42, 'char_count': 350, 'timestamp': '2025-09-18T08:13:43.093036+00:00', 'ingestion_source': 'web', 'title': "What's New | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", 'description': 'Release notes for version 0.3.5 of an advanced AI retrieval system featuring improved Agentic RAG API, SSE streaming output, enhanced citations, and minor bug fixes.'}
|
||||
2025-09-18 08:13:48 | INFO | prefect.task_runs | Creating document 8af54b00-fe82-55c5-a1a5-fd0544139b62 with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
|
||||
2025-09-18 08:13:48 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
|
||||
2025-09-18 08:13:48 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/whats-new", "content_type": "text/markdown", "word_count": 42, "char_count": 350, "timestamp": "2025-09-18T08:13:43.093036+00:00", "ingestion_source": "web", "title": "What's New | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "Release notes for version 0.3.5 of an advanced AI retrieval system featuring improved Agentic RAG API, SSE streaming output, enhanced citations, and minor bug fixes."}
|
||||
2025-09-18 08:13:48 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
|
||||
2025-09-18 08:13:48 | INFO | prefect.task_runs | R2R returned document ID: 8af54b00-fe82-55c5-a1a5-fd0544139b62
|
||||
2025-09-18 08:13:48 | INFO | prefect.task_runs | Document 8af54b00-fe82-55c5-a1a5-fd0544139b62 should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
|
||||
2025-09-18 08:13:48 | INFO | prefect.task_runs | Creating document with ID: 9a2d0156-602f-5e4a-a8e1-22edd4c987e6
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | Built metadata for document 9a2d0156-602f-5e4a-a8e1-22edd4c987e6: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/what-is-r2r', 'content_type': 'text/markdown', 'word_count': 444, 'char_count': 3541, 'timestamp': '2025-09-18T08:13:43.093218+00:00', 'ingestion_source': 'web', 'title': 'What is R2R? | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'R2R is an advanced AI retrieval system that provides infrastructure and tools for implementing efficient, scalable, and reliable AI-powered document understanding in applications.'}
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | Creating document 9a2d0156-602f-5e4a-a8e1-22edd4c987e6 with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/what-is-r2r", "content_type": "text/markdown", "word_count": 444, "char_count": 3541, "timestamp": "2025-09-18T08:13:43.093218+00:00", "ingestion_source": "web", "title": "What is R2R? | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "R2R is an advanced AI retrieval system that provides infrastructure and tools for implementing efficient, scalable, and reliable AI-powered document understanding in applications."}
|
||||
2025-09-18 08:13:49 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | R2R returned document ID: 9a2d0156-602f-5e4a-a8e1-22edd4c987e6
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | Document 9a2d0156-602f-5e4a-a8e1-22edd4c987e6 should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | Creating document with ID: c01a1979-1dba-5731-bc71-39daff2e6ca2
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | Built metadata for document c01a1979-1dba-5731-bc71-39daff2e6ca2: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/rag', 'content_type': 'text/markdown', 'word_count': 632, 'char_count': 4228, 'timestamp': '2025-09-18T08:13:43.093435+00:00', 'ingestion_source': 'web', 'title': 'More about RAG | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'This document provides a comprehensive guide to implementing and configuring Retrieval-Augmented Generation (RAG) with the R2R system, explaining how it combines large language models with precise information retrieval from documents.'}
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | Creating document c01a1979-1dba-5731-bc71-39daff2e6ca2 with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/rag", "content_type": "text/markdown", "word_count": 632, "char_count": 4228, "timestamp": "2025-09-18T08:13:43.093435+00:00", "ingestion_source": "web", "title": "More about RAG | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "This document provides a comprehensive guide to implementing and configuring Retrieval-Augmented Generation (RAG) with the R2R system, explaining how it combines large language models with precise information retrieval from documents."}
|
||||
2025-09-18 08:13:49 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | R2R returned document ID: c01a1979-1dba-5731-bc71-39daff2e6ca2
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | Document c01a1979-1dba-5731-bc71-39daff2e6ca2 should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
|
||||
2025-09-18 08:13:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:13:49 | INFO | prefect.flow_runs | Upserted 5 documents into R2R (0 failed)
|
||||
2025-09-18 08:13:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/c962cb9b-f332-4862-a9ff-5e57b06c49ed/set_state "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:13:49 | INFO | prefect.flow_runs | Finished in state Completed()
|
||||
2025-09-18 08:13:49 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:13:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:13:49 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=59db8ba2-e01a-4e42-baf8-e2dabaaf4757 "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:13:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:13:49 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:13:49 | INFO | prefect.flow_runs | Ingestion completed: 5 processed, 0 failed
|
||||
2025-09-18 08:13:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/813e6876-948a-4833-a855-a88ef455dcf8/set_state "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:13:49 | INFO | prefect.flow_runs | Finished in state Completed()
|
||||
2025-09-18 08:13:50 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
|
||||
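The document IDs above recur unchanged in the 08:28 run that follows (c5c726b4-... for /introduction, and so on), so the IDs look deterministic rather than random. A likely derivation, shown here as a hedged sketch, is a UUIDv5 over the source URL; only the stability of the IDs is visible in the log, so the exact namespace is an assumption.

import uuid


def document_id_for_url(source_url: str) -> str:
    """Derive a stable document ID from the page URL so repeat runs upsert in place.
    Assumption: UUIDv5 in the URL namespace; the pipeline's actual derivation is not logged."""
    return str(uuid.uuid5(uuid.NAMESPACE_URL, source_url))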
2025-09-18 08:28:46 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:46 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=06321c35-36eb-442c-9e78-513baef02343 "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:28:46 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flows/ "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:46 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:28:46 | INFO | prefect.engine | View at http://prefect.lab/runs/flow-run/3d6f8223-0b7e-43fd-a1f4-c102f3fc8919
|
||||
2025-09-18 08:28:46 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/3d6f8223-0b7e-43fd-a1f4-c102f3fc8919/set_state "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:28:46 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flow_runs/3d6f8223-0b7e-43fd-a1f4-c102f3fc8919 "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:46 | INFO | prefect.flow_runs | Beginning flow run 'magic-deer' for flow 'ingestion_pipeline'
|
||||
2025-09-18 08:28:46 | INFO | prefect.flow_runs | View at http://prefect.lab/runs/flow-run/3d6f8223-0b7e-43fd-a1f4-c102f3fc8919
|
||||
2025-09-18 08:28:46 | INFO | prefect.flow_runs | Starting ingestion from https://r2r-docs.sciphi.ai/introduction
|
||||
2025-09-18 08:28:46 | INFO | prefect.flow_runs | Validating source...
|
||||
2025-09-18 08:28:46 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:48 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:49 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=5a229335-b24f-4b76-9ae1-0cae020e3cfe "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:49 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:28:49 | INFO | prefect.flow_runs | Ingesting documents...
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/task_runs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flows/ "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:28:49 | INFO | prefect.engine | View at http://prefect.lab/runs/flow-run/c9d6f1ef-c902-4ad7-8cad-7c7b07a0c013
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/c9d6f1ef-c902-4ad7-8cad-7c7b07a0c013/set_state "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/flow_runs/c9d6f1ef-c902-4ad7-8cad-7c7b07a0c013 "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:49 | INFO | prefect.flow_runs | Beginning subflow run 'tourmaline-beluga' for flow 'firecrawl_to_r2r'
|
||||
2025-09-18 08:28:49 | INFO | prefect.flow_runs | View at http://prefect.lab/runs/flow-run/c9d6f1ef-c902-4ad7-8cad-7c7b07a0c013
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:49 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:28:49 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:50 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:28:50 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/map "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:50 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:50 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:28:50 | INFO | prefect.flow_runs | Discovered 5 unique URLs from Firecrawl map
|
||||
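The /v2/map call against the self-hosted Firecrawl instance is what produces the five unique URLs reported above. A rough sketch of that request follows; the payload and the shape of the response (a links array) are assumptions, since the log only records the HTTP call.

import httpx


def map_site(base_url: str) -> list[str]:
    """Ask the self-hosted Firecrawl instance to enumerate pages under base_url."""
    response = httpx.post(
        "http://crawl.lab:30002/v2/map",
        json={"url": base_url},  # assumption: request body shape
        timeout=60,
    )
    response.raise_for_status()
    data = response.json()
    # Assumption: discovered URLs come back under a "links" key.
    return sorted({link for link in data.get("links", []) if isinstance(link, str)})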
2025-09-18 08:28:50 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:50 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/c5c726b4-805a-5e22-ad13-323750b25efa "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 08:28:50 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/a534965a-9da2-566e-a9ad-3e0da59bd3ae "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 08:28:50 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/8af54b00-fe82-55c5-a1a5-fd0544139b62 "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 08:28:50 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/9a2d0156-602f-5e4a-a8e1-22edd4c987e6 "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 08:28:50 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/documents/c01a1979-1dba-5731-bc71-39daff2e6ca2 "HTTP/1.1 404 Not Found"
|
||||
2025-09-18 08:28:50 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:50 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
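Before re-scraping, the flow probes each deterministic document ID against R2R; all five GETs return 404 here, so every page is scraped and created again. A small sketch of that existence check, under the assumption that a 200 means the document is already present:

import httpx


def document_exists(client: httpx.Client, doc_id: str) -> bool:
    """Return True when R2R already knows the document (the log shows 404 for all five)."""
    response = client.get(f"http://r2r.lab/v3/documents/{doc_id}")
    return response.status_code == 200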
2025-09-18 08:28:50 | INFO | prefect.flow_runs | Scraping 1 batches of Firecrawl pages
|
||||
2025-09-18 08:28:50 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:52 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:28:53 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:55 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:55 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:57 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:58 | INFO | httpx | HTTP Request: POST http://crawl.lab:30002/v2/scrape "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:58 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:58 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:28:58 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:28:58 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:28:59 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:29:00 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:29:01 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:29:02 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:29:03 | INFO | httpx | HTTP Request: POST http://llm.lab/v1/chat/completions "HTTP/1.1 200 OK"
|
||||
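The five /v1/chat/completions calls line up with the five scraped pages, which suggests the description field in the document metadata is generated by the local LLM endpoint. A hedged sketch of such a call is below; the model name and prompt are placeholders, since neither appears in the log.

import httpx


def describe_page(markdown: str) -> str:
    """Summarise one scraped page into the one-sentence description stored in metadata."""
    response = httpx.post(
        "http://llm.lab/v1/chat/completions",
        json={
            "model": "local-model",  # placeholder: the actual model is not logged
            "messages": [
                {"role": "system", "content": "Describe this page in one sentence."},
                {"role": "user", "content": markdown[:4000]},
            ],
        },
        timeout=120,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"].strip()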
2025-09-18 08:29:03 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:29:03 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:29:03 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:29:03 | INFO | httpx | HTTP Request: GET http://r2r.lab/v3/collections "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:29:03 | INFO | prefect.task_runs | Using collection ID: 866022d4-9a5d-4ff2-9609-1412502d44a1 for collection: r2r
|
||||
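The collection name r2r is resolved to its ID by listing /v3/collections, as the GETs above show. A minimal sketch of that lookup, assuming the listing returns a results array with name and id fields:

import httpx


def resolve_collection_id(client: httpx.Client, name: str) -> str | None:
    """Map a collection name (here "r2r") to its R2R collection ID."""
    response = client.get("http://r2r.lab/v3/collections")
    response.raise_for_status()
    for collection in response.json().get("results", []):  # assumption: response shape
        if collection.get("name") == name:
            return collection.get("id")
    return None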
2025-09-18 08:29:03 | INFO | prefect.task_runs | Creating document with ID: c5c726b4-805a-5e22-ad13-323750b25efa
|
||||
2025-09-18 08:29:03 | INFO | prefect.task_runs | Built metadata for document c5c726b4-805a-5e22-ad13-323750b25efa: {'source_url': 'https://r2r-docs.sciphi.ai/introduction', 'content_type': 'text/markdown', 'word_count': 296, 'char_count': 3000, 'timestamp': '2025-09-18T08:28:58.061547+00:00', 'ingestion_source': 'web', 'title': 'Introduction | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'R2R is an advanced AI retrieval-augmented generation system with multimodal content ingestion, hybrid search, and a Deep Research API for complex queries.'}
|
||||
2025-09-18 08:29:03 | INFO | prefect.task_runs | Creating document c5c726b4-805a-5e22-ad13-323750b25efa with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
|
||||
2025-09-18 08:29:03 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
|
||||
2025-09-18 08:29:03 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction", "content_type": "text/markdown", "word_count": 296, "char_count": 3000, "timestamp": "2025-09-18T08:28:58.061547+00:00", "ingestion_source": "web", "title": "Introduction | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "R2R is an advanced AI retrieval-augmented generation system with multimodal content ingestion, hybrid search, and a Deep Research API for complex queries."}
|
||||
2025-09-18 08:29:04 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | R2R returned document ID: c5c726b4-805a-5e22-ad13-323750b25efa
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Document c5c726b4-805a-5e22-ad13-323750b25efa should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Creating document with ID: a534965a-9da2-566e-a9ad-3e0da59bd3ae
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Built metadata for document a534965a-9da2-566e-a9ad-3e0da59bd3ae: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/system', 'content_type': 'text/markdown', 'word_count': 146, 'char_count': 1275, 'timestamp': '2025-09-18T08:28:58.061702+00:00', 'ingestion_source': 'web', 'title': 'System | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'R2R is an advanced AI retrieval system with a modular architecture supporting retrieval-augmented generation, vector storage, and GraphRAG capabilities.'}
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Creating document a534965a-9da2-566e-a9ad-3e0da59bd3ae with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/system", "content_type": "text/markdown", "word_count": 146, "char_count": 1275, "timestamp": "2025-09-18T08:28:58.061702+00:00", "ingestion_source": "web", "title": "System | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "R2R is an advanced AI retrieval system with a modular architecture supporting retrieval-augmented generation, vector storage, and GraphRAG capabilities."}
|
||||
2025-09-18 08:29:04 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | R2R returned document ID: a534965a-9da2-566e-a9ad-3e0da59bd3ae
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Document a534965a-9da2-566e-a9ad-3e0da59bd3ae should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Creating document with ID: 8af54b00-fe82-55c5-a1a5-fd0544139b62
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Built metadata for document 8af54b00-fe82-55c5-a1a5-fd0544139b62: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/whats-new', 'content_type': 'text/markdown', 'word_count': 42, 'char_count': 350, 'timestamp': '2025-09-18T08:28:58.061749+00:00', 'ingestion_source': 'web', 'title': "What's New | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", 'description': 'This document announces version 0.3.5 of an advanced AI retrieval system with Agentic Retrieval-Augmented Generation (RAG) and a RESTful API, featuring improved API, SSE streaming output, and enhanced citations.'}
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Creating document 8af54b00-fe82-55c5-a1a5-fd0544139b62 with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/whats-new", "content_type": "text/markdown", "word_count": 42, "char_count": 350, "timestamp": "2025-09-18T08:28:58.061749+00:00", "ingestion_source": "web", "title": "What's New | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "This document announces version 0.3.5 of an advanced AI retrieval system with Agentic Retrieval-Augmented Generation (RAG) and a RESTful API, featuring improved API, SSE streaming output, and enhanced citations."}
|
||||
2025-09-18 08:29:04 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | R2R returned document ID: 8af54b00-fe82-55c5-a1a5-fd0544139b62
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Document 8af54b00-fe82-55c5-a1a5-fd0544139b62 should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Creating document with ID: 9a2d0156-602f-5e4a-a8e1-22edd4c987e6
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Built metadata for document 9a2d0156-602f-5e4a-a8e1-22edd4c987e6: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/what-is-r2r', 'content_type': 'text/markdown', 'word_count': 444, 'char_count': 3541, 'timestamp': '2025-09-18T08:28:58.061954+00:00', 'ingestion_source': 'web', 'title': 'What is R2R? | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'R2R is an advanced AI retrieval system that provides infrastructure and tools for implementing efficient, scalable, and reliable AI-powered document understanding in applications.'}
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Creating document 9a2d0156-602f-5e4a-a8e1-22edd4c987e6 with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/what-is-r2r", "content_type": "text/markdown", "word_count": 444, "char_count": 3541, "timestamp": "2025-09-18T08:28:58.061954+00:00", "ingestion_source": "web", "title": "What is R2R? | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "R2R is an advanced AI retrieval system that provides infrastructure and tools for implementing efficient, scalable, and reliable AI-powered document understanding in applications."}
|
||||
2025-09-18 08:29:04 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:29:04 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | R2R returned document ID: 9a2d0156-602f-5e4a-a8e1-22edd4c987e6
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Document 9a2d0156-602f-5e4a-a8e1-22edd4c987e6 should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Creating document with ID: c01a1979-1dba-5731-bc71-39daff2e6ca2
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Built metadata for document c01a1979-1dba-5731-bc71-39daff2e6ca2: {'source_url': 'https://r2r-docs.sciphi.ai/introduction/rag', 'content_type': 'text/markdown', 'word_count': 632, 'char_count': 4228, 'timestamp': '2025-09-18T08:28:58.062194+00:00', 'ingestion_source': 'web', 'title': 'More about RAG | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.', 'description': 'This document provides a comprehensive guide to implementing and configuring Retrieval-Augmented Generation (RAG) using the R2R system, covering setup, configuration, and operational details.'}
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Creating document c01a1979-1dba-5731-bc71-39daff2e6ca2 with collection_ids: [866022d4-9a5d-4ff2-9609-1412502d44a1]
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Sending to R2R - files keys: ['raw_text', 'metadata', 'id', 'ingestion_mode', 'collection_ids']
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Metadata JSON: {"source_url": "https://r2r-docs.sciphi.ai/introduction/rag", "content_type": "text/markdown", "word_count": 632, "char_count": 4228, "timestamp": "2025-09-18T08:28:58.062194+00:00", "ingestion_source": "web", "title": "More about RAG | The most advanced AI retrieval system. Agentic Retrieval-Augmented Generation (RAG) with a RESTful API.", "description": "This document provides a comprehensive guide to implementing and configuring Retrieval-Augmented Generation (RAG) using the R2R system, covering setup, configuration, and operational details."}
|
||||
2025-09-18 08:29:04 | INFO | httpx | HTTP Request: POST http://r2r.lab/v3/documents "HTTP/1.1 202 Accepted"
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | R2R returned document ID: c01a1979-1dba-5731-bc71-39daff2e6ca2
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Document c01a1979-1dba-5731-bc71-39daff2e6ca2 should be assigned to collection 866022d4-9a5d-4ff2-9609-1412502d44a1 via creation API
|
||||
2025-09-18 08:29:04 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:29:04 | INFO | prefect.flow_runs | Upserted 5 documents into R2R (0 failed)
|
||||
2025-09-18 08:29:04 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/c9d6f1ef-c902-4ad7-8cad-7c7b07a0c013/set_state "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:29:04 | INFO | prefect.flow_runs | Finished in state Completed()
|
||||
2025-09-18 08:29:04 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/admin/version "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:29:04 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/increment "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:29:04 | INFO | httpx | HTTP Request: GET http://prefect.lab/api/csrf-token?client=dad10334-91ca-42b5-bc9d-0adba08e2ebd "HTTP/1.1 422 Unprocessable Entity"
|
||||
2025-09-18 08:29:04 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/concurrency_limits/decrement "HTTP/1.1 200 OK"
|
||||
2025-09-18 08:29:04 | INFO | prefect.task_runs | Finished in state Completed()
|
||||
2025-09-18 08:29:04 | INFO | prefect.flow_runs | Ingestion completed: 5 processed, 0 failed
|
||||
2025-09-18 08:29:04 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/flow_runs/3d6f8223-0b7e-43fd-a1f4-c102f3fc8919/set_state "HTTP/1.1 201 Created"
|
||||
2025-09-18 08:29:04 | INFO | prefect.flow_runs | Finished in state Completed()
|
||||
2025-09-18 08:29:06 | INFO | httpx | HTTP Request: POST http://prefect.lab/api/logs/ "HTTP/1.1 201 Created"
|
||||
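Taken together, the two runs trace the same pipeline: map the site with Firecrawl, check which documents R2R already has, scrape the missing pages in batches, generate descriptions with the local LLM, and upsert everything into the r2r collection. A condensed Prefect skeleton of that flow is sketched below; the task names, signatures, and stub bodies are illustrative, not the repository's actual code.

from prefect import flow, task


@task
def discover_urls(base_url: str) -> list[str]:
    """Map the site with Firecrawl (POST /v2/map) and return unique page URLs."""
    return [base_url]


@task
def scrape_pages(urls: list[str]) -> list[dict[str, str]]:
    """Scrape each page (POST /v2/scrape) into markdown plus source metadata."""
    return [{"source_url": url, "raw_text": ""} for url in urls]


@task
def describe_pages(pages: list[dict[str, str]]) -> list[dict[str, str]]:
    """Add the LLM-generated description field (POST /v1/chat/completions)."""
    return pages


@task
def upsert_documents(pages: list[dict[str, str]], collection: str) -> int:
    """Create each page as an R2R document (POST /v3/documents) and return the count."""
    return len(pages)


@flow(name="firecrawl_to_r2r")
def firecrawl_to_r2r(base_url: str, collection: str = "r2r") -> int:
    urls = discover_urls(base_url)
    pages = scrape_pages(urls)
    described = describe_pages(pages)
    return upsert_documents(described, collection)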
@@ -8,7 +8,7 @@ dependencies = [
    "prefect>=2.14.0",
    "pydantic>=2.5.0",
    "pydantic-settings>=2.1.0",
    "firecrawl-py>=0.0.1",
    "firecrawl-py>=1.0.0",
    "gitpython>=3.1.40",
    "weaviate-client>=4.4.0",
    "httpx>=0.25.0",
@@ -16,6 +16,9 @@ dependencies = [
    "rich>=13.7.0",
    "textual>=0.50.0",
    "python-dotenv>=1.0.0",
    "r2r>=3.6.6",
    "typing-extensions>=4.15.0",
    "python-dateutil>=2.9.0.post0",
]

[project.scripts]
@@ -36,6 +39,8 @@ dev-dependencies = [
    "mypy>=1.7.0",
    "ruff>=0.1.0",
    "basedpyright>=1.31.4",
    "pyrefly>=0.33.0",
    "sourcery>=1.37.0",
]

[tool.ruff]
12280 repomix-output.xml Normal file
File diff suppressed because it is too large
31 repomix.config.json Normal file
@@ -0,0 +1,31 @@
{
  "output": {
    "filePath": "repomix-output.xml",
    "style": "xml",
    "parsableStyle": false,
    "fileSummary": true,
    "directoryStructure": true,
    "removeComments": false,
    "removeEmptyLines": false,
    "compress": false,
    "topFilesLength": 5,
    "showLineNumbers": false,
    "copyToClipboard": false,
    "git": {
      "sortByChanges": true,
      "sortByChangesMaxCommits": 100
    }
  },
  "include": ["ingest_pipeline/"],
  "ignore": {
    "useGitignore": true,
    "useDefaultPatterns": true,
    "customPatterns": []
  },
  "security": {
    "enableSecurityCheck": true
  },
  "tokenCount": {
    "encoding": "o200k_base"
  }
}
13 typings/__init__.py Normal file
@@ -0,0 +1,13 @@
"""API response type definitions."""

from typing import TypedDict


class EmbeddingData(TypedDict):
    """Structure for embedding data from API response."""
    embedding: list[float]


class EmbeddingResponse(TypedDict):
    """Structure for OpenAI-compatible embedding API response."""
    data: list[EmbeddingData]
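A short usage sketch for these TypedDicts, assuming an OpenAI-compatible embeddings endpoint; the URL and model name are placeholders rather than values from the repository:

import httpx

from typings import EmbeddingResponse


def embed(texts: list[str]) -> list[list[float]]:
    """Fetch one embedding vector per input text, typed via the TypedDicts above."""
    response = httpx.post(
        "http://llm.lab/v1/embeddings",  # placeholder endpoint
        json={"model": "embedding-model", "input": texts},  # placeholder model
        timeout=60,
    )
    response.raise_for_status()
    payload: EmbeddingResponse = response.json()
    return [item["embedding"] for item in payload["data"]]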
13 typings/__init__.pyi Normal file
@@ -0,0 +1,13 @@
"""API response type definitions."""

from typing import TypedDict


class EmbeddingData(TypedDict):
    """Structure for embedding data from API response."""
    embedding: list[float]


class EmbeddingResponse(TypedDict):
    """Structure for OpenAI-compatible embedding API response."""
    data: list[EmbeddingData]
BIN typings/__pycache__/__init__.cpython-312.pyc Normal file
Binary file not shown.
8 typings/dotenv.pyi Normal file
@@ -0,0 +1,8 @@
"""Type stubs for dotenv library."""

from __future__ import annotations

from pathlib import Path


def load_dotenv(dotenv_path: str | Path | None = None) -> bool: ...
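With this stub in place, Pyright accepts the two call forms below and flags anything else, which keeps strict typing without pulling in full third-party stubs; the example itself is illustrative:

from pathlib import Path

from dotenv import load_dotenv

load_dotenv()              # default .env lookup
load_dotenv(Path(".env"))  # explicit path, also accepted by the stub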
Some files were not shown because too many files have changed in this diff.