---
# Prefect deployment configuration for RAG Manager
name: rag-manager
prefect-version: 3.0.0

# Build steps - prepare the environment
build:
- prefect.deployments.steps.run_shell_script:
    id: prepare-environment
    # NOTE(review): confirm how run_shell_script executes this text. If each
    # line is launched as its own process without a shell, `source`, `||`
    # and the `#` comment lines below will not behave as written and the
    # commands may need an explicit `bash -c '...'` wrapper.
    script: |
      echo "Preparing RAG Manager environment..."
      # Ensure virtual environment is activated
      source .venv/bin/activate || echo "Virtual environment not found"
      # Install dependencies
      uv sync --frozen
      # Register custom blocks
      prefect block register --module ingest_pipeline.core.models

# Push steps - handle deployment artifacts (none configured)
push: null

# Work pool configuration
# NOTE(review): confirm that a top-level `work_pool` section is consumed by
# Prefect; each deployment below also declares its own `work_pool`.
# NOTE(review): the `| default(...)` filters assume Jinja-style filter support
# in prefect.yaml templating - verify, otherwise these values stay literal.
work_pool:
  name: "{{ prefect.variables.work_pool_name | default('default') }}"
  work_queue_name: "{{ prefect.variables.work_queue_name | default('default') }}"
  job_variables:
    env:
      # Prefect configuration (taken from the deploying shell's environment)
      PREFECT_API_URL: "{{ $PREFECT_API_URL }}"
      PREFECT_API_KEY: "{{ $PREFECT_API_KEY }}"

      # Application configuration from variables
      DEFAULT_BATCH_SIZE: "{{ prefect.variables.default_batch_size | default('50') }}"
      MAX_CRAWL_DEPTH: "{{ prefect.variables.max_crawl_depth | default('5') }}"
      MAX_CRAWL_PAGES: "{{ prefect.variables.max_crawl_pages | default('100') }}"
      MAX_CONCURRENT_TASKS: "{{ prefect.variables.max_concurrent_tasks | default('5') }}"

      # Service endpoints from variables
      LLM_ENDPOINT: "{{ prefect.variables.llm_endpoint | default('http://llm.lab') }}"
      WEAVIATE_ENDPOINT: "{{ prefect.variables.weaviate_endpoint | default('http://weaviate.yo') }}"
      OPENWEBUI_ENDPOINT: "{{ prefect.variables.openwebui_endpoint | default('http://chat.lab') }}"
      FIRECRAWL_ENDPOINT: "{{ prefect.variables.firecrawl_endpoint | default('http://crawl.lab:30002') }}"

# Deployment definitions
deployments:
# Web ingestion deployment: runs create_ingestion_flow with source_type=web
- name: web-ingestion
  version: "{{ prefect.variables.deployment_version | default('1.0.0') }}"
  tags:
  - "{{ prefect.variables.environment | default('development') }}"
  - web
  - ingestion
  description: "Automated web content ingestion using Firecrawl"
  entrypoint: ingest_pipeline/flows/ingestion.py:create_ingestion_flow
  parameters:
    source_type: web
    storage_backend: "{{ prefect.variables.default_storage_backend | default('weaviate') }}"
    validate_first: true
    storage_block_name: "{{ prefect.variables.default_storage_block }}"
    ingestor_config_block_name: "{{ prefect.variables.default_firecrawl_block }}"
  # NOTE(review): confirm the singular `schedule:` mapping is honored for
  # prefect-version 3.0.0 (Prefect 3 docs describe a `schedules:` list), and
  # that a quoted, templated `interval` is accepted where a number is expected.
  schedule:
    interval: "{{ prefect.variables.default_schedule_interval | default(3600) }}"
    timezone: UTC
  work_pool:
    name: "{{ prefect.variables.web_work_pool | default('default') }}"
    job_variables:
      env:
        INGESTION_TYPE: "web"
        MAX_PAGES: "{{ prefect.variables.max_crawl_pages | default('100') }}"

# Repository ingestion deployment: runs create_ingestion_flow with
# source_type=repository; it has no schedule and is triggered manually.
- name: repository-ingestion
  version: "{{ prefect.variables.deployment_version | default('1.0.0') }}"
  tags:
  - "{{ prefect.variables.environment | default('development') }}"
  - repository
  - ingestion
  description: "Automated repository content ingestion using Repomix"
  entrypoint: ingest_pipeline/flows/ingestion.py:create_ingestion_flow
  parameters:
    source_type: repository
    storage_backend: "{{ prefect.variables.default_storage_backend | default('weaviate') }}"
    validate_first: true
    storage_block_name: "{{ prefect.variables.default_storage_block }}"
    ingestor_config_block_name: "{{ prefect.variables.default_repomix_block }}"
  schedule: null  # Manual trigger only
  work_pool:
    name: "{{ prefect.variables.repo_work_pool | default('default') }}"
    job_variables:
      env:
        INGESTION_TYPE: "repository"

# R2R specialized deployment: runs firecrawl_to_r2r_flow on a cron schedule
- name: firecrawl-to-r2r
  version: "{{ prefect.variables.deployment_version | default('1.0.0') }}"
  tags:
  - "{{ prefect.variables.environment | default('development') }}"
  - firecrawl
  - r2r
  - specialized
  description: "Optimized Firecrawl to R2R ingestion flow"
  entrypoint: ingest_pipeline/flows/ingestion.py:firecrawl_to_r2r_flow
  parameters:
    storage_block_name: "{{ prefect.variables.r2r_storage_block }}"
  # NOTE(review): confirm the singular `schedule:` mapping is honored for
  # prefect-version 3.0.0 (Prefect 3 docs describe a `schedules:` list).
  schedule:
    cron: "{{ prefect.variables.r2r_cron_schedule | default('0 2 * * *') }}"
    timezone: UTC
  work_pool:
    name: "{{ prefect.variables.r2r_work_pool | default('default') }}"
    job_variables:
      env:
        INGESTION_TYPE: "r2r"
        SPECIALIZED_FLOW: "true"

# Automation definitions (commented out - would be created via API)
# automations:
# - name: Cancel Long Running Flows
#   description: Cancels flows running longer than 30 minutes
#   trigger:
#     type: event
#     posture: Proactive
#     expect: [prefect.flow-run.Running]
#     match_related:
#       prefect.resource.role: flow
#       prefect.resource.name: ingestion_pipeline
#     threshold: 1
#     within: 1800
#   actions:
#   - type: cancel-flow-run
#     source: inferred
#   enabled: true

# Variables that should be set for optimal operation
# Use: prefect variable set <name> <value>
# Required variables:
#   - default_storage_backend: weaviate|open_webui|r2r
#   - llm_endpoint: URL for LLM service
#   - weaviate_endpoint: URL for Weaviate instance
#   - openwebui_endpoint: URL for OpenWebUI instance
#   - firecrawl_endpoint: URL for Firecrawl service
#
# Optional variables with defaults:
#   - default_batch_size: 50
#   - max_crawl_depth: 5
#   - max_crawl_pages: 100
#   - max_concurrent_tasks: 5
#   - default_schedule_interval: 3600 (1 hour)
#   - deployment_version: 1.0.0
#   - environment: development

# Block types that should be registered:
#   - storage-config: Storage backend configurations
#   - firecrawl-config: Firecrawl scraping parameters
#   - repomix-config: Repository processing settings
#   - r2r-config: R2R-specific chunking and graph settings