* UN-2901 [FIX] Container startup race condition with polling grace period * UN-2901 [FIX] Add Redis retry resilience and fix container failure detection - Add configurable Redis retry decorator with exponential backoff - Fix critical bug where containers that never start are marked as SUCCESS - Add robust env var validation for retry configuration - Apply retry logic to FileExecutionStatusTracker and ToolExecutionTracker - Document REDIS_RETRY_MAX_ATTEMPTS and REDIS_RETRY_BACKOFF_FACTOR env vars 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> * UN-2901 [FIX] Address CodeRabbitAI review feedback for race condition fix This commit addresses all valid CodeRabbitAI review comments on PR #1602: 1. **Fix retry loop semantics**: Changed retry loop to use range(max_retries + 1) where max_retries means "retries after initial attempt", not total attempts. Updated default from 5 to 4 (total 5 attempts) for clarity. 2. **Fix TypeError in file_execution_tracker.py**: Fixed json.loads() receiving dict instead of string by using string fallback values. 3. **Fix unsafe env var parsing**: Added _safe_get_env_int/_safe_get_env_float helpers with validation and fallback to defaults with warning logs. 4. **Fix status None check**: Added defensive None check before calling .get() on status dict in grace period reset logic. 5. **Update sample.env defaults**: Changed REDIS_RETRY_MAX_ATTEMPTS from 5 to 4 and updated comments to clarify retry semantics. 6. **Improve transient failure handling**: Changed logger.error to logger.warning for transient status fetch failures, added sleep before continue to respect polling interval and avoid API hammering. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> --------- Co-authored-by: Claude <noreply@anthropic.com>
373 lines
12 KiB
Bash
373 lines
12 KiB
Bash
# =============================================================================
|
|
# Unstract Workers Environment Configuration
|
|
# =============================================================================
|
|
# Copy this file to .env and update the values for your environment
|
|
|
|
# =============================================================================
|
|
# Core Configuration (REQUIRED)
|
|
# =============================================================================
|
|
|
|
# Django Backend URL - REQUIRED
|
|
# Docker (default): http://unstract-backend:8000
|
|
# Local development: http://localhost:8000
|
|
DJANGO_APP_BACKEND_URL=http://unstract-backend:8000
|
|
|
|
# Internal API Base URL - REQUIRED
|
|
# This is the full URL with /internal suffix for worker→backend communication
|
|
# Docker: http://unstract-backend:8000/internal
|
|
# Local: http://localhost:8000/internal
|
|
INTERNAL_API_BASE_URL=http://unstract-backend:8000/internal
|
|
|
|
# Internal API Configuration
|
|
INTERNAL_API_PREFIX=/internal
|
|
INTERNAL_API_VERSION=v1
|
|
|
|
# Internal Service API Key - REQUIRED
|
|
INTERNAL_SERVICE_API_KEY=dev-internal-key-123
|
|
|
|
# Internal API Connection Settings
|
|
INTERNAL_API_TIMEOUT=120
|
|
INTERNAL_API_RETRY_ATTEMPTS=3
|
|
INTERNAL_API_RETRY_BACKOFF_FACTOR=1.0
|
|
|
|
# Internal API Endpoint Prefixes
|
|
INTERNAL_API_HEALTH_PREFIX=v1/health/
|
|
INTERNAL_API_TOOL_PREFIX=v1/tool-execution/
|
|
INTERNAL_API_EXECUTION_PREFIX=v1/execution/
|
|
INTERNAL_API_WEBHOOK_PREFIX=v1/webhook/
|
|
INTERNAL_API_FILE_HISTORY_PREFIX=v1/file-history/
|
|
INTERNAL_API_WORKFLOW_PREFIX=v1/workflow-execution/
|
|
INTERNAL_API_ORGANIZATION_PREFIX=v1/organization/
|
|
|
|
# =============================================================================
|
|
# Celery Configuration
|
|
# =============================================================================
|
|
|
|
# Celery Broker (RabbitMQ) - REQUIRED
|
|
# These credentials must match your RabbitMQ configuration
|
|
CELERY_BROKER_BASE_URL=amqp://unstract-rabbitmq:5672//
|
|
CELERY_BROKER_USER=admin
|
|
CELERY_BROKER_PASS=password
|
|
|
|
# =============================================================================
|
|
# Database Configuration (REQUIRED)
|
|
# =============================================================================
|
|
|
|
# PostgreSQL (for Celery result backend) - REQUIRED
|
|
# These credentials must match your PostgreSQL configuration
|
|
DB_HOST=unstract-db
|
|
DB_USER=unstract_dev
|
|
DB_PASSWORD=unstract_pass
|
|
DB_NAME=unstract_db
|
|
DB_PORT=5432
|
|
DB_SCHEMA=unstract
|
|
|
|
# Celery Backend Database Schema
|
|
CELERY_BACKEND_DB_SCHEMA=public
|
|
|
|
# Redis (for caching and queues) - REQUIRED
|
|
REDIS_HOST=unstract-redis
|
|
REDIS_PORT=6379
|
|
REDIS_PASSWORD=
|
|
REDIS_USER=default
|
|
REDIS_DB=0
|
|
|
|
# Cache-Specific Redis Configuration
|
|
CACHE_REDIS_ENABLED=true
|
|
CACHE_REDIS_HOST=unstract-redis
|
|
CACHE_REDIS_PORT=6379
|
|
CACHE_REDIS_DB=0
|
|
CACHE_REDIS_PASSWORD=
|
|
CACHE_REDIS_USERNAME=
|
|
CACHE_REDIS_SSL=false
|
|
CACHE_REDIS_SSL_CERT_REQS=required
|
|
|
|
# Database URL (for fallback usage)
|
|
DATABASE_URL=postgresql://unstract_dev:unstract_pass@unstract-db:5432/unstract_db
|
|
|
|
# =============================================================================
|
|
# Worker Infrastructure Settings
|
|
# =============================================================================
|
|
|
|
# Worker Singleton Infrastructure - Controls shared resource management
|
|
ENABLE_API_CLIENT_SINGLETON=true
|
|
DEBUG_API_CLIENT_INIT=false
|
|
WORKER_INFRASTRUCTURE_HEALTH_CHECK=true
|
|
|
|
# API Client Configuration
|
|
API_CLIENT_POOL_SIZE=3
|
|
|
|
# Config Caching
|
|
ENABLE_CONFIG_CACHE=true
|
|
CONFIG_CACHE_TTL=300
|
|
|
|
# Debug Settings
|
|
ENABLE_DEBUG_LOGGING=false
|
|
DEBUG_ORGANIZATION_CONTEXT=false
|
|
|
|
# Worker Concurrency
|
|
MAX_CONCURRENT_TASKS=10
|
|
|
|
# =============================================================================
|
|
# Worker Performance Settings
|
|
# =============================================================================
|
|
|
|
CELERY_WORKER_PREFETCH_MULTIPLIER=1
|
|
CELERY_TASK_ACKS_LATE=true
|
|
CELERY_WORKER_MAX_TASKS_PER_CHILD=1000
|
|
|
|
# =============================================================================
|
|
# Task Timeout Configuration (Celery Standard Naming Convention)
|
|
# =============================================================================
|
|
# Uses format: {WORKER_TYPE}_TASK_TIME_LIMIT and {WORKER_TYPE}_TASK_SOFT_TIME_LIMIT
|
|
#
|
|
# Resolution hierarchy:
|
|
# 1. Worker-specific, e.g. FILE_PROCESSING_TASK_TIME_LIMIT (highest priority)
|
|
# 2. General: TASK_TIME_LIMIT (fallback for all workers)
|
|
# 3. Code defaults (lowest priority)
|
|
|
|
# General Task Timeouts - Applies to all workers without specific overrides
|
|
# 1 hour - General hard timeout
TASK_TIME_LIMIT=3600
|
|
# 55 minutes - General soft timeout
TASK_SOFT_TIME_LIMIT=3300
|
|
|
|
# Worker-Specific Timeouts - Overrides general timeouts for specific worker types
|
|
# 2 hours - File processing hard timeout
FILE_PROCESSING_TASK_TIME_LIMIT=7200
|
|
# 1h 45m - File processing soft timeout
FILE_PROCESSING_TASK_SOFT_TIME_LIMIT=6300
|
|
# 1 hour - Callback hard timeout
CALLBACK_TASK_TIME_LIMIT=3600
|
|
# 55 minutes - Callback soft timeout
CALLBACK_TASK_SOFT_TIME_LIMIT=3300
|
|
|
|
# Retry Configuration
|
|
CELERY_TASK_DEFAULT_RETRY_DELAY=60
|
|
CELERY_TASK_MAX_RETRIES=3
|
|
CELERY_TASK_REJECT_ON_WORKER_LOST=false
|
|
|
|
# Advanced Celery Configuration
|
|
CELERY_WORKER_POOL_RESTARTS=true
|
|
CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP=true
|
|
CELERY_RESULT_CHORD_RETRY_INTERVAL=3.0
|
|
|
|
# =============================================================================
|
|
# Worker-Specific Configuration
|
|
# =============================================================================
|
|
|
|
# API Deployment Worker
|
|
API_DEPLOYMENT_WORKER_NAME=api-deployment-worker
|
|
API_DEPLOYMENT_HEALTH_PORT=8080
|
|
API_DEPLOYMENT_AUTOSCALE=4,1
|
|
|
|
# General Worker
|
|
GENERAL_WORKER_NAME=general-worker
|
|
GENERAL_HEALTH_PORT=8081
|
|
GENERAL_AUTOSCALE=6,2
|
|
|
|
# File Processing Worker
|
|
FILE_PROCESSING_WORKER_NAME=file-processing-worker
|
|
FILE_PROCESSING_HEALTH_PORT=8082
|
|
FILE_PROCESSING_AUTOSCALE=8,2
|
|
|
|
# Callback Worker
|
|
CALLBACK_WORKER_NAME=callback-worker
|
|
CALLBACK_HEALTH_PORT=8083
|
|
CALLBACK_AUTOSCALE=4,1
|
|
|
|
# Scheduler Worker
|
|
SCHEDULER_WORKER_NAME=scheduler-worker
|
|
SCHEDULER_HEALTH_PORT=8087
|
|
SCHEDULER_AUTOSCALE=2,1
|
|
|
|
# Notification Worker
|
|
NOTIFICATION_WORKER_NAME=notification-worker
|
|
NOTIFICATION_HEALTH_PORT=8085
|
|
NOTIFICATION_AUTOSCALE=4,1
|
|
|
|
# Log Consumer Worker
|
|
LOG_CONSUMER_WORKER_NAME=log-consumer-worker
|
|
LOG_CONSUMER_HEALTH_PORT=8086
|
|
LOG_CONSUMER_AUTOSCALE=2,1
|
|
|
|
# =============================================================================
|
|
# Logging Configuration
|
|
# =============================================================================
|
|
|
|
LOG_LEVEL=INFO
|
|
# Note: LOG_FORMAT removed - format is now hardcoded (not configurable)
|
|
# All workers use a single standardized format matching Django backend
|
|
DEFAULT_LOG_LEVEL=INFO
|
|
WORKER_VERSION=1.0.0
|
|
WORKER_INSTANCE_ID=dev-01
|
|
|
|
# Log History Configuration
|
|
ENABLE_LOG_HISTORY=true
|
|
LOG_HISTORY_CONSUMER_INTERVAL=30
|
|
LOGS_BATCH_LIMIT=30
|
|
LOGS_EXPIRATION_TIME_IN_SECOND=86400
|
|
LOG_HISTORY_QUEUE_NAME=log_history_queue
|
|
|
|
# Log Queue Size Protection
|
|
# Maximum number of logs in Redis queue before dropping new logs
|
|
LOG_QUEUE_MAX_SIZE=10000
|
|
|
|
# =============================================================================
|
|
# Queue Configuration
|
|
# =============================================================================
|
|
|
|
# Notification Queue Name
|
|
NOTIFICATION_QUEUE_NAME=notifications
|
|
|
|
# =============================================================================
|
|
# Backend Services
|
|
# =============================================================================
|
|
|
|
# Platform Service
|
|
PLATFORM_SERVICE_HOST=http://unstract-platform-service
|
|
PLATFORM_SERVICE_PORT=3001
|
|
|
|
# Prompt Service
|
|
PROMPT_HOST=http://unstract-prompt-service
|
|
PROMPT_PORT=3003
|
|
|
|
# X2Text Service
|
|
X2TEXT_HOST=http://unstract-x2text-service
|
|
X2TEXT_PORT=3004
|
|
|
|
# Tool Runner
|
|
UNSTRACT_RUNNER_HOST=http://unstract-runner
|
|
UNSTRACT_RUNNER_PORT=5002
|
|
UNSTRACT_RUNNER_API_TIMEOUT=300
|
|
UNSTRACT_RUNNER_API_RETRY_COUNT=5
|
|
UNSTRACT_RUNNER_API_BACKOFF_FACTOR=3
|
|
|
|
# =============================================================================
|
|
# File Storage Configuration
|
|
# =============================================================================
|
|
|
|
# File Storage Credentials (MinIO)
|
|
WORKFLOW_EXECUTION_FILE_STORAGE_CREDENTIALS='{"provider": "minio", "credentials": {"endpoint_url": "http://unstract-minio:9000", "key": "minio", "secret": "minio123"}}'
|
|
API_FILE_STORAGE_CREDENTIALS='{"provider": "minio", "credentials": {"endpoint_url": "http://unstract-minio:9000", "key": "minio", "secret": "minio123"}}'
|
|
|
|
# File Execution Configuration
|
|
WORKFLOW_EXECUTION_DIR_PREFIX=unstract/execution
|
|
API_EXECUTION_DIR_PREFIX=unstract/api
|
|
MAX_PARALLEL_FILE_BATCHES=1
|
|
|
|
# File Execution TTL Configuration
|
|
FILE_EXECUTION_TRACKER_TTL_IN_SECOND=18000
|
|
FILE_EXECUTION_TRACKER_COMPLETED_TTL_IN_SECOND=300
|
|
|
|
# Destination Processing TTL Configuration
|
|
DESTINATION_PROCESSING_STAGE_TTL_IN_SECOND=600
|
|
# Actual Redis lock TTL, in seconds
|
|
DESTINATION_PROCESSING_LOCK_TTL_IN_SECOND=10
|
|
|
|
EXECUTION_RESULT_TTL_SECONDS=86400
|
|
EXECUTION_CACHE_TTL_SECONDS=86400
|
|
INSTANT_WF_POLLING_TIMEOUT=300
|
|
|
|
# Active file execution cache TTL, in seconds
|
|
ACTIVE_FILE_CACHE_TTL=300
|
|
|
|
# Polling Grace Period for NOT_FOUND Status
|
|
# How long the status poller tolerates NOT_FOUND before treating it as failure
|
|
POLL_NOT_FOUND_GRACE_PERIOD=40
|
|
|
|
# Redis Retry Configuration
|
|
# Controls automatic retry behavior for transient Redis connection failures
|
|
# These settings apply to all Redis read operations in execution status trackers
|
|
# Max retry attempts after initial attempt before giving up (default: 4, total attempts: 5)
|
|
REDIS_RETRY_MAX_ATTEMPTS=4
|
|
# Exponential backoff multiplier in seconds (default: 0.5)
|
|
# With defaults (4 retries after initial = 5 total attempts):
|
|
# Retry delays: 0.5s, 1s, 2s, 4s (7.5s of total backoff across the 5 attempts)
|
|
# Handles: network blips, Redis restarts (3-5s), connection pool exhaustion
|
|
REDIS_RETRY_BACKOFF_FACTOR=0.5
|
|
|
|
# =============================================================================
|
|
# Development Settings
|
|
# =============================================================================
|
|
|
|
DEBUG=false
|
|
TESTING=false
|
|
ENABLE_METRICS=true
|
|
ENABLE_FILE_HISTORY=true
|
|
ENABLE_WEBHOOK_DELIVERY=true
|
|
|
|
# Tool Registry
|
|
TOOL_REGISTRY_CONFIG_PATH=../unstract/tool-registry/tool_registry_config
|
|
TOOL_REGISTRY_STORAGE_CREDENTIALS='{"provider":"local"}'
|
|
|
|
# =============================================================================
|
|
# Optional Advanced Settings
|
|
# =============================================================================
|
|
|
|
# Health Checks
|
|
HEALTH_CHECK_INTERVAL=30
|
|
HEALTH_CHECK_TIMEOUT=10
|
|
METRICS_PORT=8080
|
|
|
|
# Circuit Breaker
|
|
CIRCUIT_BREAKER_FAILURE_THRESHOLD=5
|
|
CIRCUIT_BREAKER_RECOVERY_TIMEOUT=60
|
|
|
|
# Notifications
|
|
NOTIFICATION_TIMEOUT=5
|
|
|
|
# Cache
|
|
CACHE_TTL_SEC=10800
|
|
|
|
# Connection Pooling
|
|
CONNECTION_POOL_SIZE=10
|
|
CONNECTION_POOL_MAX_OVERFLOW=20
|
|
|
|
# Task Routing and Backup
|
|
ENABLE_PRIORITY_ROUTING=false
|
|
HIGH_PRIORITY_QUEUE_SUFFIX=_high
|
|
LOW_PRIORITY_QUEUE_SUFFIX=_low
|
|
ENABLE_TASK_BACKUP=false
|
|
BACKUP_INTERVAL=3600
|
|
|
|
# Feature Flags
|
|
ENABLE_DESTINATION_CONNECTORS=true
|
|
ENABLE_CLEANUP_TASKS=true
|
|
|
|
# Security (for production)
|
|
SECURE_SSL_REDIRECT=false
|
|
SESSION_COOKIE_SECURE=false
|
|
CSRF_COOKIE_SECURE=false
|
|
|
|
# Monitoring
|
|
SENTRY_DSN=
|
|
SENTRY_ENVIRONMENT=development
|
|
|
|
# Google Drive
|
|
GDRIVE_GOOGLE_SERVICE_ACCOUNT=
|
|
GDRIVE_GOOGLE_PROJECT_ID=
|
|
GOOGLE_STORAGE_ACCESS_KEY_ID=
|
|
GOOGLE_STORAGE_SECRET_ACCESS_KEY=
|
|
GOOGLE_STORAGE_BASE_URL=https://storage.googleapis.com
|
|
|
|
# Connector OAuth
|
|
SOCIAL_AUTH_EXTRA_DATA_EXPIRATION_TIME_IN_SECOND=3600
|
|
GOOGLE_OAUTH2_KEY=
|
|
GOOGLE_OAUTH2_SECRET=
|
|
|
|
|
|
|
|
# =============================================================================
|
|
# Local Development Overrides
|
|
# =============================================================================
|
|
# For local development (all services on host), change Docker service names to localhost:
|
|
# DJANGO_APP_BACKEND_URL=http://localhost:8000
|
|
# INTERNAL_API_BASE_URL=http://localhost:8000/internal
|
|
# CELERY_BROKER_BASE_URL=amqp://localhost:5672//
|
|
# DB_HOST=localhost
|
|
# REDIS_HOST=localhost
|
|
# CACHE_REDIS_HOST=localhost
|
|
# PLATFORM_SERVICE_HOST=http://localhost
|
|
# PROMPT_HOST=http://localhost
|
|
# X2TEXT_HOST=http://localhost
|
|
# UNSTRACT_RUNNER_HOST=http://localhost
|
|
# WORKFLOW_EXECUTION_FILE_STORAGE_CREDENTIALS='{"provider": "minio", "credentials": {"endpoint_url": "http://localhost:9000", "key": "minio", "secret": "minio123"}}'
|
|
# API_FILE_STORAGE_CREDENTIALS='{"provider": "minio", "credentials": {"endpoint_url": "http://localhost:9000", "key": "minio", "secret": "minio123"}}'
|
|
|
|
PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
|