UN-2874 [FIX] Prevent duplicate destination entries when a worker drops abruptly, using an atomic lock mechanism
- Fixed stage order: DESTINATION_PROCESSING(2) → FINALIZATION(3) → COMPLETED(4)
- Implemented atomic lock via DESTINATION_PROCESSING IN_PROGRESS status
- Added a short TTL (300s) for the COMPLETED stage to optimize Redis memory
- Removed conflicting stage status updates from service.py
- Enhanced status history tracking with complete IN_PROGRESS → SUCCESS transitions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
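The core of the fix, condensed (a sketch using the names from the diff below; it assumes, as the new docstring states, that `update_stage_status()` raises `FileExecutionStageException` when another worker has already advanced the stage):

```python
# Sketch only - condensed from the new _check_and_acquire_destination_lock below.
from unstract.core.exceptions import FileExecutionStageException
from unstract.core.file_execution_tracker import (
    FileExecutionStage,
    FileExecutionStageData,
    FileExecutionStageStatus,
    FileExecutionStatusTracker,
)


def try_acquire_destination_lock(execution_id: str, file_execution_id: str) -> bool:
    """Return True only if this worker wins the right to write destination output."""
    tracker = FileExecutionStatusTracker()
    data = tracker.get_data(execution_id, file_execution_id)
    if data and data.stage_status.stage in (
        FileExecutionStage.DESTINATION_PROCESSING,
        FileExecutionStage.COMPLETED,
    ):
        return False  # destination already handled (duplicate delivery)
    try:
        # First worker to record DESTINATION_PROCESSING/IN_PROGRESS holds the lock.
        tracker.update_stage_status(
            execution_id,
            file_execution_id,
            FileExecutionStageData(
                stage=FileExecutionStage.DESTINATION_PROCESSING,
                status=FileExecutionStageStatus.IN_PROGRESS,
            ),
        )
        return True
    except FileExecutionStageException:
        return False  # another worker set the stage first
```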
@@ -20,6 +20,7 @@ class FileExecutionStage(Enum):
     INITIALIZATION = "INITIALIZATION"
     TOOL_EXECUTION = "TOOL_EXECUTION"
     FINALIZATION = "FINALIZATION"
+    DESTINATION_PROCESSING = "DESTINATION_PROCESSING"
     COMPLETED = "COMPLETED"

     @property
@@ -42,8 +43,9 @@ class FileExecutionStage(Enum):
 FILE_EXECUTION_STAGE_ORDER = {
     FileExecutionStage.INITIALIZATION: 0,
     FileExecutionStage.TOOL_EXECUTION: 1,
-    FileExecutionStage.FINALIZATION: 2,
-    FileExecutionStage.COMPLETED: 3,
+    FileExecutionStage.DESTINATION_PROCESSING: 2,
+    FileExecutionStage.FINALIZATION: 3,
+    FileExecutionStage.COMPLETED: 4,
 }

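For reference, a quick sanity check of the new ordering (a sketch; it assumes `FILE_EXECUTION_STAGE_ORDER` is importable from the same module as the enum):

```python
from unstract.core.file_execution_tracker import (
    FILE_EXECUTION_STAGE_ORDER,
    FileExecutionStage,
)

# Destination processing now sits between tool execution and finalization, so
# finalization/completion can no longer be recorded before the destination write.
assert (
    FILE_EXECUTION_STAGE_ORDER[FileExecutionStage.TOOL_EXECUTION]
    < FILE_EXECUTION_STAGE_ORDER[FileExecutionStage.DESTINATION_PROCESSING]
    < FILE_EXECUTION_STAGE_ORDER[FileExecutionStage.FINALIZATION]
    < FILE_EXECUTION_STAGE_ORDER[FileExecutionStage.COMPLETED]
)
```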
@@ -146,6 +146,10 @@ CELERY_WORKER_POOL_RESTARTS=true
 CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP=true
 CELERY_RESULT_CHORD_RETRY_INTERVAL=3.0
+
+# Disable AMQP heartbeats to prevent connection drops during pod shutdown.
+# Fixes duplicate task processing with prefork+acks_late. See: celery/celery#3802
+CELERY_WORKER_HEARTBEAT=0

 # =============================================================================
 # Worker-Specific Configuration
 # =============================================================================
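How `CELERY_WORKER_HEARTBEAT` is consumed is outside this diff; Celery's own setting is `broker_heartbeat`, so the wiring presumably looks something like this (a sketch, not project code):

```python
import os

from celery import Celery

app = Celery("worker")
# 0 disables AMQP heartbeats entirely, avoiding broker-side connection resets
# during pod shutdown (see celery/celery#3802 for the prefork + acks_late issue).
app.conf.broker_heartbeat = int(os.environ.get("CELERY_WORKER_HEARTBEAT", 0))
```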
@@ -233,7 +237,7 @@ X2TEXT_PORT=3004
 # Tool Runner
 UNSTRACT_RUNNER_HOST=http://unstract-runner
 UNSTRACT_RUNNER_PORT=5002
-UNSTRACT_RUNNER_API_TIMEOUT=120
+UNSTRACT_RUNNER_API_TIMEOUT=300
 UNSTRACT_RUNNER_API_RETRY_COUNT=5
 UNSTRACT_RUNNER_API_BACKOFF_FACTOR=3

@@ -252,7 +256,7 @@ MAX_PARALLEL_FILE_BATCHES=1

 # File Execution TTL Configuration
 FILE_EXECUTION_TRACKER_TTL_IN_SECOND=18000
-FILE_EXECUTION_TRACKER_COMPLETED_TTL_IN_SECOND=600
+FILE_EXECUTION_TRACKER_COMPLETED_TTL_IN_SECOND=300
 EXECUTION_RESULT_TTL_SECONDS=86400
 EXECUTION_CACHE_TTL_SECONDS=86400
 INSTANT_WF_POLLING_TIMEOUT=300
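The shorter completed-stage TTL is read back by the worker exactly as shown in the service hunk further down; for reference:

```python
import os

# Falls back to 300 s (5 minutes) when the variable is unset, matching the value set above.
completed_ttl = int(
    os.environ.get("FILE_EXECUTION_TRACKER_COMPLETED_TTL_IN_SECOND", 300)
)
```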
@@ -33,6 +33,13 @@ from unstract.connectors.connectorkit import Connectorkit
 from unstract.connectors.exceptions import ConnectorError
 from unstract.core.data_models import ConnectionType as CoreConnectionType
 from unstract.core.data_models import FileHashData
+from unstract.core.exceptions import FileExecutionStageException
+from unstract.core.file_execution_tracker import (
+    FileExecutionStage,
+    FileExecutionStageData,
+    FileExecutionStageStatus,
+    FileExecutionStatusTracker,
+)
 from unstract.filesystem import FileStorageType, FileSystem
 from unstract.sdk.constants import ToolExecKey
 from unstract.sdk.tool.mime_types import EXT_MIME_MAP
@@ -304,6 +311,87 @@ class WorkerDestinationConnector:

         return ProcessingResult(tool_execution_result=tool_result, metadata=metadata)

+    def _check_and_acquire_destination_lock(
+        self, exec_ctx: ExecutionContext, file_ctx: FileContext
+    ) -> bool:
+        """Check if destination already processed and atomically acquire lock.
+
+        Returns:
+            bool: True if lock acquired successfully, False if already processed (duplicate)
+
+        This method provides duplicate prevention using FileExecutionStatusTracker:
+        1. Check if DESTINATION_PROCESSING or COMPLETED stage already exists
+        2. If yes, this is a duplicate attempt (e.g., from worker restart) -> skip
+        3. If no, atomically set DESTINATION_PROCESSING stage (acts as distributed lock)
+        4. If FileExecutionStageException raised, another worker has the lock -> skip
+        """
+        try:
+            tracker = FileExecutionStatusTracker()
+
+            # Check if destination already processed
+            existing_data = tracker.get_data(
+                exec_ctx.execution_id, exec_ctx.file_execution_id
+            )
+
+            if existing_data:
+                current_stage = existing_data.stage_status.stage
+                if current_stage in [
+                    FileExecutionStage.DESTINATION_PROCESSING,
+                    FileExecutionStage.COMPLETED,
+                ]:
+                    logger.warning(
+                        f"Destination already processed for file '{file_ctx.file_name}' "
+                        f"at stage {current_stage.value}. Skipping duplicate processing."
+                    )
+                    log_file_info(
+                        exec_ctx.workflow_log,
+                        exec_ctx.file_execution_id,
+                        f"File '{file_ctx.file_name}' destination already processed - skipping duplicate",
+                    )
+                    return False  # Duplicate detected
+
+            # Atomically acquire lock by setting DESTINATION_PROCESSING stage
+            # This prevents race conditions - first worker to set this stage wins
+            try:
+                logger.info(
+                    f"Acquiring destination processing lock for file '{file_ctx.file_name}'"
+                )
+                tracker.update_stage_status(
+                    exec_ctx.execution_id,
+                    exec_ctx.file_execution_id,
+                    FileExecutionStageData(
+                        stage=FileExecutionStage.DESTINATION_PROCESSING,
+                        status=FileExecutionStageStatus.IN_PROGRESS,
+                    ),
+                )
+                logger.info(
+                    f"Successfully acquired destination processing lock for file '{file_ctx.file_name}'"
+                )
+                return True  # Lock acquired successfully
+
+            except FileExecutionStageException as stage_error:
+                # Another worker already acquired the lock (stage validation failed)
+                logger.warning(
+                    f"Race condition detected: Another worker acquired destination lock "
+                    f"for file '{file_ctx.file_name}'. Skipping duplicate processing. "
+                    f"Error: {str(stage_error)}"
+                )
+                log_file_info(
+                    exec_ctx.workflow_log,
+                    exec_ctx.file_execution_id,
+                    f"File '{file_ctx.file_name}' destination lock acquired by another worker - skipping duplicate",
+                )
+                return False  # Another worker has the lock
+
+        except Exception as e:
+            # If Redis fails or other unexpected error, log but allow processing to continue
+            # This ensures graceful degradation if tracking system is unavailable
+            logger.error(
+                f"Failed to check/acquire destination lock for file '{file_ctx.file_name}': {str(e)}. "
+                f"Allowing processing to continue (graceful degradation)."
+            )
+            return True  # Allow processing on infrastructure failure
+
     def _check_and_handle_hitl(
         self, exec_ctx: ExecutionContext, file_ctx: FileContext, result: ProcessingResult
     ) -> bool:
@@ -547,6 +635,21 @@ class WorkerDestinationConnector:
         # Extract processing data
         result = self._extract_processing_data(exec_ctx, file_ctx)

+        # Check if destination already processed and atomically acquire lock
+        # This prevents duplicate insertions during warm shutdown scenarios
+        lock_acquired = self._check_and_acquire_destination_lock(exec_ctx, file_ctx)
+        if not lock_acquired:
+            # Duplicate detected or another worker has the lock - skip processing
+            logger.info(
+                f"Skipping destination processing for file '{file_ctx.file_name}' - "
+                f"already processed or being processed by another worker"
+            )
+            return HandleOutputResult(
+                output=result.tool_execution_result,
+                metadata=result.metadata,
+                connection_type=self.connection_type,
+            )
+
         # Check and handle HITL if needed
         result.has_hitl = self._check_and_handle_hitl(exec_ctx, file_ctx, result)

@@ -5,6 +5,7 @@ from unstract/workflow-execution, enabling workers to execute workflows
 directly using the ToolSandbox and runner services.
 """

+import os
 import time
 from typing import Any

@@ -207,22 +208,6 @@ class WorkerWorkflowExecutionService:
         # - Writing to filesystem/database
         # - Routing to manual review
         # - Creating file history
-        # Track finalization stage before destination processing
-        if workflow_file_execution_id:
-            try:
-                tracker = FileExecutionStatusTracker()
-                tracker.update_stage_status(
-                    execution_id=execution_id,
-                    file_execution_id=workflow_file_execution_id,
-                    stage_status=FileExecutionStageData(
-                        stage=FileExecutionStage.FINALIZATION,
-                        status=FileExecutionStageStatus.IN_PROGRESS,
-                    ),
-                )
-                logger.info(f"Tracked finalization stage for {file_name}")
-            except Exception as tracker_error:
-                logger.warning(f"Failed to track finalization stage: {tracker_error}")
-
         destination_result = None
         destination_start_time = time.time()
         logger.info(
@@ -245,6 +230,26 @@ class WorkerWorkflowExecutionService:
             )
             logger.info(f"Destination processing completed for {file_name}")

+            # Mark destination processing as successful
+            if workflow_file_execution_id and workflow_success:
+                try:
+                    tracker = FileExecutionStatusTracker()
+                    tracker.update_stage_status(
+                        execution_id=execution_id,
+                        file_execution_id=workflow_file_execution_id,
+                        stage_status=FileExecutionStageData(
+                            stage=FileExecutionStage.DESTINATION_PROCESSING,
+                            status=FileExecutionStageStatus.SUCCESS,
+                        ),
+                    )
+                    logger.info(
+                        f"Marked destination processing as successful for {file_name}"
+                    )
+                except Exception as tracker_error:
+                    logger.warning(
+                        f"Failed to mark destination processing success: {tracker_error}"
+                    )
+
         except Exception as dest_error:
             logger.error(
                 f"Destination processing failed for {file_name}: {dest_error}",
@@ -306,7 +311,25 @@ class WorkerWorkflowExecutionService:
                destination_result and not destination_result.error
            )

+           # Stage flow: DESTINATION_PROCESSING(2) → FINALIZATION(3) → COMPLETED(4)
            if overall_success:
+               # Mark finalization as successful
+               tracker.update_stage_status(
+                   execution_id=execution_id,
+                   file_execution_id=workflow_file_execution_id,
+                   stage_status=FileExecutionStageData(
+                       stage=FileExecutionStage.FINALIZATION,
+                       status=FileExecutionStageStatus.SUCCESS,
+                   ),
+               )
+               logger.info(f"Marked finalization as successful for {file_name}")
+
+               # Use shorter TTL for COMPLETED stage to optimize Redis memory
+               completed_ttl = int(
+                   os.environ.get(
+                       "FILE_EXECUTION_TRACKER_COMPLETED_TTL_IN_SECOND", 300
+                   )
+               )
                tracker.update_stage_status(
                    execution_id=execution_id,
                    file_execution_id=workflow_file_execution_id,
@@ -314,6 +337,7 @@ class WorkerWorkflowExecutionService:
                        stage=FileExecutionStage.COMPLETED,
                        status=FileExecutionStageStatus.SUCCESS,
                    ),
+                   ttl_in_second=completed_ttl,
                )
                logger.info(f"Tracked successful completion for {file_name}")
            else:
@@ -340,12 +364,6 @@ class WorkerWorkflowExecutionService:
                    file_execution_id=workflow_file_execution_id,
                )

-           # Clean up file execution tracker (log data then delete)
-           self._cleanup_file_execution_tracker(
-               execution_id=execution_id,
-               file_execution_id=workflow_file_execution_id,
-           )
-
        except Exception as tracker_error:
            logger.warning(f"Failed to track final completion stage: {tracker_error}")

@@ -402,50 +420,6 @@ class WorkerWorkflowExecutionService:
                 f"Failed to initialize file execution tracker for {execution_id}/{file_execution_id}: {e}"
             )

-    def _cleanup_file_execution_tracker(
-        self,
-        execution_id: str,
-        file_execution_id: str,
-    ) -> None:
-        """Clean up file execution tracker after processing completes.
-
-        Logs file execution data for debugging purposes before cleanup.
-        """
-        try:
-            tracker = FileExecutionStatusTracker()
-
-            # Get current file execution data for logging before cleanup
-            file_execution_data = tracker.get_data(execution_id, file_execution_id)
-
-            if file_execution_data:
-                # Log file execution data for debugging purposes
-                logger.info(
-                    f"File execution tracker data before cleanup - "
-                    f"execution_id: {execution_id}, file_execution_id: {file_execution_id}, "
-                    f"stage: {file_execution_data.stage_status.stage.value}, "
-                    f"status: {file_execution_data.stage_status.status.value}, "
-                    f"organization_id: {file_execution_data.organization_id}, "
-                    f"error: {file_execution_data.stage_status.error or 'None'}"
-                )
-
-                # Actually delete file execution tracker data from Redis
-                tracker.delete_data(execution_id, file_execution_id)
-                logger.info(
-                    f"Deleted file execution tracker for execution_id: {execution_id}, "
-                    f"file_execution_id: {file_execution_id}"
-                )
-            else:
-                logger.debug(
-                    f"No file execution tracker data found for execution_id: {execution_id}, "
-                    f"file_execution_id: {file_execution_id}"
-                )
-
-        except Exception as e:
-            # Non-critical - log and continue
-            logger.warning(
-                f"Failed to cleanup file execution tracker for {execution_id}/{file_execution_id}: {e}"
-            )
-
     def _cleanup_tool_execution_tracker(
         self,
         execution_id: str,