UN-2874 [FIX] Prevent duplicate destination entries when a worker drops abruptly, using an atomic lock mechanism
- Fixed stage order: DESTINATION_PROCESSING(2) → FINALIZATION(3) → COMPLETED(4)
- Implemented atomic lock via DESTINATION_PROCESSING IN_PROGRESS status
- Added a short TTL (300s) for the COMPLETED stage to optimize Redis memory
- Removed conflicting stage status updates from service.py
- Enhanced status history tracking with complete IN_PROGRESS → SUCCESS transitions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
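The core of the fix, condensed (a sketch using the names from the diff below; it assumes, as the new docstring states, that `update_stage_status()` raises `FileExecutionStageException` when another worker has already advanced the stage):

```python
# Sketch only - condensed from the new _check_and_acquire_destination_lock below.
from unstract.core.exceptions import FileExecutionStageException
from unstract.core.file_execution_tracker import (
    FileExecutionStage,
    FileExecutionStageData,
    FileExecutionStageStatus,
    FileExecutionStatusTracker,
)


def try_acquire_destination_lock(execution_id: str, file_execution_id: str) -> bool:
    """Return True only if this worker wins the right to write destination output."""
    tracker = FileExecutionStatusTracker()
    data = tracker.get_data(execution_id, file_execution_id)
    if data and data.stage_status.stage in (
        FileExecutionStage.DESTINATION_PROCESSING,
        FileExecutionStage.COMPLETED,
    ):
        return False  # destination already handled (duplicate delivery)
    try:
        # First worker to record DESTINATION_PROCESSING/IN_PROGRESS holds the lock.
        tracker.update_stage_status(
            execution_id,
            file_execution_id,
            FileExecutionStageData(
                stage=FileExecutionStage.DESTINATION_PROCESSING,
                status=FileExecutionStageStatus.IN_PROGRESS,
            ),
        )
        return True
    except FileExecutionStageException:
        return False  # another worker set the stage first
```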
@@ -20,6 +20,7 @@ class FileExecutionStage(Enum):
     INITIALIZATION = "INITIALIZATION"
     TOOL_EXECUTION = "TOOL_EXECUTION"
     FINALIZATION = "FINALIZATION"
+    DESTINATION_PROCESSING = "DESTINATION_PROCESSING"
     COMPLETED = "COMPLETED"

     @property
@@ -42,8 +43,9 @@ class FileExecutionStage(Enum):
 FILE_EXECUTION_STAGE_ORDER = {
     FileExecutionStage.INITIALIZATION: 0,
     FileExecutionStage.TOOL_EXECUTION: 1,
-    FileExecutionStage.FINALIZATION: 2,
-    FileExecutionStage.COMPLETED: 3,
+    FileExecutionStage.DESTINATION_PROCESSING: 2,
+    FileExecutionStage.FINALIZATION: 3,
+    FileExecutionStage.COMPLETED: 4,
 }

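For reference, a quick sanity check of the new ordering (a sketch; it assumes `FILE_EXECUTION_STAGE_ORDER` is importable from the same module as the enum):

```python
from unstract.core.file_execution_tracker import (
    FILE_EXECUTION_STAGE_ORDER,
    FileExecutionStage,
)

# Destination processing now sits between tool execution and finalization, so
# finalization/completion can no longer be recorded before the destination write.
assert (
    FILE_EXECUTION_STAGE_ORDER[FileExecutionStage.TOOL_EXECUTION]
    < FILE_EXECUTION_STAGE_ORDER[FileExecutionStage.DESTINATION_PROCESSING]
    < FILE_EXECUTION_STAGE_ORDER[FileExecutionStage.FINALIZATION]
    < FILE_EXECUTION_STAGE_ORDER[FileExecutionStage.COMPLETED]
)
```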
@@ -146,6 +146,10 @@ CELERY_WORKER_POOL_RESTARTS=true
 CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP=true
 CELERY_RESULT_CHORD_RETRY_INTERVAL=3.0
+
+# Disable AMQP heartbeats to prevent connection drops during pod shutdown.
+# Fixes duplicate task processing with prefork+acks_late. See: celery/celery#3802
+CELERY_WORKER_HEARTBEAT=0

 # =============================================================================
 # Worker-Specific Configuration
 # =============================================================================
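How `CELERY_WORKER_HEARTBEAT` is consumed is outside this diff; Celery's own setting is `broker_heartbeat`, so the wiring presumably looks something like this (a sketch, not project code):

```python
import os

from celery import Celery

app = Celery("worker")
# 0 disables AMQP heartbeats entirely, avoiding broker-side connection resets
# during pod shutdown (see celery/celery#3802 for the prefork + acks_late issue).
app.conf.broker_heartbeat = int(os.environ.get("CELERY_WORKER_HEARTBEAT", 0))
```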
@@ -233,7 +237,7 @@ X2TEXT_PORT=3004
 # Tool Runner
 UNSTRACT_RUNNER_HOST=http://unstract-runner
 UNSTRACT_RUNNER_PORT=5002
-UNSTRACT_RUNNER_API_TIMEOUT=120
+UNSTRACT_RUNNER_API_TIMEOUT=300
 UNSTRACT_RUNNER_API_RETRY_COUNT=5
 UNSTRACT_RUNNER_API_BACKOFF_FACTOR=3

@@ -252,7 +256,7 @@ MAX_PARALLEL_FILE_BATCHES=1

 # File Execution TTL Configuration
 FILE_EXECUTION_TRACKER_TTL_IN_SECOND=18000
-FILE_EXECUTION_TRACKER_COMPLETED_TTL_IN_SECOND=600
+FILE_EXECUTION_TRACKER_COMPLETED_TTL_IN_SECOND=300
 EXECUTION_RESULT_TTL_SECONDS=86400
 EXECUTION_CACHE_TTL_SECONDS=86400
 INSTANT_WF_POLLING_TIMEOUT=300
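The shorter completed-stage TTL is read back by the worker exactly as shown in the service hunk further down; for reference:

```python
import os

# Falls back to 300 s (5 minutes) when the variable is unset, matching the value set above.
completed_ttl = int(
    os.environ.get("FILE_EXECUTION_TRACKER_COMPLETED_TTL_IN_SECOND", 300)
)
```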
@@ -33,6 +33,13 @@ from unstract.connectors.connectorkit import Connectorkit
 from unstract.connectors.exceptions import ConnectorError
 from unstract.core.data_models import ConnectionType as CoreConnectionType
 from unstract.core.data_models import FileHashData
+from unstract.core.exceptions import FileExecutionStageException
+from unstract.core.file_execution_tracker import (
+    FileExecutionStage,
+    FileExecutionStageData,
+    FileExecutionStageStatus,
+    FileExecutionStatusTracker,
+)
 from unstract.filesystem import FileStorageType, FileSystem
 from unstract.sdk.constants import ToolExecKey
 from unstract.sdk.tool.mime_types import EXT_MIME_MAP
@@ -304,6 +311,87 @@ class WorkerDestinationConnector:

         return ProcessingResult(tool_execution_result=tool_result, metadata=metadata)

+    def _check_and_acquire_destination_lock(
+        self, exec_ctx: ExecutionContext, file_ctx: FileContext
+    ) -> bool:
+        """Check if destination already processed and atomically acquire lock.
+
+        Returns:
+            bool: True if lock acquired successfully, False if already processed (duplicate)
+
+        This method provides duplicate prevention using FileExecutionStatusTracker:
+        1. Check if DESTINATION_PROCESSING or COMPLETED stage already exists
+        2. If yes, this is a duplicate attempt (e.g., from worker restart) -> skip
+        3. If no, atomically set DESTINATION_PROCESSING stage (acts as distributed lock)
+        4. If FileExecutionStageException raised, another worker has the lock -> skip
+        """
+        try:
+            tracker = FileExecutionStatusTracker()
+
+            # Check if destination already processed
+            existing_data = tracker.get_data(
+                exec_ctx.execution_id, exec_ctx.file_execution_id
+            )
+
+            if existing_data:
+                current_stage = existing_data.stage_status.stage
+                if current_stage in [
+                    FileExecutionStage.DESTINATION_PROCESSING,
+                    FileExecutionStage.COMPLETED,
+                ]:
+                    logger.warning(
+                        f"Destination already processed for file '{file_ctx.file_name}' "
+                        f"at stage {current_stage.value}. Skipping duplicate processing."
+                    )
+                    log_file_info(
+                        exec_ctx.workflow_log,
+                        exec_ctx.file_execution_id,
+                        f"File '{file_ctx.file_name}' destination already processed - skipping duplicate",
+                    )
+                    return False  # Duplicate detected
+
+            # Atomically acquire lock by setting DESTINATION_PROCESSING stage
+            # This prevents race conditions - first worker to set this stage wins
+            try:
+                logger.info(
+                    f"Acquiring destination processing lock for file '{file_ctx.file_name}'"
+                )
+                tracker.update_stage_status(
+                    exec_ctx.execution_id,
+                    exec_ctx.file_execution_id,
+                    FileExecutionStageData(
+                        stage=FileExecutionStage.DESTINATION_PROCESSING,
+                        status=FileExecutionStageStatus.IN_PROGRESS,
+                    ),
+                )
+                logger.info(
+                    f"Successfully acquired destination processing lock for file '{file_ctx.file_name}'"
+                )
+                return True  # Lock acquired successfully
+
+            except FileExecutionStageException as stage_error:
+                # Another worker already acquired the lock (stage validation failed)
+                logger.warning(
+                    f"Race condition detected: Another worker acquired destination lock "
+                    f"for file '{file_ctx.file_name}'. Skipping duplicate processing. "
+                    f"Error: {str(stage_error)}"
+                )
+                log_file_info(
+                    exec_ctx.workflow_log,
+                    exec_ctx.file_execution_id,
+                    f"File '{file_ctx.file_name}' destination lock acquired by another worker - skipping duplicate",
+                )
+                return False  # Another worker has the lock
+
+        except Exception as e:
+            # If Redis fails or other unexpected error, log but allow processing to continue
+            # This ensures graceful degradation if tracking system is unavailable
+            logger.error(
+                f"Failed to check/acquire destination lock for file '{file_ctx.file_name}': {str(e)}. "
+                f"Allowing processing to continue (graceful degradation)."
+            )
+            return True  # Allow processing on infrastructure failure
+
     def _check_and_handle_hitl(
         self, exec_ctx: ExecutionContext, file_ctx: FileContext, result: ProcessingResult
     ) -> bool:
@@ -547,6 +635,21 @@ class WorkerDestinationConnector:
         # Extract processing data
         result = self._extract_processing_data(exec_ctx, file_ctx)

+        # Check if destination already processed and atomically acquire lock
+        # This prevents duplicate insertions during warm shutdown scenarios
+        lock_acquired = self._check_and_acquire_destination_lock(exec_ctx, file_ctx)
+        if not lock_acquired:
+            # Duplicate detected or another worker has the lock - skip processing
+            logger.info(
+                f"Skipping destination processing for file '{file_ctx.file_name}' - "
+                f"already processed or being processed by another worker"
+            )
+            return HandleOutputResult(
+                output=result.tool_execution_result,
+                metadata=result.metadata,
+                connection_type=self.connection_type,
+            )
+
         # Check and handle HITL if needed
         result.has_hitl = self._check_and_handle_hitl(exec_ctx, file_ctx, result)

@@ -5,6 +5,7 @@ from unstract/workflow-execution, enabling workers to execute workflows
 directly using the ToolSandbox and runner services.
 """

+import os
 import time
 from typing import Any

@@ -207,22 +208,6 @@ class WorkerWorkflowExecutionService:
         # - Writing to filesystem/database
         # - Routing to manual review
         # - Creating file history
-        # Track finalization stage before destination processing
-        if workflow_file_execution_id:
-            try:
-                tracker = FileExecutionStatusTracker()
-                tracker.update_stage_status(
-                    execution_id=execution_id,
-                    file_execution_id=workflow_file_execution_id,
-                    stage_status=FileExecutionStageData(
-                        stage=FileExecutionStage.FINALIZATION,
-                        status=FileExecutionStageStatus.IN_PROGRESS,
-                    ),
-                )
-                logger.info(f"Tracked finalization stage for {file_name}")
-            except Exception as tracker_error:
-                logger.warning(f"Failed to track finalization stage: {tracker_error}")
-
         destination_result = None
         destination_start_time = time.time()
         logger.info(
@@ -245,6 +230,26 @@ class WorkerWorkflowExecutionService:
             )
             logger.info(f"Destination processing completed for {file_name}")

+            # Mark destination processing as successful
+            if workflow_file_execution_id and workflow_success:
+                try:
+                    tracker = FileExecutionStatusTracker()
+                    tracker.update_stage_status(
+                        execution_id=execution_id,
+                        file_execution_id=workflow_file_execution_id,
+                        stage_status=FileExecutionStageData(
+                            stage=FileExecutionStage.DESTINATION_PROCESSING,
+                            status=FileExecutionStageStatus.SUCCESS,
+                        ),
+                    )
+                    logger.info(
+                        f"Marked destination processing as successful for {file_name}"
+                    )
+                except Exception as tracker_error:
+                    logger.warning(
+                        f"Failed to mark destination processing success: {tracker_error}"
+                    )
+
         except Exception as dest_error:
             logger.error(
                 f"Destination processing failed for {file_name}: {dest_error}",
@@ -306,7 +311,25 @@ class WorkerWorkflowExecutionService:
                destination_result and not destination_result.error
            )

+           # Stage flow: DESTINATION_PROCESSING(2) → FINALIZATION(3) → COMPLETED(4)
            if overall_success:
+               # Mark finalization as successful
+               tracker.update_stage_status(
+                   execution_id=execution_id,
+                   file_execution_id=workflow_file_execution_id,
+                   stage_status=FileExecutionStageData(
+                       stage=FileExecutionStage.FINALIZATION,
+                       status=FileExecutionStageStatus.SUCCESS,
+                   ),
+               )
+               logger.info(f"Marked finalization as successful for {file_name}")
+
+               # Use shorter TTL for COMPLETED stage to optimize Redis memory
+               completed_ttl = int(
+                   os.environ.get(
+                       "FILE_EXECUTION_TRACKER_COMPLETED_TTL_IN_SECOND", 300
+                   )
+               )
                tracker.update_stage_status(
                    execution_id=execution_id,
                    file_execution_id=workflow_file_execution_id,
@@ -314,6 +337,7 @@ class WorkerWorkflowExecutionService:
                        stage=FileExecutionStage.COMPLETED,
                        status=FileExecutionStageStatus.SUCCESS,
                    ),
+                   ttl_in_second=completed_ttl,
                )
                logger.info(f"Tracked successful completion for {file_name}")
            else:
@@ -340,12 +364,6 @@ class WorkerWorkflowExecutionService:
                    file_execution_id=workflow_file_execution_id,
                )

-           # Clean up file execution tracker (log data then delete)
-           self._cleanup_file_execution_tracker(
-               execution_id=execution_id,
-               file_execution_id=workflow_file_execution_id,
-           )
-
        except Exception as tracker_error:
            logger.warning(f"Failed to track final completion stage: {tracker_error}")

@@ -402,50 +420,6 @@ class WorkerWorkflowExecutionService:
                 f"Failed to initialize file execution tracker for {execution_id}/{file_execution_id}: {e}"
             )

-    def _cleanup_file_execution_tracker(
-        self,
-        execution_id: str,
-        file_execution_id: str,
-    ) -> None:
-        """Clean up file execution tracker after processing completes.
-
-        Logs file execution data for debugging purposes before cleanup.
-        """
-        try:
-            tracker = FileExecutionStatusTracker()
-
-            # Get current file execution data for logging before cleanup
-            file_execution_data = tracker.get_data(execution_id, file_execution_id)
-
-            if file_execution_data:
-                # Log file execution data for debugging purposes
-                logger.info(
-                    f"File execution tracker data before cleanup - "
-                    f"execution_id: {execution_id}, file_execution_id: {file_execution_id}, "
-                    f"stage: {file_execution_data.stage_status.stage.value}, "
-                    f"status: {file_execution_data.stage_status.status.value}, "
-                    f"organization_id: {file_execution_data.organization_id}, "
-                    f"error: {file_execution_data.stage_status.error or 'None'}"
-                )
-
-                # Actually delete file execution tracker data from Redis
-                tracker.delete_data(execution_id, file_execution_id)
-                logger.info(
-                    f"Deleted file execution tracker for execution_id: {execution_id}, "
-                    f"file_execution_id: {file_execution_id}"
-                )
-            else:
-                logger.debug(
-                    f"No file execution tracker data found for execution_id: {execution_id}, "
-                    f"file_execution_id: {file_execution_id}"
-                )
-
-        except Exception as e:
-            # Non-critical - log and continue
-            logger.warning(
-                f"Failed to cleanup file execution tracker for {execution_id}/{file_execution_id}: {e}"
-            )
-
     def _cleanup_tool_execution_tracker(
         self,
         execution_id: str,