UN-2893 [FIX] Fix duplicate process handling status updates and UI error logs (#1594)

* UN-2893 [FIX] Fix duplicate process handling status updates and UI error logs

Prevent duplicate worker processes from updating file execution status
or emitting UI error logs when a GKE race condition hands the same file
to two workers.

- Added is_duplicate_skip flag to the FileProcessingResult dataclass
- Fixed the destination_processed default value so duplicates are detected correctly
- Skip status updates and UI logs when a duplicate is detected
- Only the first worker updates status; the duplicate worker exits silently (see the sketch after this list)
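
For illustration, a minimal sketch of the pattern described above. The names FileResult and handle_result are hypothetical stand-ins for the real worker types, not the project's API:

    from dataclasses import dataclass

    @dataclass
    class FileResult:  # hypothetical stand-in for FileProcessingResult
        file_name: str
        destination_processed: bool = True
        destination_error: str | None = None
        # Set by the worker that finds another worker already handling the file
        is_duplicate_skip: bool = False

    def handle_result(result: FileResult) -> None:  # hypothetical callback
        if result.is_duplicate_skip:
            # Duplicate worker: no DB status update, no counters, no UI error log
            return
        # First worker: perform the usual status updates and counter increments
        ...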

* logger.error converted to logger.exception

* Switch error logs from logger.error to logger.exception

Author: ali
Date: 2025-10-16 10:50:39 +05:30
Committed by: GitHub
Parent: 464fbb30b6
Commit: 8d6c7e20f9
5 changed files with 100 additions and 54 deletions

View File

@@ -786,6 +786,9 @@ class FileProcessingResult:
     destination_processed: bool = True
     destination_error: str | None = None
+    # Duplicate detection indicator
+    is_duplicate_skip: bool = False  # True when file skipped due to duplicate detection
 
     def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary for backward compatibility."""
         return serialize_dataclass_to_dict(self)
@@ -807,6 +810,7 @@ class FileProcessingResult:
             review_result=data.get("review_result"),
             destination_processed=data.get("destination_processed", True),
             destination_error=data.get("destination_error"),
+            is_duplicate_skip=data.get("is_duplicate_skip", False),
         )
 
     def is_successful(self) -> bool:

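A quick sketch of why the from_dict default matters: result dictionaries produced before this field existed still deserialize cleanly. Here dataclasses.asdict stands in for the project's serialize_dataclass_to_dict helper, and SimpleResult is a hypothetical reduced version of the dataclass:

    from dataclasses import asdict, dataclass
    from typing import Any

    @dataclass
    class SimpleResult:  # hypothetical, reduced FileProcessingResult
        file_name: str
        is_duplicate_skip: bool = False

        @classmethod
        def from_dict(cls, data: dict[str, Any]) -> "SimpleResult":
            # A missing key (older payload) falls back to False, i.e. "not a duplicate"
            return cls(
                file_name=data["file_name"],
                is_duplicate_skip=data.get("is_duplicate_skip", False),
            )

    new_payload = asdict(SimpleResult(file_name="a.pdf", is_duplicate_skip=True))
    old_payload = {"file_name": "a.pdf"}  # produced before the field existed

    assert SimpleResult.from_dict(new_payload).is_duplicate_skip is True
    assert SimpleResult.from_dict(old_payload).is_duplicate_skip is False
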
View File

@@ -760,6 +760,21 @@ def _handle_file_processing_result(
         )
         return
 
+    # CRITICAL: Handle duplicate skip - no DB updates, silent skip
+    # When is_duplicate_skip=True, another worker is already processing this file
+    # We should skip ALL processing and not update any database status or counters
+    if getattr(file_execution_result, "is_duplicate_skip", False):
+        # Enhanced debug log with full context for internal debugging
+        logger.info(
+            f"DUPLICATE SKIP: File '{file_name}' skipped as duplicate - another worker is processing it. "
+            f"execution_id={execution_id}, workflow_id={workflow_id}, "
+            f"file_execution_id={file_execution_id}, celery_task_id={celery_task_id}. "
+            f"No DB status updates, no counter increments - silent skip to avoid interfering with active worker. "
+            f"First worker will handle all status updates and counter increments."
+        )
+        # Exit early without any DB updates - the first worker will handle all updates
+        return
+
     # Calculate execution time
     file_execution_time = _calculate_execution_time(file_name, file_start_time)

View File

@@ -358,10 +358,8 @@ class WorkflowExecutionProcessor:
                     logger.info(
                         f"Updated file execution {context.workflow_file_execution_id} status to ERROR"
                     )
-                except Exception as status_error:
-                    logger.error(
-                        f"Failed to update file execution status: {status_error}"
-                    )
+                except Exception:
+                    logger.exception("Failed to update file execution status")
 
                 return FileProcessingResult(
                     file_name=context.file_name,
@@ -380,6 +378,21 @@ class WorkflowExecutionProcessor:
f"✓ Workflow execution completed successfully for {context.file_name}"
)
# Check if this was a duplicate skip (destination not processed due to duplicate detection)
destination_processed = (
execution_result.metadata.destination_processed
if execution_result.metadata
else False
)
destination_error = (
execution_result.metadata.destination_error
if execution_result.metadata
else None
)
# Duplicate detection: destination not processed AND no error
is_duplicate = not destination_processed and not destination_error
return FileProcessingResult(
file_name=context.file_name,
file_execution_id=context.workflow_file_execution_id,
@@ -390,12 +403,9 @@ class WorkflowExecutionProcessor:
                 if execution_result.metadata
                 else {},
                 execution_time=context.get_processing_duration(),
-                destination_processed=execution_result.metadata.destination_processed
-                if execution_result.metadata
-                else True,
-                destination_error=execution_result.metadata.destination_error
-                if execution_result.metadata
-                else None,
+                destination_processed=destination_processed,
+                destination_error=destination_error,
+                is_duplicate_skip=is_duplicate,
             )
 
         except Exception as execution_error:
@@ -597,37 +607,47 @@ class FileProcessor:
             # Check if destination processing failed
             destination_error = workflow_result.destination_error
             destination_processed = workflow_result.destination_processed
+            is_duplicate_skip = getattr(workflow_result, "is_duplicate_skip", False)
 
             if destination_error or not destination_processed:
-                # Log destination failure
-                error_msg = destination_error or "Destination processing failed"
-                log_file_error(
-                    workflow_logger,
-                    workflow_file_execution_id,
-                    f"❌ File '{context.file_name}' destination processing failed: {error_msg}",
-                )
-                # Update file execution status to ERROR
-                logger.info(
-                    f"Updating file execution status to ERROR for {context.workflow_file_execution_id} due to destination failure"
-                )
-                try:
-                    context.api_client.update_file_execution_status(
-                        file_execution_id=context.workflow_file_execution_id,
-                        status=ExecutionStatus.ERROR.value,
-                        error_message=error_msg,
-                    )
-                    logger.info(
-                        f"Updated file execution {context.workflow_file_execution_id} status to ERROR"
-                    )
-                except Exception as status_error:
-                    logger.error(
-                        f"Failed to update file execution status: {status_error}"
-                    )
-                # Update workflow result since destination failed
-                workflow_result.success = False
-                workflow_result.error = error_msg
+                # Skip UI error log and DB updates for duplicates (internal race condition, not user error)
+                if not is_duplicate_skip:
+                    # Log destination failure to UI
+                    error_msg = destination_error or "Destination processing failed"
+                    log_file_error(
+                        workflow_logger,
+                        workflow_file_execution_id,
+                        f"❌ File '{context.file_name}' destination processing failed: {error_msg}",
+                    )
+                    # Update file execution status to ERROR
+                    logger.info(
+                        f"Updating file execution status to ERROR for {context.workflow_file_execution_id} due to destination failure"
+                    )
+                    try:
+                        context.api_client.update_file_execution_status(
+                            file_execution_id=context.workflow_file_execution_id,
+                            status=ExecutionStatus.ERROR.value,
+                            error_message=error_msg,
+                        )
+                        logger.info(
+                            f"Updated file execution {context.workflow_file_execution_id} status to ERROR"
+                        )
+                    except Exception:
+                        logger.exception("Failed to update file execution status")
+                    # Update workflow result since destination failed
+                    workflow_result.success = False
+                    workflow_result.error = error_msg
+                else:
+                    # Debug log for duplicate detection (internal use only)
+                    logger.info(
+                        f"DUPLICATE SKIP: File '{context.file_name}' identified as duplicate in processor. "
+                        f"destination_processed=False, destination_error=None, "
+                        f"file_execution_id={context.workflow_file_execution_id}. "
+                        f"Skipping UI error log and status updates - this is an internal race condition, not a user-facing error. "
+                        f"First worker will handle all status updates."
+                    )
             else:
                 log_file_info(
                     workflow_logger,
@@ -639,7 +659,7 @@ class FileProcessor:
             return workflow_result
 
         except Exception as e:
-            logger.error(f"File processing failed for {context.file_name}: {e}")
+            logger.exception(f"File processing failed for {context.file_name}")
 
             # Send file processing error log to UI
             log_file_processing_error(

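The detection rule added above reduces to a single predicate: destination not processed and no destination error means another worker already handled the file, while anything carrying a destination error is a genuine failure. A hedged sketch, with classify_destination_outcome as an illustrative helper name only:

    def classify_destination_outcome(
        destination_processed: bool, destination_error: str | None
    ) -> str:
        """Illustrative helper mirroring the checks above; not part of the codebase."""
        if destination_processed and not destination_error:
            return "success"  # destination written normally
        if not destination_processed and not destination_error:
            return "duplicate_skip"  # another worker handled the file; stay silent
        return "error"  # genuine destination failure -> ERROR status + UI log

    assert classify_destination_outcome(True, None) == "success"
    assert classify_destination_outcome(False, None) == "duplicate_skip"
    assert classify_destination_outcome(False, "timeout") == "error"
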
View File

@@ -343,15 +343,14 @@ class WorkerDestinationConnector:
                 FileExecutionStage.FINALIZATION,
                 FileExecutionStage.COMPLETED,
             ]:
-                logger.warning(
-                    f"Destination already processed for file '{file_ctx.file_name}' "
-                    f"at stage {current_stage.value}. Skipping duplicate processing."
-                )
-                log_file_info(
-                    exec_ctx.workflow_log,
-                    exec_ctx.file_execution_id,
-                    f"File '{file_ctx.file_name}' destination already processed - skipping duplicate",
-                )
+                # Enhanced debug log with full context for internal debugging
+                logger.warning(
+                    f"DUPLICATE DETECTION: File '{file_ctx.file_name}' destination already at stage "
+                    f"{current_stage.value}. execution_id={exec_ctx.execution_id}, "
+                    f"file_execution_id={exec_ctx.file_execution_id}, worker_pid={os.getpid()}. "
+                    f"Skipping duplicate processing - another worker completed this already."
+                )
+                # UI log removed - duplication is internal detail, not user-facing error
                 return False  # Duplicate detected
 
             # Atomically acquire lock using Redis SET NX
@@ -381,16 +380,14 @@ class WorkerDestinationConnector:
             )
 
             if not lock_acquired:
                 # Another worker already has the lock
-                logger.warning(
-                    f"Lock acquisition failed for file '{file_ctx.file_name}': "
-                    f"Another worker holds the lock. Skipping duplicate processing."
-                )
-                log_file_info(
-                    exec_ctx.workflow_log,
-                    exec_ctx.file_execution_id,
-                    f"File '{file_ctx.file_name}' destination lock held by another worker - skipping duplicate",
-                )
+                # Enhanced debug log with full context for internal debugging
+                logger.warning(
+                    f"DUPLICATE DETECTION: Lock acquisition failed for file '{file_ctx.file_name}'. "
+                    f"execution_id={exec_ctx.execution_id}, file_execution_id={exec_ctx.file_execution_id}, "
+                    f"lock_key={lock_key}, worker_pid={os.getpid()}. "
+                    f"Another worker holds the lock - skipping duplicate processing."
+                )
+                # UI log removed - duplication is internal detail, not user-facing error
                 return False
 
             # Lock acquired successfully - now set DESTINATION_PROCESSING stage

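For context on the lock used above: Redis SET with NX and EX is a single atomic command that succeeds for exactly one caller and expires automatically, so a crashed worker cannot hold the lock forever. A minimal sketch of the pattern, assuming the redis-py client and an illustrative key format rather than the connector's real implementation:

    import os

    import redis  # assumption: redis-py client, as implied by "Redis SET NX" above

    def try_acquire_destination_lock(
        client: redis.Redis, file_execution_id: str, ttl_seconds: int = 300
    ) -> bool:
        lock_key = f"dest-lock:{file_execution_id}"  # illustrative key format
        # SET ... NX EX only succeeds if the key does not exist yet, so at most
        # one worker gets True per file execution; EX bounds how long it is held.
        acquired = client.set(lock_key, str(os.getpid()), nx=True, ex=ttl_seconds)
        return bool(acquired)

The worker that fails to acquire the lock returns False and skips destination processing, which is what the "if not lock_acquired" branch above does.
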
View File

@@ -528,13 +528,19 @@ class WorkerWorkflowExecutionService:
                 final_error = destination_result.error
 
             # Build metadata
+            # CRITICAL: Check destination_result.processed (not just existence) to detect duplicate skips
+            # destination_result.processed=False means duplicate was detected and skipped
             metadata = WorkflowExecutionMetadata(
                 workflow_id=workflow_id,
                 execution_id=execution_id,
                 execution_time=execution_time,
                 tool_count=tool_count,
                 workflow_executed=workflow_success,
-                destination_processed=destination_result is not None,
+                destination_processed=(
+                    getattr(destination_result, "processed", True)
+                    if destination_result
+                    else False
+                ),
                 destination_error=destination_result.error if destination_result else None,
             )
@@ -1156,9 +1162,13 @@ class WorkerWorkflowExecutionService:
             # Check if handle_output returned None (duplicate detected)
             if handle_output_result is None:
+                # Enhanced debug log with full context for internal debugging
                 logger.info(
-                    f"Duplicate detected for {file_hash.file_name} - "
-                    f"skipping file history and all downstream processing"
+                    f"DUPLICATE SKIP: File '{file_hash.file_name}' duplicate detected at destination phase. "
+                    f"execution_id={execution_id}, file_execution_id={workflow_file_execution_id}, "
+                    f"workflow_id={workflow_id}. "
+                    f"Returning FinalOutputResult with processed=False, no file history will be created. "
+                    f"This is an internal race condition, not an error."
                 )
+                # Return special result to indicate duplicate (not an error, not processed)
                 return FinalOutputResult(