From 3efb1716b438091b3725a2e0b07f35eb12857ce7 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 03:06:29 +0800 Subject: [PATCH 1/6] Enhance XLSX extraction with structured tab-delimited format and escaping - Add clear sheet separators - Escape special characters - Trim trailing empty columns - Preserve row structure - Single-pass optimization --- lightrag/api/routers/document_routes.py | 103 +++++++++++++++++++++--- 1 file changed, 93 insertions(+), 10 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index dd6d7fd8..1e770520 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1052,27 +1052,110 @@ def _extract_pptx(file_bytes: bytes) -> str: def _extract_xlsx(file_bytes: bytes) -> str: - """Extract XLSX content (synchronous). + """Extract XLSX content in tab-delimited format with clear sheet separation. + + This function processes Excel workbooks and converts them to a structured text format + suitable for LLM prompts and RAG systems. Each sheet is clearly delimited with + separator lines, and special characters are escaped to preserve the tab-delimited structure. + + Features: + - Each sheet is wrapped with '====================' separators for visual distinction + - Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption + - Trailing empty columns are trimmed per row to reduce token waste + - Empty rows are preserved as blank lines to maintain row structure + - Single-pass optimization for better performance on large spreadsheets Args: file_bytes: XLSX file content as bytes Returns: - str: Extracted text content + str: Extracted text content with all sheets in tab-delimited format. + Format: Sheet separators, sheet name, then tab-delimited rows. + + Example output: + ==================== Sheet: Data ==================== + Name\tAge\tCity + Alice\t30\tNew York + Bob\t25\tLondon + + ==================== Sheet: Summary ==================== + Total\t2 + ==================== """ from openpyxl import load_workbook # type: ignore xlsx_file = BytesIO(file_bytes) wb = load_workbook(xlsx_file) - content = "" - for sheet in wb: - content += f"Sheet: {sheet.title}\n" + + def escape_cell(cell_value: str | int | float | None) -> str: + """Escape characters that would break tab-delimited layout. + + Escape order is critical: backslashes first, then tabs/newlines. + This prevents double-escaping issues. + + Args: + cell_value: The cell value to escape (can be None, str, int, or float) + + Returns: + str: Escaped cell value safe for tab-delimited format + """ + if cell_value is None: + return "" + text = str(cell_value) + # CRITICAL: Escape backslash first to avoid double-escaping + return ( + text.replace("\\", "\\\\") # Must be first: \ -> \\ + .replace("\t", "\\t") # Tab -> \t (visible) + .replace("\r\n", "\\n") # Windows newline -> \n + .replace("\r", "\\n") # Mac newline -> \n + .replace("\n", "\\n") # Unix newline -> \n + ) + + def escape_sheet_title(title: str) -> str: + """Escape sheet title to prevent formatting issues in separators. + + Args: + title: Original sheet title + + Returns: + str: Sanitized sheet title with tabs/newlines replaced + """ + return str(title).replace("\n", " ").replace("\t", " ").replace("\r", " ") + + content_parts: list[str] = [] + sheet_separator = "=" * 20 + + for idx, sheet in enumerate(wb): + if idx > 0: + content_parts.append("") # Blank line between sheets for readability + + # Escape sheet title to handle edge cases with special characters + safe_title = escape_sheet_title(sheet.title) + content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}") + + # Single-pass optimization: escape and trim in one iteration for row in sheet.iter_rows(values_only=True): - content += ( - "\t".join(str(cell) if cell is not None else "" for cell in row) + "\n" - ) - content += "\n" - return content + row_parts = [] + last_nonempty_idx = -1 + + # Build escaped row while tracking last non-empty cell position + for idx, cell in enumerate(row): + escaped = escape_cell(cell) + row_parts.append(escaped) + if escaped != "": + last_nonempty_idx = idx + + # Handle completely empty rows vs rows with data + if last_nonempty_idx == -1: + # Preserve empty rows as blank lines (maintains row structure) + content_parts.append("") + else: + # Only join up to last non-empty cell (trim trailing empties) + content_parts.append("\t".join(row_parts[: last_nonempty_idx + 1])) + + # Final separator for symmetry (makes parsing easier) + content_parts.append(sheet_separator) + return "\n".join(content_parts) async def pipeline_enqueue_file( From ef659a1e09ed0f5b5e650728b182ec49f93c4c38 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 03:34:22 +0800 Subject: [PATCH 2/6] Preserve column alignment in XLSX extraction with two-pass processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Two-pass approach for consistent width • Maintain tabular structure integrity • Determine max columns first pass • Extract with alignment second pass • Prevent column misalignment issues --- lightrag/api/routers/document_routes.py | 42 ++++++++++++++++--------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 1e770520..14e03f5f 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1061,9 +1061,9 @@ def _extract_xlsx(file_bytes: bytes) -> str: Features: - Each sheet is wrapped with '====================' separators for visual distinction - Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption - - Trailing empty columns are trimmed per row to reduce token waste + - Column alignment is preserved across all rows to maintain tabular structure - Empty rows are preserved as blank lines to maintain row structure - - Single-pass optimization for better performance on large spreadsheets + - Two-pass processing: determines max column width, then extracts with consistent alignment Args: file_bytes: XLSX file content as bytes @@ -1133,25 +1133,39 @@ def _extract_xlsx(file_bytes: bytes) -> str: safe_title = escape_sheet_title(sheet.title) content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}") - # Single-pass optimization: escape and trim in one iteration - for row in sheet.iter_rows(values_only=True): - row_parts = [] - last_nonempty_idx = -1 + # Two-pass approach to preserve column alignment: + # Pass 1: Determine the maximum column width for this sheet + max_columns = 0 + all_rows = list(sheet.iter_rows(values_only=True)) - # Build escaped row while tracking last non-empty cell position + for row in all_rows: + last_nonempty_idx = -1 for idx, cell in enumerate(row): - escaped = escape_cell(cell) - row_parts.append(escaped) - if escaped != "": + # Check if cell has meaningful content (not None or empty string) + if cell is not None and str(cell).strip(): last_nonempty_idx = idx - # Handle completely empty rows vs rows with data - if last_nonempty_idx == -1: + if last_nonempty_idx >= 0: + max_columns = max(max_columns, last_nonempty_idx + 1) + + # Pass 2: Extract rows with consistent width to preserve column alignment + for row in all_rows: + row_parts = [] + + # Build row up to max_columns width + for idx in range(max_columns): + if idx < len(row): + row_parts.append(escape_cell(row[idx])) + else: + row_parts.append("") # Pad short rows + + # Check if row is completely empty + if all(part == "" for part in row_parts): # Preserve empty rows as blank lines (maintains row structure) content_parts.append("") else: - # Only join up to last non-empty cell (trim trailing empties) - content_parts.append("\t".join(row_parts[: last_nonempty_idx + 1])) + # Join all columns to maintain consistent column count + content_parts.append("\t".join(row_parts)) # Final separator for symmetry (makes parsing easier) content_parts.append(sheet_separator) From 2b160163120bc8310babbb84887d07c8089ac7c7 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 03:48:36 +0800 Subject: [PATCH 3/6] Optimize XLSX extraction to avoid storing all rows in memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Remove intermediate row storage • Use iterator twice instead of list() • Preserve column alignment logic • Reduce memory footprint • Maintain same output format --- lightrag/api/routers/document_routes.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 14e03f5f..a4efcacd 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1133,12 +1133,10 @@ def _extract_xlsx(file_bytes: bytes) -> str: safe_title = escape_sheet_title(sheet.title) content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}") - # Two-pass approach to preserve column alignment: - # Pass 1: Determine the maximum column width for this sheet + # Two-pass approach to preserve column alignment without storing rows in memory: + # Pass 1: Scan to determine the maximum column width (memory-efficient) max_columns = 0 - all_rows = list(sheet.iter_rows(values_only=True)) - - for row in all_rows: + for row in sheet.iter_rows(values_only=True): last_nonempty_idx = -1 for idx, cell in enumerate(row): # Check if cell has meaningful content (not None or empty string) @@ -1149,7 +1147,7 @@ def _extract_xlsx(file_bytes: bytes) -> str: max_columns = max(max_columns, last_nonempty_idx + 1) # Pass 2: Extract rows with consistent width to preserve column alignment - for row in all_rows: + for row in sheet.iter_rows(values_only=True): row_parts = [] # Build row up to max_columns width From 0244699d81a551cf5c5ba3a936709eb54fbd1968 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 04:02:39 +0800 Subject: [PATCH 4/6] Optimize XLSX extraction by using sheet.max_column instead of two-pass scan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Remove two-pass row scanning approach • Use built-in sheet.max_column property • Simplify column width detection logic • Improve memory efficiency • Maintain column alignment preservation --- lightrag/api/routers/document_routes.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index a4efcacd..5775c4da 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1133,20 +1133,10 @@ def _extract_xlsx(file_bytes: bytes) -> str: safe_title = escape_sheet_title(sheet.title) content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}") - # Two-pass approach to preserve column alignment without storing rows in memory: - # Pass 1: Scan to determine the maximum column width (memory-efficient) - max_columns = 0 - for row in sheet.iter_rows(values_only=True): - last_nonempty_idx = -1 - for idx, cell in enumerate(row): - # Check if cell has meaningful content (not None or empty string) - if cell is not None and str(cell).strip(): - last_nonempty_idx = idx + # Use sheet.max_column to get the maximum column width directly + max_columns = sheet.max_column if sheet.max_column else 0 - if last_nonempty_idx >= 0: - max_columns = max(max_columns, last_nonempty_idx + 1) - - # Pass 2: Extract rows with consistent width to preserve column alignment + # Extract rows with consistent width to preserve column alignment for row in sheet.iter_rows(values_only=True): row_parts = [] From 87de2b3e9e5d455a84bb93b2492ce4673d3faed9 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 04:26:41 +0800 Subject: [PATCH 5/6] Update XLSX extraction documentation to reflect current implementation --- lightrag/api/routers/document_routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 5775c4da..8839811c 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -1063,7 +1063,7 @@ def _extract_xlsx(file_bytes: bytes) -> str: - Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption - Column alignment is preserved across all rows to maintain tabular structure - Empty rows are preserved as blank lines to maintain row structure - - Two-pass processing: determines max column width, then extracts with consistent alignment + - Uses sheet.max_column to determine column width efficiently Args: file_bytes: XLSX file content as bytes From 95cd0ece7424100b0fe2e4109c873ca23710fd0f Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 19 Nov 2025 09:54:35 +0800 Subject: [PATCH 6/6] Fix DOCX table extraction by escaping special characters in cells - Add escape_cell() function - Escape backslashes first - Handle tabs and newlines - Preserve tab-delimited format - Prevent double-escaping issues --- lightrag/api/routers/document_routes.py | 28 +++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py index 8839811c..85183bbd 100644 --- a/lightrag/api/routers/document_routes.py +++ b/lightrag/api/routers/document_routes.py @@ -992,6 +992,30 @@ def _extract_docx(file_bytes: bytes) -> str: docx_file = BytesIO(file_bytes) doc = Document(docx_file) + def escape_cell(cell_value: str | None) -> str: + """Escape characters that would break tab-delimited layout. + + Escape order is critical: backslashes first, then tabs/newlines. + This prevents double-escaping issues. + + Args: + cell_value: The cell value to escape (can be None or str) + + Returns: + str: Escaped cell value safe for tab-delimited format + """ + if cell_value is None: + return "" + text = str(cell_value) + # CRITICAL: Escape backslash first to avoid double-escaping + return ( + text.replace("\\", "\\\\") # Must be first: \ -> \\ + .replace("\t", "\\t") # Tab -> \t (visible) + .replace("\r\n", "\\n") # Windows newline -> \n + .replace("\r", "\\n") # Mac newline -> \n + .replace("\n", "\\n") # Unix newline -> \n + ) + content_parts = [] in_table = False # Track if we're currently processing a table @@ -1021,8 +1045,8 @@ def _extract_docx(file_bytes: bytes) -> str: row_text = [] for cell in row.cells: cell_text = cell.text - # Always append cell text to preserve column structure - row_text.append(cell_text) + # Escape special characters to preserve tab-delimited structure + row_text.append(escape_cell(cell_text)) # Only add row if at least one cell has content if any(cell for cell in row_text): content_parts.append("\t".join(row_text))