From 3efb1716b438091b3725a2e0b07f35eb12857ce7 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 19 Nov 2025 03:06:29 +0800
Subject: [PATCH 1/6] Enhance XLSX extraction with structured tab-delimited
 format and escaping

- Add clear sheet separators
- Escape special characters
- Trim trailing empty columns
- Preserve row structure
- Single-pass optimization
---
 lightrag/api/routers/document_routes.py | 103 +++++++++++++++++++++---
 1 file changed, 93 insertions(+), 10 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index dd6d7fd8..1e770520 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -1052,27 +1052,110 @@ def _extract_pptx(file_bytes: bytes) -> str:
 
 
 def _extract_xlsx(file_bytes: bytes) -> str:
-    """Extract XLSX content (synchronous).
+    """Extract XLSX content in tab-delimited format with clear sheet separation.
+
+    This function processes Excel workbooks and converts them to a structured text format
+    suitable for LLM prompts and RAG systems. Each sheet is clearly delimited with
+    separator lines, and special characters are escaped to preserve the tab-delimited structure.
+
+    Features:
+    - Each sheet is wrapped with '====================' separators for visual distinction
+    - Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption
+    - Trailing empty columns are trimmed per row to reduce token waste
+    - Empty rows are preserved as blank lines to maintain row structure
+    - Single-pass optimization for better performance on large spreadsheets
 
     Args:
         file_bytes: XLSX file content as bytes
 
     Returns:
-        str: Extracted text content
+        str: Extracted text content with all sheets in tab-delimited format.
+             Format: Sheet separators, sheet name, then tab-delimited rows.
+
+    Example output:
+        ==================== Sheet: Data ====================
+        Name\tAge\tCity
+        Alice\t30\tNew York
+        Bob\t25\tLondon
+
+        ==================== Sheet: Summary ====================
+        Total\t2
+        ====================
     """
     from openpyxl import load_workbook  # type: ignore
 
     xlsx_file = BytesIO(file_bytes)
     wb = load_workbook(xlsx_file)
-    content = ""
-    for sheet in wb:
-        content += f"Sheet: {sheet.title}\n"
+
+    def escape_cell(cell_value: str | int | float | None) -> str:
+        """Render one cell as text that cannot break the tab-delimited layout.
+
+        Backslashes are doubled before tabs/newlines are rewritten, so the
+        backslashes added by the later replacements are never escaped again.
+
+        Args:
+            cell_value: Raw cell value; None yields an empty string, others use str()
+
+        Returns:
+            str: Text with backslash, tab, CR and LF replaced by escape sequences
+        """
+        if cell_value is None:
+            return ""
+        text = str(cell_value)
+        # Order matters: double existing backslashes before introducing new ones
+        return (
+            text.replace("\\", "\\\\")  # Must be first: \ -> \\
+            .replace("\t", "\\t")  # Tab -> backslash + "t" (visible)
+            .replace("\r\n", "\\n")  # CRLF first, so it becomes one \n, not two
+            .replace("\r", "\\n")  # Remaining bare CR -> \n
+            .replace("\n", "\\n")  # Remaining bare LF -> \n
+        )
+
+    def escape_sheet_title(title: str) -> str:
+        """Flatten a sheet title onto one line for use in the separator header.
+
+        Args:
+            title: Original sheet title
+
+        Returns:
+            str: Title with each tab, CR and LF character replaced by a space
+        """
+        return str(title).replace("\n", " ").replace("\t", " ").replace("\r", " ")
+
+    content_parts: list[str] = []
+    sheet_separator = "=" * 20
+
+    for idx, sheet in enumerate(wb):
+        if idx > 0:
+            content_parts.append("")  # Blank line between sheets for readability
+
+        # Escape sheet title to handle edge cases with special characters
+        safe_title = escape_sheet_title(sheet.title)
+        content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}")
+
+        # Single-pass optimization: escape and trim in one iteration
         for row in sheet.iter_rows(values_only=True):
-            content += (
-                "\t".join(str(cell) if cell is not None else "" for cell in row) + "\n"
-            )
-        content += "\n"
-    return content
+            row_parts = []
+            last_nonempty_idx = -1
+
+            # Build escaped row while tracking last non-empty cell position
+            for idx, cell in enumerate(row):
+                escaped = escape_cell(cell)
+                row_parts.append(escaped)
+                if escaped != "":
+                    last_nonempty_idx = idx
+
+            # Handle completely empty rows vs rows with data
+            if last_nonempty_idx == -1:
+                # Preserve empty rows as blank lines (maintains row structure)
+                content_parts.append("")
+            else:
+                # Only join up to last non-empty cell (trim trailing empties)
+                content_parts.append("\t".join(row_parts[: last_nonempty_idx + 1]))
+
+    # Final separator for symmetry (makes parsing easier)
+    content_parts.append(sheet_separator)
+    return "\n".join(content_parts)
 
 
 async def pipeline_enqueue_file(

From ef659a1e09ed0f5b5e650728b182ec49f93c4c38 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 19 Nov 2025 03:34:22 +0800
Subject: [PATCH 2/6] Preserve column alignment in XLSX extraction with
 two-pass processing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Two-pass approach for consistent width
• Maintain tabular structure integrity
• Determine max columns in first pass
• Extract with alignment in second pass
• Prevent column misalignment issues
---
 lightrag/api/routers/document_routes.py | 42 ++++++++++++++++---------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index 1e770520..14e03f5f 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -1061,9 +1061,9 @@ def _extract_xlsx(file_bytes: bytes) -> str:
     Features:
     - Each sheet is wrapped with '====================' separators for visual distinction
     - Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption
-    - Trailing empty columns are trimmed per row to reduce token waste
+    - Column alignment is preserved across all rows to maintain tabular structure
     - Empty rows are preserved as blank lines to maintain row structure
-    - Single-pass optimization for better performance on large spreadsheets
+    - Two-pass processing: determines max column width, then extracts with consistent alignment
 
     Args:
         file_bytes: XLSX file content as bytes
@@ -1133,25 +1133,39 @@ def _extract_xlsx(file_bytes: bytes) -> str:
         safe_title = escape_sheet_title(sheet.title)
         content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}")
 
-        # Single-pass optimization: escape and trim in one iteration
-        for row in sheet.iter_rows(values_only=True):
-            row_parts = []
-            last_nonempty_idx = -1
+        # Two-pass approach to preserve column alignment:
+        # Pass 1: Determine the maximum column width for this sheet
+        max_columns = 0
+        all_rows = list(sheet.iter_rows(values_only=True))
 
-            # Build escaped row while tracking last non-empty cell position
+        for row in all_rows:
+            last_nonempty_idx = -1
             for idx, cell in enumerate(row):
-                escaped = escape_cell(cell)
-                row_parts.append(escaped)
-                if escaped != "":
+                # Check if cell has meaningful content (not None or empty string)
+                if cell is not None and str(cell).strip():
                     last_nonempty_idx = idx
 
-            # Handle completely empty rows vs rows with data
-            if last_nonempty_idx == -1:
+            if last_nonempty_idx >= 0:
+                max_columns = max(max_columns, last_nonempty_idx + 1)
+
+        # Pass 2: Extract rows with consistent width to preserve column alignment
+        for row in all_rows:
+            row_parts = []
+
+            # Build row up to max_columns width
+            for idx in range(max_columns):
+                if idx < len(row):
+                    row_parts.append(escape_cell(row[idx]))
+                else:
+                    row_parts.append("")  # Pad short rows
+
+            # Check if row is completely empty
+            if all(part == "" for part in row_parts):
                 # Preserve empty rows as blank lines (maintains row structure)
                 content_parts.append("")
             else:
-                # Only join up to last non-empty cell (trim trailing empties)
-                content_parts.append("\t".join(row_parts[: last_nonempty_idx + 1]))
+                # Join all columns to maintain consistent column count
+                content_parts.append("\t".join(row_parts))
 
     # Final separator for symmetry (makes parsing easier)
     content_parts.append(sheet_separator)

From 2b160163120bc8310babbb84887d07c8089ac7c7 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 19 Nov 2025 03:48:36 +0800
Subject: [PATCH 3/6] Optimize XLSX extraction to avoid storing all rows in
 memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Remove intermediate row storage
• Use iterator twice instead of list()
• Preserve column alignment logic
• Reduce memory footprint
• Maintain same output format
---
 lightrag/api/routers/document_routes.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index 14e03f5f..a4efcacd 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -1133,12 +1133,10 @@ def _extract_xlsx(file_bytes: bytes) -> str:
         safe_title = escape_sheet_title(sheet.title)
         content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}")
 
-        # Two-pass approach to preserve column alignment:
-        # Pass 1: Determine the maximum column width for this sheet
+        # Two-pass approach to preserve column alignment without storing rows in memory:
+        # Pass 1: Scan to determine the maximum column width (memory-efficient)
         max_columns = 0
-        all_rows = list(sheet.iter_rows(values_only=True))
-
-        for row in all_rows:
+        for row in sheet.iter_rows(values_only=True):
             last_nonempty_idx = -1
             for idx, cell in enumerate(row):
                 # Check if cell has meaningful content (not None or empty string)
@@ -1149,7 +1147,7 @@ def _extract_xlsx(file_bytes: bytes) -> str:
                 max_columns = max(max_columns, last_nonempty_idx + 1)
 
         # Pass 2: Extract rows with consistent width to preserve column alignment
-        for row in all_rows:
+        for row in sheet.iter_rows(values_only=True):
             row_parts = []
 
             # Build row up to max_columns width

From 0244699d81a551cf5c5ba3a936709eb54fbd1968 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 19 Nov 2025 04:02:39 +0800
Subject: [PATCH 4/6] Optimize XLSX extraction by using sheet.max_column
 instead of two-pass scan
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Remove two-pass row scanning approach
• Use built-in sheet.max_column property
• Simplify column width detection logic
• Improve memory efficiency
• Maintain column alignment preservation
---
 lightrag/api/routers/document_routes.py | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index a4efcacd..5775c4da 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -1133,20 +1133,10 @@ def _extract_xlsx(file_bytes: bytes) -> str:
         safe_title = escape_sheet_title(sheet.title)
         content_parts.append(f"{sheet_separator} Sheet: {safe_title} {sheet_separator}")
 
-        # Two-pass approach to preserve column alignment without storing rows in memory:
-        # Pass 1: Scan to determine the maximum column width (memory-efficient)
-        max_columns = 0
-        for row in sheet.iter_rows(values_only=True):
-            last_nonempty_idx = -1
-            for idx, cell in enumerate(row):
-                # Check if cell has meaningful content (not None or empty string)
-                if cell is not None and str(cell).strip():
-                    last_nonempty_idx = idx
+        # Use sheet.max_column to get the maximum column width directly
+        max_columns = sheet.max_column if sheet.max_column else 0
 
-            if last_nonempty_idx >= 0:
-                max_columns = max(max_columns, last_nonempty_idx + 1)
-
-        # Pass 2: Extract rows with consistent width to preserve column alignment
+        # Extract rows with consistent width to preserve column alignment
         for row in sheet.iter_rows(values_only=True):
             row_parts = []
 

From 87de2b3e9e5d455a84bb93b2492ce4673d3faed9 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 19 Nov 2025 04:26:41 +0800
Subject: [PATCH 5/6] Update XLSX extraction documentation to reflect current
 implementation

---
 lightrag/api/routers/document_routes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index 5775c4da..8839811c 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -1063,7 +1063,7 @@ def _extract_xlsx(file_bytes: bytes) -> str:
     - Special characters (tabs, newlines, backslashes) are escaped to prevent structure corruption
     - Column alignment is preserved across all rows to maintain tabular structure
     - Empty rows are preserved as blank lines to maintain row structure
-    - Two-pass processing: determines max column width, then extracts with consistent alignment
+    - Uses sheet.max_column to determine column width efficiently
 
     Args:
         file_bytes: XLSX file content as bytes

From 95cd0ece7424100b0fe2e4109c873ca23710fd0f Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 19 Nov 2025 09:54:35 +0800
Subject: [PATCH 6/6] Fix DOCX table extraction by escaping special characters
 in cells

- Add escape_cell() function
- Escape backslashes first
- Handle tabs and newlines
- Preserve tab-delimited format
- Prevent double-escaping issues
---
 lightrag/api/routers/document_routes.py | 28 +++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/lightrag/api/routers/document_routes.py b/lightrag/api/routers/document_routes.py
index 8839811c..85183bbd 100644
--- a/lightrag/api/routers/document_routes.py
+++ b/lightrag/api/routers/document_routes.py
@@ -992,6 +992,30 @@ def _extract_docx(file_bytes: bytes) -> str:
     docx_file = BytesIO(file_bytes)
     doc = Document(docx_file)
 
+    def escape_cell(cell_value: str | None) -> str:
+        """Render one table cell as text that cannot break the tab-delimited layout.
+
+        Backslashes are doubled before tabs/newlines are rewritten, so the
+        backslashes added by the later replacements are never escaped again.
+
+        Args:
+            cell_value: Cell text; None yields an empty string
+
+        Returns:
+            str: Text with backslash, tab, CR and LF replaced by escape sequences
+        """
+        if cell_value is None:
+            return ""
+        text = str(cell_value)
+        # NOTE(review): same chain as escape_cell in _extract_xlsx; consider sharing
+        return (
+            text.replace("\\", "\\\\")  # Must be first: \ -> \\
+            .replace("\t", "\\t")  # Tab -> backslash + "t" (visible)
+            .replace("\r\n", "\\n")  # CRLF first, so it becomes one \n, not two
+            .replace("\r", "\\n")  # Remaining bare CR -> \n
+            .replace("\n", "\\n")  # Remaining bare LF -> \n
+        )
+
     content_parts = []
     in_table = False  # Track if we're currently processing a table
 
@@ -1021,8 +1045,8 @@ def _extract_docx(file_bytes: bytes) -> str:
                 row_text = []
                 for cell in row.cells:
                     cell_text = cell.text
-                    # Always append cell text to preserve column structure
-                    row_text.append(cell_text)
+                    # Escape special characters to preserve tab-delimited structure
+                    row_text.append(escape_cell(cell_text))
                 # Only add row if at least one cell has content
                 if any(cell for cell in row_text):
                     content_parts.append("\t".join(row_text))