Merge pull request #2550 from danielaskdd/docx-extraction

Refactor: Change DOCX extraction to use HTML tags for whitespace
This commit is contained in:
Daniel.y
2025-12-28 15:45:15 +08:00
committed by GitHub

View File

@@ -1015,10 +1015,10 @@ def _extract_docx(file_bytes: bytes) -> str:
# CRITICAL: Escape backslash first to avoid double-escaping
return (
text.replace("\\", "\\\\") # Must be first: \ -> \\
.replace("\t", "\\t") # Tab -> \t (visible)
.replace("\r\n", "\\n") # Windows newline -> \n
.replace("\r", "\\n") # Mac newline -> \n
.replace("\n", "\\n") # Unix newline -> \n
.replace("\t", "  ") # Tab -> \t (visible)
.replace("\r\n", "<br>") # Windows newline -> <br>
.replace("\r", "<br>") # Mac newline -> <br>
.replace("\n", "<br>") # Unix newline -> <br>
)
content_parts = []