Change DOCX extraction to use HTML tags for whitespace

- Replace tabs with HTML em spaces
- Convert all newlines to break tags
This commit is contained in:
yangdx
2025-12-28 15:16:59 +08:00
parent 3651c09c5a
commit 4ef52ec695

View File

@@ -1015,10 +1015,10 @@ def _extract_docx(file_bytes: bytes) -> str:
# CRITICAL: Escape backslash first to avoid double-escaping
return (
text.replace("\\", "\\\\") # Must be first: \ -> \\
.replace("\t", "\\t") # Tab -> \t (visible)
.replace("\r\n", "\\n") # Windows newline -> \n
.replace("\r", "\\n") # Mac newline -> \n
.replace("\n", "\\n") # Unix newline -> \n
.replace("\t", "  ") # Tab -> \t (visible)
.replace("\r\n", "<br>") # Windows newline -> <br>
.replace("\r", "<br>") # Mac newline -> <br>
.replace("\n", "<br>") # Unix newline -> <br>
)
content_parts = []