Change DOCX extraction to use HTML tags for whitespace
- Replace tabs with HTML em spaces
- Convert all newlines to break tags
This commit is contained in:
@@ -1015,10 +1015,10 @@ def _extract_docx(file_bytes: bytes) -> str:
|
||||
# CRITICAL: Escape backslash first to avoid double-escaping
|
||||
return (
|
||||
text.replace("\\", "\\\\") # Must be first: \ -> \\
|
||||
.replace("\t", "\\t") # Tab -> \t (visible)
|
||||
.replace("\r\n", "\\n") # Windows newline -> \n
|
||||
.replace("\r", "\\n") # Mac newline -> \n
|
||||
.replace("\n", "\\n") # Unix newline -> \n
|
||||
.replace("\t", "  ") # Tab -> \t (visible)
|
||||
.replace("\r\n", "<br>") # Windows newline -> <br>
|
||||
.replace("\r", "<br>") # Mac newline -> <br>
|
||||
.replace("\n", "<br>") # Unix newline -> <br>
|
||||
)
|
||||
|
||||
content_parts = []
|
||||
|
||||
Reference in New Issue
Block a user