Add specialized JSON string sanitizer to prevent UTF-8 encoding errors

• Remove surrogate characters (U+D800-DFFF)
• Filter the Unicode non-characters U+FFFE and U+FFFF
• Direct char-by-char filtering
This commit is contained in:
yangdx
2025-11-12 00:38:47 +08:00
parent 23cbb9c9b2
commit 5885637ebf

View File

@@ -927,6 +927,37 @@ def load_json(file_name):
return json.load(f)
def _sanitize_string_for_json(text: str) -> str:
"""Remove characters that cannot be encoded in UTF-8 for JSON serialization.
This is a simpler sanitizer specifically for JSON that directly removes
problematic characters without attempting to encode first.
Args:
text: String to sanitize
Returns:
Sanitized string safe for UTF-8 encoding in JSON
"""
if not text:
return text
# Directly filter out problematic characters without pre-validation
sanitized = ""
for char in text:
code_point = ord(char)
# Skip surrogate characters (U+D800 to U+DFFF) - main cause of encoding errors
if 0xD800 <= code_point <= 0xDFFF:
continue
# Skip other non-characters in Unicode
elif code_point == 0xFFFE or code_point == 0xFFFF:
continue
else:
sanitized += char
return sanitized
def _sanitize_json_data(data: Any) -> Any:
"""Recursively sanitize all string values in data structure for safe UTF-8 encoding
@@ -941,7 +972,7 @@ def _sanitize_json_data(data: Any) -> Any:
elif isinstance(data, list):
return [_sanitize_json_data(item) for item in data]
elif isinstance(data, str):
return sanitize_text_for_encoding(data, replacement_char="")
return _sanitize_string_for_json(data)
else:
return data