Add specialized JSON string sanitizer to prevent UTF-8 encoding errors

• Remove surrogate characters (U+D800–U+DFFF) • Filter the Unicode non-characters U+FFFE and U+FFFF • Direct char-by-char filtering
2025-11-12 00:38:47 +08:00
parent 23cbb9c9b2
commit 5885637ebf
1 changed file with 32 additions and 1 deletion
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -927,6 +927,37 @@ def load_json(file_name):
        return json.load(f)


+def _sanitize_string_for_json(text: str) -> str:
+    """Remove characters that cannot be encoded in UTF-8 for JSON serialization.
+
+    Filters the string directly, one character at a time, without attempting
+    to encode it first. Lone surrogate code points (U+D800-U+DFFF) are dropped
+    because Python's strict UTF-8 codec rejects them; the non-characters
+    U+FFFE and U+FFFF are dropped as well. Everything else is kept in order.
+
+    Args:
+        text: String to sanitize.
+
+    Returns:
+        The sanitized string. Falsy input (e.g. "") is returned unchanged.
+    """
+    if not text:
+        return text
+
+    def _is_encodable(char: str) -> bool:
+        """Return True unless *char* is a surrogate or U+FFFE/U+FFFF."""
+        code_point = ord(char)
+        # Surrogates (U+D800-U+DFFF) are the main cause of encoding errors.
+        if 0xD800 <= code_point <= 0xDFFF:
+            return False
+        # U+FFFE and U+FFFF are Unicode non-characters.
+        return code_point not in (0xFFFE, 0xFFFF)
+
+    # Join once instead of growing a str with `+=` per character, which is
+    # quadratic in the worst case (only CPython special-cases it).
+    return "".join(filter(_is_encodable, text))
+
+
 def _sanitize_json_data(data: Any) -> Any:
    """Recursively sanitize all string values in data structure for safe UTF-8 encoding

@@ -941,7 +972,7 @@ def _sanitize_json_data(data: Any) -> Any:
    elif isinstance(data, list):
        return [_sanitize_json_data(item) for item in data]
    elif isinstance(data, str):
-        return sanitize_text_for_encoding(data, replacement_char="")
+        return _sanitize_string_for_json(data)
    else:
        return data