From d1f4b6e515a7cd63555b93af3b93c685d60bb299 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 12 Nov 2025 00:11:13 +0800 Subject: [PATCH 1/3] Add data sanitization to JSON writing to prevent UTF-8 encoding errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Add _sanitize_json_data helper function • Recursively clean strings in data • Sanitize before JSON serialization • Prevent encoding-related crashes • Use existing sanitize_text_for_encoding --- lightrag/utils.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 460ede3c..064e4804 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -927,9 +927,30 @@ def load_json(file_name): return json.load(f) +def _sanitize_json_data(data: Any) -> Any: + """Recursively sanitize all string values in data structure for safe UTF-8 encoding + + Args: + data: Data to sanitize (dict, list, str, or other types) + + Returns: + Sanitized data with all strings cleaned of problematic characters + """ + if isinstance(data, dict): + return {k: _sanitize_json_data(v) for k, v in data.items()} + elif isinstance(data, list): + return [_sanitize_json_data(item) for item in data] + elif isinstance(data, str): + return sanitize_text_for_encoding(data, replacement_char="") + else: + return data + + def write_json(json_obj, file_name): + # Sanitize data before writing to prevent UTF-8 encoding errors + sanitized_obj = _sanitize_json_data(json_obj) with open(file_name, "w", encoding="utf-8") as f: - json.dump(json_obj, f, indent=2, ensure_ascii=False) + json.dump(sanitized_obj, f, indent=2, ensure_ascii=False) class TokenizerInterface(Protocol): From 6918a88f927bdf123826b1a5f0dbfd64bf6b39f8 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 12 Nov 2025 00:38:47 +0800 Subject: [PATCH 2/3] Add specialized JSON string sanitizer to prevent UTF-8 encoding errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • Remove surrogate characters (U+D800-DFFF) • Filter Unicode non-characters • Direct char-by-char filtering --- lightrag/utils.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 064e4804..7232a91c 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -927,6 +927,37 @@ def load_json(file_name): return json.load(f) +def _sanitize_string_for_json(text: str) -> str: + """Remove characters that cannot be encoded in UTF-8 for JSON serialization. + + This is a simpler sanitizer specifically for JSON that directly removes + problematic characters without attempting to encode first. + + Args: + text: String to sanitize + + Returns: + Sanitized string safe for UTF-8 encoding in JSON + """ + if not text: + return text + + # Directly filter out problematic characters without pre-validation + sanitized = "" + for char in text: + code_point = ord(char) + # Skip surrogate characters (U+D800 to U+DFFF) - main cause of encoding errors + if 0xD800 <= code_point <= 0xDFFF: + continue + # Skip other non-characters in Unicode + elif code_point == 0xFFFE or code_point == 0xFFFF: + continue + else: + sanitized += char + + return sanitized + + def _sanitize_json_data(data: Any) -> Any: """Recursively sanitize all string values in data structure for safe UTF-8 encoding @@ -941,7 +972,7 @@ def _sanitize_json_data(data: Any) -> Any: elif isinstance(data, list): return [_sanitize_json_data(item) for item in data] elif isinstance(data, str): - return sanitize_text_for_encoding(data, replacement_char="") + return _sanitize_string_for_json(data) else: return data From f28a0c25b178d3347f9f72bf9e63242cefc1ea53 Mon Sep 17 00:00:00 2001 From: yangdx Date: Wed, 12 Nov 2025 00:50:18 +0800 Subject: [PATCH 3/3] Improve JSON data sanitization to handle tuples and dict keys - Sanitize dictionary keys - Preserve tuple types - Handle nested structures better --- lightrag/utils.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/lightrag/utils.py b/lightrag/utils.py index 7232a91c..4bfd20f2 100644 --- a/lightrag/utils.py +++ b/lightrag/utils.py @@ -961,19 +961,34 @@ def _sanitize_string_for_json(text: str) -> str: def _sanitize_json_data(data: Any) -> Any: """Recursively sanitize all string values in data structure for safe UTF-8 encoding + Handles all JSON-serializable types including: + - Dictionary keys and values + - Lists and tuples (preserves type) + - Nested structures + - Strings at any level + Args: - data: Data to sanitize (dict, list, str, or other types) + data: Data to sanitize (dict, list, tuple, str, or other types) Returns: Sanitized data with all strings cleaned of problematic characters """ if isinstance(data, dict): - return {k: _sanitize_json_data(v) for k, v in data.items()} - elif isinstance(data, list): - return [_sanitize_json_data(item) for item in data] + # Sanitize both keys and values + return { + _sanitize_string_for_json(k) + if isinstance(k, str) + else k: _sanitize_json_data(v) + for k, v in data.items() + } + elif isinstance(data, (list, tuple)): + # Handle both lists and tuples, preserve original type + sanitized = [_sanitize_json_data(item) for item in data] + return type(data)(sanitized) elif isinstance(data, str): return _sanitize_string_for_json(data) else: + # Numbers, booleans, None, etc. - return as-is return data