From d1f4b6e515a7cd63555b93af3b93c685d60bb299 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 12 Nov 2025 00:11:13 +0800
Subject: [PATCH 1/3] Add data sanitization to JSON writing to prevent UTF-8
 encoding errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Add _sanitize_json_data helper function
• Recursively clean strings in data
• Sanitize before JSON serialization
• Prevent encoding-related crashes
• Use existing sanitize_text_for_encoding
---
 lightrag/utils.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 460ede3c..064e4804 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -927,9 +927,30 @@ def load_json(file_name):
         return json.load(f)
 
 
+def _sanitize_json_data(data: Any) -> Any:
+    """Recursively sanitize all string values in data structure for safe UTF-8 encoding
+
+    Args:
+        data: Data to sanitize (dict, list, str, or other types)
+
+    Returns:
+        Sanitized data with all strings cleaned of problematic characters
+    """
+    if isinstance(data, dict):
+        return {k: _sanitize_json_data(v) for k, v in data.items()}
+    elif isinstance(data, list):
+        return [_sanitize_json_data(item) for item in data]
+    elif isinstance(data, str):
+        return sanitize_text_for_encoding(data, replacement_char="")
+    else:
+        return data
+
+
 def write_json(json_obj, file_name):
+    # Sanitize data before writing to prevent UTF-8 encoding errors
+    sanitized_obj = _sanitize_json_data(json_obj)
     with open(file_name, "w", encoding="utf-8") as f:
-        json.dump(json_obj, f, indent=2, ensure_ascii=False)
+        json.dump(sanitized_obj, f, indent=2, ensure_ascii=False)
 
 
 class TokenizerInterface(Protocol):

From 6918a88f927bdf123826b1a5f0dbfd64bf6b39f8 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 12 Nov 2025 00:38:47 +0800
Subject: [PATCH 2/3] Add specialized JSON string sanitizer to prevent UTF-8
 encoding errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Remove surrogate characters (U+D800-DFFF)
• Filter Unicode non-characters
• Direct char-by-char filtering
---
 lightrag/utils.py | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 064e4804..7232a91c 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -927,6 +927,37 @@ def load_json(file_name):
         return json.load(f)
 
 
+def _sanitize_string_for_json(text: str) -> str:
+    """Strip code points that break or pollute UTF-8 JSON output.
+
+    The string is filtered directly, one character at a time, without
+    attempting to encode it first. Two groups of code points are dropped:
+
+    - Surrogates (U+D800 to U+DFFF): a lone surrogate in a ``str`` cannot be
+      encoded as UTF-8 and is the main cause of encoding errors.
+    - The Unicode non-characters U+FFFE and U+FFFF.
+      These encode without error but are reserved for internal use.
+
+    Args:
+        text: String to sanitize
+
+    Returns:
+        The string without those code points. Falsy input (for example an
+        empty string) is returned unchanged.
+    """
+    if not text:
+        return text
+
+    # Build the result with a single join; growing a str with ``+=`` inside
+    # the loop can degrade to quadratic time on long inputs.
+    kept = [
+        char
+        for char in text
+        if not (0xD800 <= ord(char) <= 0xDFFF or ord(char) in (0xFFFE, 0xFFFF))
+    ]
+    return "".join(kept)
+
+
 def _sanitize_json_data(data: Any) -> Any:
     """Recursively sanitize all string values in data structure for safe UTF-8 encoding
 
@@ -941,7 +972,7 @@ def _sanitize_json_data(data: Any) -> Any:
     elif isinstance(data, list):
         return [_sanitize_json_data(item) for item in data]
     elif isinstance(data, str):
-        return sanitize_text_for_encoding(data, replacement_char="")
+        return _sanitize_string_for_json(data)
     else:
         return data
 

From f28a0c25b178d3347f9f72bf9e63242cefc1ea53 Mon Sep 17 00:00:00 2001
From: yangdx <gzdaniel@me.com>
Date: Wed, 12 Nov 2025 00:50:18 +0800
Subject: [PATCH 3/3] Improve JSON data sanitization to handle tuples and dict
 keys

- Sanitize dictionary keys
- Preserve tuple types
- Handle nested structures better
---
 lightrag/utils.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/lightrag/utils.py b/lightrag/utils.py
index 7232a91c..4bfd20f2 100644
--- a/lightrag/utils.py
+++ b/lightrag/utils.py
@@ -961,19 +961,34 @@ def _sanitize_string_for_json(text: str) -> str:
 def _sanitize_json_data(data: Any) -> Any:
     """Recursively sanitize all string values in data structure for safe UTF-8 encoding
 
+    Handles all JSON-serializable types including:
+    - Dictionary keys and values
+    - Lists and tuples, including namedtuples (preserves type)
+    - Nested structures
+    - Strings at any level
+
     Args:
-        data: Data to sanitize (dict, list, str, or other types)
+        data: Data to sanitize (dict, list, tuple, str, or other types)
 
     Returns:
         Sanitized data with all strings cleaned of problematic characters
     """
     if isinstance(data, dict):
-        return {k: _sanitize_json_data(v) for k, v in data.items()}
-    elif isinstance(data, list):
-        return [_sanitize_json_data(item) for item in data]
+        # Sanitize both keys and values
+        return {
+            _sanitize_string_for_json(k)
+            if isinstance(k, str)
+            else k: _sanitize_json_data(v)
+            for k, v in data.items()
+        }
+    elif isinstance(data, (list, tuple)):
+        # Preserve the container type; namedtuples take one argument per field
+        items = [_sanitize_json_data(item) for item in data]
+        return type(data)(*items) if hasattr(data, "_fields") else type(data)(items)
     elif isinstance(data, str):
         return _sanitize_string_for_json(data)
     else:
+        # Numbers, booleans, None, etc. - return as-is
         return data