Spaces:

rm-lht
/

lightrag

Configuration error

yangdx committed on Apr 12

Commit

d3d4fd8

1 Parent(s): 0e21c6a

Remove Chinese quotes in entity name

Files changed (2) hide show

lightrag/operate.py CHANGED Viewed

@@ -165,7 +165,7 @@ async def _handle_single_entity_extraction(
         return None
     # Normalize entity name
-    entity_name = normalize_extracted_info(entity_name)
     # Clean and validate entity type
     entity_type = clean_str(record_attributes[2]).strip('"')
@@ -176,7 +176,7 @@ async def _handle_single_entity_extraction(
         return None
     # Clean and validate description
-    entity_description = clean_str(record_attributes[3]).strip('"')
     entity_description = normalize_extracted_info(entity_description)
     if not entity_description.strip():
@@ -202,20 +202,20 @@ async def _handle_single_relationship_extraction(
     if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
         return None
     # add this record as edge
-    source = clean_str(record_attributes[1]).strip('"')
-    target = clean_str(record_attributes[2]).strip('"')
     # Normalize source and target entity names
-    source = normalize_extracted_info(source)
-    target = normalize_extracted_info(target)
-    edge_description = clean_str(record_attributes[3]).strip('"')
     edge_description = normalize_extracted_info(edge_description)
-    edge_keywords = clean_str(record_attributes[4]).strip('"')
     edge_source_id = chunk_key
     weight = (
-        float(record_attributes[-1].strip('"'))
         if is_float_regex(record_attributes[-1])
         else 1.0
     )

         return None
     # Normalize entity name
+    entity_name = normalize_extracted_info(entity_name, is_entity=True)
     # Clean and validate entity type
     entity_type = clean_str(record_attributes[2]).strip('"')
         return None
     # Clean and validate description
+    entity_description = clean_str(record_attributes[3])
     entity_description = normalize_extracted_info(entity_description)
     if not entity_description.strip():
     if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
         return None
     # add this record as edge
+    source = clean_str(record_attributes[1])
+    target = clean_str(record_attributes[2])
     # Normalize source and target entity names
+    source = normalize_extracted_info(source, is_entity=True)
+    target = normalize_extracted_info(target, is_entity=True)
+    edge_description = clean_str(record_attributes[3])
     edge_description = normalize_extracted_info(edge_description)
+    edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'")
     edge_source_id = chunk_key
     weight = (
+        float(record_attributes[-1].strip('"').strip("'"))
         if is_float_regex(record_attributes[-1])
         else 1.0
     )

lightrag/utils.py CHANGED Viewed

@@ -1006,7 +1006,7 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
     return content[:max_length] + "..."
-def normalize_extracted_info(name: str) -> str:
     """Normalize entity/relation names and description with the following rules:
     1. Remove spaces between Chinese characters
     2. Remove spaces between Chinese characters and English letters/numbers
@@ -1040,6 +1040,13 @@ def normalize_extracted_info(name: str) -> str:
     # Remove English quotation marks from the beginning and end
     name = name.strip('"').strip("'")
     return name

     return content[:max_length] + "..."
+def normalize_extracted_info(name: str, is_entity = False) -> str:
     """Normalize entity/relation names and description with the following rules:
     1. Remove spaces between Chinese characters
     2. Remove spaces between Chinese characters and English letters/numbers
     # Remove English quotation marks from the beginning and end
     name = name.strip('"').strip("'")
+    if is_entity:
+        # remove Chinese quotes
+        name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
+        # remove English quotes in and around Chinese characters
+        name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
+        name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
     return name