yangdx
committed on
Commit
·
d3d4fd8
1
Parent(s):
0e21c6a
Remove Chinese quotes in entity name
Browse files- lightrag/operate.py +9 -9
- lightrag/utils.py +8 -1
lightrag/operate.py
CHANGED
@@ -165,7 +165,7 @@ async def _handle_single_entity_extraction(
|
|
165 |
return None
|
166 |
|
167 |
# Normalize entity name
|
168 |
-
entity_name = normalize_extracted_info(entity_name)
|
169 |
|
170 |
# Clean and validate entity type
|
171 |
entity_type = clean_str(record_attributes[2]).strip('"')
|
@@ -176,7 +176,7 @@ async def _handle_single_entity_extraction(
|
|
176 |
return None
|
177 |
|
178 |
# Clean and validate description
|
179 |
-
entity_description = clean_str(record_attributes[3])
|
180 |
entity_description = normalize_extracted_info(entity_description)
|
181 |
|
182 |
if not entity_description.strip():
|
@@ -202,20 +202,20 @@ async def _handle_single_relationship_extraction(
|
|
202 |
if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
|
203 |
return None
|
204 |
# add this record as edge
|
205 |
-
source = clean_str(record_attributes[1])
|
206 |
-
target = clean_str(record_attributes[2])
|
207 |
|
208 |
# Normalize source and target entity names
|
209 |
-
source = normalize_extracted_info(source)
|
210 |
-
target = normalize_extracted_info(target)
|
211 |
|
212 |
-
edge_description = clean_str(record_attributes[3])
|
213 |
edge_description = normalize_extracted_info(edge_description)
|
214 |
|
215 |
-
edge_keywords = clean_str(record_attributes[4]).strip('"')
|
216 |
edge_source_id = chunk_key
|
217 |
weight = (
|
218 |
-
float(record_attributes[-1].strip('"'))
|
219 |
if is_float_regex(record_attributes[-1])
|
220 |
else 1.0
|
221 |
)
|
|
|
165 |
return None
|
166 |
|
167 |
# Normalize entity name
|
168 |
+
entity_name = normalize_extracted_info(entity_name, is_entity=True)
|
169 |
|
170 |
# Clean and validate entity type
|
171 |
entity_type = clean_str(record_attributes[2]).strip('"')
|
|
|
176 |
return None
|
177 |
|
178 |
# Clean and validate description
|
179 |
+
entity_description = clean_str(record_attributes[3])
|
180 |
entity_description = normalize_extracted_info(entity_description)
|
181 |
|
182 |
if not entity_description.strip():
|
|
|
202 |
if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
|
203 |
return None
|
204 |
# add this record as edge
|
205 |
+
source = clean_str(record_attributes[1])
|
206 |
+
target = clean_str(record_attributes[2])
|
207 |
|
208 |
# Normalize source and target entity names
|
209 |
+
source = normalize_extracted_info(source, is_entity=True)
|
210 |
+
target = normalize_extracted_info(target, is_entity=True)
|
211 |
|
212 |
+
edge_description = clean_str(record_attributes[3])
|
213 |
edge_description = normalize_extracted_info(edge_description)
|
214 |
|
215 |
+
edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'")
|
216 |
edge_source_id = chunk_key
|
217 |
weight = (
|
218 |
+
float(record_attributes[-1].strip('"').strip("'"))
|
219 |
if is_float_regex(record_attributes[-1])
|
220 |
else 1.0
|
221 |
)
|
lightrag/utils.py
CHANGED
@@ -1006,7 +1006,7 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
|
|
1006 |
return content[:max_length] + "..."
|
1007 |
|
1008 |
|
1009 |
-
def normalize_extracted_info(name: str) -> str:
|
1010 |
"""Normalize entity/relation names and description with the following rules:
|
1011 |
1. Remove spaces between Chinese characters
|
1012 |
2. Remove spaces between Chinese characters and English letters/numbers
|
@@ -1040,6 +1040,13 @@ def normalize_extracted_info(name: str) -> str:
|
|
1040 |
# Remove English quotation marks from the beginning and end
|
1041 |
name = name.strip('"').strip("'")
|
1042 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1043 |
return name
|
1044 |
|
1045 |
|
|
|
1006 |
return content[:max_length] + "..."
|
1007 |
|
1008 |
|
1009 |
+
def normalize_extracted_info(name: str, is_entity = False) -> str:
|
1010 |
"""Normalize entity/relation names and description with the following rules:
|
1011 |
1. Remove spaces between Chinese characters
|
1012 |
2. Remove spaces between Chinese characters and English letters/numbers
|
|
|
1040 |
# Remove English quotation marks from the beginning and end
|
1041 |
name = name.strip('"').strip("'")
|
1042 |
|
1043 |
+
if is_entity:
|
1044 |
+
# remove Chinese quotes
|
1045 |
+
name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
|
1046 |
+
# remove English quotes in and around Chinese
|
1047 |
+
name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
|
1048 |
+
name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
|
1049 |
+
|
1050 |
return name
|
1051 |
|
1052 |
|