yangdx committed on
Commit
d3d4fd8
·
1 Parent(s): 0e21c6a

Remove chinese quotes in entity name

Browse files
Files changed (2) hide show
  1. lightrag/operate.py +9 -9
  2. lightrag/utils.py +8 -1
lightrag/operate.py CHANGED
@@ -165,7 +165,7 @@ async def _handle_single_entity_extraction(
165
  return None
166
 
167
  # Normalize entity name
168
- entity_name = normalize_extracted_info(entity_name)
169
 
170
  # Clean and validate entity type
171
  entity_type = clean_str(record_attributes[2]).strip('"')
@@ -176,7 +176,7 @@ async def _handle_single_entity_extraction(
176
  return None
177
 
178
  # Clean and validate description
179
- entity_description = clean_str(record_attributes[3]).strip('"')
180
  entity_description = normalize_extracted_info(entity_description)
181
 
182
  if not entity_description.strip():
@@ -202,20 +202,20 @@ async def _handle_single_relationship_extraction(
202
  if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
203
  return None
204
  # add this record as edge
205
- source = clean_str(record_attributes[1]).strip('"')
206
- target = clean_str(record_attributes[2]).strip('"')
207
 
208
  # Normalize source and target entity names
209
- source = normalize_extracted_info(source)
210
- target = normalize_extracted_info(target)
211
 
212
- edge_description = clean_str(record_attributes[3]).strip('"')
213
  edge_description = normalize_extracted_info(edge_description)
214
 
215
- edge_keywords = clean_str(record_attributes[4]).strip('"')
216
  edge_source_id = chunk_key
217
  weight = (
218
- float(record_attributes[-1].strip('"'))
219
  if is_float_regex(record_attributes[-1])
220
  else 1.0
221
  )
 
165
  return None
166
 
167
  # Normalize entity name
168
+ entity_name = normalize_extracted_info(entity_name, is_entity=True)
169
 
170
  # Clean and validate entity type
171
  entity_type = clean_str(record_attributes[2]).strip('"')
 
176
  return None
177
 
178
  # Clean and validate description
179
+ entity_description = clean_str(record_attributes[3])
180
  entity_description = normalize_extracted_info(entity_description)
181
 
182
  if not entity_description.strip():
 
202
  if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
203
  return None
204
  # add this record as edge
205
+ source = clean_str(record_attributes[1])
206
+ target = clean_str(record_attributes[2])
207
 
208
  # Normalize source and target entity names
209
+ source = normalize_extracted_info(source, is_entity=True)
210
+ target = normalize_extracted_info(target, is_entity=True)
211
 
212
+ edge_description = clean_str(record_attributes[3])
213
  edge_description = normalize_extracted_info(edge_description)
214
 
215
+ edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'")
216
  edge_source_id = chunk_key
217
  weight = (
218
+ float(record_attributes[-1].strip('"').strip("'"))
219
  if is_float_regex(record_attributes[-1])
220
  else 1.0
221
  )
lightrag/utils.py CHANGED
@@ -1006,7 +1006,7 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
1006
  return content[:max_length] + "..."
1007
 
1008
 
1009
- def normalize_extracted_info(name: str) -> str:
1010
  """Normalize entity/relation names and description with the following rules:
1011
  1. Remove spaces between Chinese characters
1012
  2. Remove spaces between Chinese characters and English letters/numbers
@@ -1040,6 +1040,13 @@ def normalize_extracted_info(name: str) -> str:
1040
  # Remove English quotation marks from the beginning and end
1041
  name = name.strip('"').strip("'")
1042
 
 
 
 
 
 
 
 
1043
  return name
1044
 
1045
 
 
1006
  return content[:max_length] + "..."
1007
 
1008
 
1009
+ def normalize_extracted_info(name: str, is_entity = False) -> str:
1010
  """Normalize entity/relation names and description with the following rules:
1011
  1. Remove spaces between Chinese characters
1012
  2. Remove spaces between Chinese characters and English letters/numbers
 
1040
  # Remove English quotation marks from the beginning and end
1041
  name = name.strip('"').strip("'")
1042
 
1043
+ if is_entity:
1044
+ # remove Chinese quotes
1045
+ name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
1046
+ # remove English quotes in and around Chinese
1047
+ name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
1048
+ name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
1049
+
1050
  return name
1051
 
1052