gzdaniel committed on
Commit
ab53a27
·
1 Parent(s): a265e60

fix: Deduplicate entities and relationships in a single chunk with multiple gleaning results during KG rebuild

Browse files
Files changed (1) hide show
  1. lightrag/operate.py +15 -2
lightrag/operate.py CHANGED
@@ -284,6 +284,7 @@ async def _rebuild_knowledge_from_chunks(
284
  pipeline_status["history_messages"].append(status_message)
285
 
286
  # Get cached extraction results for these chunks using storage
 
287
  cached_results = await _get_cached_extraction_results(
288
  llm_response_cache,
289
  all_referenced_chunk_ids,
@@ -309,6 +310,7 @@ async def _rebuild_knowledge_from_chunks(
309
  chunk_entities[chunk_id] = defaultdict(list)
310
  chunk_relationships[chunk_id] = defaultdict(list)
311
 
 
312
  for extraction_result in extraction_results:
313
  entities, relationships = await _parse_extraction_result(
314
  text_chunks_storage=text_chunks_storage,
@@ -317,10 +319,21 @@ async def _rebuild_knowledge_from_chunks(
317
  )
318
 
319
  # Merge entities and relationships from this extraction result
 
320
  for entity_name, entity_list in entities.items():
321
- chunk_entities[chunk_id][entity_name].extend(entity_list)
 
 
 
 
 
 
322
  for rel_key, rel_list in relationships.items():
323
- chunk_relationships[chunk_id][rel_key].extend(rel_list)
 
 
 
 
324
 
325
  except Exception as e:
326
  status_message = (
 
284
  pipeline_status["history_messages"].append(status_message)
285
 
286
  # Get cached extraction results for these chunks using storage
287
+ # cached_results: chunk_id -> [list of extraction results from the LLM cache, sorted by created_at]
288
  cached_results = await _get_cached_extraction_results(
289
  llm_response_cache,
290
  all_referenced_chunk_ids,
 
310
  chunk_entities[chunk_id] = defaultdict(list)
311
  chunk_relationships[chunk_id] = defaultdict(list)
312
 
313
+ # process multiple LLM extraction results for a single chunk_id
314
  for extraction_result in extraction_results:
315
  entities, relationships = await _parse_extraction_result(
316
  text_chunks_storage=text_chunks_storage,
 
319
  )
320
 
321
  # Merge entities and relationships from this extraction result
322
+ # Only keep the first occurrence of each entity_name in the same chunk_id
323
  for entity_name, entity_list in entities.items():
324
+ if (
325
+ entity_name not in chunk_entities[chunk_id]
326
+ or len(chunk_entities[chunk_id][entity_name]) == 0
327
+ ):
328
+ chunk_entities[chunk_id][entity_name].extend(entity_list)
329
+
330
+ # Only keep the first occurrence of each rel_key in the same chunk_id
331
  for rel_key, rel_list in relationships.items():
332
+ if (
333
+ rel_key not in chunk_relationships[chunk_id]
334
+ or len(chunk_relationships[chunk_id][rel_key]) == 0
335
+ ):
336
+ chunk_relationships[chunk_id][rel_key].extend(rel_list)
337
 
338
  except Exception as e:
339
  status_message = (