fix: Deduplicate entities and relationships in a single chunk with multiple gleaning results during KG rebuild
Browse files
- lightrag/operate.py +15 -2
 
    	
        lightrag/operate.py
    CHANGED
    
    | 
         @@ -284,6 +284,7 @@ async def _rebuild_knowledge_from_chunks( 
     | 
|
| 284 | 
         
             
                        pipeline_status["history_messages"].append(status_message)
         
     | 
| 285 | 
         | 
| 286 | 
         
             
                # Get cached extraction results for these chunks using storage
         
     | 
| 
         | 
|
| 287 | 
         
             
                cached_results = await _get_cached_extraction_results(
         
     | 
| 288 | 
         
             
                    llm_response_cache,
         
     | 
| 289 | 
         
             
                    all_referenced_chunk_ids,
         
     | 
| 
         @@ -309,6 +310,7 @@ async def _rebuild_knowledge_from_chunks( 
     | 
|
| 309 | 
         
             
                        chunk_entities[chunk_id] = defaultdict(list)
         
     | 
| 310 | 
         
             
                        chunk_relationships[chunk_id] = defaultdict(list)
         
     | 
| 311 | 
         | 
| 
         | 
|
| 312 | 
         
             
                        for extraction_result in extraction_results:
         
     | 
| 313 | 
         
             
                            entities, relationships = await _parse_extraction_result(
         
     | 
| 314 | 
         
             
                                text_chunks_storage=text_chunks_storage,
         
     | 
| 
         @@ -317,10 +319,21 @@ async def _rebuild_knowledge_from_chunks( 
     | 
|
| 317 | 
         
             
                            )
         
     | 
| 318 | 
         | 
| 319 | 
         
             
                            # Merge entities and relationships from this extraction result
         
     | 
| 
         | 
|
| 320 | 
         
             
                            for entity_name, entity_list in entities.items():
         
     | 
| 321 | 
         
            -
                                 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 322 | 
         
             
                            for rel_key, rel_list in relationships.items():
         
     | 
| 323 | 
         
            -
                                 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 324 | 
         | 
| 325 | 
         
             
                    except Exception as e:
         
     | 
| 326 | 
         
             
                        status_message = (
         
     | 
| 
         | 
|
| 284 | 
         
             
                        pipeline_status["history_messages"].append(status_message)
         
     | 
| 285 | 
         | 
| 286 | 
         
             
                # Get cached extraction results for these chunks using storage
         
     | 
| 287 | 
         
            +
                #    cached_results: chunk_id -> [list of extraction results from the LLM cache, sorted by created_at]
         
     | 
| 288 | 
         
             
                cached_results = await _get_cached_extraction_results(
         
     | 
| 289 | 
         
             
                    llm_response_cache,
         
     | 
| 290 | 
         
             
                    all_referenced_chunk_ids,
         
     | 
| 
         | 
|
| 310 | 
         
             
                        chunk_entities[chunk_id] = defaultdict(list)
         
     | 
| 311 | 
         
             
                        chunk_relationships[chunk_id] = defaultdict(list)
         
     | 
| 312 | 
         | 
| 313 | 
         
            +
                        # Process multiple LLM extraction results for a single chunk_id
         
     | 
| 314 | 
         
             
                        for extraction_result in extraction_results:
         
     | 
| 315 | 
         
             
                            entities, relationships = await _parse_extraction_result(
         
     | 
| 316 | 
         
             
                                text_chunks_storage=text_chunks_storage,
         
     | 
| 
         | 
|
| 319 | 
         
             
                            )
         
     | 
| 320 | 
         | 
| 321 | 
         
             
                            # Merge entities and relationships from this extraction result
         
     | 
| 322 | 
         
            +
                            # Only keep the first occurrence of each entity_name in the same chunk_id
         
     | 
| 323 | 
         
             
                            for entity_name, entity_list in entities.items():
         
     | 
| 324 | 
         
            +
                                if (
         
     | 
| 325 | 
         
            +
                                    entity_name not in chunk_entities[chunk_id]
         
     | 
| 326 | 
         
            +
                                    or len(chunk_entities[chunk_id][entity_name]) == 0
         
     | 
| 327 | 
         
            +
                                ):
         
     | 
| 328 | 
         
            +
                                    chunk_entities[chunk_id][entity_name].extend(entity_list)
         
     | 
| 329 | 
         
            +
             
     | 
| 330 | 
         
            +
                            # Only keep the first occurrence of each rel_key in the same chunk_id
         
     | 
| 331 | 
         
             
                            for rel_key, rel_list in relationships.items():
         
     | 
| 332 | 
         
            +
                                if (
         
     | 
| 333 | 
         
            +
                                    rel_key not in chunk_relationships[chunk_id]
         
     | 
| 334 | 
         
            +
                                    or len(chunk_relationships[chunk_id][rel_key]) == 0
         
     | 
| 335 | 
         
            +
                                ):
         
     | 
| 336 | 
         
            +
                                    chunk_relationships[chunk_id][rel_key].extend(rel_list)
         
     | 
| 337 | 
         | 
| 338 | 
         
             
                    except Exception as e:
         
     | 
| 339 | 
         
             
                        status_message = (
         
     |