fix index
Browse files
- lightrag/operate.py +10 -36
 
    	
        lightrag/operate.py
    CHANGED
    
    | 
         @@ -172,7 +172,7 @@ async def _handle_single_entity_extraction( 
     | 
|
| 172 | 
         
             
                    entity_type=entity_type,
         
     | 
| 173 | 
         
             
                    description=entity_description,
         
     | 
| 174 | 
         
             
                    source_id=chunk_key,
         
     | 
| 175 | 
         
            -
                     
     | 
| 176 | 
         
             
                )
         
     | 
| 177 | 
         | 
| 178 | 
         | 
| 
         @@ -201,7 +201,7 @@ async def _handle_single_relationship_extraction( 
     | 
|
| 201 | 
         
             
                    description=edge_description,
         
     | 
| 202 | 
         
             
                    keywords=edge_keywords,
         
     | 
| 203 | 
         
             
                    source_id=edge_source_id,
         
     | 
| 204 | 
         
            -
                     
     | 
| 205 | 
         
             
                )
         
     | 
| 206 | 
         | 
| 207 | 
         | 
| 
         @@ -224,9 +224,7 @@ async def _merge_nodes_then_upsert( 
     | 
|
| 224 | 
         
             
                        split_string_by_multi_markers(already_node["source_id"], [GRAPH_FIELD_SEP])
         
     | 
| 225 | 
         
             
                    )
         
     | 
| 226 | 
         
             
                    already_file_paths.extend(
         
     | 
| 227 | 
         
            -
                        split_string_by_multi_markers(
         
     | 
| 228 | 
         
            -
                            already_node["metadata"]["file_path"], [GRAPH_FIELD_SEP]
         
     | 
| 229 | 
         
            -
                        )
         
     | 
| 230 | 
         
             
                    )
         
     | 
| 231 | 
         
             
                    already_description.append(already_node["description"])
         
     | 
| 232 | 
         | 
| 
         @@ -244,7 +242,7 @@ async def _merge_nodes_then_upsert( 
     | 
|
| 244 | 
         
             
                    set([dp["source_id"] for dp in nodes_data] + already_source_ids)
         
     | 
| 245 | 
         
             
                )
         
     | 
| 246 | 
         
             
                file_path = GRAPH_FIELD_SEP.join(
         
     | 
| 247 | 
         
            -
                    set([dp["metadata"]["file_path"] for dp in nodes_data] + already_file_paths)
     | 
| 248 | 
         
             
                )
         
     | 
| 249 | 
         | 
| 250 | 
         
             
                logger.debug(f"file_path: {file_path}")
         
     | 
| 
         @@ -298,7 +296,7 @@ async def _merge_edges_then_upsert( 
     | 
|
| 298 | 
         
             
                        if already_edge.get("file_path") is not None:
         
     | 
| 299 | 
         
             
                            already_file_paths.extend(
         
     | 
| 300 | 
         
             
                                split_string_by_multi_markers(
         
     | 
| 301 | 
         
            -
                                    already_edge["metadata"]["file_path"], [GRAPH_FIELD_SEP]
     | 
| 302 | 
         
             
                                )
         
     | 
| 303 | 
         
             
                            )
         
     | 
| 304 | 
         | 
| 
         @@ -340,11 +338,7 @@ async def _merge_edges_then_upsert( 
     | 
|
| 340 | 
         
             
                )
         
     | 
| 341 | 
         
             
                file_path = GRAPH_FIELD_SEP.join(
         
     | 
| 342 | 
         
             
                    set(
         
     | 
| 343 | 
         
            -
                        [
         
     | 
| 344 | 
         
            -
                            dp["metadata"]["file_path"]
         
     | 
| 345 | 
         
            -
                            for dp in edges_data
         
     | 
| 346 | 
         
            -
                            if dp.get("metadata", {}).get("file_path")
         
     | 
| 347 | 
         
            -
                        ]
         
     | 
| 348 | 
         
             
                        + already_file_paths
         
     | 
| 349 | 
         
             
                    )
         
     | 
| 350 | 
         
             
                )
         
     | 
| 
         @@ -679,10 +673,6 @@ async def extract_entities( 
     | 
|
| 679 | 
         
             
                            "content": f"{dp['entity_name']}\n{dp['description']}",
         
     | 
| 680 | 
         
             
                            "source_id": dp["source_id"],
         
     | 
| 681 | 
         
             
                            "file_path": dp.get("file_path", "unknown_source"),
         
     | 
| 682 | 
         
            -
                            "metadata": {
         
     | 
| 683 | 
         
            -
                                "created_at": dp.get("created_at", time.time()),
         
     | 
| 684 | 
         
            -
                                "file_path": dp.get("file_path", "unknown_source"),
         
     | 
| 685 | 
         
            -
                            },
         
     | 
| 686 | 
         
             
                        }
         
     | 
| 687 | 
         
             
                        for dp in all_entities_data
         
     | 
| 688 | 
         
             
                    }
         
     | 
| 
         @@ -697,10 +687,6 @@ async def extract_entities( 
     | 
|
| 697 | 
         
             
                            "content": f"{dp['src_id']}\t{dp['tgt_id']}\n{dp['keywords']}\n{dp['description']}",
         
     | 
| 698 | 
         
             
                            "source_id": dp["source_id"],
         
     | 
| 699 | 
         
             
                            "file_path": dp.get("file_path", "unknown_source"),
         
     | 
| 700 | 
         
            -
                            "metadata": {
         
     | 
| 701 | 
         
            -
                                "created_at": dp.get("created_at", time.time()),
         
     | 
| 702 | 
         
            -
                                "file_path": dp.get("file_path", "unknown_source"),
         
     | 
| 703 | 
         
            -
                            },
         
     | 
| 704 | 
         
             
                        }
         
     | 
| 705 | 
         
             
                        for dp in all_relationships_data
         
     | 
| 706 | 
         
             
                    }
         
     | 
| 
         @@ -1285,11 +1271,8 @@ async def _get_node_data( 
     | 
|
| 1285 | 
         
             
                    if isinstance(created_at, (int, float)):
         
     | 
| 1286 | 
         
             
                        created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         
     | 
| 1287 | 
         | 
| 1288 | 
         
            -
                    # Get file path from  
     | 
| 1289 | 
         
             
                    file_path = n.get("file_path", "unknown_source")
         
     | 
| 1290 | 
         
            -
                    if not file_path or file_path == "unknown_source":
         
     | 
| 1291 | 
         
            -
                        # Try to get from metadata
         
     | 
| 1292 | 
         
            -
                        file_path = n.get("metadata", {}).get("file_path", "unknown_source")
         
     | 
| 1293 | 
         | 
| 1294 | 
         
             
                    entites_section_list.append(
         
     | 
| 1295 | 
         
             
                        [
         
     | 
| 
         @@ -1323,11 +1306,8 @@ async def _get_node_data( 
     | 
|
| 1323 | 
         
             
                    if isinstance(created_at, (int, float)):
         
     | 
| 1324 | 
         
             
                        created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         
     | 
| 1325 | 
         | 
| 1326 | 
         
            -
                    # Get file path from  
     | 
| 1327 | 
         
             
                    file_path = e.get("file_path", "unknown_source")
         
     | 
| 1328 | 
         
            -
                    if not file_path or file_path == "unknown_source":
         
     | 
| 1329 | 
         
            -
                        # Try to get from metadata
         
     | 
| 1330 | 
         
            -
                        file_path = e.get("metadata", {}).get("file_path", "unknown_source")
         
     | 
| 1331 | 
         | 
| 1332 | 
         
             
                    relations_section_list.append(
         
     | 
| 1333 | 
         
             
                        [
         
     | 
| 
         @@ -1564,11 +1544,8 @@ async def _get_edge_data( 
     | 
|
| 1564 | 
         
             
                    if isinstance(created_at, (int, float)):
         
     | 
| 1565 | 
         
             
                        created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         
     | 
| 1566 | 
         | 
| 1567 | 
         
            -
                    # Get file path from  
     | 
| 1568 | 
         
             
                    file_path = e.get("file_path", "unknown_source")
         
     | 
| 1569 | 
         
            -
                    if not file_path or file_path == "unknown_source":
         
     | 
| 1570 | 
         
            -
                        # Try to get from metadata
         
     | 
| 1571 | 
         
            -
                        file_path = e.get("metadata", {}).get("file_path", "unknown_source")
         
     | 
| 1572 | 
         | 
| 1573 | 
         
             
                    relations_section_list.append(
         
     | 
| 1574 | 
         
             
                        [
         
     | 
| 
         @@ -1594,11 +1571,8 @@ async def _get_edge_data( 
     | 
|
| 1594 | 
         
             
                    if isinstance(created_at, (int, float)):
         
     | 
| 1595 | 
         
             
                        created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         
     | 
| 1596 | 
         | 
| 1597 | 
         
            -
                    # Get file path from  
     | 
| 1598 | 
         
             
                    file_path = n.get("file_path", "unknown_source")
         
     | 
| 1599 | 
         
            -
                    if not file_path or file_path == "unknown_source":
         
     | 
| 1600 | 
         
            -
                        # Try to get from metadata
         
     | 
| 1601 | 
         
            -
                        file_path = n.get("metadata", {}).get("file_path", "unknown_source")
         
     | 
| 1602 | 
         | 
| 1603 | 
         
             
                    entites_section_list.append(
         
     | 
| 1604 | 
         
             
                        [
         
     | 
| 
         | 
|
| 172 | 
         
             
                    entity_type=entity_type,
         
     | 
| 173 | 
         
             
                    description=entity_description,
         
     | 
| 174 | 
         
             
                    source_id=chunk_key,
         
     | 
| 175 | 
         
            +
                    file_path=file_path,
         
     | 
| 176 | 
         
             
                )
         
     | 
| 177 | 
         | 
| 178 | 
         | 
| 
         | 
|
| 201 | 
         
             
                    description=edge_description,
         
     | 
| 202 | 
         
             
                    keywords=edge_keywords,
         
     | 
| 203 | 
         
             
                    source_id=edge_source_id,
         
     | 
| 204 | 
         
            +
                    file_path=file_path,
         
     | 
| 205 | 
         
             
                )
         
     | 
| 206 | 
         | 
| 207 | 
         | 
| 
         | 
|
| 224 | 
         
             
                        split_string_by_multi_markers(already_node["source_id"], [GRAPH_FIELD_SEP])
         
     | 
| 225 | 
         
             
                    )
         
     | 
| 226 | 
         
             
                    already_file_paths.extend(
         
     | 
| 227 | 
         
            +
                        split_string_by_multi_markers(already_node["file_path"], [GRAPH_FIELD_SEP])
         
     | 
| 
         | 
|
| 
         | 
|
| 228 | 
         
             
                    )
         
     | 
| 229 | 
         
             
                    already_description.append(already_node["description"])
         
     | 
| 230 | 
         | 
| 
         | 
|
| 242 | 
         
             
                    set([dp["source_id"] for dp in nodes_data] + already_source_ids)
         
     | 
| 243 | 
         
             
                )
         
     | 
| 244 | 
         
             
                file_path = GRAPH_FIELD_SEP.join(
         
     | 
| 245 | 
         
            +
                    set([dp["file_path"] for dp in nodes_data] + already_file_paths)
         
     | 
| 246 | 
         
             
                )
         
     | 
| 247 | 
         | 
| 248 | 
         
             
                logger.debug(f"file_path: {file_path}")
         
     | 
| 
         | 
|
| 296 | 
         
             
                        if already_edge.get("file_path") is not None:
         
     | 
| 297 | 
         
             
                            already_file_paths.extend(
         
     | 
| 298 | 
         
             
                                split_string_by_multi_markers(
         
     | 
| 299 | 
         
            +
                                    already_edge["file_path"], [GRAPH_FIELD_SEP]
         
     | 
| 300 | 
         
             
                                )
         
     | 
| 301 | 
         
             
                            )
         
     | 
| 302 | 
         | 
| 
         | 
|
| 338 | 
         
             
                )
         
     | 
| 339 | 
         
             
                file_path = GRAPH_FIELD_SEP.join(
         
     | 
| 340 | 
         
             
                    set(
         
     | 
| 341 | 
         
            +
                        [dp["file_path"] for dp in edges_data if dp.get("file_path")]
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 342 | 
         
             
                        + already_file_paths
         
     | 
| 343 | 
         
             
                    )
         
     | 
| 344 | 
         
             
                )
         
     | 
| 
         | 
|
| 673 | 
         
             
                            "content": f"{dp['entity_name']}\n{dp['description']}",
         
     | 
| 674 | 
         
             
                            "source_id": dp["source_id"],
         
     | 
| 675 | 
         
             
                            "file_path": dp.get("file_path", "unknown_source"),
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 676 | 
         
             
                        }
         
     | 
| 677 | 
         
             
                        for dp in all_entities_data
         
     | 
| 678 | 
         
             
                    }
         
     | 
| 
         | 
|
| 687 | 
         
             
                            "content": f"{dp['src_id']}\t{dp['tgt_id']}\n{dp['keywords']}\n{dp['description']}",
         
     | 
| 688 | 
         
             
                            "source_id": dp["source_id"],
         
     | 
| 689 | 
         
             
                            "file_path": dp.get("file_path", "unknown_source"),
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 690 | 
         
             
                        }
         
     | 
| 691 | 
         
             
                        for dp in all_relationships_data
         
     | 
| 692 | 
         
             
                    }
         
     | 
| 
         | 
|
| 1271 | 
         
             
                    if isinstance(created_at, (int, float)):
         
     | 
| 1272 | 
         
             
                        created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         
     | 
| 1273 | 
         | 
| 1274 | 
         
            +
                    # Get file path from node data
         
     | 
| 1275 | 
         
             
                    file_path = n.get("file_path", "unknown_source")
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 1276 | 
         | 
| 1277 | 
         
             
                    entites_section_list.append(
         
     | 
| 1278 | 
         
             
                        [
         
     | 
| 
         | 
|
| 1306 | 
         
             
                    if isinstance(created_at, (int, float)):
         
     | 
| 1307 | 
         
             
                        created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         
     | 
| 1308 | 
         | 
| 1309 | 
         
            +
                    # Get file path from edge data
         
     | 
| 1310 | 
         
             
                    file_path = e.get("file_path", "unknown_source")
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 1311 | 
         | 
| 1312 | 
         
             
                    relations_section_list.append(
         
     | 
| 1313 | 
         
             
                        [
         
     | 
| 
         | 
|
| 1544 | 
         
             
                    if isinstance(created_at, (int, float)):
         
     | 
| 1545 | 
         
             
                        created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         
     | 
| 1546 | 
         | 
| 1547 | 
         
            +
                    # Get file path from edge data
         
     | 
| 1548 | 
         
             
                    file_path = e.get("file_path", "unknown_source")
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 1549 | 
         | 
| 1550 | 
         
             
                    relations_section_list.append(
         
     | 
| 1551 | 
         
             
                        [
         
     | 
| 
         | 
|
| 1571 | 
         
             
                    if isinstance(created_at, (int, float)):
         
     | 
| 1572 | 
         
             
                        created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         
     | 
| 1573 | 
         | 
| 1574 | 
         
            +
                    # Get file path from node data
         
     | 
| 1575 | 
         
             
                    file_path = n.get("file_path", "unknown_source")
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 1576 | 
         | 
| 1577 | 
         
             
                    entites_section_list.append(
         
     | 
| 1578 | 
         
             
                        [
         
     |