Merge branch 'main' into main
Browse files
- README.md +1 -1
- lightrag/lightrag.py +68 -1
- lightrag/operate.py +52 -72
README.md
CHANGED
|
@@ -12,7 +12,7 @@
|
|
| 12 |
</p>
|
| 13 |
<p>
|
| 14 |
<img src='https://img.shields.io/github/stars/hkuds/lightrag?color=green&style=social' />
|
| 15 |
-
<img src="https://img.shields.io/badge/python
|
| 16 |
<a href="https://pypi.org/project/lightrag-hku/"><img src="https://img.shields.io/pypi/v/lightrag-hku.svg"></a>
|
| 17 |
<a href="https://pepy.tech/project/lightrag-hku"><img src="https://static.pepy.tech/badge/lightrag-hku/month"></a>
|
| 18 |
</p>
|
|
|
|
| 12 |
</p>
|
| 13 |
<p>
|
| 14 |
<img src='https://img.shields.io/github/stars/hkuds/lightrag?color=green&style=social' />
|
| 15 |
+
<img src="https://img.shields.io/badge/python-3.10-blue">
|
| 16 |
<a href="https://pypi.org/project/lightrag-hku/"><img src="https://img.shields.io/pypi/v/lightrag-hku.svg"></a>
|
| 17 |
<a href="https://pepy.tech/project/lightrag-hku"><img src="https://static.pepy.tech/badge/lightrag-hku/month"></a>
|
| 18 |
</p>
|
lightrag/lightrag.py
CHANGED
|
@@ -323,7 +323,7 @@ class LightRAG:
|
|
| 323 |
)
|
| 324 |
|
| 325 |
async def ainsert(
|
| 326 |
-
self, string_or_strings, split_by_character, split_by_character_only
|
| 327 |
):
|
| 328 |
"""Insert documents with checkpoint support
|
| 329 |
|
|
@@ -466,6 +466,73 @@ class LightRAG:
|
|
| 466 |
# Ensure all indexes are updated after each document
|
| 467 |
await self._insert_done()
|
| 468 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
async def _insert_done(self):
|
| 470 |
tasks = []
|
| 471 |
for storage_inst in [
|
|
|
|
| 323 |
)
|
| 324 |
|
| 325 |
async def ainsert(
|
| 326 |
+
self, string_or_strings, split_by_character=None, split_by_character_only=False
|
| 327 |
):
|
| 328 |
"""Insert documents with checkpoint support
|
| 329 |
|
|
|
|
| 466 |
# Ensure all indexes are updated after each document
|
| 467 |
await self._insert_done()
|
| 468 |
|
| 469 |
+
def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
|
| 470 |
+
loop = always_get_an_event_loop()
|
| 471 |
+
return loop.run_until_complete(
|
| 472 |
+
self.ainsert_custom_chunks(full_text, text_chunks)
|
| 473 |
+
)
|
| 474 |
+
|
| 475 |
+
async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]):
|
| 476 |
+
update_storage = False
|
| 477 |
+
try:
|
| 478 |
+
doc_key = compute_mdhash_id(full_text.strip(), prefix="doc-")
|
| 479 |
+
new_docs = {doc_key: {"content": full_text.strip()}}
|
| 480 |
+
|
| 481 |
+
_add_doc_keys = await self.full_docs.filter_keys([doc_key])
|
| 482 |
+
new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
|
| 483 |
+
if not len(new_docs):
|
| 484 |
+
logger.warning("This document is already in the storage.")
|
| 485 |
+
return
|
| 486 |
+
|
| 487 |
+
update_storage = True
|
| 488 |
+
logger.info(f"[New Docs] inserting {len(new_docs)} docs")
|
| 489 |
+
|
| 490 |
+
inserting_chunks = {}
|
| 491 |
+
for chunk_text in text_chunks:
|
| 492 |
+
chunk_text_stripped = chunk_text.strip()
|
| 493 |
+
chunk_key = compute_mdhash_id(chunk_text_stripped, prefix="chunk-")
|
| 494 |
+
|
| 495 |
+
inserting_chunks[chunk_key] = {
|
| 496 |
+
"content": chunk_text_stripped,
|
| 497 |
+
"full_doc_id": doc_key,
|
| 498 |
+
}
|
| 499 |
+
|
| 500 |
+
_add_chunk_keys = await self.text_chunks.filter_keys(
|
| 501 |
+
list(inserting_chunks.keys())
|
| 502 |
+
)
|
| 503 |
+
inserting_chunks = {
|
| 504 |
+
k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
|
| 505 |
+
}
|
| 506 |
+
if not len(inserting_chunks):
|
| 507 |
+
logger.warning("All chunks are already in the storage.")
|
| 508 |
+
return
|
| 509 |
+
|
| 510 |
+
logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks")
|
| 511 |
+
|
| 512 |
+
await self.chunks_vdb.upsert(inserting_chunks)
|
| 513 |
+
|
| 514 |
+
logger.info("[Entity Extraction]...")
|
| 515 |
+
maybe_new_kg = await extract_entities(
|
| 516 |
+
inserting_chunks,
|
| 517 |
+
knowledge_graph_inst=self.chunk_entity_relation_graph,
|
| 518 |
+
entity_vdb=self.entities_vdb,
|
| 519 |
+
relationships_vdb=self.relationships_vdb,
|
| 520 |
+
global_config=asdict(self),
|
| 521 |
+
)
|
| 522 |
+
|
| 523 |
+
if maybe_new_kg is None:
|
| 524 |
+
logger.warning("No new entities and relationships found")
|
| 525 |
+
return
|
| 526 |
+
else:
|
| 527 |
+
self.chunk_entity_relation_graph = maybe_new_kg
|
| 528 |
+
|
| 529 |
+
await self.full_docs.upsert(new_docs)
|
| 530 |
+
await self.text_chunks.upsert(inserting_chunks)
|
| 531 |
+
|
| 532 |
+
finally:
|
| 533 |
+
if update_storage:
|
| 534 |
+
await self._insert_done()
|
| 535 |
+
|
| 536 |
async def _insert_done(self):
|
| 537 |
tasks = []
|
| 538 |
for storage_inst in [
|
lightrag/operate.py
CHANGED
|
@@ -4,7 +4,6 @@ import re
|
|
| 4 |
from tqdm.asyncio import tqdm as tqdm_async
|
| 5 |
from typing import Union
|
| 6 |
from collections import Counter, defaultdict
|
| 7 |
-
import warnings
|
| 8 |
from .utils import (
|
| 9 |
logger,
|
| 10 |
clean_str,
|
|
@@ -611,15 +610,22 @@ async def kg_query(
|
|
| 611 |
logger.warning("low_level_keywords and high_level_keywords is empty")
|
| 612 |
return PROMPTS["fail_response"]
|
| 613 |
if ll_keywords == [] and query_param.mode in ["local", "hybrid"]:
|
| 614 |
-
logger.warning(
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
|
|
|
| 618 |
if hl_keywords == [] and query_param.mode in ["global", "hybrid"]:
|
| 619 |
-
logger.warning(
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 623 |
|
| 624 |
# Build context
|
| 625 |
keywords = [ll_keywords, hl_keywords]
|
|
@@ -685,77 +691,51 @@ async def _build_query_context(
|
|
| 685 |
# ll_entities_context, ll_relations_context, ll_text_units_context = "", "", ""
|
| 686 |
# hl_entities_context, hl_relations_context, hl_text_units_context = "", "", ""
|
| 687 |
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
"Low Level context is None. Return empty Low entity/relationship/source"
|
| 698 |
-
)
|
| 699 |
-
query_param.mode = "global"
|
| 700 |
-
else:
|
| 701 |
-
(
|
| 702 |
-
ll_entities_context,
|
| 703 |
-
ll_relations_context,
|
| 704 |
-
ll_text_units_context,
|
| 705 |
-
) = await _get_node_data(
|
| 706 |
-
ll_kewwords,
|
| 707 |
-
knowledge_graph_inst,
|
| 708 |
-
entities_vdb,
|
| 709 |
-
text_chunks_db,
|
| 710 |
-
query_param,
|
| 711 |
-
)
|
| 712 |
-
if query_param.mode in ["global", "hybrid"]:
|
| 713 |
-
if hl_keywrds == "":
|
| 714 |
-
hl_entities_context, hl_relations_context, hl_text_units_context = (
|
| 715 |
-
"",
|
| 716 |
-
"",
|
| 717 |
-
"",
|
| 718 |
-
)
|
| 719 |
-
warnings.warn(
|
| 720 |
-
"High Level context is None. Return empty High entity/relationship/source"
|
| 721 |
-
)
|
| 722 |
-
query_param.mode = "local"
|
| 723 |
-
else:
|
| 724 |
-
(
|
| 725 |
-
hl_entities_context,
|
| 726 |
-
hl_relations_context,
|
| 727 |
-
hl_text_units_context,
|
| 728 |
-
) = await _get_edge_data(
|
| 729 |
-
hl_keywrds,
|
| 730 |
-
knowledge_graph_inst,
|
| 731 |
-
relationships_vdb,
|
| 732 |
-
text_chunks_db,
|
| 733 |
-
query_param,
|
| 734 |
-
)
|
| 735 |
-
if (
|
| 736 |
-
hl_entities_context == ""
|
| 737 |
-
and hl_relations_context == ""
|
| 738 |
-
and hl_text_units_context == ""
|
| 739 |
-
):
|
| 740 |
-
logger.warn("No high level context found. Switching to local mode.")
|
| 741 |
-
query_param.mode = "local"
|
| 742 |
-
if query_param.mode == "hybrid":
|
| 743 |
-
entities_context, relations_context, text_units_context = combine_contexts(
|
| 744 |
-
[hl_entities_context, ll_entities_context],
|
| 745 |
-
[hl_relations_context, ll_relations_context],
|
| 746 |
-
[hl_text_units_context, ll_text_units_context],
|
| 747 |
)
|
| 748 |
-
elif query_param.mode == "local":
|
| 749 |
-
entities_context, relations_context, text_units_context = (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 750 |
ll_entities_context,
|
| 751 |
ll_relations_context,
|
| 752 |
ll_text_units_context,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 753 |
)
|
| 754 |
-
|
| 755 |
-
entities_context, relations_context, text_units_context = (
|
| 756 |
hl_entities_context,
|
| 757 |
hl_relations_context,
|
| 758 |
hl_text_units_context,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 759 |
)
|
| 760 |
return f"""
|
| 761 |
-----Entities-----
|
|
|
|
| 4 |
from tqdm.asyncio import tqdm as tqdm_async
|
| 5 |
from typing import Union
|
| 6 |
from collections import Counter, defaultdict
|
|
|
|
| 7 |
from .utils import (
|
| 8 |
logger,
|
| 9 |
clean_str,
|
|
|
|
| 610 |
logger.warning("low_level_keywords and high_level_keywords is empty")
|
| 611 |
return PROMPTS["fail_response"]
|
| 612 |
if ll_keywords == [] and query_param.mode in ["local", "hybrid"]:
|
| 613 |
+
logger.warning(
|
| 614 |
+
"low_level_keywords is empty, switching from %s mode to global mode",
|
| 615 |
+
query_param.mode,
|
| 616 |
+
)
|
| 617 |
+
query_param.mode = "global"
|
| 618 |
if hl_keywords == [] and query_param.mode in ["global", "hybrid"]:
|
| 619 |
+
logger.warning(
|
| 620 |
+
"high_level_keywords is empty, switching from %s mode to local mode",
|
| 621 |
+
query_param.mode,
|
| 622 |
+
)
|
| 623 |
+
query_param.mode = "local"
|
| 624 |
+
|
| 625 |
+
ll_keywords = ", ".join(ll_keywords) if ll_keywords else ""
|
| 626 |
+
hl_keywords = ", ".join(hl_keywords) if hl_keywords else ""
|
| 627 |
+
|
| 628 |
+
logger.info("Using %s mode for query processing", query_param.mode)
|
| 629 |
|
| 630 |
# Build context
|
| 631 |
keywords = [ll_keywords, hl_keywords]
|
|
|
|
| 691 |
# ll_entities_context, ll_relations_context, ll_text_units_context = "", "", ""
|
| 692 |
# hl_entities_context, hl_relations_context, hl_text_units_context = "", "", ""
|
| 693 |
|
| 694 |
+
ll_keywords, hl_keywords = query[0], query[1]
|
| 695 |
+
|
| 696 |
+
if query_param.mode == "local":
|
| 697 |
+
entities_context, relations_context, text_units_context = await _get_node_data(
|
| 698 |
+
ll_keywords,
|
| 699 |
+
knowledge_graph_inst,
|
| 700 |
+
entities_vdb,
|
| 701 |
+
text_chunks_db,
|
| 702 |
+
query_param,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 703 |
)
|
| 704 |
+
elif query_param.mode == "global":
|
| 705 |
+
entities_context, relations_context, text_units_context = await _get_edge_data(
|
| 706 |
+
hl_keywords,
|
| 707 |
+
knowledge_graph_inst,
|
| 708 |
+
relationships_vdb,
|
| 709 |
+
text_chunks_db,
|
| 710 |
+
query_param,
|
| 711 |
+
)
|
| 712 |
+
else: # hybrid mode
|
| 713 |
+
(
|
| 714 |
ll_entities_context,
|
| 715 |
ll_relations_context,
|
| 716 |
ll_text_units_context,
|
| 717 |
+
) = await _get_node_data(
|
| 718 |
+
ll_keywords,
|
| 719 |
+
knowledge_graph_inst,
|
| 720 |
+
entities_vdb,
|
| 721 |
+
text_chunks_db,
|
| 722 |
+
query_param,
|
| 723 |
)
|
| 724 |
+
(
|
|
|
|
| 725 |
hl_entities_context,
|
| 726 |
hl_relations_context,
|
| 727 |
hl_text_units_context,
|
| 728 |
+
) = await _get_edge_data(
|
| 729 |
+
hl_keywords,
|
| 730 |
+
knowledge_graph_inst,
|
| 731 |
+
relationships_vdb,
|
| 732 |
+
text_chunks_db,
|
| 733 |
+
query_param,
|
| 734 |
+
)
|
| 735 |
+
entities_context, relations_context, text_units_context = combine_contexts(
|
| 736 |
+
[hl_entities_context, ll_entities_context],
|
| 737 |
+
[hl_relations_context, ll_relations_context],
|
| 738 |
+
[hl_text_units_context, ll_text_units_context],
|
| 739 |
)
|
| 740 |
return f"""
|
| 741 |
-----Entities-----
|