zrguo committed on
Commit
3548a4a
·
unverified ·
2 Parent(s): 9c6d163 ab85891

Merge branch 'main' into main

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. lightrag/lightrag.py +68 -1
  3. lightrag/operate.py +52 -72
README.md CHANGED
@@ -12,7 +12,7 @@
12
  </p>
13
  <p>
14
  <img src='https://img.shields.io/github/stars/hkuds/lightrag?color=green&style=social' />
15
- <img src="https://img.shields.io/badge/python->=3.10-blue">
16
  <a href="https://pypi.org/project/lightrag-hku/"><img src="https://img.shields.io/pypi/v/lightrag-hku.svg"></a>
17
  <a href="https://pepy.tech/project/lightrag-hku"><img src="https://static.pepy.tech/badge/lightrag-hku/month"></a>
18
  </p>
 
12
  </p>
13
  <p>
14
  <img src='https://img.shields.io/github/stars/hkuds/lightrag?color=green&style=social' />
15
+ <img src="https://img.shields.io/badge/python-3.10-blue">
16
  <a href="https://pypi.org/project/lightrag-hku/"><img src="https://img.shields.io/pypi/v/lightrag-hku.svg"></a>
17
  <a href="https://pepy.tech/project/lightrag-hku"><img src="https://static.pepy.tech/badge/lightrag-hku/month"></a>
18
  </p>
lightrag/lightrag.py CHANGED
@@ -323,7 +323,7 @@ class LightRAG:
323
  )
324
 
325
  async def ainsert(
326
- self, string_or_strings, split_by_character, split_by_character_only
327
  ):
328
  """Insert documents with checkpoint support
329
 
@@ -466,6 +466,73 @@ class LightRAG:
466
  # Ensure all indexes are updated after each document
467
  await self._insert_done()
468
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  async def _insert_done(self):
470
  tasks = []
471
  for storage_inst in [
 
323
  )
324
 
325
  async def ainsert(
326
+ self, string_or_strings, split_by_character=None, split_by_character_only=False
327
  ):
328
  """Insert documents with checkpoint support
329
 
 
466
  # Ensure all indexes are updated after each document
467
  await self._insert_done()
468
 
469
def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
    """Blocking wrapper around ``ainsert_custom_chunks``.

    Obtains an event loop via ``always_get_an_event_loop()`` and runs the
    async insertion on it until it finishes.

    Args:
        full_text: The complete document text the chunks were cut from.
        text_chunks: Caller-supplied chunk strings for that document.

    Returns:
        Whatever ``ainsert_custom_chunks`` returns.
    """
    event_loop = always_get_an_event_loop()
    pending = self.ainsert_custom_chunks(full_text, text_chunks)
    return event_loop.run_until_complete(pending)
474
+
475
async def ainsert_custom_chunks(self, full_text: str, text_chunks: list[str]):
    """Insert one document using chunks supplied by the caller.

    Steps, in order:
      1. Hash the stripped ``full_text`` into a ``doc-`` key and ask
         ``self.full_docs.filter_keys`` which keys to keep; if the key is
         not kept, warn and stop.
      2. Hash every stripped chunk into a ``chunk-`` key, keep only the keys
         ``self.text_chunks.filter_keys`` returns; if none remain, warn and
         stop.
      3. Upsert the remaining chunks into ``self.chunks_vdb`` and run
         ``extract_entities`` on them; if it yields ``None``, warn and stop.
      4. Store the returned graph on ``self.chunk_entity_relation_graph`` and
         upsert the document and chunks into their key-value stores.

    ``self._insert_done()`` is awaited on the way out (including on early
    returns and exceptions) once the document has passed step 1.

    NOTE(review): ``filter_keys`` is presumed to return the keys that are
    NOT yet stored, judging by the warning messages -- confirm against the
    storage implementation.

    Args:
        full_text: The complete document text; stripped before hashing/storing.
        text_chunks: Chunk strings; each is stripped before hashing/storing.
            Chunks whose stripped text is identical share one key.

    Returns:
        None on every path.
    """
    storage_touched = False
    try:
        doc_text = full_text.strip()
        doc_key = compute_mdhash_id(doc_text, prefix="doc-")

        unseen_doc_keys = await self.full_docs.filter_keys([doc_key])
        new_docs = {}
        if doc_key in unseen_doc_keys:
            new_docs[doc_key] = {"content": doc_text}
        if not new_docs:
            logger.warning("This document is already in the storage.")
            return

        # From here on the finally-clause must flush the storages.
        storage_touched = True
        logger.info(f"[New Docs] inserting {len(new_docs)} docs")

        # Key each stripped chunk by its content hash and link it to the doc.
        candidate_chunks = {
            compute_mdhash_id(body, prefix="chunk-"): {
                "content": body,
                "full_doc_id": doc_key,
            }
            for body in (raw.strip() for raw in text_chunks)
        }

        unseen_chunk_keys = await self.text_chunks.filter_keys(
            list(candidate_chunks)
        )
        inserting_chunks = {
            key: payload
            for key, payload in candidate_chunks.items()
            if key in unseen_chunk_keys
        }
        if not inserting_chunks:
            logger.warning("All chunks are already in the storage.")
            return

        logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks")

        await self.chunks_vdb.upsert(inserting_chunks)

        logger.info("[Entity Extraction]...")
        extracted_graph = await extract_entities(
            inserting_chunks,
            knowledge_graph_inst=self.chunk_entity_relation_graph,
            entity_vdb=self.entities_vdb,
            relationships_vdb=self.relationships_vdb,
            global_config=asdict(self),
        )

        if extracted_graph is None:
            logger.warning("No new entities and relationships found")
            return

        self.chunk_entity_relation_graph = extracted_graph

        # Record the document and its chunks only after extraction succeeded.
        await self.full_docs.upsert(new_docs)
        await self.text_chunks.upsert(inserting_chunks)

    finally:
        if storage_touched:
            await self._insert_done()
535
+
536
  async def _insert_done(self):
537
  tasks = []
538
  for storage_inst in [
lightrag/operate.py CHANGED
@@ -4,7 +4,6 @@ import re
4
  from tqdm.asyncio import tqdm as tqdm_async
5
  from typing import Union
6
  from collections import Counter, defaultdict
7
- import warnings
8
  from .utils import (
9
  logger,
10
  clean_str,
@@ -611,15 +610,22 @@ async def kg_query(
611
  logger.warning("low_level_keywords and high_level_keywords is empty")
612
  return PROMPTS["fail_response"]
613
  if ll_keywords == [] and query_param.mode in ["local", "hybrid"]:
614
- logger.warning("low_level_keywords is empty")
615
- return PROMPTS["fail_response"]
616
- else:
617
- ll_keywords = ", ".join(ll_keywords)
 
618
  if hl_keywords == [] and query_param.mode in ["global", "hybrid"]:
619
- logger.warning("high_level_keywords is empty")
620
- return PROMPTS["fail_response"]
621
- else:
622
- hl_keywords = ", ".join(hl_keywords)
 
 
 
 
 
 
623
 
624
  # Build context
625
  keywords = [ll_keywords, hl_keywords]
@@ -685,77 +691,51 @@ async def _build_query_context(
685
  # ll_entities_context, ll_relations_context, ll_text_units_context = "", "", ""
686
  # hl_entities_context, hl_relations_context, hl_text_units_context = "", "", ""
687
 
688
- ll_kewwords, hl_keywrds = query[0], query[1]
689
- if query_param.mode in ["local", "hybrid"]:
690
- if ll_kewwords == "":
691
- ll_entities_context, ll_relations_context, ll_text_units_context = (
692
- "",
693
- "",
694
- "",
695
- )
696
- warnings.warn(
697
- "Low Level context is None. Return empty Low entity/relationship/source"
698
- )
699
- query_param.mode = "global"
700
- else:
701
- (
702
- ll_entities_context,
703
- ll_relations_context,
704
- ll_text_units_context,
705
- ) = await _get_node_data(
706
- ll_kewwords,
707
- knowledge_graph_inst,
708
- entities_vdb,
709
- text_chunks_db,
710
- query_param,
711
- )
712
- if query_param.mode in ["global", "hybrid"]:
713
- if hl_keywrds == "":
714
- hl_entities_context, hl_relations_context, hl_text_units_context = (
715
- "",
716
- "",
717
- "",
718
- )
719
- warnings.warn(
720
- "High Level context is None. Return empty High entity/relationship/source"
721
- )
722
- query_param.mode = "local"
723
- else:
724
- (
725
- hl_entities_context,
726
- hl_relations_context,
727
- hl_text_units_context,
728
- ) = await _get_edge_data(
729
- hl_keywrds,
730
- knowledge_graph_inst,
731
- relationships_vdb,
732
- text_chunks_db,
733
- query_param,
734
- )
735
- if (
736
- hl_entities_context == ""
737
- and hl_relations_context == ""
738
- and hl_text_units_context == ""
739
- ):
740
- logger.warn("No high level context found. Switching to local mode.")
741
- query_param.mode = "local"
742
- if query_param.mode == "hybrid":
743
- entities_context, relations_context, text_units_context = combine_contexts(
744
- [hl_entities_context, ll_entities_context],
745
- [hl_relations_context, ll_relations_context],
746
- [hl_text_units_context, ll_text_units_context],
747
  )
748
- elif query_param.mode == "local":
749
- entities_context, relations_context, text_units_context = (
 
 
 
 
 
 
 
 
750
  ll_entities_context,
751
  ll_relations_context,
752
  ll_text_units_context,
 
 
 
 
 
 
753
  )
754
- elif query_param.mode == "global":
755
- entities_context, relations_context, text_units_context = (
756
  hl_entities_context,
757
  hl_relations_context,
758
  hl_text_units_context,
 
 
 
 
 
 
 
 
 
 
 
759
  )
760
  return f"""
761
  -----Entities-----
 
4
  from tqdm.asyncio import tqdm as tqdm_async
5
  from typing import Union
6
  from collections import Counter, defaultdict
 
7
  from .utils import (
8
  logger,
9
  clean_str,
 
610
  logger.warning("low_level_keywords and high_level_keywords is empty")
611
  return PROMPTS["fail_response"]
612
  if ll_keywords == [] and query_param.mode in ["local", "hybrid"]:
613
+ logger.warning(
614
+ "low_level_keywords is empty, switching from %s mode to global mode",
615
+ query_param.mode,
616
+ )
617
+ query_param.mode = "global"
618
  if hl_keywords == [] and query_param.mode in ["global", "hybrid"]:
619
+ logger.warning(
620
+ "high_level_keywords is empty, switching from %s mode to local mode",
621
+ query_param.mode,
622
+ )
623
+ query_param.mode = "local"
624
+
625
+ ll_keywords = ", ".join(ll_keywords) if ll_keywords else ""
626
+ hl_keywords = ", ".join(hl_keywords) if hl_keywords else ""
627
+
628
+ logger.info("Using %s mode for query processing", query_param.mode)
629
 
630
  # Build context
631
  keywords = [ll_keywords, hl_keywords]
 
691
  # ll_entities_context, ll_relations_context, ll_text_units_context = "", "", ""
692
  # hl_entities_context, hl_relations_context, hl_text_units_context = "", "", ""
693
 
694
+ ll_keywords, hl_keywords = query[0], query[1]
695
+
696
+ if query_param.mode == "local":
697
+ entities_context, relations_context, text_units_context = await _get_node_data(
698
+ ll_keywords,
699
+ knowledge_graph_inst,
700
+ entities_vdb,
701
+ text_chunks_db,
702
+ query_param,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
703
  )
704
+ elif query_param.mode == "global":
705
+ entities_context, relations_context, text_units_context = await _get_edge_data(
706
+ hl_keywords,
707
+ knowledge_graph_inst,
708
+ relationships_vdb,
709
+ text_chunks_db,
710
+ query_param,
711
+ )
712
+ else: # hybrid mode
713
+ (
714
  ll_entities_context,
715
  ll_relations_context,
716
  ll_text_units_context,
717
+ ) = await _get_node_data(
718
+ ll_keywords,
719
+ knowledge_graph_inst,
720
+ entities_vdb,
721
+ text_chunks_db,
722
+ query_param,
723
  )
724
+ (
 
725
  hl_entities_context,
726
  hl_relations_context,
727
  hl_text_units_context,
728
+ ) = await _get_edge_data(
729
+ hl_keywords,
730
+ knowledge_graph_inst,
731
+ relationships_vdb,
732
+ text_chunks_db,
733
+ query_param,
734
+ )
735
+ entities_context, relations_context, text_units_context = combine_contexts(
736
+ [hl_entities_context, ll_entities_context],
737
+ [hl_relations_context, ll_relations_context],
738
+ [hl_text_units_context, ll_text_units_context],
739
  )
740
  return f"""
741
  -----Entities-----