hyb182 committed on
Commit
cb17442
·
1 Parent(s): 390c21a

feat: 新增ini文件读取数据库配置方式,方便生产环境,修改Lightrag ainsert方法_add_doc_keys获取方式,原来只过滤存在的,但这会让失败的文档无法再次存储,新增--chunk_size和--chunk_overlap_size方便生产环境,新增llm_binding:openai-ollama 方便用openai的同时使用ollama embedding

Browse files
config.ini ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [redis]
2
+ uri = redis://localhost:6379
3
+
4
+ [neo4j]
5
+ uri = #
6
+ username = neo4j
7
+ password = 12345678
8
+
9
+ [milvus]
10
+ uri = #
11
+ user = root
12
+ password = Milvus
13
+ db_name = lightrag
lightrag/api/lightrag_server.py CHANGED
@@ -20,6 +20,7 @@ import shutil
20
  import aiofiles
21
  from ascii_colors import trace_exception, ASCIIColors
22
  import os
 
23
 
24
  from fastapi import Depends, Security
25
  from fastapi.security import APIKeyHeader
@@ -57,6 +58,52 @@ LIGHTRAG_SIZE = 7365960935 # it's a dummy value
57
  LIGHTRAG_CREATED_AT = "2024-01-15T00:00:00Z"
58
  LIGHTRAG_DIGEST = "sha256:lightrag"
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  def get_default_host(binding_type: str) -> str:
62
  default_hosts = {
@@ -337,6 +384,18 @@ def parse_args() -> argparse.Namespace:
337
  help="Embedding model name (default: from env or bge-m3:latest)",
338
  )
339
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  def timeout_type(value):
341
  if value is None or value == "None":
342
  return None
@@ -551,7 +610,7 @@ def get_api_key_dependency(api_key: Optional[str]):
551
 
552
  def create_app(args):
553
  # Verify that bindings are correctly set up
554
- if args.llm_binding not in ["lollms", "ollama", "openai"]:
555
  raise Exception("llm binding not supported")
556
 
557
  if args.embedding_binding not in ["lollms", "ollama", "openai"]:
@@ -692,22 +751,32 @@ def create_app(args):
692
  )
693
 
694
  # Initialize RAG
695
- if args.llm_binding in ["lollms", "ollama"]:
696
  rag = LightRAG(
697
  working_dir=args.working_dir,
698
  llm_model_func=lollms_model_complete
699
  if args.llm_binding == "lollms"
700
- else ollama_model_complete,
 
 
701
  llm_model_name=args.llm_model,
702
  llm_model_max_async=args.max_async,
703
  llm_model_max_token_size=args.max_tokens,
 
 
704
  llm_model_kwargs={
705
  "host": args.llm_binding_host,
706
  "timeout": args.timeout,
707
  "options": {"num_ctx": args.max_tokens},
708
  "api_key": args.llm_binding_api_key,
709
- },
 
 
710
  embedding_func=embedding_func,
 
 
 
 
711
  )
712
  else:
713
  rag = LightRAG(
@@ -715,7 +784,13 @@ def create_app(args):
715
  llm_model_func=azure_openai_model_complete
716
  if args.llm_binding == "azure_openai"
717
  else openai_alike_model_complete,
 
 
718
  embedding_func=embedding_func,
 
 
 
 
719
  )
720
 
721
  async def index_file(file_path: Union[str, Path]) -> None:
 
20
  import aiofiles
21
  from ascii_colors import trace_exception, ASCIIColors
22
  import os
23
+ import configparser
24
 
25
  from fastapi import Depends, Security
26
  from fastapi.security import APIKeyHeader
 
58
  LIGHTRAG_CREATED_AT = "2024-01-15T00:00:00Z"
59
  LIGHTRAG_DIGEST = "sha256:lightrag"
60
 
61
+ KV_STORAGE = "JsonKVStorage"
62
+ DOC_STATUS_STORAGE = "JsonDocStatusStorage"
63
+ GRAPH_STORAGE = "NetworkXStorage"
64
+ VECTOR_STORAGE = "NanoVectorDBStorage"
65
+
66
+ # 读取配置文件
67
+ config = configparser.ConfigParser()
68
+ config.read("config.ini")
69
+ # Redis 配置
70
+ redis_uri = config.get("redis", "uri", fallback=None)
71
+ if redis_uri:
72
+ os.environ["REDIS_URI"] = redis_uri
73
+ KV_STORAGE = "RedisKVStorage"
74
+ DOC_STATUS_STORAGE = "RedisKVStorage"
75
+
76
+ # Neo4j 配置
77
+ neo4j_uri = config.get("neo4j", "uri", fallback=None)
78
+ neo4j_username = config.get("neo4j", "username", fallback=None)
79
+ neo4j_password = config.get("neo4j", "password", fallback=None)
80
+ if neo4j_uri:
81
+ os.environ["NEO4J_URI"] = neo4j_uri
82
+ os.environ["NEO4J_USERNAME"] = neo4j_username
83
+ os.environ["NEO4J_PASSWORD"] = neo4j_password
84
+ GRAPH_STORAGE = "Neo4JStorage"
85
+
86
+ # Milvus 配置
87
+ milvus_uri = config.get("milvus", "uri", fallback=None)
88
+ milvus_user = config.get("milvus", "user", fallback=None)
89
+ milvus_password = config.get("milvus", "password", fallback=None)
90
+ milvus_db_name = config.get("milvus", "db_name", fallback=None)
91
+ if milvus_uri:
92
+ os.environ["MILVUS_URI"] = milvus_uri
93
+ os.environ["MILVUS_USER"] = milvus_user
94
+ os.environ["MILVUS_PASSWORD"] = milvus_password
95
+ os.environ["MILVUS_DB_NAME"] = milvus_db_name
96
+ VECTOR_STORAGE = "MilvusVectorDBStorge"
97
+
98
+ # MongoDB 配置
99
+ mongo_uri = config.get("mongodb", "uri", fallback=None)
100
+ mongo_database = config.get("mongodb", "LightRAG", fallback=None)
101
+ if mongo_uri:
102
+ os.environ["MONGO_URI"] = mongo_uri
103
+ os.environ["MONGO_DATABASE"] = mongo_database
104
+ KV_STORAGE = "MongoKVStorage"
105
+ DOC_STATUS_STORAGE = "MongoKVStorage"
106
+
107
 
108
  def get_default_host(binding_type: str) -> str:
109
  default_hosts = {
 
384
  help="Embedding model name (default: from env or bge-m3:latest)",
385
  )
386
 
387
+ parser.add_argument(
388
+ "--chunk_size",
389
+ default=1200,
390
+ help="chunk token size default 1200",
391
+ )
392
+
393
+ parser.add_argument(
394
+ "--chunk_overlap_size",
395
+ default=100,
396
+ help="chunk overlap token size default 100",
397
+ )
398
+
399
  def timeout_type(value):
400
  if value is None or value == "None":
401
  return None
 
610
 
611
  def create_app(args):
612
  # Verify that bindings are correctly set up
613
+ if args.llm_binding not in ["lollms", "ollama", "openai", "openai-ollama"]:
614
  raise Exception("llm binding not supported")
615
 
616
  if args.embedding_binding not in ["lollms", "ollama", "openai"]:
 
751
  )
752
 
753
  # Initialize RAG
754
+ if args.llm_binding in ["lollms", "ollama", "openai-ollama"]:
755
  rag = LightRAG(
756
  working_dir=args.working_dir,
757
  llm_model_func=lollms_model_complete
758
  if args.llm_binding == "lollms"
759
+ else ollama_model_complete
760
+ if args.llm_binding == "ollama"
761
+ else openai_alike_model_complete,
762
  llm_model_name=args.llm_model,
763
  llm_model_max_async=args.max_async,
764
  llm_model_max_token_size=args.max_tokens,
765
+ chunk_token_size=int(args.chunk_size),
766
+ chunk_overlap_token_size=int(args.chunk_overlap_size),
767
  llm_model_kwargs={
768
  "host": args.llm_binding_host,
769
  "timeout": args.timeout,
770
  "options": {"num_ctx": args.max_tokens},
771
  "api_key": args.llm_binding_api_key,
772
+ }
773
+ if args.llm_binding == "lollms" or args.llm_binding == "ollama"
774
+ else {},
775
  embedding_func=embedding_func,
776
+ kv_storage=KV_STORAGE,
777
+ graph_storage=GRAPH_STORAGE,
778
+ vector_storage=VECTOR_STORAGE,
779
+ doc_status_storage=DOC_STATUS_STORAGE,
780
  )
781
  else:
782
  rag = LightRAG(
 
784
  llm_model_func=azure_openai_model_complete
785
  if args.llm_binding == "azure_openai"
786
  else openai_alike_model_complete,
787
+ chunk_token_size=int(args.chunk_size),
788
+ chunk_overlap_token_size=int(args.chunk_overlap_size),
789
  embedding_func=embedding_func,
790
+ kv_storage=KV_STORAGE,
791
+ graph_storage=GRAPH_STORAGE,
792
+ vector_storage=VECTOR_STORAGE,
793
+ doc_status_storage=DOC_STATUS_STORAGE,
794
  )
795
 
796
  async def index_file(file_path: Union[str, Path]) -> None:
lightrag/lightrag.py CHANGED
@@ -361,7 +361,13 @@ class LightRAG:
361
  }
362
 
363
  # 3. Filter out already processed documents
364
- _add_doc_keys = await self.doc_status.filter_keys(list(new_docs.keys()))
 
 
 
 
 
 
365
  new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
366
 
367
  if not new_docs:
@@ -573,7 +579,7 @@ class LightRAG:
573
  _not_stored_doc_keys = await self.full_docs.filter_keys(list(new_docs.keys()))
574
  if len(_not_stored_doc_keys) < len(new_docs):
575
  logger.info(
576
- f"Skipping {len(new_docs)-len(_not_stored_doc_keys)} already existing documents"
577
  )
578
  new_docs = {k: v for k, v in new_docs.items() if k in _not_stored_doc_keys}
579
 
@@ -618,7 +624,7 @@ class LightRAG:
618
  batch_docs = dict(list(new_docs.items())[i : i + batch_size])
619
  for doc_id, doc in tqdm_async(
620
  batch_docs.items(),
621
- desc=f"Level 1 - Spliting doc in batch {i//batch_size + 1}",
622
  ):
623
  try:
624
  # Generate chunks from document
 
361
  }
362
 
363
  # 3. Filter out already processed documents
364
+ # _add_doc_keys = await self.doc_status.filter_keys(list(new_docs.keys()))
365
+ _add_doc_keys = {
366
+ doc_id
367
+ for doc_id in new_docs.keys()
368
+ if (current_doc := await self.doc_status.get_by_id(doc_id)) is None
369
+ or current_doc["status"] == DocStatus.FAILED
370
+ }
371
  new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
372
 
373
  if not new_docs:
 
579
  _not_stored_doc_keys = await self.full_docs.filter_keys(list(new_docs.keys()))
580
  if len(_not_stored_doc_keys) < len(new_docs):
581
  logger.info(
582
+ f"Skipping {len(new_docs) - len(_not_stored_doc_keys)} already existing documents"
583
  )
584
  new_docs = {k: v for k, v in new_docs.items() if k in _not_stored_doc_keys}
585
 
 
624
  batch_docs = dict(list(new_docs.items())[i : i + batch_size])
625
  for doc_id, doc in tqdm_async(
626
  batch_docs.items(),
627
+ desc=f"Level 1 - Splitting doc in batch {i // batch_size + 1}",
628
  ):
629
  try:
630
  # Generate chunks from document
lightrag/storage.py CHANGED
@@ -445,6 +445,9 @@ class JsonDocStatusStorage(DocStatusStorage):
445
  await self.index_done_callback()
446
  return data
447
 
 
 
 
448
  async def get(self, doc_id: str) -> Union[DocProcessingStatus, None]:
449
  """Get document status by ID"""
450
  return self._data.get(doc_id)
 
445
  await self.index_done_callback()
446
  return data
447
 
448
+ async def get_by_id(self, id: str):
449
+ return self._data.get(id)
450
+
451
  async def get(self, doc_id: str) -> Union[DocProcessingStatus, None]:
452
  """Get document status by ID"""
453
  return self._data.get(doc_id)
requirements.txt CHANGED
@@ -4,6 +4,7 @@ aiofiles
4
  aiohttp
5
  aioredis
6
  asyncpg
 
7
 
8
  # database packages
9
  graspologic
@@ -17,13 +18,19 @@ numpy
17
  ollama
18
  openai
19
  oracledb
 
20
  psycopg-pool
21
  psycopg[binary,pool]
22
  pydantic
23
  pymilvus
24
  pymongo
25
  pymysql
 
 
 
 
26
  python-dotenv
 
27
  pyvis
28
  setuptools
29
  # lmdeploy[all]
 
4
  aiohttp
5
  aioredis
6
  asyncpg
7
+ configparser
8
 
9
  # database packages
10
  graspologic
 
18
  ollama
19
  openai
20
  oracledb
21
+ pipmaster
22
  psycopg-pool
23
  psycopg[binary,pool]
24
  pydantic
25
  pymilvus
26
  pymongo
27
  pymysql
28
+
29
+
30
+ PyPDF2
31
+ python-docx
32
  python-dotenv
33
+ python-pptx
34
  pyvis
35
  setuptools
36
  # lmdeploy[all]