feat: 新增ini文件读取数据库配置方式,方便生产环境,修改Lightrag ainsert方法_add_doc_keys获取方式,原来只过滤存在的,但这会让失败的文档无法再次存储,新增--chunk_size和--chunk_overlap_size方便生产环境,新增llm_binding:openai-ollama 方便用openai的同时使用ollama embedding
Browse files- config.ini +13 -0
- lightrag/api/lightrag_server.py +79 -4
- lightrag/lightrag.py +9 -3
- lightrag/storage.py +3 -0
- requirements.txt +7 -0
config.ini
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[redis]
|
| 2 |
+
uri = redis://localhost:6379
|
| 3 |
+
|
| 4 |
+
[neo4j]
|
| 5 |
+
uri = #
|
| 6 |
+
username = neo4j
|
| 7 |
+
password = 12345678
|
| 8 |
+
|
| 9 |
+
[milvus]
|
| 10 |
+
uri = #
|
| 11 |
+
user = root
|
| 12 |
+
password = Milvus
|
| 13 |
+
db_name = lightrag
|
lightrag/api/lightrag_server.py
CHANGED
|
@@ -20,6 +20,7 @@ import shutil
|
|
| 20 |
import aiofiles
|
| 21 |
from ascii_colors import trace_exception, ASCIIColors
|
| 22 |
import os
|
|
|
|
| 23 |
|
| 24 |
from fastapi import Depends, Security
|
| 25 |
from fastapi.security import APIKeyHeader
|
|
@@ -57,6 +58,52 @@ LIGHTRAG_SIZE = 7365960935 # it's a dummy value
|
|
| 57 |
LIGHTRAG_CREATED_AT = "2024-01-15T00:00:00Z"
|
| 58 |
LIGHTRAG_DIGEST = "sha256:lightrag"
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
def get_default_host(binding_type: str) -> str:
|
| 62 |
default_hosts = {
|
|
@@ -337,6 +384,18 @@ def parse_args() -> argparse.Namespace:
|
|
| 337 |
help="Embedding model name (default: from env or bge-m3:latest)",
|
| 338 |
)
|
| 339 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
def timeout_type(value):
|
| 341 |
if value is None or value == "None":
|
| 342 |
return None
|
|
@@ -551,7 +610,7 @@ def get_api_key_dependency(api_key: Optional[str]):
|
|
| 551 |
|
| 552 |
def create_app(args):
|
| 553 |
# Verify that bindings are correctly set up
|
| 554 |
-
if args.llm_binding not in ["lollms", "ollama", "openai"]:
|
| 555 |
raise Exception("llm binding not supported")
|
| 556 |
|
| 557 |
if args.embedding_binding not in ["lollms", "ollama", "openai"]:
|
|
@@ -692,22 +751,32 @@ def create_app(args):
|
|
| 692 |
)
|
| 693 |
|
| 694 |
# Initialize RAG
|
| 695 |
-
if args.llm_binding in ["lollms", "ollama"]:
|
| 696 |
rag = LightRAG(
|
| 697 |
working_dir=args.working_dir,
|
| 698 |
llm_model_func=lollms_model_complete
|
| 699 |
if args.llm_binding == "lollms"
|
| 700 |
-
else ollama_model_complete
|
|
|
|
|
|
|
| 701 |
llm_model_name=args.llm_model,
|
| 702 |
llm_model_max_async=args.max_async,
|
| 703 |
llm_model_max_token_size=args.max_tokens,
|
|
|
|
|
|
|
| 704 |
llm_model_kwargs={
|
| 705 |
"host": args.llm_binding_host,
|
| 706 |
"timeout": args.timeout,
|
| 707 |
"options": {"num_ctx": args.max_tokens},
|
| 708 |
"api_key": args.llm_binding_api_key,
|
| 709 |
-
}
|
|
|
|
|
|
|
| 710 |
embedding_func=embedding_func,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 711 |
)
|
| 712 |
else:
|
| 713 |
rag = LightRAG(
|
|
@@ -715,7 +784,13 @@ def create_app(args):
|
|
| 715 |
llm_model_func=azure_openai_model_complete
|
| 716 |
if args.llm_binding == "azure_openai"
|
| 717 |
else openai_alike_model_complete,
|
|
|
|
|
|
|
| 718 |
embedding_func=embedding_func,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 719 |
)
|
| 720 |
|
| 721 |
async def index_file(file_path: Union[str, Path]) -> None:
|
|
|
|
| 20 |
import aiofiles
|
| 21 |
from ascii_colors import trace_exception, ASCIIColors
|
| 22 |
import os
|
| 23 |
+
import configparser
|
| 24 |
|
| 25 |
from fastapi import Depends, Security
|
| 26 |
from fastapi.security import APIKeyHeader
|
|
|
|
| 58 |
LIGHTRAG_CREATED_AT = "2024-01-15T00:00:00Z"
|
| 59 |
LIGHTRAG_DIGEST = "sha256:lightrag"
|
| 60 |
|
| 61 |
+
KV_STORAGE = "JsonKVStorage"
|
| 62 |
+
DOC_STATUS_STORAGE = "JsonDocStatusStorage"
|
| 63 |
+
GRAPH_STORAGE = "NetworkXStorage"
|
| 64 |
+
VECTOR_STORAGE = "NanoVectorDBStorage"
|
| 65 |
+
|
| 66 |
+
# 读取配置文件
|
| 67 |
+
config = configparser.ConfigParser()
|
| 68 |
+
config.read("config.ini")
|
| 69 |
+
# Redis 配置
|
| 70 |
+
redis_uri = config.get("redis", "uri", fallback=None)
|
| 71 |
+
if redis_uri:
|
| 72 |
+
os.environ["REDIS_URI"] = redis_uri
|
| 73 |
+
KV_STORAGE = "RedisKVStorage"
|
| 74 |
+
DOC_STATUS_STORAGE = "RedisKVStorage"
|
| 75 |
+
|
| 76 |
+
# Neo4j 配置
|
| 77 |
+
neo4j_uri = config.get("neo4j", "uri", fallback=None)
|
| 78 |
+
neo4j_username = config.get("neo4j", "username", fallback=None)
|
| 79 |
+
neo4j_password = config.get("neo4j", "password", fallback=None)
|
| 80 |
+
if neo4j_uri:
|
| 81 |
+
os.environ["NEO4J_URI"] = neo4j_uri
|
| 82 |
+
os.environ["NEO4J_USERNAME"] = neo4j_username
|
| 83 |
+
os.environ["NEO4J_PASSWORD"] = neo4j_password
|
| 84 |
+
GRAPH_STORAGE = "Neo4JStorage"
|
| 85 |
+
|
| 86 |
+
# Milvus 配置
|
| 87 |
+
milvus_uri = config.get("milvus", "uri", fallback=None)
|
| 88 |
+
milvus_user = config.get("milvus", "user", fallback=None)
|
| 89 |
+
milvus_password = config.get("milvus", "password", fallback=None)
|
| 90 |
+
milvus_db_name = config.get("milvus", "db_name", fallback=None)
|
| 91 |
+
if milvus_uri:
|
| 92 |
+
os.environ["MILVUS_URI"] = milvus_uri
|
| 93 |
+
os.environ["MILVUS_USER"] = milvus_user
|
| 94 |
+
os.environ["MILVUS_PASSWORD"] = milvus_password
|
| 95 |
+
os.environ["MILVUS_DB_NAME"] = milvus_db_name
|
| 96 |
+
VECTOR_STORAGE = "MilvusVectorDBStorge"
|
| 97 |
+
|
| 98 |
+
# MongoDB 配置
|
| 99 |
+
mongo_uri = config.get("mongodb", "uri", fallback=None)
|
| 100 |
+
mongo_database = config.get("mongodb", "LightRAG", fallback=None)
|
| 101 |
+
if mongo_uri:
|
| 102 |
+
os.environ["MONGO_URI"] = mongo_uri
|
| 103 |
+
os.environ["MONGO_DATABASE"] = mongo_database
|
| 104 |
+
KV_STORAGE = "MongoKVStorage"
|
| 105 |
+
DOC_STATUS_STORAGE = "MongoKVStorage"
|
| 106 |
+
|
| 107 |
|
| 108 |
def get_default_host(binding_type: str) -> str:
|
| 109 |
default_hosts = {
|
|
|
|
| 384 |
help="Embedding model name (default: from env or bge-m3:latest)",
|
| 385 |
)
|
| 386 |
|
| 387 |
+
parser.add_argument(
|
| 388 |
+
"--chunk_size",
|
| 389 |
+
default=1200,
|
| 390 |
+
help="chunk token size default 1200",
|
| 391 |
+
)
|
| 392 |
+
|
| 393 |
+
parser.add_argument(
|
| 394 |
+
"--chunk_overlap_size",
|
| 395 |
+
default=100,
|
| 396 |
+
help="chunk overlap token size default 100",
|
| 397 |
+
)
|
| 398 |
+
|
| 399 |
def timeout_type(value):
|
| 400 |
if value is None or value == "None":
|
| 401 |
return None
|
|
|
|
| 610 |
|
| 611 |
def create_app(args):
|
| 612 |
# Verify that bindings are correctly set up
|
| 613 |
+
if args.llm_binding not in ["lollms", "ollama", "openai", "openai-ollama"]:
|
| 614 |
raise Exception("llm binding not supported")
|
| 615 |
|
| 616 |
if args.embedding_binding not in ["lollms", "ollama", "openai"]:
|
|
|
|
| 751 |
)
|
| 752 |
|
| 753 |
# Initialize RAG
|
| 754 |
+
if args.llm_binding in ["lollms", "ollama", "openai-ollama"]:
|
| 755 |
rag = LightRAG(
|
| 756 |
working_dir=args.working_dir,
|
| 757 |
llm_model_func=lollms_model_complete
|
| 758 |
if args.llm_binding == "lollms"
|
| 759 |
+
else ollama_model_complete
|
| 760 |
+
if args.llm_binding == "ollama"
|
| 761 |
+
else openai_alike_model_complete,
|
| 762 |
llm_model_name=args.llm_model,
|
| 763 |
llm_model_max_async=args.max_async,
|
| 764 |
llm_model_max_token_size=args.max_tokens,
|
| 765 |
+
chunk_token_size=int(args.chunk_size),
|
| 766 |
+
chunk_overlap_token_size=int(args.chunk_overlap_size),
|
| 767 |
llm_model_kwargs={
|
| 768 |
"host": args.llm_binding_host,
|
| 769 |
"timeout": args.timeout,
|
| 770 |
"options": {"num_ctx": args.max_tokens},
|
| 771 |
"api_key": args.llm_binding_api_key,
|
| 772 |
+
}
|
| 773 |
+
if args.llm_binding == "lollms" or args.llm_binding == "ollama"
|
| 774 |
+
else {},
|
| 775 |
embedding_func=embedding_func,
|
| 776 |
+
kv_storage=KV_STORAGE,
|
| 777 |
+
graph_storage=GRAPH_STORAGE,
|
| 778 |
+
vector_storage=VECTOR_STORAGE,
|
| 779 |
+
doc_status_storage=DOC_STATUS_STORAGE,
|
| 780 |
)
|
| 781 |
else:
|
| 782 |
rag = LightRAG(
|
|
|
|
| 784 |
llm_model_func=azure_openai_model_complete
|
| 785 |
if args.llm_binding == "azure_openai"
|
| 786 |
else openai_alike_model_complete,
|
| 787 |
+
chunk_token_size=int(args.chunk_size),
|
| 788 |
+
chunk_overlap_token_size=int(args.chunk_overlap_size),
|
| 789 |
embedding_func=embedding_func,
|
| 790 |
+
kv_storage=KV_STORAGE,
|
| 791 |
+
graph_storage=GRAPH_STORAGE,
|
| 792 |
+
vector_storage=VECTOR_STORAGE,
|
| 793 |
+
doc_status_storage=DOC_STATUS_STORAGE,
|
| 794 |
)
|
| 795 |
|
| 796 |
async def index_file(file_path: Union[str, Path]) -> None:
|
lightrag/lightrag.py
CHANGED
|
@@ -361,7 +361,13 @@ class LightRAG:
|
|
| 361 |
}
|
| 362 |
|
| 363 |
# 3. Filter out already processed documents
|
| 364 |
-
_add_doc_keys = await self.doc_status.filter_keys(list(new_docs.keys()))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
|
| 366 |
|
| 367 |
if not new_docs:
|
|
@@ -573,7 +579,7 @@ class LightRAG:
|
|
| 573 |
_not_stored_doc_keys = await self.full_docs.filter_keys(list(new_docs.keys()))
|
| 574 |
if len(_not_stored_doc_keys) < len(new_docs):
|
| 575 |
logger.info(
|
| 576 |
-
f"Skipping {len(new_docs)-len(_not_stored_doc_keys)} already existing documents"
|
| 577 |
)
|
| 578 |
new_docs = {k: v for k, v in new_docs.items() if k in _not_stored_doc_keys}
|
| 579 |
|
|
@@ -618,7 +624,7 @@ class LightRAG:
|
|
| 618 |
batch_docs = dict(list(new_docs.items())[i : i + batch_size])
|
| 619 |
for doc_id, doc in tqdm_async(
|
| 620 |
batch_docs.items(),
|
| 621 |
-
desc=f"Level 1 - Spliting doc in batch {i//batch_size + 1}",
|
| 622 |
):
|
| 623 |
try:
|
| 624 |
# Generate chunks from document
|
|
|
|
| 361 |
}
|
| 362 |
|
| 363 |
# 3. Filter out already processed documents
|
| 364 |
+
# _add_doc_keys = await self.doc_status.filter_keys(list(new_docs.keys()))
|
| 365 |
+
_add_doc_keys = {
|
| 366 |
+
doc_id
|
| 367 |
+
for doc_id in new_docs.keys()
|
| 368 |
+
if (current_doc := await self.doc_status.get_by_id(doc_id)) is None
|
| 369 |
+
or current_doc["status"] == DocStatus.FAILED
|
| 370 |
+
}
|
| 371 |
new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
|
| 372 |
|
| 373 |
if not new_docs:
|
|
|
|
| 579 |
_not_stored_doc_keys = await self.full_docs.filter_keys(list(new_docs.keys()))
|
| 580 |
if len(_not_stored_doc_keys) < len(new_docs):
|
| 581 |
logger.info(
|
| 582 |
+
f"Skipping {len(new_docs) - len(_not_stored_doc_keys)} already existing documents"
|
| 583 |
)
|
| 584 |
new_docs = {k: v for k, v in new_docs.items() if k in _not_stored_doc_keys}
|
| 585 |
|
|
|
|
| 624 |
batch_docs = dict(list(new_docs.items())[i : i + batch_size])
|
| 625 |
for doc_id, doc in tqdm_async(
|
| 626 |
batch_docs.items(),
|
| 627 |
+
desc=f"Level 1 - Splitting doc in batch {i // batch_size + 1}",
|
| 628 |
):
|
| 629 |
try:
|
| 630 |
# Generate chunks from document
|
lightrag/storage.py
CHANGED
|
@@ -445,6 +445,9 @@ class JsonDocStatusStorage(DocStatusStorage):
|
|
| 445 |
await self.index_done_callback()
|
| 446 |
return data
|
| 447 |
|
|
|
|
|
|
|
|
|
|
| 448 |
async def get(self, doc_id: str) -> Union[DocProcessingStatus, None]:
|
| 449 |
"""Get document status by ID"""
|
| 450 |
return self._data.get(doc_id)
|
|
|
|
| 445 |
await self.index_done_callback()
|
| 446 |
return data
|
| 447 |
|
| 448 |
+
async def get_by_id(self, id: str):
|
| 449 |
+
return self._data.get(id)
|
| 450 |
+
|
| 451 |
async def get(self, doc_id: str) -> Union[DocProcessingStatus, None]:
|
| 452 |
"""Get document status by ID"""
|
| 453 |
return self._data.get(doc_id)
|
requirements.txt
CHANGED
|
@@ -4,6 +4,7 @@ aiofiles
|
|
| 4 |
aiohttp
|
| 5 |
aioredis
|
| 6 |
asyncpg
|
|
|
|
| 7 |
|
| 8 |
# database packages
|
| 9 |
graspologic
|
|
@@ -17,13 +18,19 @@ numpy
|
|
| 17 |
ollama
|
| 18 |
openai
|
| 19 |
oracledb
|
|
|
|
| 20 |
psycopg-pool
|
| 21 |
psycopg[binary,pool]
|
| 22 |
pydantic
|
| 23 |
pymilvus
|
| 24 |
pymongo
|
| 25 |
pymysql
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
python-dotenv
|
|
|
|
| 27 |
pyvis
|
| 28 |
setuptools
|
| 29 |
# lmdeploy[all]
|
|
|
|
| 4 |
aiohttp
|
| 5 |
aioredis
|
| 6 |
asyncpg
|
| 7 |
+
configparser
|
| 8 |
|
| 9 |
# database packages
|
| 10 |
graspologic
|
|
|
|
| 18 |
ollama
|
| 19 |
openai
|
| 20 |
oracledb
|
| 21 |
+
pipmaster
|
| 22 |
psycopg-pool
|
| 23 |
psycopg[binary,pool]
|
| 24 |
pydantic
|
| 25 |
pymilvus
|
| 26 |
pymongo
|
| 27 |
pymysql
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
PyPDF2
|
| 31 |
+
python-docx
|
| 32 |
python-dotenv
|
| 33 |
+
python-pptx
|
| 34 |
pyvis
|
| 35 |
setuptools
|
| 36 |
# lmdeploy[all]
|