thanhtung09t2 committed on
Commit
bcc5b22
·
verified ·
1 Parent(s): 0859f1c

Upload vector_index

Browse files
api/vector_index/base.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Intermediary layer that handles creating a VectorIndex from a database
2
+ # Nhận vào các tham số: loại db, tên embedding_model
3
+ from api.vector_index import chroma, milvus
4
+
5
def get_vector_index(db_name, force_new=False):
    """Build the vector index for the requested database backend.

    Args:
        db_name: Backend identifier; ``"chroma"`` and ``"milvus"`` are the
            values this function recognises.
        force_new: Passed through unchanged to the selected backend builder.

    Returns:
        Whatever the selected backend's builder returns.

    Raises:
        NotImplementedError: If ``db_name`` is not one of the recognised
            backends.
    """
    # Pick the builder first, then call it once at the end.
    if db_name == "chroma":
        builder = chroma.ChromaVectorIndex
    elif db_name == "milvus":
        builder = milvus.MilvusVectorIndex
    else:
        raise NotImplementedError("This type of index is not yet supported")
    return builder(force_new)
api/vector_index/chroma.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+ import os
3
+ import shutil
4
+
5
+ from llama_index.core import Settings, SimpleDirectoryReader, StorageContext, VectorStoreIndex
6
+ from llama_index.vector_stores.chroma import ChromaVectorStore
7
+ from llama_index.core.node_parser import SemanticSplitterNodeParser
8
+ from llama_index.core.node_parser import SentenceSplitter
9
# Base directory for all on-disk state. ROOT_PATH must be set in the
# environment; a missing variable raises KeyError when this module is imported.
root_path = os.environ["ROOT_PATH"]

# Files that ChromaVectorIndex has already processed are moved here.
DOCUMENTS_DIRECTORY = os.path.join(root_path, "doc", "loaded")
# Files waiting to be read and embedded by ChromaVectorIndex.
INPUT_DIRECTORY = os.path.join(root_path, "doc", "input")
# Storage path handed to chromadb.PersistentClient.
DB_DIRECTORY = os.path.join(root_path, "chromadb")
# Name of the Chroma collection that holds the embeddings.
COLLECTION_NAME = "SmartAgri"
# Collection metadata passed to get_or_create_collection.
DB_METADATA = {"hnsw:space": "cosine"}
16
+
17
def move_files(src, dst):
    """Move every file under ``src`` to the same relative path under ``dst``.

    The sub-directory layout of ``src`` is recreated beneath ``dst``. A file
    whose destination path already exists is not moved: the copy already in
    ``dst`` is kept and the source file stays where it is. Directories in
    ``src`` are left in place (only files are moved). If ``src`` does not
    exist, nothing happens.

    Args:
        src: Root of the directory tree to move files out of.
        dst: Root of the directory tree to move files into. It is created on
            demand, so it does not need to exist beforehand.
    """
    for root, dirs, files in os.walk(src):
        # Recreate the same sub-directory structure inside the destination.
        for dir_name in dirs:
            rel_dir = os.path.relpath(os.path.join(root, dir_name), src)
            os.makedirs(os.path.join(dst, rel_dir), exist_ok=True)

        # Move the files.
        for file_name in files:
            src_file = os.path.join(root, file_name)
            dst_file = os.path.join(dst, os.path.relpath(src_file, src))

            # If the destination file already exists, keep the one that is
            # already in the destination folder.
            if os.path.exists(dst_file):
                continue

            # The loop above only creates directories for sub-folders of src;
            # when src has files at its top level and dst itself is missing,
            # the move would fail without this.
            parent = os.path.dirname(dst_file)
            if parent:
                os.makedirs(parent, exist_ok=True)
            shutil.move(src_file, dst_file)
31
+
32
def ChromaVectorIndex(force_new=False):
    """Build or open the vector index stored in the Chroma collection.

    Files found under ``INPUT_DIRECTORY`` are read, split into chunks,
    embedded with ``Settings.embed_model`` and added to the collection
    ``COLLECTION_NAME``. When there is nothing to read, the index is opened
    over the collection as it already is. Afterwards the input files are moved
    to ``DOCUMENTS_DIRECTORY``.

    Side effect: when documents are embedded, ``Settings.text_splitter`` is
    replaced with the sentence splitter configured below.

    Args:
        force_new: When true, files in ``DOCUMENTS_DIRECTORY`` are moved back
            to ``INPUT_DIRECTORY`` and the collection is dropped first, so
            that everything is embedded again from scratch.

    Returns:
        The ``VectorStoreIndex`` built over the Chroma vector store.
    """
    chroma_client = chromadb.PersistentClient(path=DB_DIRECTORY)
    if force_new:
        # Move everything back to the input directory so that all documents
        # are embedded again from the beginning.
        move_files(DOCUMENTS_DIRECTORY, INPUT_DIRECTORY)
        # delete_collection fails when the collection does not exist (e.g. on
        # the very first run), so make sure it exists before dropping it.
        chroma_client.get_or_create_collection(COLLECTION_NAME,
                                               metadata=DB_METADATA)
        chroma_client.delete_collection(COLLECTION_NAME)  # drop the current db

    try:
        reader = SimpleDirectoryReader(input_dir=INPUT_DIRECTORY, recursive=True)
        documents = reader.load_data()
    except ValueError:
        # Presumably raised when the input directory is missing or holds no
        # files; in that case fall back to the existing collection.
        documents = None

    chroma_collection = chroma_client.get_or_create_collection(COLLECTION_NAME,
                                                               metadata=DB_METADATA)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

    if documents:
        # A SemanticSplitterNodeParser is a possible alternative splitter;
        # the sentence splitter is the one in use.
        Settings.text_splitter = SentenceSplitter(chunk_size=1500, chunk_overlap=500)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(documents,
                                                transformations=[Settings.text_splitter],
                                                storage_context=storage_context,
                                                embed_model=Settings.embed_model)
    else:
        index = VectorStoreIndex.from_vector_store(vector_store=vector_store,
                                                   embed_model=Settings.embed_model)

    # Move everything to the loaded directory only after embedding is done, so
    # a failure above leaves the files in the input directory for a retry.
    move_files(INPUT_DIRECTORY, DOCUMENTS_DIRECTORY)
    return index
api/vector_index/vector_index_config.py ADDED
File without changes