File size: 3,051 Bytes
bcc5b22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f3a3c8
 
 
 
bcc5b22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import chromadb
import os
import shutil

from llama_index.core import Settings, SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore 
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.node_parser import SentenceSplitter
# Project root is taken from the environment; a missing ROOT_PATH raises
# KeyError as soon as this module is imported.
root_path = os.environ["ROOT_PATH"]

# Folder that receives files once ChromaVectorIndex has processed them.
DOCUMENTS_DIRECTORY = os.path.join(root_path, "doc", "loaded")
# Folder that ChromaVectorIndex reads new files from.
INPUT_DIRECTORY = os.path.join(root_path, "doc", "input")
# On-disk location handed to chromadb.PersistentClient.
DB_DIRECTORY = os.path.join(root_path, "chromadb")
# Name and metadata used when the Chroma collection is fetched or created.
COLLECTION_NAME = "SmartAgri"
DB_METADATA = {"hnsw:space": "cosine"}

def move_files(src, dst):
    """Move every file under ``src`` into ``dst``, mirroring the directory layout.

    Args:
        src: Root of the tree to walk. A path that does not exist yields
            nothing from ``os.walk``, so the call is a no-op.
        dst: Root of the destination tree. It and any needed sub-directories
            are created on demand.

    A file whose destination path already exists is left in ``src``; the copy
    already present in ``dst`` is kept. Source directories are never removed.
    """
    for current_dir, subdirs, filenames in os.walk(src):
        rel_dir = os.path.relpath(current_dir, src)
        target_dir = dst if rel_dir == os.curdir else os.path.join(dst, rel_dir)

        # Ensure the directory receiving this level's files exists. Without
        # this, a flat ``src`` (files only, no sub-directories) combined with a
        # missing ``dst`` makes shutil.move fail.
        os.makedirs(target_dir, exist_ok=True)

        # Recreate the same sub-directory structure (including empty
        # directories) inside the destination folder.
        for subdir in subdirs:
            os.makedirs(os.path.join(target_dir, subdir), exist_ok=True)

        # Move the files of this level.
        for filename in filenames:
            src_file = os.path.join(current_dir, filename)
            dst_file = os.path.join(target_dir, filename)

            # If the destination file already exists, keep the one that is
            # already there and leave the source file in place.
            if not os.path.exists(dst_file):
                shutil.move(src_file, dst_file)

def ChromaVectorIndex(force_new=False):
    """Build or open the vector index backed by the persistent Chroma collection.

    Files found under INPUT_DIRECTORY are loaded, split with a
    SentenceSplitter, embedded with ``Settings.embed_model`` and stored in the
    COLLECTION_NAME collection. Once the index has been built the files are
    moved to DOCUMENTS_DIRECTORY. When nothing could be loaded, the index is
    opened from whatever the collection already contains.

    Args:
        force_new: When true, move the files in DOCUMENTS_DIRECTORY back to
            INPUT_DIRECTORY and delete the existing collection so that
            everything is embedded again from scratch.

    Returns:
        The VectorStoreIndex produced by ``from_documents`` (new documents
        were loaded) or ``from_vector_store`` (no documents loaded).

    Side effects:
        Assigns ``Settings.text_splitter`` when documents are indexed, and
        moves files between INPUT_DIRECTORY and DOCUMENTS_DIRECTORY.
    """
    chroma_client = chromadb.PersistentClient(path=DB_DIRECTORY)

    if force_new:
        # Put everything back into the input folder so it is embedded again.
        move_files(DOCUMENTS_DIRECTORY, INPUT_DIRECTORY)
        try:
            chroma_client.delete_collection(COLLECTION_NAME)  # drop the current db
        except Exception:
            # Best effort: presumably the collection does not exist yet, and it
            # is (re)created below either way. Exception rather than a bare
            # except, so KeyboardInterrupt/SystemExit still propagate.
            pass

    try:
        reader = SimpleDirectoryReader(input_dir=INPUT_DIRECTORY, recursive=True)
        documents = reader.load_data()
    except ValueError:
        # NOTE(review): ValueError is presumably what the reader raises for a
        # missing or empty input directory; confirm against llama_index docs.
        documents = None

    chroma_collection = chroma_client.get_or_create_collection(
        COLLECTION_NAME,
        metadata=DB_METADATA,
    )
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

    if documents:
        # A SemanticSplitterNodeParser was previously considered here; the
        # fixed-size SentenceSplitter is what is actually in use.
        Settings.text_splitter = SentenceSplitter(chunk_size=1500, chunk_overlap=500)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents,
            transformations=[Settings.text_splitter],
            storage_context=storage_context,
            embed_model=Settings.embed_model,
        )
    else:
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            embed_model=Settings.embed_model,
        )

    # Move the inputs to the loaded folder only after embedding has finished.
    # If indexing raises, the files stay in INPUT_DIRECTORY for the next run
    # instead of being marked as loaded without being in the collection.
    move_files(INPUT_DIRECTORY, DOCUMENTS_DIRECTORY)
    return index