thanhtung09t2's picture
Update file
9f3a3c8 verified
raw
history blame
3.05 kB
import chromadb
import os
import shutil
from llama_index.core import Settings, SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.node_parser import SentenceSplitter
# Base directory for all application data. The ROOT_PATH environment variable
# must be set; otherwise this lookup raises KeyError when the module is imported.
root_path = os.environ['ROOT_PATH']
# Files that have already been processed (ChromaVectorIndex moves them here).
DOCUMENTS_DIRECTORY = os.path.join(root_path,"doc","loaded")
# Files waiting to be read and embedded by ChromaVectorIndex.
INPUT_DIRECTORY = os.path.join(root_path,"doc","input")
# On-disk location handed to chromadb.PersistentClient.
DB_DIRECTORY = os.path.join(root_path,"chromadb")
# Name of the Chroma collection that stores the embeddings.
COLLECTION_NAME = "SmartAgri"
# Metadata passed when the collection is created. Per Chroma's documentation,
# "hnsw:space" selects the distance function of the HNSW index (cosine here).
DB_METADATA = {"hnsw:space": "cosine"}
def move_files(src, dst):
    """Move every file under *src* into *dst*, mirroring the directory layout.

    Sub-directories of *src* (including empty ones) are recreated under
    *dst*. A file whose destination path already exists is NOT moved: the
    copy already present in *dst* is kept and the source file is left where
    it is. Directories in *src* are never removed. If *src* does not exist,
    ``os.walk`` yields nothing and the call is a no-op.

    Args:
        src: Root directory whose files are moved.
        dst: Root directory that receives the files; created on demand.
    """
    for root, dirs, files in os.walk(src):
        # Recreate the same sub-directory structure in the destination folder.
        for dir_name in dirs:
            mirrored_dir = os.path.join(
                dst, os.path.relpath(os.path.join(root, dir_name), src)
            )
            os.makedirs(mirrored_dir, exist_ok=True)

        # Move the files.
        for file_name in files:
            src_file = os.path.join(root, file_name)
            dst_file = os.path.join(dst, os.path.relpath(src_file, src))

            # If the destination file already exists, keep the one that is
            # already in the destination folder.
            if os.path.exists(dst_file):
                continue

            # The directory loop above only creates *dst* when *src* has
            # sub-directories; make sure the parent exists for a flat *src*
            # too, otherwise shutil.move fails with FileNotFoundError.
            parent_dir = os.path.dirname(dst_file)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            shutil.move(src_file, dst_file)
def ChromaVectorIndex(force_new=False):
    """Build or open the vector index backed by the persistent Chroma collection.

    Documents found under ``INPUT_DIRECTORY`` are split into sentence chunks,
    embedded with ``Settings.embed_model`` and stored in the
    ``COLLECTION_NAME`` collection. Once the index has been built, the input
    files are moved to ``DOCUMENTS_DIRECTORY``. When there is nothing to
    ingest, the index is opened from the existing collection instead.

    Side effect: when documents are ingested, ``Settings.text_splitter`` is
    replaced globally with the sentence splitter used here.

    Args:
        force_new: When true, move previously processed files back from
            ``DOCUMENTS_DIRECTORY`` into ``INPUT_DIRECTORY`` and drop the
            existing collection so everything is embedded again from scratch.

    Returns:
        The ``VectorStoreIndex`` bound to the Chroma collection.
    """
    chroma_client = chromadb.PersistentClient(path=DB_DIRECTORY)

    if force_new:
        # Move everything back to the input directory so that all documents
        # are embedded again from scratch.
        move_files(DOCUMENTS_DIRECTORY, INPUT_DIRECTORY)
        try:
            # Drop the existing database collection.
            chroma_client.delete_collection(COLLECTION_NAME)
        except Exception:
            # Best effort: the collection may simply not exist yet. Unlike a
            # bare ``except``, this lets KeyboardInterrupt/SystemExit through.
            pass

    try:
        reader = SimpleDirectoryReader(input_dir=INPUT_DIRECTORY, recursive=True)
        documents = reader.load_data()
    except ValueError:
        # SimpleDirectoryReader raises ValueError when the input directory is
        # unusable (e.g. missing or without files); treat that as "nothing
        # new to ingest".
        documents = None

    chroma_collection = chroma_client.get_or_create_collection(
        COLLECTION_NAME,
        metadata=DB_METADATA,
    )
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

    if documents:
        # NOTE: SemanticSplitterNodeParser (include_metadata=True,
        # include_prev_next_rel=True, embed_model=Settings.embed_model,
        # breakpoint_percentile_threshold=95) was considered as an
        # alternative splitter.
        Settings.text_splitter = SentenceSplitter(chunk_size=1500, chunk_overlap=500)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents,
            transformations=[Settings.text_splitter],
            storage_context=storage_context,
            embed_model=Settings.embed_model,
        )
    else:
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            embed_model=Settings.embed_model,
        )

    # Move everything to the "loaded" directory only after embedding is done.
    # If building the index fails, the files stay in the input directory and
    # are picked up again on the next call instead of being silently skipped.
    move_files(INPUT_DIRECTORY, DOCUMENTS_DIRECTORY)
    return index