Spaces:
Sleeping
Sleeping
import chromadb | |
import os | |
import shutil | |
from llama_index.core import Settings, SimpleDirectoryReader, StorageContext, VectorStoreIndex | |
from llama_index.vector_stores.chroma import ChromaVectorStore | |
from llama_index.core.node_parser import SemanticSplitterNodeParser | |
from llama_index.core.node_parser import SentenceSplitter | |
# Deployment root directory. ROOT_PATH must be set in the environment;
# a missing variable raises KeyError as soon as this module is imported.
root_path = os.environ["ROOT_PATH"]

# Both document folders live under "<root>/doc".
_doc_base = os.path.join(root_path, "doc")

# Files are moved here by ChromaVectorIndex once they have been processed.
DOCUMENTS_DIRECTORY = os.path.join(_doc_base, "loaded")

# Files waiting to be read and embedded.
INPUT_DIRECTORY = os.path.join(_doc_base, "input")

# On-disk location handed to the persistent Chroma client.
DB_DIRECTORY = os.path.join(root_path, "chromadb")

# Name of the Chroma collection that stores the embeddings.
COLLECTION_NAME = "SmartAgri"

# Metadata supplied when the collection is created; per the Chroma docs,
# "hnsw:space" selects the distance function of the HNSW index.
DB_METADATA = {"hnsw:space": "cosine"}
def move_files(src, dst):
    """Move every file under ``src`` into ``dst``, mirroring the directory tree.

    Sub-directories of ``src`` (including empty ones) are recreated under
    ``dst``. A file whose destination path already exists is NOT moved:
    the copy already present in ``dst`` is kept and the source file stays
    where it is. If ``src`` does not exist, nothing happens.

    Args:
        src: Root directory to move files out of.
        dst: Root directory to move files into. It is created on demand,
            together with any missing intermediate directories.
    """
    for root, dirs, files in os.walk(src):
        # Recreate the same sub-directory structure in the destination folder.
        for dir_name in dirs:
            rel_dir = os.path.relpath(os.path.join(root, dir_name), src)
            os.makedirs(os.path.join(dst, rel_dir), exist_ok=True)

        # Move the files.
        for file_name in files:
            src_file = os.path.join(root, file_name)
            dst_file = os.path.join(dst, os.path.relpath(src_file, src))

            # If the destination file already exists, keep the existing one.
            if os.path.exists(dst_file):
                continue

            # Make sure the parent directory exists: when ``src`` has no
            # sub-directories the loop above never creates ``dst`` itself,
            # and shutil.move would fail with FileNotFoundError.
            os.makedirs(os.path.dirname(dst_file), exist_ok=True)
            shutil.move(src_file, dst_file)
def ChromaVectorIndex(force_new=False):
    """Build or load the vector index backed by the persistent Chroma collection.

    Any documents found in INPUT_DIRECTORY are split, embedded with
    ``Settings.embed_model`` and added to the collection; afterwards the
    input files are moved to DOCUMENTS_DIRECTORY. When there is nothing to
    read, the index is simply opened on top of the existing collection.

    Side effect: when documents are indexed, ``Settings.text_splitter`` is
    replaced with the SentenceSplitter used here.

    Args:
        force_new: When true, move all previously loaded files back to
            INPUT_DIRECTORY and delete the existing collection so that
            everything is embedded again from scratch.

    Returns:
        The VectorStoreIndex bound to the Chroma vector store.
    """
    # Local import so this block does not depend on the file's import header.
    import logging

    chroma_client = chromadb.PersistentClient(path=DB_DIRECTORY)

    if force_new:
        # Move everything back to the input directory so that all documents
        # are embedded again from the beginning.
        move_files(DOCUMENTS_DIRECTORY, INPUT_DIRECTORY)
        try:
            chroma_client.delete_collection(COLLECTION_NAME)  # drop the current DB
        except Exception as exc:
            # Best effort: the collection may simply not exist yet. Unlike a
            # bare ``except``, this lets KeyboardInterrupt/SystemExit through.
            logging.getLogger(__name__).warning(
                "Could not delete collection %r, continuing: %s",
                COLLECTION_NAME,
                exc,
            )

    try:
        reader = SimpleDirectoryReader(input_dir=INPUT_DIRECTORY, recursive=True)
        documents = reader.load_data()
    except ValueError:
        # NOTE(review): presumably raised when the input directory is missing
        # or empty - confirm against the SimpleDirectoryReader docs.
        documents = None

    chroma_collection = chroma_client.get_or_create_collection(
        COLLECTION_NAME,
        metadata=DB_METADATA,
    )
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

    if documents:
        Settings.text_splitter = SentenceSplitter(chunk_size=1500, chunk_overlap=500)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents,
            transformations=[Settings.text_splitter],
            storage_context=storage_context,
            embed_model=Settings.embed_model,
        )
    else:
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            embed_model=Settings.embed_model,
        )

    # Move the inputs to the "loaded" directory only after embedding has
    # finished: if indexing raises, the files stay in the input directory
    # and are picked up again on the next run instead of being skipped.
    move_files(INPUT_DIRECTORY, DOCUMENTS_DIRECTORY)
    return index