Spaces:

AI4SmartLife
/

smart_eco_footprint

Sleeping

App Files Files Community

smart_eco_footprint / api /vector_index /chroma.py

thanhtung09t2

Update file

9f3a3c8 verified 5 months ago

raw

history blame

3.05 kB

	import chromadb
	import os
	import shutil

	from llama_index.core import Settings, SimpleDirectoryReader, StorageContext, VectorStoreIndex
	from llama_index.vector_stores.chroma import ChromaVectorStore
	from llama_index.core.node_parser import SemanticSplitterNodeParser
	from llama_index.core.node_parser import SentenceSplitter
	# Base directory for every data path below. Read from the ROOT_PATH environment
	# variable; a KeyError is raised at import time when it is not set.
	root_path = os.environ['ROOT_PATH']

	# Folder that input files are moved to by ChromaVectorIndex (the "loaded" documents).
	DOCUMENTS_DIRECTORY = os.path.join(root_path,"doc","loaded")
	# Folder that ChromaVectorIndex reads new documents from.
	INPUT_DIRECTORY = os.path.join(root_path,"doc","input")
	# On-disk location handed to chromadb.PersistentClient.
	DB_DIRECTORY = os.path.join(root_path,"chromadb")
	# Name of the Chroma collection used by ChromaVectorIndex.
	COLLECTION_NAME = "SmartAgri"
	# Metadata passed when the collection is created.
	# NOTE(review): "hnsw:space": "cosine" presumably selects cosine distance for
	# Chroma's HNSW index — confirm against the Chroma documentation.
	DB_METADATA = {"hnsw:space": "cosine"}

	def move_files(src, dst):
	    """Move every file under *src* to the same relative path under *dst*.

	    The sub-directory layout of *src* is recreated inside *dst*. A file whose
	    destination path already exists is left where it is in *src*, so files
	    already present in *dst* are never overwritten. Directories in *src* are
	    not removed.

	    Args:
	        src: Root of the directory tree to move files out of. If it does not
	            exist, ``os.walk`` yields nothing and the call is a no-op.
	        dst: Root of the directory tree to move files into. It is created on
	            demand when there is something to move into it.
	    """
	    for root, subdirs, filenames in os.walk(src):
	        # Recreate the same sub-directory structure inside the destination.
	        for subdir in subdirs:
	            mirrored = os.path.join(
	                dst, os.path.relpath(os.path.join(root, subdir), src)
	            )
	            os.makedirs(mirrored, exist_ok=True)

	        # Move the files of the current directory.
	        for filename in filenames:
	            src_file = os.path.join(root, filename)
	            dst_file = os.path.join(dst, os.path.relpath(src_file, src))

	            # If the destination file already exists, keep the existing one.
	            if os.path.exists(dst_file):
	                continue

	            # The loop above only creates sub-directories; make sure the
	            # destination root itself exists too, otherwise moving a file
	            # that sits directly in src fails when dst is missing.
	            parent = os.path.dirname(dst_file)
	            if parent:
	                os.makedirs(parent, exist_ok=True)
	            shutil.move(src_file, dst_file)

	def ChromaVectorIndex(force_new = False):
	    """Build or open the vector index stored in the persistent Chroma collection.

	    Documents read from ``INPUT_DIRECTORY`` are split with a
	    ``SentenceSplitter``, embedded with ``Settings.embed_model`` and stored
	    in the ``COLLECTION_NAME`` collection. When no documents could be read,
	    the index is opened on top of whatever the collection already holds.
	    After the index has been obtained, the input files are moved to
	    ``DOCUMENTS_DIRECTORY``.

	    Side effects: moves files between ``INPUT_DIRECTORY`` and
	    ``DOCUMENTS_DIRECTORY``, may delete/create the Chroma collection, and
	    sets ``Settings.text_splitter`` when documents are indexed.

	    Args:
	        force_new: When true, files in ``DOCUMENTS_DIRECTORY`` are moved back
	            to ``INPUT_DIRECTORY`` and the existing collection is deleted, so
	            that everything is embedded again from scratch.

	    Returns:
	        The ``VectorStoreIndex`` backed by the Chroma vector store.
	    """
	    import logging

	    chroma_client = chromadb.PersistentClient(path=DB_DIRECTORY)

	    if force_new:
	        # Put everything back into the input folder so it is re-embedded.
	        move_files(DOCUMENTS_DIRECTORY, INPUT_DIRECTORY)
	        try:
	            # Drop the existing database content.
	            chroma_client.delete_collection(COLLECTION_NAME)
	        except Exception as exc:
	            # Best effort (e.g. the collection may not exist yet). Unlike a
	            # bare ``except`` this lets KeyboardInterrupt/SystemExit through.
	            logging.getLogger(__name__).debug(
	                "Could not delete collection %r: %s", COLLECTION_NAME, exc
	            )

	    try:
	        reader = SimpleDirectoryReader(input_dir=INPUT_DIRECTORY, recursive=True)
	        documents = reader.load_data()
	    except ValueError:
	        # NOTE(review): presumably raised when the input folder is missing or
	        # contains no files — confirm against the SimpleDirectoryReader docs.
	        documents = None

	    chroma_collection = chroma_client.get_or_create_collection(
	        COLLECTION_NAME, metadata=DB_METADATA
	    )
	    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

	    if documents:
	        # Alternative splitter left disabled by the original author:
	        # SemanticSplitterNodeParser(include_metadata=True,
	        #     include_prev_next_rel=True, embed_model=Settings.embed_model,
	        #     breakpoint_percentile_threshold=95)
	        Settings.text_splitter = SentenceSplitter(chunk_size=1500, chunk_overlap=500)
	        storage_context = StorageContext.from_defaults(vector_store=vector_store)
	        index = VectorStoreIndex.from_documents(
	            documents,
	            transformations=[Settings.text_splitter],
	            storage_context=storage_context,
	            embed_model=Settings.embed_model,
	        )
	    else:
	        # Nothing new to embed: open the index over the existing collection.
	        index = VectorStoreIndex.from_vector_store(
	            vector_store=vector_store, embed_model=Settings.embed_model
	        )

	    # Move the inputs to the loaded folder only after embedding has finished,
	    # so a failure while embedding leaves them in the input folder for the
	    # next run instead of marking them as already processed.
	    move_files(INPUT_DIRECTORY, DOCUMENTS_DIRECTORY)
	    return index