"""Helpers for maintaining a Chroma vector store of text-file chunks.

Files are read from disk, split into overlapping chunks, embedded with a
HuggingFace embedding model and stored in Chroma. Every helper that changes
the store returns a ``(retriever, vectorstore)`` pair.
"""

from langchain_huggingface import HuggingFaceEmbeddings
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
import torch

embedding_model_name = 'nomic-ai/nomic-embed-text-v1.5'
model_kwargs = {
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    "trust_remote_code": True,
}
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=model_kwargs,
)

vectorstore = None

text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)

# Search settings shared by every retriever handed back to callers.
_RETRIEVER_SEARCH_KWARGS = {'k': 1}


def _make_retriever(vectorstore):
    """Return a retriever over *vectorstore* using the shared search settings."""
    return vectorstore.as_retriever(search_kwargs=dict(_RETRIEVER_SEARCH_KWARGS))


def read_file(data: str) -> Document:
    """Read the text file at path *data* into a ``Document``.

    Args:
        data: Path of the file to read.

    Returns:
        A ``Document`` whose ``page_content`` is the file's text and whose
        ``metadata["name"]`` is the last ``/``-separated component of *data*.
    """
    # The context manager closes the handle even if read() raises.
    with open(data, 'r') as f:
        content = f.read()
    return Document(page_content=content, metadata={"name": data.split('/')[-1]})


def add_doc(data, vectorstore):
    """Split the file at *data* into chunks and add them to the vector store.

    Args:
        data: Path of the file to index.
        vectorstore: Existing Chroma store to extend, or ``None`` to create a
            new one from this file's chunks.

    Returns:
        A ``(retriever, vectorstore)`` tuple for the updated store.
    """
    doc = read_file(data)
    splits = text_splitter.split_documents([doc])
    if vectorstore is None:
        vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
    elif splits:
        # Extend the store that was passed in instead of discarding it; an
        # empty file yields no chunks, so there is nothing to add.
        vectorstore.add_documents(splits)
    return _make_retriever(vectorstore), vectorstore


def delete_doc(delete_name, vectorstore):
    """Delete every stored chunk whose metadata ``name`` equals *delete_name*.

    Args:
        delete_name: Value of the ``name`` metadata field to match.
        vectorstore: Chroma store to delete from.

    Returns:
        A ``(retriever, vectorstore)`` tuple for the updated store.
    """
    # Fetch the store contents once rather than once per matching chunk.
    records = vectorstore.get()
    delete_doc_ids = [
        doc_id
        for doc_id, metadata in zip(records['ids'], records['metadatas'])
        # Chunks stored without metadata (None) or without a "name" never match.
        if (metadata or {}).get('name') == delete_name
    ]
    # Skip the call when nothing matched: no ids means nothing to delete.
    if delete_doc_ids:
        vectorstore.delete(ids=delete_doc_ids)
    return _make_retriever(vectorstore), vectorstore


def delete_all_doc(vectorstore):
    """Delete every chunk held in *vectorstore*.

    Args:
        vectorstore: Chroma store to empty.

    Returns:
        A ``(retriever, vectorstore)`` tuple for the emptied store.
    """
    delete_doc_ids = list(vectorstore.get()['ids'])
    # A single batched delete; skipped when the store is already empty.
    if delete_doc_ids:
        vectorstore.delete(ids=delete_doc_ids)
    return _make_retriever(vectorstore), vectorstore