from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import shutil


class GT_RAG:
    def __init__(self, cache_dir, data_dir='RAG_data'):
        model = OpenAIEmbeddings(model="text-embedding-3-large")
        # Only build the vector cache if it does not exist yet; rebuilding on
        # every construction would wipe the cache and re-embed everything.
        if not os.path.isdir(cache_dir):
            calculate_vector_embeddings(data_dir, cache_dir)
        self.rag = Chroma(persist_directory=cache_dir, embedding_function=model)

    def query(self, q, k=10):
        # similarity_search_with_score returns (Document, distance) pairs,
        # where a lower distance means a closer match.
        related = self.rag.similarity_search_with_score(query=q, k=k)
        related = sorted(related, key=lambda x: x[1])
        return ' '.join(r[0].page_content for r in related)


def calculate_vector_embeddings(input_dir, cache_dir):
    # Start from a clean cache directory.
    if os.path.isdir(cache_dir):
        shutil.rmtree(cache_dir)
    os.mkdir(cache_dir)

    model = OpenAIEmbeddings(model="text-embedding-3-large")

    print("Embedding data...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=150)
    chroma_documents = []
    for src in os.listdir(input_dir):
        # Skip anything that is not a PDF; PyPDFLoader would fail on it.
        if not src.lower().endswith('.pdf'):
            continue
        # Load the PDF, one Document per page.
        loader = PyPDFLoader(os.path.join(input_dir, src))
        pages = loader.load()
        # Split the pages into smaller, overlapping chunks.
        chroma_documents += text_splitter.split_documents(pages)
    print("Done!")

    print("Calculating embeddings and building database...")
    # Chroma persists automatically when persist_directory is set
    # (chromadb >= 0.4), so no explicit persist() call is needed.
    Chroma.from_documents(documents=chroma_documents,
                          embedding=model,
                          persist_directory=cache_dir)
    print("Done!")


if __name__ == '__main__':
    # os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
    calculate_vector_embeddings('RAG_data', 'RAG_cache')
    rag = GT_RAG('RAG_cache')
    res = rag.query("I want help with registering for a CS class")
    print(res)
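

# A minimal usage sketch (an addition, not part of the pipeline above):
# feeding the retrieved context into a chat model to generate an answer.
# ChatOpenAI and .invoke() are real langchain_openai API, but the model name
# "gpt-4o-mini", the prompt wording, and the helper name answer_with_context
# are illustrative assumptions; adapt them to your setup.
from langchain_openai import ChatOpenAI


def answer_with_context(rag, question):
    """Retrieve context with GT_RAG and ask a chat model to answer from it."""
    context = rag.query(question)
    llm = ChatOpenAI(model="gpt-4o-mini")
    response = llm.invoke(
        "Answer the question using only the context below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}"
    )
    return response.content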