Reggie / RAG.py
RangDeBasanti's picture
Upload folder using huggingface_hub
3402f0a verified
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import shutil
class GT_RAG():
def __init__(self, cache_dir):
model = OpenAIEmbeddings(
model="text-embedding-3-large"
)
calculate_vector_embeddings('RAG_data', 'RAG_cache')
self.rag = Chroma(persist_directory=cache_dir, embedding_function=model)
def query(self, q, k=10):
related = self.rag.similarity_search_with_score(query=q, k=k)
related = sorted(related, key = lambda x : x[1])
related = [r[0].page_content for r in related]
return ' '.join(related)
def calculate_vector_embeddings(input_dir, cache_dir):
if os.path.isdir(cache_dir):
shutil.rmtree(cache_dir)
os.mkdir(cache_dir)
model = OpenAIEmbeddings(
model="text-embedding-3-large"
)
print("Emedding Data...")
chroma_documents = []
for src in os.listdir(input_dir):
# load your pdf doc
loader = PyPDFLoader(os.path.join(input_dir, src))
pages = loader.load()
# split the doc into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=150)
chunks = text_splitter.split_documents(pages)
chroma_documents += chunks
print("Done!")
print("Calculating embeddings and building database...")
chroma_db = Chroma.from_documents(documents=chroma_documents, embedding=model, persist_directory=cache_dir)
chroma_db.persist()
print("Done!")
if __name__ == '__main__':
# os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
calculate_vector_embeddings('RAG_data', 'RAG_cache')
rag = GT_RAG('RAG_cache')
res = rag.query("I want help with registering for a CS class")
print(res)