"""Build a Chroma vector store from a directory of PDFs and expose a retrieval tool.

At import time this script loads every PDF under ``new_papers/``, splits the
pages into overlapping chunks, embeds them and indexes them in a Chroma
collection. ``VectoreStoreRetrievalTool`` wraps that store as a callable that
maps a query string to the text of the best-matching chunk(s).
"""

import gradio as gr
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Initialize the embedding model (runs on CPU).
# NOTE(review): "sentence-transformers/all-MiniLM-L6-v2" is a plain
# sentence-transformers checkpoint rather than an INSTRUCTOR one;
# HuggingFaceEmbeddings may be the intended class here -- confirm.
hf = HuggingFaceInstructEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)

# Load every PDF found in the "new_papers/" directory.
loader = PyPDFDirectoryLoader("new_papers/")
documents = loader.load()

# Split the loaded documents into chunks of up to 1000 characters, with a
# 200-character overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Create a Chroma vector store from the PDF chunks.
db = Chroma.from_documents(texts, hf, collection_name="my-collection")


class VectoreStoreRetrievalTool:
    """Callable wrapper that looks up a query in the module-level Chroma store."""

    def __init__(self, k: int = 1):
        """Create the retriever.

        Args:
            k: Number of chunks to fetch per query. Defaults to 1, the value
                that was previously hard-coded.
        """
        self.retriever = db.as_retriever(search_kwargs={"k": k})

    def __call__(self, query: str) -> str:
        """Return the text of the chunk(s) most similar to *query*.

        Args:
            query: Free-text search query.

        Returns:
            The ``page_content`` of the retrieved documents joined by blank
            lines, or an empty string when nothing is retrieved.
        """
        # A retriever has no ``run`` method and does not return a dict with a
        # "result" key (that is the RetrievalQA chain interface); it returns a
        # list of Document objects.
        retrieved = self.retriever.get_relevant_documents(query)
        return "\n\n".join(doc.page_content for doc in retrieved)