import os

import gradio as gr
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader

# Print the current working directory (useful for checking relative paths)
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)


def get_pdf_text(pdf_path):
    """
    Extract text from a single PDF file.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file to extract text from.

    Returns
    -------
    str
        Extracted text from all pages of the PDF.
    """
    text = ""
    pdf_reader = PdfReader(pdf_path)
    for page in pdf_reader.pages:
        # extract_text() can return None for pages without extractable text
        text += page.extract_text() or ""
    return text


def get_text_chunks(text):
    """
    Split the input text into chunks.

    Parameters
    ----------
    text : str
        The input text to be split.

    Returns
    -------
    list
        List of text chunks.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1500,
        chunk_overlap=300,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)
    return chunks


def get_vectorstore(text_chunks):
    """
    Generate a vector store from a list of text chunks using HuggingFace BgeEmbeddings.

    Parameters
    ----------
    text_chunks : list
        List of text chunks to be embedded.

    Returns
    -------
    FAISS
        A FAISS vector store containing the embeddings of the text chunks.
    """
    model = "BAAI/bge-base-en-v1.5"
    encode_kwargs = {"normalize_embeddings": True}  # normalize so scores behave like cosine similarity
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model,
        encode_kwargs=encode_kwargs,
        model_kwargs={"device": "cpu"},
    )
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)

    # Quick sanity check: run a sample query against the freshly built index
    print("-----")
    print(vectorstore.similarity_search("What is ALiBi?"))
    print("-----")
    return vectorstore


# Path to the PDF file to index
pdf_path = r"new_papers/ALiBi.pdf"

# Build the FAISS index from the PDF text
pdf_text = get_pdf_text(pdf_path)
text_chunks = get_text_chunks(pdf_text)
api_db = get_vectorstore(text_chunks)


# Define the PDF retrieval function
def pdf_retrieval(query):
    # Run the query against the FAISS index and return the matching chunks
    response = api_db.similarity_search(query)
    print(response)
    return response


# Create the Gradio interface for the PDF retriever
api_tool = gr.Interface(
    fn=pdf_retrieval,
    inputs=[gr.Textbox()],
    outputs=gr.Textbox(),
    live=True,
    title="API PDF Retrieval Tool",
    description="This tool indexes PDF documents and retrieves relevant answers based on a given query (HuggingFaceBgeEmbeddings).",
)

# Launch the Gradio interface
api_tool.launch()
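
# Optional sanity check (a minimal sketch, not part of the tool itself): query the
# retrieval function directly, bypassing the Gradio UI. The question below is a
# placeholder and assumes the FAISS index above was built from new_papers/ALiBi.pdf.
#
# docs = pdf_retrieval("How does ALiBi bias attention scores?")
# for doc in docs:
#     print(doc.page_content[:200])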