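# rag-tool / app.py
# Author: Chris4K
# Commit: b4dfc79 (verified) - "Update app.py"
# File size: 2.92 kB
# (The remaining lines of the original page header -- "raw", "history blame
# contribute delete", "No virus" -- were hosting-page navigation labels.)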
import os
import gradio as gr
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader
# NOTE: loading environment variables via load_dotenv() is currently disabled.
#load_dotenv()
# Record and print the current working directory at startup.
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)
def get_pdf_text(pdf_docs):
    """
    Extract text from one PDF document or from a list of PDF documents.

    Parameters
    ----------
    pdf_docs : object or list/tuple of objects
        Either a single PDF source that ``PdfReader`` accepts (for example a
        file path), or a list/tuple of such sources.

    Returns
    -------
    str
        The text of every page of every document, concatenated in order with
        no separator added. Pages for which no text is extracted contribute
        nothing. An empty list or tuple yields an empty string.
    """
    # Wrap a lone source so that both call styles share one code path.
    sources = pdf_docs if isinstance(pdf_docs, (list, tuple)) else [pdf_docs]

    page_texts = []
    for source in sources:
        for page in PdfReader(source).pages:
            # Guard against a falsy result (e.g. None) for pages without text,
            # which would otherwise break the concatenation.
            page_texts.append(page.extract_text() or "")
    # Join once instead of growing a string page by page.
    return "".join(page_texts)
def get_text_chunks(text, *, chunk_size=1500, chunk_overlap=300, separator="\n"):
    """
    Split the input text into chunks.

    Parameters
    ----------
    text : str
        The input text to be split.
    chunk_size : int, optional
        ``chunk_size`` passed to ``CharacterTextSplitter``; lengths are
        measured with ``len``. Defaults to 1500.
    chunk_overlap : int, optional
        ``chunk_overlap`` passed to ``CharacterTextSplitter``.
        Defaults to 300.
    separator : str, optional
        ``separator`` passed to ``CharacterTextSplitter``. Defaults to a
        newline.

    Returns
    -------
    list
        List of text chunks as returned by ``split_text``.
    """
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)
def get_vectorstore(text_chunks, *, model_name="BAAI/bge-base-en-v1.5", device="cpu"):
    """
    Generate a vector store from a list of text chunks using HuggingFace BgeEmbeddings.

    Parameters
    ----------
    text_chunks : list
        List of text chunks to be embedded. Must contain at least one chunk.
    model_name : str, optional
        Name of the embedding model handed to ``HuggingFaceBgeEmbeddings``.
        Defaults to ``"BAAI/bge-base-en-v1.5"``.
    device : str, optional
        Device passed to the embedding model through ``model_kwargs``.
        Defaults to ``"cpu"``.

    Returns
    -------
    FAISS
        A FAISS vector store containing the embeddings of the text chunks.

    Raises
    ------
    ValueError
        If ``text_chunks`` is empty or ``None``.
    """
    chunks = list(text_chunks or ())
    if not chunks:
        # Fail early with a clear message rather than letting index
        # construction fail on an empty embedding list.
        raise ValueError(
            "text_chunks is empty: no text available to build the vector store"
        )

    encode_kwargs = {
        "normalize_embeddings": True
    }  # set True to compute cosine similarity
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        encode_kwargs=encode_kwargs,
        model_kwargs={"device": device},
    )
    vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)

    # Debug output: run one fixed query against the fresh index and print
    # the result between separator lines.
    print("-----")
    print(vectorstore.similarity_search("What is ALiBi?"))
    print("-----")
    return vectorstore
# Build the search index once, when this module is executed, from a single PDF.
# The path is relative, so it is resolved against the current working
# directory. It contains no spaces or backslashes, so nothing needs escaping
# and the raw-string prefix has no effect here.
pdf_path = r"new_papers/ALiBi.pdf"
pdf_text = get_pdf_text(pdf_path)  # text extracted from the PDF
text_chunks = get_text_chunks(pdf_text)  # pieces of that text to embed
api_db = get_vectorstore(text_chunks)  # vector store queried by pdf_retrieval
# Callback used by the Gradio interface
def pdf_retrieval(query):
    """
    Look up the query in the module-level vector store.

    Parameters
    ----------
    query : str
        The text to search for.

    Returns
    -------
    object
        Whatever ``api_db.similarity_search`` returns for the query; the
        same value is also printed to stdout.
    """
    matches = api_db.similarity_search(query)
    print(matches)
    return matches
# Gradio UI for the retriever: one text box in, one text box out, both wired
# to pdf_retrieval. The input component is created before the output
# component, as in the original argument order.
api_tool = gr.Interface(
    title="API PDF Retrieval Tool",
    description="This tool indexes PDF documents and retrieves relevant answers based on a given query (HuggingFaceBgeEmbeddings).",
    fn=pdf_retrieval,
    inputs=[gr.Textbox()],
    outputs=gr.Textbox(),
    live=True,
)

# Start serving the interface
api_tool.launch()