rag-tool

Sleeping

App Files Files Community

rag-tool / app.py

Chris4K

Update app.py

b4dfc79 verified 9 months ago

raw

history blame contribute delete

No virus

2.92 kB

	import os
	import gradio as gr
	from langchain.vectorstores.faiss import FAISS
	from langchain.embeddings import HuggingFaceBgeEmbeddings
	from langchain.document_loaders import PyPDFLoader
	from langchain.text_splitter import CharacterTextSplitter
	from PyPDF2 import PdfReader

	# No environment variables are loaded here: the former load_dotenv() call
	# is disabled and nothing in this file imports it.

	# Record and log the directory the process was started from; the relative
	# PDF path used later in this file is resolved against it.
	current_directory = os.getcwd()
	print("Current Working Directory:", current_directory)

	def get_pdf_text(pdf_docs):
	    """
	    Extract text from one PDF document or from several.

	    Parameters
	    ----------
	    pdf_docs : str, os.PathLike, file-like object, or iterable of those
	        Either a single PDF (a path or an open binary stream, which is
	        what the call site in this file passes) or an iterable of such
	        PDFs.

	    Returns
	    -------
	    str
	        The text of every page of every document, concatenated in order
	        with no separator added. An empty iterable yields an empty string.

	    """
	    # A lone path or stream is wrapped so both call styles share one loop.
	    # Strings must be special-cased: iterating one would yield characters.
	    if isinstance(pdf_docs, (str, os.PathLike)) or hasattr(pdf_docs, "read"):
	        pdf_docs = [pdf_docs]

	    page_texts = []
	    for pdf in pdf_docs:
	        pdf_reader = PdfReader(pdf)
	        for page in pdf_reader.pages:
	            # Defensive: if a page yields None instead of a string (e.g. no
	            # extractable text), treat it as empty rather than letting the
	            # concatenation raise TypeError.
	            page_texts.append(page.extract_text() or "")
	    return "".join(page_texts)


	def get_text_chunks(text, *, chunk_size=1500, chunk_overlap=300):
	    """
	    Split the input text into overlapping chunks on newline boundaries.

	    Parameters
	    ----------
	    text : str
	        The input text to be split.
	    chunk_size : int, optional
	        Target maximum chunk length, measured with ``len`` (characters).
	        Defaults to 1500, the value this function always used.
	    chunk_overlap : int, optional
	        Number of characters shared between consecutive chunks.
	        Defaults to 300, the value this function always used.

	    Returns
	    -------
	    list
	        List of text chunks. Empty when ``text`` is empty or None.

	    """
	    # Nothing to split (e.g. a PDF with no extractable text): skip building
	    # the splitter and report "no chunks" explicitly.
	    if not text:
	        return []

	    text_splitter = CharacterTextSplitter(
	        separator="\n",
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	        length_function=len,
	    )
	    return text_splitter.split_text(text)


	def get_vectorstore(
	    text_chunks,
	    *,
	    model_name="BAAI/bge-base-en-v1.5",
	    device="cpu",
	    debug_query="What is ALiBi?",
	):
	    """
	    Generate a vector store from a list of text chunks using HuggingFace BgeEmbeddings.

	    Parameters
	    ----------
	    text_chunks : list
	        List of text chunks to be embedded. Must not be empty.
	    model_name : str, optional
	        Name of the embedding model handed to ``HuggingFaceBgeEmbeddings``.
	        Defaults to "BAAI/bge-base-en-v1.5".
	    device : str, optional
	        Device passed to the embedding model via ``model_kwargs``.
	        Defaults to "cpu".
	    debug_query : str or None, optional
	        Query run against the freshly built store as a smoke test, with
	        the result printed between "-----" markers. Pass None to skip
	        it. The default keeps the output this function always produced.

	    Returns
	    -------
	    FAISS
	        A FAISS vector store containing the embeddings of the text chunks.

	    Raises
	    ------
	    ValueError
	        If ``text_chunks`` is empty or None.

	    """
	    # Fail early with a clear message; an empty list is otherwise expected to
	    # fail inside the FAISS construction with a much less helpful error.
	    if not text_chunks:
	        raise ValueError("text_chunks is empty; there is nothing to index.")

	    # Normalized embeddings: set True to compute cosine similarity.
	    encode_kwargs = {"normalize_embeddings": True}
	    embeddings = HuggingFaceBgeEmbeddings(
	        model_name=model_name,
	        encode_kwargs=encode_kwargs,
	        model_kwargs={"device": device},
	    )
	    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)

	    if debug_query is not None:
	        print("-----")
	        print(vectorstore.similarity_search(debug_query))
	        print("-----")
	    return vectorstore

	# Build the search index once, at module load: read the PDF at the relative
	# path below, split its text into chunks, and embed the chunks into the
	# vector store that pdf_retrieval() queries.
	pdf_path = r"new_papers/ALiBi.pdf"
	pdf_text = get_pdf_text(pdf_path)
	text_chunks = get_text_chunks(pdf_text)
	api_db = get_vectorstore(text_chunks)



	# Define the PDF retrieval function
	def pdf_retrieval(query):
	    """
	    Look up the indexed passages most similar to a query.

	    Parameters
	    ----------
	    query : str
	        Text to search for in the module-level ``api_db`` vector store.

	    Returns
	    -------
	    object
	        Whatever ``api_db.similarity_search`` returns for the query; the
	        same value is also printed to stdout.

	    """
	    matches = api_db.similarity_search(query)
	    # Echo the raw result to the process log before handing it to the caller.
	    print(matches)
	    return matches

	# Wire pdf_retrieval() into a Gradio interface: one text box for the query,
	# one text box for the result. `live=True` is passed so the interface runs
	# in Gradio's live mode (see the Gradio Interface docs for exact semantics).
	api_tool = gr.Interface(
	    title="API PDF Retrieval Tool",
	    description="This tool indexes PDF documents and retrieves relevant answers based on a given query (HuggingFaceBgeEmbeddings).",
	    fn=pdf_retrieval,
	    inputs=[gr.Textbox()],
	    outputs=gr.Textbox(),
	    live=True,
	)

	# Start serving the interface.
	api_tool.launch()