Spaces:

prthgo
/

PDF-Chatbot

Sleeping

App Files Files Community

PDF-Chatbot / app.py

prthgo

Update app.py

4f55bbb 9 months ago

raw history blame contribute delete

No virus

3.52 kB

	import streamlit as st
	from dotenv import load_dotenv
	from PyPDF2 import PdfReader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.embeddings import HuggingFaceBgeEmbeddings
	from langchain.vectorstores import FAISS
	from langchain.memory import ConversationBufferMemory
	from langchain.chains import ConversationalRetrievalChain
	from htmltemp import css, bot_template, user_template
	from langchain.llms import HuggingFaceHub


	def main():
	    """Render the Streamlit PDF chatbot page.

	    Calls ``load_dotenv()``, configures the page, makes sure the
	    ``conversation`` and ``chat_history`` keys exist in
	    ``st.session_state``, forwards a typed question to
	    ``handle_userinput``, and draws the sidebar where PDFs are uploaded
	    and turned into a conversation chain when "Process" is pressed.
	    """
	    load_dotenv()
	    st.set_page_config(page_title="PDF Chatbot", page_icon="📚")
	    st.write(css, unsafe_allow_html=True)

	    # Session keys must exist before anything reads them on this run.
	    if "conversation" not in st.session_state:
	        st.session_state.conversation = None
	    if "chat_history" not in st.session_state:
	        st.session_state.chat_history = None

	    st.header("Chat with your PDFs 📚")
	    user_question = st.text_input("Ask a question about your documents:")
	    if user_question:
	        handle_userinput(user_question)

	    with st.sidebar:
	        st.sidebar.info("""Note: I haven't used any GPU for this project so It can take
	long time to process large PDFs. Also this is POC project and can be easily upgraded
	with better model and resources. """)

	        st.subheader("Your PDFs")
	        pdf_docs = st.file_uploader(
	            "Upload your PDFs here", accept_multiple_files=True
	        )
	        if st.button("Process"):
	            if not pdf_docs:
	                # Nothing uploaded: skip the pipeline instead of trying to
	                # index an empty document set.
	                st.warning("Please upload at least one PDF before processing.")
	            else:
	                with st.spinner("Processing"):
	                    # get pdf text
	                    raw_text = get_pdf_text(pdf_docs)

	                    # get the text chunks
	                    text_chunks = get_text_chunks(raw_text)

	                    if text_chunks:
	                        # create vector store
	                        vectorstore = get_vectorstore(text_chunks)

	                        # create conversation chain
	                        st.session_state.conversation = get_conversation_chain(
	                            vectorstore
	                        )
	                    else:
	                        # No chunks means no text was extracted, so there
	                        # is nothing to embed or index.
	                        st.error(
	                            "No extractable text was found in the uploaded PDFs."
	                        )


	def get_pdf_text(pdf_docs):
	    """Concatenate the extracted text of every page of every PDF.

	    Args:
	        pdf_docs: Iterable of PDF sources, each passed to ``PdfReader``.

	    Returns:
	        A single string with all page texts in document and page order,
	        with no separator inserted between pages. Returns ``""`` when
	        ``pdf_docs`` is empty.
	    """
	    # Build the result with one join rather than growing a string with
	    # ``+=`` for every page. ``or ""`` guards against a page whose
	    # extraction yields None instead of a string.
	    return "".join(
	        page.extract_text() or ""
	        for pdf in pdf_docs
	        for page in PdfReader(pdf).pages
	    )


	def get_text_chunks(text):
	    """Split ``text`` into overlapping chunks.

	    A ``RecursiveCharacterTextSplitter`` is configured with the separators
	    ``"\\n\\n"``, ``"\\n"`` and ``"."``, a chunk size of 900 and an overlap
	    of 200, both measured with ``len``.

	    Args:
	        text: The string to split.

	    Returns:
	        Whatever ``split_text`` returns for ``text``.
	    """
	    splitter_options = {
	        "separators": ["\n\n", "\n", "."],
	        "chunk_size": 900,
	        "chunk_overlap": 200,
	        "length_function": len,
	    }
	    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)


	def get_vectorstore(text_chunks):
	    """Build a FAISS vector store from ``text_chunks``.

	    The chunks are embedded with ``HuggingFaceBgeEmbeddings`` using the
	    ``BAAI/bge-base-en-v1.5`` model.

	    Args:
	        text_chunks: The texts handed to ``FAISS.from_texts``.

	    Returns:
	        The store created by ``FAISS.from_texts``.
	    """
	    bge_embedder = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")
	    return FAISS.from_texts(texts=text_chunks, embedding=bge_embedder)


	def get_conversation_chain(vectorstore):
	    """Create the conversational retrieval chain for ``vectorstore``.

	    The chain combines a ``HuggingFaceHub`` LLM (``google/flan-t5-large``,
	    temperature 0.5, max_length 1024), the store's retriever, and a
	    ``ConversationBufferMemory`` stored under the ``chat_history`` key.

	    Args:
	        vectorstore: Object providing ``as_retriever()``.

	    Returns:
	        The chain built by ``ConversationalRetrievalChain.from_llm``.
	    """
	    generation_settings = {"temperature": 0.5, "max_length": 1024}
	    hub_llm = HuggingFaceHub(
	        repo_id="google/flan-t5-large",
	        model_kwargs=generation_settings,
	    )
	    chat_memory = ConversationBufferMemory(
	        memory_key="chat_history",
	        return_messages=True,
	    )
	    return ConversationalRetrievalChain.from_llm(
	        llm=hub_llm,
	        retriever=vectorstore.as_retriever(),
	        memory=chat_memory,
	    )


	def handle_userinput(user_question):
	    """Answer ``user_question`` and render the full chat history.

	    The question is sent to the chain stored in
	    ``st.session_state.conversation``; the ``chat_history`` from its
	    response is saved to ``st.session_state.chat_history`` and every
	    message is written to the page. Messages at even positions use
	    ``user_template`` and those at odd positions use ``bot_template``.

	    If no chain has been created yet, a warning is shown and nothing
	    else happens.

	    Args:
	        user_question: The text typed by the user.
	    """
	    # Function-scope import keeps this block self-contained (stdlib only).
	    from html import escape

	    chain = st.session_state.conversation
	    if chain is None:
	        # The session starts with ``conversation = None``; calling it
	        # would raise TypeError, so ask the user to process PDFs first.
	        st.warning("Please upload and process your PDFs before asking a question.")
	        return

	    response = chain({"question": user_question})
	    st.session_state.chat_history = response["chat_history"]

	    for i, message in enumerate(st.session_state.chat_history):
	        template = user_template if i % 2 == 0 else bot_template
	        # The result is rendered as raw HTML, so escape the message text
	        # to stop user or model output from injecting markup.
	        st.write(
	            template.replace("{{MSG}}", escape(message.content)),
	            unsafe_allow_html=True,
	        )


	# Start the app only when this file is executed as the entry script.
	if __name__ == "__main__":
	    main()