Spaces:

SoumyaJ
/

PdfQandA

Sleeping

App Files Files Community

PdfQandA / app.py

SoumyaJ

Upload 2 files

b379775 verified 7 months ago

raw

history blame contribute delete

6.13 kB

	import streamlit as st
	import os
	from langchain_groq import ChatGroq
	from langchain_huggingface import HuggingFaceEmbeddings
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain.chains.combine_documents import create_stuff_documents_chain
	from langchain_core.prompts import ChatPromptTemplate
	from langchain.chains import create_retrieval_chain
	from langchain_community.vectorstores import FAISS
	from langchain_community.document_loaders import PyPDFLoader
	from dotenv import load_dotenv
	from PyPDF2 import PdfReader
	import time

	load_dotenv()

	# Custom CSS injected once per run: a textured page background, a padded
	# and shadowed sidebar panel, and a rule pinning ".bottom-button" elements
	# to the bottom of the sidebar.
	_PAGE_CSS = """
	<style>
	.stApp {
	background-image: url('https://www.transparenttextures.com/patterns/white-leather.png');
	background-size: cover;
	}
	.sidebar .sidebar-content {
	padding: 20px;
	background-image: url('https://www.transparenttextures.com/patterns/asfalt-light.png');
	background-size: cover;
	border-radius: 10px;
	box-shadow: 0px 0px 10px rgba(0, 0, 0, 0.1);
	}
	.sidebar .bottom-button {
	position: fixed;
	bottom: 20px;
	left: 20px;
	width: calc(100% - 40px);
	}
	</style>
	"""

	# unsafe_allow_html is required so the <style> tag is emitted as markup.
	st.markdown(_PAGE_CSS, unsafe_allow_html=True)

	# Credentials come from the environment. Assigning None into os.environ
	# raises TypeError, so HF_TOKEN is only re-exported when it is actually set.
	hf_token = os.getenv("HF_TOKEN")
	if hf_token:
	    os.environ["HF_TOKEN"] = hf_token
	groq_api_key = os.getenv("GROQ_API_KEY")

	# Pipeline: document loader -> text splitter -> embeddings -> vector store
	# -> retrieval chain.
	embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

	llm = ChatGroq(model="Llama3-8b-8192", api_key=groq_api_key)

	# {context} is filled with the retrieved chunks and {input} with the
	# user's question when the retrieval chain runs.
	prompt_template = ChatPromptTemplate.from_template("""
	Answer the following question from the provided context only.
	Please provide the most accurate response based on the question
	<context>
	{context}
	</context>
	Question : {input}
	""")

	def get_pdf_text(pdf_docs):
	    """Return the text of every page of every PDF in *pdf_docs* as one string.

	    Args:
	        pdf_docs: Iterable of objects that ``PdfReader`` can open (the app
	            passes the files returned by the Streamlit uploader). ``None``
	            or an empty iterable yields an empty string.

	    Returns:
	        The page texts concatenated in document and page order, with no
	        separator inserted between pages.
	    """
	    page_texts = []
	    for pdf in pdf_docs or ():
	        # A page with no extractable text may come back empty (or None in
	        # some PyPDF2 versions -- TODO confirm); treat both as "".
	        page_texts.extend(
	            page.extract_text() or "" for page in PdfReader(pdf).pages
	        )
	    # Join once instead of growing a string with += inside the loop.
	    return "".join(page_texts)

	def create_vector_embeddings(pdfText):
	    """Build the FAISS index for the uploaded files and cache it in session state.

	    The index is rebuilt whenever the set of uploaded files differs from the
	    one the cached index was built from, so processing a new document no
	    longer silently keeps answering from the previous one.

	    Args:
	        pdfText: The uploaded file objects (despite the name, not text);
	            they are passed to ``get_pdf_text``. ``None`` is treated as
	            an empty list.

	    Side effects:
	        On success sets ``st.session_state.docs`` (raw text), ``.splitter``,
	        ``.final_docs`` (text chunks), ``.vectors`` (FAISS store) and
	        ``.vectors_source`` (identity of the files that were indexed).
	        When no text can be extracted a warning is shown and the session
	        state is left as it was.
	    """
	    uploads = list(pdfText or [])

	    # (name, size) per file identifies the upload set cheaply; getattr keeps
	    # this tolerant of file objects that lack one of the attributes.
	    source_id = tuple(
	        (getattr(upload, "name", None), getattr(upload, "size", None))
	        for upload in uploads
	    )
	    if (
	        "vectors" in st.session_state
	        and st.session_state.get("vectors_source") == source_id
	    ):
	        # Same files as last time: reuse the cached index.
	        return

	    raw_text = get_pdf_text(uploads)
	    splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=400)
	    chunks = splitter.split_text(raw_text)
	    if not chunks:
	        # Building a vector store from zero texts would fail; tell the user
	        # instead of crashing.
	        st.warning("No extractable text was found in the uploaded files.")
	        return

	    st.session_state.docs = raw_text
	    st.session_state.splitter = splitter
	    st.session_state.final_docs = chunks
	    st.session_state.vectors = FAISS.from_texts(chunks, embeddings)
	    st.session_state.vectors_source = source_id

	# Seed per-session defaults: the dropdown starts with only its placeholder
	# entry, and no query is selected yet. Existing values are left untouched.
	for _state_key, _initial_value in (
	    ("options", ["Select a query"]),
	    ("user_prompt", ""),
	):
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = _initial_value

	def autopopulate_promptsbydoctype(uploaded_text):
	    """Add the canned PDF queries to the dropdown options for PDF uploads.

	    Only the first uploaded file's name is inspected. The extension check is
	    case-insensitive so that names such as ``REPORT.PDF`` are recognised.

	    Args:
	        uploaded_text: Sequence of uploaded file objects exposing ``.name``;
	            may be empty or ``None``, in which case nothing happens.

	    Side effects:
	        Appends each suggested query that is not already present to
	        ``st.session_state.options``.
	    """
	    if not uploaded_text:
	        return
	    if not uploaded_text[0].name.lower().endswith("pdf"):
	        return

	    # Queries offered for every PDF upload.
	    suggested_queries = (
	        "get all the programme details including rights and tape content etc in pointwise manner, dont miss any info",
	        "give a structured short summary of the programmes and details",
	        "give me programme package with programme details listed",
	    )
	    for query in suggested_queries:
	        # Skip duplicates so repeated processing does not grow the list.
	        if query not in st.session_state.options:
	            st.session_state.options.append(query)

	st.title("Basic Document QnA")

	with st.sidebar:
	    st.title("Menu:")
	    # The uploader's current selection is stored in session state on every
	    # run so the sections further down can read it.
	    st.session_state.uploaded_text = st.file_uploader("Upload your Files and Click on the Submit & Process Button", accept_multiple_files=True)
	    if st.button("Click To Process File"):
	        if not st.session_state.uploaded_text:
	            # Nothing to index: say so instead of running the embedding
	            # step on an empty upload list.
	            st.warning("Please upload at least one file before processing.")
	        else:
	            with st.spinner("Processing..."):
	                create_vector_embeddings(st.session_state.uploaded_text)
	            # Only announce readiness when an index actually exists.
	            if "vectors" in st.session_state:
	                st.write("Vector Database is ready")
	            autopopulate_promptsbydoctype(st.session_state.uploaded_text)

	# st.markdown('<div class="bottom-button">', unsafe_allow_html=True)
	# params = ['docs', 'splitter','final_docs']
	# if st.button("Clean Current Document Settings") and st.session_state.keys():
	# with st.spinner("Cleaning In Progress...."):
	# for param in params:
	# if param in st.session_state:
	# del st.session_state[param]

	# st.session_state['uploaded_text'] = ""
	# st.write("Cleanup completed..")
	# st.markdown('</div>', unsafe_allow_html=True)


	new_option = st.text_input("Or type your query here:")

	# A newly typed query is remembered in the dropdown options and becomes the
	# active prompt.
	known_queries = st.session_state.options
	if new_option and new_option not in known_queries:
	    known_queries.append(new_option)
	    st.session_state.user_prompt = new_option

	# The dropdown is shown only when something is uploaded and the first file's
	# name does not contain "Technical"; it preselects the active prompt when
	# that prompt is one of the options, otherwise the first entry.
	uploads = st.session_state.uploaded_text
	if uploads and "Technical" not in uploads[0].name:
	    active_prompt = st.session_state.user_prompt
	    if active_prompt in known_queries:
	        preselected = known_queries.index(active_prompt)
	    else:
	        preselected = 0
	    st.session_state.user_prompt = st.selectbox(
	        "Enter/Select your query from the document",
	        known_queries,
	        index=preselected,
	    )

	# Answer the active prompt, ignoring the dropdown's placeholder entry.
	if st.session_state.user_prompt and st.session_state.user_prompt != "Select a query":
	    if "vectors" not in st.session_state:
	        # A query can be typed before any file has been processed; without
	        # this guard the retriever lookup below raises AttributeError.
	        st.warning("Please upload a document and click 'Click To Process File' before asking a question.")
	    else:
	        document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt_template)
	        retriever = st.session_state.vectors.as_retriever()
	        retrieval_chain = create_retrieval_chain(retriever, document_chain)

	        # perf_counter measures elapsed wall-clock time; process_time only
	        # counts this process's CPU time and so excludes time spent waiting
	        # on the LLM call.
	        start = time.perf_counter()
	        response = retrieval_chain.invoke({"input": st.session_state.user_prompt})
	        print(f"Response time :{time.perf_counter()-start}")

	        st.write(response['answer'])

	        # Show the retrieved chunks that were supplied as context.
	        with st.expander("Document similarity Search"):
	            for doc in response['context']:
	                st.write(doc.page_content)
	                st.write('------------------------')