# PDF_Extractor / app.py
# Jagannath95's picture
# Update app.py
# d5bddf3 verified
import streamlit as st
from dotenv import load_dotenv
import sys
from PyPDF2 import PdfReader
from langchain_community.llms import OpenAI
from langchain_community.chat_models import ChatOpenAI
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import MultiQueryRetriever
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI , Cohere
def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        pdf_docs: Whatever ``PdfReader`` accepts as its source (``main``
            passes the object returned by ``st.file_uploader``), or ``None``
            when nothing has been uploaded.

    Returns:
        The text of all pages joined in page order, or ``""`` when
        ``pdf_docs`` is ``None``.
    """
    if pdf_docs is None:
        # No upload yet: report "no text" rather than failing inside PdfReader.
        return ""
    pdf_reader = PdfReader(pdf_docs)
    # Defensive: a page that yields no text (None or empty) contributes ""
    # instead of breaking the concatenation with a TypeError.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def get_text_chunks(text):
    """Split raw text into chunks using a newline-based CharacterTextSplitter.

    The splitter is configured with ``chunk_size=1000`` and
    ``chunk_overlap=200``, measuring length with ``len``.

    Args:
        text: The full document text to split.

    Returns:
        The value returned by ``CharacterTextSplitter.split_text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
        "is_separator_regex": False,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
def get_vectorstore(text_chunks):
    """Build a FAISS vector store from text chunks using OpenAI embeddings.

    Args:
        text_chunks: The texts to embed and index.

    Returns:
        The store created by ``FAISS.from_texts``.
    """
    # Alternative embedding kept from the original as a reference:
    # HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
    return FAISS.from_texts(texts=text_chunks, embedding=OpenAIEmbeddings())
def ll_retriver(vectorstore):
    """Build a MultiQueryRetriever on top of the vector store's retriever.

    Args:
        vectorstore: A store exposing ``as_retriever()``.

    Returns:
        The retriever produced by ``MultiQueryRetriever.from_llm``, driven by
        an ``OpenAI`` LLM created with ``temperature=0``.
    """
    query_llm = OpenAI(temperature=0)
    base_retriever = vectorstore.as_retriever()
    return MultiQueryRetriever.from_llm(retriever=base_retriever, llm=query_llm)
def chain(llm_based_retriever):
    """Build a RetrievalQA chain of type "stuff" over the given retriever.

    Args:
        llm_based_retriever: The retriever handed to the chain.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type``, using a
        ``Cohere`` LLM created with ``temperature=0``.
    """
    return RetrievalQA.from_chain_type(
        retriever=llm_based_retriever,
        chain_type="stuff",
        llm=Cohere(temperature=0),
    )
def main():
    """Run the Streamlit app: upload a PDF, index it, then answer questions.

    The sidebar handles upload and processing. The resulting QA chain is
    stored in ``st.session_state`` so it is still available on later reruns
    of the script, and the question box is shown once a chain exists.
    """
    load_dotenv()
    st.set_page_config(page_title="Chat with a PDFs", page_icon=":books:")

    # Seed the session keys once so later reads never hit a missing attribute.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "Q_A_Chain" not in st.session_state:
        st.session_state.Q_A_Chain = None

    st.header("Chat with PDF :books:")

    with st.sidebar:
        st.subheader("Upload your PDF")
        pdf_docs = st.file_uploader("Upload your PDF here then Process")
        if st.button("Process"):
            if pdf_docs is None:
                # The uploader yields None until a file is chosen; parsing it
                # would only produce a traceback for the user.
                st.warning("Please upload a PDF before clicking Process.")
            else:
                with st.spinner("Processing"):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    if text_chunks:
                        vectorstore = get_vectorstore(text_chunks)
                        llm_based_retriver = ll_retriver(vectorstore)
                        st.session_state.Q_A_Chain = chain(llm_based_retriver)
                if text_chunks:
                    st.success("PDF processed successfully, you can now ask Questions.")
                else:
                    # Nothing to embed (e.g. no text layer): skip building the
                    # index instead of passing an empty list to the store.
                    st.error("No extractable text was found in this PDF.")

    if st.session_state.Q_A_Chain is not None:
        question = st.text_input("Ask a Question about your document:")
        if st.button("Submit Question"):
            if not question.strip():
                st.warning("Please enter a question first.")
            else:
                with st.spinner("Getting answer..."):
                    # .invoke replaces the deprecated direct call on the chain.
                    response = st.session_state.Q_A_Chain.invoke({"query": question})
                    st.write(response["result"])
# Start the app only when this file is executed as the main script.
if __name__ == "__main__":
    main()