# Document_Query / app.py
# Author: Asheesh18 — Final Update (commit d99fe36, verified)
import warnings
warnings.filterwarnings("ignore")
import os
#pdf loader
from PyPDF2 import PdfReader
#textsplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Embeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
#storing vector embeddings
from langchain_community.vectorstores import FAISS
#to connect llm models from huggingface
from langchain import HuggingFaceHub
from langchain_groq import ChatGroq
from langchain.chains.question_answering import load_qa_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
import streamlit as st
from streamlit_extras.add_vertical_space import add_vertical_space
def get_pdf_text(filename):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        filename: A path or file-like object accepted by PyPDF2's
            ``PdfReader`` (Streamlit's ``UploadedFile`` works here).

    Returns:
        str: All page text joined together; empty string for an empty PDF.
    """
    pdf_reader = PdfReader(filename)
    # extract_text() returns None for pages with no extractable text
    # (e.g. scanned/image-only pages); "or ''" prevents a TypeError
    # when concatenating.
    return "".join((page.extract_text() or "") for page in pdf_reader.pages)
def get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Args:
        text: The full document text as one string.

    Returns:
        list[str]: Chunks of at most 1000 characters, each overlapping
        its neighbour by 200 characters so context is not lost at the
        chunk boundaries.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text=text)
def get_vectorstore(textchunks):
    """Embed text chunks with a BGE model and index them in FAISS.

    Args:
        textchunks: List of text chunks to embed.

    Returns:
        A FAISS vector store built from the chunks, searchable by
        similarity.
    """
    embeddings = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": "cpu"},
        # Normalized embeddings make inner-product search behave like
        # cosine similarity.
        encode_kwargs={"normalize_embeddings": True},
    )
    return FAISS.from_texts(textchunks, embeddings)
def get_conversation_chain(db):
    """Build a question-answering chain backed by a Groq-hosted Llama 3 model.

    Args:
        db: Vector store (not used inside this function; retained for
            interface compatibility with callers that pass the FAISS index).

    Returns:
        A "stuff"-type QA chain that answers questions from supplied
        documents.
    """
    # Previously used HuggingFaceHub(repo_id="google/flan-t5-base");
    # replaced by Groq below.
    #
    # Read the key from the environment directly so this function also
    # works when the module is imported: the module-global `api_key` is
    # only bound under `if __name__ == '__main__'`, which would raise
    # NameError on import. When run as a script the value is identical.
    groq_key = os.getenv("GROQ_API_KEY", globals().get("api_key"))
    llm = ChatGroq(
        api_key=groq_key,
        model="llama3-8b-8192",
        temperature=0,  # deterministic answers for invoice extraction
    )
    # "stuff" chain type: all retrieved documents are stuffed into a
    # single prompt.
    return load_qa_chain(llm, chain_type="stuff")
def main():
    """Streamlit entry point: upload an invoice PDF, index it, answer questions."""
    st.set_page_config(page_title="RAG: Internet Expense Invoice PDF 📄")
    st.header("RAG: Internet Expense Invoice PDF 📄")
    user_question = st.text_input("Ask a question about Details of the Invoice:")
    with st.sidebar:
        st.markdown('''
## About
This app is an LLM-powered chatbot built using:
- [Streamlit](https://streamlit.io/)
- [LangChain](https://python.langchain.com/)
- [HuggingFace](https://huggingface.co/BAAI/bge-small-en)
- [Groq](https://groq.com/)
''')
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload Internet Invoice here and click on 'Process'")
        st.write('Made with ❤️ by Asheesh ')
        if st.button("Process"):
            if pdf_docs is None:
                # Guard: "Process" clicked without an upload would pass
                # None to PdfReader and crash.
                st.warning("Please upload a PDF before clicking 'Process'.")
            else:
                with st.spinner("Processing"):
                    # Extract text, chunk it, embed + index, then build
                    # the QA chain; both artifacts live in session_state
                    # so they survive Streamlit reruns.
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    st.session_state.vectorstore = get_vectorstore(text_chunks)
                    st.session_state.conversation = get_conversation_chain(
                        st.session_state.vectorstore)
    if user_question:
        if "vectorstore" not in st.session_state:
            # Guard: asking a question before a document is processed
            # would fail on the missing session_state entries.
            st.info("Upload and process a PDF first.")
        else:
            # Retrieve the most similar chunks, then let the chain
            # answer from them.
            docs = st.session_state.vectorstore.similarity_search(user_question)
            answer = st.session_state.conversation.run(
                input_documents=docs, question=user_question)
            st.write(answer)
if __name__ == '__main__':
    # Groq API key for ChatGroq; expected to be set in the environment
    # (e.g. as a Hugging Face Space secret). Bound at module level because
    # get_conversation_chain reads it.
    api_key = os.getenv("GROQ_API_KEY")
    # NOTE(review): a discarded os.getenv("HUGGINGFACEHUB_API_TOKEN") call
    # was removed here — its return value was never used, so it was a no-op.
    main()