Spaces:
Sleeping
Sleeping
import warnings | |
warnings.filterwarnings("ignore") | |
import os | |
#pdf loader | |
from PyPDF2 import PdfReader | |
#textsplitter | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
# Embeddings | |
from langchain_community.embeddings import HuggingFaceBgeEmbeddings | |
#storing vector embeddings | |
from langchain_community.vectorstores import FAISS | |
#to connect llm models from huggingface | |
from langchain import HuggingFaceHub | |
from langchain_groq import ChatGroq | |
from langchain.chains.question_answering import load_qa_chain | |
from langchain_core.prompts import ChatPromptTemplate | |
from langchain.chains.combine_documents import create_stuff_documents_chain | |
from langchain.chains import create_retrieval_chain | |
import streamlit as st | |
from streamlit_extras.add_vertical_space import add_vertical_space | |
def get_pdf_text(filename): | |
pdf_reader = PdfReader(filename) | |
text = "" | |
for page in pdf_reader.pages: | |
text += page.extract_text() | |
return text | |
def get_text_chunks(text): | |
text_splitter = RecursiveCharacterTextSplitter( | |
chunk_size=1000, | |
chunk_overlap=200, | |
length_function=len | |
) | |
chunks = text_splitter.split_text(text=text) | |
return chunks | |
def get_vectorstore(textchunks): | |
model_name = "BAAI/bge-small-en" | |
model_kwargs = {"device": "cpu"} | |
encode_kwargs = {"normalize_embeddings": True} | |
hf_embedding = HuggingFaceBgeEmbeddings( | |
model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs | |
) | |
db = FAISS.from_texts(textchunks, hf_embedding) | |
return db | |
def get_conversation_chain(db): | |
# llm=HuggingFaceHub(repo_id="google/flan-t5-base", | |
# model_kwargs={"max_new_tokens": 100, | |
# "temperature": 0.2, | |
# }) | |
llm = ChatGroq( | |
api_key=api_key, | |
model="llama3-8b-8192", | |
temperature = 0 | |
) | |
chain = load_qa_chain(llm, chain_type="stuff") #initialize llm and chain type | |
return chain | |
def main(): | |
st.set_page_config(page_title="RAG: Internet Expense Invoice PDF π") | |
st.header("RAG: Internet Expense Invoice PDF π") | |
user_question = st.text_input("Ask a question about Details of the Invoice:") | |
# if user_question: | |
# handle_userinput(user_question) | |
with st.sidebar: | |
st.markdown(''' | |
## About | |
This app is an LLM-powered chatbot built using: | |
- [Streamlit](https://streamlit.io/) | |
- [LangChain](https://python.langchain.com/) | |
- [HuggingFace](https://huggingface.co/BAAI/bge-small-en) | |
- [Groq](https://groq.com/) | |
''') | |
st.subheader("Your documents") | |
pdf_docs = st.file_uploader( | |
"Upload Internet Invoice here and click on 'Process'") | |
st.write('Made with β€οΈ by Asheesh ') | |
if st.button("Process"): | |
with st.spinner("Processing"): | |
#get pdf text | |
raw_text = get_pdf_text(pdf_docs) | |
#get the text chunks | |
text_chunks = get_text_chunks(raw_text) | |
st.write(text_chunks) | |
# #create vector store | |
st.session_state.vectorstore = get_vectorstore(text_chunks) | |
# create conversation chain | |
st.session_state.conversation = get_conversation_chain(st.session_state.vectorstore) | |
if user_question: | |
docs = st.session_state.vectorstore.similarity_search(user_question) #perform similarity search in the vector database (db) | |
answer = st.session_state.conversation.run(input_documents=docs, question=user_question) #output the answer | |
st.write(answer) | |
if __name__ == '__main__': | |
os.getenv("HUGGINGFACEHUB_API_TOKEN") | |
api_key = os.getenv("GROQ_API_KEY") | |
main() |