import warnings
warnings.filterwarnings("ignore")

import os

# PDF loader
from PyPDF2 import PdfReader
# Text splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Embeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
# Vector store for the embeddings
from langchain_community.vectorstores import FAISS
# HuggingFaceHub is only needed for the commented-out fallback LLM in
# get_conversation_chain; the old top-level `from langchain import HuggingFaceHub`
# path may fail on recent langchain releases, so import it from langchain_community.
# from langchain_community.llms import HuggingFaceHub
from langchain_groq import ChatGroq
from langchain.chains.question_answering import load_qa_chain
# Unused in the active code path; kept for the alternative retrieval-chain
# sketch below get_conversation_chain.
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

import streamlit as st
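
# Pipeline: extract PDF text -> split into chunks -> embed with BGE ->
# index in FAISS -> retrieve similar chunks -> answer with the Groq LLM.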
def get_pdf_text(filename):
    """Extract the raw text from every page of the PDF."""
    pdf_reader = PdfReader(filename)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None (e.g. for scanned pages), so guard it
        text += page.extract_text() or ""
    return text
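
# Note: PdfReader accepts both file paths and file-like objects, which is why
# the Streamlit UploadedFile can be passed to get_pdf_text() directly in main().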
def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_text(text=text)
    return chunks
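
# With chunk_size=1000 and chunk_overlap=200 (both measured by len, i.e. in
# characters), consecutive chunks share roughly 200 characters of context so
# that sentences cut at a chunk boundary stay retrievable.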
def get_vectorstore(textchunks):
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    hf_embedding = HuggingFaceBgeEmbeddings(
        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )
    db = FAISS.from_texts(textchunks, hf_embedding)
    return db
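
# Sketch (not wired into the app): the FAISS index can be persisted between
# runs instead of being rebuilt on every upload. Depending on the installed
# langchain version, load_local may also require
# allow_dangerous_deserialization=True.
#   db.save_local("faiss_index")
#   db = FAISS.load_local("faiss_index", hf_embedding)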
def get_conversation_chain(db):
    """Build the QA chain; `db` is accepted for symmetry, but retrieval itself
    happens in main() via similarity_search."""
    # Fallback: a HuggingFace-hosted model instead of Groq
    # llm = HuggingFaceHub(repo_id="google/flan-t5-base",
    #                      model_kwargs={"max_new_tokens": 100,
    #                                    "temperature": 0.2})
    llm = ChatGroq(
        api_key=os.getenv("GROQ_API_KEY"),
        model="llama3-8b-8192",
        temperature=0,
    )
    # "stuff" concatenates all retrieved documents into a single prompt
    chain = load_qa_chain(llm, chain_type="stuff")
    return chain
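
# Alternative sketch using the imported create_stuff_documents_chain /
# create_retrieval_chain helpers (load_qa_chain is deprecated in newer
# LangChain releases); assumes `db` is the FAISS store from get_vectorstore:
#   prompt = ChatPromptTemplate.from_template(
#       "Answer using only this context:\n{context}\n\nQuestion: {input}")
#   doc_chain = create_stuff_documents_chain(llm, prompt)
#   rag_chain = create_retrieval_chain(db.as_retriever(), doc_chain)
#   answer = rag_chain.invoke({"input": user_question})["answer"]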
def main():
    st.set_page_config(page_title="RAG: Internet Expense Invoice PDF 📄")
    st.header("RAG: Internet Expense Invoice PDF 📄")
    user_question = st.text_input("Ask a question about the details of the invoice:")

    with st.sidebar:
        st.markdown('''
        ## About
        This app is an LLM-powered chatbot built using:
        - [Streamlit](https://streamlit.io/)
        - [LangChain](https://python.langchain.com/)
        - [HuggingFace](https://huggingface.co/BAAI/bge-small-en)
        - [Groq](https://groq.com/)
        ''')
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload Internet Invoice here and click on 'Process'")
        st.write('Made with ❤️ by Asheesh')

        if st.button("Process"):
            if pdf_docs is None:
                st.warning("Please upload a PDF before processing.")
            else:
                with st.spinner("Processing"):
                    # Get the raw PDF text
                    raw_text = get_pdf_text(pdf_docs)
                    # Split it into overlapping chunks
                    text_chunks = get_text_chunks(raw_text)
                    st.write(text_chunks)  # echo the chunks for inspection
                    # Create the vector store
                    st.session_state.vectorstore = get_vectorstore(text_chunks)
                    # Create the QA chain
                    st.session_state.conversation = get_conversation_chain(
                        st.session_state.vectorstore)

    if user_question:
        if "vectorstore" not in st.session_state:
            st.info("Upload and process a PDF first.")
        else:
            # Retrieve the chunks most similar to the question
            docs = st.session_state.vectorstore.similarity_search(user_question)
            # Answer the question from the retrieved chunks
            answer = st.session_state.conversation.run(
                input_documents=docs, question=user_question)
            st.write(answer)
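
# Note: st.session_state keeps the FAISS store and the QA chain alive across
# Streamlit reruns, so the uploaded PDF only has to be processed once per session.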
if __name__ == '__main__':
    # GROQ_API_KEY must be set in the environment; HUGGINGFACEHUB_API_TOKEN is
    # only needed if the HuggingFaceHub fallback LLM is enabled.
    main()