akarshrajsingh7 committed on
Commit
9b6e71b
1 Parent(s): d9164c6
Files changed (6)
  1. .gitignore +2 -0
  2. Logo.png +0 -0
  3. app.py +80 -0
  4. app_style.py +73 -0
  5. llm_chain.py +87 -0
  6. requirements.txt +11 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ *.pyc
Logo.png ADDED
app.py ADDED
@@ -0,0 +1,80 @@
+ from dotenv import load_dotenv
+ import streamlit as st
+ from app_style import css, bot_template, user_template
+ from llm_chain import RAG_PDF
+
+ logo_image_path = "Logo.png"
+
+ def handle_userinput(user_question):
+     response = st.session_state.conversation({'question': user_question})
+     st.session_state.chat_history = response['chat_history']
+
+     for i, message in enumerate(st.session_state.chat_history):
+         if i % 2 == 0:
+             st.write(user_template.replace(
+                 "{{MSG}}", message.content), unsafe_allow_html=True)
+         else:
+             st.write(bot_template.replace(
+                 "{{MSG}}", message.content), unsafe_allow_html=True)
+
+
+ def main():
+     # Loading environment variables
+     load_dotenv()
+     # Page config
+     st.set_page_config(page_title="Ask-your-PDFs",
+                        page_icon=":books:")
+     st.write(css, unsafe_allow_html=True)
+
+     # Chat history session management
+     if "conversation" not in st.session_state:
+         st.session_state.conversation = None
+     if "chat_history" not in st.session_state:
+         st.session_state.chat_history = None
+
+     # For rendering the background image (uncomment the next line to set a background image for the application)
+     # render_background_img(background_path)
+
+     # Chat user input
+     st.header("Chat with PDFs :books:")
+     user_question = st.text_input("Ask a question about your documents:")
+     styl = f"""
+     <style>
+     .stTextInput {{
+         position: fixed;
+         bottom: 3rem;
+     }}
+     </style>
+     """
+     st.markdown(styl, unsafe_allow_html=True)
+
+     # Handling user input
+     if user_question:
+         handle_userinput(user_question)
+
+     with st.sidebar:
+         # Loading the logo
+         st.image(logo_image_path, use_column_width=True)
+
+         # Header text for the sidebar
+         st.subheader("Your documents")
+
+         # File uploader (allowing multiple files to be uploaded)
+         pdf_docs = st.file_uploader(
+             "Upload your PDFs here and click on 'Submit'", accept_multiple_files=True)
+
+         # When the submit button is clicked
+         if st.button("Submit"):
+             # Processing spinner
+             with st.spinner("Processing"):
+                 # Creating an object of the RAG pipeline
+                 RAG_object = RAG_PDF(pdf_docs)
+
+                 # Activating the RAG pipeline
+                 st.session_state.conversation = RAG_object.activate_RAG_pipeline()
+
+                 # Posting an update once the upload and RAG processing are done
+                 st.write("Processing Completed.")
+
+ if __name__ == '__main__':
+     main()
app_style.py ADDED
@@ -0,0 +1,73 @@
+ import streamlit as st
+ import base64
+
+ def render_background_img(background_img_path, image_type="jpg"):
+     '''
+     Read an image from the repository root and set it as the app background.
+
+     Returns
+     -------
+     None; the background is injected via st.markdown.
+     '''
+     # File extension used in the data URI
+     main_bg_ext = image_type
+
+     st.markdown(
+         f"""
+         <style>
+         .stApp {{
+             background: url(data:image/{main_bg_ext};base64,{base64.b64encode(open(background_img_path, "rb").read()).decode()});
+             background-size: cover
+         }}
+         </style>
+         """,
+         unsafe_allow_html=True
+     )
+
+ ####### CSS for the chat UI
+
+ css = '''
+ <style>
+ .chat-message {
+     padding: 1rem; border-radius: 0.5rem; margin-bottom: 0.8rem; display: flex; max-height: 40%; overflow: auto;
+ }
+ .chat-message.user {
+     background-color: #2b313e
+ }
+ .chat-message.bot {
+     background-color: #475063
+ }
+ .chat-message .avatar {
+     width: 15%;
+ }
+ .chat-message .avatar img {
+     max-width: 60px;
+     max-height: 60px;
+     border-radius: 50%;
+     object-fit: cover;
+ }
+ .chat-message .message {
+     width: 85%;
+     padding: 0 1.2rem;
+     color: #fff;
+ }
+ </style>
+ '''
+
+ bot_template = '''
+ <div class="chat-message bot">
+     <div class="avatar">
+         <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 60px; max-width: 60px; border-radius: 50%; object-fit: cover;">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
+
+ user_template = '''
+ <div class="chat-message user">
+     <div class="avatar">
+         <img src="https://i.ibb.co/XJBBhsD/IMG-7040.jpg" style="max-height: 60px; max-width: 60px; border-radius: 50%; object-fit: cover;">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
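render_background_img is only referenced from a commented-out line in app.py; a minimal sketch of wiring it up inside a Streamlit script is below (the image file name is an assumption, not part of this commit):

    import streamlit as st
    from app_style import render_background_img

    # Hypothetical background image placed in the repo root; image_type should
    # match the file extension ("jpg" here).
    render_background_img("background.jpg", image_type="jpg")
    st.title("Ask-your-PDFs")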
llm_chain.py ADDED
@@ -0,0 +1,87 @@
+ import streamlit as st
+ from dotenv import load_dotenv
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.chat_models import ChatOpenAI
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ from app_style import css, bot_template, user_template
+ from langchain.llms import HuggingFaceHub
+
+ class RAG_PDF:
+     '''
+     Class implementing a RAG pipeline for answering questions from PDFs
+     '''
+     def __init__(self, pdf_docs, model="open-source"):
+         '''
+         Initializing the embedding model and the LLM
+         '''
+         self.pdf_docs = pdf_docs
+         if model == "open-source":
+             # Open-source model to generate embeddings for the text
+             self.embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+             # Open-source model to generate responses (currently FLAN-T5-XXL)
+             self.llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})
+         elif model == "openai":
+             # OpenAI model to generate embeddings for the text
+             self.embeddings = OpenAIEmbeddings()
+             # OpenAI model to generate responses
+             self.llm = ChatOpenAI()
+
+
+     def pdf_extract_text(self):
+         '''
+         Extracting text from the PDFs
+         '''
+         text = ""
+         for pdf in self.pdf_docs:
+             pdf_reader = PdfReader(pdf)
+             for page in pdf_reader.pages:
+                 text += page.extract_text()
+         return text
+
+     def pdf_chunkize(self, text):
+         '''
+         Splitting the text into smaller chunks
+         '''
+         text_splitter = CharacterTextSplitter(
+             separator="\n",
+             chunk_size=1000,
+             chunk_overlap=200,  # overlap keeps context across chunk boundaries
+             length_function=len
+         )
+         chunks = text_splitter.split_text(text)
+         return chunks
+
+     def pdf_vectorstore(self, text_chunks):
+         '''
+         Creating a FAISS vector store from the text chunks
+         '''
+         vectorstore = FAISS.from_texts(texts=text_chunks, embedding=self.embeddings)
+         return vectorstore
+
+     def pdf_conversation_chain(self, vectorstore):
+         memory = ConversationBufferMemory(
+             memory_key='chat_history', return_messages=True)
+         conversation_chain = ConversationalRetrievalChain.from_llm(
+             llm=self.llm,
+             retriever=vectorstore.as_retriever(),
+             memory=memory
+         )
+         return conversation_chain
+
+     def activate_RAG_pipeline(self):
+         # Get the raw PDF text
+         raw_text = self.pdf_extract_text()
+
+         # Split the text into chunks
+         text_chunks = self.pdf_chunkize(raw_text)
+
+         # Create the vector store
+         vectorstore = self.pdf_vectorstore(text_chunks)
+
+         # Create the conversation chain
+         conversation_chain = self.pdf_conversation_chain(vectorstore)
+         return conversation_chain
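For reference, a minimal sketch of exercising RAG_PDF outside Streamlit (the PDF path and the question are illustrative assumptions; the app itself drives the same chain through st.session_state.conversation in app.py):

    from dotenv import load_dotenv
    from llm_chain import RAG_PDF

    load_dotenv()  # expects HUGGINGFACEHUB_API_TOKEN (or OPENAI_API_KEY) in .env

    with open("sample.pdf", "rb") as pdf_file:          # hypothetical input file
        rag = RAG_PDF([pdf_file], model="open-source")  # same default as the app
        chain = rag.activate_RAG_pipeline()             # text -> chunks -> FAISS -> chain
        result = chain({"question": "What is this document about?"})
        print(result["answer"])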
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ langchain==0.1.1
+ PyPDF2==3.0.1
+ python-dotenv==1.0.0
+ streamlit==1.30.0
+ openai==0.27.6
+ faiss-cpu==1.7.4
+ altair==4
+ tiktoken==0.4.0
+ huggingface-hub
+ InstructorEmbedding==1.0.1
+ sentence-transformers==2.2.2
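To run the Space locally, the usual Streamlit workflow applies: install the pinned dependencies with pip install -r requirements.txt, put the API key the chosen model path needs (HUGGINGFACEHUB_API_TOKEN for the default open-source models, OPENAI_API_KEY for the OpenAI ones) into the .env file that the new .gitignore keeps out of version control, and start the app with streamlit run app.py.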