gli-mrunal committed on
Commit
7b1da1b
1 Parent(s): b28488b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +145 -0
app.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://www.youtube.com/watch?v=dXxQ0LR-3Hg
2
+
3
+ # Create Conda virtual environment
4
+ # conda create --name gpt_chatbot python=3.9.4
5
+ # conda activate gpt_chatbot
6
+
7
+ # Installation
8
+ # pip install streamlit pypdf2 langchain python-dotenv faiss-cpu openai huggingface_hub
9
+ # pip install tiktoken
10
+
11
+ # pip install InstructorEmbedding sentence_transformers
12
+
13
+ # Could not import tiktoken python package. This is needed in order to for OpenAIEmbeddings. Please install it with `pip install tiktoken`.
14
+ # run the app using the following command in anaconda VS Code terminal
15
+ # streamlit run app.py
16
+
17
+
18
+
19
+ import streamlit as st
20
+ from dotenv import load_dotenv
21
+ from PyPDF2 import PdfReader
22
+ from langchain.text_splitter import CharacterTextSplitter
23
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
24
+ from langchain.vectorstores import FAISS # FAISS instead of PineCone
25
+ # from langchain.llms import OpenAI
26
+ from langchain.chat_models import ChatOpenAI
27
+ from langchain.memory import ConversationBufferMemory
28
+ from langchain.chains import ConversationalRetrievalChain
29
+ from htmlTemplates import css, bot_template, user_template
30
+
31
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page in every uploaded PDF.

    Args:
        pdf_docs: iterable of file-like objects readable by PyPDF2.PdfReader
            (e.g. Streamlit UploadedFile objects).

    Returns:
        A single string with all page text joined together.
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() returns None for pages without a text layer
            # (e.g. scanned images); guard so concatenation never raises
            # TypeError.
            text += page.extract_text() or ""
    return text
38
+
39
def get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Chunks are ~1000 characters, split on newlines, with a 200-character
    overlap so context is preserved across chunk boundaries.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
48
+
49
def get_vectorstore(text_chunks):
    """Embed *text_chunks* and index them in an in-memory FAISS store."""
    # instructor-xl was chosen from the MTEB leaderboard
    # (https://huggingface.co/spaces/mteb/leaderboard) over OpenAI embeddings;
    # it runs locally via HuggingFace.
    instruct_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=instruct_embeddings)
54
+
55
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over *vectorstore*.

    Pairs an OpenAI chat model with a buffer memory so follow-up questions
    can reference earlier turns of the conversation.
    """
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(),
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
65
+
66
+
67
+
68
def handle_userinput(user_question):
    """Send *user_question* through the conversation chain and render the chat."""
    # st.session_state.conversation holds the chain configured with the
    # vectorstore and memory during the "Process" step.
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    # History alternates user / bot messages, starting with the user.
    for idx, message in enumerate(st.session_state.chat_history):
        template = user_template if idx % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
79
+
80
+
81
+
82
+
83
def main():
    """Streamlit entry point: page setup, question box, and PDF-processing sidebar."""
    load_dotenv()  # pick up OPENAI_API_KEY etc. from a local .env file
    st.set_page_config(page_title="Chat with multiple law journal PDFs",
                       page_icon=":books:")

    st.write(css, unsafe_allow_html=True)

    # Persist the chain and chat history across Streamlit reruns.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None

    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")

    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        if st.session_state.conversation is None:
            # Guard: asking before "Process" would otherwise call None and
            # raise TypeError.
            st.warning("Please upload and process your PDFs first.")
        else:
            handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")

        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            if not pdf_docs:
                # Guard: clicking Process with no upload would otherwise
                # build a vectorstore from empty text.
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing"):
                    # 1. Extract raw text from the uploaded PDFs.
                    raw_text = get_pdf_text(pdf_docs)

                    # 2. Split into overlapping chunks for embedding.
                    text_chunks = get_text_chunks(raw_text)

                    # 3. Embed and index the chunks (embedding model chosen
                    #    from https://huggingface.co/spaces/mteb/leaderboard).
                    vectorstore = get_vectorstore(text_chunks)

                    # 4. Store the conversation chain so it survives reruns.
                    st.session_state.conversation = get_conversation_chain(vectorstore)
139
+
140
+
141
+
142
+
143
+
144
# Script entry point — run with: streamlit run app.py
if __name__ == '__main__':
    main()