datastx committed on
Commit
62e3596
1 Parent(s): 6c25a82

chat with pdf

Browse files
Files changed (6) hide show
  1. .gitignore +5 -0
  2. Makefile.venv +272 -0
  3. app.py +104 -0
  4. htmlTemplates.py +44 -0
  5. makefile +14 -0
  6. requirements.txt +21 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .venv
2
+ .env
3
+ __pycache__
4
+ models
5
+ *.bin
Makefile.venv ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SEAMLESSLY MANAGE PYTHON VIRTUAL ENVIRONMENT WITH A MAKEFILE
3
+ #
4
+ # https://github.com/sio/Makefile.venv v2023.04.17
5
+ #
6
+ #
7
+ # Insert `include Makefile.venv` at the bottom of your Makefile to enable these
8
+ # rules.
9
+ #
10
+ # When writing your Makefile use '$(VENV)/python' to refer to the Python
11
+ # interpreter within virtual environment and '$(VENV)/executablename' for any
12
+ # other executable in venv.
13
+ #
14
+ # This Makefile provides the following targets:
15
+ # venv
16
+ # Use this as a dependency for any target that requires virtual
17
+ # environment to be created and configured
18
+ # python, ipython
19
+ # Use these to launch interactive Python shell within virtual environment
20
+ # shell, bash, zsh
21
+ # Launch interactive command line shell. "shell" target launches the
22
+ # default shell Makefile executes its rules in (usually /bin/sh).
23
+ # "bash" and "zsh" can be used to refer to the specific desired shell.
24
+ # show-venv
25
+ # Show versions of Python and pip, and the path to the virtual environment
26
+ # clean-venv
27
+ # Remove virtual environment
28
+ # $(VENV)/executable_name
29
+ # Install `executable_name` with pip. Only packages with names matching
30
+ # the name of the corresponding executable are supported.
31
+ # Use this as a lightweight mechanism for development dependencies
32
+ # tracking. E.g. for one-off tools that are not required in every
33
+ # developer's environment, therefore are not included into
34
+ # requirements.txt or setup.py.
35
+ # Note:
36
+ # Rules using such target or dependency MUST be defined below
37
+ # `include` directive to make use of correct $(VENV) value.
38
+ # Example:
39
+ # codestyle: $(VENV)/pyflakes
40
+ # $(VENV)/pyflakes .
41
+ # See `ipython` target below for another example.
42
+ #
43
+ # This Makefile can be configured via following variables:
44
+ # PY
45
+ # Command name for system Python interpreter. It is used only initially to
46
+ # create the virtual environment
47
+ # Default: python3
48
+ # REQUIREMENTS_TXT
49
+ # Space separated list of paths to requirements.txt files.
50
+ # Paths are resolved relative to current working directory.
51
+ # Default: requirements.txt
52
+ #
53
+ # Non-existent files are treated as hard dependencies,
54
+ # recipes for creating such files must be provided by the main Makefile.
55
+ # Providing empty value (REQUIREMENTS_TXT=) turns off processing of
56
+ # requirements.txt even when the file exists.
57
+ # SETUP_PY, SETUP_CFG, PYPROJECT_TOML, VENV_LOCAL_PACKAGE
58
+ # Space separated list of paths to files that contain build instructions
59
+ # for local Python packages. Corresponding packages will be installed
60
+ # into venv in editable mode along with all their dependencies.
61
+ # Default: setup.py setup.cfg pyproject.toml (whichever present)
62
+ #
63
+ # Non-existent and empty values are treated in the same way as for REQUIREMENTS_TXT.
64
+ # WORKDIR
65
+ # Parent directory for the virtual environment.
66
+ # Default: current working directory.
67
+ # VENVDIR
68
+ # Python virtual environment directory.
69
+ # Default: $(WORKDIR)/.venv
70
+ #
71
+ # This Makefile was written for GNU Make and may not work with other make
72
+ # implementations.
73
+ #
74
+ #
75
+ # Copyright (c) 2019-2023 Vitaly Potyarkin
76
+ #
77
+ # Licensed under the Apache License, Version 2.0
78
+ # <http://www.apache.org/licenses/LICENSE-2.0>
79
+ #
80
+
81
+
82
+ #
83
+ # Configuration variables
84
+ #
85
+
86
+ WORKDIR?=.
87
+ VENVDIR?=$(WORKDIR)/.venv
88
+ REQUIREMENTS_TXT?=$(wildcard requirements.txt) # Multiple paths are supported (space separated)
89
+ SETUP_PY?=$(wildcard setup.py) # Multiple paths are supported (space separated)
90
+ SETUP_CFG?=$(foreach s,$(SETUP_PY),$(wildcard $(patsubst %setup.py,%setup.cfg,$(s))))
91
+ PYPROJECT_TOML?=$(wildcard pyproject.toml)
92
+ VENV_LOCAL_PACKAGE?=$(SETUP_PY) $(SETUP_CFG) $(PYPROJECT_TOML)
93
+ MARKER=.initialized-with-Makefile.venv
94
+
95
+
96
+ #
97
+ # Python interpreter detection
98
+ #
99
+
100
+ _PY_AUTODETECT_MSG=Detected Python interpreter: $(PY). Use PY environment variable to override
101
+
102
+ ifeq (ok,$(shell test -e /dev/null 2>&1 && echo ok))
103
+ NULL_STDERR=2>/dev/null
104
+ else
105
+ NULL_STDERR=2>NUL
106
+ endif
107
+
108
+ ifndef PY
109
+ _PY_OPTION:=python3
110
+ ifeq (ok,$(shell $(_PY_OPTION) -c "print('ok')" $(NULL_STDERR)))
111
+ PY=$(_PY_OPTION)
112
+ endif
113
+ endif
114
+
115
+ ifndef PY
116
+ _PY_OPTION:=$(VENVDIR)/bin/python
117
+ ifeq (ok,$(shell $(_PY_OPTION) -c "print('ok')" $(NULL_STDERR)))
118
+ PY=$(_PY_OPTION)
119
+ $(info $(_PY_AUTODETECT_MSG))
120
+ endif
121
+ endif
122
+
123
+ ifndef PY
124
+ _PY_OPTION:=$(subst /,\,$(VENVDIR)/Scripts/python)
125
+ ifeq (ok,$(shell $(_PY_OPTION) -c "print('ok')" $(NULL_STDERR)))
126
+ PY=$(_PY_OPTION)
127
+ $(info $(_PY_AUTODETECT_MSG))
128
+ endif
129
+ endif
130
+
131
+ ifndef PY
132
+ _PY_OPTION:=py -3
133
+ ifeq (ok,$(shell $(_PY_OPTION) -c "print('ok')" $(NULL_STDERR)))
134
+ PY=$(_PY_OPTION)
135
+ $(info $(_PY_AUTODETECT_MSG))
136
+ endif
137
+ endif
138
+
139
+ ifndef PY
140
+ _PY_OPTION:=python
141
+ ifeq (ok,$(shell $(_PY_OPTION) -c "print('ok')" $(NULL_STDERR)))
142
+ PY=$(_PY_OPTION)
143
+ $(info $(_PY_AUTODETECT_MSG))
144
+ endif
145
+ endif
146
+
147
+ ifndef PY
148
+ define _PY_AUTODETECT_ERR
149
+ Could not detect Python interpreter automatically.
150
+ Please specify path to interpreter via PY environment variable.
151
+ endef
152
+ $(error $(_PY_AUTODETECT_ERR))
153
+ endif
154
+
155
+
156
+ #
157
+ # Internal variable resolution
158
+ #
159
+
160
+ VENV=$(VENVDIR)/bin
161
+ EXE=
162
+ # Detect windows
163
+ ifeq (win32,$(shell $(PY) -c "import __future__, sys; print(sys.platform)"))
164
+ VENV=$(VENVDIR)/Scripts
165
+ EXE=.exe
166
+ endif
167
+
168
+ touch=touch $(1)
169
+ ifeq (,$(shell command -v touch $(NULL_STDERR)))
170
+ # https://ss64.com/nt/touch.html
171
+ touch=type nul >> $(subst /,\,$(1)) && copy /y /b $(subst /,\,$(1))+,, $(subst /,\,$(1))
172
+ endif
173
+
174
+ RM?=rm -f
175
+ ifeq (,$(shell command -v $(firstword $(RM)) $(NULL_STDERR)))
176
+ RMDIR:=rd /s /q
177
+ else
178
+ RMDIR:=$(RM) -r
179
+ endif
180
+
181
+
182
+ #
183
+ # Virtual environment
184
+ #
185
+
186
+ .PHONY: venv
187
+ venv: $(VENV)/$(MARKER)
188
+
189
+ .PHONY: clean-venv
190
+ clean-venv:
191
+ -$(RMDIR) "$(VENVDIR)"
192
+
193
+ .PHONY: show-venv
194
+ show-venv: venv
195
+ @$(VENV)/python -c "import sys; print('Python ' + sys.version.replace('\n',''))"
196
+ @$(VENV)/pip --version
197
+ @echo venv: $(VENVDIR)
198
+
199
+ .PHONY: debug-venv
200
+ debug-venv:
201
+ @echo "PATH (Shell)=$$PATH"
202
+ @$(MAKE) --version
203
+ $(info PATH (GNU Make)="$(PATH)")
204
+ $(info SHELL="$(SHELL)")
205
+ $(info PY="$(PY)")
206
+ $(info REQUIREMENTS_TXT="$(REQUIREMENTS_TXT)")
207
+ $(info VENV_LOCAL_PACKAGE="$(VENV_LOCAL_PACKAGE)")
208
+ $(info VENVDIR="$(VENVDIR)")
209
+ $(info VENVDEPENDS="$(VENVDEPENDS)")
210
+ $(info WORKDIR="$(WORKDIR)")
211
+
212
+
213
+ #
214
+ # Dependencies
215
+ #
216
+
217
+ ifneq ($(strip $(REQUIREMENTS_TXT)),)
218
+ VENVDEPENDS+=$(REQUIREMENTS_TXT)
219
+ endif
220
+
221
+ ifneq ($(strip $(VENV_LOCAL_PACKAGE)),)
222
+ VENVDEPENDS+=$(VENV_LOCAL_PACKAGE)
223
+ endif
224
+
225
+ $(VENV):
226
+ $(PY) -m venv $(VENVDIR)
227
+ $(VENV)/python -m pip install --upgrade pip setuptools wheel
228
+
229
+ $(VENV)/$(MARKER): $(VENVDEPENDS) | $(VENV)
230
+ ifneq ($(strip $(REQUIREMENTS_TXT)),)
231
+ $(VENV)/pip install $(foreach path,$(REQUIREMENTS_TXT),-r $(path))
232
+ endif
233
+ ifneq ($(strip $(VENV_LOCAL_PACKAGE)),)
234
+ $(VENV)/pip install $(foreach path,$(sort $(VENV_LOCAL_PACKAGE)),-e $(dir $(path)))
235
+ endif
236
+ $(call touch,$(VENV)/$(MARKER))
237
+
238
+
239
+ #
240
+ # Interactive shells
241
+ #
242
+
243
+ .PHONY: python
244
+ python: venv
245
+ exec $(VENV)/python
246
+
247
+ .PHONY: ipython
248
+ ipython: $(VENV)/ipython
249
+ exec $(VENV)/ipython
250
+
251
+ .PHONY: shell
252
+ shell: venv
253
+ . $(VENV)/activate && exec $(notdir $(SHELL))
254
+
255
+ .PHONY: bash zsh
256
+ bash zsh: venv
257
+ . $(VENV)/activate && exec $@
258
+
259
+
260
+ #
261
+ # Commandline tools (wildcard rule, executable name must match package name)
262
+ #
263
+
264
+ ifneq ($(EXE),)
265
+ $(VENV)/%: $(VENV)/%$(EXE) ;
266
+ .PHONY: $(VENV)/%
267
+ .PRECIOUS: $(VENV)/%$(EXE)
268
+ endif
269
+
270
+ $(VENV)/%$(EXE): $(VENV)/$(MARKER)
271
+ $(VENV)/pip install --upgrade $*
272
+ $(call touch,$@)
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.memory import ConversationBufferMemory
9
+ from langchain.chains import ConversationalRetrievalChain
10
+ from htmlTemplates import css, bot_template, user_template
11
+ from langchain.llms import HuggingFaceHub
12
+
13
def get_pdf_text(pdf_docs):
    """Return the text of every page of every given PDF, concatenated in order.

    Args:
        pdf_docs: Iterable of PDF sources, each passed as-is to ``PdfReader``.
            An empty iterable or ``None`` yields an empty string.

    Returns:
        str: All extracted page text joined together with no separator.
    """
    page_texts = []
    for pdf in pdf_docs or []:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # Defensive: treat a falsy extraction result as "no text" so the
            # join below never sees a non-string value.
            page_texts.append(page.extract_text() or "")
    # A single join avoids rebuilding the accumulated string for every page.
    return "".join(page_texts)
20
+
21
+
22
def get_text_chunks(text):
    """Split ``text`` into overlapping chunks using ``CharacterTextSplitter``.

    The splitter breaks on newlines and measures length with ``len``, using a
    chunk size of 1000 and an overlap of 200.

    Args:
        text: The full text to split.

    Returns:
        The chunks produced by ``CharacterTextSplitter.split_text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
31
+
32
+
33
def get_vectorstore(text_chunks):
    """Build a FAISS vector store from the chunks using OpenAI embeddings.

    ``HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")`` was
    noted by the original author as an alternative embedding backend.

    Args:
        text_chunks: The text chunks to embed and index.

    Returns:
        The store returned by ``FAISS.from_texts``.
    """
    embedding_model = OpenAIEmbeddings()
    return FAISS.from_texts(texts=text_chunks, embedding=embedding_model)
38
+
39
+
40
def get_conversation_chain(vectorstore):
    """Create a conversational retrieval chain over ``vectorstore``.

    The chain uses ``ChatOpenAI`` as its model and keeps the dialogue in a
    ``ConversationBufferMemory`` stored under the ``chat_history`` key.
    ``HuggingFaceHub(repo_id="google/flan-t5-xxl", ...)`` was noted by the
    original author as an alternative model.

    Args:
        vectorstore: Store whose ``as_retriever()`` supplies the retriever.

    Returns:
        The chain returned by ``ConversationalRetrievalChain.from_llm``.
    """
    chat_model = ChatOpenAI()
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    chain_kwargs = {
        "llm": chat_model,
        "retriever": vectorstore.as_retriever(),
        "memory": chat_memory,
    }
    return ConversationalRetrievalChain.from_llm(**chain_kwargs)
52
+
53
+
54
def handle_userinput(user_question):
    """Send the question to the conversation chain and render the chat history.

    Messages at even positions in the returned history are rendered with
    ``user_template`` and those at odd positions with ``bot_template``.

    Args:
        user_question: The question text entered by the user.

    Returns:
        None. Output is written to the Streamlit page; the history is stored
        in ``st.session_state.chat_history``.
    """
    import html  # stdlib; imported locally so this block is self-contained

    conversation = st.session_state.conversation
    if conversation is None:
        # The chain only exists after documents have been processed; calling
        # None would raise a TypeError, so tell the user what to do instead.
        st.warning("Please upload and process your PDFs before asking a question.")
        return

    response = conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        # The template is rendered as raw HTML, so the message text must be
        # escaped to stop markup in a question or answer from being injected.
        safe_content = html.escape(message.content)
        st.write(template.replace("{{MSG}}", safe_content),
                 unsafe_allow_html=True)
65
+
66
+
67
def main():
    """Streamlit entry point: configure the page, answer questions, ingest PDFs.

    Loads environment variables, initialises the ``conversation`` and
    ``chat_history`` session entries, forwards any entered question to
    ``handle_userinput``, and rebuilds the conversation chain from the
    uploaded PDFs when the "Process" button is pressed.
    """
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Session entries must exist before anything reads them on a fresh session.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            if not pdf_docs:
                # Nothing was uploaded, so there is nothing to index.
                st.warning("Please upload at least one PDF before processing.")
                return

            with st.spinner("Processing"):
                # get pdf text
                raw_text = get_pdf_text(pdf_docs)

                # get the text chunks
                text_chunks = get_text_chunks(raw_text)
                if not text_chunks:
                    # Without any chunks there is nothing to build a vector
                    # store from; stop here and keep any existing conversation.
                    st.error("No extractable text was found in the uploaded PDFs.")
                    return

                # create vector store
                vectorstore = get_vectorstore(text_chunks)

                # create conversation chain
                st.session_state.conversation = get_conversation_chain(
                    vectorstore)
101
+
102
+
103
+ if __name__ == '__main__':
104
+ main()
htmlTemplates.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Stylesheet for the chat bubbles used by bot_template and user_template.
# It is a complete <style> element so it can be written to the page as HTML.
css = '''
<style>
.chat-message {
    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
    background-color: #2b313e
}
.chat-message.bot {
    background-color: #475063
}
.chat-message .avatar {
    width: 20%;
}
.chat-message .avatar img {
    max-width: 78px;
    max-height: 78px;
    border-radius: 50%;
    object-fit: cover;
}
.chat-message .message {
    width: 80%;
    padding: 0 1.5rem;
    color: #fff;
}
</style>
'''
27
+
28
+ bot_template = '''
29
+ <div class="chat-message bot">
30
+ <div class="avatar">
31
+ <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
32
+ </div>
33
+ <div class="message">{{MSG}}</div>
34
+ </div>
35
+ '''
36
+
37
+ user_template = '''
38
+ <div class="chat-message user">
39
+ <div class="avatar">
40
+ <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
41
+ </div>
42
+ <div class="message">{{MSG}}</div>
43
+ </div>
44
+ '''
makefile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
# Executables inside the virtual environment. $(VENV) is set by Makefile.venv,
# which is included at the bottom of this file; both variables are recursively
# expanded, so they resolve when a recipe runs rather than when this line is read.
PYTHON_BINARY = $(VENV)/python
STREAMLIT_BINARY = $(VENV)/streamlit

# These targets are commands, not files.
.PHONY: re-venv run

# Recreate the virtual environment from scratch.
# The two steps run as ordered sub-makes so that removal always finishes before
# creation starts, even under `make -j`.
re-venv:
	$(MAKE) clean-venv
	$(MAKE) venv

# Launch the app through Streamlit (creates the venv first if needed).
run: venv
	$(STREAMLIT_BINARY) run app.py

include Makefile.venv
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ openai
3
+ Streamlit
4
+ huggingface_hub
5
+ python-dotenv
6
+ watchdog
7
+ tiktoken
8
+ pinecone-client
9
+ joblib
10
+ pandas
11
+ scikit-learn
12
+ sentence_transformers
13
+ uvicorn
14
+ ctransformers
15
+ fastapi
16
+ ipykernel
17
+ python-box
18
+ transformers
19
+ PyPDF2
20
+ faiss-cpu
21
+ altair