norsu committed on
Commit
fe4bb5a
1 Parent(s): f72ff81

Upload 3 files

Files changed (3)
  1. app.py +101 -0
  2. htmlTemplates.py +44 -0
  3. requirements.txt +91 -0
app.py ADDED
@@ -0,0 +1,101 @@
+ import streamlit as st
+ from dotenv import load_dotenv
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ from htmlTemplates import css, bot_template, user_template
+ from langchain.llms import HuggingFaceHub
+
+
+ def get_pdf_text(pdf_docs):
+     text = ""
+     for pdf in pdf_docs:
+         pdf_reader = PdfReader(pdf)
+         for page in pdf_reader.pages:
+             text += page.extract_text()
+     return text
+
+
+ def get_text_chunks(text):
+     text_splitter = CharacterTextSplitter(
+         separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
+     )
+     chunks = text_splitter.split_text(text)
+     return chunks
+
+
+ def get_vectorstore(text_chunks):
+     embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+     return vectorstore
+
+
+ def get_conversation_chain(vectorstore):
+     llm = HuggingFaceHub(
+         repo_id="google/flan-t5-xxl",
+         model_kwargs={"temperature": 0.5, "max_length": 512},
+     )
+
+     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+     conversation_chain = ConversationalRetrievalChain.from_llm(
+         llm=llm, retriever=vectorstore.as_retriever(), memory=memory
+     )
+     return conversation_chain
+
+
+ def handle_userinput(user_question):
+     response = st.session_state.conversation({"question": user_question})
+     st.session_state.chat_history = response["chat_history"]
+
+     for i, message in enumerate(st.session_state.chat_history):
+         if i % 2 == 0:
+             st.write(
+                 user_template.replace("{{MSG}}", message.content),
+                 unsafe_allow_html=True,
+             )
+         else:
+             st.write(
+                 bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True
+             )
+
+
+ def main():
+     load_dotenv()
+     st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
+     st.write(css, unsafe_allow_html=True)
+
+     if "conversation" not in st.session_state:
+         st.session_state.conversation = None
+     if "chat_history" not in st.session_state:
+         st.session_state.chat_history = None
+
+     st.header("Chat with multiple PDFs :books:")
+     user_question = st.text_input("Ask a question about your documents:")
+     if user_question:
+         handle_userinput(user_question)
+
+     with st.sidebar:
+         st.subheader("Your documents")
+         pdf_docs = st.file_uploader(
+             "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
+         )
+         if st.button("Process"):
+             with st.spinner("Processing"):
+                 # get pdf text
+                 raw_text = get_pdf_text(pdf_docs)
+
+                 # get the text chunks
+                 text_chunks = get_text_chunks(raw_text)
+
+                 # create vector store
+                 vectorstore = get_vectorstore(text_chunks)
+
+                 # create conversation chain
+                 st.session_state.conversation = get_conversation_chain(vectorstore)
+
+
+ if __name__ == "__main__":
+     main()
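For reference, here is a minimal way to exercise the same pipeline outside Streamlit. This sketch is not part of the commit: `example.pdf` and the sample question are placeholders, and it assumes a `HUGGINGFACEHUB_API_TOKEN` is available via the `.env` file that `load_dotenv()` loads, which is what LangChain's `HuggingFaceHub` wrapper typically reads.

from dotenv import load_dotenv
from app import get_pdf_text, get_text_chunks, get_vectorstore, get_conversation_chain

load_dotenv()  # assumption: HUGGINGFACEHUB_API_TOKEN is configured in .env
with open("example.pdf", "rb") as f:  # placeholder PDF path
    raw_text = get_pdf_text([f])  # get_pdf_text expects an iterable of file-like objects
chunks = get_text_chunks(raw_text)
store = get_vectorstore(chunks)  # downloads hkunlp/instructor-xl on first run
chain = get_conversation_chain(store)
print(chain({"question": "What is this document about?"})["answer"])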
htmlTemplates.py ADDED
@@ -0,0 +1,44 @@
+ css = """
+ <style>
+ .chat-message {
+     padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
+ }
+ .chat-message.user {
+     background-color: #2b313e
+ }
+ .chat-message.bot {
+     background-color: #475063
+ }
+ .chat-message .avatar {
+     width: 20%;
+ }
+ .chat-message .avatar img {
+     max-width: 78px;
+     max-height: 78px;
+     border-radius: 50%;
+     object-fit: cover;
+ }
+ .chat-message .message {
+     width: 80%;
+     padding: 0 1.5rem;
+     color: #fff;
+ }
+ """
+
+ bot_template = """
+ <div class="chat-message bot">
+     <div class="avatar">
+         <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ """
+
+ user_template = """
+ <div class="chat-message user">
+     <div class="avatar">
+         <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ """
requirements.txt ADDED
@@ -0,0 +1,91 @@
+ aiohttp==3.9.5
+ aiosignal==1.3.1
+ altair==5.3.0
+ annotated-types==0.6.0
+ attrs==23.2.0
+ blinker==1.7.0
+ cachetools==5.3.3
+ certifi==2024.2.2
+ charset-normalizer==3.3.2
+ click==8.1.7
+ colorama==0.4.6
+ dataclasses-json==0.6.4
+ faiss-cpu==1.8.0
+ filelock==3.13.4
+ frozenlist==1.4.1
+ fsspec==2024.3.1
+ gitdb==4.0.11
+ GitPython==3.1.43
+ greenlet==3.0.3
+ huggingface-hub==0.22.2
+ idna==3.7
+ InstructorEmbedding==1.0.1
+ Jinja2==3.1.3
+ joblib==1.4.0
+ jsonpatch==1.33
+ jsonpointer==2.4
+ jsonschema==4.21.1
+ jsonschema-specifications==2023.12.1
+ langchain==0.1.16
+ langchain-community==0.0.33
+ langchain-core==0.1.44
+ langchain-text-splitters==0.0.1
+ langsmith==0.1.48
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ marshmallow==3.21.1
+ mdurl==0.1.2
+ mpmath==1.3.0
+ multidict==6.0.5
+ mypy-extensions==1.0.0
+ networkx==3.3
+ nltk==3.8.1
+ numpy==1.26.4
+ orjson==3.10.1
+ packaging==23.2
+ pandas==2.2.2
+ pillow==10.3.0
+ protobuf==4.25.3
+ pyarrow==15.0.2
+ pydantic==2.7.0
+ pydantic_core==2.18.1
+ pydeck==0.8.1b0
+ Pygments==2.17.2
+ PyPDF2==3.0.1
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.0.1
+ pytz==2024.1
+ PyYAML==6.0.1
+ referencing==0.34.0
+ regex==2024.4.16
+ requests==2.31.0
+ rich==13.7.1
+ rpds-py==0.18.0
+ safetensors==0.4.3
+ scikit-learn==1.4.2
+ scipy==1.13.0
+ sentence-transformers==2.2.2
+ sentencepiece==0.2.0
+ setuptools==68.2.2
+ six==1.16.0
+ smmap==5.0.1
+ SQLAlchemy==2.0.29
+ streamlit==1.33.0
+ sympy==1.12
+ tenacity==8.2.3
+ threadpoolctl==3.4.0
+ tokenizers==0.19.1
+ toml==0.10.2
+ toolz==0.12.1
+ torch==2.2.2
+ torchvision==0.17.2
+ tornado==6.4
+ tqdm==4.66.2
+ transformers==4.40.0
+ typing-inspect==0.9.0
+ typing_extensions==4.11.0
+ tzdata==2024.1
+ urllib3==2.2.1
+ watchdog==4.0.0
+ wheel==0.41.2
+ yarl==1.9.4
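These pins cover everything the two Python files import: faiss-cpu for the FAISS vector store, InstructorEmbedding plus sentence-transformers for `HuggingFaceInstructEmbeddings`, PyPDF2, python-dotenv, streamlit, and the langchain 0.1.x split packages (langchain-community, langchain-core, langchain-text-splitters). Presumably the app is launched the standard Streamlit way, `pip install -r requirements.txt` followed by `streamlit run app.py`, with the Hugging Face API token supplied via `.env` or the hosting environment's secrets.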