Bazedgul committed on
Commit
015bbbb
1 Parent(s): f969b6c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -0
app.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+ from langchain.memory import ConversationBufferMemory
8
+ from langchain.chains import ConversationalRetrievalChain
9
+ from langchain.chat_models import ChatOpenAI
10
+ from htmlTemplates import bot_template, user_template, css
11
+
12
+ from transformers import pipeline
13
+
14
def get_pdf_text(pdf_files):
    """Concatenate the extracted text of every page of every given PDF.

    Args:
        pdf_files: Iterable of PDF sources that ``PdfReader`` accepts (in this
            app, the files returned by the Streamlit uploader). ``None`` or an
            empty iterable is treated as "no files".

    Returns:
        One string containing the text of all pages, in file order and then
        page order. Returns ``""`` when there are no files or no text.
    """
    page_texts = []
    for pdf_file in pdf_files or ():
        reader = PdfReader(pdf_file)
        for page in reader.pages:
            # Defensive: treat a falsy result as empty text rather than
            # failing on concatenation. NOTE(review): whether extract_text()
            # can return None for image-only pages depends on the installed
            # PyPDF2 version - confirm.
            page_texts.append(page.extract_text() or "")
    # Join once instead of repeated `+=`, which can be quadratic.
    return "".join(page_texts)
22
+
23
def get_chunk_text(text):
    """Split raw text into overlapping chunks.

    The text is split on newlines into chunks of at most 1000 units with an
    overlap of 200 units, where length is measured by ``len``.

    Args:
        text: The full text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
35
+
36
+
37
def get_vector_store(text_chunks):
    """Build a FAISS vector store from text chunks using OpenAI embeddings.

    Args:
        text_chunks: The text chunks to embed and index.

    Returns:
        The store produced by ``FAISS.from_texts``.
    """
    # Alternative embedding backend (Hugging Face), kept for reference:
    #   HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    embedding_model = OpenAIEmbeddings()
    return FAISS.from_texts(texts=text_chunks, embedding=embedding_model)
50
+
51
def get_conversation_chain(vector_store):
    """Create a conversational retrieval chain backed by ``vector_store``.

    The chain uses a ``ChatOpenAI`` model, the store's retriever, and a
    ``ConversationBufferMemory`` stored under the ``chat_history`` key with
    ``return_messages=True``.

    Args:
        vector_store: Object exposing ``as_retriever()``.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm``.
    """
    # Alternative model (Hugging Face Hub), kept for reference. HuggingFaceHub
    # is not imported in the visible part of this file:
    #   HuggingFaceHub(repo_id="google/flan-t5-xxl",
    #                  model_kwargs={"temperature": 0.5, "max_length": 512})
    chat_model = ChatOpenAI()
    history_buffer = ConversationBufferMemory(
        return_messages=True,
        memory_key='chat_history',
    )
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=vector_store.as_retriever(),
        memory=history_buffer,
    )
70
+
71
def handle_user_input(question):
    """Send ``question`` to the active conversation chain and render the chat.

    Reads the chain from ``st.session_state.conversation``, stores the
    returned ``chat_history`` in ``st.session_state.chat_history``, and writes
    every message to the page through the user/bot HTML templates.

    If no chain exists yet (no PDFs have been processed), a warning is shown
    and nothing else happens. Without this check the function would try to
    call ``None``.

    Args:
        question: The text the user typed.
    """
    import html  # stdlib; imported locally so this block is self-contained

    conversation = st.session_state.conversation
    if conversation is None:
        st.warning("Please upload and process your PDFs before asking a question.")
        return

    response = conversation({'question': question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        # Even positions use the user template, odd positions the bot
        # template (presumably the history alternates human/AI - verify).
        template = user_template if i % 2 == 0 else bot_template
        # The templates are rendered with unsafe_allow_html=True, so escape
        # the message text to keep user- or PDF-derived content from being
        # interpreted as markup.
        safe_content = html.escape(message.content)
        st.write(template.replace("{{MSG}}", safe_content), unsafe_allow_html=True)
81
+
82
+
83
+
84
def main():
    """Run the Streamlit "chat with your PDFs" page.

    Loads environment variables, configures the page, initializes the
    ``conversation`` and ``chat_history`` session entries, handles a question
    if one was typed, and provides a sidebar for uploading PDFs. Pressing
    "OK" extracts the text, chunks it, builds the vector store, and stores a
    new conversation chain in the session state.

    Pressing "OK" with no files, or with files that yield no text, shows a
    message instead of passing an empty chunk list to the vector store.
    """
    load_dotenv()
    st.set_page_config(page_title='Chat with Your own PDFs', page_icon=':books:')

    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None

    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header('Chat with Your own PDFs :books:')
    question = st.text_input("Ask anything to your PDF: ")

    if question:
        handle_user_input(question)

    with st.sidebar:
        st.subheader("Upload your Documents Here: ")
        pdf_files = st.file_uploader(
            "Choose your PDF Files and Press OK",
            type=['pdf'],
            accept_multiple_files=True,
        )

        if st.button("OK"):
            if not pdf_files:
                # Nothing uploaded: there is nothing to index.
                st.warning("Please upload at least one PDF file first.")
                return

            with st.spinner("Processing your PDFs..."):
                # Get PDF text
                raw_text = get_pdf_text(pdf_files)

                # Get text chunks
                text_chunks = get_chunk_text(raw_text)
                if not text_chunks:
                    # For example, scanned PDFs with no text layer.
                    st.error("No extractable text was found in the uploaded PDFs.")
                    return

                # Create vector store
                vector_store = get_vector_store(text_chunks)
                st.write("DONE")

                # Create conversation chain
                st.session_state.conversation = get_conversation_chain(vector_store)
125
+
126
+
127
# Start the app only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()