acodi committed on
Commit
870b873
1 Parent(s): d6c6f28

Add application file

Browse files
app.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from dotenv import load_dotenv
4
+ from PyPDF2 import PdfReader
5
+ from langchain.text_splitter import CharacterTextSplitter
6
+ from langchain_openai import OpenAIEmbeddings
7
+ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
8
+ from langchain_community.vectorstores import FAISS
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain_community.llms import HuggingFaceHub
11
+ from langchain.memory import ConversationBufferMemory
12
+ from langchain.chains import ConversationalRetrievalChain
13
+ from langchain_community.document_loaders import DirectoryLoader
14
+ from htmlTemplates import css, bot_template, user_template
15
+ from langchain.globals import set_verbose
16
# Turn LangChain's global verbose setting off for the whole app.
set_verbose(False)
17
+
18
+
19
def read_files_from_directory(directory):
    """Return the paths of the PDF files directly inside *directory*.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        A list of ``os.path.join(directory, name)`` paths, sorted by file
        name so the result is the same on every run and platform.

    Raises:
        OSError: Propagated from ``os.listdir`` when *directory* cannot be
            listed (for example, it does not exist).
    """
    pdf_paths = []
    # os.listdir returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Match the extension case-insensitively ("REPORT.PDF" is a PDF too)
        # and skip sub-directories that merely happen to end in ".pdf".
        if filename.lower().endswith(".pdf") and os.path.isfile(path):
            pdf_paths.append(path)
    return pdf_paths
25
+
26
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every given PDF.

    Args:
        pdf_docs: Iterable of PDF sources accepted by ``PdfReader`` (the
            callers in this file pass file paths and Streamlit uploads).

    Returns:
        One string with the page texts joined in order, with no separator
        inserted between pages; ``""`` when *pdf_docs* is empty.
    """
    page_texts = []
    for pdf in pdf_docs:
        for page in PdfReader(pdf).pages:
            # Defensive: treat a page that yields no text as empty instead
            # of failing on a non-string result.
            page_texts.append(page.extract_text() or "")
    # Join once at the end rather than growing a string with += per page.
    return "".join(page_texts)
33
+
34
def get_text_chunks(raw_text, chunk_size=1000, chunk_overlap=200, separator="\n"):
    """Split *raw_text* into overlapping chunks for embedding.

    Args:
        raw_text: The full text to split.
        chunk_size: Target maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared between adjacent chunks.
        separator: String on which the text is split before chunks are
            assembled.

    Returns:
        The list of chunk strings produced by ``CharacterTextSplitter``.
        The defaults reproduce the values that were previously hard-coded.
    """
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(raw_text)
43
+
44
def get_vector_store(text_chunks):
    """Embed *text_chunks* with OpenAI and index them in a FAISS store.

    Args:
        text_chunks: Non-empty list of text chunks to embed.

    Returns:
        The FAISS vector store built from the chunks.

    Raises:
        ValueError: If *text_chunks* is empty. Without this check the empty
            list reaches ``FAISS.from_texts``, which fails with an
            unhelpful ``IndexError`` after already calling the embedding API.
    """
    if not text_chunks:
        raise ValueError("Cannot build a vector store from an empty list of text chunks")

    # The API key is read from the environment (loaded from .env in main()).
    embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
49
+
50
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain on top of *vectorstore*.

    The chain pairs an OpenAI chat model with the store's retriever and a
    buffer memory that keeps the exchange under the ``chat_history`` key as
    message objects.

    Args:
        vectorstore: Vector store whose ``as_retriever()`` supplies context.

    Returns:
        The configured ``ConversationalRetrievalChain``.
    """
    chat_memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    # The API key comes from the environment; HuggingFaceHub models were
    # previously sketched here as alternatives to the OpenAI chat model.
    chat_model = ChatOpenAI(openai_api_key=os.getenv('OPENAI_API_KEY'))
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=vectorstore.as_retriever(),
        memory=chat_memory,
    )
63
+
64
# Answer a submitted question and render the whole conversation.
def handle_user_input(user_question):
    """Send *user_question* to the active chain and draw the chat history.

    Reads ``st.session_state.conversation``; when it is ``None`` a hint is
    shown instead. Otherwise the chain's returned ``chat_history`` is stored
    in ``st.session_state.chat_history`` and every message is rendered with
    the user/bot HTML templates.

    Args:
        user_question: The text typed by the user.
    """
    import html  # local import so this block does not depend on a file-level change

    if st.session_state.conversation is None:
        st.write("Please upload PDFs and click process")
        return

    # Use invoke(): calling the chain object directly is deprecated.
    response = st.session_state.conversation.invoke({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        # Even positions are rendered as the user, odd positions as the bot.
        template = user_template if i % 2 == 0 else bot_template
        # Both the question and the model output are untrusted text that is
        # rendered with unsafe_allow_html=True, so escape them first.
        safe_content = html.escape(message.content)
        st.write(template.replace("{{MSG}}", safe_content), unsafe_allow_html=True)
80
+
81
def _build_knowledge_base(directory='./data'):
    """Index the PDFs found in *directory*; return None if they yield no text chunks."""
    chunks = get_text_chunks(get_pdf_text(read_files_from_directory(directory)))
    return get_vector_store(chunks) if chunks else None


def _process_uploads(pdf_docs):
    """Index the uploaded PDFs, merge in the bundled knowledge, and install a new chain."""
    if not pdf_docs:
        st.warning("Please upload at least one PDF before clicking 'Process'")
        return

    with st.spinner("Processing"):
        text_chunks = get_text_chunks(get_pdf_text(pdf_docs))
        if not text_chunks:
            st.warning("No text could be extracted from the uploaded PDFs")
            return

        vectorstore = get_vector_store(text_chunks)

        # merge_from adds the knowledge entries to the new store.
        knowledge = st.session_state.get("vectorstore_knowledge")
        if knowledge is not None:
            vectorstore.merge_from(knowledge)

        st.session_state.conversation = get_conversation_chain(vectorstore)


def main():
    """Run the Streamlit chat app over the bundled and uploaded PDFs.

    Streamlit re-executes this function on every interaction, so anything
    that must survive between reruns (the knowledge index, the conversation
    chain with its memory, the chat history) is created once and kept in
    ``st.session_state``. Rebuilding the chain on each rerun would reset
    the chat memory and throw away the chain produced by "Process".
    """
    load_dotenv()

    st.set_page_config(page_title="Onki AI Assistant - Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Build the knowledge index and the initial chain only once per session.
    if "conversation" not in st.session_state:
        knowledge = _build_knowledge_base()
        st.session_state.vectorstore_knowledge = knowledge
        st.session_state.conversation = (
            get_conversation_chain(knowledge) if knowledge is not None else None
        )
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")

    if user_question:
        handle_user_input(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'",
            accept_multiple_files=True,
            type=["pdf"],  # only PDFs can be read by get_pdf_text
        )
        if st.button("Process"):
            _process_uploads(pdf_docs)
125
+
126
# Script entry point: start the app only when this file is run directly.
if __name__ == "__main__":
    main()
data/E-9-2024-000119-ASW_en.pdf ADDED
Binary file (36.3 kB). View file
 
data/E-9-2024-000119_en.pdf ADDED
Binary file (43 kB). View file
 
htmlTemplates.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Stylesheet for the chat bubbles; injected into the page as raw HTML.
# The <style> element is now closed explicitly instead of relying on the
# browser to recover from the missing end tag.
css = '''
<style>
.chat-message {
padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
background-color: #2b313e
}
.chat-message.bot {
background-color: #475063
}
.chat-message .avatar {
width: 20%;
}
.chat-message .avatar img {
max-width: 78px;
max-height: 78px;
border-radius: 50%;
object-fit: cover;
}
.chat-message .message {
width: 80%;
padding: 0 1.5rem;
color: #fff;
}
</style>
'''
27
+
28
# HTML for one assistant message; "{{MSG}}" is replaced with the message text.
# The avatar image carries alt text so the markup is valid and accessible.
bot_template = '''
<div class="chat-message bot">
<div class="avatar">
<img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" alt="Bot avatar" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
</div>
<div class="message">{{MSG}}</div>
</div>
'''
36
+
37
# HTML for one user message; "{{MSG}}" is replaced with the message text.
# The avatar image carries alt text so the markup is valid and accessible.
user_template = '''
<div class="chat-message user">
<div class="avatar">
<img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png" alt="User avatar">
</div>
<div class="message">{{MSG}}</div>
</div>
'''
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ #unstructured[pdf]
2
+ #python-magic
3
+ streamlit
4
+ python-dotenv
5
+ PyPDF2
6
+ langchain
7
+ langchain_openai
8
+ langchain_community
9
+ faiss-cpu