Elia Wäfler committed on
Commit
fcff3db
1 Parent(s): b18787d

Streamlit APP

Files changed (2)
  1. DocVerifyRAG.py +148 -0
  2. html_templates.py +44 -0
DocVerifyRAG.py ADDED
@@ -0,0 +1,148 @@
+ import streamlit as st
+ from dotenv import load_dotenv
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.chat_models import ChatOpenAI
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.llms import HuggingFaceHub
+ from html_templates import css, bot_template, user_template
+ import os
+ import pickle
+ from datetime import datetime
+
+
+ def get_pdf_text(pdf_docs):
+     """Extract the raw text from all pages of the uploaded PDF files."""
+     text = ""
+     for pdf in pdf_docs:
+         pdf_reader = PdfReader(pdf)
+         for page in pdf_reader.pages:
+             text += page.extract_text()
+     return text
+
+
+ def get_text_chunks(text):
+     """Split the raw text into overlapping chunks for embedding."""
+     text_splitter = CharacterTextSplitter(
+         separator="\n",
+         chunk_size=1000,
+         chunk_overlap=200,
+         length_function=len
+     )
+     chunks = text_splitter.split_text(text)
+     return chunks
+
+
+ def get_vectorstore(text_chunks):
+     """Embed the text chunks and index them in a FAISS vector store."""
+     embeddings = OpenAIEmbeddings()
+     # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+     return vectorstore
+
+
+ def get_conversation_chain(vectorstore):
+     """Build a conversational retrieval chain with buffer memory on top of the vector store."""
+     llm = ChatOpenAI()
+     # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512})
+
+     memory = ConversationBufferMemory(
+         memory_key='chat_history', return_messages=True)
+     conversation_chain = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         retriever=vectorstore.as_retriever(),
+         memory=memory
+     )
+     return conversation_chain
+
+
+ def handle_userinput(user_question):
+     """Send the question to the conversation chain and render the chat history."""
+     if st.session_state.conversation is None:
+         st.warning("Please process your documents first.")
+         return
+     response = st.session_state.conversation({'question': user_question})
+     st.session_state.chat_history = response['chat_history']
+
+     for i, message in enumerate(st.session_state.chat_history):
+         if i % 2 == 0:
+             # Display user message
+             st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
+         else:
+             # Display AI response
+             st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
+             # Display source document information if available in the message
+             if hasattr(message, 'source') and message.source:
+                 st.write(f"Source Document: {message.source}", unsafe_allow_html=True)
+
+
+ def safe_vec_store():
+     """Pickle the current FAISS vector store to a timestamped file in 'vectorstore/'."""
+     os.makedirs('vectorstore', exist_ok=True)
+     filename = 'vectores' + datetime.now().strftime('%Y%m%d%H%M') + '.pkl'
+     file_path = os.path.join('vectorstore', filename)
+     vector_store = st.session_state.vectorstore
+
+     # Serialize and save the entire FAISS object using pickle
+     with open(file_path, 'wb') as f:
+         pickle.dump(vector_store, f)
+
+
+ def main():
+     load_dotenv()
+     st.set_page_config(page_title="Anna Seiler Haus KI-Assistent", page_icon=":hospital:")
+     st.write(css, unsafe_allow_html=True)
+
+     st.subheader("Your documents")
+     pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
+
+     if "conversation" not in st.session_state:
+         st.session_state.conversation = None
+     if "chat_history" not in st.session_state:
+         st.session_state.chat_history = None
+
+     st.header("Anna Seiler Haus KI-Assistent ASH :hospital:")
+     user_question = st.text_input("Ask a question about your documents:")
+     if user_question:
+         handle_userinput(user_question)
+
+     with st.sidebar:
+         st.subheader("Classification Instructions")
+         classifier_docs = st.file_uploader("Upload your instructions here and click on 'Process'", accept_multiple_files=True)
+         filenames = [file.name for file in classifier_docs if file is not None]
+
+         if st.button("Process"):
+             with st.spinner("Processing"):
+                 # Load a previously saved vector store if a .pkl file was uploaded
+                 loaded_vec_store = None
+                 for filename in filenames:
+                     if ".pkl" in filename:
+                         file_path = os.path.join('vectorstore', filename)
+                         with open(file_path, 'rb') as f:
+                             loaded_vec_store = pickle.load(f)
+                 raw_text = get_pdf_text(pdf_docs)
+                 text_chunks = get_text_chunks(raw_text)
+                 vec = get_vectorstore(text_chunks)
+                 if loaded_vec_store:
+                     vec.merge_from(loaded_vec_store)
+                     st.warning("Loaded vector store from file.")
+                 if "vectorstore" in st.session_state:
+                     vec.merge_from(st.session_state.vectorstore)
+                     st.warning("Merged with existing vector store.")
+                 st.session_state.vectorstore = vec
+                 st.session_state.conversation = get_conversation_chain(vec)
+                 st.success("Data loaded.")
+
+         # Save and load embeddings
+         if st.button("Save Embeddings"):
+             if "vectorstore" in st.session_state:
+                 safe_vec_store()
+                 # st.session_state.vectorstore.save_local("faiss_index")
+                 st.sidebar.success("Saved.")
+             else:
+                 st.sidebar.warning("No embeddings to save. Please process documents first.")
+
+         if st.button("Load Embeddings"):
+             st.warning("This function is not in use; upload a saved vector store (.pkl) file instead.")
+
+
+ if __name__ == '__main__':
+     main()
html_templates.py ADDED
@@ -0,0 +1,44 @@
+ css = '''
+ <style>
+ .chat-message {
+     padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
+ }
+ .chat-message.user {
+     background-color: #2b313e
+ }
+ .chat-message.bot {
+     background-color: #475063
+ }
+ .chat-message .avatar {
+     width: 20%;
+ }
+ .chat-message .avatar img {
+     max-width: 78px;
+     max-height: 78px;
+     border-radius: 50%;
+     object-fit: cover;
+ }
+ .chat-message .message {
+     width: 80%;
+     padding: 0 1.5rem;
+     color: #fff;
+ }
+ </style>
+ '''
+
+ bot_template = '''
+ <div class="chat-message bot">
+     <div class="avatar">
+         <img src="https://www.insel.ch/_ari/115280/49841742b8afbc44928918244fb4c6f9b487d5b3/9f6e35f65cbd0d6c47c145f90b1d5a297eb50bcd/1400/0/og/20230704-Anna-Seiler-Haus-009-screen.jpg.webp" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
+
+ user_template = '''
+ <div class="chat-message user">
+     <div class="avatar">
+         <img src="https://media.licdn.com/dms/image/C4D03AQHi5rJfheyUtQ/profile-displayphoto-shrink_800_800/0/1638174649461?e=2147483647&v=beta&t=KOsttcLGIwB9pBEVfceHj-ckv_zPHs-2COyrp7aYR-k">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
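
For context, here is a minimal sketch (not part of the commit) of how a vector store pickled by safe_vec_store() could be reloaded and queried outside the Streamlit UI; the app itself is launched with "streamlit run DocVerifyRAG.py". The sketch assumes the same langchain version used above and an OPENAI_API_KEY supplied via .env; the path selection and the question string are illustrative only.

    # Illustrative sketch: reload a pickled FAISS vector store and query it directly.
    import os
    import pickle

    from dotenv import load_dotenv
    from langchain.chat_models import ChatOpenAI
    from langchain.memory import ConversationBufferMemory
    from langchain.chains import ConversationalRetrievalChain

    load_dotenv()

    # Hypothetical choice: pick the most recent pickle written by safe_vec_store().
    path = os.path.join('vectorstore', sorted(os.listdir('vectorstore'))[-1])
    with open(path, 'rb') as f:
        vectorstore = pickle.load(f)

    # Same chain construction as get_conversation_chain() in DocVerifyRAG.py.
    chain = ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(),
        retriever=vectorstore.as_retriever(),
        memory=ConversationBufferMemory(memory_key='chat_history', return_messages=True),
    )
    print(chain({'question': 'Which documents are indexed?'})['answer'])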