awacke1 committed
Commit d2f0b97 • 1 Parent(s): 6b566e7

Create app.py

Files changed (1): app.py (+356, -0)
app.py ADDED
@@ -0,0 +1,356 @@
import streamlit as st
import openai
import os
import base64
import glob
import json
import mistune
import pytz
import math
import requests
import time

from datetime import datetime
from xml.etree import ElementTree as ET
from bs4 import BeautifulSoup
from collections import deque
from audio_recorder_streamlit import audio_recorder

from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template

def generate_filename(prompt, file_type):
    # Build a filesystem-safe name: MMDD_HHMM timestamp plus the first 45
    # alphanumeric characters of the prompt.
    central = pytz.timezone('US/Central')
    safe_date_time = datetime.now(central).strftime("%m%d_%I%M")
    safe_prompt = "".join(x for x in prompt if x.isalnum())[:45]
    return f"{safe_date_time}_{safe_prompt}.{file_type}"

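# Example (illustrative timestamp): generate_filename("What's new in GPT-4?", "md")
# -> "0712_0305_WhatsnewinGPT4.md". Spaces and punctuation are dropped, so two
# prompts sharing their first 45 alphanumeric characters collide within a minute.
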
def transcribe_audio(openai_key, file_path, model):
    # Upload a recording to the OpenAI transcription endpoint, then pass the
    # transcript on to the chat model and save both to a file.
    OPENAI_API_URL = "https://api.openai.com/v1/audio/transcriptions"
    headers = {"Authorization": f"Bearer {openai_key}"}
    with open(file_path, 'rb') as f:
        data = {'file': f}
        response = requests.post(OPENAI_API_URL, headers=headers, files=data, data={'model': model})
    if response.status_code == 200:
        st.write(response.json())
        transcript = response.json().get('text')
        chat_response = chat_with_model(transcript, '')
        st.write('Responses:')
        st.write(chat_response)
        filename = generate_filename(transcript, 'txt')
        create_file(filename, transcript, chat_response)
        return transcript
    else:
        st.write(response.json())
        st.error("Error in API call.")
        return None

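# Usage sketch (hypothetical file path): the transcription endpoint expects a
# multipart upload, so the open file handle goes in `files` and the model name
# in the form fields:
#   transcribe_audio(os.getenv('OPENAI_API_KEY'), '0712_0305_Recording.wav', 'whisper-1')
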
def save_and_play_audio(audio_recorder):
    # Record from the browser mic, save the bytes to a timestamped .wav,
    # and play the clip back inline.
    audio_bytes = audio_recorder()
    if audio_bytes:
        filename = generate_filename("Recording", "wav")
        with open(filename, 'wb') as f:
            f.write(audio_bytes)
        st.audio(audio_bytes, format="audio/wav")
        return filename
    return None

def create_file(filename, prompt, response):
    # Persist a prompt/response pair; formatting varies slightly by extension.
    if filename.endswith(".txt"):
        with open(filename, 'w') as file:
            file.write(f"{prompt}\n{response}")
    elif filename.endswith(".htm"):
        with open(filename, 'w') as file:
            file.write(f"{prompt} {response}")
    elif filename.endswith(".md"):
        with open(filename, 'w') as file:
            file.write(f"{prompt}\n\n{response}")
    else:
        # Fall back to plain text so the other output choices (xlsx, csv, py)
        # still produce a file instead of silently writing nothing.
        with open(filename, 'w') as file:
            file.write(f"{prompt}\n{response}")

def truncate_document(document, length):
    return document[:length]

def divide_document(document, max_length):
    # Slice the document into max_length-sized sections; the last keeps the remainder.
    return [document[i:i+max_length] for i in range(0, len(document), max_length)]

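# Example: divide_document("abcdefgh", 3) -> ["abc", "def", "gh"]; sections
# never overlap, so an answer that straddles a boundary needs both sections.
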
def get_table_download_link(file_path):
    # Build an inline base64 download link so files can be fetched straight
    # from the sidebar without a server-side route.
    try:
        with open(file_path, 'r') as file:
            data = file.read()
    except Exception:
        return file_path  # unreadable file: fall back to showing the bare path
    b64 = base64.b64encode(data.encode()).decode()
    file_name = os.path.basename(file_path)
    ext = os.path.splitext(file_name)[1]  # the extension decides the MIME type
    if ext in ('.txt', '.py'):
        mime_type = 'text/plain'
    elif ext == '.csv':
        mime_type = 'text/csv'
    elif ext == '.xlsx':
        mime_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    elif ext == '.htm':
        mime_type = 'text/html'
    elif ext == '.md':
        mime_type = 'text/markdown'
    else:
        mime_type = 'application/octet-stream'  # generic binary fallback
    href = f'<a href="data:{mime_type};base64,{b64}" target="_blank" download="{file_name}">{file_name}</a>'
    return href

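# Note: the returned string is raw HTML, so callers render it with
# st.markdown(link, unsafe_allow_html=True). The whole file rides along in the
# data: URI, which suits small text files but bloats the page for large ones.
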
def CompressXML(xml_text):
    # Drop comment-like elements to shrink XML before using it as chat context.
    # ElementTree elements have no .parent attribute, so build a child->parent
    # map first (the original elem.parent.remove(elem) would raise AttributeError).
    root = ET.fromstring(xml_text)
    parent_map = {child: parent for parent in root.iter() for child in parent}
    for elem in list(root.iter()):
        if isinstance(elem.tag, str) and 'Comment' in elem.tag:
            if elem in parent_map:
                parent_map[elem].remove(elem)
    return ET.tostring(root, encoding='unicode', method="xml")

def read_file_content(file, max_length):
    # Dispatch on the uploaded file's MIME type and return its text content.
    if file.type == "application/json":
        content = json.load(file)
        return str(content)
    elif file.type == "text/html" or file.type == "text/htm":
        content = BeautifulSoup(file, "html.parser")
        return content.text
    elif file.type == "application/xml" or file.type == "text/xml":
        tree = ET.parse(file)
        root = tree.getroot()
        xml = CompressXML(ET.tostring(root, encoding='unicode'))
        return xml
    elif file.type == "text/markdown" or file.type == "text/md":
        md = mistune.create_markdown()
        content = md(file.read().decode())
        return content
    elif file.type == "text/plain":
        return file.getvalue().decode()
    else:
        return ""

def chat_with_model(prompt, document_section, model_choice='gpt-3.5-turbo'):
    # Stream a chat completion, rendering partial output as it arrives.
    conversation = [{'role': 'system', 'content': 'You are a helpful assistant.'}]
    conversation.append({'role': 'user', 'content': prompt})
    if len(document_section) > 0:
        conversation.append({'role': 'assistant', 'content': document_section})

    start_time = time.time()
    report = []
    res_box = st.empty()
    collected_chunks = []
    collected_messages = []

    for chunk in openai.ChatCompletion.create(
            model=model_choice,  # was hardcoded to 'gpt-3.5-turbo', ignoring the selector
            messages=conversation,
            temperature=0.5,
            stream=True):
        collected_chunks.append(chunk)                # save the event response
        chunk_message = chunk['choices'][0]['delta']  # extract the message delta
        collected_messages.append(chunk_message)      # save the message
        content = chunk_message.get('content')
        if content:  # the final chunk carries no content, only a finish reason
            report.append(content)
            result = "".join(report).strip()
            res_box.markdown(f'*{result}*')

    full_reply_content = ''.join([m.get('content', '') for m in collected_messages])
    st.write("Elapsed time:")
    st.write(time.time() - start_time)
    return full_reply_content

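# Note: this targets the pre-1.0 openai SDK, where streaming yields dict-like
# chunks with a 'delta'. On openai>=1.0 the rough equivalent would be
# client.chat.completions.create(..., stream=True) with attribute access on
# each chunk instead of dict indexing.
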
def chat_with_file_contents(prompt, file_content, model_choice='gpt-3.5-turbo'):
    # Non-streaming variant used for one-shot queries over a whole file.
    conversation = [{'role': 'system', 'content': 'You are a helpful assistant.'}]
    conversation.append({'role': 'user', 'content': prompt})
    if len(file_content) > 0:
        conversation.append({'role': 'assistant', 'content': file_content})
    response = openai.ChatCompletion.create(model=model_choice, messages=conversation)
    return response['choices'][0]['message']['content']

def pdf2txt(pdf_docs):
    # Concatenate the extracted text of every page of every uploaded PDF.
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text() or ""  # extract_text() can return None
    return text

def txt2chunks(text):
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
    return text_splitter.split_text(text)

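# With chunk_size=1000 and chunk_overlap=200 (both in characters, per
# length_function=len), consecutive chunks share roughly 200 characters of
# context, which helps retrieval when an answer spans a chunk boundary.
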
def vector_store(text_chunks):
    # Embed each chunk with OpenAI embeddings and index them in FAISS.
    key = os.getenv('OPENAI_API_KEY')
    embeddings = OpenAIEmbeddings(openai_api_key=key)
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)

def get_chain(vectorstore):
    llm = ChatOpenAI()
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(), memory=memory)

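# The full retrieval pipeline is: PDF text -> overlapping chunks -> OpenAI
# embeddings -> FAISS index -> ConversationalRetrievalChain, which condenses a
# follow-up question against the buffered chat history into a standalone query
# before searching the index.
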
def process_user_input(user_question):
    # Run the question through the retrieval chain and render the alternating
    # user/assistant history with the HTML templates.
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']
    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
    # Save the latest answer alongside the question for the sidebar file list
    filename = generate_filename(user_question, 'txt')
    create_file(filename, user_question, message.content)
    #st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)

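# ConversationBufferMemory(return_messages=True) stores the history as
# [human, ai, human, ai, ...], which is what lets the even/odd index check
# above pick the matching template.
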
def main():
    # Sidebar and globals
    load_dotenv()  # load .env before reading the key (originally only called after main)
    openai.api_key = os.getenv('OPENAI_API_KEY')
    st.set_page_config(page_title="GPT Streamlit Document Reasoner", layout="wide")

    # Output file type and model choice
    menu = ["htm", "txt", "xlsx", "csv", "md", "py"]
    choice = st.sidebar.selectbox("Output File Type:", menu)
    model_choice = st.sidebar.radio("Select Model:", ('gpt-3.5-turbo', 'gpt-3.5-turbo-0301'))

    # Record audio, transcribe it, and send the transcript to GPT
    filename = save_and_play_audio(audio_recorder)
    if filename is not None:
        transcription = transcribe_audio(openai.api_key, filename, "whisper-1")
        st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)
        filename = None  # transcription is finished; next run reuses the saved transcript

    # Prompt interface
    user_prompt = st.text_area("Enter prompts, instructions & questions:", '', height=100)

    # File-section interface for prompting against large documents as context
    collength, colupload = st.columns([2, 3])  # adjust the ratio as needed
    with collength:
        max_length = st.slider("File section length for large files", min_value=1000, max_value=128000, value=12000, step=1000)
    with colupload:
        uploaded_file = st.file_uploader("Add a file for context:", type=["xml", "json", "xlsx", "csv", "html", "htm", "md", "txt"])

    # Per-section chat over the uploaded document
    document_sections = deque()
    document_responses = {}
    if uploaded_file is not None:
        file_content = read_file_content(uploaded_file, max_length)
        document_sections.extend(divide_document(file_content, max_length))
    if len(document_sections) > 0:
        if st.button("👁️ View Upload"):
            st.markdown("**Sections of the uploaded file:**")
            for i, section in enumerate(list(document_sections)):
                st.markdown(f"**Section {i+1}**\n{section}")
        st.markdown("**Chat with the model:**")
        for i, section in enumerate(list(document_sections)):
            if i in document_responses:
                st.markdown(f"**Section {i+1}**\n{document_responses[i]}")
            else:
                if st.button(f"Chat about Section {i+1}"):
                    st.write('Reasoning with your inputs...')
                    response = chat_with_model(user_prompt, section, model_choice)
                    st.write('Response:')
                    st.write(response)
                    document_responses[i] = response
                    filename = generate_filename(f"{user_prompt}_section_{i+1}", choice)
                    create_file(filename, user_prompt, response)
                    st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)

    if st.button('💬 Chat'):
        st.write('Reasoning with your inputs...')
        response = chat_with_model(user_prompt, ''.join(document_sections), model_choice)
        st.write('Response:')
        st.write(response)

        filename = generate_filename(user_prompt, choice)
        create_file(filename, user_prompt, response)
        st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)

    all_files = glob.glob("*.*")
    all_files = [file for file in all_files if len(os.path.splitext(file)[0]) >= 20]  # keep only generated files (long names)
    all_files.sort(key=lambda x: (os.path.splitext(x)[1], x), reverse=True)  # sort by file type, then name, descending

    # Sidebar file browser: render as markdown, open, search, or delete each file
    file_contents = ''
    next_action = ''
    for file in all_files:
        col1, col2, col3, col4, col5 = st.sidebar.columns([1, 6, 1, 1, 1])  # adjust the ratio as needed
        with col1:
            if st.button("🌐", key="md_" + file):  # render as markdown
                with open(file, 'r') as f:
                    file_contents = f.read()
                next_action = 'md'
        with col2:
            st.markdown(get_table_download_link(file), unsafe_allow_html=True)
        with col3:
            if st.button("📂", key="open_" + file):  # open in a text area
                with open(file, 'r') as f:
                    file_contents = f.read()
                next_action = 'open'
        with col4:
            if st.button("🔍", key="read_" + file):  # search: reason over the file
                with open(file, 'r') as f:
                    file_contents = f.read()
                next_action = 'search'
        with col5:
            if st.button("🗑", key="delete_" + file):
                os.remove(file)
                st.experimental_rerun()

    if len(file_contents) > 0:
        if next_action == 'open':
            file_content_area = st.text_area("File Contents:", file_contents, height=500)
        if next_action == 'md':
            st.markdown(file_contents)
        if next_action == 'search':
            file_content_area = st.text_area("File Contents:", file_contents, height=500)
            st.write('Reasoning with your inputs...')
            response = chat_with_model(user_prompt, file_contents, model_choice)
            filename = generate_filename(file_contents, choice)
            create_file(filename, file_contents, response)
            st.experimental_rerun()
        #st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)

if __name__ == "__main__":
    main()

# Streamlit executes this script as __main__, so the PDF-chat interface below
# runs on every rerun, after main().
load_dotenv()
st.write(css, unsafe_allow_html=True)

st.header("Chat with documents :books:")
user_question = st.text_input("Ask a question about your documents:")
if user_question:
    process_user_input(user_question)

with st.sidebar:
    st.subheader("Your documents")
    docs = st.file_uploader("Upload your documents", accept_multiple_files=True)
    with st.spinner("Processing"):
        raw = pdf2txt(docs) if docs else ""  # file_uploader returns None/[] until files are added
        if len(raw) > 0:
            length = str(len(raw))
            text_chunks = txt2chunks(raw)
            vectorstore = vector_store(text_chunks)
            st.session_state.conversation = get_chain(vectorstore)
            st.markdown('# AI Search Index of Length:' + length + ' Created.')
            filename = generate_filename(raw, 'txt')
            create_file(filename, raw, '')