Jashan1 commited on
Commit
160984c
·
verified ·
1 Parent(s): 61346bb

Upload 3 files

Browse files
Files changed (3) hide show
  1. .env +2 -0
  2. app.py +520 -0
  3. requirements.txt +10 -0
.env ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ OPENAI_API_KEY=your-openai-api-key-here  # REDACTED: a live key was committed here; revoke it and keep .env out of version control
2
+ LLAMA_CLOUD_API_KEY=your-llama-cloud-api-key-here  # REDACTED: a live key was committed here; revoke it and keep .env out of version control
app.py ADDED
@@ -0,0 +1,520 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import requests
4
+ import streamlit as st
5
+ from openai import OpenAI
6
+ from PyPDF2 import PdfReader
7
+ import urllib.parse
8
+ from dotenv import load_dotenv
9
+ from openai import OpenAI
10
+ from io import BytesIO
11
+ from streamlit_extras.colored_header import colored_header
12
+ from streamlit_extras.add_vertical_space import add_vertical_space
13
+ from streamlit_extras.switch_page_button import switch_page
14
+ import json
15
+ import pandas as pd
16
+ from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode, DataReturnMode
17
+ import time
18
+ import random
19
+ import aiohttp
20
+ import asyncio
21
+ from PyPDF2 import PdfWriter
22
+
23
# ---------------------- Configuration ----------------------
# Load environment variables (e.g. OPENAI_API_KEY) from the .env file.
# A single call is sufficient; the original called it twice.
load_dotenv()

# Page-level Streamlit settings; kept ahead of every other st.* call in this script.
st.set_page_config(page_title="Building Regulations Chatbot", layout="wide", initial_sidebar_state="expanded")

# OpenAI client shared by the helper functions below.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# ---------------------- Session State Initialization ----------------------
# Default value for every session key the app reads. The dict literal is
# rebuilt on each script run, so the list defaults are never shared between
# sessions.
_session_defaults = {
    'pdf_contents': [],
    'chat_history': [],
    'processed_pdfs': False,
    'id_counter': 0,
    'assistant_id': None,
    'thread_id': None,
    'file_ids': [],
}
for _state_key, _default_value in _session_defaults.items():
    # Only fill in keys that are missing so values survive Streamlit reruns.
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _default_value
49
+
50
+
51
+ # ---------------------- Helper Functions ----------------------
52
+
53
def get_vector_stores():
    """Return the vector-store listing obtained from the OpenAI client.

    Exceptions are not propagated. When the listing call fails, a string of
    the form "Error retrieving vector stores: <message>" is returned instead,
    so callers must check the result type before iterating over it.
    """
    try:
        return client.beta.vector_stores.list()
    except Exception as exc:
        reason = str(exc)
        return f"Error retrieving vector stores: {reason}"
59
+
60
+
61
def fetch_pdfs(city_code, timeout=30):
    """Fetch the PDF records published for a city code.

    Args:
        city_code: Code entered by the user. It is URL-quoted before being
            placed in the request path so characters such as '/', '?' or
            spaces cannot alter the request.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout, requests.get can block the Streamlit script indefinitely.

    Returns:
        The value stored under the 'data' key of the JSON response (an empty
        list when the key is absent), or None when the request fails, the
        status code is not 200, or the body is not valid JSON. Each failure
        case also shows an error in the Streamlit UI.
    """
    quoted_code = urllib.parse.quote(str(city_code).strip(), safe="")
    url = f"http://91.203.213.50:5000/oereblex/{quoted_code}"

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Connection errors and timeouts used to escape as a raw traceback.
        st.error(f"Failed to fetch PDFs for city code {city_code}. Error: {str(e)}")
        return None

    if response.status_code != 200:
        st.error(f"Failed to fetch PDFs for city code {city_code}")
        return None

    try:
        data = response.json()
    except ValueError as e:
        # requests raises a ValueError subclass when the body is not JSON.
        st.error(f"Failed to fetch PDFs for city code {city_code}. Error: {str(e)}")
        return None

    records = data.get('data', [])
    print("First data:", records[0] if records else None)
    return records
71
+
72
+
73
def download_pdf(url, doc_title, timeout=60):
    """Download a PDF and save it in the current working directory.

    Args:
        url: Location of the PDF. 'https://' is prepended when no scheme is
            present.
        doc_title: Human-readable title used to derive the file name.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The name of the file that was written, or None when the URL is
        unusable, the download fails, or the file cannot be saved. Each
        failure case also shows an error in the Streamlit UI.
    """
    # Rows coming from a DataFrame may hold NaN/None instead of a string.
    if not isinstance(url, str) or not url.strip():
        st.error(f"Failed to download PDF from {url}. Error: missing or invalid URL")
        return None

    # Add 'https://' scheme if it's missing
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.RequestException as e:
        st.error(f"Failed to download PDF from {url}. Error: {str(e)}")
        return None

    # Sanitize doc_title to create a valid filename
    sanitized_title = ''.join(
        c for c in str(doc_title) if c.isalnum() or c in (' ', '_', '-')
    ).rstrip()
    # Fall back to a fixed stem so a title made only of punctuation does not
    # produce the hidden file ".pdf".
    sanitized_title = sanitized_title.replace(' ', '_') or 'document'
    filename = f"{sanitized_title}.pdf"

    # Keep bumping the counter until the name is free. A single bump could
    # still collide with a file written during an earlier run.
    while os.path.exists(filename):
        filename = f"{sanitized_title}_{st.session_state.id_counter}.pdf"
        st.session_state.id_counter += 1

    # Save the PDF content to a file
    try:
        with open(filename, 'wb') as f:
            f.write(response.content)
    except OSError as e:
        st.error(f"Failed to download PDF from {url}. Error: {str(e)}")
        return None

    return filename
100
+
101
+
102
+ # Helper function to upload file to OpenAI
103
def upload_file_to_openai(file_path):
    """Upload a local file to OpenAI with purpose 'assistants'.

    Args:
        file_path: Path of the file to upload.

    Returns:
        The id of the uploaded file, or None when opening or uploading the
        file fails (an error is shown in the Streamlit UI in that case).
    """
    try:
        # The context manager closes the handle even when the upload raises.
        # The original passed open(...) inline and never closed it.
        with open(file_path, 'rb') as pdf_file:
            uploaded = client.files.create(
                file=pdf_file,
                purpose='assistants'
            )
        return uploaded.id
    except Exception as e:
        st.error(f"Failed to upload file {file_path}. Error: {str(e)}")
        return None
113
+
114
+
115
def create_assistant():
    """Create the "Building Regulations Assistant" and remember its id.

    The assistant is created with the file_search tool on the gpt-4o-mini
    model. Its id is stored in st.session_state.assistant_id and returned.
    """
    assistant_settings = {
        "name": "Building Regulations Assistant",
        "instructions": "You are an expert on building regulations. Use the provided documents to answer questions accurately.",
        "model": "gpt-4o-mini",
        "tools": [{"type": "file_search"}],
    }
    created = client.beta.assistants.create(**assistant_settings)
    st.session_state.assistant_id = created.id
    return created.id
124
+
125
+
126
def format_response(response, citations):
    """Format the response with proper markdown structure.

    Args:
        response: The answer text.
        citations: Iterable of citation strings; may be empty or None.

    Returns:
        The answer with surrounding whitespace stripped. When citations are
        given, a blank line, a "### Citations" heading and one "- <citation>"
        bullet per entry are appended.
    """
    # Built with str.join rather than a nested f-string: a backslash ("\n")
    # inside an f-string expression is a SyntaxError before Python 3.12.
    lines = [str(response), ""]
    if citations:
        lines.append("### Citations")
        lines.extend(f"- {citation}" for citation in citations)
    return "\n".join(lines).strip()
135
+
136
def response_generator(response, citations, *, word_delay=0.05, pause_delay=0.1):
    """Generator for streaming response with structured output.

    Args:
        response: Answer text; it is split on whitespace and yielded one word
            at a time, each followed by a single space.
        citations: Iterable of citation strings; may be empty or None.
        word_delay: Seconds to sleep after an ordinary word and after each
            citation line.
        pause_delay: Seconds to sleep after a word ending in '.', '!', '?' or
            ':' and after the citations heading.

    Yields:
        Text chunks in display order: the words, then (only when citations
        are given) a "### Citations" heading and one "- <citation>" line per
        entry.
    """
    sentence_endings = ('.', '!', '?', ':')

    # Yield the main response word by word
    for word in response.split():
        yield word + " "
        # Add natural pauses at punctuation
        time.sleep(pause_delay if word.endswith(sentence_endings) else word_delay)

    # If there are citations, yield them with proper formatting
    if citations:
        # Add some spacing before citations
        yield "\n\n### Citations\n\n"
        time.sleep(pause_delay)

        for citation in citations:
            yield f"- {citation}\n"
            time.sleep(word_delay)
157
+
158
def chat_with_assistant(file_ids, user_message):
    """Send a user message to the assistant and stream its answer into the page.

    The message is posted, with every file attached for file_search, to the
    thread whose id is kept in st.session_state.thread_id; the thread is
    created on first use. A run is then executed with
    st.session_state.assistant_id and the first message returned for that run
    is rendered chunk by chunk in a Streamlit placeholder.

    Args:
        file_ids: OpenAI file ids to attach to the user message.
        user_message: The question text.

    Returns:
        A (text, citations) tuple. On success, text is the formatted answer
        (citation list included) and citations is a list of
        "[index] filename" strings. When the thread, run or message retrieval
        fails, or no text answer is available, text is a short explanatory
        message and citations is an empty list.

    Raises:
        Exceptions from creating the thread or posting the message are not
        caught here and propagate to the caller.
    """
    print("----- Starting chat_with_assistant -----")
    print("Received file_ids:", file_ids)
    print("Received user_message:", user_message)

    # Create attachments for each file_id
    attachments = [{"file_id": file_id, "tools": [{"type": "file_search"}]} for file_id in file_ids]
    print("Attachments created:", attachments)

    if st.session_state.thread_id is None:
        print("No existing thread_id found. Creating a new thread.")
        thread = client.beta.threads.create(
            messages=[
                {
                    "role": "user",
                    "content": user_message,
                    "attachments": attachments,
                }
            ]
        )
        st.session_state.thread_id = thread.id
        print("New thread created with id:", st.session_state.thread_id)
    else:
        print(f"Existing thread_id found: {st.session_state.thread_id}. Adding message to the thread.")
        message = client.beta.threads.messages.create(
            thread_id=st.session_state.thread_id,
            role="user",
            content=user_message,
            attachments=attachments
        )
        print("Message added to thread with id:", message.id)

    try:
        thread = client.beta.threads.retrieve(thread_id=st.session_state.thread_id)
        print("Retrieved thread:", thread)
    except Exception as e:
        print(f"Error retrieving thread with id {st.session_state.thread_id}: {e}")
        return "An error occurred while processing your request.", []

    try:
        run = client.beta.threads.runs.create_and_poll(
            thread_id=thread.id, assistant_id=st.session_state.assistant_id
        )
        print("Run created and polled:", run)
    except Exception as e:
        print("Error during run creation and polling:", e)
        return "An error occurred while processing your request.", []

    try:
        messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))
        print("Retrieved messages:", messages)
    except Exception as e:
        print("Error retrieving messages:", e)
        return "An error occurred while retrieving messages.", []

    # Process the first message content
    if not (messages and messages[0].content):
        return "No response received from the assistant.", []

    # The first content part is only usable when it carries a `text`
    # attribute; reading `.text` unconditionally raised AttributeError for
    # any other kind of part.
    message_content = getattr(messages[0].content[0], "text", None)
    if message_content is None:
        print("First content part of the assistant message has no text.")
        return "No response received from the assistant.", []
    print("Raw message content:", message_content)

    answer_text = message_content.value
    citations = []
    seen_citations = set()
    # File names already looked up, so a document cited several times costs
    # one files.retrieve call instead of one per annotation.
    filename_by_file_id = {}

    # Process annotations and citations
    for index, annotation in enumerate(message_content.annotations):
        # Swap the inline annotation marker for a numbered reference.
        answer_text = answer_text.replace(annotation.text, f"[{index}]")
        file_citation = getattr(annotation, "file_citation", None)
        if not file_citation:
            continue

        cited_file_id = file_citation.file_id
        if cited_file_id not in filename_by_file_id:
            try:
                filename_by_file_id[cited_file_id] = client.files.retrieve(cited_file_id).filename
            except Exception as e:
                # Best effort: a failed lookup only drops this citation.
                print(f"Error retrieving cited file for annotation {index}: {e}")
                continue

        citation_entry = f"[{index}] {filename_by_file_id[cited_file_id]}"
        if citation_entry not in seen_citations:
            citations.append(citation_entry)
            seen_citations.add(citation_entry)

    # Create a container for the response with proper styling
    response_container = st.container()
    with response_container:
        message_placeholder = st.empty()
        streaming_content = ""

        # Stream the response with structure
        for chunk in response_generator(answer_text, citations):
            streaming_content += chunk
            # Use markdown for proper formatting during streaming
            message_placeholder.markdown(streaming_content + "▌")

        # Final formatted response replaces the streamed text and the cursor.
        final_formatted_response = format_response(answer_text, citations)
        message_placeholder.markdown(final_formatted_response)

    return final_formatted_response, citations
254
+
255
+
256
+ # ---------------------- Streamlit App ----------------------
257
+
258
+ # ---------------------- Custom CSS Injection ----------------------
259
+
260
+ # Inject custom CSS to style chat messages
261
# Stylesheet for the hand-built chat markup (.chat-container, .chat-message,
# .message-content) that the Home page emits through st.markdown.
_chat_css = """
<style>
/* Style for the chat container */
.chat-container {
display: flex;
flex-direction: column;
gap: 1.5rem;
}

/* Style for individual chat messages */
.chat-message {
margin-bottom: 1.5rem;
}

/* Style for user messages */
.chat-message.user > div:first-child {
color: #1E90FF; /* Dodger Blue for "You" */
font-weight: bold;
margin-bottom: 0.5rem;
}

/* Style for assistant messages */
.chat-message.assistant > div:first-child {
color: #32CD32; /* Lime Green for "Assistant" */
font-weight: bold;
margin-bottom: 0.5rem;
}

/* Style for the message content */
.message-content {
padding: 1rem;
border-radius: 0.5rem;
line-height: 1.5;
}

.message-content h3 {
color: #444;
margin-top: 1rem;
margin-bottom: 0.5rem;
font-size: 1.1rem;
}

.message-content ul {
margin-top: 0.5rem;
margin-bottom: 0.5rem;
padding-left: 1.5rem;
}

.message-content li {
margin-bottom: 0.25rem;
}
</style>
"""
st.markdown(_chat_css, unsafe_allow_html=True)

# Sidebar navigation; the first entry is the page shown by default.
_page_names = ["Documents", "Home", "Admin"]
page = st.sidebar.selectbox("Choose a page", _page_names)
316
+
317
if page == "Home":
    # Local import: only this branch needs it, and it keeps the fix
    # self-contained without touching the file's import block.
    import html

    st.title("Building Regulations Chatbot", anchor=False)

    # Sidebar: list the documents chosen on the Documents page.
    with st.sidebar:
        colored_header("Selected Documents", description="Documents for chat")
        if 'selected_pdfs' in st.session_state and not st.session_state.selected_pdfs.empty:
            for _, pdf in st.session_state.selected_pdfs.iterrows():
                st.write(f"- {pdf['Doc Title']}")
        else:
            st.write("No documents selected. Please go to the Documents page.")

    # Main chat area
    colored_header("Chat", description="Ask questions about building regulations")

    # Chat container with custom CSS class.
    # NOTE(review): each st.markdown call renders as its own element, so this
    # opening <div> probably does not wrap the messages below; confirm.
    st.markdown('<div class="chat-container" id="chat-container">', unsafe_allow_html=True)
    for chat in st.session_state.chat_history:
        with st.container():
            if chat['role'] == 'user':
                # User text is untrusted and is rendered with
                # unsafe_allow_html, so escape it first. Newlines become <br>
                # so a blank line cannot terminate the HTML block early.
                safe_content = html.escape(chat['content']).replace("\n", "<br>")
                st.markdown(
                    '<div class="chat-message user">'
                    '<div><strong>You</strong></div>'
                    f'<div class="message-content">{safe_content}</div>'
                    '</div>',
                    unsafe_allow_html=True,
                )
            else:
                st.markdown(
                    '<div class="chat-message assistant">'
                    '<div><strong>Assistant</strong></div>'
                    '</div>',
                    unsafe_allow_html=True,
                )
                # Use st.markdown to render the assistant's message content
                st.markdown(chat['content'])
    st.markdown('</div>', unsafe_allow_html=True)

    # Inject JavaScript to auto-scroll the chat container.
    # NOTE(review): <script> tags passed to st.markdown may not be executed by
    # Streamlit; confirm that this auto-scroll has any effect.
    st.markdown("""
<script>
const chatContainer = document.getElementById('chat-container');
if (chatContainer) {
chatContainer.scrollTop = chatContainer.scrollHeight;
}
</script>
""", unsafe_allow_html=True)

    # Chat input
    with st.form("chat_form", clear_on_submit=True):
        user_input = st.text_area("Ask a question about building regulations...", height=100)
        col1, col2 = st.columns([3, 1])
        with col2:
            submit = st.form_submit_button("Send", use_container_width=True)

    if submit and user_input.strip() != "":
        if not st.session_state.file_ids:
            # Nothing to chat about yet. Do not record the message and do not
            # rerun, otherwise this error would be wiped before being seen.
            st.error("Please process PDFs first.")
        else:
            # Add user message to chat history
            st.session_state.chat_history.append({"role": "user", "content": user_input})

            answered = False
            with st.spinner("Generating response..."):
                try:
                    response, citations = chat_with_assistant(st.session_state.file_ids, user_input)
                    # The response is already formatted, so we can add it directly to chat history
                    st.session_state.chat_history.append({
                        "role": "assistant",
                        "content": response
                    })
                    answered = True
                except Exception as e:
                    st.error(f"Error generating response: {str(e)}")

            # Rerun only on success so the history re-renders with the new
            # messages; on failure the error above stays visible.
            if answered:
                st.rerun()

    # Footer
    add_vertical_space(2)
    st.markdown("---")
    col1, col2 = st.columns(2)
    with col1:
        st.caption("Powered by OpenAI GPT-4 and Pinecone")
    with col2:
        st.caption("© 2023 Your Company Name")
399
+
400
+ elif page == "Documents":
401
+ st.title("Document Selection")
402
+
403
+ city_code_input = st.text_input("Enter city code:", key="city_code_input")
404
+ load_documents_button = st.button("Load Documents", key="load_documents_button")
405
+
406
+ if load_documents_button and city_code_input:
407
+ with st.spinner("Fetching PDFs..."):
408
+ pdfs = fetch_pdfs(city_code_input)
409
+ if pdfs:
410
+ st.session_state.available_pdfs = pdfs
411
+ st.success(f"Found {len(pdfs)} PDFs")
412
+ else:
413
+ st.error("No PDFs found")
414
+
415
+ if 'available_pdfs' in st.session_state:
416
+ st.write(f"Total PDFs: {len(st.session_state.available_pdfs)}")
417
+
418
+ # Create a DataFrame from the available PDFs
419
+ df = pd.DataFrame(st.session_state.available_pdfs)
420
+
421
+ # Select and rename only the specified columns
422
+ df = df[['municipality', 'abbreviation', 'doc_title', 'file_title', 'file_href', 'enactment_date', 'prio']]
423
+ df = df.rename(columns={
424
+ "municipality": "Municipality",
425
+ "abbreviation": "Abbreviation",
426
+ "doc_title": "Doc Title",
427
+ "file_title": "File Title",
428
+ "file_href": "File Href",
429
+ "enactment_date": "Enactment Date",
430
+ "prio": "Prio"
431
+ })
432
+
433
+ # Add a checkbox column to the DataFrame at the beginning
434
+ df.insert(0, "Select", False)
435
+
436
+ # Configure grid options
437
+ gb = GridOptionsBuilder.from_dataframe(df)
438
+ gb.configure_default_column(enablePivot=True, enableValue=True, enableRowGroup=True)
439
+ gb.configure_column("Select", header_name="Select", cellRenderer='checkboxRenderer')
440
+ gb.configure_column("File Href", cellRenderer='linkRenderer')
441
+ gb.configure_selection(selection_mode="multiple", use_checkbox=True)
442
+ gb.configure_side_bar()
443
+ gridOptions = gb.build()
444
+
445
+ # Display the AgGrid
446
+ grid_response = AgGrid(
447
+ df,
448
+ gridOptions=gridOptions,
449
+ enable_enterprise_modules=True,
450
+ update_mode=GridUpdateMode.MODEL_CHANGED,
451
+ data_return_mode=DataReturnMode.FILTERED_AND_SORTED,
452
+ fit_columns_on_grid_load=False,
453
+ )
454
+
455
+ # Get the selected rows
456
+ selected_rows = grid_response['selected_rows']
457
+
458
+ # Debug: Print the structure of selected_rows
459
+ st.write("Debug - Selected Rows Structure:", selected_rows)
460
+
461
+ if st.button("Process Selected PDFs"):
462
+ if len(selected_rows) > 0: # Check if there are any selected rows
463
+ # Convert selected_rows to a DataFrame
464
+ st.session_state.selected_pdfs = pd.DataFrame(selected_rows)
465
+ st.session_state.assistant_id = create_assistant()
466
+ with st.spinner("Processing PDFs and creating/updating assistant..."):
467
+ file_ids = []
468
+
469
+ for _, pdf in st.session_state.selected_pdfs.iterrows():
470
+ # Debug: Print each pdf item
471
+ st.write("Debug - PDF item:", pdf)
472
+
473
+ file_href = pdf['File Href']
474
+ doc_title = pdf['Doc Title']
475
+
476
+ # Pass doc_title to download_pdf
477
+ file_name = download_pdf(file_href, doc_title)
478
+ if file_name:
479
+ file_path = f"./{file_name}"
480
+ file_id = upload_file_to_openai(file_path)
481
+ if file_id:
482
+ file_ids.append(file_id)
483
+ else:
484
+ st.warning(f"Failed to upload {doc_title}. Skipping this file.")
485
+ else:
486
+ st.warning(f"Failed to download {doc_title}. Skipping this file.")
487
+
488
+ st.session_state.file_ids = file_ids
489
+ st.success("PDFs processed successfully. You can now chat on the Home page.")
490
+ else:
491
+ st.warning("Select at least one PDF.")
492
+
493
+
494
+ elif page == "Admin":
495
+ st.title("Admin Panel")
496
+ st.header("Vector Stores Information")
497
+
498
+ vector_stores = get_vector_stores()
499
+ json_vector_stores = json.dumps([vs.model_dump() for vs in vector_stores])
500
+ st.write(json_vector_stores)
501
+
502
+ # Add a button to go back to the main page
503
+
504
+
505
+
506
+
507
+
508
+
509
+
510
+
511
+
512
+
513
+
514
+
515
+
516
+
517
+
518
+
519
+
520
+
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.27.0  # app.py calls st.rerun(), which 1.25.0 does not provide
2
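+ openai>=1.21.0  # app.py uses the v1 client (OpenAI(), client.beta.vector_stores, file_search attachments); 0.28.0 has none of these. Pin an upper bound after testing.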
3
+ PyPDF2==3.0.1
4
+ python-dotenv==1.0.0
5
+ streamlit-extras==0.1.8
6
+ requests==2.31.0
7
+ pandas==2.1.1
8
+ st-aggrid==0.3.3
9
+ aiohttp==3.8.5
10
+ asyncio==4.0.0