AryanJh committed on
Commit
8476633
·
verified ·
1 Parent(s): a16686c

Readded the RAG code

Browse files

Deleted it mistakenly

Files changed (1) hide show
  1. app.py +303 -0
app.py CHANGED
@@ -1,3 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  def create_demo():
2
  """Create an improved Gradio 5 interface for the Brock Events Assistant"""
3
  # Initialize the RAG system
 
1
+ # app.py
2
+ import gradio as gr
3
+ import feedparser
4
+ from bs4 import BeautifulSoup
5
+ from datetime import datetime, timedelta
6
+ import pytz
7
+ from typing import List, Dict
8
+ from sentence_transformers import SentenceTransformer
9
+ import chromadb
10
+ import gc
11
+ import json
12
+ import os
13
+ from fuzzywuzzy import fuzz
14
+
15
class BrockEventsRAG:
    """Retrieval layer for upcoming Brock University events.

    Downloads the ExperienceBU RSS feed, keeps the events that start within
    the next 14 days, embeds a text rendering of each one with a
    SentenceTransformer model and stores the result in a ChromaDB collection
    that ``query`` and ``generate_response`` search.
    """

    # Timestamp layouts accepted from the RSS fields and from the
    # ``<time datetime="...">`` elements embedded in the description HTML.
    _RSS_TIME_FORMAT = '%a, %d %b %Y %H:%M:%S %Z'
    _HTML_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S'

    def __init__(self):
        """Initialize the RAG system with improved caching.

        Side effects: loads the embedding model, creates the ``cache``
        directory, (re)creates the ``brock_events`` collection and fetches
        the feed through ``update_database``.
        """
        print("Initializing models and database...")
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.chroma_client = chromadb.Client()

        # Date window: local midnight today up to 14 days later.
        self.eastern = pytz.timezone('America/New_York')
        self.today = datetime.now(self.eastern).replace(hour=0, minute=0, second=0, microsecond=0)
        self.date_range_end = self.today + timedelta(days=14)

        # Cache directory setup
        os.makedirs("cache", exist_ok=True)
        self.cache_file = "cache/events_cache.json"

        # Initialize or reset collection
        collection_args = {
            'name': "brock_events",
            'metadata': {"description": "Brock University Events Database"},
        }
        try:
            self.collection = self.chroma_client.create_collection(**collection_args)
        except Exception:
            # Presumably the collection already exists: drop it and start clean.
            self.chroma_client.delete_collection("brock_events")
            self.collection = self.chroma_client.create_collection(**collection_args)

        # Patterns learned while processing events
        self.known_categories = set()
        self.known_hosts = set()
        self.known_locations = set()
        self.faculty_patterns = {}
        self.category_patterns = {}

        # Load initial events
        self.update_database()

    def _parse_rss_time(self, value):
        """Convert an RSS timestamp string to an Eastern-time datetime.

        Returns None when ``value`` is empty or does not match
        ``_RSS_TIME_FORMAT``. The parsed value is treated as UTC before
        conversion.
        """
        if not value:
            return None
        try:
            parsed = datetime.strptime(value, self._RSS_TIME_FORMAT)
        except (TypeError, ValueError) as e:
            print(f"Error parsing dates: {e}")
            return None
        return pytz.UTC.localize(parsed).astimezone(self.eastern)

    def _parse_html_time(self, elem):
        """Convert a ``<time datetime="...">`` element to an Eastern datetime.

        Returns None when the element is missing, has no ``datetime``
        attribute, or the attribute does not match ``_HTML_TIME_FORMAT``.
        Anything after the first ``.`` (fractional seconds) is ignored.
        """
        if elem is None or 'datetime' not in elem.attrs:
            return None
        try:
            raw = elem['datetime'].split('.')[0]
            parsed = datetime.strptime(raw, self._HTML_TIME_FORMAT)
        except (AttributeError, TypeError, ValueError) as e:
            print(f"Error parsing dates: {e}")
            return None
        return self.eastern.localize(parsed)

    def parse_event_datetime(self, entry) -> tuple:
        """Parse start and end times from both RSS and HTML.

        The ``start``/``end`` fields of the entry are tried first. When
        either is missing or unparseable, the ``dt-start``/``dt-end``
        ``<time>`` elements of the description HTML are consulted and, when
        valid, take precedence. Each timestamp is parsed independently so a
        malformed value does not discard the other one.

        Returns:
            ``(start, end)``; either item is None when it could not be
            determined.
        """
        try:
            start_dt = self._parse_rss_time(entry.get('start', None))
            end_dt = self._parse_rss_time(entry.get('end', None))

            # If we didn't get both times from RSS, try HTML
            if not start_dt or not end_dt:
                soup = BeautifulSoup(entry.get('description') or '', 'html.parser')
                html_start = self._parse_html_time(soup.find('time', class_='dt-start'))
                html_end = self._parse_html_time(soup.find('time', class_='dt-end'))
                start_dt = html_start or start_dt
                end_dt = html_end or end_dt

            return start_dt, end_dt
        except Exception as e:
            print(f"Error parsing dates: {e}")
            return None, None

    def get_location(self, entry) -> str:
        """Extract location from both RSS and HTML.

        Uses the entry's ``location`` field, falling back to the text of the
        ``<span class="p-location">`` element in the description. Returns
        ``"Location not specified"`` when neither yields a value or on error.
        """
        try:
            location = entry.get('location', None)
            if not location:
                soup = BeautifulSoup(entry.description, 'html.parser')
                location_elem = soup.find('span', class_='p-location')
                if location_elem:
                    location = location_elem.get_text().strip()
            return location if location else "Location not specified"
        except Exception as e:
            print(f"Error getting location: {e}")
            return "Location not specified"

    def process_event(self, entry) -> Dict:
        """Process a single event entry with improved category handling.

        Returns a dict with the keys ``title``, ``start_time``, ``end_time``,
        ``location``, ``categories``, ``hosts``, ``description``, ``link``
        and ``guid``, or None when the event has no start time, starts
        outside the date window, or processing fails. Also records the
        categories, hosts and location seen on the instance.
        """
        try:
            start_time, end_time = self.parse_event_datetime(entry)
            if not start_time or not self.is_event_in_range(start_time):
                return None

            location = self.get_location(entry)
            categories = [tag.term for tag in entry.get('tags', [])]
            categories_str = '; '.join(categories) if categories else 'No categories'

            hosts = entry.get('host', [])
            if not isinstance(hosts, list):
                hosts = [hosts]
            hosts_str = '; '.join(hosts) if hosts else 'No host specified'

            # Update known patterns
            self.known_categories.update(categories)
            self.known_hosts.update(hosts)
            self.known_locations.add(location)

            # Learn faculty associations: count (host, category) pairs for
            # hosts whose name mentions "faculty".
            for host in hosts:
                if 'faculty' not in host.lower():
                    continue
                for category in categories:
                    key = (host, category)
                    self.faculty_patterns[key] = self.faculty_patterns.get(key, 0) + 1

            # Clean description: strip markup and collapse whitespace
            soup = BeautifulSoup(entry.description, 'html.parser')
            description = ' '.join(soup.get_text().split())

            return {
                'title': entry.title,
                'start_time': start_time,
                'end_time': end_time,
                'location': location,
                'categories': categories_str,
                'hosts': hosts_str,
                'description': description,
                'link': entry.link,
                'guid': entry.guid,
            }
        except Exception as e:
            print(f"Error processing event {entry.get('title', 'Unknown')}: {e}")
            return None

    def is_event_in_range(self, event_time: datetime) -> bool:
        """Check if event falls within our date range (both ends inclusive)."""
        if not event_time:
            return False
        return self.today <= event_time <= self.date_range_end

    def format_event_text(self, event: Dict) -> str:
        """Format event information for embedding with improved structure.

        ``event`` is a dict as returned by ``process_event``. The description
        is truncated to 500 characters and a missing end time is rendered as
        ``not specified``.
        """
        start = event['start_time']
        end = event['end_time']
        end_text = end.strftime('%I:%M %p') if end else 'not specified'
        lines = [
            f"Event: {event['title']}",
            f"Date: {start.strftime('%A, %B %d, %Y')}",
            f"Time: {start.strftime('%I:%M %p')} to {end_text}",
            f"Location: {event['location']}",
            f"Categories: {event['categories']}",
            f"Hosted by: {event['hosts']}",
            f"Description: {event['description'][:500]}",
        ]
        return "\n" + "\n".join(lines) + "\n"

    def update_database(self):
        """Update database with events in date range with improved error handling.

        Fetches the feed, keeps the entries ``process_event`` accepts, embeds
        them, writes them to the collection and saves them to the cache file.
        Errors are printed, not raised.
        """
        print("Fetching events...")
        try:
            feed = feedparser.parse("https://experiencebu.brocku.ca/events.rss")
            print(f"Found {len(feed.entries)} total events")

            valid_events = []
            for entry in feed.entries:
                event = self.process_event(entry)
                if event:
                    valid_events.append(event)

            print(f"Found {len(valid_events)} events in the next 14 days")

            if not valid_events:
                print("No events found in date range")
                return

            # Prepare data for database
            documents = [self.format_event_text(event) for event in valid_events]
            metadatas = [{
                'title': event['title'],
                'date': event['start_time'].strftime('%Y-%m-%d'),
                'time': event['start_time'].strftime('%I:%M %p'),
                'location': event['location'],
                'categories': event['categories'],
                'link': event['link'],
            } for event in valid_events]
            ids = [f"event_{i}" for i in range(len(valid_events))]

            # Generate embeddings and write them to the database. upsert is
            # used instead of add because the ids are positional: a repeated
            # refresh reuses them and must overwrite the previous rows.
            embeddings = self.model.encode(documents)
            self.collection.upsert(
                documents=documents,
                embeddings=embeddings.tolist(),
                metadatas=metadatas,
                ids=ids
            )
            print(f"Successfully added {len(valid_events)} events to database")

            # Save to cache
            self.save_cache({
                'last_update': datetime.now().isoformat(),
                'events': valid_events
            })

            gc.collect()
        except Exception as e:
            print(f"Error updating database: {e}")

    def save_cache(self, data: dict):
        """Save events data to cache file.

        ``data`` must have ``last_update`` and ``events`` keys. Each event is
        shallow-copied and its ``start_time``/``end_time`` datetimes are
        converted to ISO-8601 strings before the JSON is written to
        ``self.cache_file``. Errors are printed, not raised.
        """
        try:
            serializable_data = {
                'last_update': data['last_update'],
                'events': []
            }

            for event in data['events']:
                event_copy = event.copy()
                for field in ('start_time', 'end_time'):
                    if event_copy.get(field):
                        event_copy[field] = event_copy[field].isoformat()
                serializable_data['events'].append(event_copy)

            with open(self.cache_file, 'w', encoding='utf-8') as f:
                json.dump(serializable_data, f, ensure_ascii=False, indent=2)
        except Exception as e:
            print(f"Error saving cache: {e}")

    def generate_response(self, question: str, history: list) -> str:
        """Generate a response based on the query with improved matching.

        Args:
            question: the user's message.
            history: chat history; accepted for the caller's signature but
                not used here.

        Returns:
            Markdown text listing up to three matching events followed by a
            follow-up hint chosen from keywords in the question, or a
            fallback message when nothing matches or an error occurs.
        """
        try:
            # Query the database
            results = self.query(question)
            if not results or not results['documents'] or not results['documents'][0]:
                return "I couldn't find any events matching your query. Try asking about upcoming events in a different way!"

            # Analyze the question intent
            question_lower = question.lower()
            is_faculty_query = any(word in question_lower for word in ['faculty', 'department', 'school'])
            is_time_query = any(word in question_lower for word in ['when', 'time', 'date', 'week', 'today', 'tomorrow'])
            is_location_query = any(word in question_lower for word in ['where', 'location', 'place', 'building', 'room'])

            parts = ["Here are some relevant events I found:\n\n"]

            # Add top 3 matching events with improved formatting
            top_matches = zip(results['documents'][0][:3], results['metadatas'][0][:3])
            for i, (_doc, metadata) in enumerate(top_matches, 1):
                event_location = metadata['location']
                is_online = any(term in event_location.lower() for term in ['teams', 'zoom', 'online'])
                location_icon = "📱" if is_online else "📍"

                parts.append(f"{i}. **{metadata['title']}**\n")
                parts.append(f"📅 {metadata['date']} at {metadata['time']}\n")
                parts.append(f"{location_icon} {event_location}\n")
                if 'categories' in metadata:
                    parts.append(f"🏷️ {metadata['categories']}\n")
                parts.append(f"🔗 More info: {metadata['link']}\n\n")

            # Add context-aware prompt
            if is_time_query:
                parts.append("\nYou can ask about specific dates or times for any of these events!")
            elif is_location_query:
                parts.append("\nYou can ask for more details about the locations or directions!")
            elif is_faculty_query:
                parts.append("\nYou can ask about other faculty-specific events too!")
            else:
                parts.append("\nYou can ask me for more specific details about any of these events!")

            return "".join(parts)

        except Exception as e:
            print(f"Error generating response: {e}")
            return "I encountered an error while searching for events. Please try asking in a different way."

    def query(self, question: str, n_results: int = 3) -> Dict:
        """Query the database with improved error handling.

        Embeds ``question`` and returns the collection's query result with
        documents, metadatas and distances for the ``n_results`` nearest
        events, or None when the lookup fails.
        """
        try:
            question_embedding = self.model.encode(question)
            results = self.collection.query(
                query_embeddings=[question_embedding.tolist()],
                n_results=n_results,
                include=['documents', 'metadatas', 'distances']
            )
            return results
        except Exception as e:
            print(f"Error during query: {e}")
            return None
303
+
304
  def create_demo():
305
  """Create an improved Gradio 5 interface for the Brock Events Assistant"""
306
  # Initialize the RAG system