Spaces:

AryanJh
/

Brock-Events-Assistant

Sleeping

App Files Files Community

AryanJh committed on Dec 19, 2024

Commit

6520230

verified ·

1 Parent(s): 6e0003d

Enhanced event matching

Browse files

The simplified event matcher, though efficient, didn't work nicely

Files changed (1) hide show

app.py +154 -174

app.py CHANGED Viewed

@@ -12,199 +12,179 @@ import diskcache
 import os
 import chromadb
-class SimplifiedBrockEventsRAG:
     def __init__(self):
-        """Initialize simplified RAG system for CPU environment"""
-        print("Initializing simplified RAG system...")
-        # Initialize embedding model
-        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/mpnet-base")
-        self.model = AutoModel.from_pretrained("microsoft/mpnet-base")
-        # Force CPU usage
-        self.device = torch.device('cpu')
-        self.model.to(self.device)
-        # Set up disk cache
-        cache_dir = os.path.join(os.getcwd(), "cache")
-        os.makedirs(cache_dir, exist_ok=True)
-        self.cache = diskcache.Cache(cache_dir)
-        # Initialize vector store
-        self.chroma_client = chromadb.Client()
-        # Initialize date handling
-        self.eastern = pytz.timezone('America/New_York')
-        self.today = datetime.now(self.eastern).replace(hour=0, minute=0, second=0, microsecond=0)
-        self.date_range_end = self.today + timedelta(days=14)
-        try:
-            self.collection = self.chroma_client.create_collection(
-                name="brock_events",
-                metadata={"description": "Brock University Events Database"}
-            )
-        except Exception:
-            self.chroma_client.delete_collection("brock_events")
-            self.collection = self.chroma_client.create_collection(
-                name="brock_events",
-                metadata={"description": "Brock University Events Database"}
-            )
-        self.load_patterns()
-    def load_patterns(self):
-        """Load optimized search patterns"""
-        self.patterns = {
-            'faculty': {
-                'math': ['mathematics', 'math', 'stats', 'computer science'],
-                'humanities': ['humanities', 'language', 'literature'],
-                'business': ['goodman', 'business', 'accounting'],
-                'science': ['science', 'biology', 'chemistry', 'physics']
-            },
-            'event_type': {
-                'academic': ['lecture', 'seminar', 'workshop', 'conference'],
-                'social': ['meetup', 'social', 'gathering', 'networking'],
-                'career': ['career', 'job', 'employment', 'professional']
-            },
-            'location': {
-                'online': ['online', 'virtual', 'zoom', 'teams'],
-                'campus': ['room', 'hall', 'building', 'plaza'],
-                'library': ['library', 'learning commons', 'makerspace']
-            }
-        }
-    @lru_cache(maxsize=128)
-    def generate_embedding(self, text: str) -> List[float]:
-        """Generate embedding using MPNet"""
-        inputs = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt")
-        with torch.no_grad():
-            outputs = self.model(**inputs)
-            embeddings = self.mean_pooling(outputs, inputs['attention_mask'])
-            return F.normalize(embeddings, p=2, dim=1)[0].tolist()
-    def mean_pooling(self, model_output, attention_mask):
-        """Perform mean pooling on token embeddings"""
-        token_embeddings = model_output[0]
-        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-    @lru_cache(maxsize=128)
-    def preprocess_query(self, query: str) -> str:
-        """Efficient query preprocessing"""
-        query = re.sub(r'[^\w\s]', ' ', query.lower())
-        replacements = {
-            'fms': 'faculty of mathematics and science',
-            'gsb': 'goodman school of business',
-            'foh': 'faculty of humanities'
-        }
-        for abbr, full in replacements.items():
-            query = query.replace(abbr, full)
-        return query.strip()
-    def semantic_search(self, query: str, k: int = 3) -> List[Dict]:
-        """Optimized semantic search"""
-        # Check cache first
-        cache_key = f"search_{query}_{k}"
-        if cache_key in self.cache:
-            return self.cache[cache_key]
-        # Process query and get embeddings
-        processed_query = self.preprocess_query(query)
-        query_embedding = self.generate_embedding(processed_query)
-        # Get results from vector store
-        results = self.collection.query(
-            query_embeddings=[query_embedding],
-            n_results=k,
-            include=['documents', 'metadatas', 'distances']
-        )
-        # Process and rank results
-        processed_results = []
-        for doc, metadata, distance in zip(
-            results['documents'][0],
-            results['metadatas'][0],
-            results['distances'][0]
-        ):
-            # Calculate relevance score
-            relevance_score = self.calculate_relevance(query, doc, metadata)
-            processed_results.append({
-                'document': doc,
-                'metadata': metadata,
-                'score': relevance_score
-            })
-        # Sort by relevance
-        processed_results.sort(key=lambda x: x['score'], reverse=True)
-        # Cache results
-        self.cache[cache_key] = processed_results
-        return processed_results
-    def calculate_relevance(self, query: str, document: str, metadata: Dict) -> float:
-        """Calculate optimized relevance score"""
         score = 0.0
         query_lower = query.lower()
-        # Title similarity (40%)
-        title_similarity = fuzz.ratio(query_lower, metadata['title'].lower()) / 100
-        score += title_similarity * 0.4
-        # Category matching (30%)
-        if 'categories' in metadata:
-            categories_lower = metadata['categories'].lower()
-            for category_type in self.patterns.values():
-                for keywords in category_type.values():
-                    if any(keyword in query_lower for keyword in keywords):
-                        if any(keyword in categories_lower for keyword in keywords):
-                            score += 0.3
-                            break
-        # Location matching (30%)
-        if 'location' in metadata:
-            location_lower = metadata['location'].lower()
-            for keywords in self.patterns['location'].values():
-                if any(keyword in query_lower for keyword in keywords):
-                    if any(keyword in location_lower for keyword in keywords):
-                        score += 0.3
-                        break
         return score
-    def generate_response(self, query: str, results: List[Dict]) -> str:
-        """Generate optimized response"""
-        if not results:
-            return "I couldn't find any events matching your query. Try asking in a different way!"
-        response = "Here are some relevant events I found:\n\n"
-        # Format results
-        for i, result in enumerate(results, 1):
-            metadata = result['metadata']
-            location = metadata['location']
-            is_online = any(term in location.lower()
-                          for term in self.patterns['location']['online'])
-            response += f"{i}. **{metadata['title']}**\n"
-            response += f"📅 {metadata['date']} at {metadata['time']}\n"
-            response += f"{'📱' if is_online else '📍'} {location}\n"
-            if 'categories' in metadata:
-                response += f"🏷️ {metadata['categories']}\n"
-            response += f"🔗 More info: {metadata['link']}\n\n"
-        # Add contextual suggestion
-        if any(keyword in query.lower() for keyword in self.patterns['faculty']['math']):
-            response += "You can ask about events from other faculties too!\n"
-        elif any(keyword in query.lower() for keyword in self.patterns['location']['library']):
-            response += "I can help you find events in other locations too!\n"
-        else:
-            response += "Feel free to ask about specific types of events!\n"
-        return response
 def create_demo():
     """Create optimized Gradio interface"""
-    rag_system = SimplifiedBrockEventsRAG()
     def process_query(message: str, history: list) -> Tuple[str, list]:
         """Process query and generate response"""

 import os
 import chromadb
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+import torch
+from typing import List, Dict, Tuple
+import pytz
+from fuzzywuzzy import fuzz
+class EnhancedEventMatcher:
     def __init__(self):
+        """Initialize the enhanced event matcher with T5"""
+        # Initialize T5 for response enhancement
+        self.tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        self.t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
+        # Initialize pattern learning
+        self.known_categories = set()
+        self.known_hosts = set()
+        self.known_locations = set()
+        self.faculty_patterns = {}
+        self.category_patterns = {}
+    def learn_from_events(self, events: List[Event]):
+        """Learn patterns from existing events"""
+        # Original DynamicEventMatcher learning logic
+        for event in events:
+            self.known_categories.update(event.categories)
+            self.known_hosts.update(event.hosts)
+            self.known_locations.add(event.location)
+            # Learn faculty associations
+            for host in event.hosts:
+                for category in event.categories:
+                    key = (host, category)
+                    if 'faculty' in host.lower():
+                        self.faculty_patterns[key] = self.faculty_patterns.get(key, 0) + 1
+            # Learn category associations
+            for cat1 in event.categories:
+                for cat2 in event.categories:
+                    if cat1 != cat2:
+                        key = (cat1, cat2)
+                        self.category_patterns[key] = self.category_patterns.get(key, 0) + 1
+    def get_faculty_score(self, event: Event, query: str) -> float:
+        """Original faculty scoring logic"""
+        score = 0.0
+        query_lower = query.lower()
+        for host in event.hosts:
+            if 'faculty' in host.lower():
+                ratio = fuzz.partial_ratio(query_lower, host.lower())
+                if ratio > 80:
+                    score += 2.0 * (ratio / 100)
+        for category in event.categories:
+            for (host, cat), count in self.faculty_patterns.items():
+                if category == cat and fuzz.partial_ratio(query_lower, host.lower()) > 80:
+                    score += 1.0 * (count / max(self.faculty_patterns.values()))
+        return score
+    def get_category_score(self, event: Event, query_type: str) -> float:
+        """Original category scoring logic"""
         score = 0.0
+        if not query_type:
+            return score
+        for category in event.categories:
+            ratio = fuzz.partial_ratio(query_type.lower(), category.lower())
+            if ratio > 80:
+                score += 1.5 * (ratio / 100)
+            for (cat1, cat2), count in self.category_patterns.items():
+                if category == cat1 and fuzz.partial_ratio(query_type.lower(), cat2.lower()) > 80:
+                    score += 0.5 * (count / max(self.category_patterns.values()))
+        return score
+    def get_location_score(self, event: Event, query: str) -> float:
+        """Original location scoring logic"""
+        score = 0.0
+        location_lower = event.location.lower()
         query_lower = query.lower()
+        online_terms = {'online', 'virtual', 'teams', 'zoom'}
+        if any(term in query_lower for term in online_terms):
+            if any(term in location_lower for term in online_terms):
+                score += 1.5
+        campus_terms = {'room', 'hall', 'building', 'plaza', 'campus'}
+        if any(term in query_lower for term in {'in-person', 'campus', 'building'}):
+            if any(term in location_lower for term in campus_terms):
+                score += 1.5
         return score
+    def enhance_response(self, matched_events: List[Tuple[Event, float]], query: str) -> str:
+        """Use T5 to enhance response generation"""
+        # Format events for T5 input
+        events_text = ""
+        for event, score in matched_events:
+            events_text += f"""
+Event: {event.title}
+Date: {event.start_time.strftime('%A, %B %d, %Y')}
+Time: {event.start_time.strftime('%I:%M %p')}
+Location: {event.location}
+Categories: {', '.join(event.categories)}
+Score: {score:.2f}
+"""
+        # Create prompt for T5
+        prompt = f"""
+Query: {query}
+Available Events:
+{events_text}
+Generate a natural response highlighting the most relevant events and their details.
+"""
+        # Generate enhanced response
+        inputs = self.tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
+        outputs = self.t5_model.generate(inputs, max_length=300, num_beams=4, temperature=0.7)
+        enhanced_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Format final response with event details
+        final_response = enhanced_response + "\n\n"
+        for event, score in matched_events:
+            location_icon = "📱" if any(term in event.location.lower()
+                                      for term in ['teams', 'zoom', 'online']) else "📍"
+            final_response += f"""
+**{event.title}** {'🌟' * int(min(score, 5))}
+📅 {event.start_time.strftime('%A, %B %d, %Y')} at {event.start_time.strftime('%I:%M %p')}
+{location_icon} {event.location}
+👥 Hosted by: {', '.join(event.hosts)}
+🏷️ Categories: {', '.join(event.categories)}
+🔗 {event.link}
+"""
+        return final_response
+    def match_and_respond(self, events: List[Event], query: str, query_info: Dict) -> str:
+        """Main method to match events and generate response"""
+        # Learn patterns if not already learned
+        self.learn_from_events(events)
+        # Match events using original logic
+        matched_events = []
+        for event in events:
+            faculty_score = self.get_faculty_score(event, query_info['original_query'])
+            category_score = self.get_category_score(event, query_info['event_type'])
+            location_score = self.get_location_score(event, query_info['original_query'])
+            total_score = (faculty_score * 1.5 +
+                         category_score * 1.2 +
+                         location_score * 1.0)
+            if total_score > 0:
+                matched_events.append((event, total_score))
+        # Sort and get top matches
+        matched_events.sort(key=lambda x: x[1], reverse=True)
+        top_matches = matched_events[:3]
+        if not top_matches:
+            return f"I couldn't find any events matching your query for {query_info['faculty'] or 'any faculty'} " \
+                   f"and {query_info['event_type'] or 'any event type'}. Try broadening your search."
+        # Generate enhanced response using T5
+        return self.enhance_response(top_matches, query)
 def create_demo():
     """Create optimized Gradio interface"""
+    rag_system = EnhancedEventMatcher()
     def process_query(message: str, history: list) -> Tuple[str, list]:
         """Process query and generate response"""