import json
import logging
from pathlib import Path
from typing import Dict, List

import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class CourseSearchSystem:
    """Semantic search over a course catalogue using sentence embeddings.

    Courses are loaded into a DataFrame; each course's title and categories
    are embedded once, and queries are ranked by cosine similarity against
    those embeddings.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the sentence-transformer model and configure logging.

        Args:
            model_name: Name of the SentenceTransformer model to load.
        """
        self.model = SentenceTransformer(model_name)
        self.courses_df = None
        self.embeddings = None
        self.setup_logging()

    def setup_logging(self):
        """Configure root logging (file + stream) and create ``self.logger``."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('search_system.log'),
                logging.StreamHandler(),
            ],
        )
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _build_search_text(row: pd.Series) -> str:
        """Return the text that is embedded for one course row."""
        categories = row['categories']
        # A missing/null categories entry surfaces as NaN or None; treat it
        # as "no categories" instead of failing inside str.join.
        if not isinstance(categories, (list, tuple)):
            categories = []
        return f"{row['title']} {' '.join(map(str, categories))}"

    def load_courses(self, courses_data: List[Dict]):
        """Build the course table and pre-compute one embedding per course.

        Args:
            courses_data: Course records; each needs at least ``title`` and
                ``categories`` (a list of strings).

        Raises:
            ValueError: If ``courses_data`` is empty.
        """
        if not courses_data:
            raise ValueError("No course data provided")

        courses_df = pd.DataFrame(courses_data)
        courses_df['search_text'] = courses_df.apply(
            self._build_search_text, axis=1
        )

        self.logger.info("Generating course embeddings...")
        embeddings = self.model.encode(
            courses_df['search_text'].tolist(),
            convert_to_tensor=True,
        )
        # Publish both attributes together so a failure above never leaves
        # the instance with a table but no embeddings.
        self.courses_df = courses_df
        self.embeddings = embeddings
        self.logger.info("Embeddings generated successfully")

    def search(self, query: str, top_k: int = 5) -> pd.DataFrame:
        """Return the ``top_k`` courses most similar to ``query``.

        Args:
            query: Free-text search query.
            top_k: Maximum number of rows to return. Values larger than the
                catalogue return every course; values <= 0 return no rows.

        Returns:
            A copy of the matching course rows, best match first, with an
            added ``similarity_score`` column (cosine similarity).

        Raises:
            RuntimeError: If ``load_courses`` has not been called yet.
        """
        if self.courses_df is None or self.embeddings is None:
            raise RuntimeError(
                "Course data has not been loaded; call load_courses() first."
            )

        query_embedding = self.model.encode(query, convert_to_tensor=True)
        similarities = cosine_similarity(
            query_embedding.cpu().numpy().reshape(1, -1),
            self.embeddings.cpu().numpy(),
        )[0]

        # Clamp so that top_k == 0 yields nothing; a plain [-top_k:] slice
        # would return the whole catalogue for 0.
        limit = max(0, min(int(top_k), len(similarities)))
        top_indices = np.argsort(similarities)[::-1][:limit]

        results = self.courses_df.iloc[top_indices].copy()
        results['similarity_score'] = similarities[top_indices]
        return results


@st.cache_resource
def _build_search_system(courses_path: str, modified_ns: int) -> CourseSearchSystem:
    """Create and cache a search system for the given course file.

    Streamlit reruns the script on every interaction; caching avoids
    reloading the model and re-embedding the catalogue each time.
    ``modified_ns`` is not used in the body: it is part of the cache key so
    that a rewritten course file triggers a rebuild.
    """
    with open(courses_path, 'r', encoding='utf-8') as f:
        courses = json.load(f)
    search_system = CourseSearchSystem()
    search_system.load_courses(courses)
    return search_system


def load_search_system() -> CourseSearchSystem:
    """Return the ready-to-use search system, or stop the app with an error.

    Reads ``courses.json`` from the working directory. If the file is missing
    or loading fails, an error is shown in the UI and the script run stops.
    """
    courses_file = Path('courses.json')
    if not courses_file.exists():
        st.error("Course data not found. Please run the scraper first.")
        st.stop()

    try:
        return _build_search_system(
            str(courses_file), courses_file.stat().st_mtime_ns
        )
    except Exception as e:
        st.error(f"Error loading course data: {str(e)}")
        st.stop()


def render_course_card(course: pd.Series):
    """Render one course as a two-column card (image | details).

    Args:
        course: A course row. ``similarity_score`` is optional and, when
            present, is shown as a relevance bar.
    """
    with st.container():
        col1, col2 = st.columns([1, 3])

        with col1:
            image_url = course.get('image_url')
            # Missing values arrive as None/NaN; NaN is truthy, so check the
            # type explicitly before handing the value to st.image.
            if isinstance(image_url, str) and image_url.strip():
                st.image(image_url, width=200)
            else:
                st.image("https://via.placeholder.com/200x150", width=200)

        with col2:
            st.markdown(f"### [{course['title']}]({course['url']})")

            # Categories
            categories = course['categories']
            if isinstance(categories, (list, tuple)) and categories:
                st.markdown(
                    "**Categories:** " + ", ".join(map(str, categories))
                )

            # Course details
            cols = st.columns(3)
            with cols[0]:
                st.metric("Lessons", course['lesson_count'])
            with cols[1]:
                st.metric("Reviews", course['rating_count'])
            with cols[2]:
                st.metric("Price", course['price'])

            # Similarity score if available
            if 'similarity_score' in course:
                score = float(course['similarity_score'])
                # Cosine similarity lies in [-1, 1] but st.progress only
                # accepts floats in [0, 1]; clamp to avoid an API error.
                st.progress(min(max(score, 0.0), 1.0))
                st.caption(f"Relevance: {score:.1%}")


def _apply_filters(
    results: pd.DataFrame,
    selected_categories: List[str],
    show_only_free: bool,
) -> pd.DataFrame:
    """Return the rows of ``results`` that match the sidebar filters."""
    if selected_categories:
        wanted = set(selected_categories)
        category_mask = results['categories'].map(
            lambda cats: isinstance(cats, (list, tuple, set))
            and not wanted.isdisjoint(cats)
        ).astype(bool)  # keep a boolean mask even when there are no rows
        results = results[category_mask]

    if show_only_free:
        # astype(str) + na=False keep the mask boolean when price is
        # missing or not a string.
        free_mask = results['price'].astype(str).str.contains(
            'Free', case=False, na=False
        )
        results = results[free_mask]

    return results


def main():
    """Streamlit entry point: page setup, search controls and result list."""
    st.set_page_config(
        page_title="Analytics Vidhya Course Search",
        page_icon="📚",
        layout="wide",
    )

    # Header
    st.title("📚 Analytics Vidhya Course Search")
    st.markdown("""
    Find the perfect course for your learning journey! This smart search system helps you discover
    relevant courses from Analytics Vidhya's free course catalog.
    """)

    search_system = load_search_system()
    courses_df = search_system.courses_df

    # Search UI
    with st.container():
        col1, col2 = st.columns([3, 1])
        with col1:
            search_query = st.text_input(
                "🔍 What would you like to learn?",
                placeholder="E.g., 'machine learning', 'python', 'data science'",
            )
        with col2:
            num_results = st.slider("Number of results", 1, 10, 5)

    # Filters
    with st.expander("Advanced Filters"):
        col1, col2 = st.columns(2)
        with col1:
            all_categories = {
                category
                for cats in courses_df['categories'].tolist()
                if isinstance(cats, (list, tuple))
                for category in cats
            }
            selected_categories = st.multiselect(
                "Filter by Category",
                sorted(all_categories),
            )
        with col2:
            show_only_free = st.checkbox("Show Only Free Courses", value=True)

    no_match_message = (
        "No courses found matching your criteria. "
        "Try adjusting your search or filters."
    )

    # Search results
    search_query = search_query.strip()
    if search_query:
        # Rank the whole catalogue, filter, then truncate. Truncating before
        # filtering would return fewer courses than requested even when
        # enough matching courses exist.
        ranked = search_system.search(search_query, top_k=len(courses_df))
        results = _apply_filters(
            ranked, selected_categories, show_only_free
        ).head(num_results)

        if not results.empty:
            st.markdown(f"### 🎯 Found {len(results)} relevant courses")

            # Display results
            for _, course in results.iterrows():
                render_course_card(course)
                st.divider()
        else:
            st.info(no_match_message)
    else:
        # Display all courses when no search query
        st.markdown("### 📚 All Available Courses")
        results = _apply_filters(
            courses_df, selected_categories, show_only_free
        )

        if results.empty:
            st.info(no_match_message)

        for _, course in results.iterrows():
            render_course_card(course)
            st.divider()


if __name__ == "__main__":
    main()