Spaces:

nxyan
/

analytics-vidhya-course-search

Sleeping

File size: 6,443 Bytes

import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
from typing import List, Dict
import logging
from pathlib import Path

class CourseSearchSystem:
    """Semantic search over a course catalogue using sentence embeddings.

    Every course is represented by the embedding of its title followed by its
    categories. A query is embedded with the same model and courses are
    ranked by cosine similarity to it.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the embedding model and configure logging.

        Args:
            model_name: Model identifier handed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        # Both stay None until load_courses() has completed successfully.
        self.courses_df = None
        self.embeddings = None
        self.setup_logging()

    def setup_logging(self):
        """Configure root logging (file + stream) and create ``self.logger``."""
        # logging.basicConfig does nothing when the root logger already has
        # handlers, so building several instances does not stack handlers.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('search_system.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _build_search_text(title, categories) -> str:
        """Return the text that is embedded for one course."""
        if isinstance(categories, str):
            joined = categories
        elif isinstance(categories, (list, tuple)):
            joined = ' '.join(str(cat) for cat in categories)
        else:
            # Missing value (None / NaN): embed the title alone.
            joined = ''
        return f"{title} {joined}"

    def load_courses(self, courses_data: List[Dict]):
        """Load course records and pre-compute their embeddings.

        Args:
            courses_data: Course records; each needs a ``title`` and a
                ``categories`` entry (a list of strings; missing values are
                tolerated and treated as "no categories").

        Raises:
            ValueError: If ``courses_data`` is empty.
            KeyError: If the records have no ``title`` or ``categories`` field.
        """
        if not courses_data:
            raise ValueError("No course data provided")

        courses_df = pd.DataFrame(courses_data)
        courses_df['search_text'] = [
            self._build_search_text(title, categories)
            for title, categories in zip(
                courses_df['title'], courses_df['categories']
            )
        ]

        self.logger.info("Generating course embeddings...")
        embeddings = self.model.encode(
            courses_df['search_text'].tolist(),
            convert_to_tensor=True
        )

        # Publish both attributes together so a failed encode never leaves the
        # instance with courses but no embeddings.
        self.courses_df = courses_df
        self.embeddings = embeddings
        self.logger.info("Embeddings generated successfully")

    def search(self, query: str, top_k: int = 5) -> pd.DataFrame:
        """Return the ``top_k`` courses most similar to ``query``.

        Args:
            query: Free-text search query.
            top_k: Maximum number of rows to return. Values larger than the
                catalogue are capped; values <= 0 yield an empty result.

        Returns:
            A copy of the matching catalogue rows, ordered by descending
            similarity, with an added ``similarity_score`` column.

        Raises:
            RuntimeError: If called before ``load_courses``.
        """
        if self.courses_df is None or self.embeddings is None:
            raise RuntimeError("No courses loaded; call load_courses() first.")

        top_k = min(int(top_k), len(self.courses_df))
        if top_k <= 0:
            # A slice of [-0:] would select *every* row, so handle the
            # degenerate case explicitly instead of falling through.
            empty = self.courses_df.iloc[0:0].copy()
            empty['similarity_score'] = np.empty(0, dtype=float)
            return empty

        query_embedding = self.model.encode(query, convert_to_tensor=True)

        similarities = cosine_similarity(
            query_embedding.cpu().numpy().reshape(1, -1),
            self.embeddings.cpu().numpy()
        )[0]

        # argsort is ascending: take the last top_k entries, best first.
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        results = self.courses_df.iloc[top_indices].copy()
        results['similarity_score'] = similarities[top_indices]

        return results

def _build_search_system(courses_path: str):
    """Create a CourseSearchSystem populated from the JSON file at *courses_path*."""
    with open(courses_path, 'r', encoding='utf-8') as f:
        courses = json.load(f)

    search_system = CourseSearchSystem()
    search_system.load_courses(courses)
    return search_system


def load_search_system():
    """Return the ready-to-use search system, building it at most once.

    Streamlit re-executes the whole script on every widget interaction.
    Without caching, each rerun would reload the embedding model and re-embed
    the full catalogue, so the built system is kept in Streamlit's resource
    cache and shared across reruns. Because of that, changes to
    ``courses.json`` are only picked up after the cache is cleared or the app
    restarts.

    Returns:
        The cached ``CourseSearchSystem``. The object is shared, so callers
        should treat it as read-only.

    If the data file is missing or loading fails, an error is shown in the UI
    and the script run is halted with ``st.stop()``.
    """
    courses_file = Path('courses.json')
    if not courses_file.exists():
        st.error("Course data not found. Please run the scraper first.")
        st.stop()

    try:
        # Wrapping on each rerun is fine: Streamlit identifies the cache entry
        # by the wrapped function and its arguments, not by the wrapper object.
        cached_build = st.cache_resource(_build_search_system)
        return cached_build(str(courses_file))
    except Exception as e:
        st.error(f"Error loading course data: {str(e)}")
        st.stop()

def render_course_card(course: pd.Series):
    """Render one course as a two-column card (image | details).

    Args:
        course: A row of the course table. Reads ``image_url``, ``title``,
            ``url``, ``categories``, ``lesson_count``, ``rating_count`` and
            ``price``; ``similarity_score`` is shown only when present.
    """
    with st.container():
        col1, col2 = st.columns([1, 3])

        with col1:
            image_url = course['image_url']
            # A missing value arrives as NaN, which is truthy, so test for a
            # real non-empty string rather than relying on truthiness.
            if isinstance(image_url, str) and image_url.strip():
                st.image(image_url, width=200)
            else:
                st.image("https://via.placeholder.com/200x150", width=200)

        with col2:
            st.markdown(f"### [{course['title']}]({course['url']})")

            # Categories (skipped when missing or not a list)
            categories = course['categories']
            if isinstance(categories, (list, tuple)) and categories:
                st.markdown("**Categories:** " + ", ".join(categories))

            # Course details
            cols = st.columns(3)
            with cols[0]:
                st.metric("Lessons", course['lesson_count'])
            with cols[1]:
                st.metric("Reviews", course['rating_count'])
            with cols[2]:
                st.metric("Price", course['price'])

            # Similarity score if available
            if 'similarity_score' in course:
                score = float(course['similarity_score'])
                # Cosine similarity lies in [-1, 1] (and may overshoot 1 by a
                # rounding error), but st.progress only accepts floats in
                # [0.0, 1.0]; clamp the bar and keep the caption unclamped.
                st.progress(min(max(score, 0.0), 1.0))
                st.caption(f"Relevance: {score:.1%}")

def _collect_categories(courses_df: pd.DataFrame) -> list:
    """Return the sorted set of category names appearing in *courses_df*."""
    names = set()
    for cats in courses_df['categories']:
        # Skip missing values (None / NaN) instead of failing on them.
        if isinstance(cats, (list, tuple)):
            names.update(cats)
    return sorted(names)


def _filter_courses(courses: pd.DataFrame, selected_categories, show_only_free: bool) -> pd.DataFrame:
    """Return the rows of *courses* that satisfy the advanced filters."""
    if selected_categories:
        wanted = set(selected_categories)
        in_category = courses['categories'].apply(
            lambda cats: isinstance(cats, (list, tuple)) and not wanted.isdisjoint(cats)
        ).astype(bool)
        courses = courses[in_category]

    if show_only_free:
        # astype(str)/na=False keep the mask strictly boolean even when a
        # price is missing or not a string; a NaN in the mask would raise.
        is_free = courses['price'].astype(str).str.contains(
            'Free', case=False, na=False
        )
        courses = courses[is_free]

    return courses


def _render_courses(courses: pd.DataFrame):
    """Render one card per row, each followed by a divider."""
    for _, course in courses.iterrows():
        render_course_card(course)
        st.divider()


def main():
    """Build the Streamlit page: header, search box, filters and results."""
    st.set_page_config(
        page_title="Analytics Vidhya Course Search",
        page_icon="📚",
        layout="wide"
    )

    # Header
    st.title("📚 Analytics Vidhya Course Search")
    st.markdown("""
    Find the perfect course for your learning journey! This smart search system helps you discover 
    relevant courses from Analytics Vidhya's free course catalog.
    """)

    search_system = load_search_system()

    # Search UI
    with st.container():
        col1, col2 = st.columns([3, 1])
        with col1:
            search_query = st.text_input(
                "🔍 What would you like to learn?",
                placeholder="E.g., 'machine learning', 'python', 'data science'"
            )
        with col2:
            num_results = st.slider("Number of results", 1, 10, 5)

    # Filters
    with st.expander("Advanced Filters"):
        col1, col2 = st.columns(2)
        with col1:
            selected_categories = st.multiselect(
                "Filter by Category",
                _collect_categories(search_system.courses_df)
            )

        with col2:
            show_only_free = st.checkbox("Show Only Free Courses", value=True)

    no_match_message = (
        "No courses found matching your criteria. "
        "Try adjusting your search or filters."
    )

    # A whitespace-only query is treated as "no query".
    query = search_query.strip()

    # Search results
    if query:
        # Rank the whole catalogue, filter, and only then cut to the requested
        # size. Filtering after the top-k cut would drop matching courses that
        # rank below the cut and return fewer results than asked for.
        ranked = search_system.search(
            query, top_k=len(search_system.courses_df)
        )
        results = _filter_courses(
            ranked, selected_categories, show_only_free
        ).head(num_results)

        if len(results) > 0:
            st.markdown(f"### 🎯 Found {len(results)} relevant courses")
            _render_courses(results)
        else:
            st.info(no_match_message)
    else:
        # Display all courses when no search query
        st.markdown("### 📚 All Available Courses")
        results = _filter_courses(
            search_system.courses_df, selected_categories, show_only_free
        )

        if len(results) > 0:
            _render_courses(results)
        else:
            st.info(no_match_message)

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()