import json
import logging
from pathlib import Path
from typing import List, Dict

import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class CourseSearchSystem:
    """Semantic search over a course catalog using sentence embeddings.

    ``load_courses`` embeds every course once; ``search`` then embeds the
    query and ranks the courses by cosine similarity to it.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the embedding model and set up logging.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        # Both stay None until load_courses() has indexed a catalog.
        self.courses_df = None
        self.embeddings = None
        self.setup_logging()

    def setup_logging(self):
        """Configure root logging (log file + stream) and set ``self.logger``."""
        # basicConfig() ignores its arguments once the root logger has
        # handlers, but the FileHandler below would still be constructed (and
        # the log file opened) on every call. Skip the setup entirely when
        # logging is already configured.
        if not logging.getLogger().handlers:
            logging.basicConfig(
                level=logging.INFO,
                format='%(asctime)s - %(levelname)s - %(message)s',
                handlers=[
                    logging.FileHandler('search_system.log'),
                    logging.StreamHandler(),
                ],
            )
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _join_categories(categories) -> str:
        """Return *categories* as one space-separated string ('' if absent)."""
        if isinstance(categories, str):
            # A bare string is one category; joining it would split it into
            # single characters.
            return categories
        if isinstance(categories, (list, tuple, set)):
            return ' '.join(str(category) for category in categories)
        # None / NaN: the record has no categories.
        return ''

    @staticmethod
    def _top_indices(similarities: np.ndarray, top_k: int) -> np.ndarray:
        """Return the indices of the ``top_k`` highest scores, best first.

        ``top_k`` is clamped to ``[0, len(similarities)]``; a plain
        ``[-top_k:]`` slice would return *every* index for ``top_k == 0``.
        """
        count = max(0, min(int(top_k), len(similarities)))
        return np.argsort(similarities)[::-1][:count]

    def load_courses(self, courses_data: List[Dict]):
        """Build the catalog DataFrame and embed every course.

        The text embedded for each course is its title followed by its
        categories. Records whose ``categories`` value is missing (None/NaN)
        are indexed by title alone.

        Args:
            courses_data: Course records; each is expected to provide
                ``title`` and ``categories``.

        Raises:
            ValueError: If ``courses_data`` is empty.
            KeyError: If no record provides ``title`` or ``categories``.
        """
        if not courses_data:
            raise ValueError("No course data to index")

        courses = pd.DataFrame(courses_data)
        courses['search_text'] = [
            f"{title} {self._join_categories(categories)}"
            for title, categories in zip(courses['title'], courses['categories'])
        ]

        self.logger.info("Generating course embeddings...")
        embeddings = self.model.encode(
            courses['search_text'].tolist(),
            convert_to_tensor=True,
        )
        # Assign both attributes only after encoding succeeded, so a failure
        # cannot leave a catalog without matching embeddings.
        self.courses_df = courses
        self.embeddings = embeddings
        self.logger.info("Embeddings generated successfully")

    def search(self, query: str, top_k: int = 5) -> pd.DataFrame:
        """Return the ``top_k`` courses most similar to ``query``.

        Args:
            query: Free-text search query.
            top_k: Maximum number of rows to return; values below 1 give an
                empty result, values above the catalog size return all rows.

        Returns:
            A copy of the matching catalog rows, best match first, with an
            added ``similarity_score`` column (cosine similarity).

        Raises:
            RuntimeError: If ``load_courses`` has not been called yet.
        """
        if self.courses_df is None or self.embeddings is None:
            raise RuntimeError(
                "No courses loaded; call load_courses() before search()"
            )

        query_embedding = self.model.encode(query, convert_to_tensor=True)
        similarities = cosine_similarity(
            query_embedding.cpu().numpy().reshape(1, -1),
            self.embeddings.cpu().numpy(),
        )[0]

        top_indices = self._top_indices(similarities, top_k)
        results = self.courses_df.iloc[top_indices].copy()
        results['similarity_score'] = similarities[top_indices]
        return results


@st.cache_resource(show_spinner=False)
def _build_search_system(courses_path: str, modified_ns: int):
    """Build a search system from the JSON course file at ``courses_path``.

    Cached by Streamlit so the embedding model is loaded and the catalog is
    embedded once, not on every script rerun. ``modified_ns`` is not read
    here; it is part of the cache key so that a rewritten course file
    produces a fresh index.
    """
    with open(courses_path, 'r', encoding='utf-8') as f:
        courses = json.load(f)

    search_system = CourseSearchSystem()
    search_system.load_courses(courses)
    return search_system


def load_search_system():
    """Return the ready-to-query search system for ``courses.json``.

    Shows an error in the page and stops the script run when the course file
    is missing or cannot be loaded, so callers always receive a usable
    system. The returned object is shared between reruns and must be treated
    as read-only.
    """
    courses_file = Path('courses.json')
    if not courses_file.exists():
        st.error("Course data not found. Please run the scraper first.")
        st.stop()

    try:
        return _build_search_system(
            str(courses_file),
            courses_file.stat().st_mtime_ns,
        )
    except Exception as e:
        # Top-level UI boundary: report the failure in the page instead of
        # letting the script crash with a traceback.
        st.error(f"Error loading course data: {str(e)}")
        st.stop()


def render_course_card(course: pd.Series):
    """Render one course as a card: thumbnail on the left, details on the right.

    Args:
        course: A catalog row. ``title``, ``url``, ``lesson_count``,
            ``rating_count`` and ``price`` are read directly; ``image_url``,
            ``categories`` and ``similarity_score`` are optional.
    """
    with st.container():
        image_col, details_col = st.columns([1, 3])

        with image_col:
            image_url = course.get('image_url')
            # A missing value arrives from pandas as NaN, which is truthy, so
            # test for a non-empty string rather than plain truthiness.
            if isinstance(image_url, str) and image_url.strip():
                st.image(image_url, width=200)
            else:
                st.image("https://via.placeholder.com/200x150", width=200)

        with details_col:
            st.markdown(f"### [{course['title']}]({course['url']})")

            categories = course.get('categories')
            if isinstance(categories, (list, tuple)) and categories:
                st.markdown(
                    "**Categories:** " + ", ".join(map(str, categories))
                )

            lessons_col, reviews_col, price_col = st.columns(3)
            with lessons_col:
                st.metric("Lessons", course['lesson_count'])
            with reviews_col:
                st.metric("Reviews", course['rating_count'])
            with price_col:
                st.metric("Price", course['price'])

            # Only search results carry a score; plain catalog rows do not.
            if 'similarity_score' in course:
                score = float(course['similarity_score'])
                # Cosine similarity lies in [-1, 1] (and can exceed 1 by a
                # rounding error), while st.progress rejects floats outside
                # [0.0, 1.0]; clamp the bar but caption the real score.
                st.progress(min(max(score, 0.0), 1.0))
                st.caption(f"Relevance: {score:.1%}")


def _as_category_list(value) -> list:
    """Normalize a ``categories`` cell to a list ([] when missing)."""
    if isinstance(value, str):
        return [value]
    if isinstance(value, (list, tuple, set)):
        return list(value)
    # None / NaN: the course has no categories.
    return []


def _filter_courses(courses: pd.DataFrame, selected_categories, show_only_free: bool) -> pd.DataFrame:
    """Return the rows of ``courses`` that pass the "Advanced Filters" choices."""
    if selected_categories:
        wanted = set(selected_categories)
        in_category = courses['categories'].apply(
            lambda cats: not wanted.isdisjoint(_as_category_list(cats))
        ).astype(bool)
        courses = courses.loc[in_category]

    if show_only_free:
        # astype(str) keeps missing or numeric prices from producing NaN in
        # the mask, which boolean indexing would reject.
        is_free = (
            courses['price']
            .astype(str)
            .str.contains('Free', case=False, regex=False)
            .astype(bool)
        )
        courses = courses.loc[is_free]

    return courses


def _render_courses(courses: pd.DataFrame) -> None:
    """Render every row of ``courses`` as a card followed by a divider."""
    for _, course in courses.iterrows():
        render_course_card(course)
        st.divider()


def main():
    """Build the course-search page: header, query controls, filters, results."""
    # NOTE: the emoji in this function replace mis-decoded characters
    # ("π", "π―") found in the original strings; 🎯 is certain from the
    # surviving bytes, the others are a best guess - adjust if they differ
    # from the intended icons.
    st.set_page_config(
        page_title="Analytics Vidhya Course Search",
        page_icon="🎓",
        layout="wide",
    )

    st.title("🎓 Analytics Vidhya Course Search")
    st.markdown("""
    Find the perfect course for your learning journey! This smart search system helps you discover
    relevant courses from Analytics Vidhya's free course catalog.
    """)

    search_system = load_search_system()
    catalog = search_system.courses_df

    with st.container():
        query_col, count_col = st.columns([3, 1])
        with query_col:
            search_query = st.text_input(
                "🔍 What would you like to learn?",
                placeholder="E.g., 'machine learning', 'python', 'data science'",
            ).strip()
        with count_col:
            num_results = st.slider("Number of results", 1, 10, 5)

    with st.expander("Advanced Filters"):
        category_col, price_col = st.columns(2)
        with category_col:
            all_categories = {
                category
                for cats in catalog['categories']
                for category in _as_category_list(cats)
            }
            selected_categories = st.multiselect(
                "Filter by Category",
                sorted(all_categories),
            )
        with price_col:
            show_only_free = st.checkbox("Show Only Free Courses", value=True)

    if search_query:
        # Rank the whole catalog, filter, and only then truncate. Truncating
        # first would let non-matching courses use up the top-k slots and
        # show fewer results than requested.
        ranked = search_system.search(
            search_query,
            top_k=max(len(catalog), num_results),
        )
        results = _filter_courses(
            ranked, selected_categories, show_only_free
        ).head(num_results)

        if not results.empty:
            st.markdown(f"### 🎯 Found {len(results)} relevant courses")
            _render_courses(results)
        else:
            st.info("No courses found matching your criteria. Try adjusting your search or filters.")
    else:
        # No query: browse the (filtered) catalog in its stored order.
        st.markdown("### 📚 All Available Courses")
        _render_courses(
            _filter_courses(catalog, selected_categories, show_only_free)
        )


# Script entry point: build the page only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()