| | import streamlit as st |
| | import pandas as pd |
| | import numpy as np |
| | from sklearn.feature_extraction.text import TfidfVectorizer |
| | from sklearn.metrics.pairwise import cosine_similarity |
| | from sklearn.model_selection import train_test_split |
| | from sklearn.metrics import mean_squared_error |
| |
|
| | |
@st.cache_data
def load_data(movies_path='movies_metadata.csv', ratings_path='ratings_small.csv',
              sample_size=15000, random_state=42):
    """Load the movie metadata and the ratings table from CSV files.

    The metadata is down-sampled to keep later similarity computations
    tractable, then lightly cleaned.

    Args:
        movies_path: Path of the movie metadata CSV.
        ratings_path: Path of the ratings CSV.
        sample_size: Maximum number of metadata rows to keep.
        random_state: Seed for the row sample, so reruns pick the same rows.

    Returns:
        tuple: ``(movies, ratings)`` DataFrames. ``movies`` has a fresh
        ``0..n-1`` index, missing ``overview`` values replaced by ``''`` and
        ``id`` converted to nullable ``Int64`` (unparseable ids become
        ``<NA>``). ``ratings`` is returned as read.
    """
    movies = pd.read_csv(movies_path, low_memory=False)
    # DataFrame.sample raises ValueError when n exceeds the number of rows,
    # so never ask for more rows than the file contains.
    movies = movies.sample(n=min(sample_size, len(movies)), random_state=random_state)
    # Sampling keeps the original row labels. Reset them so that index labels
    # equal row positions, which is what positional matrix lookups expect.
    movies = movies.reset_index(drop=True)
    ratings = pd.read_csv(ratings_path)
    movies['overview'] = movies['overview'].fillna('')
    movies['id'] = pd.to_numeric(movies['id'], errors='coerce').astype('Int64')
    return movies, ratings
| |
|
| | |
@st.cache_data
def compute_content_based_matrix(movies):
    """Build the genre TF-IDF matrix and the movie-to-movie similarity matrix.

    Args:
        movies: DataFrame with at least ``genres`` and ``title`` columns.
            It is not modified.

    Returns:
        tuple: ``(tfidf_matrix, similarity_matrix, title_to_index)`` where
        ``tfidf_matrix`` has one row per movie, ``similarity_matrix`` holds
        the pairwise cosine similarities of those rows (n_movies x n_movies),
        and ``title_to_index`` is a Series mapping each title to the row
        position of that movie in both matrices.
    """
    # Work on a derived Series instead of adding a column to the caller's
    # frame. Missing genres become an empty document rather than crashing on
    # ``float.split``.
    genres_text = (
        movies['genres']
        .fillna('')
        .astype(str)
        .str.replace('|', ' ', regex=False)
    )
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(genres_text)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    # Map titles to row *positions*: the matrices are positional, while
    # ``movies.index`` may carry arbitrary labels (e.g. after sampling).
    title_to_index = pd.Series(np.arange(len(movies)), index=movies['title'])
    return tfidf_matrix, similarity_matrix, title_to_index
| |
|
| | |
@st.cache_data
def compute_user_profiles(ratings, movies):
    """Build a genre-based taste vector for every user in the training split.

    The ratings are split 80/20 into train and test. Each user's profile is
    the rating-weighted average of the TF-IDF genre vectors of the movies
    that user rated in the training split and that exist in ``movies``.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
        movies: DataFrame with ``id`` and ``genres`` columns.

    Returns:
        tuple: ``(user_profiles, tfidf_matrix, movie_id_to_idx,
        train_ratings, test_ratings)``. ``user_profiles`` maps user id to a
        1-D numpy vector; users with no rated movie present in ``movies``
        are omitted. ``movie_id_to_idx`` maps a movie id to its row position
        in ``tfidf_matrix``.
    """
    train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies['genres'].fillna(''))
    # NOTE(review): ``movieId`` from the ratings is matched directly against
    # ``movies['id']``; if the two files use different id spaces a mapping
    # table is needed first -- confirm against the datasets.
    # Missing ids can never match a rating, so they are left out of the map.
    movie_id_to_idx = {
        mid: idx for idx, mid in enumerate(movies['id']) if pd.notna(mid)
    }

    def build_user_profile(ratings_df, tfidf_matrix, movie_id_to_idx):
        """Return ``{user_id: rating-weighted mean TF-IDF vector}``."""
        user_profiles = {}
        for user_id, group in ratings_df.groupby('userId'):
            # Keep each rating paired with its movie row so that dropping
            # unknown movies cannot shift ratings onto the wrong movie.
            matched = [
                (movie_id_to_idx[movie_id], rating)
                for movie_id, rating in zip(group['movieId'].values, group['rating'].values)
                if movie_id in movie_id_to_idx
            ]
            if not matched:
                continue
            rows, weights = zip(*matched)
            weights = np.asarray(weights, dtype=float)
            # One sparse product replaces densifying every row separately:
            # (features x k) . (k,) -> rating-weighted sum of the k rows.
            weighted_vectors = np.asarray(
                tfidf_matrix[list(rows)].T.dot(weights)
            ).ravel()
            # Normalise by the ratings that actually contributed.
            rating_sum = weights.sum()
            user_profiles[user_id] = (
                weighted_vectors / rating_sum if rating_sum > 0 else weighted_vectors
            )
        return user_profiles

    user_profiles = build_user_profile(train_ratings, tfidf_matrix, movie_id_to_idx)
    return user_profiles, tfidf_matrix, movie_id_to_idx, train_ratings, test_ratings
| |
|
| | |
def get_similar_movies(title, similarity_matrix, title_to_index, movies, N=5):
    """Return the ``N`` movies most similar to ``title``.

    Args:
        title: Title to look up in ``title_to_index``.
        similarity_matrix: Square array of pairwise similarities, indexed by
            row position.
        title_to_index: Mapping (e.g. a pandas Series) from title to row
            position in ``similarity_matrix`` / ``movies``.
        movies: DataFrame with a ``title`` column, in the same row order as
            ``similarity_matrix``.
        N: Number of neighbours to return; values below 1 yield an empty list.

    Returns:
        list[tuple] | None: ``(title, similarity)`` pairs ordered from most
        to least similar, never including the queried row itself, or ``None``
        when ``title`` is unknown.
    """
    # Only the lookup can legitimately miss; keep the try block that narrow.
    try:
        lookup = title_to_index[title]
    except KeyError:
        return None

    # A title shared by several movies makes the lookup return several
    # positions; use the first one instead of failing on a non-scalar index.
    index = int(np.ravel(lookup)[0])
    similarity_scores = np.asarray(similarity_matrix[index]).ravel()

    # Sort descending with a stable sort, then drop the query row explicitly.
    # Assuming it sorts first breaks when other movies tie at the top score.
    ranked = np.argsort(-similarity_scores, kind='stable')
    ranked = ranked[ranked != index][:max(N, 0)]

    similar_movies = movies['title'].iloc[ranked]
    similar_scores = similarity_scores[ranked]
    return list(zip(similar_movies, similar_scores))
| |
|
| | |
def get_top_n_recommendations(user_id, user_profiles, tfidf_matrix, movie_id_to_idx, movies, train_ratings, n=5):
    """Recommend the ``n`` unrated movies closest to a user's profile.

    Args:
        user_id: Key into ``user_profiles``.
        user_profiles: Mapping from user id to a 1-D profile vector.
        tfidf_matrix: Movie feature matrix, one row per row of ``movies``.
        movie_id_to_idx: Unused here; kept so existing callers keep working.
        movies: DataFrame with ``id`` and ``title`` columns.
        train_ratings: DataFrame with ``userId`` and ``movieId`` columns;
            movies the user rated there are excluded from the result.
        n: Maximum number of recommendations; values below 1 yield ``[]``.

    Returns:
        list[tuple] | None: ``(title, predicted_rating)`` pairs, best first,
        where the predicted rating is ``1 + 4 * cosine_similarity``. ``None``
        when ``user_id`` has no profile.
    """
    if user_id not in user_profiles:
        return None

    user_profile = user_profiles[user_id]
    similarities = cosine_similarity(user_profile.reshape(1, -1), tfidf_matrix).flatten()
    ranked_indices = np.argsort(similarities)[::-1]
    rated_movies = set(
        train_ratings.loc[train_ratings['userId'] == user_id, 'movieId'].values
    )

    # Pull the columns out once: a per-candidate ``.iloc`` lookup over the
    # whole catalogue is far slower than indexing a plain list.
    movie_ids = movies['id'].tolist()
    titles = movies['title'].tolist()

    recommendations = []
    if n <= 0:
        return recommendations
    for idx in ranked_indices:
        if movie_ids[idx] in rated_movies:
            continue
        recommendations.append((titles[idx], 1 + 4 * similarities[idx]))
        # Candidates are already sorted best-first, so stop as soon as
        # enough have been collected.
        if len(recommendations) >= n:
            break
    return recommendations
| |
|
| | |
# --- Page header ------------------------------------------------------------
st.title("🎥 Movie Recommender System")
st.write("Pick a way to find awesome movies! Either choose a movie you like or enter your user ID for personalized picks.")

# Data shared by both recommendation modes.
movies, ratings = load_data()

# The sidebar switch decides which of the two flows below is rendered.
recommendation_type = st.sidebar.selectbox(
    "Choose Recommendation Type",
    ["Content-Based", "User Profile-Based"],
)

if recommendation_type == "Content-Based":
    # --- Flow 1: movies similar to a chosen title ---------------------------
    st.header("Content-Based Movie Recommendations")
    st.write("Enter a movie title to find similar movies based on genres.")

    tfidf_matrix, similarity_matrix, title_to_index = compute_content_based_matrix(movies)

    # The leading empty entry stands for "nothing selected yet".
    title_options = [""]
    title_options.extend(movies['title'].dropna().unique())
    movie_title = st.selectbox("Select a Movie", options=title_options)

    if movie_title:
        recommendations = get_similar_movies(
            movie_title, similarity_matrix, title_to_index, movies, N=5
        )
        if not recommendations:
            st.error(f"Oops! Movie '{movie_title}' not found. Try another title!")
        else:
            st.write(f"**Movies similar to '{movie_title}':**")
            for rank, (similar_title, similarity) in enumerate(recommendations, start=1):
                st.write(f"{rank}. {similar_title} (Similarity Score: {similarity:.2f})")

else:
    # --- Flow 2: picks based on a user's rating history ---------------------
    st.header("User Profile-Based Movie Recommendations")
    st.write("Enter your user ID to get personalized movie picks based on your ratings.")

    user_profiles, tfidf_matrix, movie_id_to_idx, train_ratings, test_ratings = compute_user_profiles(ratings, movies)

    user_id = st.number_input("Enter User ID", min_value=1, step=1, value=1)

    # Recommendations are only computed once the button is pressed.
    if st.button("Get Recommendations"):
        recommendations = get_top_n_recommendations(
            user_id, user_profiles, tfidf_matrix, movie_id_to_idx, movies, train_ratings, n=5
        )
        if not recommendations:
            st.error(f"Oops! User ID {user_id} not found or hasn't rated enough movies. Try another ID!")
        else:
            st.write(f"**Top 5 recommendations for User {user_id}:**")
            for rank, (recommended_title, predicted) in enumerate(recommendations, start=1):
                st.write(f"{rank}. {recommended_title} (Predicted Rating: {predicted:.2f})")