# "Spaces: Sleeping" — Hugging Face Spaces status banner captured when this
# file was scraped; not part of the program.
#!/usr/bin/env python | |
# coding: utf-8 | |
import re | |
import pandas as pd | |
import gradio as gr | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
import numpy as np | |
from ast import literal_eval | |
from datasets import load_dataset | |
from sklearn.metrics.pairwise import linear_kernel | |
from sklearn.cluster import KMeans | |
###################################Movies##################################
# TMDB metadata dump; the genres/companies/countries/languages columns hold
# stringified Python lists of dicts and are parsed below.
Movies = pd.read_csv('movies_metadata.csv')
# Title normalizer shared by all the search helpers.
def clean_title(title):
    """Strip every character except letters, digits and spaces from *title*."""
    return re.sub("[^a-zA-Z0-9 ]", "", title)
# Rows 19730, 29503, 35587 are malformed records in movies_metadata.csv
# (their `id` values are not numeric); dropping them up front keeps
# pd.to_numeric below from raising.  TODO(review): confirm these labels
# still match the shipped CSV version.
Movies = Movies.drop([19730, 29503, 35587])

# The four list-valued columns are stringified lists of dicts; parse each
# one and keep only the `name` field of every entry.  (Was four identical
# copy-pasted pipelines — consolidated into one loop.)
_LIST_COLUMNS = ['genres', 'production_companies', 'production_countries', 'spoken_languages']
for _col in _LIST_COLUMNS:
    Movies[_col] = (
        Movies[_col]
        .fillna('[]')
        .apply(literal_eval)
        .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
    )

# Normalized titles (alphanumerics + spaces) feed the TF-IDF title search.
Movies["clean_title"] = Movies["original_title"].apply(clean_title)
# Rename TMDB `id` so later merges/joins can use a common `movieId` key.
Movies = Movies.rename(columns={'id': 'movieId'})
Movies["movieId"] = pd.to_numeric(Movies["movieId"])

# Title search index: unigrams + bigrams over the cleaned titles.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(Movies["clean_title"])
###################################Rating##################################
# MovieLens-style ratings: userId, movieId, rating (timestamp unused).
ratings = pd.read_csv('ratings.csv')
ratings = ratings.drop('timestamp', axis=1)
# Sort by the second column — presumably movieId after the drop; verify
# against the CSV's column order.
sorted_ratings = ratings.sort_values(by=ratings.columns[1])
# links.csv maps MovieLens movieId -> TMDB tmdbId; imdbId is not needed.
links=pd.read_csv('links.csv')
links=links.drop('imdbId',axis=1)
ratings = pd.merge(sorted_ratings, links, on='movieId')
# NOTE(review): this SWAPS the two column names — after it,
# ratings["movieId"] actually holds TMDB ids, which is what makes later
# joins against Movies["movieId"] (renamed from TMDB `id`) line up.
ratings.rename(columns={'movieId': 'tmdbId', 'tmdbId': 'movieId'}, inplace=True)
###################################Collaborative################################## | |
# Title search used by the collaborative recommender.
def search(title):
    """Return the 5 movies whose cleaned titles best match *title*, best first."""
    cleaned = clean_title(title)
    query_vec = vectorizer.transform([cleaned])
    scores = cosine_similarity(query_vec, tfidf).flatten()
    # argpartition yields the five highest-scoring positions (order within
    # the five is unspecified); reversing mirrors the original ordering.
    top5 = np.argpartition(scores, -5)[-5:]
    return Movies.iloc[top5].iloc[::-1]
def find_similar_movies(movie_id):
    # Collaborative filter: users who rated `movie_id` above 3 are "similar";
    # recommend what more than 10% of them also rated above 3.
    # `movie_id` is a TMDB id (ratings' columns were swapped at load time).
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 3)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 3)]["movieId"]
    # Fraction of similar users who liked each movie.
    # NOTE(review): len(similar_users) can be 0 for an unrated movie_id,
    # which would divide by zero here — confirm inputs always have ratings.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    # Baseline popularity of those same movies across ALL users who liked them.
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 3)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    # Align the two Series on movieId (their shared index) into one frame.
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    # Merge with Movies DataFrame to get additional movie information
    rec_movies = rec_percentages.merge(Movies, left_index=True, right_on="movieId")
    # Extract the year from the release_date column
    rec_movies["year"] = pd.to_datetime(rec_movies["release_date"], errors="coerce").dt.year
    # Sort by the year column in descending order
    # NOTE(review): the similar/all scores are computed but never used for
    # ranking — the final pick is simply the two newest candidates.
    rec_movies = rec_movies.sort_values("year", ascending=False)
    return rec_movies.head(2)[["year", "title", "genres"]]
# Convenience wrapper: free-text title -> collaborative recommendations.
def recommend_movies(title):
    """Return collaborative recommendations for the best title match to *title*."""
    best_match = search(title).iloc[0]
    return find_similar_movies(best_match["movieId"])
###################################Cluster##################################
# Step 1: Preprocessing — one-hot encode each movie's genre list.
# NOTE(review): .stack() drops rows whose genre list is empty, so
# genre_matrix can have fewer rows than Movies; its rows are grouped by
# Movies' (non-contiguous) index labels — verify downstream alignment.
genre_matrix = pd.get_dummies(Movies['genres'].apply(pd.Series).stack()).groupby(level=0).sum()
# Step 2: Cluster Generation
num_clusters = 50  # desired number of genre clusters
# random_state pins centroid initialization so the cluster assignments (and
# therefore the recommendations) are reproducible across process restarts;
# previously they silently changed on every run.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(genre_matrix)
def search_cluster(title):
    """Title lookup for the cluster recommender: top-5 TF-IDF matches, best first."""
    cleaned = clean_title(title)
    query = vectorizer.transform([cleaned])
    # NOTE(review): the search space is capped at the first 43779 title rows —
    # presumably to match the size of another structure; confirm this bound.
    scores = cosine_similarity(query, tfidf[:43779]).flatten()
    best = np.argpartition(scores, -5)[-5:]
    return Movies.iloc[best].iloc[::-1]
# Step 3: Recommendation Generation
def cluster_based_recommender(title1, title2, title3):
    # Recommend up to 6 movies drawn from the KMeans genre-clusters that the
    # three input titles (exact `title` match) belong to, newest first.
    input_movie_indices = Movies.index[Movies['title'].isin([title1, title2, title3])]
    # NOTE(review): cluster_labels is a positional array over genre_matrix's
    # rows, but input_movie_indices holds DataFrame *labels* (non-contiguous
    # after the earlier row drops).  Indexing positionally with labels may
    # pick the wrong cluster or raise IndexError — verify the alignment.
    input_movie_cluster_labels = cluster_labels[input_movie_indices]
    similar_movies_indices = []
    for cluster_label in input_movie_cluster_labels:
        # All member positions of this cluster.
        cluster_movies_indices = np.where(cluster_labels == cluster_label)[0]
        similar_movies_indices.extend(cluster_movies_indices)
    # Drop the inputs themselves, then anything not present as a Movies label.
    similar_movies_indices = list(set(similar_movies_indices) - set(input_movie_indices))
    similar_movies_indices = [index for index in similar_movies_indices if index in Movies.index]
    similar_movies = Movies.loc[similar_movies_indices, ["release_date", "title", "genres"]]
    # Fixed random_state keeps the 6-movie sample reproducible between calls.
    if len(similar_movies) >= 6:
        recommended_movies = similar_movies.sample(n=6, random_state=25)
    else:
        recommended_movies = similar_movies
    # Extract the year from the release_date column
    recommended_movies["year"] = pd.to_datetime(recommended_movies["release_date"], errors="coerce").dt.year
    # Sort by the year column in descending order
    recommended_movies = recommended_movies.sort_values("year", ascending=False)
    return recommended_movies
###################################Content##################################
# Step 1: Preprocessing
# Combine relevant features into a single string
# (currently only genres: the list is joined into one space-separated string).
Movies['features'] = Movies['genres']
Movies['features'] = Movies['features'].apply(lambda x: ' '.join(x))
# Step 2: Create a TF-IDF matrix
tfidf_content = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_content.fit_transform(Movies['features'])
# Truncated to the first 25000 rows — presumably to bound the O(n^2) memory
# of the similarity matrix below; movies past row 25000 get no content scores.
tfidf_matrix = tfidf_matrix[:25000]
# Step 3: Compute the cosine similarity matrix
# linear_kernel equals cosine similarity here since TF-IDF rows are L2-normalized.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
def search_content(title):
    """Title lookup for the content recommender: top-5 TF-IDF matches, best first."""
    cleaned = clean_title(title)
    query = vectorizer.transform([cleaned])
    # NOTE(review): search space capped at the first 20000 title rows —
    # a different bound than the 25000-row similarity matrix; confirm intent.
    scores = cosine_similarity(query, tfidf[:20000]).flatten()
    best = np.argpartition(scores, -5)[-5:]
    return Movies.iloc[best].iloc[::-1]
### modified for search
def content_based_recommendation(title1, title2, title3, num_recommendations=6):
    """Recommend movies whose genre TF-IDF profile is closest to the inputs.

    Returns up to `num_recommendations` rows with columns [year, title,
    genres], newest first; an empty frame when no title matches or when all
    matches fall outside the truncated similarity matrix.
    """
    # Get the indices of the movies with the given titles
    matching_movies = Movies[Movies['title'].isin([title1, title2, title3])]
    if matching_movies.empty:
        return pd.DataFrame(columns=['year', 'title', 'genres'])
    # FIX: cosine_sim rows are POSITIONAL over the first 25000 rows of
    # Movies, while .index holds labels (non-contiguous after the earlier
    # row drops).  The original indexed cosine_sim with labels, which could
    # select the wrong rows or raise IndexError.  Convert labels to
    # positions and keep only those covered by the similarity matrix.
    positions = Movies.index.get_indexer(matching_movies.index)
    positions = positions[(positions >= 0) & (positions < cosine_sim.shape[0])]
    if len(positions) == 0:
        return pd.DataFrame(columns=['year', 'title', 'genres'])
    # Average the similarity rows of the matched movies.
    avg_sim_scores = np.mean(cosine_sim[positions], axis=0)
    # Highest-similarity positions first.
    top_indices = np.argsort(avg_sim_scores)[::-1][:num_recommendations]
    # .copy() so the year assignment below can't hit a chained-assignment warning.
    recommended_movies = Movies.iloc[top_indices][['title', 'genres', 'release_date']].copy()
    # Extract the year from the release_date column
    recommended_movies['year'] = pd.to_datetime(recommended_movies['release_date'], errors='coerce').dt.year
    # Sort by the year column in descending order
    recommended_movies = recommended_movies.sort_values('year', ascending=False)
    # Keep only the desired columns in the final output
    recommended_movies = recommended_movies[['year', 'title', 'genres']]
    return recommended_movies.head(num_recommendations)
###################################Predict##################################
import pandas as pd | |
def predict(title1, title2, title3, method):
    """Dispatch the three free-text titles to the chosen recommender.

    method is one of 'collaborative', 'cluster', or 'content'; anything else
    yields a one-column error DataFrame.
    """
    titles = (title1, title2, title3)
    if method == "collaborative":
        # Resolve each raw title to its best search match, then pool the
        # collaborative recommendations of all three into one frame.
        similar_movies = pd.concat(
            [find_similar_movies(search(t).iloc[0]["movieId"]) for t in titles],
            ignore_index=True,
        )
    elif method == "cluster":
        matched = [search_cluster(t).iloc[0]['original_title'] for t in titles]
        similar_movies = cluster_based_recommender(matched[0], matched[1], matched[2])
        similar_movies = similar_movies.head(6)[["title", "year", "genres"]]
    elif method == "content":
        matched = [search_content(t).iloc[0]['original_title'] for t in titles]
        similar_movies = content_based_recommendation(matched[0], matched[1], matched[2])
        similar_movies = similar_movies.head(6)[["title", "year", "genres"]]
    else:
        return pd.DataFrame({"Error": ["Invalid recommendation method. Choose either 'collaborative', 'cluster', or 'content'."]})
    return similar_movies
# Create the Gradio interface
# Three free-text title inputs plus a method selector; predict()'s DataFrame
# result is rendered as a table.
interface = gr.Interface(
    fn=predict,
    inputs=["text", "text", "text", gr.Radio(["collaborative", "cluster", "content"], label="Method")],
    outputs='dataframe',
    examples=[['Captain America','Avengers: Infinity War','Ant-Man', "collaborative"],['Shrek','The Smurfs','Up', "content"]],
    title = "Recommender System",
    description="Experience the ultimate recommendation journey with our cutting-edge recommender system! Utilizing collaborative, content, and cluster methods, we cater to your unique preferences. While computational limitations may impact results, maximize your experience by exploring our content-based or collaborative-based method. Please note that due to limitations, results may vary. Choose the path of personalized discovery and unlock a world of possibilities!",
    flagging_options=["Good Prediction", "Bad Prediction"],
    theme='abidlabs/banana'
)
# Launch the interface
interface.launch(debug=False)