# --- Hugging Face file-viewer chrome captured along with the source (not code) ---
# Mahziar's picture | Update app.py | commit de17795 | raw | history blame | 10.8 kB
#!/usr/bin/env python
# coding: utf-8
import re
import pandas as pd
import gradio as gr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from ast import literal_eval
from datasets import load_dataset
from sklearn.metrics.pairwise import linear_kernel
from sklearn.cluster import KMeans
################################### Movies ##################################
# Load the movie metadata dump (one row per movie; columns include 'id',
# 'original_title', 'genres', 'release_date', ...).
Movies = pd.read_csv('movies_metadata.csv')
# Define the cleaning function
def clean_title(title):
    """Strip every character except ASCII letters, digits, and spaces from *title*."""
    return re.sub("[^a-zA-Z0-9 ]", "", title)
# Drop three rows with malformed metadata (their 'id' values would break
# pd.to_numeric below). NOTE(review): these positions are hard-coded for this
# exact CSV release — confirm they still hold if the data file changes.
Movies = Movies.drop([19730, 29503, 35587])
# The 'genres', 'production_companies', 'production_countries', and
# 'spoken_languages' columns hold JSON-like list-of-dict strings; parse each
# into a plain list of 'name' values (empty list when missing or not a list).
Movies['genres'] = Movies['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
Movies['production_companies'] = Movies['production_companies'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
Movies['production_countries'] = Movies['production_countries'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
Movies['spoken_languages'] = Movies['spoken_languages'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
# Clean the movie titles (strip punctuation) for TF-IDF title matching.
Movies["clean_title"] = Movies["original_title"].apply(clean_title)
# The dump calls the key 'id'; downstream code joins on 'movieId'.
Movies = Movies.rename(columns={'id': 'movieId'})
Movies["movieId"] = pd.to_numeric(Movies["movieId"])
# Title-search index: TF-IDF over unigrams + bigrams of the cleaned titles.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(Movies["clean_title"])
################################### Rating ##################################
ratings = pd.read_csv('ratings.csv')
ratings = ratings.drop('timestamp', axis=1)  # timestamp is never used
# Sort by the second column — presumably movieId; verify against ratings.csv schema.
sorted_ratings = ratings.sort_values(by=ratings.columns[1])
links=pd.read_csv('links.csv')
links=links.drop('imdbId',axis=1)  # keep only movieId <-> tmdbId mapping
# Attach the TMDB id to every rating row via the links table.
ratings = pd.merge(sorted_ratings, links, on='movieId')
# Swap the two id columns so that 'movieId' now holds the TMDB id (matching
# Movies["movieId"]) and 'tmdbId' holds the original ratings-file id.
ratings.rename(columns={'movieId': 'tmdbId', 'tmdbId': 'movieId'}, inplace=True)
###################################Collaborative##################################
# Define the search function
# Define the search function
def search(title):
    """Return the 5 movies whose cleaned titles best match *title*, best first.

    Uses the module-level TF-IDF title index (`vectorizer`, `tfidf`) and the
    `Movies` DataFrame. Callers rely on `.iloc[0]` being the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    # np.argpartition finds the 5 largest scores in O(n) but leaves them
    # UNORDERED — the old `.iloc[::-1]` did not actually put the best match
    # first. Explicitly sort those 5 indices by score, descending.
    indices = np.argpartition(similarity, -5)[-5:]
    indices = indices[np.argsort(similarity[indices])[::-1]]
    return Movies.iloc[indices]
def find_similar_movies(movie_id):
    """Collaborative-filtering recommendations for *movie_id*.

    Finds users who rated this movie > 3, collects the other movies > 10% of
    them also rated > 3, and returns the 2 newest as a DataFrame with columns
    ["year", "title", "genres"]. Returns an empty frame when the movie has no
    high ratings (the original raised ZeroDivisionError in that case).
    """
    # Users who liked this movie (rating > 3).
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 3)]["userId"].unique()
    if len(similar_users) == 0:
        # No high ratings for this movie — nothing to recommend.
        return pd.DataFrame(columns=["year", "title", "genres"])
    # What else those users liked, as a fraction of the similar-user pool.
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 3)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    # Keep movies liked by more than 10% of the similar users.
    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    # Baseline popularity of those movies among ALL users who rated them > 3.
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 3)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    # Merge with Movies DataFrame to get additional movie information
    rec_movies = rec_percentages.merge(Movies, left_index=True, right_on="movieId")
    # Extract the year from the release_date column
    rec_movies["year"] = pd.to_datetime(rec_movies["release_date"], errors="coerce").dt.year
    # Sort by the year column in descending order (newest first)
    rec_movies = rec_movies.sort_values("year", ascending=False)
    return rec_movies.head(2)[["year", "title", "genres"]]
# Define the recommender function
# Define the recommender function
def recommend_movies(title):
    """Resolve *title* to its best search match, then return collaborative
    recommendations for that movie."""
    best_match = search(title).iloc[0]
    return find_similar_movies(best_match["movieId"])
################################### Cluster ##################################
# Step 1: Preprocessing — one-hot encode each movie's genre list, one row per
# movie, one column per genre.
# NOTE(review): movies with an EMPTY genre list produce no rows after
# .stack(), so genre_matrix can have fewer rows than Movies and its row
# positions need not align with Movies.index — confirm before relying on
# positional alignment with cluster_labels downstream.
genre_matrix = pd.get_dummies(Movies['genres'].apply(pd.Series).stack()).groupby(level=0).sum()
# Step 2: Cluster Generation — KMeans over the genre one-hot vectors.
num_clusters =50 # desired number of clusters
kmeans = KMeans(n_clusters=num_clusters,n_init = 10)
cluster_labels = kmeans.fit_predict(genre_matrix)  # one label per genre_matrix row
def search_cluster(title):
    """Return the 5 best title matches (best first) among the first 43779
    movies only — the subset the cluster model covers.

    NOTE(review): the 43779 cutoff is hard-coded; confirm it matches the
    number of rows genre_matrix/cluster_labels actually cover.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf[:43779]).flatten()
    # argpartition's top-5 are unordered; sort them by score, descending, so
    # callers can take iloc[0] as the true best match.
    top5 = np.argpartition(similarity, -5)[-5:]
    top5 = top5[np.argsort(similarity[top5])[::-1]]
    return Movies.iloc[top5]
# Step 3: Recommendation Generation
def cluster_based_recommender(title1, title2, title3):
    """Recommend up to 6 movies drawn from the KMeans genre clusters that the
    three input titles belong to, newest first.

    Returns a DataFrame with columns ["release_date", "title", "genres", "year"].

    NOTE(review): `cluster_labels` is a positional array, but it is indexed
    below with values taken from `Movies.index`, which is no longer contiguous
    after the row drops at load time — this label/position mix-up may select
    the wrong clusters. Confirm the intended alignment.
    """
    # Index labels of rows whose 'title' matches any of the three inputs.
    input_movie_indices = Movies.index[Movies['title'].isin([title1, title2, title3])]
    input_movie_cluster_labels = cluster_labels[input_movie_indices]
    similar_movies_indices = []
    for cluster_label in input_movie_cluster_labels:
        # All positions assigned to this cluster.
        cluster_movies_indices = np.where(cluster_labels == cluster_label)[0]
        similar_movies_indices.extend(cluster_movies_indices)
    # De-duplicate and drop the input movies themselves.
    similar_movies_indices = list(set(similar_movies_indices) - set(input_movie_indices))
    # Keep only values that also exist as index LABELS in Movies (positions
    # and labels are mixed here — see NOTE above).
    similar_movies_indices = [index for index in similar_movies_indices if index in Movies.index]
    similar_movies = Movies.loc[similar_movies_indices, ["release_date", "title", "genres"]]
    # Sample a fixed random 6 when the cluster pool is large enough.
    if len(similar_movies) >= 6:
        recommended_movies = similar_movies.sample(n=6, random_state=25)
    else:
        recommended_movies = similar_movies
    # Extract the year from the release_date column
    recommended_movies["year"] = pd.to_datetime(recommended_movies["release_date"], errors="coerce").dt.year
    # Sort by the year column in descending order
    recommended_movies = recommended_movies.sort_values("year", ascending=False)
    return recommended_movies
################################### Content ##################################
# Step 1: Preprocessing
# Combine relevant features into a single string (currently genres only).
Movies['features'] = Movies['genres']
Movies['features'] = Movies['features'].apply(lambda x: ' '.join(x))
# Step 2: Create a TF-IDF matrix over the genre strings.
tfidf_content = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_content.fit_transform(Movies['features'])
# Truncate to the first 25000 movies to keep the similarity matrix in memory.
tfidf_matrix = tfidf_matrix[:25000]
# Step 3: Compute the cosine similarity matrix (25000 x 25000).
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
def search_content(title):
    """Return the 5 best title matches (best first) among the first 20000
    movies only — the subset used by the content-based recommender.

    NOTE(review): the 20000 cutoff is hard-coded and differs from the 25000
    rows `cosine_sim` covers — confirm which limit is intended.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf[:20000]).flatten()
    # argpartition's top-5 are unordered; sort them by score, descending, so
    # callers can take iloc[0] as the true best match.
    top5 = np.argpartition(similarity, -5)[-5:]
    top5 = top5[np.argsort(similarity[top5])[::-1]]
    return Movies.iloc[top5]
### modified for search
def content_based_recommendation(title1, title2, title3, num_recommendations=6):
    """Content-based (genre TF-IDF) recommendations for up to three titles.

    Averages each candidate's cosine similarity against the matched input
    movies and returns the top *num_recommendations* as a DataFrame with
    columns ['year', 'title', 'genres'] (newest first); empty frame if no
    title matches.

    NOTE(review): `cosine_sim` covers only the first 25000 rows and is indexed
    POSITIONALLY with labels from `Movies.index` (non-contiguous after the
    drops at load time) — labels >= 25000 would raise IndexError and the
    label/position mix-up may mis-score. Also, the input movies are not
    excluded, so they can appear in their own recommendations. Confirm.
    """
    # Get the indices of the movies with the given titles
    matching_movies = Movies[Movies['title'].isin([title1, title2, title3])]
    if matching_movies.empty:
        return pd.DataFrame(columns=['year', 'title', 'genres'])  # Return an empty DataFrame if no match is found
    indices = matching_movies.index
    # Compute the average cosine similarity scores for the given movies
    avg_sim_scores = np.mean(cosine_sim[indices], axis=0)
    # Sort the movies based on the similarity scores
    top_indices = np.argsort(avg_sim_scores)[::-1][:num_recommendations]
    # Return the top recommended movies
    recommended_movies = Movies.iloc[top_indices][['title', 'genres', 'release_date']]
    # Extract the year from the release_date column
    recommended_movies['year'] = pd.to_datetime(recommended_movies['release_date'], errors='coerce').dt.year
    # Sort by the year column in descending order
    recommended_movies = recommended_movies.sort_values('year', ascending=False)
    # Keep only the desired columns in the final output
    recommended_movies = recommended_movies[['year', 'title', 'genres']]
    return recommended_movies.head(num_recommendations)
################################### Predict ##################################
# (The duplicate mid-file `import pandas as pd` was removed; pandas is already
# imported at the top of the module.)
def predict(title1, title2, title3, method):
    """Gradio entry point: recommend movies for three seed titles.

    method is one of 'collaborative', 'cluster', or 'content'; any other value
    returns a one-column error DataFrame so the UI shows a message instead of
    crashing. Returns a DataFrame of recommended movies (title/year/genres).
    """
    seeds = (title1, title2, title3)
    if method == "collaborative":
        # Resolve each seed to its best search match, recommend per seed, and
        # pool the three result frames.
        frames = [find_similar_movies(search(t).iloc[0]["movieId"]) for t in seeds]
        similar_movies = pd.concat(frames, ignore_index=True)
    elif method == "cluster":
        matched = [search_cluster(t).iloc[0]['original_title'] for t in seeds]
        similar_movies = cluster_based_recommender(*matched)
        similar_movies = similar_movies.head(6)[["title", "year", "genres"]]
    elif method == "content":
        matched = [search_content(t).iloc[0]['original_title'] for t in seeds]
        similar_movies = content_based_recommendation(*matched)
        similar_movies = similar_movies.head(6)[["title", "year", "genres"]]
    else:
        return pd.DataFrame({"Error": ["Invalid recommendation method. Choose either 'collaborative', 'cluster', or 'content'."]})
    return similar_movies
# Create the Gradio interface: three free-text title inputs plus a radio
# selector for the recommendation method; output is rendered as a table.
interface = gr.Interface(
    fn=predict,
    inputs=["text", "text", "text", gr.Radio(["collaborative", "cluster", "content"], label="Method")],
    outputs='dataframe',
    examples=[['Captain America','Avengers: Infinity War','Ant-Man', "collaborative"],['Shrek','The Smurfs','Up', "content"]],
    title = "Recommender System",
    description="Experience the ultimate recommendation journey with our cutting-edge recommender system! Utilizing collaborative, content, and cluster methods, we cater to your unique preferences. While computational limitations may impact results, maximize your experience by exploring our content-based or collaborative-based method. Please note that due to limitations, results may vary. Choose the path of personalized discovery and unlock a world of possibilities!",
    flagging_options=["Good Prediction", "Bad Prediction"],
    theme='abidlabs/banana'
)
# Launch the interface (blocking call; debug output disabled).
interface.launch(debug=False)