File size: 6,486 Bytes
936e46b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
import gradio as gr
import numpy as np
import pandas as pd
import torch
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
def load_model():
    """
    Load the sentence-embedding model and place it on the best available device.

    :return: The "sentence-transformers/all-MiniLM-L6-v2" SentenceTransformer,
        moved to CUDA when ``torch.cuda.is_available()`` is true, otherwise to the CPU.
    """
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return encoder.to(target_device)
def encode_and_calculate_similarity(model):
    """
    Embed the "soup" text of every movie and compare all embeddings pairwise.

    Reads the module-level ``df_merged`` DataFrame.

    :param model: Object whose ``encode`` method accepts a list of strings.
    :return: The output of ``cosine_similarity`` applied to the embeddings of
        ``df_merged["soup"]``.
    """
    soups = df_merged["soup"].tolist()
    embeddings = model.encode(soups)
    return cosine_similarity(embeddings)
def svd():
    """
    Train an SVD collaborative-filtering model on the module-level ratings table.

    Reads ``df_ratings`` (columns ``userId``, ``movieId``, ``rating``), prints a
    5-fold cross-validation report (RMSE and MAE), then fits on the full data set.

    :return: The SVD model fitted on the full training set.
    """
    ratings = df_ratings[["userId", "movieId", "rating"]]

    # Reader() defaults to a (1, 5) scale and predictions are clipped to that
    # scale. Widen it (never narrow it) to cover the ratings actually present,
    # so values outside 1-5 in the data are not silently clipped away.
    lowest = min(1.0, float(ratings["rating"].min()))
    highest = max(5.0, float(ratings["rating"].max()))
    reader = Reader(rating_scale=(lowest, highest))

    data = Dataset.load_from_df(ratings, reader)
    # Named "model" rather than "svd" so the local does not shadow this function.
    model = SVD()
    cross_validate(model, data, measures=["RMSE", "MAE"], cv=5, verbose=True)
    model.fit(data.build_full_trainset())
    return model
def get_sorted_movie_indices(title: str, cos_sim: np.ndarray) -> "list[int] | None":
    """
    Rank every other movie by its similarity to the given movie.

    The title is lower-cased and looked up in the module-level ``movie_indices``
    Series to find the movie's row in ``cos_sim``.

    :param title: The title of the movie to find similar movies for.
    :param cos_sim: The cosine similarity matrix of movies.
    :return: Movie indices sorted from most to least similar, excluding the
        queried movie itself, or ``None`` (after printing a message) when the
        title is not found.
    """
    try:
        movie_index = movie_indices[title.lower()]
    except KeyError:
        print(f"Movie '{title}' not found. Please enter a valid movie title.")
        return None

    # A title shared by several movies yields a Series; pick the first entry by
    # position (.iloc) - integer keys on a string-labelled Series are label
    # lookups, so plain [0] only works through a deprecated fallback.
    if isinstance(movie_index, pd.Series):
        movie_index = movie_index.iloc[0]
    movie_index = int(movie_index)

    scores = np.asarray(cos_sim[movie_index])
    # Stable sort on the negated scores = descending order with ties kept in
    # index order, the same ordering sorted(..., reverse=True) produces.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the queried movie by index instead of assuming it is ranked first:
    # a duplicate/tied movie with a smaller index would otherwise be dropped
    # in its place and the queried movie would recommend itself.
    return ranked[ranked != movie_index].tolist()
def get_qualified_movies(
    df: pd.DataFrame, df_qualified: pd.DataFrame, sorted_movie_indices: list[int]
) -> pd.DataFrame:
    """
    Keep only the ranked movies whose ``id`` also appears in ``df_qualified``.

    :param df: The DataFrame containing movie details.
    :param df_qualified: The DataFrame of qualified movies; only its ``id`` column is read.
    :param sorted_movie_indices: Row labels of ``df``, ordered by similarity score.
    :return: The detail columns of the qualified movies, in the order given by
        ``sorted_movie_indices``.
    """
    detail_columns = [
        "id",
        "title",
        "genres",
        "original_language",
        "production_countries",
        "release_date",
        "runtime",
    ]
    # Select rows in ranking order first, so the filter below preserves that order.
    ranked = df.loc[sorted_movie_indices, detail_columns]
    is_qualified = ranked["id"].isin(df_qualified["id"])
    return ranked[is_qualified]
def predict_user_rating(
    userId: int, qualified_movies: pd.DataFrame, indices_map: pd.DataFrame
) -> pd.DataFrame:
    """
    Predict the user rating for qualified movies using SVD and return the sorted DataFrame.

    Uses the module-level ``svd`` model for the predictions. The input DataFrame
    is left unmodified; the result is a new DataFrame.

    :param userId: The ID of the user.
    :param qualified_movies: A Pandas DataFrame containing qualified movies data;
        its ``id`` column is read.
    :param indices_map: A Pandas DataFrame indexed by movie ``id`` whose
        ``movieId`` column gives the identifier passed to the SVD model.
    :return: A copy of ``qualified_movies`` with an added ``predicted_user_rating``
        column (rounded to 2 decimals), sorted by that column in descending order.
    """

    def estimate(movie_id):
        # Translate the catalogue id into the id the ratings model was trained on.
        rating_movie_id = indices_map.loc[movie_id, "movieId"]
        return round(svd.predict(userId, rating_movie_id).est, 2)

    # assign() builds a new frame: the caller's DataFrame (typically a filtered
    # slice) is not mutated, which also avoids pandas' SettingWithCopyWarning.
    scored = qualified_movies.assign(
        predicted_user_rating=qualified_movies["id"].apply(estimate)
    )
    return scored.sort_values(by=["predicted_user_rating"], ascending=False)
def get_movie_recommendations_hybrid(title: str, userId: int) -> pd.DataFrame:
    """
    Get movie recommendations based on a given title and user ID.

    Ranks movies by similarity to ``title``, keeps the first 50 that are in the
    qualified list, predicts the user's rating for each, and returns the 5 with
    the highest predicted rating. Reads the module-level ``cos_sim``,
    ``df_merged``, ``df_qualified`` and ``indices_map``.

    :param title: The title of the movie to find similar movies for.
    :param userId: The ID of the user.
    :return: A Pandas DataFrame containing the recommended movies with
        display-friendly column names; empty (headers only) when the title is
        not found.
    """
    # Explicit source -> display mapping, so the headers cannot be mis-assigned
    # if the column order of the intermediate frames ever changes.
    display_names = {
        "id": "ID",
        "title": "Title",
        "genres": "Genres",
        "original_language": "Language",
        "production_countries": "Production Countries",
        "release_date": "Release Date",
        "runtime": "Runtime",
        "predicted_user_rating": "Predicted User Rating",
    }

    # Get recommended movie indices based on the given title
    sorted_movie_indices = get_sorted_movie_indices(title, cos_sim)
    if sorted_movie_indices is None:
        # Unknown title: return an empty table instead of crashing on df.loc[None].
        return pd.DataFrame(columns=list(display_names.values()))

    # Filter out bad movies and select the top 50 qualified movies
    qualified_movies = get_qualified_movies(
        df_merged, df_qualified, sorted_movie_indices
    ).head(50)

    # Predict user ratings for qualified movies and select the top recommended movies
    recommended_movies = predict_user_rating(
        userId, qualified_movies, indices_map
    ).head(5)
    return recommended_movies.rename(columns=display_names)
if __name__ == "__main__":
    # --- Data -----------------------------------------------------------------
    # These module-level names are read as globals by the functions above.
    df_qualified = pd.read_csv("data/qualified_movies.csv")
    df_ratings = pd.read_csv("data/ratings_small.csv")
    df_merged = pd.read_csv("data/df_merged.csv")

    # --- Content-based part: sentence embeddings and pairwise similarity --------
    model = load_model()
    cos_sim = encode_and_calculate_similarity(model)

    # Lower-cased title -> row label of df_merged, used for title lookups.
    lowercase_titles = df_merged["title"].apply(lambda title: title.lower())
    movie_indices = pd.Series(df_merged.index, index=lowercase_titles).drop_duplicates()

    # --- Collaborative part -----------------------------------------------------
    # NOTE: this rebinds the name ``svd`` from the training function to the fitted
    # model; predict_user_rating reads that global.
    svd = svd()
    indices_map = df_merged.set_index("id")

    # --- User interface ---------------------------------------------------------
    all_titles = df_merged["title"].unique().tolist()
    example_titles = [
        "Captain America: The First Avenger",
        "The Conjuring",
        "Toy Story",
        "Final Destination 5",
    ]

    with gr.Blocks(theme=gr.themes.Soft(text_size="lg")) as demo:
        gr.Markdown("\n# Movie Recommendation System\n")

        title_dropdown = gr.Dropdown(
            choices=all_titles,
            label="Movie Title",
            value="Iron Man",
        )
        user_id_input = gr.Number(
            value=1, label="User ID", info="Please enter a number between 1 and 671!"
        )
        recommend_button = gr.Button("Get Movie Recommendations")
        recommendations_table = gr.DataFrame(label="Movie Recommendations")

        # Clicking the button runs the hybrid recommender and fills the table.
        recommend_button.click(
            get_movie_recommendations_hybrid,
            inputs=[title_dropdown, user_id_input],
            outputs=recommendations_table,
        )

        examples = gr.Examples(examples=example_titles, inputs=[title_dropdown])

    demo.launch()
|