Gary0417's picture
Update docstrings and type hinting
e5452dc
raw
history blame
8.49 kB
import gradio as gr
import numpy as np
import pandas as pd
import torch
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
def load_model() -> SentenceTransformer:
    """
    Load the pre-trained "all-MiniLM-L6-v2" SentenceTransformer model.

    The model is moved to the GPU when CUDA is available and to the CPU otherwise.

    :return: The SentenceTransformer model placed on the selected device.
    """
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return encoder.to(target_device)
def encode_and_calculate_similarity(
    model: SentenceTransformer, df_merged: pd.DataFrame
) -> np.ndarray:
    """
    Embed the "soup" text of every row and compute the pairwise cosine similarity.

    :param model: The SentenceTransformer model used to embed the text.
    :param df_merged: The DataFrame whose "soup" column holds the text to embed.
    :return: The cosine similarity matrix between the embedded rows.
    """
    soups = df_merged["soup"].tolist()
    return cosine_similarity(model.encode(soups))
def svd(df_ratings: pd.DataFrame) -> SVD:
    """
    Cross-validate and train an SVD collaborative-filtering model on user ratings.

    :param df_ratings: The DataFrame containing user ratings. It must provide the
        "userId", "movieId" and "rating" columns.
    :return: The SVD model fitted on the full set of ratings.
    """
    ratings = df_ratings[["userId", "movieId", "rating"]]
    # Reader() assumes a 1-5 rating scale. Widen it to cover the ratings that
    # are actually present (e.g. half-star 0.5 ratings) so predictions are not
    # clipped to a range narrower than the data. When every rating already
    # lies within 1-5 the scale stays at the library default.
    lowest = min(1.0, float(ratings["rating"].min()))
    highest = max(5.0, float(ratings["rating"].max()))
    reader = Reader(rating_scale=(lowest, highest))
    data = Dataset.load_from_df(ratings, reader)
    model = SVD()
    # Print 5-fold RMSE/MAE as a quality report before the final fit.
    cross_validate(model, data, measures=["RMSE", "MAE"], cv=5, verbose=True)
    model.fit(data.build_full_trainset())
    return model
def get_sorted_similar_movies(
    title: str, cos_sim: np.ndarray, df_merged: pd.DataFrame
) -> pd.DataFrame:
    """
    Get a sorted DataFrame of movies based on their similarity scores to a given movie.

    The title is resolved through the module-level ``movie_indices`` Series
    (lower-cased title -> row index), which must be defined before this is called.

    :param title: The title of the movie to find similar movies for (matched
        case-insensitively).
    :param cos_sim: The cosine similarity matrix of movies.
    :param df_merged: The DataFrame containing movie details.
    :return: A DataFrame of all other movies ordered by decreasing similarity,
        with a "similarity_scores" column holding each score formatted as a
        one-decimal string. Returns None (after printing a message) when the
        title is not a known movie.
    """
    # A cleared input can arrive as None; treat anything that is not a known
    # title string as "not found" instead of failing on .lower().
    lookup_key = title.lower() if isinstance(title, str) else None
    try:
        movie_index = movie_indices[lookup_key]
    except KeyError:
        print(f"Movie '{title}' not found. Please enter a valid movie title.")
        return None
    # If there are multiple movies with the same title, pick the first one.
    # .iloc is required here: the Series is labelled by title, so [0] would be
    # a label lookup rather than "first element".
    if isinstance(movie_index, pd.Series):
        movie_index = movie_index.iloc[0]

    # Pair every other movie with its similarity to the requested one. The
    # requested movie is excluded by index rather than by dropping the top
    # result, because another movie can tie with it for the highest score.
    sim_scores = [
        (index, score)
        for index, score in enumerate(cos_sim[movie_index])
        if index != movie_index
    ]
    # Most similar first; the sort is stable, so ties keep their row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    sorted_movie_indices = [index for index, _ in sim_scores]
    sorted_similarity_scores = [format(score, ".1f") for _, score in sim_scores]

    movie_details = [
        "id",
        "title",
        "genres",
        "original_language",
        "production_countries",
        "release_date",
        "runtime",
        "weighted_rating",
    ]
    sorted_similar_movies = df_merged.loc[sorted_movie_indices, movie_details]
    sorted_similar_movies["similarity_scores"] = sorted_similarity_scores
    return sorted_similar_movies
def get_qualified_movies(
    df_qualified: pd.DataFrame, sorted_similar_movies: pd.DataFrame
) -> pd.DataFrame:
    """
    Filter out movies that are not in the qualified movies chart and sort the movies based on similarity scores and IMDB's weighted rating.

    :param df_qualified: The DataFrame containing qualified movie details (matched on "id").
    :param sorted_similar_movies: The DataFrame containing movie details sorted by similarity scores.
    :return: A Pandas DataFrame containing the qualified movies sorted by similarity scores and IMDB's weighted rating, both descending.
    """
    is_qualified = sorted_similar_movies["id"].isin(df_qualified["id"])
    # The similarity scores may be stored as formatted strings; compare them
    # numerically so that e.g. "-0.1" ranks above "-0.2". The key only affects
    # ordering, the stored values are returned unchanged.
    return sorted_similar_movies[is_qualified].sort_values(
        by=["similarity_scores", "weighted_rating"],
        ascending=False,
        key=pd.to_numeric,
    )
def predict_user_rating(
    userId: int, qualified_movies: pd.DataFrame, indices_map: pd.DataFrame
) -> pd.DataFrame:
    """
    Predict the user rating for qualified movies using SVD and return the sorted DataFrame.

    Relies on the module-level name ``svd`` being bound to a fitted model whose
    ``predict(user, item)`` result exposes an ``est`` attribute.

    :param userId: The ID of the user.
    :param qualified_movies: A Pandas DataFrame containing qualified movies data.
        It is not modified.
    :param indices_map: A Pandas DataFrame indexed by movie "id" that provides the
        "movieId" passed to the model.
    :return: A new Pandas DataFrame containing the final qualified movies sorted by
        estimated user ratings, then similarity scores, then weighted rating.
    """

    def estimate_rating(movie_id):
        # Translate the catalogue id into the id used by the ratings data.
        rating_movie_id = indices_map.loc[movie_id, "movieId"]
        return round(svd.predict(userId, rating_movie_id).est, 1)

    # assign() builds a new frame, so the caller's (possibly sliced) frame is
    # left untouched and no chained-assignment warning is triggered.
    rated_movies = qualified_movies.assign(
        predicted_user_rating=qualified_movies["id"].apply(estimate_rating)
    )
    # Similarity scores may be stored as formatted strings; sort every key
    # numerically instead of lexicographically.
    return rated_movies.sort_values(
        by=["predicted_user_rating", "similarity_scores", "weighted_rating"],
        ascending=False,
        key=pd.to_numeric,
    )
def get_movie_recommendations_hybrid(
    title: str, user_id: int
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Get movie recommendations based on a given title and user ID.

    Reads the module-level ``cos_sim``, ``df_merged``, ``df_qualified`` and
    ``indices_map`` objects, which must be defined before this is called.

    :param title: The title of the movie to find similar movies for.
    :param user_id: The ID of the user.
    :return: A tuple of two Pandas DataFrames.
    The first DataFrame contains the recommended movies.
    The second DataFrame contains the recommendation criteria (ID, Title, Predicted User Rating, Similarity Score, Weighted Rating).
    :raises gr.Error: If the title does not match any known movie.
    """
    # Get every other movie ordered by similarity to the given title
    sorted_similar_movies = get_sorted_similar_movies(title, cos_sim, df_merged)
    if sorted_similar_movies is None:
        # Surface the problem in the UI instead of failing later with an
        # unrelated TypeError on the missing DataFrame.
        raise gr.Error(
            f"Movie '{title}' not found. Please enter a valid movie title."
        )

    # Filter out bad movies and select the top 50 qualified movies
    qualified_movies = get_qualified_movies(df_qualified, sorted_similar_movies).head(
        50
    )
    # Predict user ratings for qualified movies and select the top recommended movies
    top_movies = predict_user_rating(user_id, qualified_movies, indices_map).head(5)

    # Rename by column name rather than by position so the labels cannot end up
    # on the wrong columns if the column order ever changes.
    display_names = {
        "id": "ID",
        "title": "Title",
        "genres": "Genres",
        "original_language": "Language",
        "production_countries": "Production Countries",
        "release_date": "Release Date",
        "runtime": "Runtime",
        "weighted_rating": "Weighted Rating",
        "similarity_scores": "Similarity Score",
        "predicted_user_rating": "Predicted User Rating",
    }
    top_movies = top_movies.rename(columns=display_names)

    recommendation_criteria = top_movies[
        ["ID", "Title", "Predicted User Rating", "Similarity Score", "Weighted Rating"]
    ]
    recommended_movies = top_movies.drop(
        columns=["Predicted User Rating", "Similarity Score", "Weighted Rating"]
    )
    return recommended_movies, recommendation_criteria
if __name__ == "__main__":
    # Script entry point: load the data, build the models, then serve the UI.
    # get_movie_recommendations_hybrid, get_sorted_similar_movies and
    # predict_user_rating read df_qualified, df_merged, cos_sim, movie_indices,
    # svd and indices_map as module-level globals, so all of them must be
    # bound here before the button callback can run.
    df_qualified = pd.read_csv("data/qualified_movies.csv")
    df_ratings = pd.read_csv("data/ratings_small.csv")
    df_merged = pd.read_csv("data/df_merged.csv")
    # Embed the "soup" column once and precompute the full similarity matrix.
    model = load_model()
    cos_sim = encode_and_calculate_similarity(model, df_merged)
    # Lower-cased title -> row index, used for case-insensitive title lookup.
    # NOTE(review): drop_duplicates() compares the Series values (the row
    # indices), not the title labels, so repeated titles presumably remain and
    # are resolved in get_sorted_similar_movies - confirm this is intended.
    movie_indices = pd.Series(
        df_merged.index, index=df_merged["title"].apply(lambda title: title.lower())
    ).drop_duplicates()
    # Rebinds the name `svd` from the training function to the fitted model it
    # returns. predict_user_rating calls svd.predict(...), so it depends on this
    # rebinding; the function is no longer reachable under that name afterwards.
    svd = svd(df_ratings)
    # Movie details keyed by "id"; predict_user_rating looks up "movieId" here.
    indices_map = df_merged.set_index("id")
    # Gradio UI: a title dropdown and a user-id field feed
    # get_movie_recommendations_hybrid, whose two returned DataFrames fill the
    # two tables below.
    with gr.Blocks(theme=gr.themes.Soft(text_size="lg")) as demo:
        gr.Markdown(
            """
# Movie Recommendation System
"""
        )
        title = gr.Dropdown(
            choices=df_merged["title"].unique().tolist(),
            label="Movie Title",
            value="Iron Man",
        )
        user_id = gr.Number(
            value=1, label="User ID", info="Please enter a number between 1 and 671!"
        )
        recommend_button = gr.Button("Get Movie Recommendations")
        recommended_movies = gr.DataFrame(label="Movie Recommendations")
        recommendation_criteria = gr.DataFrame(label="Recommendation Criteria")
        recommend_button.click(
            get_movie_recommendations_hybrid,
            inputs=[title, user_id],
            outputs=[recommended_movies, recommendation_criteria],
        )
        # Example titles offered for the movie title input.
        examples = gr.Examples(
            examples=[
                "Captain America: The First Avenger",
                "The Conjuring",
                "Toy Story",
                "Final Destination 5",
            ],
            inputs=[title],
        )
    # Start serving the app.
    demo.launch()