File size: 6,486 Bytes
936e46b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import gradio as gr
import numpy as np
import pandas as pd
import torch

from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity



def load_model():
    """
    Load the all-MiniLM-L6-v2 sentence-embedding model.

    The model is placed on the GPU when CUDA is available, otherwise on the CPU.

    :return: The SentenceTransformer instance moved to the selected device.
    """
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return encoder.to(target_device)


def encode_and_calculate_similarity(model):
    """
    Embed every movie's "soup" text and compute the pairwise cosine similarities.

    Reads the module-level ``df_merged`` DataFrame.

    :param model: Object whose ``encode`` method is called with the list of soup values.
    :return: The result of ``cosine_similarity`` applied to the embeddings.
    """
    soups = df_merged["soup"].tolist()
    return cosine_similarity(model.encode(soups))


def svd():
    """
    Cross-validate and then fit a Surprise SVD model on the ratings data.

    Reads the module-level ``df_ratings`` DataFrame (columns ``userId``,
    ``movieId`` and ``rating``).

    :return: The SVD algorithm instance fitted on the full training set.
    """
    ratings = Dataset.load_from_df(
        df_ratings[["userId", "movieId", "rating"]], Reader()
    )
    algo = SVD()

    # The 5-fold run is only for its printed RMSE/MAE report (verbose=True);
    # its return value is not used.
    cross_validate(algo, ratings, measures=["RMSE", "MAE"], cv=5, verbose=True)

    # The returned model is trained on every available rating.
    algo.fit(ratings.build_full_trainset())
    return algo


def get_sorted_movie_indices(title: str, cos_sim: np.ndarray) -> "list[int] | None":
    """
    Retrieve the indices of all other movies, ordered by similarity to a given movie.

    The title is looked up case-insensitively (lower-cased) in the module-level
    ``movie_indices`` Series.

    :param title: The title of the movie to find similar movies for.
    :param cos_sim: The cosine similarity matrix of movies; row ``i`` holds the
        similarity of movie ``i`` to every movie.
    :return: Movie indices sorted from most to least similar, excluding the
        queried movie itself, or ``None`` (after printing a message) when the
        title is not present in ``movie_indices``.
    """
    try:
        # Get the index of the movie that matches the title
        movie_index = movie_indices[title.lower()]
    except KeyError:
        print(f"Movie '{title}' not found. Please enter a valid movie title.")
        return None

    # If there are multiple movies with the same title, pick the first one.
    # Use .iloc: the Series is labelled by title, so ``[0]`` would be a label
    # lookup that only works through deprecated positional fallback.
    if isinstance(movie_index, pd.Series):
        movie_index = movie_index.iloc[0]
    movie_index = int(movie_index)

    # Rank all movies by descending similarity; the stable sort keeps ties in
    # their original index order.
    ranked = np.argsort(-np.asarray(cos_sim[movie_index]), kind="stable")

    # Drop the queried movie explicitly instead of assuming it ranks first:
    # another movie with an identical score may be sorted ahead of it.
    return ranked[ranked != movie_index].tolist()


def get_qualified_movies(
    df: pd.DataFrame, df_qualified: pd.DataFrame, sorted_movie_indices: list[int]
) -> pd.DataFrame:
    """
    Keep only the similar movies whose ``id`` also appears in the qualified chart.

    :param df: The DataFrame containing movie details.
    :param df_qualified: The DataFrame of qualified movies; only its ``id`` column is read.
    :param sorted_movie_indices: Index labels of ``df``, ordered by similarity score.
    :return: A Pandas DataFrame with the detail columns of the qualified movies,
        in the same order as ``sorted_movie_indices``.
    """
    detail_columns = (
        "id",
        "title",
        "genres",
        "original_language",
        "production_countries",
        "release_date",
        "runtime",
    )

    # Select the rows in similarity order first so the filter preserves that order.
    candidates = df.loc[sorted_movie_indices, list(detail_columns)]
    is_qualified = candidates["id"].isin(df_qualified["id"])
    return candidates.loc[is_qualified]


def predict_user_rating(
    userId: int, qualified_movies: pd.DataFrame, indices_map: pd.DataFrame
) -> pd.DataFrame:
    """
    Predict the user rating for qualified movies using SVD and return the sorted DataFrame.

    Uses the module-level fitted ``svd`` model. The ``qualified_movies`` frame
    passed in is not modified; a new frame with the extra column is returned.

    :param userId: The ID of the user.
    :param qualified_movies: A Pandas DataFrame containing qualified movies data;
        its ``id`` column is read.
    :param indices_map: A Pandas DataFrame indexed by the same ``id`` values, with
        a ``movieId`` column giving the item ID passed to the SVD model.
    :return: A Pandas DataFrame containing the qualified movies plus a
        ``predicted_user_rating`` column (rounded to 2 decimals), sorted by it
        in descending order.
    """

    def _estimate(movie_key):
        # Translate the "id" value into the ratings dataset's movieId.
        movie_id = indices_map.loc[movie_key, "movieId"]
        return round(svd.predict(userId, movie_id).est, 2)

    # assign() builds a new frame, so the caller's (possibly sliced) DataFrame
    # is never written to and no SettingWithCopyWarning is triggered.
    rated_movies = qualified_movies.assign(
        predicted_user_rating=qualified_movies["id"].apply(_estimate)
    )
    return rated_movies.sort_values(by=["predicted_user_rating"], ascending=False)


def get_movie_recommendations_hybrid(title: str, userId: int) -> pd.DataFrame:
    """
    Get movie recommendations based on a given title and user ID.

    Relies on the module-level ``cos_sim``, ``df_merged``, ``df_qualified`` and
    ``indices_map`` objects.

    :param title: The title of the movie to find similar movies for.
    :param userId: The ID of the user.
    :return: A Pandas DataFrame containing up to 5 recommended movies with
        display-friendly column names; an empty DataFrame with those columns
        when the title is unknown.
    """
    display_columns = [
        "ID",
        "Title",
        "Genres",
        "Language",
        "Production Countries",
        "Release Date",
        "Runtime",
        "Predicted User Rating",
    ]

    # Get recommended movie indices based on the given title
    sorted_movie_indices = get_sorted_movie_indices(title, cos_sim)
    if sorted_movie_indices is None:
        # Unknown title: return an empty table instead of passing None into
        # DataFrame.loc, which would raise.
        return pd.DataFrame(columns=display_columns)

    # Filter out bad movies and select the top 50 qualified movies.
    # Copy so later column additions never write into a slice of df_merged.
    qualified_movies = (
        get_qualified_movies(df_merged, df_qualified, sorted_movie_indices)
        .head(50)
        .copy()
    )

    # Predict user ratings for qualified movies and select the top recommended movies
    recommended_movies = predict_user_rating(
        userId, qualified_movies, indices_map
    ).head(5)

    recommended_movies.columns = display_columns

    return recommended_movies


# Script entry point: load the data, build the module-level objects that the
# recommendation functions above read as globals, then build and launch the UI.
if __name__ == "__main__":
    # Input tables, read from the local "data" directory.
    df_qualified = pd.read_csv("data/qualified_movies.csv")
    df_ratings = pd.read_csv("data/ratings_small.csv")
    df_merged = pd.read_csv("data/df_merged.csv")

    # Content-based part: embed each movie's "soup" and compute pairwise similarity.
    model = load_model()
    cos_sim = encode_and_calculate_similarity(model)
    # Maps lower-cased title -> row label of df_merged.
    # NOTE(review): Series.drop_duplicates() compares the values (row labels),
    # not the title index, so repeated titles stay in the index; the lookup in
    # get_sorted_movie_indices handles that case by taking the first match.
    movie_indices = pd.Series(
        df_merged.index, index=df_merged["title"].apply(lambda title: title.lower())
    ).drop_duplicates()

    # Collaborative part. This rebinds the module-level name `svd` from the
    # training function to the fitted model, which predict_user_rating reads.
    svd = svd()
    # df_merged re-indexed by "id"; used to look up "movieId" for a given "id".
    indices_map = df_merged.set_index("id")

    # UI: a title dropdown and a user-ID field feed get_movie_recommendations_hybrid,
    # whose result is shown in a table.
    with gr.Blocks(theme=gr.themes.Soft(text_size="lg")) as demo:
        gr.Markdown(
            """
        # Movie Recommendation System
        """
        )
        title = gr.Dropdown(
            choices=df_merged["title"].unique().tolist(),
            label="Movie Title",
            value="Iron Man",
        )
        user_id = gr.Number(
            value=1, label="User ID", info="Please enter a number between 1 and 671!"
        )
        recommend_button = gr.Button("Get Movie Recommendations")
        recommended_movies = gr.DataFrame(label="Movie Recommendations")
        # Clicking the button calls the recommender with (title, user_id).
        recommend_button.click(
            get_movie_recommendations_hybrid,
            inputs=[title, user_id],
            outputs=recommended_movies,
        )
        # Example titles that populate the dropdown input.
        examples = gr.Examples(
            examples=[
                "Captain America: The First Avenger",
                "The Conjuring",
                "Toy Story",
                "Final Destination 5",
            ],
            inputs=[title],
        )

    demo.launch()