Spaces:
Runtime error
Runtime error
Commit
•
3e41e06
1
Parent(s):
eb2d08f
refactor code
Browse files
Co-authored-by: NicholasBaraldi <nicholas.baraldi@gmail.com>
recommendation_app/core/data_handler/data_handler.py
CHANGED
@@ -1,12 +1,18 @@
|
|
1 |
from typing import List
|
2 |
|
3 |
-
import numpy as np
|
4 |
import pandas as pd
|
5 |
from sklearn import preprocessing
|
6 |
|
7 |
|
8 |
class DataHandler:
|
|
|
|
|
9 |
def __init__(self, df: pd.DataFrame) -> None:
|
|
|
|
|
|
|
|
|
|
|
10 |
self.df = df
|
11 |
|
12 |
def normalize(self, features: List) -> pd.DataFrame:
|
@@ -33,10 +39,9 @@ class DataHandler:
|
|
33 |
Returns:
|
34 |
pd.DataFrame: DataFrame with one hot encoded columns.
|
35 |
"""
|
36 |
-
for
|
37 |
-
ohe_df = pd.get_dummies(self.df[
|
38 |
-
print(ohe_df)
|
39 |
ohe_df.reset_index(drop=True, inplace=True)
|
40 |
self.df = pd.concat([self.df, ohe_df], axis=1)
|
41 |
-
self.df.drop(columns=
|
42 |
return self.df
|
|
|
1 |
from typing import List
|
2 |
|
|
|
3 |
import pandas as pd
|
4 |
from sklearn import preprocessing
|
5 |
|
6 |
|
7 |
class DataHandler:
|
8 |
+
"""Feature Engineers the dataframe."""
|
9 |
+
|
10 |
def __init__(self, df: pd.DataFrame) -> None:
|
11 |
+
"""__init__ method.
|
12 |
+
|
13 |
+
Args:
|
14 |
+
df (pd.DataFrame): pd.DataFrame.
|
15 |
+
"""
|
16 |
self.df = df
|
17 |
|
18 |
def normalize(self, features: List) -> pd.DataFrame:
|
|
|
39 |
Returns:
|
40 |
pd.DataFrame: DataFrame with one hot encoded columns.
|
41 |
"""
|
42 |
+
for feature in features:
|
43 |
+
ohe_df = pd.get_dummies(self.df[feature])
|
|
|
44 |
ohe_df.reset_index(drop=True, inplace=True)
|
45 |
self.df = pd.concat([self.df, ohe_df], axis=1)
|
46 |
+
self.df.drop(columns=feature, inplace=True)
|
47 |
return self.df
|
recommendation_app/core/model.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
from array import array
|
2 |
|
3 |
-
import numpy as np
|
4 |
import pandas as pd
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
|
@@ -11,18 +10,21 @@ class Model:
|
|
11 |
|
12 |
def movie_similarity(self, chosen_movie: array, sim_movies: array) -> array:
|
13 |
"""Calculate the cosine similarity between two vectors.
|
|
|
14 |
Args:
|
15 |
-
chosen_movie (array): Array with all information about the movie
|
|
|
16 |
sim_movies (array): n dimensions array with all movies.
|
17 |
Returns:
|
18 |
-
array: Returns the cosine similarity between chosen_movie and
|
|
|
19 |
"""
|
20 |
chosen_movie = chosen_movie.reshape(1, -1)
|
21 |
-
# sim_movies = sim_movies.reshape(-1, 6)
|
22 |
return cosine_similarity(chosen_movie, sim_movies, dense_output=True)
|
23 |
|
24 |
def recommend(self, movie_id: str, n_rec: int) -> pd.DataFrame:
|
25 |
-
"""
|
|
|
26 |
Args:
|
27 |
movie_id (str): Name of the movie to be compared.
|
28 |
n_rec (int): Number of movies the user wants.
|
@@ -30,15 +32,9 @@ class Model:
|
|
30 |
pd.DataFrame: Dataframe with the n_rec recommendations.
|
31 |
"""
|
32 |
movie_info = self.df.loc[movie_id].values
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
print(y)
|
38 |
-
self.df["similarity"] = y
|
39 |
-
print(self.df)
|
40 |
-
# movie_info = self.df.loc[movie_id].values
|
41 |
-
# self.df['similarity'] = self.df.apply(self.movie_similarity(movie_info,
|
42 |
-
# self.df.values)))
|
43 |
|
44 |
return self.df.nlargest(columns="similarity", n=n_rec + 1)
|
|
|
1 |
from array import array
|
2 |
|
|
|
3 |
import pandas as pd
|
4 |
from sklearn.metrics.pairwise import cosine_similarity
|
5 |
|
|
|
10 |
|
11 |
def movie_similarity(self, chosen_movie: array, sim_movies: array) -> array:
|
12 |
"""Calculate the cosine similarity between two vectors.
|
13 |
+
|
14 |
Args:
|
15 |
+
chosen_movie (array): Array with all information about the movie
|
16 |
+
chosen by the user.
|
17 |
sim_movies (array): n dimensions array with all movies.
|
18 |
Returns:
|
19 |
+
array: Returns the cosine similarity between chosen_movie and
|
20 |
+
sim_array.
|
21 |
"""
|
22 |
chosen_movie = chosen_movie.reshape(1, -1)
|
|
|
23 |
return cosine_similarity(chosen_movie, sim_movies, dense_output=True)
|
24 |
|
25 |
def recommend(self, movie_id: str, n_rec: int) -> pd.DataFrame:
|
26 |
+
"""Return nlargest similarity movies based on movie_id.
|
27 |
+
|
28 |
Args:
|
29 |
movie_id (str): Name of the movie to be compared.
|
30 |
n_rec (int): Number of movies the user wants.
|
|
|
32 |
pd.DataFrame: Dataframe with the n_rec recommendations.
|
33 |
"""
|
34 |
movie_info = self.df.loc[movie_id].values
|
35 |
+
sim_array = self.movie_similarity(movie_info, self.df.values)
|
36 |
+
|
37 |
+
sim_list = sim_array.tolist()[0]
|
38 |
+
self.df["similarity"] = sim_list
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
return self.df.nlargest(columns="similarity", n=n_rec + 1)
|
recommendation_app/main.py
DELETED
@@ -1,64 +0,0 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import pandas as pd
|
3 |
-
from core.data_handler.data_handler import DataHandler
|
4 |
-
from core.model import Model
|
5 |
-
|
6 |
-
PATH = "../netflix-recommendation-app/data/output/df_titles.csv"
|
7 |
-
df2 = pd.read_csv(PATH)
|
8 |
-
movie_names = df2["title"].tolist()
|
9 |
-
|
10 |
-
|
11 |
-
def gradio(movie_name, n_rec):
|
12 |
-
if __name__ == "__main__":
|
13 |
-
PATH = "../netflix-recommendation-app/data/output/df_titles.csv"
|
14 |
-
features = [
|
15 |
-
"type",
|
16 |
-
"release_year",
|
17 |
-
"age_certification",
|
18 |
-
"runtime",
|
19 |
-
"seasons",
|
20 |
-
"imdb_score",
|
21 |
-
"tmdb_popularity",
|
22 |
-
"tmdb_score",
|
23 |
-
"genres_transformed",
|
24 |
-
"production_countries_transformed",
|
25 |
-
]
|
26 |
-
df = pd.read_csv(PATH)
|
27 |
-
df_model = df.copy()
|
28 |
-
df_model = df_model[features]
|
29 |
-
x = DataHandler(df_model)
|
30 |
-
numeric_features = [
|
31 |
-
"release_year",
|
32 |
-
"runtime",
|
33 |
-
"seasons",
|
34 |
-
"imdb_score",
|
35 |
-
"tmdb_popularity",
|
36 |
-
"tmdb_score",
|
37 |
-
]
|
38 |
-
x.normalize(numeric_features)
|
39 |
-
categorical_features = [
|
40 |
-
"age_certification",
|
41 |
-
"type",
|
42 |
-
"genres_transformed",
|
43 |
-
"production_countries_transformed",
|
44 |
-
]
|
45 |
-
x.one_hot_encode(categorical_features)
|
46 |
-
# print(x.one_hot_encode(categorical_features))
|
47 |
-
# print(x.df)
|
48 |
-
mdl = Model(x.df)
|
49 |
-
n_rec = int(n_rec)
|
50 |
-
movie_name = str(movie_name)
|
51 |
-
movie_id = df.index[df["title"] == movie_name].tolist()
|
52 |
-
print(movie_id)
|
53 |
-
recommendations = mdl.recommend(movie_id, n_rec)
|
54 |
-
top_index = list(recommendations.index)[1:]
|
55 |
-
print(df[["title", "description"]].loc[top_index])
|
56 |
-
return df[["title", "description"]].loc[top_index]
|
57 |
-
|
58 |
-
|
59 |
-
app = gr.Interface(
|
60 |
-
fn=gradio,
|
61 |
-
inputs=[gr.Dropdown(choices=movie_names), gr.inputs.Number()],
|
62 |
-
outputs=[gr.outputs.Dataframe()],
|
63 |
-
)
|
64 |
-
app.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|