joao-victor-campos NicholasBaraldi committed on
Commit
3e41e06
1 Parent(s): eb2d08f

refactor code

Browse files

Co-authored-by: NicholasBaraldi <nicholas.baraldi@gmail.com>

recommendation_app/core/data_handler/data_handler.py CHANGED
@@ -1,12 +1,18 @@
1
  from typing import List
2
 
3
- import numpy as np
4
  import pandas as pd
5
  from sklearn import preprocessing
6
 
7
 
8
  class DataHandler:
 
 
9
  def __init__(self, df: pd.DataFrame) -> None:
 
 
 
 
 
10
  self.df = df
11
 
12
  def normalize(self, features: List) -> pd.DataFrame:
@@ -33,10 +39,9 @@ class DataHandler:
33
  Returns:
34
  pd.DataFrame: DataFrame with one hot encoded columns.
35
  """
36
- for i in features:
37
- ohe_df = pd.get_dummies(self.df[i])
38
- print(ohe_df)
39
  ohe_df.reset_index(drop=True, inplace=True)
40
  self.df = pd.concat([self.df, ohe_df], axis=1)
41
- self.df.drop(columns=i, inplace=True)
42
  return self.df
 
1
  from typing import List
2
 
 
3
  import pandas as pd
4
  from sklearn import preprocessing
5
 
6
 
7
  class DataHandler:
8
+ """Feature Engineers the dataframe."""
9
+
10
  def __init__(self, df: pd.DataFrame) -> None:
11
+ """__init__ method.
12
+
13
+ Args:
14
+ df (pd.DataFrame): pd.DataFrame.
15
+ """
16
  self.df = df
17
 
18
  def normalize(self, features: List) -> pd.DataFrame:
 
39
  Returns:
40
  pd.DataFrame: DataFrame with one hot encoded columns.
41
  """
42
+ for feature in features:
43
+ ohe_df = pd.get_dummies(self.df[feature])
 
44
  ohe_df.reset_index(drop=True, inplace=True)
45
  self.df = pd.concat([self.df, ohe_df], axis=1)
46
+ self.df.drop(columns=feature, inplace=True)
47
  return self.df
recommendation_app/core/model.py CHANGED
@@ -1,6 +1,5 @@
1
  from array import array
2
 
3
- import numpy as np
4
  import pandas as pd
5
  from sklearn.metrics.pairwise import cosine_similarity
6
 
@@ -11,18 +10,21 @@ class Model:
11
 
12
  def movie_similarity(self, chosen_movie: array, sim_movies: array) -> array:
13
  """Calculate the cosine similarity between two vectors.
 
14
  Args:
15
- chosen_movie (array): Array with all information about the movie chosen by the user.
 
16
  sim_movies (array): n dimensions array with all movies.
17
  Returns:
18
- array: Returns the cosine similarity between chosen_movie and sim_array.
 
19
  """
20
  chosen_movie = chosen_movie.reshape(1, -1)
21
- # sim_movies = sim_movies.reshape(-1, 6)
22
  return cosine_similarity(chosen_movie, sim_movies, dense_output=True)
23
 
24
  def recommend(self, movie_id: str, n_rec: int) -> pd.DataFrame:
25
- """Returns nlargest similarity movies based on movie_id.
 
26
  Args:
27
  movie_id (str): Name of the movie to be compared.
28
  n_rec (int): Number of movies the user wants.
@@ -30,15 +32,9 @@ class Model:
30
  pd.DataFrame: Dataframe with the n_rec recommendations.
31
  """
32
  movie_info = self.df.loc[movie_id].values
33
- x = self.movie_similarity(movie_info, self.df.values)
34
-
35
- # x.reshape(1, -1)
36
- y = x.tolist()[0]
37
- print(y)
38
- self.df["similarity"] = y
39
- print(self.df)
40
- # movie_info = self.df.loc[movie_id].values
41
- # self.df['similarity'] = self.df.apply(self.movie_similarity(movie_info,
42
- # self.df.values)))
43
 
44
  return self.df.nlargest(columns="similarity", n=n_rec + 1)
 
1
  from array import array
2
 
 
3
  import pandas as pd
4
  from sklearn.metrics.pairwise import cosine_similarity
5
 
 
10
 
11
  def movie_similarity(self, chosen_movie: array, sim_movies: array) -> array:
12
  """Calculate the cosine similarity between two vectors.
13
+
14
  Args:
15
+ chosen_movie (array): Array with all information about the movie
16
+ chosen by the user.
17
  sim_movies (array): n dimensions array with all movies.
18
  Returns:
19
+ array: Returns the cosine similarity between chosen_movie and
20
+ sim_array.
21
  """
22
  chosen_movie = chosen_movie.reshape(1, -1)
 
23
  return cosine_similarity(chosen_movie, sim_movies, dense_output=True)
24
 
25
  def recommend(self, movie_id: str, n_rec: int) -> pd.DataFrame:
26
+ """Return nlargest similarity movies based on movie_id.
27
+
28
  Args:
29
  movie_id (str): Name of the movie to be compared.
30
  n_rec (int): Number of movies the user wants.
 
32
  pd.DataFrame: Dataframe with the n_rec recommendations.
33
  """
34
  movie_info = self.df.loc[movie_id].values
35
+ sim_array = self.movie_similarity(movie_info, self.df.values)
36
+
37
+ sim_list = sim_array.tolist()[0]
38
+ self.df["similarity"] = sim_list
 
 
 
 
 
 
39
 
40
  return self.df.nlargest(columns="similarity", n=n_rec + 1)
recommendation_app/main.py DELETED
@@ -1,64 +0,0 @@
1
- import gradio as gr
2
- import pandas as pd
3
- from core.data_handler.data_handler import DataHandler
4
- from core.model import Model
5
-
6
- PATH = "../netflix-recommendation-app/data/output/df_titles.csv"
7
- df2 = pd.read_csv(PATH)
8
- movie_names = df2["title"].tolist()
9
-
10
-
11
- def gradio(movie_name, n_rec):
12
- if __name__ == "__main__":
13
- PATH = "../netflix-recommendation-app/data/output/df_titles.csv"
14
- features = [
15
- "type",
16
- "release_year",
17
- "age_certification",
18
- "runtime",
19
- "seasons",
20
- "imdb_score",
21
- "tmdb_popularity",
22
- "tmdb_score",
23
- "genres_transformed",
24
- "production_countries_transformed",
25
- ]
26
- df = pd.read_csv(PATH)
27
- df_model = df.copy()
28
- df_model = df_model[features]
29
- x = DataHandler(df_model)
30
- numeric_features = [
31
- "release_year",
32
- "runtime",
33
- "seasons",
34
- "imdb_score",
35
- "tmdb_popularity",
36
- "tmdb_score",
37
- ]
38
- x.normalize(numeric_features)
39
- categorical_features = [
40
- "age_certification",
41
- "type",
42
- "genres_transformed",
43
- "production_countries_transformed",
44
- ]
45
- x.one_hot_encode(categorical_features)
46
- # print(x.one_hot_encode(categorical_features))
47
- # print(x.df)
48
- mdl = Model(x.df)
49
- n_rec = int(n_rec)
50
- movie_name = str(movie_name)
51
- movie_id = df.index[df["title"] == movie_name].tolist()
52
- print(movie_id)
53
- recommendations = mdl.recommend(movie_id, n_rec)
54
- top_index = list(recommendations.index)[1:]
55
- print(df[["title", "description"]].loc[top_index])
56
- return df[["title", "description"]].loc[top_index]
57
-
58
-
59
- app = gr.Interface(
60
- fn=gradio,
61
- inputs=[gr.Dropdown(choices=movie_names), gr.inputs.Number()],
62
- outputs=[gr.outputs.Dataframe()],
63
- )
64
- app.launch()