SSBakh07 committed on
Commit
d133d89
1 Parent(s): 9040fba

Final assignment submission

Browse files
Files changed (21) hide show
  1. .env +1 -0
  2. DF_Construction.ipynb +0 -0
  3. app.py +97 -0
  4. items_0.csv +2 -0
  5. items_1.csv +2 -0
  6. items_2.csv +2 -0
  7. items_3.csv +2 -0
  8. items_4.csv +2 -0
  9. recommender.py +236 -0
  10. users_0.csv +0 -0
  11. users_1.csv +0 -0
  12. users_10.csv +0 -0
  13. users_11.csv +0 -0
  14. users_2.csv +0 -0
  15. users_3.csv +0 -0
  16. users_4.csv +0 -0
  17. users_5.csv +0 -0
  18. users_6.csv +0 -0
  19. users_7.csv +0 -0
  20. users_8.csv +0 -0
  21. users_9.csv +0 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OMDB_KEY=17c73f9a
DF_Construction.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import gradio as gr
3
+ from recommender import Recommender
4
+
5
# Custom CSS: keep the recommendation columns on a single row and give the
# submit buttons a fixed height.
css = """
.gradio-row {
    flex-wrap: nowrap !important;
}

.btn {
    height: 50px !important;
    max-height: 50px !important
}
"""

# Create recommender object and look up the three movies shown at start-up
recommender = Recommender()
initial_picks = recommender.get_descs_for_recommended(recommender.recommended_ids)

RATING_PROMPT = "How much did you enjoy this movie or how interested are you?"


def build_pick_column(number, heading, pick):
    """Build one recommendation column inside the currently open Gradio row.

    number:  1-based position of the column; stored in a hidden Number so the
             click handler knows which recommendation was rated.
    heading: text describing the strategy behind this recommendation.
    pick:    dict with 'title' and 'overview' keys for the initial movie.

    Returns (hidden_number, title_box, summary_box, slider, submit_button).
    """
    with gr.Column(scale=1):
        hidden_number = gr.Number(value=number, visible=False)

        gr.Markdown(heading)

        title_box = gr.Textbox(pick['title'], label="Movie Title")
        summary_box = gr.Textbox(pick['overview'], label="Movie Summary")

        # NOTE(review): the source's indentation was lost; the rating
        # controls are assumed to be nested inside the column - confirm.
        with gr.Column():
            gr.Markdown(RATING_PROMPT)
            # `interactive` is the documented Slider switch for user input.
            slider = gr.Slider(minimum=1, maximum=5, interactive=True)
            submit_button = gr.Button("Submit", elem_classes="btn")

    return hidden_number, title_box, summary_box, slider, submit_button


# Handler functions
def submit_opinion(number, value):
    """Record a rating and return the refreshed titles and summaries.

    number: 1-based column number coming from the hidden Number component.
    value:  rating selected on that column's slider.

    Returns a flat list [title_1, overview_1, title_2, overview_2, ...] in the
    order expected by `submit_outputs`.
    """
    # Recommender.on_pick indexes its list of recommendations from zero, while
    # the hidden column numbers start at one.
    pick_idx = int(number) - 1
    new_ids = recommender.on_pick(pick_idx, value)

    final = []
    for desc in recommender.get_descs_for_recommended(new_ids):
        final.append(desc['title'])
        final.append(desc['overview'])
    return final


with gr.Blocks(css=css) as demo:
    gr.Markdown("# **Full report and code can be found here: [GitHub](https://github.com/SSBakh07/Statistical-ML---Spring-2023)**")
    gr.Markdown("## Basic Movie Recommender")
    with gr.Row(variant='compact', elem_classes="gradio-row", equal_height=True):

        # First Component
        (col_1_number, movie_title_1, movie_summary_1,
         slider_1, btn_submit_1) = build_pick_column(
            1, "Based on similar movies...", initial_picks[0])

        # Second Component
        (col_2_number, movie_title_2, movie_summary_2,
         slider_2, btn_submit_2) = build_pick_column(
            2, "Based on similar users...", initial_picks[1])

        # Third Component
        (col_3_number, movie_title_3, movie_summary_3,
         slider_3, btn_submit_3) = build_pick_column(
            3, "Based on similar users and movies...", initial_picks[2])

    # Attach buttons to functions; every submit refreshes all three columns
    submit_outputs = [movie_title_1, movie_summary_1, movie_title_2,
                      movie_summary_2, movie_title_3, movie_summary_3]
    btn_submit_1.click(submit_opinion, [col_1_number, slider_1], submit_outputs)
    btn_submit_2.click(submit_opinion, [col_2_number, slider_2], submit_outputs)
    btn_submit_3.click(submit_opinion, [col_3_number, slider_3], submit_outputs)


demo.launch(debug=True)
items_0.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ adult,id,imdb_id,overview,popularity,runtime,title,vote_average,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie,rb_ratio,pop_bin
2
+ True,862,tt0114709,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",21.946943,81.0,Toy Story,7.7,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12.4518011,9
items_1.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ adult,id,imdb_id,overview,popularity,runtime,title,vote_average,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie,rb_ratio,pop_bin
2
+ True,27678,tt0106356,"A television movie based upon the book by Bryan Burrough and John Helyar, about the leveraged buyout (LBO) of RJR Nabisco.",1.685697,107.0,Barbarians at the Gate,6.8,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.3026610976581805,5
items_2.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ adult,id,imdb_id,overview,popularity,runtime,title,vote_average,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie,rb_ratio,pop_bin
2
+ True,18520,tt0799916,"Join filmmaking duo Chris Hegedus and Nick Doob as their cameras follow Franken to book signings, campaign rallies and the launch of Air America Radio, documenting his transformation from irreverent funnyman to political pundit.",0.364839,84.0,Al Franken - God Spoke,6.8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.018583582460259,2
items_3.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ adult,id,imdb_id,overview,popularity,runtime,title,vote_average,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie,rb_ratio,pop_bin
2
+ True,51548,tt1313254,"Spying on her students, a teacher finds an exciting hobby.",0.588512,97.0,The Invisible Eye,6.2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0689346370279518,3
items_4.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ adult,id,imdb_id,overview,popularity,runtime,title,vote_average,Animation,Comedy,Family,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie,rb_ratio,pop_bin
2
+ True,164443,tt2377132,"This is an update of George Bernard Shaw's ""Pygmalion"" that changes the genders of the main characters. Hannah Higgins attempts to turn blue-collar Boston beer vendor Elliot Doolittle into a viable candidate and inadvertently learns something of Elliot's side of life.",5.152674,85.0,The Makeover,5.5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0673618764144663,8
recommender.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Suppress sklearn warnings by swapping warnings.warn for a no-op
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (stand-in for warnings.warn)."""


warnings.warn = warn
6
+
7
+ # Import libraries
8
+ import numpy as np
9
+ import pandas as pd
10
+ from glob import glob
11
+ from sklearn.neighbors import NearestNeighbors
12
+ from sklearn.preprocessing import MinMaxScaler
13
+
14
# Columns meant to drive item-based similarity recommendations.
# NOTE(review): nothing visible in this file fills or reads this module-level
# list; ItemData keeps its own `item_columns` attribute - confirm it is needed.
item_columns = []

# Default number of neighbours for the nearest-neighbour models
N_NEIGHBORS = 10
19
+
20
+
21
# Handler for Item DataFrame
class ItemData:
    """Loads the movie table from the `items_*.csv` shards and serves lookups.

    Attributes:
        df:           all item rows, with a fresh 0..n-1 index.
        item_columns: names of the feature columns used for similarity.
        scaled_df:    `df` restricted to `item_columns`.
    """

    def __init__(self):
        # Every shard is read with its own 0..n-1 index. Re-index on concat so
        # row labels are unique and line up with the positional indices that
        # the nearest-neighbour model returns. Sorting keeps row order stable.
        shards = [pd.read_csv(f) for f in sorted(glob("items_*.csv"))]
        self.df = pd.concat(shards, axis=0, ignore_index=True)
        self._scale_cols()
        self.item_columns = ['scaled_runtime', 'vote_scaled', 'Animation', 'Comedy', 'Family', 'Adventure', 'Fantasy', 'Romance', 'Drama', 'Action',
                             'Crime', 'Thriller', 'Horror', 'History', 'Science Fiction', 'Mystery', 'War', 'Foreign', 'Music', 'Documentary',
                             'Western', 'TV Movie', 'ratio_scaled', 'pop_scaled']
        self.scaled_df = self.df[self.item_columns]

    def _scale_cols(self):
        """Add a min-max scaled copy of each continuous feature column to `df`."""
        # source column -> name of the scaled column that is added
        scaled_names = {
            'runtime': 'scaled_runtime',
            'vote_average': 'vote_scaled',
            'rb_ratio': 'ratio_scaled',
            'pop_bin': 'pop_scaled',
        }
        for source, target in scaled_names.items():
            # MinMaxScaler expects a 2-D (n_samples, 1) array
            column = self.df[source].values.reshape(-1, 1)
            self.df[target] = MinMaxScaler().fit_transform(column).ravel()

    def get_filtered_row_by_id(self, id):
        """Return the feature columns of the row(s) whose 'id' equals int(id)."""
        return self.df[self.df['id'] == int(id)][self.item_columns]

    def get_id_by_idx(self, idx):
        """Return the movie id stored at row position `idx`."""
        # Positional lookup: stays correct even if row labels are not 0..n-1.
        return self.df['id'].iloc[idx]

    def get_random_id(self):
        """Return the id of one randomly sampled movie."""
        return self.df.sample(1)['id'].values[0]

    def get_row_by_id(self, id):
        """Return all columns of the row(s) whose 'id' equals `id`."""
        return self.df[self.df['id'] == id]

    def get_movie_title_by_id(self, id):
        """Return the title of the first row matching `id`."""
        return self.get_row_by_id(id)['title'].values[0]

    def get_movie_overview_by_id(self, id):
        """Return the overview text of the first row matching `id`."""
        return self.get_row_by_id(id)['overview'].values[0]
65
+
66
+
67
# Handler for User DataFrame
class UserData:
    """Loads the user/movie rating table from the `users_*.csv` shards.

    Attributes:
        df: all user rows with a fresh 0..n-1 index; missing values are
            replaced by 0.
    """

    def __init__(self):
        # Re-index on concat: each shard carries its own 0..n-1 index, and
        # duplicate labels would break label-based lookups by row number.
        shards = [pd.read_csv(f) for f in sorted(glob("users_*.csv"))]
        self.df = pd.concat(shards, axis=0, ignore_index=True)
        self.df = self.df.fillna(0)
72
+
73
+
74
###### Recommender System
class Recommender:
    """Keeps one user's session state and produces three recommendations.

    Slot 0 of `recommended_ids` is item-based, slot 1 is user-based and slot 2
    combines both (see `on_pick`).
    """

    def __init__(self):
        # Load preprocessed dataframes
        self.item_handler = ItemData()
        self.user_handler = UserData()
        print("Dataframes loaded...")

        # Use exactly the columns the user model is fitted on below, so the
        # preference vector always has a matching number of features.
        rating_columns = self.user_handler.df.drop('user_id', axis=1).columns
        self.preferences = pd.DataFrame(columns=rating_columns)  # For user data
        self.preferences.loc[0] = 0  # Initialize all ratings to zero

        # Rows of movies the user rated above 2.5
        self.item_picks = pd.DataFrame(columns=self.item_handler.df.columns)

        self.n_picks = 1
        self.recommended_ids = []  # Resets every time
        self.seen_movies = []

        # Initialize nearest neighbor algorithm. With p=2 the Minkowski metric
        # is the euclidean distance.
        self.user_recommender = NearestNeighbors(n_neighbors=N_NEIGHBORS, p=2).fit(self.user_handler.df.drop('user_id', axis=1))
        self.item_recommender = NearestNeighbors(n_neighbors=N_NEIGHBORS, p=2).fit(self.item_handler.scaled_df)

        # Initialize recommended movies. No picks exist yet, so these are random.
        for _ in range(3):
            self.recommended_ids.append(self.get_item_recommendation())

    def on_pick(self, idx, rating):
        '''
        Called whenever the user picks a new movie.
        idx: [0, 2] -> which one of the recommendations was picked out of the 3 suggestions
        rating: rating the user gave to that movie

        Returns the refreshed list of three recommended movie ids.
        '''
        self.n_picks += 1

        chosen_movie_id = self.recommended_ids[idx]
        self.update(chosen_movie_id, rating)

        # Recommend new movies
        self.recommended_ids[0] = self.get_item_recommendation()
        self.recommended_ids[1] = self.get_user_recommendation()
        self.recommended_ids[2] = self.get_joint_recommendation()

        return self.recommended_ids

    def get_descs_for_recommended(self, recs):
        '''
        Return one {'title': ..., 'overview': ...} dict per movie id in recs.
        '''
        return [
            {
                'title': self.item_handler.get_movie_title_by_id(rec),
                'overview': self.item_handler.get_movie_overview_by_id(rec),
            }
            for rec in recs
        ]

    def update(self, movie_id, rating):
        '''
        Update user preferences based on last picked movie (and given rating)
        '''
        self.seen_movies.append(movie_id)

        # Update user data. Only touch columns that already exist: adding a
        # new one would change the width of the preference vector and make it
        # incompatible with the fitted user model.
        column = str(movie_id)
        if column in self.preferences.columns:
            self.preferences.at[0, column] = rating

        # Update item data - but only if the user liked it
        if rating > 2.5:
            new_row = self.item_handler.get_row_by_id(movie_id)
            self.item_picks = pd.concat([self.item_picks, new_row], axis=0)

    def _average_liked_features(self):
        '''
        Average feature vector of the liked movies, shaped (1, n_features).
        '''
        liked = self.item_picks[self.item_handler.item_columns]
        average = liked.sum(axis=0) / liked.shape[0]
        return average.to_numpy(dtype=float).reshape(1, -1)

    def _nearest_item_positions(self):
        '''
        Row positions of the items closest to the average liked movie.
        '''
        # One neighbour more than the number of seen movies guarantees at
        # least one unseen candidate, unless every movie has been seen.
        n_items = self.item_handler.df.shape[0]
        n_movies = min(len(self.seen_movies) + 1, n_items)
        _, positions = self.item_recommender.kneighbors(self._average_liked_features(), n_movies)
        return positions[0]

    def _preference_vector(self):
        '''
        The user's ratings as a (1, n_movies) float array.
        '''
        return self.preferences.to_numpy(dtype=float)

    def get_item_recommendation(self):
        '''
        Make recommendation based on item similarity
        '''
        # If user hasn't picked any movies they like yet, pick something random
        if not self.item_picks.empty:
            # Return the unseen movie that's closest to the average preference
            for idx in self._nearest_item_positions():
                new_id = self.item_handler.get_id_by_idx(idx)
                if new_id not in self.seen_movies:
                    return new_id

        # Pick a random movie if strategy did not work
        return self.item_handler.get_random_id()

    def get_user_recommendation(self):
        '''
        Make recommendation based on user similarity
        '''
        # If user hasn't chosen anything yet
        if self.item_picks.empty:
            return self.item_handler.get_random_id()

        ratings = self.user_handler.df.drop('user_id', axis=1)
        # Cannot ask for more neighbours than there are users
        n_users = min(25, ratings.shape[0])
        _, idx = self.user_recommender.kneighbors(self._preference_vector(), n_users)

        # Rating columns are labelled with strings; compare like with like
        seen_labels = {str(movie) for movie in self.seen_movies}

        # Find the closest user's top 3 movies. If all have been seen, move onto the next user until a candidate movie is found
        for best_idx in idx[0]:
            top_rated = ratings.iloc[best_idx].astype(float).nlargest(3)
            for movie, score in top_rated.items():
                if movie in seen_labels:
                    continue
                if score > 2.5:
                    return int(movie)
                # Scores are in descending order, so nothing better follows
                break

        # Otherwise, return random movie
        return self.item_handler.get_random_id()

    def get_joint_recommendation(self):
        '''
        Make recommendation based on both item and user similarity
        '''
        # If user hasn't chosen anything yet
        if self.item_picks.empty:
            return self.item_handler.get_random_id()

        # Get similar users
        ratings = self.user_handler.df
        n_users = min(10, ratings.shape[0])
        _, user_idxs = self.user_recommender.kneighbors(self._preference_vector(), n_users)

        best_movie_id = None
        best_score = -1

        # Score each unseen similar item by the mean non-zero rating given by
        # the similar users; items nobody rated score -1.
        for movie_idx in self._nearest_item_positions():
            movie_id = self.item_handler.get_id_by_idx(movie_idx)
            if movie_id in self.seen_movies:
                continue

            score = -1
            column = str(movie_id)
            if column in ratings.columns:
                votes = ratings[column].iloc[user_idxs[0]]
                votes = votes[votes != 0]
                if len(votes) > 0:
                    score = votes.sum() / len(votes)

            # Strict comparison keeps the closest item when scores tie
            if best_movie_id is None or score > best_score:
                best_movie_id = movie_id
                best_score = score

        # Every candidate had been seen already
        if best_movie_id is None:
            return self.item_handler.get_random_id()
        return best_movie_id
users_0.csv ADDED
The diff for this file is too large to render. See raw diff
 
users_1.csv ADDED
The diff for this file is too large to render. See raw diff
 
users_10.csv ADDED
The diff for this file is too large to render. See raw diff
 
users_11.csv ADDED
The diff for this file is too large to render. See raw diff
 
users_2.csv ADDED
The diff for this file is too large to render. See raw diff
 
users_3.csv ADDED
The diff for this file is too large to render. See raw diff
 
users_4.csv ADDED
The diff for this file is too large to render. See raw diff
 
users_5.csv ADDED
The diff for this file is too large to render. See raw diff
 
users_6.csv ADDED
The diff for this file is too large to render. See raw diff
 
users_7.csv ADDED
The diff for this file is too large to render. See raw diff
 
users_8.csv ADDED
The diff for this file is too large to render. See raw diff
 
users_9.csv ADDED
The diff for this file is too large to render. See raw diff