ilkayisik committed on
Commit
cdf9a36
1 Parent(s): 63ff692

upgrade to version 2

Browse files
Files changed (1) hide show
  1. app.py +123 -84
app.py CHANGED
@@ -5,38 +5,33 @@ Created on Wed Sep 7 14:09:23 2022
5
 
6
  @author: ilkayisik
7
  Streamlit app for user based movie recommendations
 
 
 
8
  """
9
  # imports
10
- from datetime import datetime
11
- from PIL import Image
12
- from sklearn.metrics.pairwise import cosine_similarity
13
- import numpy as np
14
- import pandas as pd
15
  import streamlit as st
16
- import subprocess
17
- import sys
18
-
19
-
20
- def install(package):
21
- subprocess.check_call([sys.executable, "-m", "pip", "install", package])
22
-
23
-
 
24
  # %% load data
25
- movie_df = pd.read_csv(
26
- 'https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/movies.csv')
27
- rating_df = pd.read_csv(
28
- 'https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/ratings.csv')
29
- links_df = pd.read_csv(
30
- 'https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/links.csv')
31
- tags_df = pd.read_csv(
32
- 'https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/tags.csv')
33
  # %% format dataframes
34
  # MOVIE DF:
35
  movie_df = (
36
  movie_df
37
- .assign(year=lambda df_: df_['title'].replace(r'(.*)\((\d{4})\)', r'\2', regex=True))
38
- # replace with 0 if there is no year
39
- .assign(year=lambda df_: np.where(df_['year'].str.len() <= 5, df_['year'], 0)))
40
  # convert the year column to int
41
  movie_df['year'] = movie_df['year'].astype(int)
42
 
@@ -56,7 +51,6 @@ genre_list.insert(0, 'Any')
56
 
57
  year_list = list(set(list(movie_df['year'])))[1:]
58
 
59
-
60
  # create a list of movies
61
  movie_list = list(set(list(movie_df['title'])))
62
 
@@ -75,22 +69,20 @@ def make_pretty(styler):
75
  return styler
76
 
77
  # population based
78
-
79
-
80
  def popular_n_movies(n, genre):
81
  popular_n = (
82
- rating_df
83
- .groupby(by='movieId')
84
- .agg(rating_mean=('rating', 'mean'),
85
- rating_count=('movieId', 'count'),
86
- datetime=('datetime', 'mean'))
87
- .sort_values(['rating_mean', 'rating_count', 'datetime'], ascending=False)
88
- .loc[lambda df_:df_['rating_count'] >= (df_['rating_count'].mean() + df_['rating_count'].median())/2]
89
- .reset_index()
90
  )['movieId'].to_list()
91
- result = movie_df.loc[lambda df_: df_['movieId'].isin(popular_n)]
92
  if genre != 'Any':
93
- result = result.loc[lambda df_: df_['genres'].str.contains(genre)]
94
  df_rec = result.head(n).reset_index(drop=True)
95
  df_rec = df_rec[['title', 'genres', 'year']].reset_index(drop=True)
96
  new_index = ['movie-{}'.format(i+1) for i in range(n)]
@@ -99,8 +91,6 @@ def popular_n_movies(n, genre):
99
  return pretty_rec
100
 
101
  # movie/item based
102
-
103
-
104
  def item_n_movies(movie_name, n):
105
  min_rate_count = 10
106
  movieId = list(movie_df[movie_df['title'] == movie_name].movieId.head(1))[0]
@@ -110,7 +100,7 @@ def item_n_movies(movie_name, n):
110
  columns='movieId')
111
 
112
  movie_ratings = movies_crosstab[movieId]
113
- movie_ratings = movie_ratings[movie_ratings >= 0] # exclude NaNs
114
 
115
  # evaluating similarity
116
  similar_to_movie = movies_crosstab.corrwith(movie_ratings)
@@ -120,10 +110,9 @@ def item_n_movies(movie_name, n):
120
  rating = pd.DataFrame(rating_df.groupby('movieId')['rating'].mean())
121
  rating['rating_count'] = rating_df.groupby('movieId')['rating'].count()
122
  movie_corr_summary = corr_movie.join(rating['rating_count'])
123
- movie_corr_summary.drop(movieId, inplace=True) # drop forrest gump itself
124
 
125
- top_n = movie_corr_summary[movie_corr_summary['rating_count'] >=
126
- min_rate_count].sort_values('PearsonR', ascending=False).head(n)
127
  top_n = top_n.merge(movie_df, left_index=True, right_on="movieId")
128
  top_n = top_n[['title', 'genres']].reset_index(drop=True)
129
  new_index = ['movie-{}'.format(i+1) for i in range(n)]
@@ -132,24 +121,20 @@ def item_n_movies(movie_name, n):
132
  return pretty_rec
133
 
134
  # user based
135
-
136
-
137
  def user_n_movies(user_id, n):
138
  users_items = pd.pivot_table(data=rating_df,
139
- values='rating',
140
- index='userId',
141
- columns='movieId')
142
 
143
  users_items.fillna(0, inplace=True)
144
 
145
  user_similarities = pd.DataFrame(cosine_similarity(users_items),
146
- columns=users_items.index,
147
- index=users_items.index)
148
 
149
- weights = (user_similarities.query("userId!=@user_id")
150
- [user_id] / sum(user_similarities.query("userId!=@user_id")[user_id]))
151
- not_seen_movies = users_items.loc[users_items.index !=
152
- user_id, users_items.loc[user_id, :] == 0]
153
  weighted_averages = pd.DataFrame(not_seen_movies.T.dot(weights), columns=["predicted_rating"])
154
  recommendations = weighted_averages.merge(movie_df, left_index=True, right_on="movieId")
155
  top_recommendations = recommendations.sort_values("predicted_rating", ascending=False).head(n)
@@ -159,10 +144,53 @@ def user_n_movies(user_id, n):
159
  pretty_rec = top_recommendations.style.pipe(make_pretty)
160
  return pretty_rec
161
 
 
 
 
 
162
 
163
- # i will write another version of this function can manage time period of movies too
164
- x = popular_n_movies(5, 'Any')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  # %% STREAMLIT
167
  # Set configuration
168
  st.set_page_config(page_title="WBSFLIX",
@@ -172,8 +200,8 @@ st.set_page_config(page_title="WBSFLIX",
172
  )
173
 
174
  # set colors: these have to be set in the settings menu online
175
- # primary color: #FF4B4B, background color:#0E1117
176
- # text color: #FAFAFA, secondary background color: #E50914
177
 
178
  # Set the logo of app
179
  st.sidebar.image("wbs_logo.png",
@@ -188,12 +216,12 @@ st.sidebar.markdown("""
188
  st.sidebar.markdown("""
189
  ### How may we help you?
190
  """
191
- )
192
  # Popularity based recommender system
193
  genre_default = None
194
  pop_based_rec = st.sidebar.checkbox("Show me the all time favourites",
195
- False,
196
- help="Movies that are liked by many people")
197
 
198
 
199
  if pop_based_rec:
@@ -202,22 +230,24 @@ if pop_based_rec:
202
  with st.form(key="pop_form"):
203
  genre_default = ['Any']
204
  genre = st.multiselect(
205
- "Genre",
206
- options=genre_list,
207
- help="Select the genre of the movie you would like to watch",
208
- default=genre_default)
209
 
210
  nr_rec = st.slider("Number of recommendations",
211
- min_value=1,
212
- max_value=20,
213
- value=5,
214
- step=1,
215
- key="n",
216
- help="How many movie recommendations would you like to receive?",
217
- )
 
218
 
219
  submit_button_pop = st.form_submit_button(label="Submit")
220
 
 
221
  if submit_button_pop:
222
  popular_movie_recs = popular_n_movies(nr_rec, genre[0])
223
  st.table(popular_movie_recs)
@@ -228,8 +258,8 @@ st.write("")
228
  st.write("")
229
 
230
  item_based_rec = st.sidebar.checkbox("Show me a movie like this",
231
- False,
232
- help="Input some movies and we will show you similar ones")
233
 
234
  if item_based_rec:
235
  st.markdown("### Tell us a movie you like:")
@@ -265,8 +295,8 @@ st.write("")
265
  st.write("")
266
 
267
  user_based_rec = st.sidebar.checkbox("I want to get personalized recommendations",
268
- False,
269
- help="Login to get personalized recommendations")
270
 
271
  if user_based_rec:
272
  st.markdown("### Please login to get customized recommendations just for you")
@@ -275,12 +305,13 @@ if user_based_rec:
275
 
276
  user_id = st.number_input("Please enter your user id", step=1,
277
  min_value=1)
278
- # genre_default, year_default = ['Any'], ['Any']
279
- # genre = st.multiselect(
280
- # "Genre",
281
- # options=genre_list,
282
- # help="Select the genre of the movie you would like to watch",
283
- # default=genre_default)
 
284
 
285
  nr_rec = st.slider("Number of recommendations",
286
  min_value=1,
@@ -291,8 +322,16 @@ if user_based_rec:
291
  help="How many movie recommendations would you like to receive?",
292
  )
293
 
 
 
 
 
 
294
  submit_button_user = st.form_submit_button(label="Submit")
295
 
296
  if submit_button_user:
297
- user_movie_recs = user_n_movies(user_id, nr_rec)
298
- st.table(user_movie_recs)
 
 
 
 
5
 
6
  @author: ilkayisik
7
  Streamlit app for user based movie recommendations
8
+ Changes to the first version:
9
+ 1. put all the widgets to the sidebar
10
+ 2. add the time period option in the user id based recommendation
11
  """
12
  # imports
 
 
 
 
 
13
  import streamlit as st
14
+ import pandas as pd
15
+ import numpy as np
16
+ from sklearn.metrics.pairwise import cosine_similarity
17
+ from PIL import Image
18
+ from datetime import datetime
19
+ import os
20
+ abspath = os.path.abspath(__file__)
21
+ dname = os.path.dirname(abspath)
22
+ os.chdir(dname)
23
  # %% load data
24
+ movie_df = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/movies.csv')
25
+ rating_df = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/ratings.csv')
26
+ links_df = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/links.csv')
27
+ tags_df = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/tags.csv')
 
 
 
 
28
  # %% format dataframes
29
  # MOVIE DF:
30
  movie_df = (
31
  movie_df
32
+ .assign(year=lambda df_ : df_['title'].replace(r'(.*)\((\d{4})\)', r'\2', regex= True))
33
+ # replace with 0 if there is no year
34
+ .assign(year=lambda df_ : np.where(df_['year'].str.len() <=5 , df_['year'], 0)))
35
  # convert the year column to int
36
  movie_df['year'] = movie_df['year'].astype(int)
37
 
 
51
 
52
  year_list = list(set(list(movie_df['year'])))[1:]
53
 
 
54
  # create a list of movies
55
  movie_list = list(set(list(movie_df['title'])))
56
 
 
69
  return styler
70
 
71
  # population based
 
 
72
  def popular_n_movies(n, genre):
73
  popular_n = (
74
+ rating_df
75
+ .groupby(by='movieId')
76
+ .agg(rating_mean=('rating', 'mean'),
77
+ rating_count=('movieId', 'count'),
78
+ datetime=('datetime','mean'))
79
+ .sort_values(['rating_mean','rating_count','datetime'], ascending= False)
80
+ .loc[lambda df_ :df_['rating_count'] >= (df_['rating_count'].mean() + df_['rating_count'].median())/2]
81
+ .reset_index()
82
  )['movieId'].to_list()
83
+ result = movie_df.loc[lambda df_ : df_['movieId'].isin(popular_n)]
84
  if genre != 'Any':
85
+ result = result.loc[lambda df_ : df_['genres'].str.contains(genre)]
86
  df_rec = result.head(n).reset_index(drop=True)
87
  df_rec = df_rec[['title', 'genres', 'year']].reset_index(drop=True)
88
  new_index = ['movie-{}'.format(i+1) for i in range(n)]
 
91
  return pretty_rec
92
 
93
  # movie/item based
 
 
94
  def item_n_movies(movie_name, n):
95
  min_rate_count = 10
96
  movieId = list(movie_df[movie_df['title'] == movie_name].movieId.head(1))[0]
 
100
  columns='movieId')
101
 
102
  movie_ratings = movies_crosstab[movieId]
103
+ movie_ratings = movie_ratings[movie_ratings>=0] # exclude NaNs
104
 
105
  # evaluating similarity
106
  similar_to_movie = movies_crosstab.corrwith(movie_ratings)
 
110
  rating = pd.DataFrame(rating_df.groupby('movieId')['rating'].mean())
111
  rating['rating_count'] = rating_df.groupby('movieId')['rating'].count()
112
  movie_corr_summary = corr_movie.join(rating['rating_count'])
113
+ movie_corr_summary.drop(movieId, inplace=True) # drop forrest gump itself
114
 
115
+ top_n = movie_corr_summary[movie_corr_summary['rating_count'] >= min_rate_count].sort_values('PearsonR', ascending=False).head(n)
 
116
  top_n = top_n.merge(movie_df, left_index=True, right_on="movieId")
117
  top_n = top_n[['title', 'genres']].reset_index(drop=True)
118
  new_index = ['movie-{}'.format(i+1) for i in range(n)]
 
121
  return pretty_rec
122
 
123
  # user based
 
 
124
  def user_n_movies(user_id, n):
125
  users_items = pd.pivot_table(data=rating_df,
126
+ values='rating',
127
+ index='userId',
128
+ columns='movieId')
129
 
130
  users_items.fillna(0, inplace=True)
131
 
132
  user_similarities = pd.DataFrame(cosine_similarity(users_items),
133
+ columns=users_items.index,
134
+ index=users_items.index)
135
 
136
+ weights = (user_similarities.query("userId!=@user_id")[user_id] / sum(user_similarities.query("userId!=@user_id")[user_id]))
137
+ not_seen_movies = users_items.loc[users_items.index!=user_id, users_items.loc[user_id,:]==0]
 
 
138
  weighted_averages = pd.DataFrame(not_seen_movies.T.dot(weights), columns=["predicted_rating"])
139
  recommendations = weighted_averages.merge(movie_df, left_index=True, right_on="movieId")
140
  top_recommendations = recommendations.sort_values("predicted_rating", ascending=False).head(n)
 
144
  pretty_rec = top_recommendations.style.pipe(make_pretty)
145
  return pretty_rec
146
 
147
+ # user based with year as input
148
def top_n_user_based(user_id, n, genres, time_period):
    """Recommend up to ``n`` movies the user has not rated yet.

    A user-based collaborative filter in two passes: cosine similarity
    against every user picks the 100 nearest neighbours, then the ratings
    of those neighbours alone are combined into a similarity-weighted
    score for each movie the target user has not rated.

    Parameters
    ----------
    user_id : int
        Value looked up in the ``userId`` column of ``rating_df``.
    n : int
        Maximum number of rows to return.
    genres : list of str
        Genre names to keep; a movie qualifies when its ``genres`` string
        contains at least one of them. An empty selection, or one that
        includes ``'Any'``, disables the genre filter.
    time_period : sequence of two ints
        Inclusive ``(first_year, last_year)`` bounds on the ``year`` column.

    Returns
    -------
    pandas.DataFrame
        The ``movie_df`` columns of the best-scoring matches, highest
        predicted rating first, with a fresh 0..k-1 index. An empty frame
        with columns ``movieId, title, genres, year`` when ``user_id`` has
        no ratings.
    """
    # `value in series` tests the index labels, not the stored values, so
    # the user ids themselves have to be compared explicitly.
    if not rating_df['userId'].eq(user_id).any():
        return pd.DataFrame(columns=['movieId', 'title', 'genres', 'year'])

    # --- pass 1: similarity of the target user to every other user ---
    users_items = pd.pivot_table(data=rating_df,
                                 values='rating',
                                 index='userId',
                                 columns='movieId').fillna(0)
    user_similarities = pd.DataFrame(cosine_similarity(users_items),
                                     columns=users_items.index,
                                     index=users_items.index)
    other_users = user_similarities[user_id].drop(user_id)
    weights = other_users / other_users.sum()

    # keep the 100 most similar users, plus the target user itself so the
    # second pivot still has a row telling us what they already rated
    new_userids = weights.sort_values(ascending=False).head(100).index.tolist()
    new_userids.append(user_id)

    # --- pass 2: same computation restricted to the neighbourhood ---
    new_ratings = rating_df.loc[rating_df['userId'].isin(new_userids)]
    new_users_items = pd.pivot_table(data=new_ratings,
                                     values='rating',
                                     index='userId',
                                     columns='movieId').fillna(0)
    new_user_similarities = pd.DataFrame(cosine_similarity(new_users_items),
                                         columns=new_users_items.index,
                                         index=new_users_items.index)
    neighbours = new_user_similarities[user_id].drop(user_id)
    new_weights = neighbours / neighbours.sum()

    # rows: neighbours only; columns: movies the target user has not rated
    # (a 0 in the filled pivot means "no rating")
    not_watched_movies = new_users_items.loc[
        new_users_items.index != user_id,
        new_users_items.loc[user_id, :] == 0,
    ]
    weighted_averages = pd.DataFrame(not_watched_movies.T.dot(new_weights),
                                     columns=["predicted_rating"])
    recommendations = (
        weighted_averages
        .merge(movie_df, left_index=True, right_on="movieId")
        .sort_values("predicted_rating", ascending=False)
    )

    # inclusive year window
    in_period = recommendations['year'].between(time_period[0], time_period[1])
    recommendations = recommendations.loc[in_period]

    # 'Any' means "no genre restriction", matching popular_n_movies
    if genres and 'Any' not in genres:
        # One boolean mask for "matches any selected genre" keeps the rating
        # order intact, so no concat / dedupe / re-sort is needed. Genre names
        # are matched literally, not as regular expressions.
        keep = np.zeros(len(recommendations), dtype=bool)
        for genre in genres:
            keep |= (recommendations['genres']
                     .str.contains(genre, regex=False, na=False)
                     .to_numpy(dtype=bool))
        recommendations = recommendations.loc[keep]

    return (recommendations
            .drop(columns=['predicted_rating'])
            .head(n)
            .reset_index(drop=True))
194
  # %% STREAMLIT
195
  # Set configuration
196
  st.set_page_config(page_title="WBSFLIX",
 
200
  )
201
 
202
  # set colors: these have to be set in the settings menu online
203
+ # primary color: #FF4B4B, background color:#0E1117
204
+ # text color: #FAFAFA, secondary background color: #E50914
205
 
206
  # Set the logo of app
207
  st.sidebar.image("wbs_logo.png",
 
216
  st.sidebar.markdown("""
217
  ### How may we help you?
218
  """
219
+ )
220
  # Popularity based recommender system
221
  genre_default = None
222
  pop_based_rec = st.sidebar.checkbox("Show me the all time favourites",
223
+ False,
224
+ help="Movies that are liked by many people")
225
 
226
 
227
  if pop_based_rec:
 
230
  with st.form(key="pop_form"):
231
  genre_default = ['Any']
232
  genre = st.multiselect(
233
+ "Genre",
234
+ options=genre_list,
235
+ help="Select the genre of the movie you would like to watch",
236
+ default=genre_default)
237
 
238
  nr_rec = st.slider("Number of recommendations",
239
+ min_value=1,
240
+ max_value=20,
241
+ value=5,
242
+ step=1,
243
+ key="n",
244
+ help="How many movie recommendations would you like to receive?",
245
+ )
246
+
247
 
248
  submit_button_pop = st.form_submit_button(label="Submit")
249
 
250
+
251
  if submit_button_pop:
252
  popular_movie_recs = popular_n_movies(nr_rec, genre[0])
253
  st.table(popular_movie_recs)
 
258
  st.write("")
259
 
260
  item_based_rec = st.sidebar.checkbox("Show me a movie like this",
261
+ False,
262
+ help="Input some movies and we will show you similar ones")
263
 
264
  if item_based_rec:
265
  st.markdown("### Tell us a movie you like:")
 
295
  st.write("")
296
 
297
  user_based_rec = st.sidebar.checkbox("I want to get personalized recommendations",
298
+ False,
299
+ help="Login to get personalized recommendations")
300
 
301
  if user_based_rec:
302
  st.markdown("### Please login to get customized recommendations just for you")
 
305
 
306
  user_id = st.number_input("Please enter your user id", step=1,
307
  min_value=1)
308
+ genre_default = ['Any']
309
+ genre = st.multiselect(
310
+ "Genre",
311
+ options=genre_list,
312
+ help="Select the genre of the movie you would like to watch",
313
+ #default=genre_default
314
+ )
315
 
316
  nr_rec = st.slider("Number of recommendations",
317
  min_value=1,
 
322
  help="How many movie recommendations would you like to receive?",
323
  )
324
 
325
+ time_period = st.slider('years:', min_value=1900,
326
+ max_value=2018,
327
+ value=(2010,2018),
328
+ step=1)
329
+
330
  submit_button_user = st.form_submit_button(label="Submit")
331
 
332
  if submit_button_user:
333
+ # user_movie_recs = user_n_movies(user_id, nr_rec)
334
+ user_movie_recs = top_n_user_based(user_id, nr_rec, genre, time_period)
335
+
336
+ # st.write(time_period)
337
+ st.table(user_movie_recs[['title', 'genres']].style.pipe(make_pretty))