upgrade to version 2
Browse files
app.py
CHANGED
@@ -5,38 +5,33 @@ Created on Wed Sep 7 14:09:23 2022
|
|
5 |
|
6 |
@author: ilkayisik
|
7 |
Streamlit app for user based movie recommendations
|
|
|
|
|
|
|
8 |
"""
|
9 |
# imports
|
10 |
-
from datetime import datetime
|
11 |
-
from PIL import Image
|
12 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
13 |
-
import numpy as np
|
14 |
-
import pandas as pd
|
15 |
import streamlit as st
|
16 |
-
import
|
17 |
-
import
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
24 |
# %% load data
|
25 |
-
movie_df = pd.read_csv(
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
links_df = pd.read_csv(
|
30 |
-
'https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/links.csv')
|
31 |
-
tags_df = pd.read_csv(
|
32 |
-
'https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/tags.csv')
|
33 |
# %% format dataframes
|
34 |
# MOVIE DF:
|
35 |
movie_df = (
|
36 |
movie_df
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
# convert the year column to int
|
41 |
movie_df['year'] = movie_df['year'].astype(int)
|
42 |
|
@@ -56,7 +51,6 @@ genre_list.insert(0, 'Any')
|
|
56 |
|
57 |
year_list = list(set(list(movie_df['year'])))[1:]
|
58 |
|
59 |
-
|
60 |
# create a list of movies
|
61 |
movie_list = list(set(list(movie_df['title'])))
|
62 |
|
@@ -75,22 +69,20 @@ def make_pretty(styler):
|
|
75 |
return styler
|
76 |
|
77 |
# popularity based
|
78 |
-
|
79 |
-
|
80 |
def popular_n_movies(n, genre):
|
81 |
popular_n = (
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
)['movieId'].to_list()
|
91 |
-
result = movie_df.loc[lambda df_: df_['movieId'].isin(popular_n)]
|
92 |
if genre != 'Any':
|
93 |
-
|
94 |
df_rec = result.head(n).reset_index(drop=True)
|
95 |
df_rec = df_rec[['title', 'genres', 'year']].reset_index(drop=True)
|
96 |
new_index = ['movie-{}'.format(i+1) for i in range(n)]
|
@@ -99,8 +91,6 @@ def popular_n_movies(n, genre):
|
|
99 |
return pretty_rec
|
100 |
|
101 |
# movie/item based
|
102 |
-
|
103 |
-
|
104 |
def item_n_movies(movie_name, n):
|
105 |
min_rate_count = 10
|
106 |
movieId = list(movie_df[movie_df['title'] == movie_name].movieId.head(1))[0]
|
@@ -110,7 +100,7 @@ def item_n_movies(movie_name, n):
|
|
110 |
columns='movieId')
|
111 |
|
112 |
movie_ratings = movies_crosstab[movieId]
|
113 |
-
movie_ratings = movie_ratings[movie_ratings
|
114 |
|
115 |
# evaluating similarity
|
116 |
similar_to_movie = movies_crosstab.corrwith(movie_ratings)
|
@@ -120,10 +110,9 @@ def item_n_movies(movie_name, n):
|
|
120 |
rating = pd.DataFrame(rating_df.groupby('movieId')['rating'].mean())
|
121 |
rating['rating_count'] = rating_df.groupby('movieId')['rating'].count()
|
122 |
movie_corr_summary = corr_movie.join(rating['rating_count'])
|
123 |
-
movie_corr_summary.drop(movieId, inplace=True)
|
124 |
|
125 |
-
top_n = movie_corr_summary[movie_corr_summary['rating_count'] >=
|
126 |
-
min_rate_count].sort_values('PearsonR', ascending=False).head(n)
|
127 |
top_n = top_n.merge(movie_df, left_index=True, right_on="movieId")
|
128 |
top_n = top_n[['title', 'genres']].reset_index(drop=True)
|
129 |
new_index = ['movie-{}'.format(i+1) for i in range(n)]
|
@@ -132,24 +121,20 @@ def item_n_movies(movie_name, n):
|
|
132 |
return pretty_rec
|
133 |
|
134 |
# user based
|
135 |
-
|
136 |
-
|
137 |
def user_n_movies(user_id, n):
|
138 |
users_items = pd.pivot_table(data=rating_df,
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
|
143 |
users_items.fillna(0, inplace=True)
|
144 |
|
145 |
user_similarities = pd.DataFrame(cosine_similarity(users_items),
|
146 |
-
|
147 |
-
|
148 |
|
149 |
-
weights = (user_similarities.query("userId!=@user_id")
|
150 |
-
|
151 |
-
not_seen_movies = users_items.loc[users_items.index !=
|
152 |
-
user_id, users_items.loc[user_id, :] == 0]
|
153 |
weighted_averages = pd.DataFrame(not_seen_movies.T.dot(weights), columns=["predicted_rating"])
|
154 |
recommendations = weighted_averages.merge(movie_df, left_index=True, right_on="movieId")
|
155 |
top_recommendations = recommendations.sort_values("predicted_rating", ascending=False).head(n)
|
@@ -159,10 +144,53 @@ def user_n_movies(user_id, n):
|
|
159 |
pretty_rec = top_recommendations.style.pipe(make_pretty)
|
160 |
return pretty_rec
|
161 |
|
|
|
|
|
|
|
|
|
162 |
|
163 |
-
|
164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
# %% STREAMLIT
|
167 |
# Set configuration
|
168 |
st.set_page_config(page_title="WBSFLIX",
|
@@ -172,8 +200,8 @@ st.set_page_config(page_title="WBSFLIX",
|
|
172 |
)
|
173 |
|
174 |
# set colors: these have to be set in the settings menu online
|
175 |
-
# primary color: #FF4B4B, background color:#0E1117
|
176 |
-
# text color: #FAFAFA, secondary background color: #E50914
|
177 |
|
178 |
# Set the logo of app
|
179 |
st.sidebar.image("wbs_logo.png",
|
@@ -188,12 +216,12 @@ st.sidebar.markdown("""
|
|
188 |
st.sidebar.markdown("""
|
189 |
### How may we help you?
|
190 |
"""
|
191 |
-
|
192 |
# Popularity based recommender system
|
193 |
genre_default = None
|
194 |
pop_based_rec = st.sidebar.checkbox("Show me the all time favourites",
|
195 |
-
|
196 |
-
|
197 |
|
198 |
|
199 |
if pop_based_rec:
|
@@ -202,22 +230,24 @@ if pop_based_rec:
|
|
202 |
with st.form(key="pop_form"):
|
203 |
genre_default = ['Any']
|
204 |
genre = st.multiselect(
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
|
210 |
nr_rec = st.slider("Number of recommendations",
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
|
|
218 |
|
219 |
submit_button_pop = st.form_submit_button(label="Submit")
|
220 |
|
|
|
221 |
if submit_button_pop:
|
222 |
popular_movie_recs = popular_n_movies(nr_rec, genre[0])
|
223 |
st.table(popular_movie_recs)
|
@@ -228,8 +258,8 @@ st.write("")
|
|
228 |
st.write("")
|
229 |
|
230 |
item_based_rec = st.sidebar.checkbox("Show me a movie like this",
|
231 |
-
|
232 |
-
|
233 |
|
234 |
if item_based_rec:
|
235 |
st.markdown("### Tell us a movie you like:")
|
@@ -265,8 +295,8 @@ st.write("")
|
|
265 |
st.write("")
|
266 |
|
267 |
user_based_rec = st.sidebar.checkbox("I want to get personalized recommendations",
|
268 |
-
|
269 |
-
|
270 |
|
271 |
if user_based_rec:
|
272 |
st.markdown("### Please login to get customized recommendations just for you")
|
@@ -275,12 +305,13 @@ if user_based_rec:
|
|
275 |
|
276 |
user_id = st.number_input("Please enter your user id", step=1,
|
277 |
min_value=1)
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
|
|
284 |
|
285 |
nr_rec = st.slider("Number of recommendations",
|
286 |
min_value=1,
|
@@ -291,8 +322,16 @@ if user_based_rec:
|
|
291 |
help="How many movie recommendations would you like to receive?",
|
292 |
)
|
293 |
|
|
|
|
|
|
|
|
|
|
|
294 |
submit_button_user = st.form_submit_button(label="Submit")
|
295 |
|
296 |
if submit_button_user:
|
297 |
-
user_movie_recs = user_n_movies(user_id, nr_rec)
|
298 |
-
|
|
|
|
|
|
|
|
5 |
|
6 |
@author: ilkayisik
|
7 |
Streamlit app for user based movie recommendations
|
8 |
+
Changes to the first version:
|
9 |
+
1. put all the widgets to the sidebar
|
10 |
+
2. add the time period option in the user id based recommendation
|
11 |
"""
|
12 |
# imports
|
|
|
|
|
|
|
|
|
|
|
13 |
import streamlit as st
|
14 |
+
import pandas as pd
|
15 |
+
import numpy as np
|
16 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
17 |
+
from PIL import Image
|
18 |
+
from datetime import datetime
|
19 |
+
import os
|
20 |
+
abspath = os.path.abspath(__file__)
|
21 |
+
dname = os.path.dirname(abspath)
|
22 |
+
os.chdir(dname)
|
23 |
# %% load data
|
24 |
+
movie_df = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/movies.csv')
|
25 |
+
rating_df = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/ratings.csv')
|
26 |
+
links_df = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/links.csv')
|
27 |
+
tags_df = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/tags.csv')
|
|
|
|
|
|
|
|
|
28 |
# %% format dataframes
|
29 |
# MOVIE DF:
|
30 |
movie_df = (
|
31 |
movie_df
|
32 |
+
.assign(year=lambda df_ : df_['title'].replace(r'(.*)\((\d{4})\)', r'\2', regex= True))
|
33 |
+
# replace with 0 if there is no year
|
34 |
+
.assign(year=lambda df_ : np.where(df_['year'].str.len() <=5 , df_['year'], 0)))
|
35 |
# convert the year column to int
|
36 |
movie_df['year'] = movie_df['year'].astype(int)
|
37 |
|
|
|
51 |
|
52 |
year_list = list(set(list(movie_df['year'])))[1:]
|
53 |
|
|
|
54 |
# create a list of movies
|
55 |
movie_list = list(set(list(movie_df['title'])))
|
56 |
|
|
|
69 |
return styler
|
70 |
|
71 |
# popularity based
|
|
|
|
|
72 |
def popular_n_movies(n, genre):
|
73 |
popular_n = (
|
74 |
+
rating_df
|
75 |
+
.groupby(by='movieId')
|
76 |
+
.agg(rating_mean=('rating', 'mean'),
|
77 |
+
rating_count=('movieId', 'count'),
|
78 |
+
datetime=('datetime','mean'))
|
79 |
+
.sort_values(['rating_mean','rating_count','datetime'], ascending= False)
|
80 |
+
.loc[lambda df_ :df_['rating_count'] >= (df_['rating_count'].mean() + df_['rating_count'].median())/2]
|
81 |
+
.reset_index()
|
82 |
)['movieId'].to_list()
|
83 |
+
result = movie_df.loc[lambda df_ : df_['movieId'].isin(popular_n)]
|
84 |
if genre != 'Any':
|
85 |
+
result = result.loc[lambda df_ : df_['genres'].str.contains(genre)]
|
86 |
df_rec = result.head(n).reset_index(drop=True)
|
87 |
df_rec = df_rec[['title', 'genres', 'year']].reset_index(drop=True)
|
88 |
new_index = ['movie-{}'.format(i+1) for i in range(n)]
|
|
|
91 |
return pretty_rec
|
92 |
|
93 |
# movie/item based
|
|
|
|
|
94 |
def item_n_movies(movie_name, n):
|
95 |
min_rate_count = 10
|
96 |
movieId = list(movie_df[movie_df['title'] == movie_name].movieId.head(1))[0]
|
|
|
100 |
columns='movieId')
|
101 |
|
102 |
movie_ratings = movies_crosstab[movieId]
|
103 |
+
movie_ratings = movie_ratings[movie_ratings>=0] # exclude NaNs
|
104 |
|
105 |
# evaluating similarity
|
106 |
similar_to_movie = movies_crosstab.corrwith(movie_ratings)
|
|
|
110 |
rating = pd.DataFrame(rating_df.groupby('movieId')['rating'].mean())
|
111 |
rating['rating_count'] = rating_df.groupby('movieId')['rating'].count()
|
112 |
movie_corr_summary = corr_movie.join(rating['rating_count'])
|
113 |
+
movie_corr_summary.drop(movieId, inplace=True) # drop forrest gump itself
|
114 |
|
115 |
+
top_n = movie_corr_summary[movie_corr_summary['rating_count'] >= min_rate_count].sort_values('PearsonR', ascending=False).head(n)
|
|
|
116 |
top_n = top_n.merge(movie_df, left_index=True, right_on="movieId")
|
117 |
top_n = top_n[['title', 'genres']].reset_index(drop=True)
|
118 |
new_index = ['movie-{}'.format(i+1) for i in range(n)]
|
|
|
121 |
return pretty_rec
|
122 |
|
123 |
# user based
|
|
|
|
|
124 |
def user_n_movies(user_id, n):
|
125 |
users_items = pd.pivot_table(data=rating_df,
|
126 |
+
values='rating',
|
127 |
+
index='userId',
|
128 |
+
columns='movieId')
|
129 |
|
130 |
users_items.fillna(0, inplace=True)
|
131 |
|
132 |
user_similarities = pd.DataFrame(cosine_similarity(users_items),
|
133 |
+
columns=users_items.index,
|
134 |
+
index=users_items.index)
|
135 |
|
136 |
+
weights = (user_similarities.query("userId!=@user_id")[user_id] / sum(user_similarities.query("userId!=@user_id")[user_id]))
|
137 |
+
not_seen_movies = users_items.loc[users_items.index!=user_id, users_items.loc[user_id,:]==0]
|
|
|
|
|
138 |
weighted_averages = pd.DataFrame(not_seen_movies.T.dot(weights), columns=["predicted_rating"])
|
139 |
recommendations = weighted_averages.merge(movie_df, left_index=True, right_on="movieId")
|
140 |
top_recommendations = recommendations.sort_values("predicted_rating", ascending=False).head(n)
|
|
|
144 |
pretty_rec = top_recommendations.style.pipe(make_pretty)
|
145 |
return pretty_rec
|
146 |
|
147 |
+
# user based, filtered by genre and release-year range
|
148 |
+
def _similarity_to_user(users_items, user_id):
    """Return the cosine similarity of every *other* user to ``user_id``.

    ``users_items`` is a user x movie rating matrix indexed by userId; the
    result is a Series indexed by userId with ``user_id`` itself excluded.
    """
    similarities = pd.DataFrame(cosine_similarity(users_items),
                                columns=users_items.index,
                                index=users_items.index)
    return similarities.loc[similarities.index != user_id, user_id]


def top_n_user_based(user_id, n, genres, time_period):
    """Recommend up to ``n`` movies the user has not rated yet.

    Ratings are predicted as a similarity-weighted average over the (at
    most) 100 users most similar to ``user_id``; the candidates are then
    restricted to a release-year range and, optionally, to some genres.

    Parameters
    ----------
    user_id : int
        Id looked up in the ``userId`` column of ``rating_df``.
    n : int
        Maximum number of rows to return.
    genres : list of str
        Keep movies whose ``genres`` string contains at least one of these.
        An empty list, or a list containing ``'Any'``, disables the filter.
    time_period : tuple of (int, int)
        Inclusive ``(first_year, last_year)`` bounds on the ``year`` column.

    Returns
    -------
    pandas.DataFrame
        The ``movie_df`` columns of the recommended movies, best prediction
        first. An empty frame with columns ``movieId, title, genres, year``
        is returned when the user is unknown or has no similar users.
    """
    no_recommendations = pd.DataFrame(
        columns=['movieId', 'title', 'genres', 'year'])

    # `x in series` tests the index labels, not the values, so the user id
    # has to be compared against the column contents explicitly.
    if not rating_df['userId'].eq(user_id).any():
        return no_recommendations

    # Pass 1: rank all users by similarity and keep the 100 closest.
    users_items = pd.pivot_table(data=rating_df,
                                 values='rating',
                                 index='userId',
                                 columns='movieId').fillna(0)
    similarities = _similarity_to_user(users_items, user_id)
    new_userids = (similarities.sort_values(ascending=False)
                   .head(100).index.tolist())
    new_userids.append(user_id)

    # Pass 2: recompute the similarities on the reduced neighbourhood only.
    new_ratings = rating_df.loc[rating_df['userId'].isin(new_userids)]
    new_users_items = pd.pivot_table(data=new_ratings,
                                     values='rating',
                                     index='userId',
                                     columns='movieId').fillna(0)
    new_similarities = _similarity_to_user(new_users_items, user_id)
    total_similarity = new_similarities.sum()
    if total_similarity == 0:
        # Nobody shares a rated movie with this user; normalising would
        # divide by zero and produce NaN predictions.
        return no_recommendations
    new_weights = new_similarities / total_similarity

    # Movies the user has not rated (0 after fillna), rated by the others.
    is_other_user = new_users_items.index != user_id
    is_unrated = new_users_items.loc[user_id, :] == 0
    not_watched_movies = new_users_items.loc[is_other_user, is_unrated]

    weighted_averages = (not_watched_movies.T.dot(new_weights)
                         .to_frame('predicted_rating'))
    recommendations = (
        weighted_averages
        .merge(movie_df, left_index=True, right_on="movieId")
        .sort_values("predicted_rating", ascending=False)
    )
    in_period = recommendations['year'].between(time_period[0],
                                                time_period[1])
    recommendations = recommendations.loc[in_period]

    # 'Any' means "no genre filter", as in popular_n_movies.
    if len(genres) > 0 and 'Any' not in genres:
        matches_genre = pd.Series(False, index=recommendations.index)
        for genre in genres:
            # regex=False: genre labels are literal text, not patterns
            # (e.g. "(no genres listed)" contains regex metacharacters).
            matches_genre |= recommendations['genres'].str.contains(
                genre, regex=False, na=False)
        recommendations = recommendations.loc[matches_genre]

    return (recommendations
            .drop(columns=['predicted_rating'])
            .head(n)
            .reset_index(drop=True))
|
194 |
# %% STREAMLIT
|
195 |
# Set configuration
|
196 |
st.set_page_config(page_title="WBSFLIX",
|
|
|
200 |
)
|
201 |
|
202 |
# set colors: these have to be set in the settings menu online
|
203 |
+
# primary color: #FF4B4B, background color:#0E1117
|
204 |
+
# text color: #FAFAFA, secondary background color: #E50914
|
205 |
|
206 |
# Set the logo of app
|
207 |
st.sidebar.image("wbs_logo.png",
|
|
|
216 |
st.sidebar.markdown("""
|
217 |
### How may we help you?
|
218 |
"""
|
219 |
+
)
|
220 |
# Popularity based recommender system
|
221 |
genre_default = None
|
222 |
pop_based_rec = st.sidebar.checkbox("Show me the all time favourites",
|
223 |
+
False,
|
224 |
+
help="Movies that are liked by many people")
|
225 |
|
226 |
|
227 |
if pop_based_rec:
|
|
|
230 |
with st.form(key="pop_form"):
|
231 |
genre_default = ['Any']
|
232 |
genre = st.multiselect(
|
233 |
+
"Genre",
|
234 |
+
options=genre_list,
|
235 |
+
help="Select the genre of the movie you would like to watch",
|
236 |
+
default=genre_default)
|
237 |
|
238 |
nr_rec = st.slider("Number of recommendations",
|
239 |
+
min_value=1,
|
240 |
+
max_value=20,
|
241 |
+
value=5,
|
242 |
+
step=1,
|
243 |
+
key="n",
|
244 |
+
help="How many movie recommendations would you like to receive?",
|
245 |
+
)
|
246 |
+
|
247 |
|
248 |
submit_button_pop = st.form_submit_button(label="Submit")
|
249 |
|
250 |
+
|
251 |
if submit_button_pop:
|
252 |
popular_movie_recs = popular_n_movies(nr_rec, genre[0])
|
253 |
st.table(popular_movie_recs)
|
|
|
258 |
st.write("")
|
259 |
|
260 |
item_based_rec = st.sidebar.checkbox("Show me a movie like this",
|
261 |
+
False,
|
262 |
+
help="Input some movies and we will show you similar ones")
|
263 |
|
264 |
if item_based_rec:
|
265 |
st.markdown("### Tell us a movie you like:")
|
|
|
295 |
st.write("")
|
296 |
|
297 |
user_based_rec = st.sidebar.checkbox("I want to get personalized recommendations",
|
298 |
+
False,
|
299 |
+
help="Login to get personalized recommendations")
|
300 |
|
301 |
if user_based_rec:
|
302 |
st.markdown("### Please login to get customized recommendations just for you")
|
|
|
305 |
|
306 |
user_id = st.number_input("Please enter your user id", step=1,
|
307 |
min_value=1)
|
308 |
+
genre_default = ['Any']
|
309 |
+
genre = st.multiselect(
|
310 |
+
"Genre",
|
311 |
+
options=genre_list,
|
312 |
+
help="Select the genre of the movie you would like to watch",
|
313 |
+
#default=genre_default
|
314 |
+
)
|
315 |
|
316 |
nr_rec = st.slider("Number of recommendations",
|
317 |
min_value=1,
|
|
|
322 |
help="How many movie recommendations would you like to receive?",
|
323 |
)
|
324 |
|
325 |
+
time_period = st.slider('years:', min_value=1900,
|
326 |
+
max_value=2018,
|
327 |
+
value=(2010,2018),
|
328 |
+
step=1)
|
329 |
+
|
330 |
submit_button_user = st.form_submit_button(label="Submit")
|
331 |
|
332 |
if submit_button_user:
|
333 |
+
# user_movie_recs = user_n_movies(user_id, nr_rec)
|
334 |
+
user_movie_recs = top_n_user_based(user_id, nr_rec, genre, time_period)
|
335 |
+
|
336 |
+
# st.write(time_period)
|
337 |
+
st.table(user_movie_recs[['title', 'genres']].style.pipe(make_pretty))
|