aminian committed on
Commit
b6c882b
1 Parent(s): 3a862f4

Add project

Files changed (2)
  1. ML_Final_Project.ipynb +0 -0
  2. ml_final_project.py +410 -0
ML_Final_Project.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
ml_final_project.py ADDED
@@ -0,0 +1,410 @@
+ # -*- coding: utf-8 -*-
+ """ML Final Project
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1Aof3bcIIqSmvsh0cux6wZ5NPk1wY-l3D
+
+ ### install dependencies
+ """
+
+ !gdown "1W3-WEplVSztLR3lvkyYdiKZGMT4y0cNi&confirm=t"
+
+ #!unzip IMDB.zip
+
+ #!pip install mlflow
+
+ """# Content-based filtering
+
+ ### import libraries
+ """
+
+ import numpy as np
+ import pandas as pd
+ import mlflow as mf
+
+ #mf.log_artifacts({'rating':'/content/rating_small.csv', 'movies':'/content/movies_metadata.csv', 'keywords':'/content/keywords.csv', 'credits':'/content/credits.csv'})
+
+ """### read data from file"""
+
+ keywords = pd.read_csv('/content/IMDB/keywords.csv')
+ keywords
+
+ rating = pd.read_csv('/content/IMDB/ratings_small.csv')
+ rating
+
+ credits = pd.read_csv('/content/IMDB/credits.csv')
+ credits
+
+ metadata = pd.read_csv('/content/IMDB/movies_metadata.csv')
+ metadata
+
+ """keep only the relevant columns from released movies:"""
+
+ metadata = metadata[metadata['status'] == 'Released']
+ cols = np.array(['adult', 'belongs_to_collection', 'genres', 'id', 'original_language', 'title', 'production_countries', 'production_companies', 'video'])
+ metadata = metadata[cols]
+
+ metadata.iloc[1]
+
+ def find_collection(x):
+     if x == '':
+         return ''
+     return eval(str(x))['name']
+
+ metadata['belongs_to_collection'] = metadata['belongs_to_collection'].fillna('')
+ metadata['belongs_to_collection'] = metadata['belongs_to_collection'].apply(find_collection)
+ metadata.iloc[1]
+
+ def find_names(x):
+     if x == '':
+         return ''
+     genre_arr = eval(str(x))
+     return ','.join(i['name'] for i in genre_arr)
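+
+ # Illustrative only: cells in these list-valued columns are stringified lists of dicts;
+ # the value below is typical of 'genres', and find_names keeps just the comma-joined names.
+ find_names("[{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}]")
+ # -> 'Action,Adventure'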
+
+ metadata['genres'] = metadata['genres'].fillna('')
+ metadata['genres'] = metadata['genres'].apply(find_names)
+ metadata['production_countries'] = metadata['production_countries'].apply(find_names)
+ metadata['production_companies'] = metadata['production_companies'].apply(find_names)
+ credits['cast'] = credits['cast'].apply(find_names)
+ metadata.iloc[1]
+
+ keywords['keywords'] = keywords['keywords'].apply(find_names)
+ metadata['id'] = metadata['id'].astype(int)
+ metadata = pd.merge(metadata, keywords, how='inner', on='id')
+ metadata.iloc[1]
+
+ def to_int(x):
+     if x == 'True':
+         return 1
+     return 0
+
+ metadata['adult'].unique()
+
+ """There are three values other than True or False in the adult column. They were entered by mistake, so we remove those rows."""
+
+ metadata = metadata[(metadata['adult'] == 'True') | (metadata['adult'] == 'False')]
+ metadata['adult'] = metadata['adult'].apply(to_int)
+ metadata['video'].unique()
+
+ """removing NaN values from the dataset and replacing 'True' and 'False' with 1 and 0:"""
+
+ metadata = metadata[~metadata['video'].isna()]
+ metadata['video'] = metadata['video'].apply(to_int)
+
+ """## Vectorize string features"""
+
+ metadata
+
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+ def my_tok(text):
+     return text.split(",")
+
+ def vectorize_string(col_name, feature_name, limit=None, df=metadata):
+     vectorizer = CountVectorizer(tokenizer=my_tok, max_features=limit, min_df=2)
+     X = vectorizer.fit_transform(df[col_name])
+     vec_cols = vectorizer.get_feature_names_out()
+     vec_data = X.toarray()
+     #vec_cols = np.char.add(feature_name+':', vec_cols)
+     vec_cols = feature_name+':'+vec_cols
+     return vec_data, vec_cols
+
+ def tfidf(col_name, feature_name, limit=None, df=metadata):
+     vectorizer = TfidfVectorizer(tokenizer=my_tok, max_features=limit, min_df=2)
+     X = vectorizer.fit_transform(df[col_name])
+     vec_cols = vectorizer.get_feature_names_out()
+     vec_data = X.toarray()
+     #vec_cols = np.char.add(feature_name+':', vec_cols)
+     vec_cols = feature_name+':'+vec_cols
+     return vec_data, vec_cols
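+
+ # Quick sanity sketch on toy strings (not project data): with the comma tokenizer each
+ # comma-separated name is a single token, so multi-word names stay intact, and min_df=2
+ # drops tokens that appear in fewer than two rows.
+ toy_vec = CountVectorizer(tokenizer=my_tok, min_df=2)
+ toy_X = toy_vec.fit_transform(['Action,Science Fiction', 'Action,Comedy', 'Comedy,Action'])
+ toy_vec.get_feature_names_out()  # array(['action', 'comedy'], ...) -- 'science fiction' removed by min_df
+ toy_X.toarray()                  # one count column per kept token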
+
+ genre_data, genre_cols = vectorize_string('genres', 'genre')
+ genre_cols
+
+ companies_data, companies_cols = vectorize_string('production_companies', 'company', 100)
+ companies_cols
+
+ countries_data, countries_cols = vectorize_string('production_countries', 'country')
+ countries_cols
+
+ collection_data, collection_cols = vectorize_string('belongs_to_collection', 'collection')
+ collection_cols
+
+ metadata['original_language'] = metadata['original_language'].fillna('')
+ lang_data, lang_cols = vectorize_string('original_language', 'lang')
+ lang_cols
+
+ collection_cols.shape
+
+ keyword_data, keyword_cols = tfidf('keywords', 'keyword', 1000)
+ keyword_cols
+
+ credits.drop(columns=['crew'], inplace=True)
+ credit_data, credit_cols = vectorize_string('cast', 'cast', 1000, df=credits)
+ credit_cols
+
+ metadata = metadata.reset_index(drop=True)  # row positions must line up with the vectorized arrays below
+ metadata = pd.concat([metadata[['title', 'id', 'adult', 'video']],
+                       pd.DataFrame(genre_data, columns=genre_cols),
+                       pd.DataFrame(countries_data, columns=countries_cols),
+                       pd.DataFrame(collection_data, columns=collection_cols),
+                       pd.DataFrame(keyword_data, columns=keyword_cols),
+                       pd.DataFrame(companies_data, columns=companies_cols),
+                       pd.DataFrame(lang_data, columns=lang_cols)], axis=1)
+
+ credits[credit_cols] = credit_data
+ metadata = pd.merge(metadata, credits, how='inner', on='id')
+ metadata
+
+ #metadata.drop(['production_countries', 'genres', 'belongs_to_collection', 'keywords', 'production_companies', 'original_language'], axis=1, inplace=True)
+
+ """list of all numerical features (everything except id and title)"""
+
+ feature_cols = np.concatenate((np.array(['adult', 'video']), genre_cols, countries_cols, collection_cols, keyword_cols, companies_cols, lang_cols, credit_cols))
+ feature_cols
+ #metadata[feature_cols] = metadata[feature_cols].astype('int8')
+
+ del genre_data, countries_data, collection_data, keyword_data, companies_data, lang_data, credit_data
+ del genre_cols, countries_cols, collection_cols, keyword_cols, companies_cols, lang_cols, credit_cols
+
+ feature_cols.shape
+
+ metadata
+
+ def split_dataframe(df, holdout_fraction=0.1):
+     test = df.sample(frac=holdout_fraction, replace=False)
+     train = df[~df.index.isin(test.index)]
+     return train, test
+
+ train, test = split_dataframe(metadata)
+
+ allIds = metadata['id']
+
+ number_of_batches = 4
+ batches = np.array_split(train, number_of_batches)
+ mf.log_param('number of batches', number_of_batches)
+ del metadata
+ del train
+
+ """## Algorithm
+
+ """
+
+ batches[0]
+
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ """`content_based_recommender` returns a list of movie ids based on its input. The input should be a dataframe that has `movieId` and `rating` columns (like `ratings_small.csv` but without `userId`)."""
+
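+ # Illustrative input only (hypothetical ids and ratings): one row per movie the user has
+ # rated, in the same layout as ratings_small.csv minus the userId column.
+ example_user = pd.DataFrame({'movieId': [862, 8844], 'rating': [5.0, 3.0]})
+ # content_based_recommender(example_user, batches[0])  # defined below; scores every movie in the batch
+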
+ number_of_batches = 1
+ def content_based_recommender_movie(movieId):
+     print("movie title is:", metadata[metadata['id'] == movieId])
+     sim_mat = cosine_similarity(metadata[feature_cols])
+     return sim_mat
+
+ #content_based_recommender_movie(272)
+
+ batches[1].describe()
+
+ from sklearn.metrics.pairwise import euclidean_distances as dist
+ def content_based_recommender(user, df, k=10, movieIds=allIds):
+     user_movies = pd.merge(user, df, how='inner', left_on='movieId', right_on='id')
+     user_movies[feature_cols] = user_movies[feature_cols].multiply(user_movies['rating'], axis="index")
+     mean_user_movies = user_movies[feature_cols].mean(axis=0)
+     sim_mat = cosine_similarity(df[feature_cols][df.id.isin(movieIds)], mean_user_movies[feature_cols].values.reshape(1, -1))
+     temp_data = {'id': df['id'][df.id.isin(movieIds)], 'title': df['title'][df.id.isin(movieIds)], 'sim': sim_mat.flatten()}
+     return pd.DataFrame(temp_data)
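+
+ # Sketch of the idea above on toy numbers (assumption: two binary feature columns): each
+ # rated movie's feature vector is weighted by its rating, the weighted vectors are averaged
+ # into one user profile, and candidate movies are ranked by cosine similarity to that profile.
+ toy_feats = np.array([[1, 0], [1, 1]])                              # two movies the user rated
+ toy_profile = (toy_feats * np.array([[5.0], [1.0]])).mean(axis=0)   # rating-weighted mean -> [3. , 0.5]
+ cosine_similarity(np.array([[1, 0], [0, 1]]), toy_profile.reshape(1, -1))
+ # the first candidate, which matches the heavily weighted feature, scores higher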
+
+ def content_based_all_batches(user, k=10, movieIds=allIds):
+     ans = content_based_recommender(user, batches[0], k, movieIds)
+     for i in range(1, number_of_batches):
+         ans = pd.concat([ans, content_based_recommender(user, batches[i], k, movieIds)], ignore_index=True)
+     return ans.sort_values(by='sim', ascending=False)
+
+
+ content_based_k = 10
+ mf.log_param('content based k', content_based_k)
+ #xx = content_based_recommender(rating[rating['userId'] == 1], batches[1], content_based_k)
+ xx = content_based_all_batches(rating[rating['userId'] == 1], content_based_k)
+ xx.shape
+
+ """# Collaborative Filtering
+
+ ### import libraries
+ """
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.utils.extmath import randomized_svd
+
+ """### explore datasets"""
+
+ rating = pd.read_csv('/content/IMDB/ratings_small.csv')
+ rating.head()
+
+ rating.shape
+
+ links_small = pd.read_csv('/content/IMDB/links_small.csv')
+ links_small.head()
+
+ credits = pd.read_csv('/content/IMDB/credits.csv')
+ credits.head()
+
+ movie = pd.read_csv('/content/IMDB/movies_metadata.csv')
+ movie.head()
+
+ movie = movie.rename(columns={'id': 'movieId'})
+
+ movie.shape
+
+ movie.head()
+
+ """### data preprocessing
+
+ There are three rows whose movieId was entered by mistake (dates instead of ids), so we remove those rows.
+ """
+
+ movie = movie[(movie['movieId']!='1997-08-20') & (movie['movieId']!='2012-09-29') & (movie['movieId']!='2014-01-01')]
+
+ def find_names(x):
+     if x == '':
+         return ''
+     genre_arr = eval(str(x))
+     return ','.join(i['name'] for i in genre_arr)
+
+ movie['genres'] = movie['genres'].fillna('')
+
+ movie['genres'] = movie['genres'].apply(find_names)
+
+ movie.movieId = movie.movieId.astype("uint64")
+
+ """only keep ratings for movies that have metadata in the movie dataset"""
+
+ new_rating = pd.merge(rating, movie, how='inner', on=["movieId"])
+
+ new_rating = new_rating[["userId", "movieId", "rating"]]
+
+ movie.head()
+
+ new_rating.head()
+
+ train, test = split_dataframe(new_rating)
+
+ """### matrix factorization"""
+
+ inter_mat_df = rating.pivot(index='userId', columns='movieId', values='rating').fillna(0)
+ inter_mat_df
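+
+ # Toy illustration (made-up ratings, not project data) of what the pivot builds:
+ # one row per user, one column per movie, unrated entries filled with 0.
+ toy_ratings = pd.DataFrame({'userId': [1, 1, 2], 'movieId': [10, 20, 10], 'rating': [4.0, 3.0, 5.0]})
+ toy_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
+ # movieId   10   20
+ # userId
+ # 1        4.0  3.0
+ # 2        5.0  0.0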
+
+ inter_mat = inter_mat_df.to_numpy()
+
+ ratings_mean = np.mean(inter_mat, axis=1)
+ inter_mat_normal = inter_mat - ratings_mean.reshape(-1, 1)
+
+ inter_mat_normal
+
+ """We use singular value decomposition for matrix factorization."""
+
+ svd_U, svd_sigma, svd_V = randomized_svd(inter_mat_normal,
+                                          n_components=15,
+                                          n_iter=5,
+                                          random_state=47)
+
+ """np.diag turns the vector of singular values into a diagonal matrix."""
+
+ svd_sigma = np.diag(svd_sigma)
+
+ """Making predictions"""
+
+ rating_weights = np.dot(np.dot(svd_U, svd_sigma), svd_V) + ratings_mean.reshape(-1, 1)
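+
+ # Sanity sketch (toy matrix, illustration only): the predictions above are the truncated
+ # reconstruction U @ diag(sigma) @ Vt, shifted back by each user's mean rating. A 2x3
+ # matrix has rank at most 2, so a rank-2 reconstruction recovers it exactly:
+ toy_mat = np.array([[5.0, 3.0, 0.0], [4.0, 0.0, 1.0]])
+ toy_U, toy_s, toy_Vt = randomized_svd(toy_mat, n_components=2, random_state=0)
+ np.allclose(toy_U @ np.diag(toy_s) @ toy_Vt, toy_mat)  # True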
+
+ weights_df = pd.DataFrame(rating_weights, columns=inter_mat_df.columns)
+
+ weights_df.head()
+
+ """making recommendations"""
+
+ def recommend_top_k(preds_df, ratings_df, movie, userId, k=10):
+     user_row = userId - 1  # userIds are contiguous and 1-based, so the user's row index is userId - 1
+     sorted_user_predictions = preds_df.iloc[user_row].sort_values(ascending=False)
+     user_data = ratings_df[ratings_df.userId == userId]
+     user_rated = user_data.merge(movie, how='left', left_on='movieId', right_on='movieId'). \
+         sort_values(['rating'], ascending=False)
+     user_preds = movie.merge(pd.DataFrame(sorted_user_predictions).reset_index(), how='left',
+                              on='movieId').rename(columns={user_row: 'prediction'}). \
+         sort_values('prediction', ascending=False). \
+         iloc[:k, :]
+     return user_rated, user_preds
+
+ collaborative_k = 100
+ user_rated, user_preds = recommend_top_k(weights_df, new_rating, movie, 220, collaborative_k)
+ mf.log_param('collaborative k', collaborative_k)
+
+ user_preds.head()
+
+ user_rated.head()
+
+ user_rated[["title", "genres"]].head(10)
+
+ user_preds[["title", "genres"]].head(10)
+
+ """# Ensemble Model"""
+
+ def ensemble(userId, k=10):
+     # take the top k*k collaborative-filtering candidates and re-rank them with the content-based score
+     user_rated, user_preds = recommend_top_k(weights_df, new_rating, movie, userId, k*k)
+     content_based_result = content_based_all_batches(rating[rating['userId'] == userId], k=k, movieIds=user_preds['movieId'])
+     return content_based_result[['id', 'title']]
+
+ ensemble_k = 10
+ mf.log_param('ensemble k', ensemble_k)
+ ensemble(220, ensemble_k)
+
+ """# Evaluation"""
+
+ df_res = user_preds[["movieId", "prediction"]]. \
+     merge(user_rated[["movieId", "rating"]], how='outer', on='movieId')
+
+ df_res.sort_values(by='prediction', ascending=False, inplace=True)
+ df_res
+
+ threshold = 2
+ df_res['prediction'] = df_res['prediction'] >= threshold
+ df_res['rating'] = df_res['rating'] >= threshold
+ df_res
+
+ def precision_at_k(df, k=10, y_test: str = 'rating', y_pred='prediction'):
+     dfK = df.head(k)
+     sum_df = dfK[y_pred].sum()
+     true_pred = dfK[dfK[y_pred] & dfK[y_test]].shape[0]
+     if sum_df > 0:
+         return true_pred/sum_df
+     else:
+         return None
+
+ def recall_at_k(df, k=10, y_test='rating', y_pred='prediction'):
+     dfK = df.head(k)
+     sum_df = df[y_test].sum()
+     true_pred = dfK[dfK[y_pred] & dfK[y_test]].shape[0]
+     if sum_df > 0:
+         return true_pred/sum_df
+     else:
+         return None
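+
+ # Toy check (illustrative only): with boolean columns like those built above,
+ # precision@2 = relevant rows among the top-2 predicted positives = 1/2, and
+ # recall@2 = relevant rows retrieved in the top 2 / all relevant rows = 1/3.
+ toy_eval = pd.DataFrame({'prediction': [True, True, False, True],
+                          'rating':     [True, False, True, True]})
+ print(precision_at_k(toy_eval, k=2))  # 0.5
+ print(recall_at_k(toy_eval, k=2))     # 0.333...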
+
+ prec_at_k = precision_at_k(df_res, 100, y_test='rating', y_pred='prediction')
+ rec_at_k = recall_at_k(df_res, 100, y_test='rating', y_pred='prediction')
+
+ print("precision@k: ", prec_at_k)
+ print("recall@k: ", rec_at_k)
+ mf.log_metric('recall', rec_at_k)
+ mf.log_metric('precision', prec_at_k)
+
+
+
+ """# MLOps"""
+
+ def updata_batch(new_batch):
+     # number_of_batches and batches are module-level state, so declare them global;
+     # batches is a plain list, so append in place instead of rebinding it to None
+     global number_of_batches, batches
+     number_of_batches = number_of_batches + 1
+     batches.append(new_batch)
+     mf.log_param('number of batches', number_of_batches)
+
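+ # Hypothetical usage sketch: a freshly prepared batch of movie rows with the same feature
+ # columns as the training batches could be registered like this.
+ # new_movies = ...  # DataFrame shaped like batches[0]
+ # updata_batch(new_movies)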