Zeynabrz committed on
Commit
127b23c
1 Parent(s): 1be4228

Upload mlflow.py

Browse files
Files changed (1) hide show
  1. mlflow.py +244 -0
mlflow.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import mlflow
2
+ from mlflow import log_metric, log_param, log_artifacts
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+ from ast import literal_eval
8
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
9
+ from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
10
+ from nltk.stem.snowball import SnowballStemmer
11
+ import warnings; warnings.simplefilter('ignore')
12
+
13
+ """# Read datasets"""
14
+ # /Applications/Education/7th Semester/Machine Learning/Me/Project/IMDB/credits.csv
15
def read_data_set(data_dir='IMDB'):
    """Load the five CSV files the recommender works from.

    Args:
        data_dir: Directory that contains ``movies_metadata.csv``,
            ``credits.csv``, ``keywords.csv``, ``links_small.csv`` and
            ``ratings_small.csv``. Defaults to ``'IMDB'``, the relative
            location the script originally hard-coded.

    Returns:
        A 5-tuple ``(md, credits, keywords, links_small, df_rating)`` of
        DataFrames, in that order.
    """
    import os  # local import so this block does not rely on a file-level one

    def _load(file_name):
        # Every table is read with pandas' default CSV settings.
        return pd.read_csv(os.path.join(data_dir, file_name))

    md = _load('movies_metadata.csv')
    credits = _load('credits.csv')
    keywords = _load('keywords.csv')
    links_small = _load('links_small.csv')
    df_rating = _load('ratings_small.csv')
    # The notebook-style ``.head()`` previews were dropped: their results were
    # discarded, and the one after ``return`` could never run.
    return md, credits, keywords, links_small, df_rating
31
+
32
+ """# Data preprocessing"""
33
+
34
def data_preprocessing(md=None, links_small=None):
    """Clean the metadata and links tables and pick the most-voted movies.

    The original body assigned to ``links_small`` and ``md`` before they were
    bound inside the function, so calling it raised ``UnboundLocalError``.
    Both tables are now optional arguments.

    Args:
        md: Movies-metadata DataFrame. Loaded through ``read_data_set()``
            when omitted.
        links_small: Links DataFrame with a ``tmdbId`` column. Loaded through
            ``read_data_set()`` when omitted.

    Returns:
        ``(md, links_small, qualified, C, m)``:
        a cleaned copy of ``md`` (``genres`` as lists of names, plus a
        ``year`` column); the non-null ``tmdbId`` values as an int Series;
        the rows whose ``vote_count`` reaches ``m``; the mean of the
        int-truncated vote averages; and the 0.95 quantile of vote counts.
    """
    if md is None or links_small is None:
        loaded = read_data_set()
        if md is None:
            md = loaded[0]
        if links_small is None:
            links_small = loaded[3]

    md = md.copy()  # do not modify the caller's frame
    links_small = links_small.loc[links_small['tmdbId'].notnull(), 'tmdbId'].astype('int')

    # ``genres`` holds the string form of a list of dicts; keep only the names.
    md['genres'] = (
        md['genres']
        .fillna('[]')
        .apply(literal_eval)
        .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
    )

    vote_counts = md.loc[md['vote_count'].notnull(), 'vote_count'].astype('int')
    vote_averages = md.loc[md['vote_average'].notnull(), 'vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.95)

    # ``x != np.nan`` is always True, so unparseable dates used to become the
    # string 'NaT'. ``pd.notnull`` is the correct missing-value test.
    release = pd.to_datetime(md['release_date'], errors='coerce')
    md['year'] = release.apply(lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan)

    # ``vote_count >= m`` is already False for missing counts.
    keep = (md['vote_count'] >= m) & md['vote_average'].notnull()
    qualified = md.loc[
        keep, ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']
    ].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    return md, links_small, qualified, C, m
48
+
49
def weighted_rating(x, min_votes=None, mean_vote=None):
    """Return the vote-count-weighted rating of one movie row.

    Computes ``v / (v + m) * R + m / (m + v) * C`` where ``v`` is the row's
    ``vote_count`` and ``R`` its ``vote_average``.

    Args:
        x: Mapping or row exposing ``'vote_count'`` and ``'vote_average'``.
        min_votes: Vote threshold ``m``. Falls back to the module-level ``m``
            when omitted, which is what the original function always used.
        mean_vote: Mean vote ``C``. Falls back to the module-level ``C`` when
            omitted.

    Returns:
        The weighted rating as a number.
    """
    if min_votes is None:
        min_votes = m
    if mean_vote is None:
        mean_vote = C
    v = x['vote_count']
    R = x['vote_average']
    return (v / (v + min_votes) * R) + (min_votes / (min_votes + v) * mean_vote)
53
+
54
# One row per (movie, genre): explode the genre lists and drop movies without
# a genre, which is what the previous apply(pd.Series).stack() produced.
s = md['genres'].explode().dropna()
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

# Drop rows whose ``id`` is not numeric instead of relying on three
# hard-coded row labels, then make the id an int for the lookups below.
# NOTE(review): presumably the removed labels (19730, 29503, 35587) were
# exactly the rows with a malformed id; confirm against the dataset.
md['id'] = pd.to_numeric(md['id'], errors='coerce')
md = md.dropna(subset=['id'])
md['id'] = md['id'].astype('int')

# Restrict to the movies listed in ``links_small``. Copy so the column
# assignments below do not write into a slice of ``md``.
smd = md[md['id'].isin(links_small)].copy()

# Description = overview + tagline. Fill both sides first and join with a
# space, so a missing overview no longer discards the tagline and the two
# texts do not run together.
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = (smd['overview'].fillna('') + ' ' + smd['tagline']).str.strip()

# min_df=1 keeps every term, which is what min_df=0 was meant to do; recent
# scikit-learn versions reject the integer 0.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Positional index so that row i of ``cosine_sim`` is row i of ``smd``.
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])
78
+ """#First content based recommendation"""
79
+
80
def get_recommendations(title, top_n=30):
    """Return the titles most similar to ``title`` according to ``cosine_sim``.

    Uses the module-level ``indices`` (title to row position), ``cosine_sim``
    (pairwise similarity matrix) and ``titles``.

    Args:
        title: Movie title to look up in ``indices``.
        top_n: Number of neighbours to return. Defaults to 30, the value that
            was previously hard-coded.

    Returns:
        A Series of titles, most similar first.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several movies share this title; use the first one so that
        # ``cosine_sim[idx]`` stays a single row.
        idx = idx.iloc[0]
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie explicitly instead of assuming it sorts first:
    # another row with the same score could otherwise take its place.
    movie_indices = [row for row, _ in ranked if row != idx][:top_n]
    return titles.iloc[movie_indices]
87
+
88
+ # get_recommendations('The Godfather').head(10)
89
+
90
+ """# Some process on data to have better recommendations"""
91
+
92
# ``read_data_set`` is a function, so ``read_data_set.keywords`` and similar
# attribute lookups raised AttributeError. Use the loaded tables directly, as
# the neighbouring ``credits`` and ``md`` statements already do.
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

# Attach cast/crew and keyword columns to the metadata.
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

# Copy so the column assignments below do not write into a slice of ``md``.
smd = md[md['id'].isin(links_small)].copy()

# These columns hold Python literals stored as strings; parse them.
for column in ('cast', 'crew', 'keywords'):
    smd[column] = smd[column].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(len)
smd['crew_size'] = smd['crew'].apply(len)
107
+
108
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    Args:
        x: List of crew dicts, each expected to carry ``'job'`` and ``'name'``.
            Anything that is not a list is treated as "no crew", matching the
            ``isinstance(x, list)`` guards applied to the cast and keywords.

    Returns:
        The director's name, or ``np.nan`` when there is none.
    """
    if not isinstance(x, list):
        return np.nan
    # ``.get`` lets entries without a 'job' key be skipped instead of raising.
    return next(
        (member['name'] for member in x if member.get('job') == 'Director'),
        np.nan,
    )
113
+
114
# Director: lower-cased, spaces removed, repeated three times in the list.
# A movie without a director gets an empty list; previously NaN was
# stringified to 'nan', so all such movies shared three identical tokens.
smd['director'] = smd['crew'].apply(get_director).apply(
    lambda name: [name.replace(" ", "").lower()] * 3 if isinstance(name, str) else []
)

# Cast: names of the first three listed members, normalised the same way.
# Slicing already handles lists shorter than three.
smd['cast'] = smd['cast'].apply(
    lambda x: [member['name'].replace(" ", "").lower() for member in x[:3]]
    if isinstance(x, list) else []
)

# Keywords: keep only the keyword names.
smd['keywords'] = smd['keywords'].apply(
    lambda x: [i['name'] for i in x] if isinstance(x, list) else []
)

# Frequency of every keyword across the selected movies; explode() replaces
# the slower apply(pd.Series).stack() with the same counts.
s = smd['keywords'].explode().dropna().value_counts()
s.name = 'keyword'

# Keywords that occur only once cannot link two movies, so drop them.
s = s[s > 1]
stemmer = SnowballStemmer('english')
132
+
133
def filter_keywords(x, allowed=None):
    """Keep only the keywords of ``x`` that are in the allowed vocabulary.

    Args:
        x: Iterable of keyword strings.
        allowed: Container supporting ``in``. Defaults to the module-level
            ``s``, which is what the original function always tested against.

    Returns:
        A list of the retained keywords, in their original order.
    """
    if allowed is None:
        allowed = s
    return [word for word in x if word in allowed]
139
+
140
# Keep only the retained keywords, then stem, strip spaces and lower-case
# each one in a single pass.
smd['keywords'] = smd['keywords'].apply(filter_keywords).apply(
    lambda words: [stemmer.stem(word).replace(" ", "").lower() for word in words]
)

# "Soup": every metadata token of a movie joined into one string.
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(' '.join)

# min_df=1 keeps every term, which is what min_df=0 was meant to do; recent
# scikit-learn versions reject the integer 0.
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# Positional index so that row i of ``cosine_sim`` is row i of ``smd``.
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

df_rating = pd.read_csv('IMDB/ratings_small.csv')
159
+
160
+ """# Data preprocessing and analyzing"""
161
# Number of ratings per whole-number rating (fractional ratings rounded down).
rating_copy = df_rating.copy()
rating_copy['rating'] = np.floor(rating_copy['rating'])
gp_by_rating = rating_copy.groupby('rating')['rating'].agg(['count'])

movie_count = df_rating['movieId'].nunique()
cust_count = df_rating['userId'].nunique()

ax = gp_by_rating.plot(kind='barh', legend=False, figsize=(8, 8))
plt.title('{:,} Movies, {:,} customers'.format(movie_count, cust_count), fontsize=14)
plt.axis('off')

# Label each bar with its share of all ratings. Iterate over the groups that
# exist rather than a fixed range(0, 6), and read the 'count' column by name
# instead of positional [0] indexing on a labelled Series.
total_ratings = gp_by_rating['count'].sum()
for position, (rating_value, n_ratings) in enumerate(gp_by_rating['count'].items()):
    ax.text(
        n_ratings / 4,
        position,
        'Rating {}: {:.0f}%'.format(int(rating_value), n_ratings * 100 / total_ratings),
        color='black',
    )

agg_function = ['count', 'mean']

# Per-movie rating count and mean, used later to filter correlations.
gp_by_movie = df_rating.groupby('movieId')['rating'].agg(agg_function)

# NOTE(review): this matches the ratings' ``movieId`` against the metadata
# ``id``. Earlier code filters ``id`` by ``tmdbId``, so these may be different
# id spaces; confirm, and map through the links table if so.
df_rating = pd.merge(df_rating, smd, how='right', left_on='movieId', right_on='id')
df_rating = df_rating[['movieId', 'userId', 'rating']]

# User x movie rating matrix.
pivot_rating = pd.pivot_table(df_rating, values='rating', index='userId', columns='movieId')
188
+
189
+
190
+ """# Improved recommendation"""
191
def improved_recommendations(title):
    """Recommend movies similar to ``title``, re-ranked by weighted rating.

    Takes the 25 nearest neighbours of ``title`` in the module-level
    ``cosine_sim`` matrix, keeps those whose vote count reaches the 0.60
    quantile of that neighbourhood, and orders them by weighted rating.

    Args:
        title: Movie title to look up in the module-level ``indices``.

    Returns:
        ``(qualified, eval_cosine)``: a DataFrame of at most 10 movies with a
        ``wr`` column, best first, and the mean similarity score of the
        neighbours considered.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several movies share this title; use the first one.
        idx = idx.iloc[0]
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie explicitly instead of assuming it sorts first.
    neighbours = [(row, score) for row, score in ranked if row != idx][:25]
    movie_indices = [row for row, _ in neighbours]

    # The metric used to average the row indices, which says nothing about
    # similarity. Average the similarity scores instead.
    if neighbours:
        eval_cosine = float(sum(score for _, score in neighbours) / len(neighbours))
    else:
        eval_cosine = 0.0

    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies.loc[movies['vote_count'].notnull(), 'vote_count'].astype('int')
    vote_averages = movies.loc[movies['vote_average'].notnull(), 'vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)

    # ``vote_count >= m`` is already False for missing counts.
    keep = (movies['vote_count'] >= m) & movies['vote_average'].notnull()
    qualified = movies[keep].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    # Weighted rating with the neighbourhood's own ``m`` and ``C``. The
    # previous ``apply(weighted_rating)`` read the module-level values, so
    # the two locals computed above were never used.
    v = qualified['vote_count']
    R = qualified['vote_average']
    qualified['wr'] = (v / (v + m) * R) + (m / (m + v) * C)

    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified, eval_cosine


# NOTE(review): the commit header names this file mlflow.py. Run from its own
# directory, that name would presumably shadow the installed ``mlflow``
# package; consider renaming the script.
with mlflow.start_run(run_name="run") as run:
    get_recommendations('The Dark Knight').head(10)

    q, eval_cosine = improved_recommendations('The Dark Knight')
    # Log inside the run so the metric is attached to it.
    mlflow.log_metric('cosine_sim', eval_cosine)
215
+ """# Collaborative Filtering"""
216
+
217
+
218
+ """# PearsonR recommendation"""
219
+
220
df_movie_title = smd[['id', 'title']]


def corr_recommend(movie_title, min_count):
    """Recommend movies whose user ratings correlate with ``movie_title``.

    Correlates the target movie's column of the module-level ``pivot_rating``
    with every other column, then attaches titles and the per-movie rating
    statistics from ``gp_by_movie``.

    Args:
        movie_title: Title to look up in ``df_movie_title``.
        min_count: Only movies with more ratings than this are returned.

    Returns:
        A DataFrame of at most 10 rows with columns ``PearsonR``, ``title``,
        ``count`` and ``mean``, highest correlation first.

    Raises:
        KeyError: If ``movie_title`` is not in ``df_movie_title``, or its id
            is not a column of ``pivot_rating``.
    """
    matches = df_movie_title.loc[df_movie_title['title'] == movie_title, 'id']
    if matches.empty:
        raise KeyError(movie_title)
    # ``int(Series)`` only worked for exactly one match; take the first.
    movie_id = int(matches.iloc[0])

    target = pivot_rating[movie_id]
    similar_to_target = pivot_rating.corrwith(target)
    corr_target = similar_to_target.to_frame('PearsonR').dropna()
    corr_target = corr_target.sort_values('PearsonR', ascending=False)
    corr_target.index = corr_target.index.map(int)

    # ``corr_target`` is indexed by movie id, so the title table must be too.
    # Joining the raw ``df_movie_title`` matched ids against row positions.
    titles_by_id = df_movie_title.drop_duplicates('id').set_index('id')
    corr_target = corr_target.join(titles_by_id).join(gp_by_movie)[
        ['PearsonR', 'title', 'count', 'mean']
    ]
    return corr_target[corr_target['count'] > min_count][:10]
233
+
234
corr_recommend('The Dark Knight', 0)


def hybrid_recommendation(movie_name):
    """Chain the metadata-based and rating-correlation recommenders.

    Takes the top result of ``improved_recommendations`` for ``movie_name``,
    finds the movie whose ratings correlate best with it through
    ``corr_recommend``, and returns ``get_recommendations`` for that movie.
    When a stage yields no rows, the recommendations for the last title
    found are returned instead.

    Args:
        movie_name: Title of the seed movie.

    Returns:
        A Series of recommended titles.
    """
    result = improved_recommendations(movie_name)
    # ``improved_recommendations`` returns ``(frame, metric)``; calling
    # ``.iloc`` on that tuple raised AttributeError.
    soup_based = result[0] if isinstance(result, tuple) else result
    if soup_based.empty:
        return get_recommendations(movie_name)

    best_soup_title = soup_based.iloc[0]['title']
    corr = corr_recommend(best_soup_title, 0)
    if corr.empty:
        return get_recommendations(best_soup_title)
    return get_recommendations(corr.iloc[0]['title'])


print(hybrid_recommendation('Toy Story').head(10))