|
|
|
"""ML Final Project |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1Aof3bcIIqSmvsh0cux6wZ5NPk1wY-l3D |
|
|
|
### install dependencies |
|
""" |
|
|
|
!gdown "1W3-WEplVSztLR3lvkyYdiKZGMT4y0cNi&confirm=t" |
|
|
|
|
|
|
|
|
|
|
|
"""# Content-based filtering |
|
|
|
### import libraries |
|
""" |
|
|
|
import numpy as np |
|
import pandas as pd |
|
import mlflow as mf |
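# note: mlflow's fluent API (mf.log_param / mf.log_metric below) starts a run
# automatically if none is active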
|
|
|
|
|
|
|
"""### read data from file""" |
|
|
|
keywords = pd.read_csv('/content/IMDB/keywords.csv') |
|
keywords |
|
|
|
rating = pd.read_csv('/content/IMDB/ratings_small.csv') |
|
rating |
|
|
|
credits = pd.read_csv('/content/IMDB/credits.csv') |
|
credits |
|
|
|
metadata = pd.read_csv('/content/IMDB/movies_metadata.csv') |
|
metadata |
|
|
|
"""keep only related columns from released movies:""" |
|
|
|
metadata = metadata[metadata['status'] == 'Released'] |
|
cols = ['adult', 'belongs_to_collection', 'genres', 'id', 'original_language', 'title', 'production_countries', 'production_companies', 'video']
|
metadata = metadata[cols] |
|
|
|
metadata.iloc[1] |
|
|
|
def find_collection(x):
    # 'belongs_to_collection' holds a stringified dict; return its 'name' field
    if x == '':
        return ''
    return eval(str(x))['name']
|
|
|
metadata['belongs_to_collection'] = metadata['belongs_to_collection'].fillna('') |
|
metadata['belongs_to_collection'] = metadata['belongs_to_collection'].apply(find_collection) |
|
metadata.iloc[1] |
|
|
|
def find_names(x):
    # parse a stringified list of dicts and join the 'name' fields with commas
    if x == '':
        return ''
    name_dicts = eval(str(x))
    return ','.join(i['name'] for i in name_dicts)
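"""For example, `find_names` turns a stringified list of dicts into a comma-separated string (the sample below is illustrative, shaped like the real rows):"""

# hypothetical value shaped like the dataset's 'genres' entries
find_names("[{'id': 28, 'name': 'Action'}, {'id': 35, 'name': 'Comedy'}]")  # -> 'Action,Comedy'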
|
|
|
# fill missing values first so find_names never sees NaN
metadata['genres'] = metadata['genres'].fillna('')
metadata['production_countries'] = metadata['production_countries'].fillna('')
metadata['production_companies'] = metadata['production_companies'].fillna('')

metadata['genres'] = metadata['genres'].apply(find_names)
metadata['production_countries'] = metadata['production_countries'].apply(find_names)
metadata['production_companies'] = metadata['production_companies'].apply(find_names)
credits['cast'] = credits['cast'].apply(find_names)
|
metadata.iloc[1] |
|
|
|
keywords['keywords'] = keywords['keywords'].apply(find_names) |
|
metadata['id'] = metadata['id'].astype(int) |
|
metadata = pd.merge(metadata,keywords,how='inner',on='id') |
|
metadata.iloc[1] |
|
|
|
def to_int(x):
    # map the string 'True' to 1 and everything else ('False') to 0
    if x == 'True':
        return 1
    return 0
|
|
|
metadata['adult'].unique() |
|
|
|
"""there are 3 values other than True or False in adult column. there are entered by mistake so we remove those rows.""" |
|
|
|
metadata = metadata[(metadata['adult'] == 'True') | (metadata['adult'] == 'False')] |
|
metadata['adult'] = metadata['adult'].apply(to_int) |
|
metadata['video'].unique() |
|
|
|
"""removing nan values from dataset and replacing 'True' and 'False' with 1 and 0:""" |
|
|
|
metadata = metadata[~metadata['video'].isna()] |
|
metadata['video'] = metadata['video'].apply(to_int) |
|
|
|
"""## Vectorize string features""" |
|
|
|
metadata |
|
|
|
from sklearn.feature_extraction.text import CountVectorizer |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
|
def my_tok(text): |
|
return text.split(",") |
|
|
|
def vectorize_string(col_name, feature_name, limit=None, df=metadata):
    # count-encode a comma-separated string column, keeping tokens that appear in at least two rows
    vectorizer = CountVectorizer(tokenizer=my_tok, max_features=limit, min_df=2)
    X = vectorizer.fit_transform(df[col_name])
    vec_cols = vectorizer.get_feature_names_out()
    vec_data = X.toarray()

    # prefix each token with the feature name, e.g. 'genre:comedy'
    vec_cols = feature_name + ':' + vec_cols
    return vec_data, vec_cols
|
|
|
def tfidf(col_name, feature_name, limit=None, df=metadata):
    # same as vectorize_string, but weight tokens by TF-IDF instead of raw counts
    vectorizer = TfidfVectorizer(tokenizer=my_tok, max_features=limit, min_df=2)
    X = vectorizer.fit_transform(df[col_name])
    vec_cols = vectorizer.get_feature_names_out()
    vec_data = X.toarray()

    vec_cols = feature_name + ':' + vec_cols
    return vec_data, vec_cols
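"""A quick sanity check of `vectorize_string` on a toy dataframe (hypothetical values, not dataset rows):"""

toy = pd.DataFrame({'genres': ['Action,Comedy', 'Comedy,Drama', 'Action,Drama']})
toy_data, toy_cols = vectorize_string('genres', 'genre', df=toy)
# toy_cols -> ['genre:action', 'genre:comedy', 'genre:drama'] (CountVectorizer lowercases tokens)
# toy_data -> one row per movie, with a 1 in each genre column the movie belongs to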
|
|
|
genre_data, genre_cols = vectorize_string('genres', 'genre') |
|
genre_cols |
|
|
|
companies_data, companies_cols = vectorize_string('production_companies', 'company', 100) |
|
companies_cols |
|
|
|
countries_data, countries_cols = vectorize_string('production_countries', 'country') |
|
countries_cols |
|
|
|
collection_data, collection_cols = vectorize_string('belongs_to_collection', 'collection') |
|
collection_cols |
|
|
|
metadata['original_language']= metadata['original_language'].fillna('') |
|
lang_data, lang_cols = vectorize_string('original_language', 'lang') |
|
lang_cols |
|
|
|
collection_cols.shape |
|
|
|
keyword_data, keyword_cols = tfidf('keywords', 'keyword', 1000) |
|
keyword_cols |
|
|
|
credits.drop(columns=['crew'], inplace=True) |
|
credit_data, credit_cols = vectorize_string('cast','cast', 1000, df=credits) |
|
credit_cols |
|
|
|
# reset the index first: the vectorized arrays are positional, while metadata's
# index is no longer contiguous after the filtering above, so pd.concat would misalign rows
metadata = metadata.reset_index(drop=True)
metadata = pd.concat([metadata[['title', 'id', 'adult', 'video']],
                      pd.DataFrame(genre_data, columns=genre_cols),
                      pd.DataFrame(countries_data, columns=countries_cols),
                      pd.DataFrame(collection_data, columns=collection_cols),
                      pd.DataFrame(keyword_data, columns=keyword_cols),
                      pd.DataFrame(companies_data, columns=companies_cols),
                      pd.DataFrame(lang_data, columns=lang_cols)], axis=1)
|
|
|
credits[credit_cols] = credit_data |
|
metadata = pd.merge(metadata, credits, how='inner', on='id') |
|
metadata |
|
|
|
|
|
|
|
"""list of all numerical features(everything except id and title)""" |
|
|
|
feature_cols = np.concatenate((np.array(['adult', 'video']), genre_cols, countries_cols, collection_cols, keyword_cols, companies_cols, lang_cols, credit_cols))
|
feature_cols |
|
|
|
|
|
# free memory: the vectorized arrays now live inside `metadata`
del genre_data, countries_data, collection_data, keyword_data, companies_data, lang_data, credit_data
del genre_cols, countries_cols, collection_cols, keyword_cols, companies_cols, lang_cols, credit_cols
|
|
|
feature_cols.shape |
|
|
|
metadata |
|
|
|
def split_dataframe(df, holdout_fraction=0.1):
    # hold out a random fraction of rows as the test set (no fixed seed, so splits vary run to run)
    test = df.sample(frac=holdout_fraction, replace=False)
    train = df[~df.index.isin(test.index)]
    return train, test
|
|
|
train, test = split_dataframe(metadata) |
|
|
|
allIds = metadata['id'] |
|
|
|
number_of_batches = 4 |
|
batches = np.array_split(train, number_of_batches) |
|
mf.log_param('number of batches', number_of_batches) |
|
del metadata |
|
del train |
|
|
|
"""## Algorithm |
|
|
|
""" |
|
|
|
batches[0] |
|
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
"""`content_based_recommmeder` returns a list of movie ids based on it's input. the input should be a dataframe which has `movieId`, `rating` columns(like `ratings_small.csv` but without `userId`)""" |
|
|
|
# NOTE: overrides the earlier value of 4 so that only the first batch is used below
number_of_batches = 1
|
def content_based_recommender_movie(movieId, df=None):
    # `metadata` was deleted above to free memory, so operate on a feature batch instead
    if df is None:
        df = batches[0]
    print("movie title is:", df.loc[df['id'] == movieId, 'title'].values)
    # pairwise cosine similarity between every pair of movies in the batch
    sim_mat = cosine_similarity(df[feature_cols])
    return sim_mat
|
|
|
|
|
|
|
batches[1].describe() |
|
|
|
def content_based_recommender(user, df, k=10, movieIds=allIds):
    # weight each rated movie's feature vector by the user's rating, then average into a user profile
    user_movies = pd.merge(user, df, how='inner', left_on='movieId', right_on='id')
    user_movies[feature_cols] = user_movies[feature_cols].multiply(user_movies['rating'], axis="index")
    mean_user_movies = user_movies[feature_cols].mean(axis=0)
    # cosine similarity between the user profile and every candidate movie
    sim_mat = cosine_similarity(df[feature_cols][df.id.isin(movieIds)], mean_user_movies.values.reshape(1, -1))
    temp_data = {'id': df['id'][df.id.isin(movieIds)], 'title': df['title'][df.id.isin(movieIds)], 'sim': sim_mat.flatten()}
    return pd.DataFrame(temp_data)
|
|
|
def content_based_all_batches(user, k=10, movieIds=allIds):
    ans = content_based_recommender(user, batches[0], k, movieIds)
    for i in range(1, number_of_batches):
        # DataFrame.append returns a new frame (and was removed in pandas 2.x), so concatenate
        ans = pd.concat([ans, content_based_recommender(user, batches[i], k, movieIds)], ignore_index=True)
    # keep the k most similar movies across all batches
    return ans.sort_values(by='sim', ascending=False).head(k)
|
|
|
|
|
content_based_k = 10 |
|
mf.log_param('content based k', content_based_k) |
|
|
|
xx = content_based_all_batches(rating[rating['userId'] == 1], content_based_k) |
|
xx.shape |
|
|
|
"""# Collaborative Filtering |
|
|
|
### import libraries |
|
""" |
|
|
|
import numpy as np |
|
import pandas as pd |
|
from sklearn.utils.extmath import randomized_svd |
|
|
|
"""### explore datasets""" |
|
|
|
rating = pd.read_csv('/content/IMDB/ratings_small.csv') |
|
rating.head() |
|
|
|
rating.shape |
|
|
|
links_small = pd.read_csv('/content/IMDB/links_small.csv') |
|
links_small.head() |
|
|
|
credits = pd.read_csv('/content/IMDB/credits.csv') |
|
credits.head() |
|
|
|
movie = pd.read_csv('/content/IMDB/movies_metadata.csv') |
|
movie.head() |
|
|
|
movie = movie.rename(columns={'id': 'movieId'}) |
|
|
|
movie.shape |
|
|
|
movie.head() |
|
|
|
"""### data preprocessing |
|
|
|
There are three rows entered by mistake, so we remove that row. |
|
""" |
|
|
|
movie = movie[(movie['movieId']!='1997-08-20') & (movie['movieId']!='2012-09-29') & (movie['movieId']!='2014-01-01')] |
|
|
|
def find_names(x):
    # parse a stringified list of dicts and join the 'name' fields with commas
    if x == '':
        return ''
    name_dicts = eval(str(x))
    return ','.join(i['name'] for i in name_dicts)
|
|
|
movie['genres'] = movie['genres'].fillna('') |
|
|
|
movie['genres']=movie['genres'].apply(find_names) |
|
|
|
movie.movieId = movie.movieId.astype("uint64") |
|
|
|
"""only keep rating for movies with metadata in movie dataset""" |
|
|
|
new_rating = pd.merge(rating, movie, how='inner', on=["movieId"]) |
|
|
|
new_rating = new_rating[["userId", "movieId", "rating"]] |
|
|
|
movie.head() |
|
|
|
new_rating.head() |
|
|
|
train, test = split_dataframe(new_rating) |
|
|
|
"""### matrix factorization""" |
|
|
|
# pivot the ratings into a user x movie interaction matrix, 0 for unrated
inter_mat_df = rating.pivot(index='userId', columns='movieId', values='rating').fillna(0)
|
inter_mat_df |
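"""A toy illustration of what `pivot` produces (hypothetical values):"""

toy = pd.DataFrame({'userId': [1, 1, 2], 'movieId': [10, 20, 10], 'rating': [4.0, 3.0, 5.0]})
toy.pivot(index='userId', columns='movieId', values='rating').fillna(0)
# movieId   10   20
# userId
# 1        4.0  3.0
# 2        5.0  0.0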
|
|
|
inter_mat = inter_mat_df.to_numpy() |
|
|
|
# de-mean each user's row so the factorization models deviations from that user's average
ratings_mean = np.mean(inter_mat, axis=1)
inter_mat_normal = inter_mat - ratings_mean.reshape(-1, 1)
|
|
|
inter_mat_normal |
|
|
|
"""We use singular value decomposition for matrix factorization""" |
|
|
|
svd_U, svd_sigma, svd_V = randomized_svd(inter_mat_normal, |
|
n_components=15, |
|
n_iter=5, |
|
random_state=47) |
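# factor shapes: U is (n_users, 15), sigma is (15,), V is (15, n_movies)
print(svd_U.shape, svd_sigma.shape, svd_V.shape)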
|
|
|
"""This function gives the diagonal form""" |
|
|
|
svd_sigma = np.diag(svd_sigma) |
|
|
|
"""Making predictions""" |
|
|
|
# reconstruct the rating matrix from the factors and add each user's mean back
rating_weights = np.dot(np.dot(svd_U, svd_sigma), svd_V) + ratings_mean.reshape(-1, 1)
|
|
|
weights_df = pd.DataFrame(rating_weights, columns = inter_mat_df.columns) |
|
|
|
weights_df.head() |
|
|
|
"""making recommendations""" |
|
|
|
def recommend_top_k(preds_df, ratings_df, movie, userId, k=10):
    # userIds start at 1 and are contiguous, so row userId-1 of preds_df is this user
    user_row = userId - 1
    sorted_user_predictions = preds_df.iloc[user_row].sort_values(ascending=False)
    # movies the user has already rated, highest rated first
    user_data = ratings_df[ratings_df.userId == userId]
    user_rated = user_data.merge(movie, how='left', on='movieId') \
        .sort_values(['rating'], ascending=False)
    # attach predicted scores to the movie table and keep the top k
    user_preds = movie.merge(pd.DataFrame(sorted_user_predictions).reset_index(),
                             how='left', on='movieId') \
        .rename(columns={user_row: 'prediction'}) \
        .sort_values('prediction', ascending=False) \
        .iloc[:k, :]
    return user_rated, user_preds
|
|
|
collaborative_k = 100 |
|
user_rated, user_preds = recommend_top_k(weights_df, new_rating, movie, 220, collaborative_k) |
|
mf.log_param('collaborative k', collaborative_k) |
|
|
|
user_preds.head() |
|
|
|
user_rated.head() |
|
|
|
user_rated[["title", "genres"]].head(10) |
|
|
|
user_preds[["title", "genres"]].head(10) |
|
|
|
"""# Ensemble Model""" |
|
|
|
def ensemble(userId, k=10):
    # stage 1: collaborative filtering proposes k*k candidate movies
    user_rated, user_preds = recommend_top_k(weights_df, new_rating, movie, userId, k*k)
    # stage 2: content-based similarity re-ranks the candidates and keeps the top k
    content_based_result = content_based_all_batches(rating[rating['userId'] == userId], k=k, movieIds=user_preds['movieId'])
    return content_based_result[['id', 'title']]
|
|
|
ensemble_k=10 |
|
mf.log_param('ensemble k', ensemble_k) |
|
ensemble(220, ensemble_k) |
|
|
|
"""# Evaluation""" |
|
|
|
df_res = user_preds[["movieId", "prediction"]]. \ |
|
merge(user_rated[["movieId", "rating"]], how = 'outer', on = 'movieId') |
|
|
|
df_res.sort_values(by='prediction',ascending=False,inplace=True) |
|
df_res |
|
|
|
# binarize: a rating or prediction at or above the threshold counts as relevant
threshold = 2
df_res['prediction'] = df_res['prediction'] >= threshold
df_res['rating'] = df_res['rating'] >= threshold
|
df_res |
|
|
|
def precision_at_k(df, k=10, y_test='rating', y_pred='prediction'):
    # fraction of the positive predictions in the top k that are actually relevant
    dfK = df.head(k)
    sum_df = dfK[y_pred].sum()
    true_pred = dfK[dfK[y_pred] & dfK[y_test]].shape[0]
    if sum_df > 0:
        return true_pred / sum_df
    return None
|
|
|
def recall_at_k(df, k=10, y_test='rating', y_pred='prediction'):
    # fraction of all relevant items that show up as positive predictions in the top k
    dfK = df.head(k)
    sum_df = df[y_test].sum()
    true_pred = dfK[dfK[y_pred] & dfK[y_test]].shape[0]
    if sum_df > 0:
        return true_pred / sum_df
    return None
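"""A tiny worked example of both metrics on hypothetical data:"""

demo = pd.DataFrame({'prediction': [True, True, False, True],
                     'rating':     [True, False, True, True]})
print(precision_at_k(demo, k=3))  # 2 positive predictions in the top 3, 1 truly relevant -> 0.5
print(recall_at_k(demo, k=3))     # 3 relevant items overall, 1 retrieved in the top 3 -> 1/3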
|
|
|
prec_at_k = precision_at_k(df_res, 100, y_test='rating', y_pred='prediction') |
|
rec_at_k = recall_at_k(df_res, 100, y_test='rating', y_pred='prediction') |
|
|
|
print("precision@k: ", prec_at_k) |
|
print("recall@k: ", rec_at_k) |
|
mf.log_metric('recall', rec_at_k) |
|
mf.log_metric('precision', prec_at_k) |
|
|
|
|
|
|
|
"""# MLOps""" |
|
|
|
def update_batch(new_batch):
    # register a new batch of movie features; `global` is needed to rebind the module-level names
    global number_of_batches, batches
    batches.append(new_batch)
    number_of_batches = number_of_batches + 1
    mf.log_param('number of batches', number_of_batches)
|
|
|
|