Spaces:
Sleeping
Sleeping
# Suppress sklearn warnings by swapping the stdlib entry point for a no-op.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; replaces ``warnings.warn``."""
    return None


# Every later ``warnings.warn(...)`` attribute lookup now resolves to the no-op above.
warnings.warn = warn
# Import libraries | |
import numpy as np | |
import pandas as pd | |
from glob import glob | |
from sklearn.neighbors import NearestNeighbors | |
from sklearn.preprocessing import MinMaxScaler | |
# Module-level placeholder for the item-similarity feature columns.
# NOTE(review): nothing in this file reads it; ItemData keeps its own item_columns list.
item_columns: list = []

# Default number of neighbors used when building the nearest-neighbor models.
N_NEIGHBORS: int = 10
# Handler for Item DataFrame
class ItemData:
    '''
    Loads every "items_*.csv" file in the working directory into one DataFrame
    and exposes lookups by movie id / positional index.

    Attributes:
        df: all item rows, plus the min-max scaled columns added by _scale_cols.
        item_columns: feature columns used for item-similarity search.
        scaled_df: view of df restricted to item_columns.
    '''

    # (raw column, min-max scaled column derived from it)
    _SCALED_COLUMNS = (
        ('runtime', 'scaled_runtime'),
        ('vote_average', 'vote_scaled'),
        ('rb_ratio', 'ratio_scaled'),
        ('pop_bin', 'pop_scaled'),
    )

    def __init__(self):
        self.df = pd.concat([pd.read_csv(f) for f in glob("items_*.csv")], axis=0)
        self._scale_cols()
        self.item_columns = [
            'scaled_runtime', 'vote_scaled', 'Animation', 'Comedy', 'Family',
            'Adventure', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime',
            'Thriller', 'Horror', 'History', 'Science Fiction', 'Mystery',
            'War', 'Foreign', 'Music', 'Documentary', 'Western', 'TV Movie',
            'ratio_scaled', 'pop_scaled',
        ]
        self.scaled_df = self.df[self.item_columns]

    def _scale_cols(self):
        '''
        Add a min-max scaled copy of each raw numeric column listed in
        _SCALED_COLUMNS. Each column gets its own independently fitted scaler.
        '''
        for raw_name, scaled_name in self._SCALED_COLUMNS:
            column = self.df[raw_name].values.reshape(-1, 1)
            self.df[scaled_name] = MinMaxScaler().fit_transform(column).ravel()

    def get_filtered_row_by_id(self, id):
        '''
        Return the row(s) for movie `id`, restricted to the feature columns.
        '''
        return self.get_row_by_id(id)[self.item_columns]

    def get_id_by_idx(self, idx):
        '''
        Return the movie id stored at positional row `idx` of df.
        '''
        return self.df.iloc[int(idx)]['id']

    def get_random_id(self):
        '''
        Return the id of one uniformly sampled row.
        '''
        return self.df.sample(1)['id'].values[0]

    def get_row_by_id(self, id):
        '''
        Return the full row(s) whose 'id' equals `id` (empty frame if none).

        `id` is coerced with int() so that string or numpy ids behave the same
        way here as in get_filtered_row_by_id.
        '''
        return self.df[self.df['id'] == int(id)]

    def get_movie_title_by_id(self, id):
        '''
        Return the title of movie `id`. Raises IndexError if the id is unknown.
        '''
        return self.get_row_by_id(id)['title'].values[0]

    def get_movie_overview_by_id(self, id):
        '''
        Return the overview text of movie `id`. Raises IndexError if the id is unknown.
        '''
        return self.get_row_by_id(id)['overview'].values[0]
# Handler for User DataFrame
class UserData:
    '''
    Reads every "users_*.csv" file in the working directory, stacks them
    row-wise and replaces missing values with 0.
    '''

    def __init__(self):
        frames = []
        for csv_path in glob("users_*.csv"):
            frames.append(pd.read_csv(csv_path))
        stacked = pd.concat(frames, axis=0)
        # Missing entries become 0.
        self.df = stacked.fillna(0)
###### Recommender System
class Recommender:
    '''
    Interactive movie recommender offering three suggestions at a time:
    one from item similarity, one from user similarity and one from both.

    State kept between picks:
        preferences: single-row frame with the current user's rating per movie column.
        item_picks: item rows of the movies the user rated above 2.5.
        seen_movies: ids of every movie the user has picked so far.
        recommended_ids: the three ids currently on offer.
    '''

    def __init__(self):
        '''
        Load the item/user tables, fit both nearest-neighbor models and seed
        the first three recommendations.
        '''
        # Load preprocessed dataframes
        self.item_handler = ItemData()
        self.user_handler = UserData()
        print("Dataframes loaded...")

        # Rating matrix without the identifier column. Computed once so that the
        # model, the preference vector and later lookups all share one column set.
        self._user_ratings = self.user_handler.df.drop('user_id', axis=1)

        self.preferences = pd.DataFrame(columns=self._user_ratings.columns)  # For user data
        self.preferences.loc[0] = 0  # Initialize all ratings to zero
        self.item_picks = pd.DataFrame(columns=self.item_handler.df.columns)
        self.n_picks = 1
        self.recommended_ids = []  # Resets every time
        self.seen_movies = []

        # Initialize nearest neighbor algorithm. With p=2, euclidean distance is our metric
        self.user_recommender = NearestNeighbors(n_neighbors=N_NEIGHBORS, p=2).fit(self._user_ratings)
        self.item_recommender = NearestNeighbors(n_neighbors=N_NEIGHBORS, p=2).fit(self.item_handler.scaled_df)

        # Initialize recommended movies (random, since nothing has been liked yet)
        for _ in range(3):
            self.recommended_ids.append(self.get_item_recommendation())

    def on_pick(self, idx, rating):
        '''
        Called whenever the user picks a new movie.
        idx: [0, 2] -> which one of the recommendations was picked out of the 3 suggestions
        rating: the rating the user gave to the picked movie
        Returns the refreshed list of three recommended movie ids.
        '''
        self.n_picks += 1
        chosen_movie_id = self.recommended_ids[idx]
        self.update(chosen_movie_id, rating)
        # Recommend new movies
        self.recommended_ids[0] = self.get_item_recommendation()
        self.recommended_ids[1] = self.get_user_recommendation()
        self.recommended_ids[2] = self.get_joint_recommendation()
        return self.recommended_ids

    def get_descs_for_recommended(self, recs):
        '''
        Return a list of {'title', 'overview'} dicts, one per movie id in recs.
        '''
        return [
            {
                'title': self.item_handler.get_movie_title_by_id(rec),
                'overview': self.item_handler.get_movie_overview_by_id(rec),
            }
            for rec in recs
        ]

    def update(self, movie_id, rating):
        '''
        Update user preferences based on last picked movie (and given rating)
        '''
        self.seen_movies.append(movie_id)
        # Update user data. Only existing columns are written: creating a new
        # column would change the feature count the user model was fitted on.
        column = str(movie_id)
        if column in self.preferences.columns:
            self.preferences.at[0, column] = rating
        # Update item data - but only if the user liked it
        if rating > 2.5:
            new_row = self.item_handler.get_row_by_id(movie_id)
            if self.item_picks.empty:
                # Avoid concatenating with an empty frame, which loses the column dtypes
                self.item_picks = new_row.copy()
            else:
                self.item_picks = pd.concat([self.item_picks, new_row], axis=0)

    def _average_liked_features(self):
        '''
        Return the mean feature vector of all liked movies (item_picks must be non-empty).
        '''
        liked = self.item_picks[self.item_handler.item_columns]
        return liked.sum(axis=0) / liked.shape[0]

    def _seen_labels(self):
        '''
        Return the seen movie ids as strings, matching the user-table column labels.
        '''
        return {str(movie_id) for movie_id in self.seen_movies}

    def get_item_recommendation(self):
        '''
        Make recommendation based on item similarity
        '''
        # If user hasn't picked any movies they like yet, pick something random
        if not self.item_picks.empty:
            average_preferences = self._average_liked_features()
            # One more neighbor than there are seen movies, so that at least one
            # candidate is unseen (unless every item has been seen).
            n_candidates = min(len(self.seen_movies) + 1, self.item_handler.df.shape[0])
            _, idxes = self.item_recommender.kneighbors([average_preferences], n_candidates)
            # Return the closest movie that has not been seen before
            for idx in idxes[0]:
                new_id = self.item_handler.get_id_by_idx(idx)
                if new_id not in self.seen_movies:
                    return new_id
        # Pick a random movie if strategy did not work
        return self.item_handler.get_random_id()

    def get_user_recommendation(self):
        '''
        Make recommendation based on user similarity
        '''
        # If user hasn't chosen anything yet
        if self.item_picks.empty:
            return self.item_handler.get_random_id()
        n_users = min(25, self._user_ratings.shape[0])
        _, idx = self.user_recommender.kneighbors(self.preferences.values, n_users)
        seen = self._seen_labels()
        # Look at each similar user's top 3 movies. The first unseen one is
        # returned if that user liked it; otherwise move on to the next user.
        for best_idx in idx[0]:
            top_movies = self._user_ratings.iloc[best_idx].nlargest(3)
            for best_movie, score in top_movies.items():
                if best_movie in seen:
                    continue
                if score > 2.5:
                    return int(best_movie)
                # Ratings are in descending order: nothing further down is liked either
                break
        # Otherwise, return random movie
        return self.item_handler.get_random_id()

    def get_joint_recommendation(self):
        '''
        Make recommendation based on both item and user similarity
        '''
        # If user hasn't chosen anything yet
        if self.item_picks.empty:
            return self.item_handler.get_random_id()
        # Get similar users
        n_users = min(10, self._user_ratings.shape[0])
        _, user_idxs = self.user_recommender.kneighbors(self.preferences.values, n_users)
        neighbor_ratings = self._user_ratings.iloc[user_idxs[0]]
        # Get similar items. Ask for enough neighbors that up to N_NEIGHBORS
        # unseen candidates remain after the seen ones are skipped.
        average_preferences = self._average_liked_features()
        n_movies = min(len(self.seen_movies) + N_NEIGHBORS, self.item_handler.df.shape[0])
        _, item_idxs = self.item_recommender.kneighbors([average_preferences], n_movies)

        best_id = None
        best_score = None
        closest_unseen = None
        for movie_idx in item_idxs[0]:
            movie_id = self.item_handler.get_id_by_idx(movie_idx)
            if movie_id in self.seen_movies:
                continue
            if closest_unseen is None:
                closest_unseen = movie_id
            column = str(movie_id)
            if column not in neighbor_ratings.columns:
                continue
            # A rating of 0 means "not rated", so it does not count as a vote
            votes = neighbor_ratings[column]
            votes = votes[votes != 0]
            if votes.empty:
                continue
            score = votes.sum() / len(votes)
            # Strict comparison keeps the closest item when scores tie
            if best_score is None or score > best_score:
                best_id, best_score = movie_id, score

        if best_id is not None:
            return best_id
        # No similar user rated any candidate: fall back to pure item similarity
        if closest_unseen is not None:
            return closest_unseen
        return self.item_handler.get_random_id()
if __name__ == '__main__':
    # Smoke run: build the recommender, then replay a fixed sequence of
    # (picked slot, rating) pairs and print the suggestions after each pick.
    recommender = Recommender()
    for slot, rating in ((0, 1), (0, 3.4), (1, 1), (1, 3.4)):
        res = recommender.on_pick(slot, rating)
        print(res)