import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import os
from collections import defaultdict
import random
import warnings
import logging
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import FAISS
warnings.filterwarnings("ignore")
random.seed(5153)
logging.basicConfig(level=logging.DEBUG)
class Model:
def __init__(self):
self.cache_path = "Data/cache.pkl"
self.is_loaded = False
self.dataset = None
self.predictions = None
self.user_details = None
self.temp_store = None
self.pipeline = None
self.chosen_books_per_user = None
self.all_books = pd.read_csv("Data/books.csv")
logging.info("Initialized model")
def run_predictions_on_full_test(self):
if self.is_loaded:
logging.info("Model is already loaded")
return
if self.does_cache_exist():
logging.info("Retrieving cached full-test predictions")
self.retrieve_cache()
logging.info("Completed full-test")
return
logging.info("Generating full-test predictions")
reviews_df = pd.read_csv("Data/final_dataset/reviews_test.csv")
good_reviews = reviews_df[reviews_df['rating'] > 3]
good_user_books_dict = good_reviews.groupby('user_id')['book_id'].unique().apply(list).to_dict()
        # to minimize compute time, we only use 20 randomly sampled users
num_random_users = 20
randomly_sampled_users = random.sample(list(good_user_books_dict.keys()), num_random_users)
sampled_good_user_books_dict = {user_id: good_user_books_dict[user_id] for user_id in randomly_sampled_users}
        # to further cut compute time, keep at most 150 randomly chosen (good) books per user,
        # prepared in the form user_id -> list[book_id]
num_rand_books_per_user = 150
chosen_books_per_user = {
user_id: random.sample(books, min(len(books), num_rand_books_per_user))
for user_id, books in sampled_good_user_books_dict.items()
}
# save this for reference
self.chosen_books_per_user = chosen_books_per_user
# run predictions on all of the above users
self.prepare_predictions(chosen_books_per_user)
logging.info("Caching full-test predictions")
self.cache_results()
logging.info("Completed full-test")
def run_prediction_on_adhoc_user(self, chosen_book_ids):
self.prepare_predictions(
{'current_user': chosen_book_ids}
)
def prepare_predictions(self, target_users_and_books):
"""
Given a dictionary of user_id to list[book_id], where the list of book IDs are the books favored by
the associated user, this function returns the recommended books for each user provided in the dictionary
:param target_users_and_books: Dictionary of user ID to favored books (as book IDs)
:return: Dataframe of user IDs and associated recommended books, plus individual model scores
"""
target_user_list = list(target_users_and_books.keys())
file_dict = {}
for filename in ['reviews_test', 'users_test', 'reviews_sub']:
file_dict[filename] = pd.read_csv(f'Data/final_dataset/{filename}.csv')
file_dict['users'] = file_dict['users_test']
file_dict['reviews'] = file_dict['reviews_test']
file_dict['good_reviews'] = file_dict['reviews'][file_dict['reviews']['rating'] > 3]
        file_dict['books'] = self.all_books  # already loaded in __init__; avoid re-reading
#################################################################################
# GENRE MODEL; DESCRIPTION MODEL; TITLE MODEL; BOOK STATS CLUSTER MODEL
#################################################################################
clusterbooks = pd.DataFrame(
np.load('Data/Recommended Storage/cluster_books.npy', allow_pickle=True),
columns=['target_book', 'recco_book_id', 'similarity_score']).astype(float) # wasn't saved as float
genrebooks = pd.DataFrame(
np.load('Data/Recommended Storage/genres_books.npy', allow_pickle=True),
columns=['target_book', 'recco_book_id', 'similarity_score'])
descbooks = pd.DataFrame(
np.load('Data/Recommended Storage/description_books.npy', allow_pickle=True),
columns=['target_book', 'recco_book_id', 'similarity_score'])
revbooks = pd.DataFrame(
np.load('Data/Recommended Storage/reviews_books_new.npy', allow_pickle=True),
columns=['target_book', 'recco_book_id', 'similarity_score'])
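        # Each .npy store above holds precomputed (target_book, recco_book_id, similarity_score)
        # triples. The helper below sums similarity over a user's favored books, drops books the
        # user has already read, normalizes by the list length, and keeps the top 30 candidates.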
        def optimized_converter(simbooks, user_id_list, name, prog_bar_description):
            per_user_frames = []
            for curr_user_id in tqdm(user_id_list, desc=prog_bar_description):
                curr_user_books = pd.Series(target_users_and_books[curr_user_id])
                relevant_simbooks = simbooks[simbooks['target_book'].isin(curr_user_books)]
                summed_scores = relevant_simbooks.groupby('recco_book_id')['similarity_score'].sum().reset_index()
                summed_scores['user_id'] = curr_user_id
                if not curr_user_books.empty:
                    summed_scores = summed_scores[~summed_scores['recco_book_id'].isin(curr_user_books)]
                    # TODO: Think about how to adjust this for small number of books
                    summed_scores['similarity_score'] /= len(curr_user_books)
                per_user_frames.append(summed_scores.nlargest(30, 'similarity_score'))
            # concatenate once at the end instead of growing a DataFrame inside the loop
            user_ratings_list = pd.concat(per_user_frames, ignore_index=True)
            return user_ratings_list.rename(columns={'recco_book_id': 'book_id', 'similarity_score': name})
genre_users = optimized_converter(genrebooks, target_user_list, 'gen_score', "Generating recs (genre)")
cluster_users = optimized_converter(clusterbooks, target_user_list, 'clus_score',
"Generating recs (book stats cluster)")
description_users = optimized_converter(descbooks, target_user_list, 'desc_score',
"Generating recs (description)")
reviews_users = optimized_converter(revbooks, target_user_list, 'rev_score', "Generating recs (reviews)")
#################################################################################
# USER SIMILARITY CLUSTERING MODEL
#################################################################################
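        # Jaccard similarity between two book sets A and B is |A ∩ B| / |A ∪ B|, with
        # |A ∪ B| = |A| + |B| - |A ∩ B|. In the helper below, index alignment of the
        # division produces NaN for users with no overlap, which nlargest then ignores.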
def jaccard_similarity_pandas(target_user, reviews_sub, n):
target_user_books = target_users_and_books[target_user]
relevant_reviews = reviews_sub[reviews_sub['book_id'].isin(target_user_books)]
intersections = relevant_reviews.groupby('user_id').size()
user_book_counts = reviews_sub.groupby('user_id')['book_id'].nunique()
unions = len(target_user_books) + user_book_counts - intersections
jaccard_index = intersections / unions
top_n_users = jaccard_index.nlargest(n)
return top_n_users.reset_index().values.tolist()
def recommend_books(target_user_id, reviews_sub, num_books):
top_n_similar_users = jaccard_similarity_pandas(target_user_id, reviews_sub, n=20)
target_user_books = target_users_and_books[target_user_id]
similar_users_reviews = reviews_sub[reviews_sub['user_id'].isin([user[0] for user in top_n_similar_users])]
recommended_books = defaultdict(float)
for curr_user_id, similarity_score in top_n_similar_users:
user_reviews = similar_users_reviews[similar_users_reviews['user_id'] == curr_user_id]
for _, row in user_reviews.iterrows():
if row['book_id'] not in target_user_books:
recommended_books[row['book_id']] += similarity_score
# Return top recommended books sorted by score
sorted_recommended_books = sorted(recommended_books.items(), key=lambda x: x[1], reverse=True)
return [(target_user_id, book_id, book_score) for book_id, book_score in
sorted_recommended_books[:num_books]]
all_recommendations = []
for each_user_id in tqdm(target_user_list, desc="Generating recs (users)"):
recommendations = recommend_books(each_user_id, file_dict['reviews_sub'], 30)
all_recommendations.extend(recommendations)
user_users = pd.DataFrame(all_recommendations, columns=['user_id', 'book_id', 'user_score'])
#################################################################################
# TITLE SIMILARITY MODEL
#################################################################################
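        # The FAISS store (built offline over book titles with BGE embeddings) is queried with
        # the user's favored titles joined into one string; the returned score is a distance
        # (lower is more similar), so 1 - score converts it into a similarity for ranking.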
store = FAISS.load_local(
"Data/faiss_store",
HuggingFaceBgeEmbeddings(
model_kwargs={"device": "cpu"},
encode_kwargs={"normalize_embeddings": True}
),
allow_dangerous_deserialization=True
)
        title_output = []
        for user_id, book_ids in tqdm(target_users_and_books.items(), desc="Generating recs (title)"):
            user_books = file_dict['books'][file_dict['books']['book_id'].isin(book_ids)]
            titles = '\n'.join(user_books['title_without_series'])  # using titles without series for queries
            results = store.similarity_search_with_score(titles, k=80)
            for result, score in results:
                # compare against the user's book IDs; ``in`` on a DataFrame tests column names, not values
                if result.metadata.get('book_id') not in book_ids:
                    title_output.append([user_id, result.metadata.get('book_id'), 1 - score])
        # collect the scored results into a DataFrame
        title_users = pd.DataFrame(title_output, columns=['user_id', 'book_id', 'tit_score'])
#################################################################################
# COMBINING MODEL OUTPUTS
#################################################################################
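        # Outer-join the six per-model score frames on (user_id, book_id) so a book recommended
        # by any model survives; missing scores become 0. Test ratings are attached as the target,
        # and the pre-trained pipeline (scaler + logistic regression) produces final_score.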
self.temp_store = {
'cluster': cluster_users,
'genre': genre_users,
'desc': description_users,
'reviews': reviews_users,
'users': user_users,
'title': title_users,
}
combined_df = pd.merge(cluster_users, genre_users, on=['user_id', 'book_id'], how='outer')
combined_df = pd.merge(combined_df, description_users, on=['user_id', 'book_id'], how='outer')
combined_df = pd.merge(combined_df, reviews_users, on=['user_id', 'book_id'], how='outer')
combined_df = pd.merge(combined_df, user_users, on=['user_id', 'book_id'], how='outer')
combined_df = pd.merge(combined_df, title_users, on=['user_id', 'book_id'], how='outer')
combined_df.fillna(0, inplace=True)
combined_df['book_id'] = combined_df['book_id'].astype(int)
combined_df['tit_score'] = combined_df['tit_score'].astype(float)
reviews_df = file_dict['reviews'][file_dict['reviews']['rating'].isin([1, 2, 3, 4, 5])]
reviews_filtered = reviews_df[['user_id', 'book_id', 'rating']]
combined_df = combined_df.merge(reviews_filtered, on=['user_id', 'book_id'], how='left')
combined_df.rename(columns={'rating': 'target'}, inplace=True)
combined_df['binary'] = np.where(combined_df['target'] >= 4, 1, 0)
# remove books which are not recommended at all
combined_df = combined_df[
(combined_df[['clus_score', 'gen_score', 'desc_score', 'rev_score', 'user_score', 'tit_score']] != 0).any(
axis=1)]
with open("Data/final_model.pkl", 'rb') as file:
self.pipeline = pickle.load(file)
X_test = combined_df.drop(columns=['user_id', 'book_id', 'target', 'binary'])
predictions_df = combined_df[
['user_id', 'book_id', 'clus_score', 'gen_score', 'desc_score', 'rev_score', 'user_score',
'tit_score', 'target', 'binary']].copy()
        predictions_df['final_score'] = self.pipeline.predict_proba(X_test)[:, 1]  # probability of the positive class
predictions_df['would_recommend'] = predictions_df['final_score'] >= 0.45 # peak f2 score at this threshold
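        # F2 = 5·P·R / (4·P + R), weighting recall twice as heavily as precision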
predictions_df = predictions_df.sort_values(['user_id', 'final_score'], ascending=[True, False])
self.dataset = combined_df
self.predictions = predictions_df
def prepare_user_details(self):
users_list = self.dataset['user_id'].unique()
users_df = pd.read_csv("Data/final_dataset/users_test.csv")
books_df = pd.read_csv("Data/final_dataset/books_test.csv")
# filter to keep only relevant users
users_df = users_df[users_df['user_id'].isin(users_list)]
# merge to get book and review data
full_df = users_df.merge(books_df, on="user_id")
user_details = pd.DataFrame()
        top_books_per_user = full_df.groupby("user_id").apply(
            lambda x: x.nlargest(n=5, columns='rating')['title_without_series'].tolist())  # nlargest already sorts
user_details['top_books'] = top_books_per_user
self.user_details = user_details
def get_user_predictions(self, chosen_user):
logging.info(f"Generating predictions for user: {chosen_user}")
user_predictions = self.predictions[self.predictions['user_id'] == chosen_user]
user_predictions = user_predictions.dropna(subset=['target'])
if len(user_predictions) == 0:
logging.info(f"No predictions hit! Exiting early")
return None
# transform model scores using the pipeline (scaler + logistic regression coefficients)
# specifically, apply scaler then apply linear layer of logistic regression
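        # i.e. logit = Σ_i coef_i · scaled_score_i + intercept, and final_score = sigmoid(logit),
        # so each column produced below is one model's additive contribution to the final logit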
model_score_cols = [c for c in user_predictions.columns if c.endswith('_score') and c != 'final_score']
scaled_model_scores = self.pipeline['scaler'].transform(user_predictions[model_score_cols])
multed_model_scores = scaled_model_scores * self.pipeline['classifier'].coef_[0]
final_model_scores = pd.DataFrame(multed_model_scores, columns=model_score_cols)
final_model_scores['intercept'] = self.pipeline['classifier'].intercept_[0]
columns = ['book_id', 'target', 'final_score', 'would_recommend']
predictions_and_score = pd.concat(
[user_predictions[columns].reset_index(drop=True), final_model_scores],
axis=1
)
return predictions_and_score.merge(self.all_books[['book_id', 'title_without_series']], on='book_id')
    def cache_results(self):
        with open(self.cache_path, 'wb') as f:
            to_pickle = {
                'dataset': self.dataset,
                'predictions': self.predictions,
                'temp_store': self.temp_store,
                'pipeline': self.pipeline,
                # key matches the attribute name so retrieve_cache restores it correctly
                'chosen_books_per_user': self.chosen_books_per_user,
                # 'user_details': self.user_details,
            }
            pickle.dump(to_pickle, f)
        self.is_loaded = True
def does_cache_exist(self):
return os.path.exists(self.cache_path)
    def retrieve_cache(self):
        with open(self.cache_path, 'rb') as f:
            unpickled = pickle.load(f)
        for key, val in unpickled.items():
            # setattr handles dynamic attribute names safely (exec on f-strings is fragile)
            setattr(self, key, val)
        self.is_loaded = True
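

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; assumes the Data/ directory layout
# referenced above is present and populated).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    model = Model()
    model.run_predictions_on_full_test()  # results are cached after the first run
    sample_user = model.predictions['user_id'].iloc[0]
    print(model.get_user_predictions(sample_user))  # per-model score breakdown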