Book-Recommender-System / preprocessing.py
quyanh's picture
update
b1ca130
import json
import yaml
import pandas as pd
import numpy as np
from pathlib import Path
from jsonargparse import ArgumentParser
def parse_args():
"""Parse command-line arguments."""
parser = ArgumentParser()
parser.add_argument("--rating_path", type=str, required=True, default="./dataset/ratings.csv")
parser.add_argument("--book_path", type=str, required=True, default="./dataset/books.csv")
parser.add_argument("--out_dir", type=str, required=True, default="./processed")
parser.add_argument("--limit", required=True, type=int, default=1000)
return vars(parser.parse_args())
def main(
rating_path,
book_path,
out_dir,
limit,
**kwargs
):
data = pd.read_csv(rating_path, delimiter=';', nrows=limit, encoding='ISO-8859-1')
# Make Y
Y = data.pivot(index='ISBN', columns='User-ID', values='Book-Rating')
Y = Y.fillna(0)
Y = Y.values
# Make R
R = np.where(Y != 0, 1, 0)
# Save Y and R as dense matrices
out_dir_path = Path(out_dir)
if out_dir_path.exists():
assert out_dir_path.is_dir()
else:
out_dir_path.mkdir(parents=True)
np.save(f'{out_dir_path}/Y.npy', Y)
np.save(f'{out_dir_path}/R.npy', R)
# Create mappings for book and user IDs
book_lst = data['ISBN'].unique()
user_lst = data['User-ID'].unique()
book_id_map = {book_id: i for i, book_id in enumerate(book_lst)}
user_id_map = {user_id: i for i, user_id in enumerate(user_lst)}
# Convert keys to compatible types
book_id_map = {str(key): value for key, value in book_id_map.items()}
user_id_map = {str(key): value for key, value in user_id_map.items()}
# Save book_id_map to file
with open(f'{out_dir_path}/book_id_map.json', 'w') as f:
json.dump(book_id_map, f)
# Save user_id_map to file
with open(f'{out_dir_path}/user_id_map.json', 'w') as f:
json.dump(user_id_map, f)
# Get summary
function = {
"Book-Rating": "mean",
"User-ID": "count"
}
book_df = pd.read_csv(book_path, delimiter=';', encoding='ISO-8859-1', on_bad_lines='skip')
book_df = book_df[book_df["ISBN"].isin(book_id_map.keys())]
summary_rating = data.groupby("ISBN").agg(function, axis=0)
summary_rating = summary_rating.rename(columns={"Book-Rating": "Mean-Rating", "User-ID": "Num-Rating"})
df = book_df.merge(summary_rating, how="left", left_on="ISBN", right_on="ISBN")
df = df.dropna()
df.drop(columns=["Image-URL-S", "Image-URL-M", "Image-URL-L"], inplace=True)
df.to_csv(f"{out_dir_path}/summary_book.csv", index=False)
if __name__ == "__main__":
main(**parse_args())