File size: 2,657 Bytes
97ce7fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1ca130
97ce7fb
 
 
b1ca130
97ce7fb
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import json
import yaml
import pandas as pd
import numpy as np

from pathlib import Path
from jsonargparse import ArgumentParser


def parse_args():
    """Parse command-line arguments.

    Every option is optional; when an option is omitted its declared default
    is used. (Previously each option was marked ``required=True``, which made
    the defaults unreachable.)

    Returns:
        dict: the parsed options keyed by ``rating_path``, ``book_path``,
        ``out_dir`` and ``limit``.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--rating_path",
        type=str,
        default="./dataset/ratings.csv",
        help="Path to the ';'-delimited ratings CSV.",
    )
    parser.add_argument(
        "--book_path",
        type=str,
        default="./dataset/books.csv",
        help="Path to the ';'-delimited books CSV.",
    )
    parser.add_argument(
        "--out_dir",
        type=str,
        default="./processed",
        help="Directory that receives the generated files.",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=1000,
        help="Maximum number of rating rows to read.",
    )

    return vars(parser.parse_args())


def main(
    rating_path,
    book_path,
    out_dir,
    limit,
    **kwargs
):
    """Build rating matrices, ID maps and a per-book summary from CSV files.

    Files written into ``out_dir`` (created if missing):

    * ``Y.npy`` - dense (books x users) rating matrix, 0 where no rating.
    * ``R.npy`` - indicator matrix, 1 where ``Y`` is non-zero, else 0.
    * ``book_id_map.json`` / ``user_id_map.json`` - map an ISBN / User-ID
      (as a string) to its row / column index in ``Y`` and ``R``.
    * ``summary_book.csv`` - book metadata joined with the mean rating and
      the number of ratings of each book.

    Args:
        rating_path: ';'-delimited CSV with the columns ``User-ID``,
            ``ISBN`` and ``Book-Rating``.
        book_path: ';'-delimited CSV with an ``ISBN`` column and the three
            ``Image-URL-*`` columns.
        out_dir: output directory.
        limit: maximum number of rating rows to read.
        **kwargs: accepted and ignored, so a larger option dict can be
            splatted into this function.

    Raises:
        FileExistsError: if ``out_dir`` exists and is not a directory.
        ValueError: raised by ``DataFrame.pivot`` when the same
            (ISBN, User-ID) pair occurs more than once in the rows read.
    """
    # Read ISBN as text: type inference would turn all-digit ISBNs into
    # integers, dropping leading zeros and breaking the join with the books.
    data = pd.read_csv(
        rating_path,
        delimiter=';',
        nrows=limit,
        encoding='ISO-8859-1',
        dtype={'ISBN': str},
    )

    # Make Y
    rating_table = data.pivot(index='ISBN', columns='User-ID', values='Book-Rating')
    Y = rating_table.fillna(0).values

    # Make R
    # NOTE(review): an explicit rating of 0 cannot be told apart from a
    # missing rating here, so it is flagged as "not rated" - confirm intent.
    R = np.where(Y != 0, 1, 0)

    # Save Y and R as dense matrices
    out_dir_path = Path(out_dir)
    # Raises FileExistsError when the path exists but is not a directory.
    out_dir_path.mkdir(parents=True, exist_ok=True)
    np.save(out_dir_path / 'Y.npy', Y)
    np.save(out_dir_path / 'R.npy', R)

    # Create mappings for book and user IDs. They are derived from the pivot
    # axes (not from order of first appearance) so that each index really
    # addresses the matching row / column of Y and R. Keys are converted to
    # str to be JSON-compatible.
    book_id_map = {str(isbn): row for row, isbn in enumerate(rating_table.index)}
    user_id_map = {str(user): col for col, user in enumerate(rating_table.columns)}

    # Save book_id_map to file
    with open(out_dir_path / 'book_id_map.json', 'w', encoding='utf-8') as f:
        json.dump(book_id_map, f)

    # Save user_id_map to file
    with open(out_dir_path / 'user_id_map.json', 'w', encoding='utf-8') as f:
        json.dump(user_id_map, f)

    # Get summary
    aggregations = {
        "Book-Rating": "mean",
        "User-ID": "count"
    }

    book_df = pd.read_csv(
        book_path,
        delimiter=';',
        encoding='ISO-8859-1',
        on_bad_lines='skip',
        dtype={'ISBN': str},
    )
    book_df = book_df[book_df["ISBN"].isin(book_id_map)]
    summary_rating = data.groupby("ISBN").agg(aggregations)
    summary_rating = summary_rating.rename(
        columns={"Book-Rating": "Mean-Rating", "User-ID": "Num-Rating"}
    )
    df = book_df.merge(summary_rating, how="left", on="ISBN")
    # NOTE(review): rows are dropped for a missing value in ANY column,
    # including the image-URL columns that are discarded right after.
    df = df.dropna()
    df = df.drop(columns=["Image-URL-S", "Image-URL-M", "Image-URL-L"])
    df.to_csv(out_dir_path / "summary_book.csv", index=False)


if __name__ == "__main__":
    # Script entry point: parse the CLI options, then run the pipeline with them.
    cli_options = parse_args()
    main(**cli_options)