File size: 2,205 Bytes
55432ef
 
172edb9
 
 
 
 
8d44559
 
 
 
 
 
 
 
 
172edb9
 
 
 
 
8d44559
 
172edb9
 
 
8d44559
eec1ef5
172edb9
 
8d44559
172edb9
 
8d44559
 
172edb9
 
8d44559
172edb9
 
8d44559
 
 
 
 
 
 
172edb9
 
 
19b1e5c
1626440
172edb9
 
19b1e5c
1626440
172edb9
eec1ef5
8d44559
 
 
f7d18eb
19b1e5c
8d44559
 
 
172edb9
d77e4b8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import streamlit as st
import pandas as pd


@st.cache
def load_and_preprocess_data():
    """Load the ratings, users and books CSV files and merge them.

    Reads ``Ratings.csv``, ``Users.csv`` and ``Books.csv`` (latin-1 encoded)
    from the current working directory, drops rows containing missing
    values, keeps only ratings greater than zero, and left-joins the user
    and book attributes onto each remaining rating.

    Every distinct ``User-ID`` and ``ISBN`` that appears in the kept ratings
    is mapped to a zero-based integer code, assigned in sorted order so the
    mapping is reproducible between runs.

    Returns:
        A tuple ``(merged_df, user_idx, product_idx)`` where

        * ``merged_df`` is the ratings table enriched with ``Location``,
          ``Age``, ``Book-Title``, ``Book-Author`` and
          ``Year-Of-Publication``, plus the ``CustomerIndex`` and
          ``ProductIndex`` code columns;
        * ``user_idx`` is the Series of user codes, aligned row-for-row
          with ``merged_df``;
        * ``product_idx`` is the Series of product codes, aligned
          row-for-row with ``merged_df``.
    """
    # NOTE(review): the `@st.cache` decorator applied to this function is
    # deprecated in recent Streamlit releases in favour of `st.cache_data`;
    # confirm against the Streamlit version this app targets.
    ratings = pd.read_csv("Ratings.csv", encoding="latin-1")
    users = pd.read_csv("Users.csv", encoding="latin-1")
    books = pd.read_csv("Books.csv", encoding="latin-1")

    # Remove rows that contain NaN values.
    ratings = ratings.dropna()
    users = users.dropna()
    books = books.dropna()

    # Use only positive ratings. This is not a robust approach, but it keeps
    # things simple. The explicit copy lets the column assignment below
    # operate on an independent frame instead of a view of the unfiltered
    # data (avoids SettingWithCopyWarning).
    ratings = ratings[ratings["Book-Rating"] > 0].copy()

    # Normalise identifier / age columns to plain integers.
    ratings["User-ID"] = ratings["User-ID"].astype(int)
    users["Age"] = users["Age"].astype(int)

    # Ordered categorical types for users and products. Categories are
    # sorted to ensure reproducible codes.
    user_cat = pd.CategoricalDtype(
        categories=sorted(ratings["User-ID"].unique()), ordered=True
    )
    product_cat = pd.CategoricalDtype(
        categories=sorted(ratings["ISBN"].unique()), ordered=True
    )

    # Attach user and book attributes to each rating.
    merged_df = pd.merge(
        ratings,
        users[["User-ID", "Location", "Age"]],
        on="User-ID",
        how="left",
    )
    merged_df = pd.merge(
        merged_df,
        books[["ISBN", "Book-Title", "Book-Author", "Year-Of-Publication"]],
        on="ISBN",
        how="left",
    )

    # Derive the codes from the merged frame itself: merging resets the row
    # index, so codes computed on the filtered ratings frame would be
    # matched by label against the wrong rows. Each column is cast with its
    # own categorical type (users with user_cat, products with product_cat).
    user_idx = merged_df["User-ID"].astype(user_cat).cat.codes
    product_idx = merged_df["ISBN"].astype(product_cat).cat.codes
    merged_df["ProductIndex"] = product_idx
    merged_df["CustomerIndex"] = user_idx

    return merged_df, user_idx, product_idx