File size: 2,165 Bytes
55432ef
 
172edb9
 
 
 
 
8d44559
 
 
 
 
 
 
 
 
172edb9
 
 
 
 
8d44559
 
172edb9
 
 
8d44559
172edb9
 
8d44559
172edb9
 
8d44559
 
172edb9
 
8d44559
172edb9
 
8d44559
 
 
 
 
 
 
172edb9
 
 
8d44559
 
172edb9
 
8d44559
 
172edb9
 
8d44559
 
 
 
 
 
 
 
 
172edb9
8d44559
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import streamlit as st
import pandas as pd


@st.cache
def load_and_preprocess_data():
    """Load the ratings, users and books CSVs and return them as one table.

    Reads ``Ratings.csv``, ``Users.csv`` and ``Books.csv`` (latin-1 encoded,
    paths relative to the current working directory), drops every row that
    contains a missing value in each table, keeps only ratings whose
    ``Book-Rating`` is strictly positive, casts ``User-ID`` (ratings) and
    ``Age`` (users) to ``int``, and left-joins user and book attributes onto
    the ratings.

    Returns:
        pandas.DataFrame: one row per retained rating (more if a join key is
        duplicated in the users or books table), carrying the rating columns
        plus ``Location`` and ``Age`` (joined on ``User-ID``) and
        ``Book-Title``, ``Book-Author`` and ``Year-Of-Publication`` (joined
        on ``ISBN``). Because both joins are left joins, ratings whose user
        or book has no match after the NaN filtering are kept, with NaN in
        the joined columns.

    NOTE(review): ``st.cache`` is reported as deprecated in newer Streamlit
    releases in favour of ``st.cache_data`` -- confirm against the installed
    Streamlit version before switching the decorator.
    """
    ratings = pd.read_csv("Ratings.csv", encoding="latin-1")
    user_table = pd.read_csv("Users.csv", encoding="latin-1")
    book_table = pd.read_csv("Books.csv", encoding="latin-1")

    # Drop every row that has at least one missing value.
    ratings = ratings.dropna()
    user_table = user_table.dropna()
    book_table = book_table.dropna()

    # Keep only strictly positive ratings. This is not a robust approach,
    # but it keeps things simple.
    ratings = ratings[ratings["Book-Rating"] > 0]

    # Cast the id / age columns to int. `assign` builds a new frame instead
    # of writing into a filtered one, which avoids pandas' chained-assignment
    # (SettingWithCopy) warning while producing the same columns.
    ratings = ratings.assign(**{"User-ID": ratings["User-ID"].astype(int)})
    user_table = user_table.assign(Age=user_table["Age"].astype(int))

    # Attach user attributes, then book attributes, to every rating.
    merged_df = pd.merge(
        ratings,
        user_table[["User-ID", "Location", "Age"]],
        on="User-ID",
        how="left",
    )
    merged_df = pd.merge(
        merged_df,
        book_table[["ISBN", "Book-Title", "Book-Author", "Year-Of-Publication"]],
        on="ISBN",
        how="left",
    )

    return merged_df