Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
def load_and_preprocess_data():
    """Load the ratings, users and books CSV files and join them.

    Reads ``Ratings.csv``, ``Users.csv`` and ``Books.csv`` (latin-1 encoded)
    from the current working directory, drops rows containing missing
    values, keeps only ratings with ``Book-Rating`` > 0, and left-joins the
    user and book attributes onto each remaining rating.

    Returns:
        A tuple ``(merged_df, user_idx, product_idx)`` where

        * ``merged_df`` has one row per kept rating, plus the ``Location``
          and ``Age`` columns from the users file, the ``Book-Title``,
          ``Book-Author`` and ``Year-Of-Publication`` columns from the books
          file, and a ``ProductIndex`` column.
        * ``user_idx`` is the ``User-ID`` column of ``merged_df``.
        * ``product_idx`` is the ``ProductIndex`` column of ``merged_df``:
          the position of each row's ISBN in the sorted list of the
          distinct ISBNs that appear in the kept ratings.
    """
    ratings = pd.read_csv("Ratings.csv", encoding="latin-1")
    users = pd.read_csv("Users.csv", encoding="latin-1")
    books = pd.read_csv("Books.csv", encoding="latin-1")

    # Remove rows with missing values. ``astype`` returns a new frame, so the
    # later column assignment never writes into a slice of the raw data.
    users = users.dropna().astype({"Age": int})
    books = books.dropna()
    ratings = ratings.dropna()

    # Keep only strictly positive ratings. This is not a robust approach,
    # but it keeps things simple.
    ratings = ratings[ratings["Book-Rating"] > 0].astype({"User-ID": int})

    # Ordered categorical over the sorted distinct ISBNs, so the integer
    # codes are reproducible from run to run.
    product_cat = pd.CategoricalDtype(
        categories=sorted(ratings["ISBN"].unique()),
        ordered=True,
    )

    # Attach the code BEFORE merging: pd.merge gives the result a fresh
    # 0..n-1 index, so a Series computed on the filtered ratings would no
    # longer line up with the merged rows if it were assigned afterwards.
    ratings["ProductIndex"] = ratings["ISBN"].astype(product_cat).cat.codes

    # Left joins keep every rating even when the user or book is unknown
    # (the joined columns are then NaN for that row).
    merged_df = pd.merge(
        ratings,
        users[["User-ID", "Location", "Age"]],
        on="User-ID",
        how="left",
    )
    merged_df = pd.merge(
        merged_df,
        books[["ISBN", "Book-Title", "Book-Author", "Year-Of-Publication"]],
        on="ISBN",
        how="left",
    )

    # Take the index columns from the merged frame so that they are aligned
    # with it row for row.
    user_idx = merged_df["User-ID"]
    product_idx = merged_df["ProductIndex"]
    return merged_df, user_idx, product_idx