Spaces:
Sleeping
Sleeping
File size: 2,205 Bytes
55432ef 172edb9 8d44559 172edb9 8d44559 172edb9 8d44559 eec1ef5 172edb9 8d44559 172edb9 8d44559 172edb9 8d44559 172edb9 8d44559 172edb9 19b1e5c 1626440 172edb9 19b1e5c 1626440 172edb9 eec1ef5 8d44559 f7d18eb 19b1e5c 8d44559 172edb9 d77e4b8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import streamlit as st
import pandas as pd
# NOTE(review): newer Streamlit releases deprecate `st.cache` in favour of
# `st.cache_data` -- confirm the pinned Streamlit version before switching.
@st.cache
def load_and_preprocess_data():
    """Load the ratings, users and books CSVs and build one merged table.

    Reads ``Ratings.csv``, ``Users.csv`` and ``Books.csv`` (latin-1 encoded)
    relative to the current working directory, drops rows that contain
    missing values, keeps only ratings with ``Book-Rating > 0`` and
    left-joins the user and book attributes onto the remaining ratings.

    Returns:
        A 3-tuple ``(merged_df, user_idx, product_idx)``:

        * ``merged_df`` -- the filtered ratings joined with ``Location`` and
          ``Age`` (on ``User-ID``) and with ``Book-Title``, ``Book-Author``
          and ``Year-Of-Publication`` (on ``ISBN``), plus the two integer
          columns ``CustomerIndex`` and ``ProductIndex``.
        * ``user_idx`` -- for every row of ``merged_df``, the position of its
          ``User-ID`` in the sorted list of unique user ids.
        * ``product_idx`` -- for every row of ``merged_df``, the position of
          its ``ISBN`` in the sorted list of unique ISBNs.

        Because both joins are left joins, a rating whose user or book has no
        match in the cleaned lookup tables is kept, with NaN in the joined
        columns.
    """
    ratings = pd.read_csv("Ratings.csv", encoding="latin-1")
    df_users = pd.read_csv("Users.csv", encoding="latin-1")
    df_books = pd.read_csv("Books.csv", encoding="latin-1")

    # Remove rows with missing values.
    ratings = ratings.dropna()
    df_users = df_users.dropna()
    df_books = df_books.dropna()

    # Keep only positive ratings. This is not a robust approach, but it keeps
    # things simple. `.copy()` makes the filtered frame independent so the
    # column assignment below does not write into a view of the original.
    ratings = ratings[ratings["Book-Rating"] > 0].copy()

    # Normalise the id / age columns to integers.
    ratings["User-ID"] = ratings["User-ID"].astype(int)
    df_users["Age"] = df_users["Age"].astype(int)

    # Ordered categorical types over the sorted unique users and products;
    # sorting makes the resulting integer codes reproducible across runs.
    user_cat = pd.CategoricalDtype(
        categories=sorted(ratings["User-ID"].unique()), ordered=True
    )
    product_cat = pd.CategoricalDtype(
        categories=sorted(ratings["ISBN"].unique()), ordered=True
    )

    # Attach user and book attributes to every rating.
    merged_df = pd.merge(
        ratings,
        df_users[["User-ID", "Location", "Age"]],
        on="User-ID",
        how="left",
    )
    merged_df = pd.merge(
        merged_df,
        df_books[["ISBN", "Book-Title", "Book-Author", "Year-Of-Publication"]],
        on="ISBN",
        how="left",
    )

    # Compute the codes from the merged frame itself. `pd.merge` returns a
    # fresh RangeIndex while the filtered ratings keep their original labels,
    # so codes computed on the pre-merge frame would be aligned to the wrong
    # rows (or become NaN) when assigned as columns.
    # Users are encoded with `user_cat`; encoding them with the product
    # categories would map every user id to -1.
    user_idx = merged_df["User-ID"].astype(user_cat).cat.codes
    product_idx = merged_df["ISBN"].astype(product_cat).cat.codes

    merged_df["ProductIndex"] = product_idx
    merged_df["CustomerIndex"] = user_idx
    return merged_df, user_idx, product_idx
|