import streamlit as st
import pandas as pd


# NOTE(review): st.cache is deprecated in newer Streamlit releases in favour of
# st.cache_data -- kept as-is for compatibility; confirm the target version.
@st.cache
def load_and_preprocess_data():
    """Load the ratings, users and books CSV files and build one merged table.

    Reads ``Ratings.csv``, ``Users.csv`` and ``Books.csv`` (latin-1 encoded)
    from the current working directory, drops rows containing missing values,
    keeps only ratings greater than zero, and left-joins user and book
    attributes onto the ratings. Each rating row is also given a dense integer
    code for its user and its ISBN; codes follow the sorted order of the
    distinct values so they are reproducible between runs.

    Returns:
        tuple: ``(merged_df, user_idx, product_idx)`` where

        * ``merged_df`` is the ratings table enriched with ``Location``,
          ``Age``, ``Book-Title``, ``Book-Author``, ``Year-Of-Publication``
          plus the ``ProductIndex`` and ``CustomerIndex`` code columns;
        * ``user_idx`` is the Series of user codes, aligned row-for-row with
          ``merged_df``;
        * ``product_idx`` is the Series of ISBN codes, aligned row-for-row
          with ``merged_df``.

    Raises:
        FileNotFoundError: if one of the CSV files is missing.
        ValueError: if a column cast to ``int`` holds non-numeric values.
    """
    ratings = pd.read_csv("Ratings.csv", encoding="latin-1")
    df_users = pd.read_csv("Users.csv", encoding="latin-1")
    df_books = pd.read_csv("Books.csv", encoding="latin-1")

    # Remove rows with missing values.
    ratings = ratings.dropna()
    df_users = df_users.dropna()
    df_books = df_books.dropna()

    # Keep only strictly positive ratings. This is not a robust approach, but
    # it keeps things simple. ``.copy()`` makes the filtered frame independent
    # so the column assignment below does not write into a slice.
    ratings = ratings[ratings["Book-Rating"] > 0].copy()

    # Normalise identifier / numeric columns to int.
    ratings["User-ID"] = ratings["User-ID"].astype(int)
    df_users["Age"] = df_users["Age"].astype(int)
    df_books["Year-Of-Publication"] = df_books["Year-Of-Publication"].astype(int)

    # Ordered categorical types over the sorted distinct users and products,
    # so the integer codes are reproducible.
    user_cat = pd.CategoricalDtype(
        categories=sorted(ratings["User-ID"].unique()), ordered=True
    )
    product_cat = pd.CategoricalDtype(
        categories=sorted(ratings["ISBN"].unique()), ordered=True
    )

    # Attach user and book attributes to every rating row.
    merged_df = pd.merge(
        ratings,
        df_users[["User-ID", "Location", "Age"]],
        on="User-ID",
        how="left",
    )
    merged_df = pd.merge(
        merged_df,
        df_books[["ISBN", "Book-Title", "Book-Author", "Year-Of-Publication"]],
        on="ISBN",
        how="left",
    )

    # Encode on the merged frame: the merge resets the index, so codes computed
    # on the pre-merge frame would be misaligned when assigned as columns.
    # Users are encoded with the user categories and ISBNs with the product
    # categories.
    user_idx = merged_df["User-ID"].astype(user_cat).cat.codes
    product_idx = merged_df["ISBN"].astype(product_cat).cat.codes

    merged_df["ProductIndex"] = product_idx
    merged_df["CustomerIndex"] = user_idx

    return merged_df, user_idx, product_idx