Spaces:
Sleeping
Sleeping
juanmartip95
committed on
Commit
•
8d44559
1
Parent(s):
4c8a491
Update utils.py
Browse files
utils.py
CHANGED
@@ -1,47 +1,67 @@
|
|
1 |
-
import streamlit as
|
2 |
-
import pandas as pd
|
3 |
|
4 |
|
5 |
@st.cache
|
6 |
def load_and_preprocess_data():
|
7 |
df = pd.read_csv(
|
8 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
encoding="latin-1",
|
10 |
)
|
11 |
|
12 |
# Remove nans values
|
13 |
df = df.dropna()
|
|
|
|
|
14 |
|
15 |
# Use only positive quantities. This is not a robust approach,
|
16 |
# but to keep things simple it is quite good.
|
17 |
-
df = df[df["
|
18 |
|
19 |
# Parse the date column and add 10 years, just for better visualization
|
20 |
-
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"]).dt.floor(
|
21 |
-
"d"
|
22 |
-
) + pd.offsets.DateOffset(years=10)
|
23 |
|
24 |
# Change customer id to int
|
25 |
-
df["
|
|
|
26 |
|
27 |
# Add price column
|
28 |
-
df["Price"] = df["Quantity"] * df["UnitPrice"]
|
29 |
|
30 |
# Get unique entries in the dataset of users and products
|
31 |
-
users = df["
|
32 |
-
products = df["
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
# Create a categorical type for users and product. User ordered to ensure
|
35 |
# reproducibility
|
36 |
-
user_cat = pd.CategoricalDtype(categories=sorted(users), ordered=True)
|
37 |
-
product_cat = pd.CategoricalDtype(categories=sorted(products), ordered=True)
|
38 |
|
39 |
# Transform and get the indexes of the columns
|
40 |
-
user_idx = df["
|
41 |
-
product_idx = df["
|
42 |
|
43 |
# Add the categorical index to the starting dataframe
|
44 |
-
df["CustomerIndex"] = user_idx
|
45 |
-
df["ProductIndex"] = product_idx
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
-
return
|
|
|
1 |
+
import streamlit as st
import pandas as pd
|
|
|
2 |
|
3 |
|
4 |
# NOTE(review): `st.cache` is reported as deprecated in newer Streamlit
# releases in favour of `st.cache_data` -- confirm against the installed
# version before switching; the decorator is left unchanged here.
@st.cache
def load_and_preprocess_data():
    """Load the ratings, users and books CSV files and join them.

    Reads ``Ratings.csv``, ``Users.csv`` and ``Books.csv`` (latin-1 encoded)
    from the current working directory, drops every row containing a NaN in
    each of the three tables, keeps only ratings with ``Book-Rating`` > 0,
    casts ``User-ID`` (ratings) and ``Age`` (users) to ``int``, and
    left-joins the user and book attributes onto the ratings.

    Returns:
        pandas.DataFrame: one row per remaining rating, carrying the rating
        columns plus ``Location`` and ``Age`` (joined on ``User-ID``) and
        ``Book-Title``, ``Book-Author`` and ``Year-Of-Publication`` (joined
        on ``ISBN``). Because both joins are left joins, a rating whose user
        or book has no row left after NaN removal is kept, with missing
        values in the joined columns.
    """
    ratings = pd.read_csv("Ratings.csv", encoding="latin-1")
    users = pd.read_csv("Users.csv", encoding="latin-1")
    books = pd.read_csv("Books.csv", encoding="latin-1")

    # Remove rows with missing values from every table.
    ratings = ratings.dropna()
    users = users.dropna()
    books = books.dropna()

    # Keep only strictly positive ratings. This is not a robust approach,
    # but it keeps things simple.
    ratings = ratings[ratings["Book-Rating"] > 0]

    # Cast the id / age columns to int. `astype` with a mapping returns a new
    # frame, which avoids assigning into a filtered slice (the pattern that
    # triggers pandas' SettingWithCopyWarning).
    ratings = ratings.astype({"User-ID": int})
    users = users.astype({"Age": int})

    # Attach user attributes, then book attributes, to every rating.
    merged_df = pd.merge(
        ratings,
        users[["User-ID", "Location", "Age"]],
        on="User-ID",
        how="left",
    )
    merged_df = pd.merge(
        merged_df,
        books[["ISBN", "Book-Title", "Book-Author", "Year-Of-Publication"]],
        on="ISBN",
        how="left",
    )

    return merged_df
|