juanmartip95 committed on
Commit
8d44559
1 Parent(s): 4c8a491

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +38 -18
utils.py CHANGED
@@ -1,47 +1,67 @@
1
- import streamlit as st
2
- import pandas as pd
3
 
4
 
5
  @st.cache
6
  def load_and_preprocess_data():
7
  df = pd.read_csv(
8
- "Data/OnlineRetail.csv",
 
 
 
 
 
 
 
 
9
  encoding="latin-1",
10
  )
11
 
12
  # Remove NaN values
13
  df = df.dropna()
 
 
14
 
15
  # Use only positive quantities. This is not a robust approach,
16
  # but to keep things simple it is quite good.
17
- df = df[df["Quantity"] > 0]
18
 
19
  # Parse the date column and add 10 years, just to better visualization
20
- df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"]).dt.floor(
21
- "d"
22
- ) + pd.offsets.DateOffset(years=10)
23
 
24
  # Change customer id to int
25
- df["CustomerID"] = df["CustomerID"].astype(int)
 
26
 
27
  # Add price column
28
- df["Price"] = df["Quantity"] * df["UnitPrice"]
29
 
30
  # Get unique entries in the dataset of users and products
31
- users = df["CustomerID"].unique()
32
- products = df["StockCode"].unique()
 
 
 
 
 
33
 
34
  # Create a categorical type for users and product. User ordered to ensure
35
  # reproducibility
36
- user_cat = pd.CategoricalDtype(categories=sorted(users), ordered=True)
37
- product_cat = pd.CategoricalDtype(categories=sorted(products), ordered=True)
38
 
39
  # Transform and get the indexes of the columns
40
- user_idx = df["CustomerID"].astype(user_cat).cat.codes
41
- product_idx = df["StockCode"].astype(product_cat).cat.codes
42
 
43
  # Add the categorical index to the starting dataframe
44
- df["CustomerIndex"] = user_idx
45
- df["ProductIndex"] = product_idx
 
 
 
 
 
 
 
46
 
47
- return df, user_idx, product_idx
 
1
import streamlit as st
import pandas as pd
 
2
 
3
 
4
# NOTE(review): ``st.cache`` is deprecated in recent Streamlit releases in
# favour of ``st.cache_data`` -- confirm the pinned Streamlit version before
# switching decorators.
@st.cache
def load_and_preprocess_data() -> pd.DataFrame:
    """Load the ratings, users and books CSV files and join them.

    Reads ``Ratings.csv``, ``Users.csv`` and ``Books.csv`` (latin-1 encoded)
    from the current working directory, drops every row that contains a
    missing value, keeps only ratings whose ``Book-Rating`` is greater than
    zero, and left-joins the user and book attributes onto the ratings.

    Returns:
        One row per remaining rating, with the ratings columns plus
        ``Location`` and ``Age`` (joined on ``User-ID``) and ``Book-Title``,
        ``Book-Author`` and ``Year-Of-Publication`` (joined on ``ISBN``).
        Because both joins are left joins, ratings whose user or book was
        dropped for having missing values are kept, with NaN in the joined
        columns.
    """
    ratings = pd.read_csv("Ratings.csv", encoding="latin-1")
    users = pd.read_csv("Users.csv", encoding="latin-1")
    books = pd.read_csv("Books.csv", encoding="latin-1")

    # Drop rows with a missing value in any column of each table.
    ratings = ratings.dropna()
    users = users.dropna()
    books = books.dropna()

    # Keep only strictly positive ratings. The explicit copy makes the
    # column assignment below operate on an independent frame instead of a
    # slice of the unfiltered data (avoids SettingWithCopyWarning).
    ratings = ratings[ratings["Book-Rating"] > 0].copy()

    # Normalise the user id and age columns to integers.
    ratings["User-ID"] = ratings["User-ID"].astype(int)
    users["Age"] = users["Age"].astype(int)

    # Attach user attributes, then book attributes, to every rating.
    merged_df = pd.merge(
        ratings,
        users[["User-ID", "Location", "Age"]],
        on="User-ID",
        how="left",
    )
    merged_df = pd.merge(
        merged_df,
        books[["ISBN", "Book-Title", "Book-Author", "Year-Of-Publication"]],
        on="ISBN",
        how="left",
    )

    return merged_df