Spaces:

juanmartip95
/

recomenderlacocreadora

Sleeping

App Files Files Community

juanmartip95 committed on Jan 10

Commit

8d44559

•

1 Parent(s): 4c8a491

Update utils.py

Browse files

Files changed (1) hide show

utils.py +38 -18

utils.py CHANGED Viewed

@@ -1,47 +1,67 @@
-import streamlit as st
-import pandas as pd
 @st.cache
 def load_and_preprocess_data():
     df = pd.read_csv(
-        "Data/OnlineRetail.csv",
         encoding="latin-1",
     )
     # Remove nans values
     df = df.dropna()
     # Use only positive quantites. This is not a robust approach,
     # but to keep things simple it quite good.
-    df = df[df["Quantity"] > 0]
     # Parse the date column and add 10 years, just to better visualization
-    df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"]).dt.floor(
-        "d"
-    ) + pd.offsets.DateOffset(years=10)
     # Change customer id to int
-    df["CustomerID"] = df["CustomerID"].astype(int)
     # Add price column
-    df["Price"] = df["Quantity"] * df["UnitPrice"]
     # Get unique entries in the dataset of users and products
-    users = df["CustomerID"].unique()
-    products = df["StockCode"].unique()
     # Create a categorical type for users and product. User ordered to ensure
     # reproducibility
-    user_cat = pd.CategoricalDtype(categories=sorted(users), ordered=True)
-    product_cat = pd.CategoricalDtype(categories=sorted(products), ordered=True)
     # Transform and get the indexes of the columns
-    user_idx = df["CustomerID"].astype(user_cat).cat.codes
-    product_idx = df["StockCode"].astype(product_cat).cat.codes
     # Add the categorical index to the starting dataframe
-    df["CustomerIndex"] = user_idx
-    df["ProductIndex"] = product_idx
-    return df, user_idx, product_idx

+import streamlit as st
+import pandas as pd
 @st.cache
 def load_and_preprocess_data():
     df = pd.read_csv(
+        "Ratings.csv",
+        encoding="latin-1",
+    )
+    df_users=pd.read_csv(
+        "Users.csv",
+        encoding="latin-1",
+    )
+    df_books=pd.read_csv(
+        "Books.csv",
         encoding="latin-1",
     )
     # Remove nans values
     df = df.dropna()
+    df_users = df_users.dropna()
+    df_books = df_books.dropna()
     # Use only positive quantites. This is not a robust approach,
     # but to keep things simple it quite good.
+    df = df[df["Book-Rating"] > 0]
     # Parse the date column and add 10 years, just to better visualization
+    #df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"]).dt.floor(   "d") + pd.offsets.DateOffset(years=10)
     # Change customer id to int
+    df["User-ID"] = df["User-ID"].astype(int)
+    df_users["Age"] = df_users["Age"].astype(int)
     # Add price column
+    #df["Price"] = df["Quantity"] * df["UnitPrice"]
     # Get unique entries in the dataset of users and products
+    users = df["User-ID"].unique()
+    products = df["ISBN"].unique()
+    location=df_users["Location"]
+    age=df_users["Age"]
+    title=df_books["Book-Title"]
+    author=df_books["Book-Author"]
+    year=df_books["Year-Of-Publication"]
     # Create a categorical type for users and product. User ordered to ensure
     # reproducibility
+    #user_cat = pd.CategoricalDtype(categories=sorted(users), ordered=True)
+    #product_cat = pd.CategoricalDtype(categories=sorted(products), ordered=True)
     # Transform and get the indexes of the columns
+    #user_idx = df["User-ID"].astype(user_cat).cat.codes
+    #product_idx = df["ISBN"].astype(product_cat).cat.codes
     # Add the categorical index to the starting dataframe
+    #df["CustomerIndex"] = user_idx
+    #df["ProductIndex"] = product_idx
+    # Merging both DataFrames based on respective common columns
+    merged_df = pd.merge(df, df_users[['User-ID', 'Location', 'Age']], on='User-ID', how='left')
+    merged_df = pd.merge(merged_df, df_books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication']], on='ISBN', how='left')
+    return merged_df