Projet_Artefact_Segmentation / 5_customer_segmentation_streamlit.py
Palplatine's picture
Adding files segmentation project
604e735
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import datetime
import streamlit as st
#####################################################################################################################################
st.set_page_config(layout='wide')
# Sidebar: Image + main info on dataset
def data_subset(data, beginning='2010-12-01', end='2011-12-09'):
beginning = pd.to_datetime(beginning)
end = pd.to_datetime(end)
# Subsetting
data = data[(data['InvoiceDate'] >= beginning) & (data['InvoiceDate'] <= end)]
return data
# Loading datasets
df_info = pd.read_csv('static/customer_info.csv')
df_info['InvoiceDate'] = pd.to_datetime(df_info['InvoiceDate'])
with st.sidebar:
col1, col2, col3 = st.columns(3)
with col2:
random_image = Image.open('static/logo_random.png')
st.image(random_image)
# Showing top products
if st.checkbox('Check to see top products sold in a selected timeframe'):
start = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1),
min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=1)
end = st.date_input('Input beginning of the wanted timeframe', datetime.date(2011, 12, 9),
min_value=start, max_value=datetime.date(2011, 12, 9), key=2)
df_top_products = df_info.copy()
df_subset_products = data_subset(df_top_products, start, end)
df_subset_products = df_top_products.groupby('Description')['Quantity'].sum()
number_chosen_products = st.number_input('How many top products sold do you want to see?', value=5)
df_subset_products_top = pd.DataFrame(df_top_products.sort_values(by='Quantity', ascending=False)).iloc[:number_chosen_products,:]
df_subset_products_top = df_subset_products_top[['Description', 'Quantity']]
st.dataframe(df_subset_products_top)
# Showing most recent clients
if st.checkbox('Check to see the most recent customers in a selected timeframe'):
start_clts = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1),
min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=3)
end_clts = st.date_input('Input beginning of the wanted timeframe', datetime.date(2011, 12, 9),
min_value=start_clts, max_value=datetime.date(2011, 12, 9), key=4)
df_recent_customers = df_info.copy()
df_subset_recent_customers = data_subset(df_recent_customers, start_clts, end_clts)
df_subset_recent_customers = df_subset_recent_customers.groupby('CustomerID')['Recency'].min()
number_chosen_recency = st.number_input('How many recent customers do you want to see?', value=5)
df_subset_recent_customers_top = pd.DataFrame(df_subset_recent_customers.sort_values()).iloc[:number_chosen_recency,:]
st.dataframe(df_subset_recent_customers_top)
# Showing most prolific customers
if st.checkbox('Check to see the top customers in a selected timeframe'):
start_top = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1),
min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=5)
end_top = st.date_input('Input beginning of the wanted timeframe', datetime.date(2011, 12, 9),
min_value=start_top, max_value=datetime.date(2011, 12, 9), key=6)
df_top_customers = df_info.copy()
df_subset_top_customers = data_subset(df_top_customers, start_top, end_top)
df_subset_top_customers = df_subset_top_customers.groupby('CustomerID')['Monetary'].sum()
number_chosen_top_clts = st.number_input('How many top customers do you want to see?', value=5)
df_subset_top_customers_top = pd.DataFrame(df_subset_top_customers.sort_values(ascending=False)).iloc[:number_chosen_top_clts,:]
st.dataframe(df_subset_top_customers_top)
#####################################################################################################################################
st.title('E-commerce: client dashboard')
st.write("---")
# Loading dataset
df_info_customer = df_info.copy()
customer_id_default = int(df_info_customer['CustomerID'].min())
# We choose a CustomerID
st.number_input('CustomerID', min_value=customer_id_default, value=customer_id_default, step=1, format="%d", key='customer_id')
customer_id = st.session_state.customer_id
if customer_id not in df_info_customer['CustomerID'].values:
st.write('This CustomerID is not available right now, please find another.')
else:
start_info = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1),
min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=7)
end_info = st.date_input('Input beginning of the wanted timeframe', datetime.date(2011, 12, 9),
min_value=start_info, max_value=datetime.date(2011, 12, 9), key=8)
df_subset_info_customer = data_subset(df_info_customer, start_info, end_info)
# Main info (recency, number of orders, how much the customer spent)
df_subset_info_customer = df_subset_info_customer[df_subset_info_customer['CustomerID'] == customer_id]
df_main_info = df_subset_info_customer.groupby('CustomerID').agg(Recency=('Recency', 'min'), NbOrder=('NbOrder', 'max'), MonetaryTotal=('Monetary', 'sum'))
# GroupBy to get the mean value of each order for the customer
df_mean_order = df_subset_info_customer.groupby(['InvoiceNo', 'CustomerID']).agg(TotalOrderValue=('Monetary', 'sum'))
df_mean_order = df_mean_order.groupby('CustomerID').agg(MeanOrderValue=('TotalOrderValue', 'mean'))
# GroupBy to get the most bought product and its quantity
df_product_clts = pd.DataFrame(df_info.groupby(['CustomerID','Description'])['Quantity'].sum())
df_product_clts = df_product_clts.reset_index()
df_product_clts = df_product_clts[df_product_clts['CustomerID'] == customer_id]
ids, values = df_product_clts.groupby('CustomerID')['Quantity'].max().index, df_product_clts.groupby('CustomerID')['Quantity'].max().values
df_product_clts = df_product_clts[(df_product_clts['CustomerID'] == ids[0]) & (df_product_clts['Quantity'] == values[0])]
# Now we create the columns we want
df_main_info['MeanOrderValue'] = df_mean_order['MeanOrderValue'].values[0]
df_main_info['MostOrderedProduct'] = df_product_clts['Description'].values[0]
df_main_info['MostOrderedProductQuantity'] = df_product_clts['Quantity'].values[0]
# We can show it now that it's complete
st.dataframe(df_main_info)
st.write("---")
#####################################################################################################################################
st.subheader('Similarity between customers:')
with st.expander('Choose a number of similar customers to compare:'):
if st.checkbox('Only similar customers:'):
options_similar = ['Recency', 'NbOrder', 'MonetaryTotal', 'MeanOrderValue']
option_similar = st.selectbox('Choose a feature to plot:', tuple(options_similar))
df_similar_customer = df_info.copy()
# Main info (recency, number of orders, how much the customer spent)
df_similar_customer_grouped = df_similar_customer.groupby('CustomerID').agg(Recency=('Recency', 'min'), NbOrder=('NbOrder', 'max'), MonetaryTotal=('Monetary', 'sum'))
# GroupBy to get the mean value of each order for the customer
df_mean_order_similar = df_similar_customer.groupby(['InvoiceNo', 'CustomerID']).agg(TotalOrderValue=('Monetary', 'sum'))
df_mean_order_similar = df_mean_order_similar.groupby('CustomerID').agg(MeanOrderValue=('TotalOrderValue', 'mean'))
# Now we create the column we want
df_similar_customer_grouped['MeanOrderValue'] = df_mean_order_similar['MeanOrderValue'].values
# We select the client
df_similar_customer_grouped = df_similar_customer_grouped.reset_index()
df_selected_clt = df_similar_customer_grouped[df_similar_customer_grouped['CustomerID'] == customer_id]
# We calculate distances (euclidean)
distances = []
for i in range(df_similar_customer_grouped.shape[0]):
distance = np.linalg.norm(df_similar_customer_grouped.drop('CustomerID', axis=1).values[i] - df_selected_clt.drop('CustomerID', axis=1).values)
distances.append(distance)
n_neighbors = st.slider("Number of similar customers:", min_value=5, max_value=30, value=10)
neighbors = sorted(distances)[:n_neighbors]
# We get the indices of the similar customers
indices_neighbors = []
for i in range(len(neighbors)):
indices_neighbors.append(distances.index(neighbors[i]))
df_neighbors_selected = df_similar_customer_grouped.iloc[indices_neighbors, :]
fig2, ax = plt.subplots()
ax.set_xlabel('Customers', fontsize=17)
ax.set_ylabel(option_similar, fontsize=17)
ax.axhline(y=df_selected_clt[option_similar].values, color='r', label='axhline - full height')
ax = plt.boxplot(df_neighbors_selected[option_similar], showfliers=False)
st.pyplot(fig2)
if st.checkbox('Compare to all customers:'):
options_all = ['Recency', 'NbOrder', 'MonetaryTotal', 'MeanOrderValue']
option_all = st.selectbox('Choose a feature to plot:', tuple(options_all))
df_all_customer = df_info.copy()
# Main info (recency, number of orders, how much the customer spent)
df_all_customer_grouped = df_all_customer.groupby('CustomerID').agg(Recency=('Recency', 'min'), NbOrder=('NbOrder', 'max'), MonetaryTotal=('Monetary', 'sum'))
# GroupBy to get the mean value of each order for the customer
df_mean_order_all = df_all_customer.groupby(['InvoiceNo', 'CustomerID']).agg(TotalOrderValue=('Monetary', 'sum'))
df_mean_order_all = df_mean_order_all.groupby('CustomerID').agg(MeanOrderValue=('TotalOrderValue', 'mean'))
# Now we create the column we want
df_all_customer_grouped['MeanOrderValue'] = df_mean_order_all['MeanOrderValue'].values
# We select the client
df_selected_clt_all = df_all_customer_grouped.reset_index()
df_selected_clt_all = df_selected_clt_all[df_selected_clt_all['CustomerID'] == customer_id]
# We calculate distances (euclidean)
distances = []
for i in range(df_all_customer_grouped.shape[0]):
distance = np.linalg.norm(df_all_customer_grouped.values[i] - df_selected_clt_all.drop('CustomerID', axis=1).values)
distances.append(distance)
fig2, ax = plt.subplots()
ax.set_xlabel('Customers', fontsize=17)
ax.set_ylabel(option_all, fontsize=17)
ax.axhline(y=df_selected_clt_all[option_all].values, color='r', label='axhline - full height')
ax = plt.boxplot(df_all_customer_grouped[option_all], showfliers=False)
st.pyplot(fig2)
st.write("---")
#####################################################################################################################################
st.subheader('Barplot of top selected products in the selected timeframe:')
with st.expander('Select to choose how many top products you want to see and in which timeframe'):
start_product_date = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1),
min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=9)
end_product_date = st.date_input('Input beginning of the wanted timeframe', datetime.date(2011, 12, 9),
min_value=start_product_date, max_value=datetime.date(2011, 12, 9), key=10)
df_top_products_plot = df_info.copy()
df_subset_products = data_subset(df_top_products_plot, start_product_date, end_product_date)
start_product, end_product = st.select_slider('Select a range of top product', options=[x for x in range(1, 21)], value=(1, 10))
df_subset_products = df_subset_products.groupby('Description')['Quantity'].sum()
df_subset_products = df_subset_products.reset_index()
df_slider_products = df_subset_products.sort_values(by='Quantity', ascending=False)
df_slider_products = df_slider_products.iloc[start_product-1:end_product, :]
fig, ax = plt.subplots()
bars = plt.barh(y=df_slider_products['Description'], width=df_slider_products['Quantity'], color=['darkmagenta', 'darkblue', 'darkgreen', 'darkred', 'darkgrey', 'darkorange'])
ax.bar_label(bars)
ax = plt.gca().invert_yaxis()
st.subheader('Selected top products:')
st.pyplot(fig)
st.write("---")
#####################################################################################################################################
st.subheader('Barplot of sales:')
with st.expander('Select to choose the periodicity:'):
options_similar = ['Months', 'Days', 'Hours']
option_similar = st.selectbox('Choose a periodicity:', tuple(options_similar))
if option_similar == 'Months':
df_months = df_info.copy()
df_months = df_months.merge(pd.DataFrame(df_months.groupby('CustomerID')['Monetary'].sum()), on='CustomerID')
df_months['Periodicity'] = pd.DatetimeIndex(df_months['InvoiceDate']).month
df_months = df_months.sort_values('Recency')
df_months = df_months.drop_duplicates(subset='CustomerID')
fig1, ax1 = plt.subplots()
ax1 = sns.barplot(x=df_months['Periodicity'], y=df_months['Monetary_y'], errorbar=None)
plt.title('Sales per Months')
plt.xlabel('Periodicity: Months')
plt.ylabel('TotalOrderValue')
st.pyplot(fig1)
elif option_similar == 'Days':
df_days = df_info.copy()
df_days = df_days.merge(pd.DataFrame(df_days.groupby('CustomerID')['Monetary'].sum()), on='CustomerID')
df_days['Periodicity'] = pd.DatetimeIndex(df_days['InvoiceDate']).day
df_days = df_days.sort_values('Recency')
df_days = df_days.drop_duplicates(subset='CustomerID')
fig2, ax2 = plt.subplots()
ax2 = sns.barplot(x=df_days['Periodicity'], y=df_days['Monetary_y'], errorbar=None)
plt.title('Sales per Days')
plt.xlabel('Periodicity: Days')
plt.xticks(rotation=90)
plt.ylabel('TotalOrderValue')
st.pyplot(fig2)
elif option_similar == 'Hours':
df_hours = df_info.copy()
df_hours = df_hours.merge(pd.DataFrame(df_hours.groupby('CustomerID')['Monetary'].sum()), on='CustomerID')
df_hours['Periodicity'] = pd.DatetimeIndex(df_hours['InvoiceDate']).hour
df_hours = df_hours.sort_values('Recency')
df_hours = df_hours.drop_duplicates(subset='CustomerID')
fig3, ax3 = plt.subplots()
ax3 = sns.barplot(x=df_hours['Periodicity'], y=df_hours['Monetary_y'], errorbar=None)
plt.title('Sales per Hours')
plt.xlabel('Periodicity: Hours')
plt.ylabel('TotalOrderValue')
st.pyplot(fig3)
st.write("---")
#####################################################################################################################################
col1, col2, col3, col4, col5 = st.columns(5)
with col5:
logo_artefact = Image.open('static/logo_artefact.png')
st.image(logo_artefact)