|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from PIL import Image |
|
import datetime |
|
import streamlit as st |
|
|
|
|
|
st.set_page_config(layout='wide') |
|
|
|
|
|
def data_subset(data, beginning='2010-12-01', end='2011-12-09'): |
|
|
|
beginning = pd.to_datetime(beginning) |
|
end = pd.to_datetime(end) |
|
|
|
|
|
data = data[(data['InvoiceDate'] >= beginning) & (data['InvoiceDate'] <= end)] |
|
|
|
return data |
|
|
|
|
|
df_info = pd.read_csv('static/customer_info.csv') |
|
df_info['InvoiceDate'] = pd.to_datetime(df_info['InvoiceDate']) |
|
|
|
with st.sidebar: |
|
col1, col2, col3 = st.columns(3) |
|
with col2: |
|
random_image = Image.open('static/logo_random.png') |
|
st.image(random_image) |
|
|
|
|
|
if st.checkbox('Check to see top products sold in a selected timeframe'): |
|
start = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1), |
|
min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=1) |
|
end = st.date_input('Input beginning of the wanted timeframe', datetime.date(2011, 12, 9), |
|
min_value=start, max_value=datetime.date(2011, 12, 9), key=2) |
|
|
|
df_top_products = df_info.copy() |
|
df_subset_products = data_subset(df_top_products, start, end) |
|
|
|
df_subset_products = df_top_products.groupby('Description')['Quantity'].sum() |
|
number_chosen_products = st.number_input('How many top products sold do you want to see?', value=5) |
|
df_subset_products_top = pd.DataFrame(df_top_products.sort_values(by='Quantity', ascending=False)).iloc[:number_chosen_products,:] |
|
df_subset_products_top = df_subset_products_top[['Description', 'Quantity']] |
|
st.dataframe(df_subset_products_top) |
|
|
|
|
|
if st.checkbox('Check to see the most recent customers in a selected timeframe'): |
|
start_clts = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1), |
|
min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=3) |
|
end_clts = st.date_input('Input beginning of the wanted timeframe', datetime.date(2011, 12, 9), |
|
min_value=start_clts, max_value=datetime.date(2011, 12, 9), key=4) |
|
df_recent_customers = df_info.copy() |
|
df_subset_recent_customers = data_subset(df_recent_customers, start_clts, end_clts) |
|
|
|
df_subset_recent_customers = df_subset_recent_customers.groupby('CustomerID')['Recency'].min() |
|
number_chosen_recency = st.number_input('How many recent customers do you want to see?', value=5) |
|
df_subset_recent_customers_top = pd.DataFrame(df_subset_recent_customers.sort_values()).iloc[:number_chosen_recency,:] |
|
st.dataframe(df_subset_recent_customers_top) |
|
|
|
|
|
if st.checkbox('Check to see the top customers in a selected timeframe'): |
|
start_top = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1), |
|
min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=5) |
|
end_top = st.date_input('Input beginning of the wanted timeframe', datetime.date(2011, 12, 9), |
|
min_value=start_top, max_value=datetime.date(2011, 12, 9), key=6) |
|
df_top_customers = df_info.copy() |
|
df_subset_top_customers = data_subset(df_top_customers, start_top, end_top) |
|
|
|
df_subset_top_customers = df_subset_top_customers.groupby('CustomerID')['Monetary'].sum() |
|
number_chosen_top_clts = st.number_input('How many top customers do you want to see?', value=5) |
|
df_subset_top_customers_top = pd.DataFrame(df_subset_top_customers.sort_values(ascending=False)).iloc[:number_chosen_top_clts,:] |
|
st.dataframe(df_subset_top_customers_top) |
|
|
|
|
|
st.title('E-commerce: client dashboard') |
|
st.write("---") |
|
|
|
|
|
df_info_customer = df_info.copy() |
|
customer_id_default = int(df_info_customer['CustomerID'].min()) |
|
|
|
|
|
st.number_input('CustomerID', min_value=customer_id_default, value=customer_id_default, step=1, format="%d", key='customer_id') |
|
customer_id = st.session_state.customer_id |
|
|
|
if customer_id not in df_info_customer['CustomerID'].values: |
|
st.write('This CustomerID is not available right now, please find another.') |
|
|
|
else: |
|
start_info = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1), |
|
min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=7) |
|
end_info = st.date_input('Input beginning of the wanted timeframe', datetime.date(2011, 12, 9), |
|
min_value=start_info, max_value=datetime.date(2011, 12, 9), key=8) |
|
df_subset_info_customer = data_subset(df_info_customer, start_info, end_info) |
|
|
|
|
|
df_subset_info_customer = df_subset_info_customer[df_subset_info_customer['CustomerID'] == customer_id] |
|
df_main_info = df_subset_info_customer.groupby('CustomerID').agg(Recency=('Recency', 'min'), NbOrder=('NbOrder', 'max'), MonetaryTotal=('Monetary', 'sum')) |
|
|
|
|
|
df_mean_order = df_subset_info_customer.groupby(['InvoiceNo', 'CustomerID']).agg(TotalOrderValue=('Monetary', 'sum')) |
|
df_mean_order = df_mean_order.groupby('CustomerID').agg(MeanOrderValue=('TotalOrderValue', 'mean')) |
|
|
|
|
|
df_product_clts = pd.DataFrame(df_info.groupby(['CustomerID','Description'])['Quantity'].sum()) |
|
df_product_clts = df_product_clts.reset_index() |
|
df_product_clts = df_product_clts[df_product_clts['CustomerID'] == customer_id] |
|
ids, values = df_product_clts.groupby('CustomerID')['Quantity'].max().index, df_product_clts.groupby('CustomerID')['Quantity'].max().values |
|
df_product_clts = df_product_clts[(df_product_clts['CustomerID'] == ids[0]) & (df_product_clts['Quantity'] == values[0])] |
|
|
|
|
|
df_main_info['MeanOrderValue'] = df_mean_order['MeanOrderValue'].values[0] |
|
df_main_info['MostOrderedProduct'] = df_product_clts['Description'].values[0] |
|
df_main_info['MostOrderedProductQuantity'] = df_product_clts['Quantity'].values[0] |
|
|
|
|
|
st.dataframe(df_main_info) |
|
|
|
st.write("---") |
|
|
|
st.subheader('Similarity between customers:') |
|
with st.expander('Choose a number of similar customers to compare:'): |
|
|
|
if st.checkbox('Only similar customers:'): |
|
options_similar = ['Recency', 'NbOrder', 'MonetaryTotal', 'MeanOrderValue'] |
|
option_similar = st.selectbox('Choose a feature to plot:', tuple(options_similar)) |
|
|
|
df_similar_customer = df_info.copy() |
|
|
|
|
|
df_similar_customer_grouped = df_similar_customer.groupby('CustomerID').agg(Recency=('Recency', 'min'), NbOrder=('NbOrder', 'max'), MonetaryTotal=('Monetary', 'sum')) |
|
|
|
|
|
df_mean_order_similar = df_similar_customer.groupby(['InvoiceNo', 'CustomerID']).agg(TotalOrderValue=('Monetary', 'sum')) |
|
df_mean_order_similar = df_mean_order_similar.groupby('CustomerID').agg(MeanOrderValue=('TotalOrderValue', 'mean')) |
|
|
|
|
|
df_similar_customer_grouped['MeanOrderValue'] = df_mean_order_similar['MeanOrderValue'].values |
|
|
|
|
|
df_similar_customer_grouped = df_similar_customer_grouped.reset_index() |
|
df_selected_clt = df_similar_customer_grouped[df_similar_customer_grouped['CustomerID'] == customer_id] |
|
|
|
|
|
distances = [] |
|
for i in range(df_similar_customer_grouped.shape[0]): |
|
distance = np.linalg.norm(df_similar_customer_grouped.drop('CustomerID', axis=1).values[i] - df_selected_clt.drop('CustomerID', axis=1).values) |
|
distances.append(distance) |
|
|
|
n_neighbors = st.slider("Number of similar customers:", min_value=5, max_value=30, value=10) |
|
neighbors = sorted(distances)[:n_neighbors] |
|
|
|
|
|
indices_neighbors = [] |
|
for i in range(len(neighbors)): |
|
indices_neighbors.append(distances.index(neighbors[i])) |
|
|
|
df_neighbors_selected = df_similar_customer_grouped.iloc[indices_neighbors, :] |
|
|
|
fig2, ax = plt.subplots() |
|
ax.set_xlabel('Customers', fontsize=17) |
|
ax.set_ylabel(option_similar, fontsize=17) |
|
ax.axhline(y=df_selected_clt[option_similar].values, color='r', label='axhline - full height') |
|
ax = plt.boxplot(df_neighbors_selected[option_similar], showfliers=False) |
|
|
|
st.pyplot(fig2) |
|
|
|
if st.checkbox('Compare to all customers:'): |
|
options_all = ['Recency', 'NbOrder', 'MonetaryTotal', 'MeanOrderValue'] |
|
option_all = st.selectbox('Choose a feature to plot:', tuple(options_all)) |
|
|
|
df_all_customer = df_info.copy() |
|
|
|
|
|
df_all_customer_grouped = df_all_customer.groupby('CustomerID').agg(Recency=('Recency', 'min'), NbOrder=('NbOrder', 'max'), MonetaryTotal=('Monetary', 'sum')) |
|
|
|
|
|
df_mean_order_all = df_all_customer.groupby(['InvoiceNo', 'CustomerID']).agg(TotalOrderValue=('Monetary', 'sum')) |
|
df_mean_order_all = df_mean_order_all.groupby('CustomerID').agg(MeanOrderValue=('TotalOrderValue', 'mean')) |
|
|
|
|
|
df_all_customer_grouped['MeanOrderValue'] = df_mean_order_all['MeanOrderValue'].values |
|
|
|
|
|
df_selected_clt_all = df_all_customer_grouped.reset_index() |
|
df_selected_clt_all = df_selected_clt_all[df_selected_clt_all['CustomerID'] == customer_id] |
|
|
|
|
|
distances = [] |
|
for i in range(df_all_customer_grouped.shape[0]): |
|
distance = np.linalg.norm(df_all_customer_grouped.values[i] - df_selected_clt_all.drop('CustomerID', axis=1).values) |
|
distances.append(distance) |
|
|
|
fig2, ax = plt.subplots() |
|
ax.set_xlabel('Customers', fontsize=17) |
|
ax.set_ylabel(option_all, fontsize=17) |
|
ax.axhline(y=df_selected_clt_all[option_all].values, color='r', label='axhline - full height') |
|
ax = plt.boxplot(df_all_customer_grouped[option_all], showfliers=False) |
|
|
|
st.pyplot(fig2) |
|
|
|
st.write("---") |
|
|
|
st.subheader('Barplot of top selected products in the selected timeframe:') |
|
with st.expander('Select to choose how many top products you want to see and in which timeframe'): |
|
|
|
start_product_date = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1), |
|
min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=9) |
|
end_product_date = st.date_input('Input beginning of the wanted timeframe', datetime.date(2011, 12, 9), |
|
min_value=start_product_date, max_value=datetime.date(2011, 12, 9), key=10) |
|
df_top_products_plot = df_info.copy() |
|
df_subset_products = data_subset(df_top_products_plot, start_product_date, end_product_date) |
|
start_product, end_product = st.select_slider('Select a range of top product', options=[x for x in range(1, 21)], value=(1, 10)) |
|
df_subset_products = df_subset_products.groupby('Description')['Quantity'].sum() |
|
df_subset_products = df_subset_products.reset_index() |
|
df_slider_products = df_subset_products.sort_values(by='Quantity', ascending=False) |
|
df_slider_products = df_slider_products.iloc[start_product-1:end_product, :] |
|
|
|
fig, ax = plt.subplots() |
|
bars = plt.barh(y=df_slider_products['Description'], width=df_slider_products['Quantity'], color=['darkmagenta', 'darkblue', 'darkgreen', 'darkred', 'darkgrey', 'darkorange']) |
|
|
|
ax.bar_label(bars) |
|
ax = plt.gca().invert_yaxis() |
|
|
|
st.subheader('Selected top products:') |
|
st.pyplot(fig) |
|
|
|
st.write("---") |
|
|
|
|
|
st.subheader('Barplot of sales:') |
|
with st.expander('Select to choose the periodicity:'): |
|
options_similar = ['Months', 'Days', 'Hours'] |
|
option_similar = st.selectbox('Choose a periodicity:', tuple(options_similar)) |
|
|
|
if option_similar == 'Months': |
|
df_months = df_info.copy() |
|
df_months = df_months.merge(pd.DataFrame(df_months.groupby('CustomerID')['Monetary'].sum()), on='CustomerID') |
|
df_months['Periodicity'] = pd.DatetimeIndex(df_months['InvoiceDate']).month |
|
df_months = df_months.sort_values('Recency') |
|
df_months = df_months.drop_duplicates(subset='CustomerID') |
|
|
|
fig1, ax1 = plt.subplots() |
|
ax1 = sns.barplot(x=df_months['Periodicity'], y=df_months['Monetary_y'], errorbar=None) |
|
plt.title('Sales per Months') |
|
plt.xlabel('Periodicity: Months') |
|
plt.ylabel('TotalOrderValue') |
|
st.pyplot(fig1) |
|
|
|
elif option_similar == 'Days': |
|
df_days = df_info.copy() |
|
df_days = df_days.merge(pd.DataFrame(df_days.groupby('CustomerID')['Monetary'].sum()), on='CustomerID') |
|
df_days['Periodicity'] = pd.DatetimeIndex(df_days['InvoiceDate']).day |
|
df_days = df_days.sort_values('Recency') |
|
df_days = df_days.drop_duplicates(subset='CustomerID') |
|
|
|
fig2, ax2 = plt.subplots() |
|
ax2 = sns.barplot(x=df_days['Periodicity'], y=df_days['Monetary_y'], errorbar=None) |
|
plt.title('Sales per Days') |
|
plt.xlabel('Periodicity: Days') |
|
plt.xticks(rotation=90) |
|
plt.ylabel('TotalOrderValue') |
|
st.pyplot(fig2) |
|
|
|
elif option_similar == 'Hours': |
|
df_hours = df_info.copy() |
|
df_hours = df_hours.merge(pd.DataFrame(df_hours.groupby('CustomerID')['Monetary'].sum()), on='CustomerID') |
|
df_hours['Periodicity'] = pd.DatetimeIndex(df_hours['InvoiceDate']).hour |
|
df_hours = df_hours.sort_values('Recency') |
|
df_hours = df_hours.drop_duplicates(subset='CustomerID') |
|
|
|
fig3, ax3 = plt.subplots() |
|
ax3 = sns.barplot(x=df_hours['Periodicity'], y=df_hours['Monetary_y'], errorbar=None) |
|
plt.title('Sales per Hours') |
|
plt.xlabel('Periodicity: Hours') |
|
plt.ylabel('TotalOrderValue') |
|
st.pyplot(fig3) |
|
|
|
|
|
st.write("---") |
|
|
|
|
|
col1, col2, col3, col4, col5 = st.columns(5) |
|
with col5: |
|
logo_artefact = Image.open('static/logo_artefact.png') |
|
st.image(logo_artefact) |