Palplatine committed on
Commit
e14c6fe
1 Parent(s): 590ad2e

First commit streamlit application

Browse files
app.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ from PIL import Image
6
+ import datetime
7
+ import streamlit as st
8
+
9
+ #####################################################################################################################################
10
+ st.set_page_config(layout='wide')
11
+
12
+ # Sidebar: Image + main info on dataset
13
def data_subset(data, beginning='2010-12-01', end='2011-12-09'):
    """Return the rows of ``data`` whose ``InvoiceDate`` is inside a timeframe.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``InvoiceDate`` column that can be compared with
        timestamps.
    beginning, end : str, datetime.date or datetime-like
        Bounds of the timeframe; both are parsed with ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        The rows with ``beginning <= InvoiceDate <= end``. When ``end`` has no
        time-of-day component, the whole ``end`` day is included.
    """
    start_ts = pd.to_datetime(beginning)
    end_ts = pd.to_datetime(end)
    invoice_dates = data['InvoiceDate']

    if end_ts == end_ts.normalize():
        # A bare date parses to midnight, so "<= end" would silently drop every
        # invoice issued later on the end day. Compare against the start of the
        # following day instead to keep the end day fully inclusive.
        before_end = invoice_dates < end_ts + pd.Timedelta(days=1)
    else:
        # An explicit time of day was given: honour it exactly.
        before_end = invoice_dates <= end_ts

    return data[(invoice_dates >= start_ts) & before_end]
22
+
23
# Read the customer/invoice table and convert InvoiceDate to real timestamps
# so the date-range comparisons further down operate on datetimes.
df_info = pd.read_csv('static/customer_info.csv')
df_info = df_info.assign(InvoiceDate=pd.to_datetime(df_info['InvoiceDate']))
26
+
27
# Sidebar: logo plus three optional summary tables, each for its own timeframe.
# NOTE(review): indentation was lost in this copy of the file; the three
# checkbox sections are assumed to sit inside the sidebar, as the
# "Sidebar: Image + main info on dataset" header suggests -- confirm.
with st.sidebar:
    # Three columns so the logo lands in the middle one.
    col1, col2, col3 = st.columns(3)
    with col2:
        random_image = Image.open('static/logo_random.png')
        st.image(random_image)

    # Showing top products
    if st.checkbox('Check to see top products sold in a selected timeframe'):
        start = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1),
                              min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=1)
        end = st.date_input('Input end of the wanted timeframe', datetime.date(2011, 12, 9),
                            min_value=start, max_value=datetime.date(2011, 12, 9), key=2)

        # Total quantity per product, computed on the filtered rows only.
        # (Previously the unfiltered frame was grouped and then the raw rows
        # were sorted, so neither the timeframe nor the totals were used.)
        df_subset_products = data_subset(df_info, start, end)
        quantity_per_product = df_subset_products.groupby('Description')['Quantity'].sum()

        number_chosen_products = st.number_input('How many top products sold do you want to see?',
                                                 min_value=1, value=5, step=1)
        df_subset_products_top = (
            quantity_per_product
            .sort_values(ascending=False)
            .head(int(number_chosen_products))
            .reset_index()  # columns: Description, Quantity
        )
        st.dataframe(df_subset_products_top)

    # Showing most recent clients
    if st.checkbox('Check to see the most recent customers in a selected timeframe'):
        start_clts = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1),
                                   min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=3)
        end_clts = st.date_input('Input end of the wanted timeframe', datetime.date(2011, 12, 9),
                                 min_value=start_clts, max_value=datetime.date(2011, 12, 9), key=4)

        # Smallest Recency value per customer, lowest values first.
        df_subset_recent_customers = data_subset(df_info, start_clts, end_clts)
        recency_per_customer = df_subset_recent_customers.groupby('CustomerID')['Recency'].min()

        number_chosen_recency = st.number_input('How many recent customers do you want to see?',
                                                min_value=1, value=5, step=1)
        df_subset_recent_customers_top = (
            recency_per_customer.sort_values().head(int(number_chosen_recency)).to_frame()
        )
        st.dataframe(df_subset_recent_customers_top)

    # Showing most prolific customers
    if st.checkbox('Check to see the top customers in a selected timeframe'):
        start_top = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1),
                                  min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=5)
        end_top = st.date_input('Input end of the wanted timeframe', datetime.date(2011, 12, 9),
                                min_value=start_top, max_value=datetime.date(2011, 12, 9), key=6)

        # Summed Monetary value per customer, highest totals first.
        df_subset_top_customers = data_subset(df_info, start_top, end_top)
        monetary_per_customer = df_subset_top_customers.groupby('CustomerID')['Monetary'].sum()

        number_chosen_top_clts = st.number_input('How many top customers do you want to see?',
                                                 min_value=1, value=5, step=1)
        df_subset_top_customers_top = (
            monetary_per_customer
            .sort_values(ascending=False)
            .head(int(number_chosen_top_clts))
            .to_frame()
        )
        st.dataframe(df_subset_top_customers_top)
76
+
77
+ #####################################################################################################################################
78
st.title('E-commerce: client dashboard')
st.write("---")

# Work on a copy of the loaded dataset; the smallest CustomerID is the default.
df_info_customer = df_info.copy()
customer_id_default = int(df_info_customer['CustomerID'].min())

# We choose a CustomerID (the widget stores it in the session state)
st.number_input('CustomerID', min_value=customer_id_default, value=customer_id_default, step=1, format="%d", key='customer_id')
customer_id = st.session_state.customer_id

if customer_id not in df_info_customer['CustomerID'].values:
    st.write('This CustomerID is not available right now, please find another.')

else:
    start_info = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1),
                               min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=7)
    end_info = st.date_input('Input end of the wanted timeframe', datetime.date(2011, 12, 9),
                             min_value=start_info, max_value=datetime.date(2011, 12, 9), key=8)

    # Rows of the selected customer inside the selected timeframe.
    df_subset_info_customer = data_subset(df_info_customer, start_info, end_info)
    df_subset_info_customer = df_subset_info_customer[df_subset_info_customer['CustomerID'] == customer_id]

    if df_subset_info_customer.empty:
        # Without this guard the aggregations below are empty and indexing
        # their first element raises IndexError.
        st.write('This customer has no order in the selected timeframe.')

    else:
        # Main info (recency, number of orders, how much the customer spent)
        df_main_info = df_subset_info_customer.groupby('CustomerID').agg(
            Recency=('Recency', 'min'),
            NbOrder=('NbOrder', 'max'),
            MonetaryTotal=('Monetary', 'sum'),
        )

        # Mean value of an order: total per invoice, then the mean of those totals.
        order_totals = df_subset_info_customer.groupby('InvoiceNo')['Monetary'].sum()

        # Most bought product and its quantity. Computed on the same timeframe
        # as the other columns (it previously used the whole dataset).
        quantity_per_product = df_subset_info_customer.groupby('Description')['Quantity'].sum()

        # Now we create the columns we want
        df_main_info['MeanOrderValue'] = order_totals.mean()
        df_main_info['MostOrderedProduct'] = quantity_per_product.idxmax()
        df_main_info['MostOrderedProductQuantity'] = quantity_per_product.max()

        # We can show it now that it's complete
        st.dataframe(df_main_info)

st.write("---")
123
+ #####################################################################################################################################
124
def _customer_features(frame):
    """Return one row per CustomerID with the four features compared below.

    Columns: Recency (min), NbOrder (max), MonetaryTotal (sum of Monetary) and
    MeanOrderValue (mean of the per-invoice Monetary totals).
    """
    features = frame.groupby('CustomerID').agg(
        Recency=('Recency', 'min'),
        NbOrder=('NbOrder', 'max'),
        MonetaryTotal=('Monetary', 'sum'),
    )
    order_totals = frame.groupby(['CustomerID', 'InvoiceNo'])['Monetary'].sum()
    # Assigned by CustomerID index rather than by position, so a mismatch in
    # the two groupings cannot shift values onto the wrong customer.
    features['MeanOrderValue'] = order_totals.groupby(level='CustomerID').mean()
    return features


st.subheader('Similarity between customers:')
with st.expander('Choose a number of similar customers to compare:'):
    feature_names = ('Recency', 'NbOrder', 'MonetaryTotal', 'MeanOrderValue')
    df_customer_features = _customer_features(df_info)

    if customer_id not in df_customer_features.index:
        # The customer check earlier in the script does not stop execution, so
        # guard here too instead of plotting an empty selection.
        st.write('This CustomerID is not available right now, please find another.')

    else:
        selected_features = df_customer_features.loc[customer_id]

        if st.checkbox('Only similar customers:'):
            # Explicit keys: both selectboxes share the same label, which
            # otherwise makes Streamlit raise a duplicate-widget error when
            # both checkboxes are ticked.
            option_similar = st.selectbox('Choose a feature to plot:', feature_names, key='feature_similar')
            n_neighbors = st.slider("Number of similar customers:", min_value=5, max_value=30, value=10)

            # Euclidean distance from every customer to the selected one.
            # NOTE(review): features are not scaled, so the largest-magnitude
            # column dominates the distance -- confirm this is intended.
            distances = np.linalg.norm(
                df_customer_features.values - selected_features.values, axis=1
            )
            # argsort yields distinct row positions even when distances tie
            # (list.index returned the same row repeatedly). The selected
            # customer itself (distance 0) is part of the result, as before.
            nearest_rows = np.argsort(distances, kind='stable')[:n_neighbors]
            df_neighbors_selected = df_customer_features.iloc[nearest_rows]

            fig_similar, ax_similar = plt.subplots()
            ax_similar.set_xlabel('Customers', fontsize=17)
            ax_similar.set_ylabel(option_similar, fontsize=17)
            ax_similar.axhline(y=selected_features[option_similar], color='r', label='axhline - full height')
            ax_similar.boxplot(df_neighbors_selected[option_similar], showfliers=False)

            st.pyplot(fig_similar)
            # Streamlit re-runs the script on every interaction; close the
            # figure so pyplot does not keep accumulating them.
            plt.close(fig_similar)

        if st.checkbox('Compare to all customers:'):
            option_all = st.selectbox('Choose a feature to plot:', feature_names, key='feature_all')

            fig_all, ax_all = plt.subplots()
            ax_all.set_xlabel('Customers', fontsize=17)
            ax_all.set_ylabel(option_all, fontsize=17)
            ax_all.axhline(y=selected_features[option_all], color='r', label='axhline - full height')
            ax_all.boxplot(df_customer_features[option_all], showfliers=False)

            st.pyplot(fig_all)
            plt.close(fig_all)

st.write("---")
206
+ #####################################################################################################################################
207
st.subheader('Barplot of top selected products in the selected timeframe:')
with st.expander('Select to choose how many top products you want to see and in which timeframe'):

    start_product_date = st.date_input('Input beginning of the wanted timeframe', datetime.date(2010, 12, 1),
                                       min_value=datetime.date(2010, 12, 1), max_value=datetime.date(2011, 12, 9), key=9)
    end_product_date = st.date_input('Input end of the wanted timeframe', datetime.date(2011, 12, 9),
                                     min_value=start_product_date, max_value=datetime.date(2011, 12, 9), key=10)
    # Rank window: e.g. (1, 10) shows the products ranked 1st to 10th.
    start_product, end_product = st.select_slider('Select a range of top product', options=list(range(1, 21)), value=(1, 10))

    # Total quantity per product in the timeframe, best sellers first, then
    # keep only the requested rank window (ranks are 1-based, iloc is 0-based).
    df_subset_products = data_subset(df_info, start_product_date, end_product_date)
    df_slider_products = (
        df_subset_products.groupby('Description')['Quantity'].sum()
        .sort_values(ascending=False)
        .iloc[start_product - 1:end_product]
        .reset_index()
    )

    # Draw on the explicit axes instead of pyplot's implicit "current" figure.
    fig, ax = plt.subplots()
    bars = ax.barh(y=df_slider_products['Description'], width=df_slider_products['Quantity'],
                   color=['darkmagenta', 'darkblue', 'darkgreen', 'darkred', 'darkgrey', 'darkorange'])
    ax.bar_label(bars)
    # Best seller at the top of the chart.
    ax.invert_yaxis()

    st.subheader('Selected top products:')
    st.pyplot(fig)
    # The script re-runs on every interaction; close the figure so pyplot does
    # not keep accumulating them.
    plt.close(fig)

st.write("---")
232
+ #####################################################################################################################################
233
+
234
st.subheader('Barplot of sales:')
with st.expander('Select to choose the periodicity:'):
    options_similar = ['Months', 'Days', 'Hours']
    option_similar = st.selectbox('Choose a periodicity:', tuple(options_similar))

    # The three periodicities differ only in which datetime component is
    # extracted, so one code path serves all of them.
    datetime_component = {'Months': 'month', 'Days': 'day', 'Hours': 'hour'}[option_similar]

    # Attach each customer's total Monetary value to every one of their rows;
    # the merge names the new column 'Monetary_y'.
    customer_totals = pd.DataFrame(df_info.groupby('CustomerID')['Monetary'].sum())
    df_period = df_info.merge(customer_totals, on='CustomerID')
    df_period['Periodicity'] = getattr(df_period['InvoiceDate'].dt, datetime_component)

    # Keep a single row per customer (the first after sorting by Recency).
    # NOTE(review): the bars therefore show the mean of per-customer totals,
    # grouped by the period of that one kept row, not the sales summed per
    # period -- confirm this is the intended metric.
    df_period = df_period.sort_values('Recency').drop_duplicates(subset='CustomerID')

    # Draw on explicit axes instead of pyplot's implicit "current" figure.
    fig_sales, ax_sales = plt.subplots()
    sns.barplot(x=df_period['Periodicity'], y=df_period['Monetary_y'], errorbar=None, ax=ax_sales)
    ax_sales.set_title(f'Sales per {option_similar}')
    ax_sales.set_xlabel(f'Periodicity: {option_similar}')
    if option_similar == 'Days':
        # Up to 31 tick labels: rotate them so they stay readable.
        ax_sales.tick_params(axis='x', rotation=90)
    ax_sales.set_ylabel('TotalOrderValue')

    st.pyplot(fig_sales)
    # The script re-runs on every interaction; close the figure so pyplot does
    # not keep accumulating them.
    plt.close(fig_sales)

st.write("---")
284
+ #####################################################################################################################################
285
+
286
# Footer: split the page width into five columns and show the logo in the
# right-most one.
*_, footer_right_column = st.columns(5)
with footer_right_column:
    st.image(Image.open('static/logo_artefact.png'))
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
streamlit
pandas
numpy
# Axes.bar_label (used by app.py) was added in matplotlib 3.4.
matplotlib>=3.4
# app.py imports seaborn; barplot(errorbar=None) needs seaborn >= 0.12.
seaborn>=0.12
# datetime is part of the Python standard library, so it is not listed here
# (the PyPI package of that name is an unrelated project).
Pillow
plotly
static/customer_info.csv ADDED
The diff for this file is too large to render. See raw diff
 
static/logo_artefact.png ADDED
static/logo_random.png ADDED