GMARTINEZMILLA committed on
Commit
801e1c6
1 Parent(s): 3ac8646

feat: updated website script and requirements

Browse files
Files changed (2) hide show
  1. app.py +96 -57
  2. requirements.txt +6 -1
app.py CHANGED
@@ -13,6 +13,7 @@ st.set_page_config(page_title="Customer Insights App", page_icon=":bar_chart:")
13
  df = pd.read_csv("df_clean.csv")
14
  nombres_proveedores = pd.read_csv("nombres_proveedores.csv", sep=';')
15
  euros_proveedor = pd.read_csv("euros_proveedor.csv", sep=',')
 
16
 
17
  # Ensure customer codes are strings
18
  df['CLIENTE'] = df['CLIENTE'].astype(str)
@@ -119,84 +120,122 @@ elif page == "Customer Analysis":
119
  customer_data = df[df["CLIENTE"] == str(customer_code)]
120
  customer_euros = euros_proveedor[euros_proveedor["CLIENTE"] == str(customer_code)]
121
 
 
122
  if not customer_data.empty and not customer_euros.empty:
123
  st.write(f"### Analysis for Customer {customer_code}")
124
 
125
- # Get percentage of units sold for each manufacturer
126
- all_manufacturers = customer_data.iloc[:, 1:].T # Exclude CLIENTE column
127
- all_manufacturers.index = all_manufacturers.index.astype(str)
128
-
129
- # Get total sales for each manufacturer
130
- sales_data = customer_euros.iloc[:, 1:].T # Exclude CLIENTE column
131
- sales_data.index = sales_data.index.astype(str)
132
-
133
- # Remove the 'CLIENTE' row from sales_data to avoid issues with mixed types
134
- sales_data_filtered = sales_data.drop(index='CLIENTE', errors='ignore')
135
-
136
- # Ensure all values are numeric
137
- sales_data_filtered = sales_data_filtered.apply(pd.to_numeric, errors='coerce')
138
-
139
- # Sort manufacturers by percentage of units and get top 10
140
- top_units = all_manufacturers.sort_values(by=all_manufacturers.columns[0], ascending=False).head(10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
- # Sort manufacturers by total sales and get top 10
143
- top_sales = sales_data_filtered.sort_values(by=sales_data_filtered.columns[0], ascending=False).head(10)
 
144
 
145
- # Combine top manufacturers from both lists and get up to 20 unique manufacturers
146
- combined_top = pd.concat([top_units, top_sales]).index.unique()[:20]
147
 
148
- # Filter out manufacturers that are not present in both datasets
149
- combined_top = [m for m in combined_top if m in all_manufacturers.index and m in sales_data_filtered.index]
150
 
151
- # Create a DataFrame with combined data for these top manufacturers
152
- combined_data = pd.DataFrame({
153
- 'units': all_manufacturers.loc[combined_top, all_manufacturers.columns[0]],
154
- 'sales': sales_data_filtered.loc[combined_top, sales_data_filtered.columns[0]]
155
- }).fillna(0)
156
 
157
- # Sort by units, then by sales
158
- combined_data_sorted = combined_data.sort_values(by=['units', 'sales'], ascending=False)
 
 
159
 
160
- # Filter out manufacturers with 0 units
161
- non_zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] > 0]
162
 
163
- # If we have less than 3 non-zero manufacturers, add some zero-value ones
164
- if len(non_zero_manufacturers) < 3:
165
- zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] == 0].head(3 - len(non_zero_manufacturers))
166
- manufacturers_to_show = pd.concat([non_zero_manufacturers, zero_manufacturers])
167
- else:
168
- manufacturers_to_show = non_zero_manufacturers
169
 
170
- values = manufacturers_to_show['units'].tolist()
171
- amounts = manufacturers_to_show['sales'].tolist()
172
- manufacturers = [get_supplier_name(m) for m in manufacturers_to_show.index]
173
 
174
- st.write(f"### Results for top {len(manufacturers)} manufacturers:")
175
- for manufacturer, value, amount in zip(manufacturers, values, amounts):
176
- st.write(f"{manufacturer} = {value:.2f}% of units, €{amount:.2f} total sales")
177
 
178
- if manufacturers: # Only create the chart if we have data
179
- fig = radar_chart(manufacturers, values, amounts, f'Radar Chart for Top {len(manufacturers)} Manufacturers of Customer {customer_code}')
180
- st.pyplot(fig)
181
- else:
182
- st.warning("No data available to create the radar chart.")
183
 
184
- # Customer sales 2021-2024 (if data exists)
185
- sales_columns = ['VENTA_2021', 'VENTA_2022', 'VENTA_2023', 'VENTA_2024']
186
- if all(col in df.columns for col in sales_columns):
187
- years = ['2021', '2022', '2023', '2024']
188
- customer_sales = customer_data[sales_columns].values[0]
189
 
190
- fig_sales = px.line(x=years, y=customer_sales, markers=True, title=f'Sales Over the Years for Customer {customer_code}')
191
- fig_sales.update_layout(xaxis_title="Year", yaxis_title="Sales")
192
- st.plotly_chart(fig_sales)
 
 
193
  else:
194
- st.warning("Sales data for 2021-2024 not available.")
195
  else:
196
  st.warning(f"No data found for customer {customer_code}. Please check the code.")
197
  else:
198
  st.warning("Please select a customer.")
199
 
 
200
  # Customer Recommendations Page
201
  elif page == "Articles Recommendations":
202
  st.title("Articles Recommendations")
 
13
  df = pd.read_csv("df_clean.csv")
14
  nombres_proveedores = pd.read_csv("nombres_proveedores.csv", sep=';')
15
  euros_proveedor = pd.read_csv("euros_proveedor.csv", sep=',')
16
+ ventas_clientes = pd.read_csv("ventas_clientes.csv", sep=',')
17
 
18
  # Ensure customer codes are strings
19
  df['CLIENTE'] = df['CLIENTE'].astype(str)
 
120
  customer_data = df[df["CLIENTE"] == str(customer_code)]
121
  customer_euros = euros_proveedor[euros_proveedor["CLIENTE"] == str(customer_code)]
122
 
123
+ # Check if customer data exists
124
  if not customer_data.empty and not customer_euros.empty:
125
  st.write(f"### Analysis for Customer {customer_code}")
126
 
127
+ # **Step 1: Find Customer's Cluster**
128
+ customer_clusters = pd.read_csv('predicts/customer_clusters.csv')
129
+ cluster = customer_clusters[customer_clusters['cliente_id'] == customer_code]['cluster_id'].values[0]
130
+ st.write(f"Customer {customer_code} belongs to cluster {cluster}")
131
+
132
+ # **Step 2: Load the Corresponding Model**
133
+ model_path = f'models/modelo_cluster_{cluster}.txt'
134
+ gbm = lgb.Booster(model_file=model_path)
135
+ st.write(f"Loaded model for cluster {cluster}")
136
+
137
+ # **Step 3: Load X_predict for that cluster and extract customer-specific data**
138
+ X_predict_cluster = pd.read_csv(f'predicts/X_predict_cluster_{cluster}.csv')
139
+ X_cliente = X_predict_cluster[X_predict_cluster['cliente_id'] == customer_code]
140
+
141
+ if not X_cliente.empty:
142
+ # **Step 4: Make Prediction for the selected customer**
143
+ y_pred = gbm.predict(X_cliente.drop(columns=['cliente_id']), num_iteration=gbm.best_iteration)
144
+ st.write(f"Predicted sales for Customer {customer_code}: {y_pred[0]:.2f}")
145
+
146
+ # **Step 5: Merge with actual data from df_agg_2024**
147
+ df_agg_2024 = pd.read_csv('predicts/df_agg_2024.csv')
148
+ actual_sales = df_agg_2024[(df_agg_2024['cliente_id'] == customer_code) & (df_agg_2024['marca_id_encoded'].isin(X_cliente['marca_id_encoded']))]
149
+ if not actual_sales.empty:
150
+ merged_data = pd.merge(
151
+ pd.DataFrame({'cliente_id': [customer_code], 'ventas_predichas': y_pred}),
152
+ actual_sales[['cliente_id', 'marca_id_encoded', 'precio_total']],
153
+ on='cliente_id',
154
+ how='left'
155
+ )
156
+ merged_data.rename(columns={'precio_total': 'ventas_reales'}, inplace=True)
157
+
158
+ # Calculate metrics (MAE, MAPE, RMSE, SMAPE)
159
+ mae = mean_absolute_error(merged_data['ventas_reales'], merged_data['ventas_predichas'])
160
+ mape = np.mean(np.abs((merged_data['ventas_reales'] - merged_data['ventas_predichas']) / merged_data['ventas_reales'])) * 100
161
+ rmse = np.sqrt(mean_squared_error(merged_data['ventas_reales'], merged_data['ventas_predichas']))
162
+ smape_value = smape(merged_data['ventas_reales'], merged_data['ventas_predichas'])
163
+
164
+ st.write(f"MAE: {mae:.2f}")
165
+ st.write(f"MAPE: {mape:.2f}%")
166
+ st.write(f"RMSE: {rmse:.2f}")
167
+ st.write(f"SMAPE: {smape_value:.2f}%")
168
+
169
+ # **Step 6: Analysis of results (show insights if the customer is performing well or not)**
170
+ if mae < threshold_good:
171
+ st.success(f"Customer {customer_code} is performing well based on the predictions.")
172
+ else:
173
+ st.warning(f"Customer {customer_code} is not performing well based on the predictions.")
174
+ else:
175
+ st.warning(f"No actual sales data found for customer {customer_code} in df_agg_2024.")
176
 
177
+ # **Show the radar chart**
178
+ all_manufacturers = customer_data.iloc[:, 1:].T # Exclude CLIENTE column
179
+ all_manufacturers.index = all_manufacturers.index.astype(str)
180
 
181
+ sales_data = customer_euros.iloc[:, 1:].T # Exclude CLIENTE column
182
+ sales_data.index = sales_data.index.astype(str)
183
 
184
+ sales_data_filtered = sales_data.drop(index='CLIENTE', errors='ignore')
185
+ sales_data_filtered = sales_data_filtered.apply(pd.to_numeric, errors='coerce')
186
 
187
+ top_units = all_manufacturers.sort_values(by=all_manufacturers.columns[0], ascending=False).head(10)
188
+ top_sales = sales_data_filtered.sort_values(by=sales_data_filtered.columns[0], ascending=False).head(10)
189
+ combined_top = pd.concat([top_units, top_sales]).index.unique()[:20]
190
+ combined_top = [m for m in combined_top if m in all_manufacturers.index and m in sales_data_filtered.index]
 
191
 
192
+ combined_data = pd.DataFrame({
193
+ 'units': all_manufacturers.loc[combined_top, all_manufacturers.columns[0]],
194
+ 'sales': sales_data_filtered.loc[combined_top, sales_data_filtered.columns[0]]
195
+ }).fillna(0)
196
 
197
+ combined_data_sorted = combined_data.sort_values(by=['units', 'sales'], ascending=False)
198
+ non_zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] > 0]
199
 
200
+ if len(non_zero_manufacturers) < 3:
201
+ zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] == 0].head(3 - len(non_zero_manufacturers))
202
+ manufacturers_to_show = pd.concat([non_zero_manufacturers, zero_manufacturers])
203
+ else:
204
+ manufacturers_to_show = non_zero_manufacturers
 
205
 
206
+ values = manufacturers_to_show['units'].tolist()
207
+ amounts = manufacturers_to_show['sales'].tolist()
208
+ manufacturers = [get_supplier_name(m) for m in manufacturers_to_show.index]
209
 
210
+ st.write(f"### Results for top {len(manufacturers)} manufacturers:")
211
+ for manufacturer, value, amount in zip(manufacturers, values, amounts):
212
+ st.write(f"{manufacturer} = {value:.2f}% of units, €{amount:.2f} total sales")
213
 
214
+ if manufacturers:
215
+ fig = radar_chart(manufacturers, values, amounts, f'Radar Chart for Top {len(manufacturers)} Manufacturers of Customer {customer_code}')
216
+ st.pyplot(fig)
217
+ else:
218
+ st.warning("No data available to create the radar chart.")
219
 
220
+ # **Show sales over the years graph**
221
+ sales_columns = ['VENTA_2021', 'VENTA_2022', 'VENTA_2023']
222
+ if all(col in ventas_clientes.columns for col in sales_columns):
223
+ years = ['2021', '2022', '2023']
224
+ customer_sales = ventas_clientes[ventas_clientes['codigo_cliente'] == customer_code][sales_columns].values[0]
225
 
226
+ fig_sales = px.line(x=years, y=customer_sales, markers=True, title=f'Sales Over the Years for Customer {customer_code}')
227
+ fig_sales.update_layout(xaxis_title="Year", yaxis_title="Sales")
228
+ st.plotly_chart(fig_sales)
229
+ else:
230
+ st.warning("Sales data for 2021-2023 not available.")
231
  else:
232
+ st.warning(f"No prediction data found for customer {customer_code}.")
233
  else:
234
  st.warning(f"No data found for customer {customer_code}. Please check the code.")
235
  else:
236
  st.warning("Please select a customer.")
237
 
238
+
239
  # Customer Recommendations Page
240
  elif page == "Articles Recommendations":
241
  st.title("Articles Recommendations")
requirements.txt CHANGED
@@ -1,3 +1,8 @@
1
  plotly
2
  matplotlib
3
- scikit-learn
 
 
 
 
 
 
1
  plotly
2
  matplotlib
3
+ scikit-learn
4
+ streamlit
5
+ lightgbm
6
+ pandas
7
+ numpy
8
+ joblib