Spaces:

GMARTINEZMILLA
/

Final_Project

Sleeping

App Files Files Community

GMARTINEZMILLA committed on Oct 15, 2024

Commit

801e1c6

1 Parent(s): 3ac8646

feat: updated website script and requirements

Browse files

Files changed (2) hide show

app.py +96 -57
requirements.txt +6 -1

app.py CHANGED Viewed

@@ -13,6 +13,7 @@ st.set_page_config(page_title="Customer Insights App", page_icon=":bar_chart:")
 df = pd.read_csv("df_clean.csv")
 nombres_proveedores = pd.read_csv("nombres_proveedores.csv", sep=';')
 euros_proveedor = pd.read_csv("euros_proveedor.csv", sep=',')
 # Ensure customer codes are strings
 df['CLIENTE'] = df['CLIENTE'].astype(str)
@@ -119,84 +120,122 @@ elif page == "Customer Analysis":
             customer_data = df[df["CLIENTE"] == str(customer_code)]
             customer_euros = euros_proveedor[euros_proveedor["CLIENTE"] == str(customer_code)]
             if not customer_data.empty and not customer_euros.empty:
                 st.write(f"### Analysis for Customer {customer_code}")
-                # Get percentage of units sold for each manufacturer
-                all_manufacturers = customer_data.iloc[:, 1:].T  # Exclude CLIENTE column
-                all_manufacturers.index = all_manufacturers.index.astype(str)
-                # Get total sales for each manufacturer
-                sales_data = customer_euros.iloc[:, 1:].T  # Exclude CLIENTE column
-                sales_data.index = sales_data.index.astype(str)
-                # Remove the 'CLIENTE' row from sales_data to avoid issues with mixed types
-                sales_data_filtered = sales_data.drop(index='CLIENTE', errors='ignore')
-                # Ensure all values are numeric
-                sales_data_filtered = sales_data_filtered.apply(pd.to_numeric, errors='coerce')
-                # Sort manufacturers by percentage of units and get top 10
-                top_units = all_manufacturers.sort_values(by=all_manufacturers.columns[0], ascending=False).head(10)
-                # Sort manufacturers by total sales and get top 10
-                top_sales = sales_data_filtered.sort_values(by=sales_data_filtered.columns[0], ascending=False).head(10)
-                # Combine top manufacturers from both lists and get up to 20 unique manufacturers
-                combined_top = pd.concat([top_units, top_sales]).index.unique()[:20]
-                # Filter out manufacturers that are not present in both datasets
-                combined_top = [m for m in combined_top if m in all_manufacturers.index and m in sales_data_filtered.index]
-                # Create a DataFrame with combined data for these top manufacturers
-                combined_data = pd.DataFrame({
-                    'units': all_manufacturers.loc[combined_top, all_manufacturers.columns[0]],
-                    'sales': sales_data_filtered.loc[combined_top, sales_data_filtered.columns[0]]
-                }).fillna(0)
-                # Sort by units, then by sales
-                combined_data_sorted = combined_data.sort_values(by=['units', 'sales'], ascending=False)
-                # Filter out manufacturers with 0 units
-                non_zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] > 0]
-                # If we have less than 3 non-zero manufacturers, add some zero-value ones
-                if len(non_zero_manufacturers) < 3:
-                    zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] == 0].head(3 - len(non_zero_manufacturers))
-                    manufacturers_to_show = pd.concat([non_zero_manufacturers, zero_manufacturers])
-                else:
-                    manufacturers_to_show = non_zero_manufacturers
-                values = manufacturers_to_show['units'].tolist()
-                amounts = manufacturers_to_show['sales'].tolist()
-                manufacturers = [get_supplier_name(m) for m in manufacturers_to_show.index]
-                st.write(f"### Results for top {len(manufacturers)} manufacturers:")
-                for manufacturer, value, amount in zip(manufacturers, values, amounts):
-                    st.write(f"{manufacturer} = {value:.2f}% of units, €{amount:.2f} total sales")
-                if manufacturers:  # Only create the chart if we have data
-                    fig = radar_chart(manufacturers, values, amounts, f'Radar Chart for Top {len(manufacturers)} Manufacturers of Customer {customer_code}')
-                    st.pyplot(fig)
-                else:
-                    st.warning("No data available to create the radar chart.")
-                # Customer sales 2021-2024 (if data exists)
-                sales_columns = ['VENTA_2021', 'VENTA_2022', 'VENTA_2023', 'VENTA_2024']
-                if all(col in df.columns for col in sales_columns):
-                    years = ['2021', '2022', '2023', '2024']
-                    customer_sales = customer_data[sales_columns].values[0]
-                    fig_sales = px.line(x=years, y=customer_sales, markers=True, title=f'Sales Over the Years for Customer {customer_code}')
-                    fig_sales.update_layout(xaxis_title="Year", yaxis_title="Sales")
-                    st.plotly_chart(fig_sales)
                 else:
-                    st.warning("Sales data for 2021-2024 not available.")
             else:
                 st.warning(f"No data found for customer {customer_code}. Please check the code.")
         else:
             st.warning("Please select a customer.")
 # Customer Recommendations Page
 elif page == "Articles Recommendations":
     st.title("Articles Recommendations")

 df = pd.read_csv("df_clean.csv")
 nombres_proveedores = pd.read_csv("nombres_proveedores.csv", sep=';')
 euros_proveedor = pd.read_csv("euros_proveedor.csv", sep=',')
+ventas_clientes = pd.read_csv("ventas_clientes.csv", sep=',')
 # Ensure customer codes are strings
 df['CLIENTE'] = df['CLIENTE'].astype(str)
             customer_data = df[df["CLIENTE"] == str(customer_code)]
             customer_euros = euros_proveedor[euros_proveedor["CLIENTE"] == str(customer_code)]
+            # Check if customer data exists
             if not customer_data.empty and not customer_euros.empty:
                 st.write(f"### Analysis for Customer {customer_code}")
+                # **Step 1: Find Customer's Cluster**
+                customer_clusters = pd.read_csv('predicts/customer_clusters.csv')
+                cluster = customer_clusters[customer_clusters['cliente_id'] == customer_code]['cluster_id'].values[0]
+                st.write(f"Customer {customer_code} belongs to cluster {cluster}")
+                # **Step 2: Load the Corresponding Model**
+                model_path = f'models/modelo_cluster_{cluster}.txt'
+                gbm = lgb.Booster(model_file=model_path)
+                st.write(f"Loaded model for cluster {cluster}")
+                # **Step 3: Load X_predict for that cluster and extract customer-specific data**
+                X_predict_cluster = pd.read_csv(f'predicts/X_predict_cluster_{cluster}.csv')
+                X_cliente = X_predict_cluster[X_predict_cluster['cliente_id'] == customer_code]
+                if not X_cliente.empty:
+                    # **Step 4: Make Prediction for the selected customer**
+                    y_pred = gbm.predict(X_cliente.drop(columns=['cliente_id']), num_iteration=gbm.best_iteration)
+                    st.write(f"Predicted sales for Customer {customer_code}: {y_pred[0]:.2f}")
+                    # **Step 5: Merge with actual data from df_agg_2024**
+                    df_agg_2024 = pd.read_csv('predicts/df_agg_2024.csv')
+                    actual_sales = df_agg_2024[(df_agg_2024['cliente_id'] == customer_code) & (df_agg_2024['marca_id_encoded'].isin(X_cliente['marca_id_encoded']))]
+                    if not actual_sales.empty:
+                        merged_data = pd.merge(
+                            pd.DataFrame({'cliente_id': [customer_code], 'ventas_predichas': y_pred}),
+                            actual_sales[['cliente_id', 'marca_id_encoded', 'precio_total']],
+                            on='cliente_id',
+                            how='left'
+                        )
+                        merged_data.rename(columns={'precio_total': 'ventas_reales'}, inplace=True)
+                        # Calculate metrics (MAE, MAPE, RMSE, SMAPE)
+                        mae = mean_absolute_error(merged_data['ventas_reales'], merged_data['ventas_predichas'])
+                        mape = np.mean(np.abs((merged_data['ventas_reales'] - merged_data['ventas_predichas']) / merged_data['ventas_reales'])) * 100
+                        rmse = np.sqrt(mean_squared_error(merged_data['ventas_reales'], merged_data['ventas_predichas']))
+                        smape_value = smape(merged_data['ventas_reales'], merged_data['ventas_predichas'])
+                        st.write(f"MAE: {mae:.2f}")
+                        st.write(f"MAPE: {mape:.2f}%")
+                        st.write(f"RMSE: {rmse:.2f}")
+                        st.write(f"SMAPE: {smape_value:.2f}%")
+                        # **Step 6: Analysis of results (show insights if the customer is performing well or not)**
+                        if mae < threshold_good:
+                            st.success(f"Customer {customer_code} is performing well based on the predictions.")
+                        else:
+                            st.warning(f"Customer {customer_code} is not performing well based on the predictions.")
+                    else:
+                        st.warning(f"No actual sales data found for customer {customer_code} in df_agg_2024.")
+                    # **Show the radar chart**
+                    all_manufacturers = customer_data.iloc[:, 1:].T  # Exclude CLIENTE column
+                    all_manufacturers.index = all_manufacturers.index.astype(str)
+                    sales_data = customer_euros.iloc[:, 1:].T  # Exclude CLIENTE column
+                    sales_data.index = sales_data.index.astype(str)
+                    sales_data_filtered = sales_data.drop(index='CLIENTE', errors='ignore')
+                    sales_data_filtered = sales_data_filtered.apply(pd.to_numeric, errors='coerce')
+                    top_units = all_manufacturers.sort_values(by=all_manufacturers.columns[0], ascending=False).head(10)
+                    top_sales = sales_data_filtered.sort_values(by=sales_data_filtered.columns[0], ascending=False).head(10)
+                    combined_top = pd.concat([top_units, top_sales]).index.unique()[:20]
+                    combined_top = [m for m in combined_top if m in all_manufacturers.index and m in sales_data_filtered.index]
+                    combined_data = pd.DataFrame({
+                        'units': all_manufacturers.loc[combined_top, all_manufacturers.columns[0]],
+                        'sales': sales_data_filtered.loc[combined_top, sales_data_filtered.columns[0]]
+                    }).fillna(0)
+                    combined_data_sorted = combined_data.sort_values(by=['units', 'sales'], ascending=False)
+                    non_zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] > 0]
+                    if len(non_zero_manufacturers) < 3:
+                        zero_manufacturers = combined_data_sorted[combined_data_sorted['units'] == 0].head(3 - len(non_zero_manufacturers))
+                        manufacturers_to_show = pd.concat([non_zero_manufacturers, zero_manufacturers])
+                    else:
+                        manufacturers_to_show = non_zero_manufacturers
+                    values = manufacturers_to_show['units'].tolist()
+                    amounts = manufacturers_to_show['sales'].tolist()
+                    manufacturers = [get_supplier_name(m) for m in manufacturers_to_show.index]
+                    st.write(f"### Results for top {len(manufacturers)} manufacturers:")
+                    for manufacturer, value, amount in zip(manufacturers, values, amounts):
+                        st.write(f"{manufacturer} = {value:.2f}% of units, €{amount:.2f} total sales")
+                    if manufacturers:
+                        fig = radar_chart(manufacturers, values, amounts, f'Radar Chart for Top {len(manufacturers)} Manufacturers of Customer {customer_code}')
+                        st.pyplot(fig)
+                    else:
+                        st.warning("No data available to create the radar chart.")
+                    # **Show sales over the years graph**
+                    sales_columns = ['VENTA_2021', 'VENTA_2022', 'VENTA_2023']
+                    if all(col in ventas_clientes.columns for col in sales_columns):
+                        years = ['2021', '2022', '2023']
+                        customer_sales = ventas_clientes[ventas_clientes['codigo_cliente'] == customer_code][sales_columns].values[0]
+                        fig_sales = px.line(x=years, y=customer_sales, markers=True, title=f'Sales Over the Years for Customer {customer_code}')
+                        fig_sales.update_layout(xaxis_title="Year", yaxis_title="Sales")
+                        st.plotly_chart(fig_sales)
+                    else:
+                        st.warning("Sales data for 2021-2023 not available.")
                 else:
+                    st.warning(f"No prediction data found for customer {customer_code}.")
             else:
                 st.warning(f"No data found for customer {customer_code}. Please check the code.")
         else:
             st.warning("Please select a customer.")
 # Customer Recommendations Page
 elif page == "Articles Recommendations":
     st.title("Articles Recommendations")

requirements.txt CHANGED Viewed

@@ -1,3 +1,8 @@
 plotly
 matplotlib
-scikit-learn

 plotly
 matplotlib
+scikit-learn
+streamlit
+lightgbm
+pandas
+numpy
+joblib