Spaces:

7sugiwa
/

profitboost

Sleeping

App Files Community

7sugiwa committed on Feb 8

Commit

5a56710

•

1 Parent(s): f844302

Upload 4 files

Browse files

Files changed (2) hide show

app.py +32 -50
prediction.py +17 -48

app.py CHANGED Viewed

@@ -4,10 +4,15 @@ import streamlit as st
 import pandas as pd
 import numpy as np
 import joblib
-from eda import (average_sales_by_region, average_sales_and_profit_over_time,
-                 segment_vs_region_distribution, sales_vs_profit_across_segments,
                  category_composition_for_profit_and_sales)
-from prediction import make_prediction
 # Load the dataset for EDA
 @st.cache
@@ -16,10 +21,6 @@ def load_data():
 df = load_data()
-# Load the pipeline and model for predictions
-pipeline = joblib.load('full_pipeline_with_unit_price.pkl')
-model = joblib.load('best_model.pkl')
 # Sidebar for navigation
 st.sidebar.title("Navigation")
 selection = st.sidebar.radio("Go to", ["Home", "EDA", "Make a Prediction"])
@@ -29,66 +30,47 @@ if selection == "Home":
 elif selection == "EDA":
     st.title("Exploratory Data Analysis (EDA)")
-    # Average Sales by Region
-    st.header("Average Sales by Region")
-    fig1 = average_sales_by_region(df)
-    st.pyplot(fig1)
-    # Average Sales and Profit Over Time
-    st.header("Average Sales and Profit Over Time")
-    fig2 = average_sales_and_profit_over_time(df)
-    st.pyplot(fig2)
-    # Segment vs. Region Distribution
-    st.header("Segment vs. Region Distribution")
-    fig3 = segment_vs_region_distribution(df)
-    st.pyplot(fig3)
-    # Sales vs. Profit Across Different Customer Segments
-    st.header("Sales vs. Profit Across Different Customer Segments")
-    fig4 = sales_vs_profit_across_segments(df)
-    st.pyplot(fig4)
-    # Category Composition for Profit and Sales
-    st.header("Category Composition for Profit and Sales")
-    fig5 = category_composition_for_profit_and_sales(df)
-    st.pyplot(fig5)
 elif selection == "Make a Prediction":
     st.title("Make a Sales Prediction")
-            # Input form
     with st.form("input_form"):
-        row_id = st.number_input('Row ID', min_value=1, value=1, step=1)
-        order_id = st.text_input('Order ID')
-        order_date = st.date_input('Order Date')
-        ship_date = st.date_input('Ship Date')
         ship_mode = st.selectbox('Ship Mode', ['First Class', 'Second Class', 'Standard Class', 'Same Day'])
-        customer_id = st.text_input('Customer ID')
-        customer_name = st.text_input('Customer Name')
         segment = st.selectbox('Segment', ['Consumer', 'Corporate', 'Home Office'])
         country = st.text_input('Country', value='United States')
         city = st.text_input('City')
         state = st.text_input('State')
         postal_code = st.text_input('Postal Code')
         region = st.selectbox('Region', ['South', 'West', 'Central', 'East'])
-        product_id = st.text_input('Product ID')
         category = st.selectbox('Category', ['Furniture', 'Office Supplies', 'Technology'])
         sub_category = st.selectbox('Sub-Category', ['Bookcases', 'Chairs', 'Labels', 'Tables', 'Storage', 'Furnishings', 'Art', 'Phones', 'Binders', 'Appliances', 'Paper', 'Accessories', 'Envelopes', 'Fasteners', 'Supplies', 'Machines', 'Copiers'])
         product_name = st.text_input('Product Name')
         sales = st.number_input('Sales', value=0.0, format="%.2f")
         quantity = st.number_input('Quantity', value=1, format="%d")
         discount = st.number_input('Discount', value=0.0, format="%.2f")
-        profit = st.number_input('Profit', value=0.0, format="%.2f")
         submit_button = st.form_submit_button("Predict")
     if submit_button:
-        # Construct the input DataFrame. Modify as necessary to fit the model's expected input
-        input_data = pd.DataFrame([[sales, quantity, discount, sub_category]],
-                                      columns=['sales', 'quantity', 'discount', 'sub_category'])
-        # Call prediction function
-        predicted_profit = make_prediction(input_data)
-        st.write(f'Predicted Profit: {predicted_profit:.2f}')

 import pandas as pd
 import numpy as np
 import joblib
+from datetime import datetime
+# Assuming the necessary EDA functions are defined in eda.py and imported here
+from eda import (average_sales_by_region, average_sales_and_profit_over_time,
+                 segment_vs_region_distribution, sales_vs_profit_across_segments,
                  category_composition_for_profit_and_sales)
+# Load the model for predictions
+model = joblib.load('best_model.pkl')
 # Load the dataset for EDA
 @st.cache
 df = load_data()
 # Sidebar for navigation
 st.sidebar.title("Navigation")
 selection = st.sidebar.radio("Go to", ["Home", "EDA", "Make a Prediction"])
 elif selection == "EDA":
     st.title("Exploratory Data Analysis (EDA)")
+    # Display EDA plots directly here or call a function that does
+    average_sales_by_region(df)
+    average_sales_and_profit_over_time(df)
+    segment_vs_region_distribution(df)
+    sales_vs_profit_across_segments(df)
+    category_composition_for_profit_and_sales(df)
 elif selection == "Make a Prediction":
     st.title("Make a Sales Prediction")
     with st.form("input_form"):
+        # Capture all inputs as per the original dataset
+        order_date = st.date_input('Order Date', datetime.now())
+        ship_date = st.date_input('Ship Date', datetime.now())
         ship_mode = st.selectbox('Ship Mode', ['First Class', 'Second Class', 'Standard Class', 'Same Day'])
         segment = st.selectbox('Segment', ['Consumer', 'Corporate', 'Home Office'])
         country = st.text_input('Country', value='United States')
         city = st.text_input('City')
         state = st.text_input('State')
         postal_code = st.text_input('Postal Code')
         region = st.selectbox('Region', ['South', 'West', 'Central', 'East'])
         category = st.selectbox('Category', ['Furniture', 'Office Supplies', 'Technology'])
         sub_category = st.selectbox('Sub-Category', ['Bookcases', 'Chairs', 'Labels', 'Tables', 'Storage', 'Furnishings', 'Art', 'Phones', 'Binders', 'Appliances', 'Paper', 'Accessories', 'Envelopes', 'Fasteners', 'Supplies', 'Machines', 'Copiers'])
         product_name = st.text_input('Product Name')
         sales = st.number_input('Sales', value=0.0, format="%.2f")
         quantity = st.number_input('Quantity', value=1, format="%d")
         discount = st.number_input('Discount', value=0.0, format="%.2f")
         submit_button = st.form_submit_button("Predict")
     if submit_button:
+        # Construct the input DataFrame
+        input_features = pd.DataFrame([[
+            order_date, ship_date, ship_mode, segment, country, city, state,
+            postal_code, region, category, sub_category, product_name, sales, quantity, discount
+        ]], columns=[
+            'Order Date', 'Ship Date', 'Ship Mode', 'Segment', 'Country', 'City', 'State',
+            'Postal Code', 'Region', 'Category', 'Sub-Category', 'Product Name', 'Sales', 'Quantity', 'Discount'
+        ])
+        # Preprocess and predict (You'll need to adjust this part based on how your model expects input)
+        # For example, you might need to transform 'input_features' to match the expected input format of your model
+        predicted_profit = model.predict(input_features)  # Adjust this line as necessary
+        st.write(f'Predicted Profit: {predicted_profit:.2f}')

prediction.py CHANGED Viewed

@@ -1,87 +1,56 @@
-# prediction.py
 import joblib
 import pandas as pd
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.cluster import KMeans
-import pandas as pd
-import joblib
 class UnitPriceTransformer(BaseEstimator, TransformerMixin):
     def fit(self, X, y=None):
         return self
     def transform(self, X):
         X['unit_price'] = X['sales'] / X['quantity']
-        return X
 class KMeansAndLabelTransformer(BaseEstimator, TransformerMixin):
     def __init__(self, n_clusters=3):
         self.n_clusters = n_clusters
         self.kmeans = KMeans(n_clusters=n_clusters, random_state=42)
     def fit(self, X, y=None):
-        # Fit the KMeans model on the 'unit_price', ensuring it's reshaped for a single feature
         self.kmeans.fit(X[['unit_price']])
         return self
     def transform(self, X):
-        # Predict the cluster labels
         cluster_labels = self.kmeans.predict(X[['unit_price']])
-        # Convert cluster labels to strings for concatenation
-        # Create a new DataFrame column for 'distinct_cluster_label'
-        # Here, we use the apply function with a lambda to concatenate the string representations safely
-        X = X.copy()  # Avoid SettingWithCopyWarning
-        X['cluster_labels_str'] = cluster_labels.astype(str)
-        X['distinct_cluster_label'] = X.apply(lambda row: row['cluster_labels_str'] + "_" + str(row['sub_category']), axis=1)
-        # Now that 'distinct_cluster_label' is created, 'cluster_labels_str' can be dropped
-        X.drop(['cluster_labels_str'], axis=1, inplace=True)
-        return X
 class DynamicOneHotEncoder(BaseEstimator, TransformerMixin):
-    def fit(self, X, y=None):
         self.encoder = OneHotEncoder(handle_unknown='ignore')
         self.encoder.fit(X[['distinct_cluster_label']])
         return self
     def transform(self, X):
         encoded_features = self.encoder.transform(X[['distinct_cluster_label']]).toarray()
         encoded_df = pd.DataFrame(encoded_features, columns=self.encoder.get_feature_names_out(['distinct_cluster_label']))
-        X.reset_index(drop=True, inplace=True)
-        result = pd.concat([X, encoded_df], axis=1)
-        result.drop(['distinct_cluster_label', 'sub_category', 'unit_price'], axis=1, inplace=True)  # Drop original columns if not needed
-        return result
-# Load the preprocessing pipeline
 pipeline = joblib.load('full_pipeline_with_unit_price.pkl')
-# Load the model
 model = joblib.load('best_model.pkl')
 def make_prediction(input_features):
-    """
-    Takes a dictionary of features, transforms it using the pipeline,
-    and makes a prediction with the model.
-    Parameters:
-    - input_features: dict, where keys are feature names and values are the corresponding values
-    Returns:
-    - The predicted value as a float.
-    """
-    # Convert the input features dictionary into a DataFrame
-    features_df = pd.DataFrame([input_features])
-    # Process features through the pipeline
-    processed_features = pipeline.transform(features_df)
-    # Make a prediction with the processed features using the model
     prediction = model.predict(processed_features)
-    return prediction[0]  # Assuming we want a single prediction value

 import joblib
 import pandas as pd
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.cluster import KMeans
+# Custom Transformer: UnitPriceTransformer
 class UnitPriceTransformer(BaseEstimator, TransformerMixin):
     def fit(self, X, y=None):
         return self
     def transform(self, X):
+        X = X.copy()  # Work on a copy to avoid SettingWithCopyWarning
         X['unit_price'] = X['sales'] / X['quantity']
+        return X[['unit_price']]
+# Custom Transformer: KMeansAndLabelTransformer
 class KMeansAndLabelTransformer(BaseEstimator, TransformerMixin):
     def __init__(self, n_clusters=3):
         self.n_clusters = n_clusters
         self.kmeans = KMeans(n_clusters=n_clusters, random_state=42)
     def fit(self, X, y=None):
         self.kmeans.fit(X[['unit_price']])
         return self
     def transform(self, X):
+        X = X.copy()  # Work on a copy to avoid SettingWithCopyWarning
         cluster_labels = self.kmeans.predict(X[['unit_price']])
+        X['distinct_cluster_label'] = cluster_labels.astype(str) + "_" + X['sub_category']
+        return X[['distinct_cluster_label']]
+# Custom Transformer: DynamicOneHotEncoder
 class DynamicOneHotEncoder(BaseEstimator, TransformerMixin):
+    def __init__(self):
         self.encoder = OneHotEncoder(handle_unknown='ignore')
+    def fit(self, X, y=None):
         self.encoder.fit(X[['distinct_cluster_label']])
         return self
     def transform(self, X):
+        X = X.copy()  # Work on a copy to avoid SettingWithCopyWarning
         encoded_features = self.encoder.transform(X[['distinct_cluster_label']]).toarray()
+        # Create a DataFrame with the encoded features
         encoded_df = pd.DataFrame(encoded_features, columns=self.encoder.get_feature_names_out(['distinct_cluster_label']))
+        return encoded_df
+# Load the pipeline and model
 pipeline = joblib.load('full_pipeline_with_unit_price.pkl')
 model = joblib.load('best_model.pkl')
 def make_prediction(input_features):
+    processed_features = pipeline.transform(pd.DataFrame([input_features]))
     prediction = model.predict(processed_features)
+    return prediction[0]