7sugiwa committed on
Commit
5a56710
1 Parent(s): f844302

Upload 4 files

Browse files
Files changed (2) hide show
  1. app.py +32 -50
  2. prediction.py +17 -48
app.py CHANGED
@@ -4,10 +4,15 @@ import streamlit as st
4
  import pandas as pd
5
  import numpy as np
6
  import joblib
7
- from eda import (average_sales_by_region, average_sales_and_profit_over_time,
8
- segment_vs_region_distribution, sales_vs_profit_across_segments,
 
 
 
9
  category_composition_for_profit_and_sales)
10
- from prediction import make_prediction
 
 
11
 
12
  # Load the dataset for EDA
13
  @st.cache
@@ -16,10 +21,6 @@ def load_data():
16
 
17
  df = load_data()
18
 
19
- # Load the pipeline and model for predictions
20
- pipeline = joblib.load('full_pipeline_with_unit_price.pkl')
21
- model = joblib.load('best_model.pkl')
22
-
23
  # Sidebar for navigation
24
  st.sidebar.title("Navigation")
25
  selection = st.sidebar.radio("Go to", ["Home", "EDA", "Make a Prediction"])
@@ -29,66 +30,47 @@ if selection == "Home":
29
 
30
  elif selection == "EDA":
31
  st.title("Exploratory Data Analysis (EDA)")
32
-
33
- # Average Sales by Region
34
- st.header("Average Sales by Region")
35
- fig1 = average_sales_by_region(df)
36
- st.pyplot(fig1)
37
-
38
- # Average Sales and Profit Over Time
39
- st.header("Average Sales and Profit Over Time")
40
- fig2 = average_sales_and_profit_over_time(df)
41
- st.pyplot(fig2)
42
-
43
- # Segment vs. Region Distribution
44
- st.header("Segment vs. Region Distribution")
45
- fig3 = segment_vs_region_distribution(df)
46
- st.pyplot(fig3)
47
-
48
- # Sales vs. Profit Across Different Customer Segments
49
- st.header("Sales vs. Profit Across Different Customer Segments")
50
- fig4 = sales_vs_profit_across_segments(df)
51
- st.pyplot(fig4)
52
-
53
- # Category Composition for Profit and Sales
54
- st.header("Category Composition for Profit and Sales")
55
- fig5 = category_composition_for_profit_and_sales(df)
56
- st.pyplot(fig5)
57
-
58
  elif selection == "Make a Prediction":
59
  st.title("Make a Sales Prediction")
60
- # Input form
61
  with st.form("input_form"):
62
- row_id = st.number_input('Row ID', min_value=1, value=1, step=1)
63
- order_id = st.text_input('Order ID')
64
- order_date = st.date_input('Order Date')
65
- ship_date = st.date_input('Ship Date')
66
  ship_mode = st.selectbox('Ship Mode', ['First Class', 'Second Class', 'Standard Class', 'Same Day'])
67
- customer_id = st.text_input('Customer ID')
68
- customer_name = st.text_input('Customer Name')
69
  segment = st.selectbox('Segment', ['Consumer', 'Corporate', 'Home Office'])
70
  country = st.text_input('Country', value='United States')
71
  city = st.text_input('City')
72
  state = st.text_input('State')
73
  postal_code = st.text_input('Postal Code')
74
  region = st.selectbox('Region', ['South', 'West', 'Central', 'East'])
75
- product_id = st.text_input('Product ID')
76
  category = st.selectbox('Category', ['Furniture', 'Office Supplies', 'Technology'])
77
  sub_category = st.selectbox('Sub-Category', ['Bookcases', 'Chairs', 'Labels', 'Tables', 'Storage', 'Furnishings', 'Art', 'Phones', 'Binders', 'Appliances', 'Paper', 'Accessories', 'Envelopes', 'Fasteners', 'Supplies', 'Machines', 'Copiers'])
78
  product_name = st.text_input('Product Name')
79
  sales = st.number_input('Sales', value=0.0, format="%.2f")
80
  quantity = st.number_input('Quantity', value=1, format="%d")
81
  discount = st.number_input('Discount', value=0.0, format="%.2f")
82
- profit = st.number_input('Profit', value=0.0, format="%.2f")
83
 
84
  submit_button = st.form_submit_button("Predict")
85
 
86
  if submit_button:
87
- # Construct the input DataFrame. Modify as necessary to fit the model's expected input
88
- input_data = pd.DataFrame([[sales, quantity, discount, sub_category]],
89
- columns=['sales', 'quantity', 'discount', 'sub_category'])
90
-
91
- # Call prediction function
92
- predicted_profit = make_prediction(input_data)
93
-
94
- st.write(f'Predicted Profit: {predicted_profit:.2f}')
 
 
 
 
 
 
 
4
  import pandas as pd
5
  import numpy as np
6
  import joblib
7
+ from datetime import datetime
8
+
9
+ # Assuming the necessary EDA functions are defined in eda.py and imported here
10
+ from eda import (average_sales_by_region, average_sales_and_profit_over_time,
11
+ segment_vs_region_distribution, sales_vs_profit_across_segments,
12
  category_composition_for_profit_and_sales)
13
+
14
+ # Load the model for predictions
15
+ model = joblib.load('best_model.pkl')
16
 
17
  # Load the dataset for EDA
18
  @st.cache
 
21
 
22
  df = load_data()
23
 
 
 
 
 
24
  # Sidebar for navigation
25
  st.sidebar.title("Navigation")
26
  selection = st.sidebar.radio("Go to", ["Home", "EDA", "Make a Prediction"])
 
30
 
31
  elif selection == "EDA":
32
  st.title("Exploratory Data Analysis (EDA)")
33
+ # Display EDA plots directly here or call a function that does
34
+ average_sales_by_region(df)
35
+ average_sales_and_profit_over_time(df)
36
+ segment_vs_region_distribution(df)
37
+ sales_vs_profit_across_segments(df)
38
+ category_composition_for_profit_and_sales(df)
39
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  elif selection == "Make a Prediction":
41
  st.title("Make a Sales Prediction")
 
42
  with st.form("input_form"):
43
+ # Capture all inputs as per the original dataset
44
+ order_date = st.date_input('Order Date', datetime.now())
45
+ ship_date = st.date_input('Ship Date', datetime.now())
 
46
  ship_mode = st.selectbox('Ship Mode', ['First Class', 'Second Class', 'Standard Class', 'Same Day'])
 
 
47
  segment = st.selectbox('Segment', ['Consumer', 'Corporate', 'Home Office'])
48
  country = st.text_input('Country', value='United States')
49
  city = st.text_input('City')
50
  state = st.text_input('State')
51
  postal_code = st.text_input('Postal Code')
52
  region = st.selectbox('Region', ['South', 'West', 'Central', 'East'])
 
53
  category = st.selectbox('Category', ['Furniture', 'Office Supplies', 'Technology'])
54
  sub_category = st.selectbox('Sub-Category', ['Bookcases', 'Chairs', 'Labels', 'Tables', 'Storage', 'Furnishings', 'Art', 'Phones', 'Binders', 'Appliances', 'Paper', 'Accessories', 'Envelopes', 'Fasteners', 'Supplies', 'Machines', 'Copiers'])
55
  product_name = st.text_input('Product Name')
56
  sales = st.number_input('Sales', value=0.0, format="%.2f")
57
  quantity = st.number_input('Quantity', value=1, format="%d")
58
  discount = st.number_input('Discount', value=0.0, format="%.2f")
 
59
 
60
  submit_button = st.form_submit_button("Predict")
61
 
62
  if submit_button:
63
+ # Construct the input DataFrame
64
+ input_features = pd.DataFrame([[
65
+ order_date, ship_date, ship_mode, segment, country, city, state,
66
+ postal_code, region, category, sub_category, product_name, sales, quantity, discount
67
+ ]], columns=[
68
+ 'Order Date', 'Ship Date', 'Ship Mode', 'Segment', 'Country', 'City', 'State',
69
+ 'Postal Code', 'Region', 'Category', 'Sub-Category', 'Product Name', 'Sales', 'Quantity', 'Discount'
70
+ ])
71
+
72
+ # Preprocess and predict (You'll need to adjust this part based on how your model expects input)
73
+ # For example, you might need to transform 'input_features' to match the expected input format of your model
74
+ predicted_profit = model.predict(input_features) # Adjust this line as necessary
75
+
76
+ st.write(f'Predicted Profit: {predicted_profit:.2f}')
prediction.py CHANGED
@@ -1,87 +1,56 @@
1
- # prediction.py
2
-
3
  import joblib
4
  import pandas as pd
5
  from sklearn.base import BaseEstimator, TransformerMixin
6
  from sklearn.preprocessing import OneHotEncoder
7
  from sklearn.cluster import KMeans
8
- import pandas as pd
9
- import joblib
10
 
 
11
  class UnitPriceTransformer(BaseEstimator, TransformerMixin):
12
  def fit(self, X, y=None):
13
  return self
14
 
15
  def transform(self, X):
 
16
  X['unit_price'] = X['sales'] / X['quantity']
17
- return X
18
 
 
19
  class KMeansAndLabelTransformer(BaseEstimator, TransformerMixin):
20
  def __init__(self, n_clusters=3):
21
  self.n_clusters = n_clusters
22
  self.kmeans = KMeans(n_clusters=n_clusters, random_state=42)
23
 
24
  def fit(self, X, y=None):
25
- # Fit the KMeans model on the 'unit_price', ensuring it's reshaped for a single feature
26
  self.kmeans.fit(X[['unit_price']])
27
  return self
28
 
29
  def transform(self, X):
30
- # Predict the cluster labels
31
  cluster_labels = self.kmeans.predict(X[['unit_price']])
32
-
33
- # Convert cluster labels to strings for concatenation
34
- # Create a new DataFrame column for 'distinct_cluster_label'
35
- # Here, we use the apply function with a lambda to concatenate the string representations safely
36
- X = X.copy() # Avoid SettingWithCopyWarning
37
- X['cluster_labels_str'] = cluster_labels.astype(str)
38
- X['distinct_cluster_label'] = X.apply(lambda row: row['cluster_labels_str'] + "_" + str(row['sub_category']), axis=1)
39
-
40
- # Now that 'distinct_cluster_label' is created, 'cluster_labels_str' can be dropped
41
- X.drop(['cluster_labels_str'], axis=1, inplace=True)
42
-
43
- return X
44
-
45
-
46
 
 
47
  class DynamicOneHotEncoder(BaseEstimator, TransformerMixin):
48
- def fit(self, X, y=None):
49
  self.encoder = OneHotEncoder(handle_unknown='ignore')
 
 
50
  self.encoder.fit(X[['distinct_cluster_label']])
51
  return self
52
 
53
  def transform(self, X):
 
54
  encoded_features = self.encoder.transform(X[['distinct_cluster_label']]).toarray()
 
55
  encoded_df = pd.DataFrame(encoded_features, columns=self.encoder.get_feature_names_out(['distinct_cluster_label']))
56
- X.reset_index(drop=True, inplace=True)
57
- result = pd.concat([X, encoded_df], axis=1)
58
- result.drop(['distinct_cluster_label', 'sub_category', 'unit_price'], axis=1, inplace=True) # Drop original columns if not needed
59
- return result
60
 
61
- # Load the preprocessing pipeline
62
  pipeline = joblib.load('full_pipeline_with_unit_price.pkl')
63
-
64
- # Load the model
65
  model = joblib.load('best_model.pkl')
66
 
67
  def make_prediction(input_features):
68
- """
69
- Takes a dictionary of features, transforms it using the pipeline,
70
- and makes a prediction with the model.
71
-
72
- Parameters:
73
- - input_features: dict, where keys are feature names and values are the corresponding values
74
-
75
- Returns:
76
- - The predicted value as a float.
77
- """
78
- # Convert the input features dictionary into a DataFrame
79
- features_df = pd.DataFrame([input_features])
80
-
81
- # Process features through the pipeline
82
- processed_features = pipeline.transform(features_df)
83
-
84
- # Make a prediction with the processed features using the model
85
  prediction = model.predict(processed_features)
86
-
87
- return prediction[0] # Assuming we want a single prediction value
 
 
 
1
  import joblib
2
  import pandas as pd
3
  from sklearn.base import BaseEstimator, TransformerMixin
4
  from sklearn.preprocessing import OneHotEncoder
5
  from sklearn.cluster import KMeans
 
 
6
 
7
# Custom Transformer: UnitPriceTransformer
class UnitPriceTransformer(BaseEstimator, TransformerMixin):
    """Derive a per-unit price feature from the ``sales`` and ``quantity`` columns.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    returns a new single-column DataFrame, leaving the input untouched.

    NOTE(review): only ``unit_price`` is returned, while
    ``KMeansAndLabelTransformer.transform`` in this module also reads
    ``sub_category`` -- confirm how the pipeline supplies that column.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return a DataFrame whose only column, ``unit_price``, is ``sales / quantity``.

        The result keeps the index of ``X``. Division follows pandas
        semantics, so a zero ``quantity`` does not raise here.
        """
        price_per_unit = X['sales'] / X['quantity']
        return price_per_unit.to_frame(name='unit_price')
16
 
17
# Custom Transformer: KMeansAndLabelTransformer
class KMeansAndLabelTransformer(BaseEstimator, TransformerMixin):
    """Cluster rows on ``unit_price`` and tag each row with ``"<cluster>_<sub_category>"``.

    Parameters:
    - n_clusters: number of KMeans clusters (default 3).

    The fitted KMeans model is stored on ``self.kmeans``.
    """

    def __init__(self, n_clusters=3):
        self.n_clusters = n_clusters
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42)

    def fit(self, X, y=None):
        """Fit KMeans on the ``unit_price`` column of ``X`` and return ``self``."""
        # Keep the inner estimator in step with n_clusters, so that a
        # set_params(n_clusters=...) made after construction is honoured.
        self.kmeans.set_params(n_clusters=self.n_clusters)
        self.kmeans.fit(X[['unit_price']])
        return self

    def transform(self, X):
        """Return a one-column DataFrame ``distinct_cluster_label`` indexed like ``X``.

        Each value is the predicted cluster id, an underscore, and the row's
        ``sub_category`` rendered as a string.
        """
        # Build the label with pandas rather than ``ndarray + str``: string
        # addition on a NumPy unicode array is not supported by every NumPy
        # release, whereas Series concatenation always is.
        cluster_ids = pd.Series(
            self.kmeans.predict(X[['unit_price']]), index=X.index
        )
        labels = cluster_ids.astype(str) + "_" + X['sub_category'].astype(str)
        return labels.to_frame(name='distinct_cluster_label')
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
# Custom Transformer: DynamicOneHotEncoder
class DynamicOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the ``distinct_cluster_label`` column.

    Categories are learned in ``fit``. Labels not seen during fitting are
    ignored at transform time (``handle_unknown='ignore'``), i.e. they
    encode to an all-zero row.
    """

    def __init__(self):
        self.encoder = OneHotEncoder(handle_unknown='ignore')

    def fit(self, X, y=None):
        """Learn the label categories from ``X['distinct_cluster_label']``; return ``self``."""
        self.encoder.fit(X[['distinct_cluster_label']])
        return self

    def transform(self, X):
        """Return a DataFrame of one-hot columns for ``distinct_cluster_label``.

        Column names come from the encoder's ``get_feature_names_out``. The
        result carries the index of ``X`` so the encoded rows stay aligned
        with their source rows if they are later combined with other columns.
        """
        # X is only read here, so no defensive copy is needed.
        encoded_features = self.encoder.transform(
            X[['distinct_cluster_label']]
        ).toarray()
        return pd.DataFrame(
            encoded_features,
            columns=self.encoder.get_feature_names_out(['distinct_cluster_label']),
            index=X.index,
        )
 
 
 
48
 
49
# Load the pipeline and model
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts
# that come from a trusted source.
pipeline = joblib.load('full_pipeline_with_unit_price.pkl')
model = joblib.load('best_model.pkl')


def make_prediction(input_features):
    """Run one observation through the preprocessing pipeline and the model.

    Parameters:
    - input_features: either a mapping of feature name -> value describing a
      single row, or an already-built pandas DataFrame.

    Returns:
    - The first element of the model's prediction output.
    """
    # Callers may already hold a DataFrame; wrapping one in a list and
    # re-constructing would not yield a valid 2-D frame, so pass it through.
    if isinstance(input_features, pd.DataFrame):
        features_df = input_features
    else:
        features_df = pd.DataFrame([input_features])

    processed_features = pipeline.transform(features_df)
    prediction = model.predict(processed_features)
    return prediction[0]