Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- app.py +32 -50
- prediction.py +17 -48
app.py
CHANGED
@@ -4,10 +4,15 @@ import streamlit as st
|
|
4 |
import pandas as pd
|
5 |
import numpy as np
|
6 |
import joblib
|
7 |
-
from
|
8 |
-
|
|
|
|
|
|
|
9 |
category_composition_for_profit_and_sales)
|
10 |
-
|
|
|
|
|
11 |
|
12 |
# Load the dataset for EDA
|
13 |
@st.cache
|
@@ -16,10 +21,6 @@ def load_data():
|
|
16 |
|
17 |
df = load_data()
|
18 |
|
19 |
-
# Load the pipeline and model for predictions
|
20 |
-
pipeline = joblib.load('full_pipeline_with_unit_price.pkl')
|
21 |
-
model = joblib.load('best_model.pkl')
|
22 |
-
|
23 |
# Sidebar for navigation
|
24 |
st.sidebar.title("Navigation")
|
25 |
selection = st.sidebar.radio("Go to", ["Home", "EDA", "Make a Prediction"])
|
@@ -29,66 +30,47 @@ if selection == "Home":
|
|
29 |
|
30 |
elif selection == "EDA":
|
31 |
st.title("Exploratory Data Analysis (EDA)")
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
st.header("Average Sales and Profit Over Time")
|
40 |
-
fig2 = average_sales_and_profit_over_time(df)
|
41 |
-
st.pyplot(fig2)
|
42 |
-
|
43 |
-
# Segment vs. Region Distribution
|
44 |
-
st.header("Segment vs. Region Distribution")
|
45 |
-
fig3 = segment_vs_region_distribution(df)
|
46 |
-
st.pyplot(fig3)
|
47 |
-
|
48 |
-
# Sales vs. Profit Across Different Customer Segments
|
49 |
-
st.header("Sales vs. Profit Across Different Customer Segments")
|
50 |
-
fig4 = sales_vs_profit_across_segments(df)
|
51 |
-
st.pyplot(fig4)
|
52 |
-
|
53 |
-
# Category Composition for Profit and Sales
|
54 |
-
st.header("Category Composition for Profit and Sales")
|
55 |
-
fig5 = category_composition_for_profit_and_sales(df)
|
56 |
-
st.pyplot(fig5)
|
57 |
-
|
58 |
elif selection == "Make a Prediction":
|
59 |
st.title("Make a Sales Prediction")
|
60 |
-
# Input form
|
61 |
with st.form("input_form"):
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
ship_date = st.date_input('Ship Date')
|
66 |
ship_mode = st.selectbox('Ship Mode', ['First Class', 'Second Class', 'Standard Class', 'Same Day'])
|
67 |
-
customer_id = st.text_input('Customer ID')
|
68 |
-
customer_name = st.text_input('Customer Name')
|
69 |
segment = st.selectbox('Segment', ['Consumer', 'Corporate', 'Home Office'])
|
70 |
country = st.text_input('Country', value='United States')
|
71 |
city = st.text_input('City')
|
72 |
state = st.text_input('State')
|
73 |
postal_code = st.text_input('Postal Code')
|
74 |
region = st.selectbox('Region', ['South', 'West', 'Central', 'East'])
|
75 |
-
product_id = st.text_input('Product ID')
|
76 |
category = st.selectbox('Category', ['Furniture', 'Office Supplies', 'Technology'])
|
77 |
sub_category = st.selectbox('Sub-Category', ['Bookcases', 'Chairs', 'Labels', 'Tables', 'Storage', 'Furnishings', 'Art', 'Phones', 'Binders', 'Appliances', 'Paper', 'Accessories', 'Envelopes', 'Fasteners', 'Supplies', 'Machines', 'Copiers'])
|
78 |
product_name = st.text_input('Product Name')
|
79 |
sales = st.number_input('Sales', value=0.0, format="%.2f")
|
80 |
quantity = st.number_input('Quantity', value=1, format="%d")
|
81 |
discount = st.number_input('Discount', value=0.0, format="%.2f")
|
82 |
-
profit = st.number_input('Profit', value=0.0, format="%.2f")
|
83 |
|
84 |
submit_button = st.form_submit_button("Predict")
|
85 |
|
86 |
if submit_button:
|
87 |
-
# Construct the input DataFrame
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
import pandas as pd
|
5 |
import numpy as np
|
6 |
import joblib
|
7 |
+
from datetime import datetime
|
8 |
+
|
9 |
+
# Assuming the necessary EDA functions are defined in eda.py and imported here
|
10 |
+
from eda import (average_sales_by_region, average_sales_and_profit_over_time,
|
11 |
+
segment_vs_region_distribution, sales_vs_profit_across_segments,
|
12 |
category_composition_for_profit_and_sales)
|
13 |
+
|
14 |
+
# Load the model for predictions
|
15 |
+
model = joblib.load('best_model.pkl')
|
16 |
|
17 |
# Load the dataset for EDA
|
18 |
@st.cache
|
|
|
21 |
|
22 |
df = load_data()
|
23 |
|
|
|
|
|
|
|
|
|
24 |
# Sidebar for navigation
|
25 |
st.sidebar.title("Navigation")
|
26 |
selection = st.sidebar.radio("Go to", ["Home", "EDA", "Make a Prediction"])
|
|
|
30 |
|
31 |
elif selection == "EDA":
|
32 |
st.title("Exploratory Data Analysis (EDA)")
|
33 |
+
# Display EDA plots directly here or call a function that does
|
34 |
+
average_sales_by_region(df)
|
35 |
+
average_sales_and_profit_over_time(df)
|
36 |
+
segment_vs_region_distribution(df)
|
37 |
+
sales_vs_profit_across_segments(df)
|
38 |
+
category_composition_for_profit_and_sales(df)
|
39 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
elif selection == "Make a Prediction":
|
41 |
st.title("Make a Sales Prediction")
|
|
|
42 |
with st.form("input_form"):
    # Collect the attributes of a single order. The widget labels mirror the
    # column names used when the input frame is assembled after submission.
    order_date = st.date_input('Order Date', datetime.now())
    ship_date = st.date_input('Ship Date', datetime.now())
    ship_mode = st.selectbox('Ship Mode', ['First Class', 'Second Class', 'Standard Class', 'Same Day'])
    segment = st.selectbox('Segment', ['Consumer', 'Corporate', 'Home Office'])
    country = st.text_input('Country', value='United States')
    city = st.text_input('City')
    state = st.text_input('State')
    postal_code = st.text_input('Postal Code')
    region = st.selectbox('Region', ['South', 'West', 'Central', 'East'])
    category = st.selectbox('Category', ['Furniture', 'Office Supplies', 'Technology'])
    sub_category = st.selectbox('Sub-Category', ['Bookcases', 'Chairs', 'Labels', 'Tables', 'Storage', 'Furnishings', 'Art', 'Phones', 'Binders', 'Appliances', 'Paper', 'Accessories', 'Envelopes', 'Fasteners', 'Supplies', 'Machines', 'Copiers'])
    product_name = st.text_input('Product Name')
    sales = st.number_input('Sales', value=0.0, format="%.2f")
    # Quantity is bounded below by 1: prediction.py derives a unit price as
    # sales / quantity, so a zero quantity would produce a non-finite feature.
    # NOTE(review): confirm the deployed model actually consumes that feature.
    quantity = st.number_input('Quantity', min_value=1, value=1, format="%d")
    discount = st.number_input('Discount', value=0.0, format="%.2f")

    # Widgets inside a form are only submitted when this button is pressed.
    submit_button = st.form_submit_button("Predict")
|
61 |
|
62 |
if submit_button:
    # Construct a single-row input DataFrame; the column labels follow the
    # header names of the original dataset.
    input_features = pd.DataFrame([[
        order_date, ship_date, ship_mode, segment, country, city, state,
        postal_code, region, category, sub_category, product_name, sales, quantity, discount
    ]], columns=[
        'Order Date', 'Ship Date', 'Ship Mode', 'Segment', 'Country', 'City', 'State',
        'Postal Code', 'Region', 'Category', 'Sub-Category', 'Product Name', 'Sales', 'Quantity', 'Discount'
    ])

    # NOTE(review): the raw frame is passed to the model unchanged. If the
    # model was trained on preprocessed features, transform 'input_features'
    # into that format before calling predict().
    prediction = model.predict(input_features)

    # predict() yields one value per input row; a whole array cannot be
    # rendered with the ':.2f' format spec, so take the single row's value.
    predicted_profit = float(prediction[0])

    st.write(f'Predicted Profit: {predicted_profit:.2f}')
|
prediction.py
CHANGED
@@ -1,87 +1,56 @@
|
|
1 |
-
# prediction.py
|
2 |
-
|
3 |
import joblib
|
4 |
import pandas as pd
|
5 |
from sklearn.base import BaseEstimator, TransformerMixin
|
6 |
from sklearn.preprocessing import OneHotEncoder
|
7 |
from sklearn.cluster import KMeans
|
8 |
-
import pandas as pd
|
9 |
-
import joblib
|
10 |
|
|
|
11 |
class UnitPriceTransformer(BaseEstimator, TransformerMixin):
    """Add a ``unit_price`` column computed as ``sales / quantity``."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with a ``unit_price`` column added.

        Parameters:
        - X: DataFrame containing ``sales`` and ``quantity`` columns.

        Returns:
        - A new DataFrame with every column of ``X`` plus ``unit_price``.
        """
        # Work on a copy so the caller's frame is not modified in place.
        X = X.copy()
        X['unit_price'] = X['sales'] / X['quantity']
        return X
|
18 |
|
|
|
19 |
class KMeansAndLabelTransformer(BaseEstimator, TransformerMixin):
    """Cluster rows by ``unit_price`` and tag each row with cluster + sub-category."""

    def __init__(self, n_clusters=3):
        self.n_clusters = n_clusters
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42)

    def fit(self, X, y=None):
        """Fit the KMeans model on the single ``unit_price`` feature."""
        # Double brackets keep the selection two-dimensional (one feature column).
        self.kmeans.fit(X[['unit_price']])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with a ``distinct_cluster_label`` column.

        The label is ``"<cluster>_<sub_category>"``, e.g. ``"2_Chairs"``.
        """
        cluster_labels = self.kmeans.predict(X[['unit_price']])

        X = X.copy()  # Leave the caller's frame untouched
        # Build the label column-wise instead of row by row; aligning the
        # cluster ids on X's index keeps the pairing correct for any index.
        cluster_text = pd.Series(cluster_labels, index=X.index).astype(str)
        X['distinct_cluster_label'] = cluster_text + "_" + X['sub_category'].astype(str)
        return X
|
44 |
-
|
45 |
-
|
46 |
|
|
|
47 |
class DynamicOneHotEncoder(BaseEstimator, TransformerMixin):
|
48 |
-
def
|
49 |
self.encoder = OneHotEncoder(handle_unknown='ignore')
|
|
|
|
|
50 |
self.encoder.fit(X[['distinct_cluster_label']])
|
51 |
return self
|
52 |
|
53 |
def transform(self, X):
|
|
|
54 |
encoded_features = self.encoder.transform(X[['distinct_cluster_label']]).toarray()
|
|
|
55 |
encoded_df = pd.DataFrame(encoded_features, columns=self.encoder.get_feature_names_out(['distinct_cluster_label']))
|
56 |
-
|
57 |
-
result = pd.concat([X, encoded_df], axis=1)
|
58 |
-
result.drop(['distinct_cluster_label', 'sub_category', 'unit_price'], axis=1, inplace=True) # Drop original columns if not needed
|
59 |
-
return result
|
60 |
|
61 |
-
# Load the
|
62 |
pipeline = joblib.load('full_pipeline_with_unit_price.pkl')
|
63 |
-
|
64 |
-
# Load the model
|
65 |
model = joblib.load('best_model.pkl')
|
66 |
|
67 |
def make_prediction(input_features):
|
68 |
-
|
69 |
-
Takes a dictionary of features, transforms it using the pipeline,
|
70 |
-
and makes a prediction with the model.
|
71 |
-
|
72 |
-
Parameters:
|
73 |
-
- input_features: dict, where keys are feature names and values are the corresponding values
|
74 |
-
|
75 |
-
Returns:
|
76 |
-
- The predicted value as a float.
|
77 |
-
"""
|
78 |
-
# Convert the input features dictionary into a DataFrame
|
79 |
-
features_df = pd.DataFrame([input_features])
|
80 |
-
|
81 |
-
# Process features through the pipeline
|
82 |
-
processed_features = pipeline.transform(features_df)
|
83 |
-
|
84 |
-
# Make a prediction with the processed features using the model
|
85 |
prediction = model.predict(processed_features)
|
86 |
-
|
87 |
-
return prediction[0] # Assuming we want a single prediction value
|
|
|
|
|
|
|
1 |
import joblib
|
2 |
import pandas as pd
|
3 |
from sklearn.base import BaseEstimator, TransformerMixin
|
4 |
from sklearn.preprocessing import OneHotEncoder
|
5 |
from sklearn.cluster import KMeans
|
|
|
|
|
6 |
|
7 |
+
# Custom Transformer: UnitPriceTransformer
|
8 |
class UnitPriceTransformer(BaseEstimator, TransformerMixin):
    """Derive a one-column ``unit_price`` frame from ``sales`` and ``quantity``."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a DataFrame holding only ``unit_price`` (``sales / quantity``).

        Parameters:
        - X: DataFrame containing ``sales`` and ``quantity`` columns.

        Returns:
        - A single-column DataFrame named ``unit_price`` that keeps ``X``'s index.
          ``X`` itself is not modified.
        """
        # Only one derived column is returned, so build it directly rather
        # than copying the whole input frame first.
        unit_price = X['sales'] / X['quantity']
        return unit_price.to_frame('unit_price')
|
16 |
|
17 |
+
# Custom Transformer: KMeansAndLabelTransformer
|
18 |
class KMeansAndLabelTransformer(BaseEstimator, TransformerMixin):
    """Cluster rows by ``unit_price`` and emit a cluster + sub-category label."""

    def __init__(self, n_clusters=3):
        self.n_clusters = n_clusters
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42)

    def fit(self, X, y=None):
        """Fit the KMeans model on the single ``unit_price`` feature."""
        self.kmeans.fit(X[['unit_price']])
        return self

    def transform(self, X):
        """Return a one-column DataFrame named ``distinct_cluster_label``.

        Parameters:
        - X: DataFrame containing ``unit_price`` and ``sub_category`` columns.

        Returns:
        - DataFrame whose only column holds ``"<cluster>_<sub_category>"``
          strings, indexed like ``X``. ``X`` itself is not modified.
        """
        cluster_labels = self.kmeans.predict(X[['unit_price']])
        # Wrap the cluster ids in a Series aligned on X's index so the string
        # concatenation is done by pandas and does not depend on NumPy
        # supporting '+' between a string array and a Python str.
        cluster_text = pd.Series(cluster_labels, index=X.index).astype(str)
        combined = cluster_text + "_" + X['sub_category'].astype(str)
        return combined.to_frame('distinct_cluster_label')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
+
# Custom Transformer: DynamicOneHotEncoder
|
34 |
class DynamicOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the ``distinct_cluster_label`` column."""

    def __init__(self):
        self.encoder = OneHotEncoder(handle_unknown='ignore')

    def fit(self, X, y=None):
        """Learn the label categories from ``X['distinct_cluster_label']``."""
        self.encoder.fit(X[['distinct_cluster_label']])
        return self

    def transform(self, X):
        """Return the one-hot encoded labels as a dense DataFrame.

        Parameters:
        - X: DataFrame containing a ``distinct_cluster_label`` column.

        Returns:
        - DataFrame with one column per learned category, named by the
          encoder's ``get_feature_names_out`` and indexed like ``X``.
        """
        encoded_features = self.encoder.transform(X[['distinct_cluster_label']]).toarray()
        # Carry X's index over so the encoded rows stay aligned with their
        # source rows if the result is later joined with other frames.
        return pd.DataFrame(
            encoded_features,
            columns=self.encoder.get_feature_names_out(['distinct_cluster_label']),
            index=X.index,
        )
|
|
|
|
|
|
|
48 |
|
49 |
+
# Load the pipeline and model
|
50 |
# NOTE(review): joblib.load unpickles arbitrary Python objects. These artifact
# files must come from a trusted source; never load user-supplied files here.
pipeline = joblib.load('full_pipeline_with_unit_price.pkl')
model = joblib.load('best_model.pkl')


def make_prediction(input_features):
    """Transform one record with the pipeline and predict with the model.

    Parameters:
    - input_features: dict mapping feature names to their values for a
      single record.

    Returns:
    - The first (and only) element of the model's prediction output.
    """
    # Wrap the dict in a list so it becomes a single-row DataFrame.
    features_df = pd.DataFrame([input_features])
    processed_features = pipeline.transform(features_df)
    prediction = model.predict(processed_features)
    return prediction[0]
|
|