Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- eda.py +75 -0
- prediction.py +31 -0
eda.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import seaborn as sns
|
3 |
+
import matplotlib.pyplot as plt
|
4 |
+
import numpy as np
|
5 |
+
import plotly.express as px
|
6 |
+
from matplotlib.gridspec import GridSpec
|
7 |
+
|
8 |
+
def average_sales_by_region(df):
    """
    Build a bar chart of the mean sales per region, highest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'region' and 'sales' columns.

    Returns
    -------
    matplotlib.figure.Figure
        Figure with one annotated bar per region.
    """
    region_means = (
        df[['region', 'sales']]
        .groupby('region')
        .mean()
        .sort_values(by='sales', ascending=False)
    )
    fig, ax = plt.subplots(figsize=[10, 6])
    sns.barplot(x=region_means.index, y='sales', data=region_means, palette='viridis', ax=ax)
    ax.set_title('Average Sales Across Different Regions')
    ax.set_xlabel('Region')
    ax.set_ylabel('Average Sales')
    # Label each bar with its exact mean value.
    for pos, avg in enumerate(region_means['sales']):
        ax.text(pos, avg, f"{avg:.2f}", ha='center', va='bottom')
    return fig
|
22 |
+
|
23 |
+
def average_sales_and_profit_over_time(df):
    """
    Build a line chart of monthly average sales and profit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'order_date' (parseable as dates), 'sales', and
        'profit' columns.

    Returns
    -------
    matplotlib.figure.Figure
        Figure with one line per metric, indexed by month.
    """
    df_line = df[['order_date', 'sales', 'profit']].copy()
    # Parse dates BEFORE any ordering: sorting the raw strings is
    # lexicographic and wrong for non-ISO formats like '12/1/2017'.
    df_line['order_date'] = pd.to_datetime(df_line['order_date'])
    df_line = df_line.sort_values('order_date')
    # Aggregate only the numeric columns; under pandas >= 2.0 a bare
    # GroupBy.mean() would also try to average the datetime column.
    monthly = (
        df_line
        .groupby(df_line['order_date'].dt.to_period("M"))[['sales', 'profit']]
        .mean()
    )
    monthly.index = monthly.index.to_timestamp()
    fig, ax = plt.subplots(figsize=[10, 6])
    # Pass the Series explicitly: a bare 'sales' string via the data=
    # keyword is ambiguous with a matplotlib format string.
    ax.plot(monthly.index, monthly['sales'], color='green', label='Avg Sales')
    ax.plot(monthly.index, monthly['profit'], color='red', label='Avg Profit')
    ax.legend()
    ax.set_title('Average Sales and Profit Over Time (Monthly)')
    ax.set_xlabel('Time')
    ax.set_ylabel('Value')
    return fig
|
39 |
+
|
40 |
+
def segment_vs_region_distribution(df):
    """
    Build a count plot showing how customer segments are distributed
    across regions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'segment' and 'region' columns.

    Returns
    -------
    matplotlib.figure.Figure
    """
    fig = plt.figure(figsize=(10, 6))
    # countplot draws onto the figure's current axes; keep the handle
    # so labels go through the explicit Axes API.
    ax = sns.countplot(x='segment', data=df, hue='region', palette='viridis')
    ax.set_title('Segment vs. Region Distribution')
    ax.set_xlabel('Segment')
    ax.set_ylabel('Count')
    ax.legend(title='Region')
    return fig
|
51 |
+
|
52 |
+
def sales_vs_profit_across_segments(df):
    """
    Build a scatter plot comparing sales and profit per order, colored
    by customer segment and sized by sales value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'sales', 'profit', and 'segment' columns.

    Returns
    -------
    matplotlib.figure.Figure
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        data=df,
        x='sales',
        y='profit',
        hue='segment',
        palette='viridis',
        size='sales',
        sizes=(20, 200),
        ax=ax,
    )
    ax.set_title('Sales vs. Profit Across Different Customer Segments')
    ax.set_xlabel('Sales')
    ax.set_ylabel('Profit')
    return fig
|
62 |
+
|
63 |
+
def category_composition_for_profit_and_sales(df):
    """
    Build side-by-side pie charts of total sales and total profit by
    product category.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'category', 'sales', and 'profit' columns.

    Returns
    -------
    matplotlib.figure.Figure
        Figure with two pie charts (sales left, profit right).
    """
    totals = df.groupby('category').agg({'sales': 'sum', 'profit': 'sum'}).reset_index()
    # One color per category slice, shared by both charts.
    wedge_colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']
    fig, axs = plt.subplots(1, 2, figsize=(14, 7))
    axs[0].pie(totals['sales'], labels=totals['category'], autopct='%1.1f%%',
               startangle=140, colors=wedge_colors)
    axs[0].set_title('Sales Composition by Category')
    # A category's total profit can be negative, and Axes.pie raises
    # ValueError on negative wedge sizes; clip at zero so the chart
    # still renders (a losing category shows as a 0% slice).
    profit_sizes = totals['profit'].clip(lower=0)
    axs[1].pie(profit_sizes, labels=totals['category'], autopct='%1.1f%%',
               startangle=140, colors=wedge_colors)
    axs[1].set_title('Profit Composition by Category')
    return fig
|
74 |
+
|
75 |
+
# Additional EDA functions can be added following the same pattern
|
prediction.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# prediction.py
|
2 |
+
|
3 |
+
import joblib
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
# Load the pipeline and model once at import time so every call to
# make_prediction() reuses the same fitted objects.
# NOTE(review): both .pkl files must be present in the working directory
# when this module is imported, otherwise the import itself raises
# FileNotFoundError — confirm deployment ships them alongside the code.
# NOTE(review): joblib.load unpickles arbitrary code; only load
# artifacts produced by this project, never untrusted files.
pipeline = joblib.load('full_pipeline_with_unit_price.pkl')
model = joblib.load('best_model.pkl')
|
9 |
+
|
10 |
+
def make_prediction(features):
    """
    Transform a raw feature vector with the fitted pipeline and return
    the model's prediction.

    Parameters:
    - features: array-like, shape [n_features]
      Values ordered as: sales, quantity, discount, sub_category.

    Returns:
    - The predicted value as a float.
    """
    # The pipeline was fitted on a DataFrame, so wrap the vector in a
    # one-row frame with the training column names.
    feature_columns = ['sales', 'quantity', 'discount', 'sub_category']
    frame = pd.DataFrame([features], columns=feature_columns)

    transformed = pipeline.transform(frame)

    # predict() returns an array; unwrap the single value.
    return model.predict(transformed)[0]
|