7sugiwa committed on
Commit
51abf05
1 Parent(s): 595ddfc

Upload 2 files

Browse files
Files changed (2) hide show
  1. eda.py +75 -0
  2. prediction.py +31 -0
eda.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import seaborn as sns
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import plotly.express as px
6
+ from matplotlib.gridspec import GridSpec
7
+
8
def average_sales_by_region(df):
    """
    Build a bar chart of mean sales per region, ordered from highest to lowest.

    Parameters:
    - df: DataFrame providing 'region' and 'sales' columns.

    Returns:
    - The matplotlib Figure holding the chart.
    """
    region_means = (
        df[['region', 'sales']]
        .groupby('region')
        .mean()
        .sort_values(by='sales', ascending=False)
    )

    fig, ax = plt.subplots(figsize=[10, 6])
    sns.barplot(
        x=region_means.index,
        y='sales',
        data=region_means,
        palette='viridis',
        ax=ax,
    )

    # Labels are applied after plotting so they replace the ones seaborn sets.
    ax.set(
        title='Average Sales Across Different Regions',
        xlabel='Region',
        ylabel='Average Sales',
    )

    # Write each mean just above its bar; bars sit at x = 0, 1, 2, ...
    for position, mean_sales in enumerate(region_means['sales']):
        ax.text(position, mean_sales, f"{mean_sales:.2f}", ha='center', va='bottom')

    return fig
22
+
23
def average_sales_and_profit_over_time(df):
    """
    Generate a line plot for average sales and profit over time.

    Sales and profit are averaged per calendar month of 'order_date'; each
    monthly mean is drawn at the timestamp pandas assigns to that month.

    Parameters:
    - df: DataFrame with 'order_date', 'sales' and 'profit' columns.
      'order_date' must be convertible with pandas.to_datetime.
      The input frame is not modified.

    Returns:
    - The matplotlib Figure containing both lines.
    """
    # Parse the dates first and derive the monthly bucket from the parsed
    # values. No explicit sort is needed: groupby returns the months in order.
    order_month = pd.to_datetime(df['order_date']).dt.to_period("M")

    # Aggregate only the two value columns so the date column itself is never
    # averaged alongside them.
    monthly_means = df[['sales', 'profit']].groupby(order_month).mean()
    monthly_means.index = monthly_means.index.to_timestamp()

    fig, ax = plt.subplots(figsize=[10, 6])

    # Pass the Series directly instead of column names with data=...: a bare
    # string in second position can be read by Matplotlib as a format string.
    ax.plot(monthly_means.index, monthly_means['sales'], color='green', label='Avg Sales')
    ax.plot(monthly_means.index, monthly_means['profit'], color='red', label='Avg Profit')

    ax.legend()
    ax.set_title('Average Sales and Profit Over Time (Monthly)')
    ax.set_xlabel('Time')
    ax.set_ylabel('Value')
    return fig
39
+
40
def segment_vs_region_distribution(df):
    """
    Generate a count plot for segments across different regions.

    Parameters:
    - df: DataFrame with 'segment' and 'region' columns.

    Returns:
    - The matplotlib Figure containing the count plot.
    """
    # Create the Axes explicitly and draw on it, as the other plotting
    # functions in this module do, rather than relying on pyplot's implicit
    # "current figure" state.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x='segment', data=df, hue='region', palette='viridis', ax=ax)
    ax.set_title('Segment vs. Region Distribution')
    ax.set_xlabel('Segment')
    ax.set_ylabel('Count')
    ax.legend(title='Region')
    return fig
51
+
52
def sales_vs_profit_across_segments(df):
    """
    Draw a sales-versus-profit scatter plot, coloured by customer segment.

    Marker size also scales with 'sales', within the range (20, 200).

    Parameters:
    - df: DataFrame with 'sales', 'profit' and 'segment' columns.

    Returns:
    - The matplotlib Figure containing the scatter plot.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    sns.scatterplot(
        data=df,
        x='sales',
        y='profit',
        hue='segment',
        size='sales',
        sizes=(20, 200),
        palette='viridis',
        ax=ax,
    )

    # Applied after plotting so these replace seaborn's default axis labels.
    ax.set(
        title='Sales vs. Profit Across Different Customer Segments',
        xlabel='Sales',
        ylabel='Profit',
    )
    return fig
62
+
63
def category_composition_for_profit_and_sales(df):
    """
    Draw two side-by-side pie charts: each category's share of total sales
    (left) and of total profit (right).

    Parameters:
    - df: DataFrame with 'category', 'sales' and 'profit' columns.

    Returns:
    - The matplotlib Figure containing both pies.

    NOTE(review): pie wedges are expected to be non-negative, so a category
    whose summed profit is negative would presumably make the profit pie
    fail -- confirm against the data.
    """
    totals = df.groupby('category').agg({'sales': 'sum', 'profit': 'sum'}).reset_index()

    wedge_colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']
    panels = (
        ('sales', 'Sales Composition by Category'),
        ('profit', 'Profit Composition by Category'),
    )

    fig, axs = plt.subplots(1, 2, figsize=(14, 7))
    for ax, (column, title) in zip(axs, panels):
        ax.pie(
            totals[column],
            labels=totals['category'],
            autopct='%1.1f%%',
            startangle=140,
            colors=wedge_colors,
        )
        ax.set_title(title)
    return fig
74
+
75
+ # Additional EDA functions can be added following the same pattern
prediction.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # prediction.py
2
+
3
+ import joblib
4
+ import pandas as pd
5
+
6
+ # Load the pipeline and model once, at import time; both paths are relative to the current working directory. NOTE(review): joblib.load unpickles its input, so these files must come from a trusted source.
7
+ pipeline = joblib.load('full_pipeline_with_unit_price.pkl')
8
+ model = joblib.load('best_model.pkl')
9
+
10
def make_prediction(features):
    """
    Run one observation through the module-level pipeline and model.

    Parameters:
    - features: array-like of four values, in the order
      sales, quantity, discount, sub_category.

    Returns:
    - The first element of the model's prediction output.
      NOTE(review): presumably a float for a regression model -- confirm
      against the saved model.
    """
    # Column names must match what the saved pipeline was fitted on.
    expected_columns = ['sales', 'quantity', 'discount', 'sub_category']

    # Wrap the single observation as a one-row frame for the pipeline.
    single_row = pd.DataFrame([features], columns=expected_columns)
    transformed = pipeline.transform(single_row)

    # predict returns one value per row; there is only one row here.
    return model.predict(transformed)[0]