molinari135 committed
Commit: d449be0
Parent(s): 220a214
Delete product_return_prediction/modeling
product_return_prediction/modeling/__init__.py
DELETED
File without changes
product_return_prediction/modeling/eval.py
DELETED
@@ -1,101 +0,0 @@
-import pickle
-import typer
-import json
-
-import seaborn as sns
-import pandas as pd
-import matplotlib.pyplot as plt
-
-from loguru import logger
-from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
-from pathlib import Path
-from codecarbon import EmissionsTracker
-
-from product_return_prediction.dataset import scale_data_with_trained_scaler
-from product_return_prediction.config import (
-    MODELS_DIR,
-    PROCESSED_DATA_DIR,
-    TARGET_COLUMN,
-    REPORTS_DIR
-)
-
-app = typer.Typer()
-
-
-def evaluate_model(test_data: pd.DataFrame, scaler_file: Path, model: any, model_name: str):
-    """
-    Evaluates the performance of a trained model on the provided test data. It includes scaling the features
-    using a pre-trained scaler, making predictions, computing accuracy, generating a classification report,
-    and visualizing the confusion matrix.
-
-    This function scales the test data using a pre-trained scaler, applies the trained model to make predictions,
-    and calculates key performance metrics, including accuracy. It then generates a detailed classification report,
-    saves the report to a JSON file, and plots the confusion matrix to visually assess model performance.
-
-    Args:
-        test_data (pd.DataFrame): The test dataset, which includes both features and the target column.
-        scaler_file (Path): Path to the pre-trained scaler file, used to scale the feature columns.
-        model (any): The trained model object, used to make predictions on the test data.
-        model_name (str): The name of the model, used for saving the evaluation report.
-
-    Example:
-        ```python
-        evaluate_model(test_data, scaler_file='scaler.pkl', model=model, model_name='log_reg')
-        ```
-    """
-
-    X_test = test_data.drop(columns=[TARGET_COLUMN]).copy()
-    y_test = test_data[TARGET_COLUMN].copy()
-
-    X_test = scale_data_with_trained_scaler(X_test, scaler_file)
-
-    cc_file = f"{model_name}_emissions.csv"
-    tracker = EmissionsTracker(project_name="eval", output_dir=REPORTS_DIR, output_file=cc_file)
-    tracker.start()
-
-    y_pred = model.predict(X_test)
-
-    tracker.stop()
-
-    accuracy = accuracy_score(y_test, y_pred)
-    logger.info(f"Accuracy: {accuracy * 100:.2f}%")
-
-    report = classification_report(y_test, y_pred)
-    logger.info(f"Classification Report:\n{report}")
-
-    report = classification_report(y_test, y_pred, output_dict=True)
-    with open(REPORTS_DIR / f"{model_name}.json", "w") as json_file:
-        json.dump(report, json_file, indent=4)
-
-    cm = confusion_matrix(y_test, y_pred)
-    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=model.classes_, yticklabels=model.classes_)
-    plt.title("Confusion Matrix")
-    plt.xlabel("Predicted Labels")
-    plt.ylabel("True Labels")
-
-    # Saving the confusion matrix in the reports/figures directory
-    plt.savefig(REPORTS_DIR / f"figures/cm_{model_name}.png", dpi=300, bbox_inches='tight')
-    plt.close()
-
-
-@app.command()
-def main(
-    test_file: Path = PROCESSED_DATA_DIR / "test.tsv",
-    scaler_file: Path = MODELS_DIR / "scaler.pkl",
-    log_reg_model_path: Path = MODELS_DIR / "log_reg.pkl",
-    svm_model_path: Path = MODELS_DIR / "svm.pkl",
-):
-    test_data = pd.read_csv(test_file, sep='\t')
-
-    with open(log_reg_model_path, "rb") as f:
-        log_reg = pickle.load(f)
-
-    with open(svm_model_path, "rb") as f:
-        svm = pickle.load(f)
-
-    evaluate_model(test_data, scaler_file, log_reg, "log_reg_eval")
-    evaluate_model(test_data, scaler_file, svm, "svm_eval")
-
-
-if __name__ == "__main__":
-    app()
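For reference, the deleted evaluation script exposed a Typer CLI with default artifact paths. The snippet below is a hypothetical usage sketch, not part of this commit: it assumes the module is still importable (i.e., a checkout prior to this deletion) and that the default test set, scaler, and model pickles exist on disk.

```python
# Hypothetical sketch: exercising the deleted evaluation CLI via Typer's test runner.
from typer.testing import CliRunner

from product_return_prediction.modeling.eval import app  # removed in this commit

runner = CliRunner()
# Roughly equivalent to: python -m product_return_prediction.modeling.eval
# (uses the default test.tsv / scaler.pkl / log_reg.pkl / svm.pkl paths from main()).
result = runner.invoke(app, [])
print(result.exit_code, result.output)
```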
product_return_prediction/modeling/predict.py
DELETED
@@ -1,60 +0,0 @@
-from pathlib import Path
-
-import typer
-import pickle
-import json
-import pandas as pd
-from loguru import logger
-from codecarbon import EmissionsTracker
-
-from product_return_prediction.config import MODELS_DIR, INTERIM_DATA_DIR, EXTERNAL_DATA_DIR, REPORTS_DIR, RAW_DATA_DIR
-from product_return_prediction.dataset import prepare_inventory, scale_data_with_trained_scaler
-
-app = typer.Typer()
-
-
-@app.command()
-def main(
-    sales_path: Path = RAW_DATA_DIR / "sales.xlsx",
-    inventory_path: Path = EXTERNAL_DATA_DIR / "inventory.csv",
-    json_percentage: Path = INTERIM_DATA_DIR / "colour_return_percentage.json",
-    scaler_file: Path = MODELS_DIR / "scaler.pkl",
-    model_path: Path = MODELS_DIR / "svm.pkl",
-):
-    sales = pd.read_excel(sales_path)
-    inventory = pd.read_csv(inventory_path)
-
-    with open(json_percentage, 'r') as f:
-        percentages = json.load(f)
-
-    # ---- Prepare inventory data for inference ----
-    inventory = prepare_inventory(sales, inventory, percentages)
-
-    with open(model_path, "rb") as f:
-        model = pickle.load(f)
-
-    # ---- Scale 5 random rows from the inventory ----
-    random_row = inventory.sample(n=5)
-    logger.info(f"Your product:\n {random_row}")
-    random_row = scale_data_with_trained_scaler(random_row, scaler_file)
-
-    # ---- Compute predictions and probabilities ----
-    cc_file = "svm_predict_emissions.csv"
-    tracker = EmissionsTracker(project_name="eval", output_dir=REPORTS_DIR, output_file=cc_file)
-    tracker.start()
-
-    predictions = model.predict(random_row)
-    probabilities = model.predict_proba(random_row)
-
-    tracker.stop()
-
-    for pred, prob in zip(predictions, probabilities):
-        prob_confidence = prob.max()
-        if pred == 1:
-            logger.info(f"The product will be returned with {prob_confidence:.2f} confidence")
-        else:
-            logger.info(f"The product will NOT be returned with {prob_confidence:.2f} confidence")
-
-
-if __name__ == "__main__":
-    app()
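The deleted prediction script was likewise a Typer command with path options derived from the parameter names. A hypothetical invocation sketch follows; the overridden model path is only illustrative and assumes the artifact location, it is not specified by this commit.

```python
# Hypothetical sketch: invoking the deleted prediction CLI with an explicit model path.
from typer.testing import CliRunner

from product_return_prediction.modeling.predict import app  # removed in this commit

runner = CliRunner()
# Typer derives the option name --model-path from the model_path parameter;
# "models/svm.pkl" is an assumed location for the pickled SVM.
result = runner.invoke(app, ["--model-path", "models/svm.pkl"])
print(result.output)
```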
product_return_prediction/modeling/train.py
DELETED
@@ -1,143 +0,0 @@
-import pickle
-from pathlib import Path
-
-import dagshub
-import mlflow
-import pandas as pd
-import typer
-from loguru import logger
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import GridSearchCV
-from sklearn.svm import SVC
-from codecarbon import EmissionsTracker
-
-from product_return_prediction.dataset import scale_data_with_trained_scaler
-from product_return_prediction.config import (
-    MODELS_DIR,
-    PROCESSED_DATA_DIR,
-    TARGET_COLUMN,
-    REPORTS_DIR
-)
-
-dagshub.init(repo_owner='se4ai2425-uniba', repo_name='product-return-prediction', mlflow=True)
-
-app = typer.Typer()
-
-
-# TODO The training dataset must have the following columns:
-# Product Type, Product Subtype, Product Gender, Net Sales (FA), Net Sales Units (FA)
-# TARGET_COLUMN, Product Order Count, Total Order Value, Main Material, Colour Return Percentage
-# Total Customer Purchases, Total Customer Returns, Customer Return Percentage
-# TODO The scaler and model paths must be Pickle (.pkl) files
-def train_log_reg(train_data: pd.DataFrame, scaler_file: Path, model_path: Path):
-    """
-    Trains a Logistic Regression model using the provided training data, applies feature scaling,
-    and saves the trained model to a specified file.
-
-    This function trains a Logistic Regression model using the training data. The feature columns are
-    scaled using a pre-trained scaler before fitting the model. The model is then saved to the specified
-    file path, and the training process is tracked using MLflow.
-
-    Args:
-        train_data (pd.DataFrame): The training data, including features and target column.
-        scaler_file (Path): Path to the pre-trained scaler file, used to scale the feature columns.
-        model_path (Path): Path where the trained Logistic Regression model will be saved.
-    """
-
-    run_name = model_path.stem
-    mlflow.start_run(run_name=run_name)
-    mlflow.sklearn.autolog()
-
-    # Apply scaling to the feature columns (excluding the target column)
-    X_train = train_data.drop(columns=[TARGET_COLUMN]).copy()
-    y_train = train_data[TARGET_COLUMN].copy()
-
-    # Scale X_train using the pre-trained scaler
-    X_train = scale_data_with_trained_scaler(X_train, scaler_file)
-
-    # Initialize the Logistic Regression model
-    model = LogisticRegression(max_iter=1000, class_weight="balanced")
-    logger.info(f"Model: {model}")
-
-    cc_file = "log_reg_train_emissions.csv"
-    tracker = EmissionsTracker(project_name="train", output_dir=REPORTS_DIR, output_file=cc_file)
-    tracker.start()
-
-    # Fit the model to the training data
-    model.fit(X_train, y_train)
-
-    tracker.stop()
-    mlflow.end_run()
-
-    # Save the trained model to disk
-    with open(model_path, "wb") as f:
-        pickle.dump(model, f)
-    logger.success(f"Model saved to {model_path}")
-
-
-# TODO The training dataset must have the following columns:
-# Product Type, Product Subtype, Product Gender, Net Sales (FA), Net Sales Units (FA)
-# TARGET_COLUMN, Product Order Count, Total Order Value, Main Material, Colour Return Percentage
-# Total Customer Purchases, Total Customer Returns, Customer Return Percentage
-# TODO The scaler and model paths must be Pickle (.pkl) files
-def train_svm(train_data: pd.DataFrame, scaler_file: Path, model_path: Path):
-    """
-    Trains a Support Vector Machine (SVM) classifier using the provided training data, applies feature scaling,
-    performs hyperparameter tuning via grid search, and saves the trained model to a specified file.
-
-    This function trains an SVM model with hyperparameter optimization using grid search. The feature columns
-    are scaled using a pre-trained scaler before fitting the model. The trained model is saved to the specified
-    file path, and the training process is tracked using MLflow.
-
-    Args:
-        train_data (pd.DataFrame): The training data, including features and target column.
-        scaler_file (Path): Path to the pre-trained scaler file, used to scale the feature columns.
-        model_path (Path): Path where the trained SVM model will be saved.
-    """
-
-    run_name = model_path.stem
-    mlflow.start_run(run_name=run_name)
-    mlflow.sklearn.autolog()
-
-    X_train = train_data.drop(columns=[TARGET_COLUMN]).copy()
-    y_train = train_data[TARGET_COLUMN].copy()
-
-    X_train = scale_data_with_trained_scaler(X_train, scaler_file)
-
-    param_grid = {"C": [0.1, 1, 10], "kernel": ["rbf"], "gamma": ["scale", "auto"]}
-
-    logger.info("Starting Grid Search for best hyperparameters")
-    grid_search = GridSearchCV(SVC(probability=True), param_grid, scoring="balanced_accuracy", cv=10)
-    grid_search.fit(X_train, y_train)
-    model = grid_search.best_estimator_
-
-    cc_file = "svm_train_emissions.csv"
-    tracker = EmissionsTracker(project_name="train", output_dir=REPORTS_DIR, output_file=cc_file)
-    tracker.start()
-
-    model.fit(X_train, y_train)
-
-    tracker.stop()
-    mlflow.end_run()
-
-    with open(model_path, "wb") as f:
-        pickle.dump(model, f)
-    logger.success(f"Model saved to {model_path}")
-
-
-@app.command()
-def main(
-    train_file: Path = PROCESSED_DATA_DIR / "train.tsv",
-    scaler_file: Path = MODELS_DIR / "scaler.pkl",
-    log_reg_model_path: Path = MODELS_DIR / "log_reg.pkl",
-    svm_model_path: Path = MODELS_DIR / "svm.pkl",
-):
-    train_data = pd.read_csv(train_file, sep='\t')
-
-    # ---- Train models ----
-    train_log_reg(train_data, scaler_file, log_reg_model_path)
-    train_svm(train_data, scaler_file, svm_model_path)
-
-
-if __name__ == "__main__":
-    app()
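The deleted training script also ran as a Typer command, but note that it calls dagshub.init() at import time, so running it requires valid DagsHub/MLflow credentials for the se4ai2425-uniba/product-return-prediction repository. A hypothetical usage sketch, assuming a checkout prior to this deletion and the default processed data and scaler on disk:

```python
# Hypothetical sketch: running the deleted training CLI end to end.
# Importing the module triggers dagshub.init(), so DagsHub authentication must be configured.
from typer.testing import CliRunner

from product_return_prediction.modeling.train import app  # removed in this commit

runner = CliRunner()
# Uses the default train.tsv / scaler.pkl / log_reg.pkl / svm.pkl paths defined in main().
result = runner.invoke(app, [])
print(result.exit_code)
```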