molinari135 committed
Commit: d449be0
Parent(s): 220a214
Delete product_return_prediction/modeling
product_return_prediction/modeling/__init__.py
DELETED
File without changes
product_return_prediction/modeling/eval.py
DELETED
@@ -1,101 +0,0 @@
-import pickle
-import typer
-import json
-
-import seaborn as sns
-import pandas as pd
-import matplotlib.pyplot as plt
-
-from loguru import logger
-from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
-from pathlib import Path
-from codecarbon import EmissionsTracker
-
-from product_return_prediction.dataset import scale_data_with_trained_scaler
-from product_return_prediction.config import (
-    MODELS_DIR,
-    PROCESSED_DATA_DIR,
-    TARGET_COLUMN,
-    REPORTS_DIR
-)
-
-app = typer.Typer()
-
-
-def evaluate_model(test_data: pd.DataFrame, scaler_file: Path, model: any, model_name: str):
-    """
-    Evaluates the performance of a trained model on the provided test data. It includes scaling the features
-    using a pre-trained scaler, making predictions, computing accuracy, generating a classification report,
-    and visualizing the confusion matrix.
-
-    This function scales the test data using a pre-trained scaler, applies the trained model to make predictions,
-    and calculates key performance metrics, including accuracy. It then generates a detailed classification report,
-    saves the report to a JSON file, and plots the confusion matrix to visually assess model performance.
-
-    Args:
-        test_data (pd.DataFrame): The test dataset, which includes both features and the target column.
-        scaler_file (Path): Path to the pre-trained scaler file, used to scale the feature columns.
-        model (any): The trained model object, used to make predictions on the test data.
-        model_name (str): The name of the model, used for saving the evaluation report.
-
-    Example:
-        ```python
-        evaluate_model(test_data, scaler_file='scaler.pkl', model=model, model_name='log_reg')
-        ```
-    """
-
-    X_test = test_data.drop(columns=[TARGET_COLUMN]).copy()
-    y_test = test_data[TARGET_COLUMN].copy()
-
-    X_test = scale_data_with_trained_scaler(X_test, scaler_file)
-
-    cc_file = f"{model_name}_emissions.csv"
-    tracker = EmissionsTracker(project_name="eval", output_dir=REPORTS_DIR, output_file=cc_file)
-    tracker.start()
-
-    y_pred = model.predict(X_test)
-
-    tracker.stop()
-
-    accuracy = accuracy_score(y_test, y_pred)
-    logger.info(f"Accuracy: {accuracy * 100:.2f}%")
-
-    report = classification_report(y_test, y_pred)
-    logger.info(f"Classification Report:\n{report}")
-
-    report = classification_report(y_test, y_pred, output_dict=True)
-    with open(REPORTS_DIR / f"{model_name}.json", "w") as json_file:
-        json.dump(report, json_file, indent=4)
-
-    cm = confusion_matrix(y_test, y_pred)
-    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=model.classes_, yticklabels=model.classes_)
-    plt.title("Confusion Matrix")
-    plt.xlabel("Predicted Labels")
-    plt.ylabel("True Labels")
-
-    # Saving the confusion matrix in the reports/figures directory
-    plt.savefig(REPORTS_DIR / f"figures/cm_{model_name}.png", dpi=300, bbox_inches='tight')
-    plt.close()
-
-
-@app.command()
-def main(
-    test_file: Path = PROCESSED_DATA_DIR / "test.tsv",
-    scaler_file: Path = MODELS_DIR / "scaler.pkl",
-    log_reg_model_path: Path = MODELS_DIR / "log_reg.pkl",
-    svm_model_path: Path = MODELS_DIR / "svm.pkl",
-):
-    test_data = pd.read_csv(test_file, sep='\t')
-
-    with open(log_reg_model_path, "rb") as f:
-        log_reg = pickle.load(f)
-
-    with open(svm_model_path, "rb") as f:
-        svm = pickle.load(f)
-
-    evaluate_model(test_data, scaler_file, log_reg, "log_reg_eval")
-    evaluate_model(test_data, scaler_file, svm, "svm_eval")
-
-
-if __name__ == "__main__":
-    app()
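For reference, the deleted evaluation script exposed a Typer CLI with default artifact paths. The snippet below is a hypothetical usage sketch, not part of this commit: it assumes the module is still importable (i.e., a checkout prior to this deletion) and that the default test set, scaler, and model pickles exist on disk.

```python
# Hypothetical sketch: exercising the deleted evaluation CLI via Typer's test runner.
from typer.testing import CliRunner

from product_return_prediction.modeling.eval import app  # removed in this commit

runner = CliRunner()
# Roughly equivalent to: python -m product_return_prediction.modeling.eval
# (uses the default test.tsv / scaler.pkl / log_reg.pkl / svm.pkl paths from main()).
result = runner.invoke(app, [])
print(result.exit_code, result.output)
```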
product_return_prediction/modeling/predict.py
DELETED
@@ -1,60 +0,0 @@
-from pathlib import Path
-
-import typer
-import pickle
-import json
-import pandas as pd
-from loguru import logger
-from codecarbon import EmissionsTracker
-
-from product_return_prediction.config import MODELS_DIR, INTERIM_DATA_DIR, EXTERNAL_DATA_DIR, REPORTS_DIR, RAW_DATA_DIR
-from product_return_prediction.dataset import prepare_inventory, scale_data_with_trained_scaler
-
-app = typer.Typer()
-
-
-@app.command()
-def main(
-    sales_path: Path = RAW_DATA_DIR / "sales.xlsx",
-    inventory_path: Path = EXTERNAL_DATA_DIR / "inventory.csv",
-    json_percentage: Path = INTERIM_DATA_DIR / "colour_return_percentage.json",
-    scaler_file: Path = MODELS_DIR / "scaler.pkl",
-    model_path: Path = MODELS_DIR / "svm.pkl",
-):
-    sales = pd.read_excel(sales_path)
-    inventory = pd.read_csv(inventory_path)
-
-    with open(json_percentage, 'r') as f:
-        percentages = json.load(f)
-
-    # ---- Prepare inventory data for inference ----
-    inventory = prepare_inventory(sales, inventory, percentages)
-
-    with open(model_path, "rb") as f:
-        model = pickle.load(f)
-
-    # ---- Scale 5 random rows from the inventory ----
-    random_row = inventory.sample(n=5)
-    logger.info(f"Your product:\n {random_row}")
-    random_row = scale_data_with_trained_scaler(random_row, scaler_file)
-
-    # ---- Compute predictions and probabilities ----
-    cc_file = "svm_predict_emissions.csv"
-    tracker = EmissionsTracker(project_name="eval", output_dir=REPORTS_DIR, output_file=cc_file)
-    tracker.start()
-
-    predictions = model.predict(random_row)
-    probabilities = model.predict_proba(random_row)
-
-    tracker.stop()
-
-    for pred, prob in zip(predictions, probabilities):
-        prob_confidence = prob.max()
-        if pred == 1:
-            logger.info(f"The product will be returned with {prob_confidence:.2f} confidence")
-        else:
-            logger.info(f"The product will NOT be returned with {prob_confidence:.2f} confidence")
-
-
-if __name__ == "__main__":
-    app()
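The deleted prediction script was likewise a Typer command with path options derived from the parameter names. A hypothetical invocation sketch follows; the overridden model path is only illustrative and assumes the artifact location, it is not specified by this commit.

```python
# Hypothetical sketch: invoking the deleted prediction CLI with an explicit model path.
from typer.testing import CliRunner

from product_return_prediction.modeling.predict import app  # removed in this commit

runner = CliRunner()
# Typer derives the option name --model-path from the model_path parameter;
# "models/svm.pkl" is an assumed location for the pickled SVM.
result = runner.invoke(app, ["--model-path", "models/svm.pkl"])
print(result.output)
```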
product_return_prediction/modeling/train.py
DELETED
@@ -1,143 +0,0 @@
-import pickle
-from pathlib import Path
-
-import dagshub
-import mlflow
-import pandas as pd
-import typer
-from loguru import logger
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import GridSearchCV
-from sklearn.svm import SVC
-from codecarbon import EmissionsTracker
-
-from product_return_prediction.dataset import scale_data_with_trained_scaler
-from product_return_prediction.config import (
-    MODELS_DIR,
-    PROCESSED_DATA_DIR,
-    TARGET_COLUMN,
-    REPORTS_DIR
-)
-
-dagshub.init(repo_owner='se4ai2425-uniba', repo_name='product-return-prediction', mlflow=True)
-
-app = typer.Typer()
-
-
-# TODO The training dataset must have the following columns:
-# Product Type, Product Subtype, Product Gender, Net Sales (FA), Net Sales Units (FA)
-# TARGET_COLUMN, Product Order Count, Total Order Value, Main Material, Colour Return Percentage
-# Total Customer Purchases, Total Customer Returns, Customer Return Percentage
-# TODO The scaler and model paths must be Pickle (.pkl) files
-def train_log_reg(train_data: pd.DataFrame, scaler_file: Path, model_path: Path):
-    """
-    Trains a Logistic Regression model using the provided training data, applies feature scaling,
-    and saves the trained model to a specified file.
-
-    This function trains a Logistic Regression model using the training data. The feature columns are
-    scaled using a pre-trained scaler before fitting the model. The model is then saved to the specified
-    file path, and the training process is tracked using MLflow.
-
-    Args:
-        train_data (pd.DataFrame): The training data, including features and target column.
-        scaler_file (Path): Path to the pre-trained scaler file, used to scale the feature columns.
-        model_path (Path): Path where the trained Logistic Regression model will be saved.
-    """
-
-    run_name = model_path.stem
-    mlflow.start_run(run_name=run_name)
-    mlflow.sklearn.autolog()
-
-    # Apply scaling to the feature columns (excluding the target column)
-    X_train = train_data.drop(columns=[TARGET_COLUMN]).copy()
-    y_train = train_data[TARGET_COLUMN].copy()
-
-    # Scale X_train using the pre-trained scaler
-    X_train = scale_data_with_trained_scaler(X_train, scaler_file)
-
-    # Initialize the Logistic Regression model
-    model = LogisticRegression(max_iter=1000, class_weight="balanced")
-    logger.info(f"Model: {model}")
-
-    cc_file = "log_reg_train_emissions.csv"
-    tracker = EmissionsTracker(project_name="train", output_dir=REPORTS_DIR, output_file=cc_file)
-    tracker.start()
-
-    # Fit the model to the training data
-    model.fit(X_train, y_train)
-
-    tracker.stop()
-    mlflow.end_run()
-
-    # Save the trained model to disk
-    with open(model_path, "wb") as f:
-        pickle.dump(model, f)
-    logger.success(f"Model saved to {model_path}")
-
-
-# TODO The training dataset must have the following columns:
-# Product Type, Product Subtype, Product Gender, Net Sales (FA), Net Sales Units (FA)
-# TARGET_COLUMN, Product Order Count, Total Order Value, Main Material, Colour Return Percentage
-# Total Customer Purchases, Total Customer Returns, Customer Return Percentage
-# TODO The scaler and model paths must be Pickle (.pkl) files
-def train_svm(train_data: pd.DataFrame, scaler_file: Path, model_path: Path):
-    """
-    Trains a Support Vector Machine (SVM) classifier using the provided training data, applies feature scaling,
-    performs hyperparameter tuning via grid search, and saves the trained model to a specified file.
-
-    This function trains an SVM model with hyperparameter optimization using grid search. The feature columns
-    are scaled using a pre-trained scaler before fitting the model. The trained model is saved to the specified
-    file path, and the training process is tracked using MLflow.
-
-    Args:
-        train_data (pd.DataFrame): The training data, including features and target column.
-        scaler_file (Path): Path to the pre-trained scaler file, used to scale the feature columns.
-        model_path (Path): Path where the trained SVM model will be saved.
-    """
-
-    run_name = model_path.stem
-    mlflow.start_run(run_name=run_name)
-    mlflow.sklearn.autolog()
-
-    X_train = train_data.drop(columns=[TARGET_COLUMN]).copy()
-    y_train = train_data[TARGET_COLUMN].copy()
-
-    X_train = scale_data_with_trained_scaler(X_train, scaler_file)
-
-    param_grid = {"C": [0.1, 1, 10], "kernel": ["rbf"], "gamma": ["scale", "auto"]}
-
-    logger.info("Starting Grid Search for best hyperparameters")
-    grid_search = GridSearchCV(SVC(probability=True), param_grid, scoring="balanced_accuracy", cv=10)
-    grid_search.fit(X_train, y_train)
-    model = grid_search.best_estimator_
-
-    cc_file = "svm_train_emissions.csv"
-    tracker = EmissionsTracker(project_name="train", output_dir=REPORTS_DIR, output_file=cc_file)
-    tracker.start()
-
-    model.fit(X_train, y_train)
-
-    tracker.stop()
-    mlflow.end_run()
-
-    with open(model_path, "wb") as f:
-        pickle.dump(model, f)
-    logger.success(f"Model saved to {model_path}")
-
-
-@app.command()
-def main(
-    train_file: Path = PROCESSED_DATA_DIR / "train.tsv",
-    scaler_file: Path = MODELS_DIR / "scaler.pkl",
-    log_reg_model_path: Path = MODELS_DIR / "log_reg.pkl",
-    svm_model_path: Path = MODELS_DIR / "svm.pkl",
-):
-    train_data = pd.read_csv(train_file, sep='\t')
-
-    # ---- Train models ----
-    train_log_reg(train_data, scaler_file, log_reg_model_path)
-    train_svm(train_data, scaler_file, svm_model_path)
-
-
-if __name__ == "__main__":
-    app()
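The deleted training script also ran as a Typer command, but note that it calls dagshub.init() at import time, so running it requires valid DagsHub/MLflow credentials for the se4ai2425-uniba/product-return-prediction repository. A hypothetical usage sketch, assuming a checkout prior to this deletion and the default processed data and scaler on disk:

```python
# Hypothetical sketch: running the deleted training CLI end to end.
# Importing the module triggers dagshub.init(), so DagsHub authentication must be configured.
from typer.testing import CliRunner

from product_return_prediction.modeling.train import app  # removed in this commit

runner = CliRunner()
# Uses the default train.tsv / scaler.pkl / log_reg.pkl / svm.pkl paths defined in main().
result = runner.invoke(app, [])
print(result.exit_code)
```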