# OCW-FraudDetection / train.py
# kgauvin603's picture
# Update train.py
# 176392b verified
import seaborn as sns
import pandas as pd
import numpy as np
import pyod
import pyreadr
import urllib
import rdata
import wget
import os
import joblib
import warnings
from pyod.models.mcd import MCD
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
# Silence every warning for the rest of the run so the console only shows the
# progress messages printed by this script.
warnings.filterwarnings("ignore")
# Download the dataset, but only when no local copy exists yet.
# wget.download() does not overwrite an existing file: it stores the new copy
# under a renamed path instead. An unconditional download would therefore
# re-fetch the file on every run and leave duplicates behind, while the code
# below would still parse the old copy at dst_path.
url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
dst_path = "./creditcard.Rdata"
if not os.path.exists(dst_path):
    wget.download(url, dst_path)
# Load the dataset: parse the R data file, then convert it to Python objects
parsed_res = rdata.parser.parse_file(dst_path)
res = rdata.conversion.convert(parsed_res)
# Take the 'creditcard' table, renumber its rows from 0 and drop the 'Time' column
dataset = res['creditcard'].reset_index(drop=True).drop(['Time'], axis=1)
# Separate the feature columns from the 'Class' label column
df = dataset.drop(columns=['Class'])
# Column names are forced to strings so later steps can select them by name
df.columns = df.columns.astype(str)
# Labels are cast to integers
y = dataset['Class'].astype(int)
print("Data subsets created")
# First split: 60% of the rows go to training, the rest is held out for
# testing. Stratifying on y keeps the class proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    train_size=0.6,
    random_state=0,
    stratify=y,
)
# Second split: keep a stratified 20% subsample of the training part
# (12% of all rows) and discard the remainder.
X_train, _, y_train, _ = train_test_split(
    X_train,
    y_train,
    train_size=0.2,
    random_state=0,
    stratify=y_train,
)
# Renumber the rows of the training subsample from 0
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
# Names of the numerical input columns: V1 through V28, followed by Amount
numerical_features = [f"V{index}" for index in range(1, 29)]
numerical_features.append("Amount")
# Pipeline for the numerical columns: standardisation is its only step
numerical_pipeline = make_pipeline(StandardScaler())
# Column transformer that sends the numerical columns through the scaling
# pipeline (no other column group is configured here)
preprocessor = make_column_transformer((numerical_pipeline, numerical_features))
# MCD outlier detector from PyOD, created with its default settings
clf = MCD()
# Full pipeline: scale the numerical columns, then hand them to the MCD model
model_pipeline = make_pipeline(preprocessor, clf)
print("Preprocessing Data")
# Fit the scaler and the detector on the training subsample only; no labels
# are passed, so the fit is unsupervised.
model_pipeline.fit(X_train)
# Predict on the held-out rows. The result is not used later in the visible
# part of this script.
y_test_pred = model_pipeline.predict(X_test)
print("Serializing Model")
# Persist the fitted pipeline in the current working directory
saved_model_path = "model.joblib"
joblib.dump(value=model_pipeline, filename=saved_model_path)
print(f"Model Serialized and Saved to {saved_model_path}")