import seaborn as sns
import pandas as pd
import numpy as np
import pyod
import pyreadr
import urllib
import rdata
import wget
import os
import joblib
import warnings
from pyod.models.mcd import MCD
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
# Ignore all warnings
warnings.filterwarnings("ignore")
# Download the dataset
url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
dst_path = "./creditcard.Rdata"
wget.download(url, dst_path)
# Load the dataset: parse the .Rdata file, convert it to a pandas DataFrame, and drop the 'Time' column
parsed_res = rdata.parser.parse_file(dst_path)
res = rdata.conversion.convert(parsed_res)
dataset = res['creditcard'].reset_index(drop=True).drop(['Time'], axis=1)
# Prepare the data: integer labels and feature matrix
y = dataset['Class'].astype(int)  # Convert labels to integers
df = dataset.drop(['Class'], axis=1)
df.columns = df.columns.astype(str)
# Split the data: 60% train / 40% test, stratified on the class label,
# then keep only 20% of that training portion for fitting (the rest is discarded)
X_train, X_test, y_train, y_test = train_test_split(df, y, train_size=0.6, random_state=0, stratify=y)
X_train, _, y_train, _ = train_test_split(X_train, y_train, train_size=0.2, random_state=0, stratify=y_train)
print("Data subsets created")
# Reset indices of the reduced training set
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
# Define the numerical features and the pipeline applied to them
numerical_features = [f'V{i}' for i in range(1, 29)] + ['Amount']
numerical_pipeline = make_pipeline(
    StandardScaler()  # Standardize features to zero mean and unit variance
)
# Create a column transformer named preprocessor that applies the numerical
# pipeline to the V1..V28 and Amount columns (all remaining columns, so nothing is dropped)
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features)
)
# Create the Minimum Covariance Determinant (MCD) outlier detector
clf = MCD()
# Combine the preprocessing step (scaling) and the MCD detector in a single pipeline
model_pipeline = make_pipeline(
    preprocessor,  # Apply preprocessing steps
    clf            # Fit the MCD model
)
print("Preprocessing Data") | |
# Fit the model and train model to predict anomalies | |
model_pipeline.fit(X_train) | |
y_test_pred = model_pipeline.predict(X_test) | |
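# --- Optional sanity check (illustrative addition, not part of the original script):
# compare predictions with the held-out labels; pyod's predict() returns 1 for
# outliers (flagged fraud) and 0 for inliers.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_test_pred, digits=4))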
print("Serializing Model") | |
# Save the model in the current working directory | |
saved_model_path = "model.joblib" | |
joblib.dump(model_pipeline, saved_model_path) | |
print(f"Model Serialized and Saved to {saved_model_path}") | |