import os
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from nt_gan import GAN
from nt_gg import GG
dataset_directory = 'datasets'
saved_models_path = 'outputs'
def prepare_architecture(arff_data_path):
"""
This function create the architecture of the GAN network.
The generator and the discriminator are created and then combined into the GAN model
:param arff_data_path: data path for the arff file
:return: a dictionary with all the relevant variables for the next stages
"""
    data, meta_data = arff.loadarff(arff_data_path)  # Reads the ARFF file into a tuple of (data, metadata)
df = pd.DataFrame(data)
columns = df.columns
transformed_data, x, x_scaled, meta_data_rev, min_max_scaler = create_scaled_data(df, meta_data)
    number_of_features = len(transformed_data.columns)  # Number of input features, used when defining the GAN and its training parameters
return x_scaled, meta_data_rev, columns, min_max_scaler, number_of_features
def create_scaled_data(df, meta_data):
"""
:param df:
:param meta_data:
:return:
"""
    meta_data_dict = {k: {a.replace(' ', ''): b + 1 for b, a in enumerate(v.values)} for k, v in
                      meta_data._attributes.items() if
                      v.type_name != 'numeric'}  # Codes start from one, not zero, because zero is reserved for NaN/unknown values
    meta_data_rev = {k: {b + 1: a.replace(' ', '') for b, a in enumerate(v.values)} for k, v in
                     meta_data._attributes.items() if
                     v.type_name != 'numeric'}  # Codes start from one, not zero, because zero is reserved for NaN/unknown values
transformed_data = df.copy()
for col in df.columns:
if col in meta_data_dict:
            # ARFF nominal values are loaded as byte strings (e.g. b'Private'); the text between the
            # quotes is looked up in the metadata, and values that cannot be found there are encoded
            # as 0, the code reserved for NaN/unknown values
            transformed_data[col] = transformed_data[col].apply(
                lambda x: meta_data_dict[col][str(x).split('\'')[1]] if str(x).split('\'')[1] in meta_data_dict[
                    col] else 0)
x = transformed_data.values # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
return transformed_data, x, x_scaled, meta_data_rev, min_max_scaler
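# --- Illustrative sketch only (not part of the assignment pipeline) ---
# A minimal example of what create_scaled_data produces, assuming a toy table whose nominal
# column has already been mapped to the 1-based integer codes described above (0 is the
# reserved NaN/unknown code). The names used here are hypothetical.
def _example_scaling_sketch():
    toy = pd.DataFrame({'workclass_code': [1, 2, 2, 0], 'age': [25, 40, 33, 58]})
    scaler = preprocessing.MinMaxScaler()
    toy_scaled = scaler.fit_transform(toy.values)
    # Every column of toy_scaled now lies in [0, 1]; scaler.inverse_transform(toy_scaled)
    # recovers the original integer codes and numeric values, which is exactly what
    # re_scaled_data relies on below.
    return toy_scaled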
def re_scaled_data(data, columns, meta_data_rev, min_max_scaler):
"""
This function re-scaled the fake data to the original format.
:param data: the data we want to re scaled
:param columns:
:param meta_data_rev:
:return:
"""
data_inv = min_max_scaler.inverse_transform(data)
df = pd.DataFrame(data_inv, columns=columns)
transformed_data = df.copy()
for col in transformed_data.columns:
if col in meta_data_rev:
            # Codes that cannot be found in the reverse metadata mapping are treated as NaN
transformed_data[col] = transformed_data[col].apply(
lambda x: meta_data_rev[col][int(round(x))] if int(round(x)) in meta_data_rev[
col] else np.nan)
return transformed_data
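# --- Illustrative sketch only (hypothetical usage, never called by the script) ---
# A generated sample lives in the scaled [0, 1] space; re_scaled_data inverts the scaler and
# replaces the integer codes with their nominal labels, so fake rows can be compared to the
# original ARFF records column by column.
def _example_rescale_roundtrip(fake_scaled, columns, meta_data_rev, min_max_scaler):
    decoded = re_scaled_data(data=fake_scaled, columns=columns,
                             meta_data_rev=meta_data_rev, min_max_scaler=min_max_scaler)
    # Codes that do not exist in meta_data_rev come back as NaN, mirroring the forward
    # encoding in create_scaled_data where unknown values are mapped to 0.
    return decoded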
def first_question():
"""
This function answers the first question
:return:
"""
to_plot_losses = True
    results_output = os.path.join(saved_models_path, 'question_one_results.csv')
results = {'dataset': [], 'lr': [], 'ep': [], 'bs': [], 'alpha': [], 'dropout': [], 'gen_loss': [], 'dis_loss': [],
'activation': [], 'fooled_len': [], 'not_fooled_len': [], 'mean_min_distance_fooled': [],
'mean_min_distance_not_fooled': [], 'mean_min_distance_gap': []}
    # Combination score: w1 * (MMDF + MMDNF) - w3 * MMDG + w2 * (NFL / 100)
    # MMDF, MMDNF and MMDG correspond to the mean_min_distance_fooled, mean_min_distance_not_fooled
    # and mean_min_distance_gap columns below (MMDG = MMDNF - MMDF), and NFL to not_fooled_len;
    # see the illustrative scoring sketch after this function.
# data_name = ["adult", "bank-full"]
# learning_rate = [0.01, 0.001, 0.0001]
# epochs = [5, 10, 15]
# batch_size = [64, 128, 1024]
# alpha_relu = [0.2, 0.5]
# dropout = [0.3, 0.5]
data_name = ["adult"]
learning_rate = [0.001]
epochs = [10]
batch_size = [128]
alpha_relu = [0.5]
dropout = [0.5]
loss = 'binary_crossentropy'
activation = 'sigmoid'
for data in data_name:
for lr in learning_rate:
for ep in epochs:
for bs in batch_size:
for al in alpha_relu:
for dr in dropout:
arff_data_path = f'./datasets/{data}.arff'
model_name = f'data_{data}_ep_{ep}_bs_{bs}_lr_{lr}_al_{al}_dr_{dr}'
pca_output = os.path.join(saved_models_path, f'{model_name}_pca.png')
fooled_output = os.path.join(saved_models_path, f'{model_name}_fooled.csv')
not_fooled_output = os.path.join(saved_models_path, f'{model_name}_not_fooled.csv')
x_scaled, meta_data_rev, columns, min_max_scaler, number_of_features = prepare_architecture(
arff_data_path)
gan_obj = GAN(number_of_features=number_of_features, saved_models_path=saved_models_path,
learning_rate=lr, alpha_relu=al, dropout=dr,
loss=loss, activation=activation)
gen_loss, dis_loss = gan_obj.train(scaled_data=x_scaled, epochs=ep, batch_size=bs,
to_plot_losses=to_plot_losses, model_name=model_name)
dis_fooled_scaled, dis_not_fooled_scaled, mean_min_distance_fooled, mean_min_distance_not_fooled = gan_obj.test(
scaled_data=x_scaled, sample_num=100, pca_output=pca_output)
dis_fooled = re_scaled_data(data=dis_fooled_scaled, columns=columns,
meta_data_rev=meta_data_rev,
min_max_scaler=min_max_scaler)
dis_fooled.to_csv(fooled_output)
dis_not_fooled = re_scaled_data(data=dis_not_fooled_scaled, columns=columns,
meta_data_rev=meta_data_rev,
min_max_scaler=min_max_scaler)
dis_not_fooled.to_csv(not_fooled_output)
results['dataset'].append(data)
results['lr'].append(lr)
results['ep'].append(ep)
results['bs'].append(bs)
results['alpha'].append(al)
results['dropout'].append(dr)
results['gen_loss'].append(gen_loss)
results['dis_loss'].append(dis_loss)
results['activation'].append(activation)
results['fooled_len'].append(len(dis_fooled_scaled))
results['not_fooled_len'].append(len(dis_not_fooled_scaled))
results['mean_min_distance_fooled'].append(mean_min_distance_fooled)
results['mean_min_distance_not_fooled'].append(mean_min_distance_not_fooled)
                            results['mean_min_distance_gap'].append(mean_min_distance_not_fooled - mean_min_distance_fooled)
results_df = pd.DataFrame.from_dict(results)
results_df.to_csv(results_output, index=False)
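# --- Illustrative sketch only: scoring the hyperparameter combinations ---
# The score comment at the top of first_question combines the distance statistics and the
# fooled counts. The weights w1, w2 and w3 are not fixed anywhere in this file, so the values
# below are placeholders; this helper only shows how the formula maps onto the columns of
# question_one_results.csv and is never called by the script.
def _example_combination_score(results_df, w1=1.0, w2=1.0, w3=1.0):
    mmdf = results_df['mean_min_distance_fooled']
    mmdnf = results_df['mean_min_distance_not_fooled']
    mmdg = results_df['mean_min_distance_gap']  # equals mmdnf - mmdf
    nfl = results_df['not_fooled_len']
    return w1 * (mmdf + mmdnf) - w3 * mmdg + w2 * (nfl / 100)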
def second_question():
    """
    This function answers the second question: the general generator (GG) is trained on each
    dataset, the generator loss and probability error are recorded, and the discriminator and
    generator results are plotted on a held-out test set.
    """
data_name = ["adult", "bank-full"]
learning_rate = [0.001]
epochs = [10]
batch_size = [128]
alpha_relu = [0.2]
dropout = [0.3]
results = {'dataset': [], 'lr': [], 'ep': [], 'bs': [], 'alpha': [], 'dropout': [], 'gen_loss': [], 'proba_error': []}
combs = len(data_name) * len(learning_rate) * len(epochs) * len(batch_size) * len(alpha_relu) * len(dropout)
i = 1
for data in data_name:
for lr in learning_rate:
for ep in epochs:
for bs in batch_size:
for al in alpha_relu:
for dr in dropout:
print(f'Running combination {i}/{combs}')
data_path = f'./datasets/{data}.arff'
model_name = f'data_{data}_ep_{ep}_bs_{bs}_lr_{lr}_part2'
x_scaled, meta_data_rev, cols, min_max_scaler, feature_num = prepare_architecture(data_path)
general_generator = GG(feature_num, saved_models_path, lr, dr, al)
x_train, x_test, y_train, y_test = train_test_split(x_scaled[:, :-1], x_scaled[:, -1], test_size=0.1)
general_generator.train_gg(x_train, y_train, ep, bs, model_name, data, saved_models_path, True)
error = general_generator.get_error()
results['dataset'].append(data)
results['lr'].append(lr)
results['ep'].append(ep)
results['bs'].append(bs)
results['alpha'].append(al)
results['dropout'].append(dr)
results['gen_loss'].append(general_generator.losses['gen_loss'][-1])
results['proba_error'].append(error.mean())
i += 1
# Test set performance
general_generator.plot_discriminator_results(x_test, y_test, data, saved_models_path)
general_generator.plot_generator_results(data, saved_models_path)
    results_output = os.path.join(saved_models_path, 'question_two_results.csv')
    results_df = pd.DataFrame.from_dict(results)
    results_df.to_csv(results_output, index=False)
def main():
# first_question()
second_question()
if __name__ == '__main__':
main()