# Source page header: abhishekrs4 - "updated scripts in the modeling module" (77b575f)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
class WaterPotabilityDataLoader:
    """
    Read a csv dataset, split it into train and test data frames, and expose
    either split as numpy arrays of features and labels.

    Typical call order: read_csv_file(), split_data(), and then
    get_data_from_data_frame() for each split that is needed.
    """

    def __init__(self, file_csv, test_size=0.1, random_state=4):
        """
        ---------
        Arguments
        ---------
        file_csv : str or file-like
            csv source handed unchanged to pd.read_csv
        test_size : float or int
            forwarded as test_size to train_test_split
        random_state : int
            forwarded as random_state to train_test_split
        """
        self.file_csv = file_csv
        self.test_size = test_size
        self.random_state = random_state

        # populated by read_csv_file()
        self.df_csv = None

        # populated by split_data()
        self.df_train = None
        self.df_test = None

        # placeholders; no method of this class assigns them
        self.X_train = None
        self.Y_train = None
        self.X_test = None
        self.Y_test = None

    def read_csv_file(self):
        """
        Read self.file_csv with pd.read_csv and store the result in
        self.df_csv.
        """
        self.df_csv = pd.read_csv(self.file_csv)

    def split_data(self):
        """
        Split self.df_csv into self.df_train and self.df_test using
        train_test_split with the configured test_size and random_state.
        The csv is read first if read_csv_file() has not been called yet.
        """
        if self.df_csv is None:
            # allow split_data() to be used without an explicit read beforehand
            self.read_csv_file()

        self.df_train, self.df_test = train_test_split(
            self.df_csv, test_size=self.test_size, random_state=self.random_state
        )

    def get_data_from_data_frame(self, which_set="train"):
        """
        ---------
        Arguments
        ---------
        which_set : str
            a string indicating for which set the data arrays should be returned;
            "train" selects the train split, any other value selects the test split

        -------
        Returns
        -------
        (X_arr, Y_arr) : tuple
            a tuple of numpy arrays of features and labels for the appropriate set;
            features are all columns except the last, labels are the last column
            flattened to one dimension

        ------
        Raises
        ------
        RuntimeError
            if the selected split has not been created yet
        """
        data_frame = self.df_train if which_set == "train" else self.df_test

        if data_frame is None:
            # fail with an actionable message instead of an AttributeError on None
            raise RuntimeError(
                "data has not been split yet; call split_data() before "
                "get_data_from_data_frame()"
            )

        arr = data_frame.to_numpy()
        # the last column holds the labels, everything before it the features
        X_arr, Y_arr = arr[:, :-1], arr[:, -1:].reshape(-1)
        return X_arr, Y_arr
def get_dict_nan_counts_per_col(data_frame):
    """
    Count the missing values in every column of a data frame and return the
    counts ordered from the highest to the lowest.

    ---------
    Arguments
    ---------
    data_frame : pd.DataFrame
        a pandas dataframe of some dataset

    -------
    Returns
    -------
    dict_nan_counts_per_col : dict
        a dictionary of NaN counts per column, keyed by column label and
        ordered by descending count; columns with equal counts keep their
        original column order
    """
    # per-column number of cells that isna() reports as missing
    nan_counts = data_frame.isna().sum().to_dict()

    # sorted() is stable even with reverse=True, so ties keep column order
    ordered_items = sorted(nan_counts.items(), key=lambda kv: kv[1], reverse=True)

    dict_nan_counts_per_col = dict(ordered_items)
    return dict_nan_counts_per_col