Spaces:
Sleeping
Sleeping
import numpy as np | |
import pandas as pd | |
from sklearn.model_selection import train_test_split | |
class WaterPotabilityDataLoader: | |
def __init__(self, file_csv, test_size=0.1, random_state=4): | |
self.file_csv = file_csv | |
self.test_size = test_size | |
self.random_state = random_state | |
self.df_csv = None | |
self.df_train = None | |
self.df_test = None | |
self.X_train = None | |
self.Y_train = None | |
self.X_test = None | |
self.Y_test = None | |
def read_csv_file(self): | |
self.df_csv = pd.read_csv(self.file_csv) | |
return | |
def split_data(self): | |
self.df_train, self.df_test = train_test_split( | |
self.df_csv, test_size=self.test_size, random_state=self.random_state | |
) | |
return | |
def get_data_from_data_frame(self, which_set="train"): | |
""" | |
--------- | |
Arguments | |
--------- | |
which_set : str | |
a string indicating for which set the data arrays should be returned | |
------- | |
Returns | |
------- | |
(X_arr, Y_arr) : tuple | |
a tuple of numpy arrays of features and labels for the appropriate set | |
""" | |
if which_set == "train": | |
data_frame = self.df_train | |
else: | |
data_frame = self.df_test | |
arr = data_frame.to_numpy() | |
X_arr, Y_arr = arr[:, :-1], arr[:, -1:].reshape(-1) | |
return X_arr, Y_arr | |
def get_dict_nan_counts_per_col(data_frame): | |
""" | |
--------- | |
Arguments | |
--------- | |
data_frame : pd.DataFrame | |
a pandas dataframe of some dataset | |
------- | |
Returns | |
------- | |
dict_nan_counts_per_col : dict | |
a dictionary of NaN counts per column | |
""" | |
dict_nan_counts_per_col = data_frame.isna().sum().to_dict() | |
dict_nan_counts_per_col = dict( | |
sorted(dict_nan_counts_per_col.items(), key=lambda kv: kv[1], reverse=True) | |
) | |
return dict_nan_counts_per_col | |