|
import pandas as pd
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import seaborn as sns
|
|
from scipy.stats.mstats import winsorize
|
|
from sklearn.preprocessing import StandardScaler,MinMaxScaler
|
|
from sklearn.metrics import root_mean_squared_error
|
|
from scipy.stats import yeojohnson
|
|
import evaluationer
|
|
from sklearn.model_selection import train_test_split as tts
|
|
def _flag_column(df, col, method, lower, upper):
    """Return ``(report_row_or_None, outlier_index_labels)`` for one column.

    A row of ``df`` is an outlier for ``col`` when its value is strictly
    below ``lower`` or strictly above ``upper``.
    """
    mask = (df[col] < lower) | (df[col] > upper)
    hits = df.index[mask.to_numpy()].tolist()
    if not hits:
        return None, hits
    row = {
        "method": method,
        "columns name": col,
        "upper limit": round(upper, 2),
        "lower limit": round(lower, 2),
        "no of Rows": len(hits),
        "percentage outlier": round(len(hits) * 100 / len(df), 2),
    }
    return row, hits


def detect_outliers(df, num_cols):
    """Detect outlier rows column by column using Z-score or IQR limits.

    Columns whose skewness lies in ``[-0.5, 0.5]`` are checked against
    ``mean +/- 3 * std`` (reported as ``"ZScore"``); every other column is
    checked against ``Q1 - 1.5 * IQR`` / ``Q3 + 1.5 * IQR`` (reported as
    ``"IQR"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    num_cols : sequence of column labels, str, or None
        Columns to inspect.  ``None`` or an empty sequence selects every
        column of ``df`` whose dtype is not ``object``.  A single string is
        treated as one column name.

    Returns
    -------
    outlier_df : pandas.DataFrame
        One row per column that has at least one outlier, with the columns
        ``method``, ``columns name``, ``upper limit``, ``lower limit``,
        ``no of Rows`` and ``percentage outlier``.  Z-score rows come first,
        then IQR rows.
    outlier_indexes : list
        De-duplicated index labels of all rows flagged in any column, in
        order of first detection.

    Notes
    -----
    For backward compatibility the results are also published as the module
    globals ``outlier_df``, ``zscore_cols``, ``iqr_cols`` and
    ``outlier_indexes``.
    """
    global outlier_df, zscore_cols, outlier_indexes, iqr_cols

    report_columns = [
        "method",
        "columns name",
        "upper limit",
        "lower limit",
        "no of Rows",
        "percentage outlier",
    ]

    # Normalise the column selection; "nothing given" means all non-object
    # columns (the original crashed on None instead of falling back).
    if num_cols is None:
        num_cols = df.select_dtypes(exclude="object").columns.tolist()
    elif isinstance(num_cols, str):
        num_cols = [num_cols]
    elif len(num_cols) == 0:
        num_cols = df.select_dtypes(exclude="object").columns.tolist()
    else:
        num_cols = list(num_cols)

    # Roughly symmetric columns use the Z-score rule, skewed ones use IQR.
    # A NaN skewness fails the range test and therefore lands in the IQR set.
    zscore_cols = []
    iqr_cols = []
    for col in num_cols:
        if -0.5 <= df[col].skew() <= 0.5:
            zscore_cols.append(col)
        else:
            iqr_cols.append(col)

    rows = []
    flagged = []

    if zscore_cols:
        for col in zscore_cols:
            mean = df[col].mean()
            std = df[col].std()
            row, hits = _flag_column(
                df, col, "ZScore", mean - 3 * std, mean + 3 * std
            )
            flagged.extend(hits)
            if row is not None:
                rows.append(row)
    else:
        print("No columns for Zscore method")

    if iqr_cols:
        for col in iqr_cols:
            q3 = df[col].quantile(.75)
            q1 = df[col].quantile(.25)
            iqr = q3 - q1
            row, hits = _flag_column(
                df, col, "IQR", q1 - 1.5 * iqr, q3 + 1.5 * iqr
            )
            flagged.extend(hits)
            if row is not None:
                rows.append(row)
    else:
        print("No columns for IQR method")

    # Build the report once instead of concatenating onto an empty frame.
    outlier_df = pd.DataFrame(rows, columns=report_columns)
    # A row can be an outlier in several columns; keep each label once.
    outlier_indexes = list(dict.fromkeys(flagged))

    return outlier_df, outlier_indexes
|
|
|
|
|
|
def outlier_handling(df, y, model, outlier_indexes=None, outlier_cols=None,
                     method=root_mean_squared_error, test_size=0.2,
                     random_state=42, eva="reg"):
    """Build several outlier-treated copies of ``df`` and evaluate each one.

    For ``eva == "reg"`` the following variants are produced (in this order):
    standard-scaled, min-max-scaled, outlier rows dropped, log, sqrt,
    Yeo-Johnson, rank, winsorized, winsorized-log mapped back with ``exp``,
    and winsorized-sqrt mapped back by squaring.  The log/sqrt based variants
    are only built when the selected columns contain no negative values.
    For ``eva == "class"`` only standard-scaled, min-max-scaled and rank
    variants of the whole frame are produced.

    Every variant is split with ``train_test_split`` and handed to
    ``evaluationer.evaluation`` together with ``model`` and ``method``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame.
    y : indexable by ``df.index`` labels
        Target values; ``y[frame.index]`` is used to align the target with
        each variant (presumably a pandas Series -- confirm with callers).
    model : object
        Passed through unchanged to ``evaluationer.evaluation``.
    outlier_indexes : sequence of index labels, optional
        Rows to drop for the "outliers dropped" variant.  ``None`` or empty
        keeps all rows.
    outlier_cols : sequence of column labels, str, or None
        Columns to transform.  ``None`` or an empty sequence selects every
        non-object column of ``df``.
    method : callable
        Metric forwarded to ``evaluationer.evaluation``.
    test_size, random_state
        Forwarded to ``train_test_split``.
    eva : {"reg", "class"}
        Which evaluation mode to run.

    Returns
    -------
    tuple or None
        ``(evaluation_df, frames, frame_names)`` where ``evaluation_df`` is
        ``evaluationer.reg_evaluation_df`` or
        ``evaluationer.classification_evaluation_df``.  ``None`` is returned
        for any other ``eva`` value.

    Notes
    -----
    For backward compatibility each variant is also published as a module
    global of the same name.  The log/sqrt globals are left untouched when
    negative values prevent those transforms.
    """
    global outliers_dropped_df, log_transformed_df, sqrt_transformed_df
    global yeo_johnson_transformed_df, rank_transformed_df, std_scaler_df
    global winsorize_transformed_df, minmaxscaler_df
    global inverse_log_transformed_winsorize_df
    global inverse_sqrt_transformed_winsorize_df

    if eva == "reg":
        if outlier_indexes is None or len(outlier_indexes) == 0:
            print("no outlier indexes passed")
            outliers_dropped_df = df.copy()
        else:
            outliers_dropped_df = df.drop(index=outlier_indexes)

        # Resolve the columns to transform once; the original used
        # ``outlier_cols`` even when it was None, which raised KeyError.
        if outlier_cols is None:
            cols = df.select_dtypes(exclude="O").columns.tolist()
        elif isinstance(outlier_cols, str):
            cols = [outlier_cols]
        elif len(outlier_cols) == 0:
            cols = df.select_dtypes(exclude="O").columns.tolist()
        else:
            cols = list(outlier_cols)

        winsor_limits = [0.05, 0.05]
        # log / sqrt are only defined for non-negative data.
        non_negative = not (df[cols] < 0).any().any()

        if non_negative:
            # Small offset keeps log(0) finite.
            log_transformed_df = df.copy()
            log_transformed_df[cols] = np.log(log_transformed_df[cols] + 1e-5)

            sqrt_transformed_df = df.copy()
            sqrt_transformed_df[cols] = np.sqrt(sqrt_transformed_df[cols] + 1e-5)

            # Winsorize in the transformed space, then map back.
            inverse_log_transformed_winsorize_df = log_transformed_df.copy()
            inverse_sqrt_transformed_winsorize_df = sqrt_transformed_df.copy()
            for column in cols:
                inverse_log_transformed_winsorize_df[column] = np.exp(
                    winsorize(inverse_log_transformed_winsorize_df[column],
                              limits=winsor_limits)
                )
                inverse_sqrt_transformed_winsorize_df[column] = (
                    winsorize(inverse_sqrt_transformed_winsorize_df[column],
                              limits=winsor_limits)
                ) ** 2
        else:
            print("df have values less than zero")

        std_scaler_df = df.copy()
        std_scaler_df[cols] = StandardScaler().fit_transform(std_scaler_df[cols])

        minmaxscaler_df = df.copy()
        minmaxscaler_df[cols] = MinMaxScaler().fit_transform(minmaxscaler_df[cols])

        yeo_johnson_transformed_df = df.copy()
        for column in cols:
            try:
                transformed, _ = yeojohnson(yeo_johnson_transformed_df[column])
            except Exception:
                # Best effort: keep the untouched column when the fit fails.
                print(f"Yeo-Johnson transformation failed for column '{column}'. Original data used.")
            else:
                yeo_johnson_transformed_df[column] = transformed

        rank_transformed_df = df.copy()
        rank_transformed_df[cols] = rank_transformed_df[cols].rank()

        winsorize_transformed_df = df.copy()
        for column in cols:
            winsorize_transformed_df[column] = winsorize(
                winsorize_transformed_df[column], limits=winsor_limits
            )

        # Assemble (name, frame) pairs in the original reporting order.
        variants = [
            ("std_scaler_df", std_scaler_df),
            ("minmaxscaler_df", minmaxscaler_df),
            ("outliers_dropped_df", outliers_dropped_df),
        ]
        if non_negative:
            variants += [
                ("log_transformed_df", log_transformed_df),
                ("sqrt_transformed_df", sqrt_transformed_df),
            ]
        variants += [
            ("yeo_johnson_transformed_df", yeo_johnson_transformed_df),
            ("rank_transformed_df", rank_transformed_df),
            ("winsorize_transformed_df", winsorize_transformed_df),
        ]
        if non_negative:
            variants += [
                ("inverse_log_transformed_winsorize_df",
                 inverse_log_transformed_winsorize_df),
                ("inverse_sqrt_transformed_winsorize_df",
                 inverse_sqrt_transformed_winsorize_df),
            ]

        outlier_handled_df_name = [name for name, _ in variants]
        outlier_handled_df = [frame for _, frame in variants]

        for name, frame in variants:
            # Align the target with the frame (rows may have been dropped).
            X_train, X_test, y_train, y_test = tts(
                frame, y[frame.index],
                test_size=test_size, random_state=random_state,
            )
            # Forward the caller's metric; it was previously ignored.
            evaluationer.evaluation(name, X_train, X_test, y_train, y_test,
                                    model, method, eva)

        return evaluationer.reg_evaluation_df, outlier_handled_df, outlier_handled_df_name

    elif eva == "class":
        # Build fresh float frames rather than assigning floats into the
        # original dtypes through ``.loc[:, :]``.
        std_scaler_df = pd.DataFrame(
            StandardScaler().fit_transform(df),
            index=df.index, columns=df.columns,
        )
        minmaxscaler_df = pd.DataFrame(
            MinMaxScaler().fit_transform(df),
            index=df.index, columns=df.columns,
        )
        rank_transformed_df = df.rank()

        outlier_handled_df = [std_scaler_df, minmaxscaler_df, rank_transformed_df]
        outlier_handled_df_name = ["std_scaler_df", "minmaxscaler_df", "rank_transformed_df"]

        for name, frame in zip(outlier_handled_df_name, outlier_handled_df):
            X_train, X_test, y_train, y_test = tts(
                frame, y[frame.index],
                test_size=test_size, random_state=random_state,
            )
            evaluationer.evaluation(name, X_train, X_test, y_train, y_test,
                                    model, method, eva="class")

        return evaluationer.classification_evaluation_df, outlier_handled_df, outlier_handled_df_name

    # Unknown mode: nothing is evaluated (matches the original fall-through).
    return None
|
|
|
|
|
|
|