# importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# note: root_mean_squared_error and root_mean_squared_log_error require scikit-learn >= 1.4
from sklearn.metrics import (
    root_mean_squared_error,
    r2_score,
    mean_squared_error,
    root_mean_squared_log_error,
    mean_absolute_error,
    mean_squared_log_error,
)
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, average_precision_score

# creating dataframes that collect the evaluation results
reg_evaluation_df = pd.DataFrame({"evaluation_df_method": [],  # label identifying the evaluation run
                                  "model": [],                  # fitted regression model
                                  "method": [],                 # name of the evaluation metric used
                                  "train_r2": [],               # train R2 score
                                  "test_r2": [],                # test R2 score
                                  "adjusted_r2_train": [],      # adjusted R2 score for train
                                  "adjusted_r2_test": [],       # adjusted R2 score for test
                                  "train_evaluation": [],       # train score from the chosen metric
                                  "test_evaluation": []         # test score from the chosen metric
                                  })

classification_evaluation_df = pd.DataFrame({"evaluation_df_method" :[],
                        'model': [],
                        'train_f1': [],
                        'test_f1': [],
                        'train_acc': [],
                        'test_acc': [],
                        'precision_train': [],
                        'precision_test': [],
                        'recall_train': [],
                        'recall_test': []
                    })

# function that fits a model and appends its scores to the relevant evaluation dataframe
def evaluation(evaluation_df_method, X_train, X_test, y_train, y_test, model, method, eva):
    # evaluation_df_method labels the run, model is the estimator to fit,
    # method is the metric function (used for regression), eva selects "reg" or "class"
    global y_pred_train, y_pred_test, y_pred_proba_train, y_pred_proba_test
    model.fit(X_train, y_train)  # model fitting
    y_pred_train = model.predict(X_train)  # model prediction for train
    y_pred_test = model.predict(X_test)  # model prediction for test

    if eva == "reg":
        
        train_r2 = r2_score(y_train, y_pred_train) # evaluating r2 score for train
        test_r2 = r2_score(y_test, y_pred_test)  # evaluating r2 score for test
        
        n_r_train, n_c_train = X_train.shape # getting no of rows and columns of train data
        n_r_test,  n_c_test = X_test.shape # getting no of rows and columns of test data
        
        adj_r2_train = 1 - ((1 - train_r2)*(n_r_train - 1)/ (n_r_train - n_c_train - 1))  # evaluating adjusted r2 score for train
        adj_r2_test = 1 - ((1 - test_r2)*(n_r_test - 1)/ (n_r_test - n_c_test - 1)) # evaluating adjusted r2 score for test
    
        train_evaluation = method(y_train, y_pred_train) # evaluating train error
        test_evaluation = method(y_test, y_pred_test) # evaluating test error
        
        if method == root_mean_squared_error:
            a = "root_mean_squared_error"
        elif method ==root_mean_squared_log_error:
            a = "root_mean_squared_log_error"
        elif method == mean_absolute_error:
            a = "mean_absolute_error"
        elif method == mean_squared_error:
            a = "mean_squared_error"
        elif method == mean_squared_log_error:
            a = "mean_squared_log_error"    
        
        # declaring global dataframes
        global reg_evaluation_df, temp_df

        # temporary dataframe to concatenate later into the main evaluation dataframe
        temp_df = pd.DataFrame({"evaluation_df_method": [evaluation_df_method],
                                "model": [model],
                                "method": [a],
                                "train_r2": [train_r2],
                                "test_r2": [test_r2],
                                "adjusted_r2_train": [adj_r2_train],
                                "adjusted_r2_test": [adj_r2_test],
                                "train_evaluation": [train_evaluation],
                                "test_evaluation": [test_evaluation]
                                })
        reg_evaluation_df = pd.concat([reg_evaluation_df, temp_df]).reset_index(drop=True)

        return reg_evaluation_df  # returning the updated evaluation dataframe

    elif eva == "class":
                
        # y_pred_proba_train= model.predict_proba(X_train)
        # y_pred_proba_test= model.predict_proba(X_test)

        unique_classes = np.unique(y_train)
    
        # Determine the average method
        if len(unique_classes) == 2:
            # Binary classification
            print("Using 'binary' average for binary classification.")
            average_method = 'binary'
        elif len(unique_classes)!=2:
            # Determine the distribution of the target column
            class_counts = np.bincount(y_train)
            
            # Check if the dataset is imbalanced
            imbalance_ratio = max(class_counts) / min(class_counts)
            
            if imbalance_ratio > 1.5:
                # Imbalanced dataset
                print("Using 'weighted' average due to imbalanced dataset.")
                average_method = 'weighted'
            else:
                # Balanced dataset
                print("Using 'macro' average due to balanced dataset.")
                average_method = 'macro'
            
        # F1 scores
        train_f1_scores = f1_score(y_train, y_pred_train, average=average_method)
        test_f1_scores = f1_score(y_test, y_pred_test, average=average_method)

        # Accuracies
        train_accuracies = accuracy_score(y_train, y_pred_train)
        test_accuracies = accuracy_score(y_test, y_pred_test)

        # Precisions
        train_precisions = precision_score(y_train, y_pred_train, average=average_method)
        test_precisions = precision_score(y_test, y_pred_test, average=average_method)

        # Recalls
        train_recalls = recall_score(y_train, y_pred_train, average=average_method)
        test_recalls = recall_score(y_test, y_pred_test, average=average_method)

        # declaring global dataframes
        global classification_evaluation_df, temp_df1

        # temporary dataframe to concatenate later into the main evaluation dataframe
        temp_df1 = pd.DataFrame({"evaluation_df_method": [evaluation_df_method],
                                 'model': [model],
                                 'train_f1': [train_f1_scores],
                                 'test_f1': [test_f1_scores],
                                 'train_acc': [train_accuracies],
                                 'test_acc': [test_accuracies],
                                 'precision_train': [train_precisions],
                                 'precision_test': [test_precisions],
                                 'recall_train': [train_recalls],
                                 'recall_test': [test_recalls]
                                 })
        classification_evaluation_df = pd.concat([classification_evaluation_df, temp_df1]).reset_index(drop=True)

        return classification_evaluation_df  # returning the updated evaluation dataframe

# dataframe of regression metric functions indexed by their names
method_df = pd.DataFrame(data=[root_mean_squared_error, root_mean_squared_log_error, mean_absolute_error, mean_squared_error, mean_squared_log_error],
                         index=["root_mean_squared_error", "root_mean_squared_log_error", "mean_absolute_error", "mean_squared_error", "mean_squared_log_error"])