Gaurav069 committed on
Commit
a8af817
1 Parent(s): 1d68d51

Upload 9 files

Files changed (9)
  1. app.py +535 -0
  2. auto_optimizer.py +317 -0
  3. best_tts.py +110 -0
  4. evaluationer.py +151 -0
  5. feature_selections.py +104 -0
  6. models.py +70 -0
  7. null_value_handling.py +49 -0
  8. outliers.py +233 -0
  9. requirements.txt +10 -0
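Together these files appear to make up a Streamlit-based AutoML app: app.py is the UI entry point, while the other modules provide the model registry (models.py), evaluation helpers (evaluationer.py), null-value handling, outlier handling, feature selection, and the auto-optimizer pipeline. Assuming the standard Streamlit CLI, the app would typically be started with "streamlit run app.py" after installing the packages listed in requirements.txt.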
app.py ADDED
@@ -0,0 +1,535 @@
1
+ # import libraries
2
+ import streamlit as st
3
+ import joblib
4
+ import numpy as np
5
+ import pandas as pd
6
+ from sklearn.model_selection import train_test_split as tts
7
+ import evaluationer,models, null_value_handling
8
+ import auto_optimizer
9
+ from sklearn.experimental import enable_iterative_imputer
10
+ from sklearn.impute import SimpleImputer, IterativeImputer
11
+ # st.set_page_config(layout="wide")
12
+
13
+ st.set_page_config(
14
+ page_title="LazyML App",
15
+ page_icon="🧊",
16
+ initial_sidebar_state="expanded",
17
+ menu_items={
18
+ 'Get Help': 'https://www.extremelycoolapp.com/help',
19
+ 'Report a bug': "https://www.extremelycoolapp.com/bug",
20
+ 'About': "# This is a header. This is an *extremely* cool app!"
21
+ }
22
+ )
23
+
24
+ import streamlit as st
25
+
26
+ # Title with Rainbow Transition Effect and Neon Glow
27
+ html_code = """
28
+ <div class="title-container">
29
+ <h1 class="neon-text">
30
+ LazyML
31
+ </h1>
32
+ </div>
33
+
34
+ <style>
35
+ @keyframes rainbow-text-animation {
36
+ 0% { color: red; }
37
+ 16.67% { color: orange; }
38
+ 33.33% { color: yellow; }
39
+ 50% { color: green; }
40
+ 66.67% { color: blue; }
41
+ 83.33% { color: indigo; }
42
+ 100% { color: violet; }
43
+ }
44
+
45
+ .title-container {
46
+ text-align: center;
47
+ margin: 1em 0;
48
+ padding-bottom: 10px;
49
+ border-bottom: 4 px solid #fcdee9; /* Magenta underline */
50
+ }
51
+
52
+ .neon-text {
53
+ font-family: Arial, sans-serif;
54
+ font-size: 4em;
55
+ margin: 0;
56
+ animation: rainbow-text-animation 5s infinite linear;
57
+ text-shadow: 0 0 5px rgba(255, 255, 255, 0.8),
58
+ 0 0 10px rgba(255, 255, 255, 0.7),
59
+ 0 0 20px rgba(255, 255, 255, 0.6),
60
+ 0 0 40px rgba(255, 0, 255, 0.6),
61
+ 0 0 80px rgba(255, 0, 255, 0.6),
62
+ 0 0 90px rgba(255, 0, 255, 0.6),
63
+ 0 0 100px rgba(255, 0, 255, 0.6),
64
+ 0 0 150px rgba(255, 0, 255, 0.6);
65
+ }
66
+ </style>
67
+ """
68
+
69
+ st.markdown(html_code, unsafe_allow_html=True)
70
+
71
+
72
+ # file uploader
73
+ csv_upload = st.sidebar.file_uploader("Input CSV File for ML modelling", type=['csv'])
74
+ csv_upload2 = st.sidebar.file_uploader("Input CSV File of Test Data Prediction",type = ["csv"])
75
+ test = pd.DataFrame()
76
+ if csv_upload is not None:
77
+ # read the uploaded file into dataframe
78
+ df = pd.read_csv(csv_upload)
79
+
80
+ # saving the dataframe to a CSV file
81
+ df.to_csv('csv_upload.csv', index=False)
82
+ st.write("Train File uploaded successfully. ✅")
83
+
84
+ if csv_upload2 is not None:
85
+ test = pd.read_csv(csv_upload2)
86
+ id_col = st.selectbox("select column for submission i.e, ID",test.columns)
87
+ submission_id = test[id_col]
88
+ # st.write("Train File upl",submission_id)
89
+
90
+
91
+
92
+
93
+ if len(test) >0:
94
+ # saving the test dataframe to a CSV file
95
+ test.to_csv('csv_upload_test.csv', index=False)
96
+ st.write("Test File uploaded successfully. ✅")
97
+
98
+ display_train_data = st.radio("Display Train Data",["Yes","No"],index = 1)
99
+ if display_train_data == "Yes":
100
+ st.dataframe(df.head())
101
+
102
+ if len(test) >0:
103
+ display_test_data = st.radio("Display Test Data",["Yes","No"],index = 1)
104
+ if display_test_data == "Yes":
105
+ st.dataframe(test.head())
106
+
107
+
108
+ if st.radio("Select Supervision Category",["Supervised","Un-Supervised"],index =0) == "Supervised":
109
+
110
+ selected_column = st.selectbox('Select Target column', df.columns, index=(len(df.columns)-1))
111
+
112
+ # Display the selected column
113
+ st.write('You selected:', selected_column)
114
+
115
+ y = df[selected_column]
116
+
117
+ if y.dtype == "O":
118
+ st.write("⚠️⚠️⚠️ Target Column is Object Type ⚠️⚠️⚠️")
119
+ if st.radio("Proceed for Label Encoding ",["Yes","No"],index = 1) == "Yes":
120
+ from sklearn.preprocessing import LabelEncoder
121
+ le = LabelEncoder()
122
+ y= pd.Series(le.fit_transform(y))
123
+ st.write("Label Encoding Completed ✅")
124
+
125
+ if st.radio("Display Target Column",["Yes","No"],index =1) == "Yes":
126
+ st.dataframe(y.head())
127
+
128
+
129
+ select_target_trans = st.radio("Target column Transformation",["Yes","No"],index = 1)
130
+ if select_target_trans == "Yes":
131
+ selected_transformation = st.selectbox("Select Transformation method",["Log Transformation","Power Transformation"])
132
+ if selected_transformation == "Log Transformation":
133
+ if y.min() <=0:
134
+ st.write("Values in target columns are zeroes or negative, please select power transformation")
135
+ else:
136
+ log_selected_transformation = st.selectbox("Select Logarithmic method",["Natural Log base(e)","Log base 10","Log base (2)"])
137
+ if log_selected_transformation == "Natural Log base(e)":
138
+ y = np.log(y)
139
+ st.write("Log base (e) Transformation Completed ✅")
140
+ elif log_selected_transformation == "Log base 10":
141
+ y = np.log10(y)
142
+ st.write("Log base 10 Transformation Completed ✅")
143
+ elif log_selected_transformation == "Log base (2)":
144
+ y = np.log2(y)
145
+ st.write("Log base 2 Transformation Completed ✅")
146
+ elif selected_transformation == "Power Transformation":
147
+ power_selected_transformation = st.selectbox("Select Power Transformation method",["Square Root","Other"])
148
+ if power_selected_transformation == "Square Root":
149
+ y = np.sqrt(y)
150
+ st.write("Square root Transformation Completed ✅")
151
+ elif power_selected_transformation == "Other":
152
+ power_value = st.number_input("Enter Power Value",value=3)
153
+ y = y**(1/power_value)
154
+ st.write(f"power root of {power_value} Transformation Completed ✅")
155
+
156
+ if st.radio("Display Target Column after Transformation",["Yes","No"],index =1) == "Yes":
157
+ st.dataframe(y.head())
158
+ # inverse of transformation
159
+
160
+ X = df.drop(columns = selected_column)
161
+
162
+ if st.radio("Display X-Train Data",["Yes","No"],index =1) == "Yes":
163
+ st.dataframe(X.head())
164
+ if st.radio("Check for duplicate Values",["Yes","No"],index = 1) == "Yes":
165
+ len_duplicates = len(X[X.duplicated()])
166
+ if len_duplicates >0:
167
+ st.write(f"There are {len_duplicates} duplicate values in Train")
168
+ if st.selectbox("Drop Duplicate values",["Yes","No"],index = 1) == "Yes":
169
+ X = X.drop_duplicates()
170
+ st.write("Duplicate values removed ✅")
171
+ else:
172
+ st.write("There are no duplicate values in Train")
173
+ # dropping not important columns
174
+ if st.radio("Drop Un-Important Column(s)",["Yes","No"],index = 1) == "Yes":
175
+ selected_drop_column = st.multiselect('Select columns to be dropped', X.columns)
176
+ X = X.drop(columns = selected_drop_column)
177
+ if len(test) >0:
178
+ test = test.drop(columns = selected_drop_column)
179
+ st.write("Un-Important column(s) Delected ✅")
180
+ st.dataframe(X.head())
181
+
182
+ num_cols = X.select_dtypes(exclude = "O").columns
183
+ cat_cols = X.select_dtypes(include = "O").columns
184
+ st.write("Numerical Columns in Train Data: ", tuple(num_cols))
185
+ st.write("Categorical Columns in Train Data: ", tuple(cat_cols))
186
+
187
+ if st.radio("Select method for ML modelling", ["Manual","Auto Optimized"],index = 0) == "Auto Optimized":
188
+ ml_cat_ao = st.radio("Select Machine Learning Category",["Regression","Classification"],index =0)
189
+
190
+ if ml_cat_ao =="Regression":
191
+ eva = "reg"
192
+ st.write("Select ML algorithm")
193
+ reg_model_name = st.selectbox("select model",models.Regression_models.index)
194
+ reg_model = models.Regression_models.loc[reg_model_name].values[0]
195
+ auto_optimizer.Auto_optimizer(X,y,eva,reg_model)
196
+
197
+ elif ml_cat_ao =="Classification":
198
+ eva = "class"
199
+ st.write("Select ML algorithm")
200
+ class_model_name = st.selectbox("select model",models.Classification_models.index)
201
+ class_model = models.Classification_models.loc[class_model_name].values[0]
202
+ auto_optimizer.Auto_optimizer(X,y,eva,class_model)
203
+
204
+
205
+ else:
206
+ if X.isnull().sum().sum() >0 :
207
+ st.write("⚠️⚠️⚠️ There are missing values in Train Data ⚠️⚠️⚠️")
208
+
209
+ if st.selectbox("Drop null values or Impute",["Drop Null Values","Impute Null Values"],index = 1) == "Drop Null Values":
210
+
211
+ X = X.dropna()
212
+ if len(test) >0:
213
+ st.write("⚠️⚠️⚠️ If choosing drop values, test dataset will also drop those values please choose missing value imputation method befittingly.⚠️⚠️⚠️ ")
214
+ test = test.dropna()
215
+
216
+ clean_num_nvh_df = pd.DataFrame()
217
+ if X[num_cols].isnull().sum().sum() >0:
218
+ st.write("Numerical Columns with Percentage of Null Values: ")
219
+ num_cols_nvh = X[num_cols].isnull().sum()[X[num_cols].isnull().sum()>0].index
220
+ st.dataframe(round(X[num_cols].isnull().sum()[X[num_cols].isnull().sum()>0]/len(X)*100,2))
221
+ dict_1= {}
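+ # let the user assign each numeric column that still has nulls to one of the available handling
+ # methods; assigned columns are removed from the remaining pool until none are left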
222
+ for nvh_method in null_value_handling.null_value_handling_method_num_cols :
223
+
224
+ selected_nvh_num_cols = st.multiselect(f'method:- \"{nvh_method}\" for Numerical columns', num_cols_nvh,)
225
+ dict_1[nvh_method] = selected_nvh_num_cols
226
+
227
+ num_cols_nvh = set(num_cols_nvh) - set(selected_nvh_num_cols)
228
+ if len(num_cols_nvh) ==0:
229
+ break
230
+ num_nvh_df = pd.DataFrame(data=dict_1.values(), index=dict_1.keys())
231
+
232
+ clean_num_nvh_df = num_nvh_df.T[num_nvh_df.T.count()[num_nvh_df.T.count()>0].index]
233
+
234
+ st.write("Methods for Numerical columns null value handling",clean_num_nvh_df )
235
+
236
+ if len(test) >0:
237
+ if test[num_cols].isnull().sum().sum() >0:
238
+ test_num_cols_nvh = test[num_cols].isnull().sum()[test[num_cols].isnull().sum()>0].index
239
+ st.write("Columns with Null Value in Test",test_num_cols_nvh)
240
+ test[num_cols] = IterativeImputer(max_iter = 200,random_state= 42).fit_transform(test[num_cols])
241
+
242
+
243
+ clean_num_nvh_df_cat = pd.DataFrame()
244
+ if X[cat_cols].isnull().sum().sum() >0:
245
+ st.write("Categorical Columns with Percentage of Null Values: ")
246
+ cat_cols_nvh = X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0].index
247
+ st.dataframe(round(X[cat_cols].isnull().sum()[X[cat_cols].isnull().sum()>0]/len(X)*100,2))
248
+
249
+ dict_2= {}
250
+ for nvh_method in null_value_handling.null_value_handling_method_cat_cols :
251
+ st.write("dsff",nvh_method)
252
+
253
+ selected_nvh_num_cols = st.multiselect(f'method:- \"{nvh_method}\" for Categorical columns', cat_cols_nvh,)
254
+ dict_2[nvh_method] = selected_nvh_num_cols
255
+
256
+ cat_cols_nvh = set(cat_cols_nvh) - set(selected_nvh_num_cols)
257
+ if len(cat_cols_nvh) ==0:
258
+ break
259
+ num_nvh_df_cat = pd.DataFrame(data=dict_2.values(), index=dict_2.keys())
260
+ clean_num_nvh_df_cat = num_nvh_df_cat.T
261
+ st.write("Methods for Categorical columns null value handling",[clean_num_nvh_df_cat])
262
+
263
+ if len(test) >0:
264
+ if test[cat_cols].isnull().sum().sum() >0:
265
+ test_num_cols_nvh_cat = test[cat_cols].isnull().sum()[test[cat_cols].isnull().sum()>0].index
266
+ st.write("sdgs",test_num_cols_nvh_cat)
267
+ test[cat_cols] = SimpleImputer(strategy = "most_frequent").fit_transform(test[cat_cols])
268
+
269
+
270
+ null_value_handling.null_handling(X,clean_num_nvh_df,clean_num_nvh_df_cat)
271
+ st.write("X Data after Null value handling", X.head())
272
+
273
+ new_df = pd.concat([X,y[X.index]],axis = 1)
274
+
275
+ csv = new_df.to_csv(index = False)
276
+ if st.radio("Download Null Value Handled DataFrame as CSV File ? ",["Yes","No"],index = 1) == "Yes":
277
+ st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
278
+
279
+ ord_enc_cols = []
280
+
281
+ if len(cat_cols) == 0:
282
+ st.write("No Categorical Columns in Train")
283
+ else:
284
+ st.write("Select Columns for Ordinal Encoding")
285
+ for column in cat_cols:
286
+
287
+ selected = st.checkbox(column)
288
+ if selected:
289
+ st.write(f"No. of Unique value in {column} column are", X[column].nunique())
290
+ ord_enc_cols.append(column)
291
+ ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
292
+ ohe_enc_cols = list(ohe_enc_cols)
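+ # any categorical column not ticked above for ordinal encoding is one-hot encoded instead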
293
+ if len(ord_enc_cols)>0:
294
+ st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
295
+ if len(ohe_enc_cols)>0:
296
+ st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
297
+
298
+ if len(ord_enc_cols)>0:
299
+ if st.radio("proceed for ordinal encoding",["Yes","No"],index = 1) == "Yes":
300
+ ordinal_order_vals = []
301
+
302
+ for column in ord_enc_cols:
303
+ unique_vals = X[column].unique()
304
+ # st.write(f"No. of Unique value in {column} column are", len(unique_vals))
305
+
306
+ ordered_unique_vals = st.multiselect("Select values in order for Ordinal Encoding",unique_vals,unique_vals)
307
+ ordinal_order_vals.append(ordered_unique_vals)
308
+
309
+ st.write("order of values for Ordinal Encoding",tuple(ordinal_order_vals))
310
+ # import ordinal encoder
311
+ from sklearn.preprocessing import OrdinalEncoder
312
+ ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
313
+ X[ord_enc_cols] = ord.fit_transform(X[ord_enc_cols])
314
+ if len(test) >0:
315
+ test[ord_enc_cols] = ord.transform(test[ord_enc_cols])
316
+ st.write("DataFrame after Ordinal Encoding",X.head())
317
+ st.write("Ordinal Encoding Completed ✅")
318
+
319
+ if len(ohe_enc_cols)>0:
320
+ if st.radio("proceed for OnehotEncoding ",["Yes","No"],index = 1) == "Yes": # import one hot encoder
321
+ from sklearn.preprocessing import OneHotEncoder
322
+ ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
323
+ pd.options.mode.chained_assignment = None
324
+ X.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(X[ohe_enc_cols])
325
+ X.drop(columns = ohe_enc_cols,inplace = True)
326
+ if len(test) >0:
327
+ test.loc[:, ohe.get_feature_names_out()] = ohe.transform(test[ohe_enc_cols])
328
+ test.drop(columns = ohe_enc_cols,inplace = True)
329
+
330
+ pd.options.mode.chained_assignment = 'warn'
331
+
332
+ st.write("DataFrame after One Hot Encoding",X.head())
333
+ st.write("OneHot Encoding Completed ✅")
334
+
335
+ new_df = pd.concat([X,y],axis = 1)
336
+
337
+ csv = new_df.to_csv(index = False)
338
+ if st.radio("Download Encoded DataFrame as CSV File ? ",["Yes","No"],index = 1) == "Yes":
339
+ st.download_button(label="Download Ordinal Encoded CSV File",data=csv,file_name='Encoded_DataFrame.csv',mime='text/csv')
340
+
341
+
342
+ random_state = st.number_input("Enter Random_state",max_value=100,min_value=1,value=42)
343
+ test_size = st.number_input("Enter test_size",max_value=0.99, min_value = 0.01,value =0.2)
344
+ if st.radio("select Train Validation Split Method",
345
+ [f"Train_Test_split, Default (Random_state = {random_state},Test_size = {test_size})",
346
+ "KFoldCV, Default (CV = 5)"], index = 0)== f"Train_Test_split, Default (Random_state = {random_state},Test_size = {test_size})":
347
+ ttsmethod = "Train_Test_split"
348
+ else:
349
+ ttsmethod = "KFoldCV"
350
+ st.write('You selected:', ttsmethod)
351
+ if ttsmethod == "Train_Test_split":
352
+ X_train,X_Val,y_train,y_val = tts(X,y[X.index],random_state = random_state,test_size = test_size)
353
+
354
+
355
+ st.write('X-Training Data shape:', X_train.shape)
356
+ st.write('X-Validation Data shape:', X_Val.shape)
357
+
358
+ ml_cat = st.radio("Select Machine Learning Category",["Regression","Classification"],index =0)
359
+
360
+ if ml_cat =="Regression":
361
+ method_name_selector = st.selectbox("Select Error Evaluation Method",evaluationer.method_df.index,index = 0)
362
+
363
+ method = evaluationer.method_df.loc[method_name_selector].values[0]
364
+ reg_algorithm = []
365
+ selected_options = []
366
+
367
+ for option in models.Regression_models.index:
368
+ selected = st.checkbox(option)
369
+ if selected:
370
+ selected_options.append(option)
371
+
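+ # Hyperparameter editing: the selected model's get_params() dict is loaded into a DataFrame and
+ # each parameter is rendered as a Streamlit widget by type (selectbox for booleans, number_input
+ # for numerics, text_input for the rest); the edited values are applied back via set_params().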
372
+ param = models.Regression_models.loc[option][0].get_params()
373
+ Temp_parameter = pd.DataFrame(data=param.values(), index=param.keys())
374
+ Temp_parameter_transposed = Temp_parameter.T
375
+ parameter = pd.DataFrame(data=param.values(), index=param.keys())
376
+ def is_boolean(val):
377
+ return isinstance(val, bool)
378
+
379
+ # identify hyperparameters whose default value is boolean
380
+ bool_cols= parameter[parameter[0].apply(is_boolean)].index
381
+ param_transposed = parameter.T
382
+ # st.write("hrweurgesj",param_transposed.loc[:, bool_cols])
383
+ # st.write("bool_cols",bool_cols)
384
+ remaining_cols = set(param_transposed.columns) - set(bool_cols)
385
+ remaining_cols = tuple(remaining_cols)
386
+ # st.write("rem_Cols",remaining_cols)
387
+
388
+ for col in remaining_cols:
389
+ param_transposed[col] = pd.to_numeric(param_transposed[col],errors="ignore")
390
+ cat_cols = param_transposed.select_dtypes(include = ["O"]).T.index.to_list()
391
+ num_cols = set(remaining_cols) - set(cat_cols)
392
+ cat_cols = set(cat_cols) - set(bool_cols)
393
+ num_cols = tuple(num_cols)
394
+ # st.write("sdsafdsd",num_cols)
395
+ for i in num_cols:
396
+ param_transposed[i] = st.number_input(f"input \"{i}\" value \n{option}",value = parameter.T[i].values[0])
397
+ for i in cat_cols:
398
+ param_transposed[i] = st.text_input(f"input \"{i}\" value \n{option}",value = parameter.T[i].values[0])
399
+ for i in bool_cols:
400
+ st.write("default value to insert",Temp_parameter_transposed[i].values[0])
401
+ param_transposed[i] = st.selectbox(f"input \"{i}\" value \n{option}",[False, True], index=Temp_parameter_transposed[i].values[0])
402
+
403
+ inv_param = param_transposed.T
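+ # parameters whose edited value is NaN (e.g. defaults of None) are dropped so that set_params()
+ # only receives explicitly set values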
404
+ new_param = inv_param.dropna().loc[:,0].to_dict()
405
+ # st.write("asad",new_param)
406
+ models.Regression_models.loc[option][0].set_params(**new_param)
407
+ a = models.Regression_models.loc[option][0].get_params()
408
+ reg_algorithm.append(models.Regression_models.loc[option][0])
409
+ if st.button("Train Regression Model"):
410
+ for algorithm in reg_algorithm:
411
+ evaluationer.evaluation(f"{algorithm} baseline",X_train,X_Val,y_train,y_val,algorithm,method,"reg")
412
+ st.write("Regression Model Trained Successfully",evaluationer.reg_evaluation_df)
413
+ if len(test)>0:
414
+ if st.radio("Predict",["Yes","No"],index = 1) =="Yes":
415
+
416
+ if len(evaluationer.reg_evaluation_df) >0:
417
+ a = st.number_input("select index of best algorithm for test prediction",min_value = 0,max_value =len(evaluationer.reg_evaluation_df) -1, value = len(evaluationer.reg_evaluation_df) -1)
418
+
419
+ test_prediction = evaluationer.reg_evaluation_df.loc[a,"model"].predict(test)
420
+ if select_target_trans == "Yes":
421
+ if selected_transformation == "Log Transformation":
422
+ if log_selected_transformation == "Natural Log base(e)":
423
+ test_prediction = np.exp(test_prediction)
424
+ st.write("Natural Log base(e) Inverse Transformation Completed ✅")
425
+ elif log_selected_transformation == "Log base 10":
426
+ test_prediction = np.power(10,test_prediction)
427
+ st.write("Log base 10 Inverse Transformation Completed ✅")
428
+ elif log_selected_transformation == "Log base (2)":
429
+ test_prediction = np.power(2,test_prediction)
430
+ st.write("Log base 2 Inverse Transformation Completed ✅")
431
+ elif selected_transformation == "Power Transformation":
432
+ if power_selected_transformation == "Square Root":
433
+ test_prediction = np.power(test_prediction,2)
434
+ st.write("Square root Inverse Transformation Completed ✅")
435
+ elif power_selected_transformation == "Other":
436
+ test_prediction = test_prediction**(power_value)
437
+ st.write(f"power root of {power_value} Inverse Transformation Completed ✅")
438
+ submission_file = pd.DataFrame(index = [submission_id],data = test_prediction,columns = [selected_column])
439
+ st.write("Sample of Prediction File",submission_file.head())
440
+ csv_prediction = submission_file.to_csv()
441
+ if st.radio("Download Prediction File as CSV File ? ",["Yes","No"],index = 1) == "Yes":
442
+ st.download_button(label="Download Prediction CSV File",data=csv_prediction,file_name='prediction.csv',mime='text/csv')
443
+
444
+
445
+
446
+
447
+ if ml_cat =="Classification":
448
+
449
+
450
+
451
+ cla_algorithm = []
452
+ selected_options = []
453
+
454
+ for option in models.Classification_models.index:
455
+ selected = st.checkbox(option)
456
+ if selected:
457
+ selected_options.append(option)
458
+
459
+ param = models.Classification_models.loc[option][0].get_params()
460
+
461
+
462
+ parameter = pd.DataFrame(data=param.values(), index=param.keys())
463
+ Temp_parameter = parameter.copy()
464
+ Temp_parameter_transposed = (Temp_parameter.T).copy()
465
+ def is_boolean(val):
466
+ return isinstance(val, bool)
467
+
468
+ # identify hyperparameters whose default value is boolean
469
+ bool_cols= parameter[parameter[0].apply(is_boolean)].index
470
+ param_transposed = parameter.T
471
+ st.write("bool_cols",bool_cols)
472
+ remaining_cols = set(param_transposed.columns) - set(bool_cols)
473
+ remaining_cols = tuple(remaining_cols)
474
+ st.write("rem_Cols",remaining_cols)
475
+
476
+ for col in remaining_cols:
477
+ param_transposed[col] = pd.to_numeric(param_transposed[col],errors="ignore")
478
+ cat_cols = param_transposed.select_dtypes(include = ["O"]).T.index.to_list()
479
+ num_cols = set(remaining_cols) - set(cat_cols)
480
+ num_cols = tuple(num_cols)
481
+ st.write("sdsafdsd",num_cols)
482
+ for i in num_cols:
483
+ param_transposed[i] = st.number_input(f"input \"{i}\" value \n{option}",value = parameter.T[i].values[0])
484
+ for i in cat_cols:
485
+ param_transposed[i] = st.text_input(f"input \"{i}\" value \n{option}",value = parameter.T[i].values[0])
486
+ for i in bool_cols:
487
+ st.write("default value to insert",Temp_parameter_transposed[i].values[0])
488
+ param_transposed[i] = st.selectbox(f"input \"{i}\" value \n{option}",[False,True], index=Temp_parameter_transposed[i].values[0])
489
+ inv_param = param_transposed.T
490
+ new_param = inv_param.dropna().loc[:,0].to_dict()
491
+ st.write("asad",new_param)
492
+ models.Classification_models.loc[option][0].set_params(**new_param)
493
+ a = models.Classification_models.loc[option][0].get_params()
494
+ cla_algorithm.append(models.Classification_models.loc[option][0])
495
+ # st.write("sada",reg_algorithm/)
496
+ if st.button("Train Regression Model"):
497
+ method = None
498
+ for algorithm in cla_algorithm:
499
+ evaluationer.evaluation(f"{algorithm} baseline",X_train,X_Val,y_train,y_val,algorithm,method,eva ="class")
500
+ st.write("Regression Model Trained Successfully",evaluationer.classification_evaluation_df)
501
+
502
+ if len(test)>0:
503
+ if st.radio("Predict",["Yes","No"],index = 1) =="Yes":
504
+ if len(evaluationer.classification_evaluation_df) >0:
505
+ a = st.number_input("select index of best algorithm for test prediction",min_value = 0,max_value =len(evaluationer.classification_evaluation_df) -1, value = len(evaluationer.classification_evaluation_df) -1)
506
+
507
+ test_prediction = evaluationer.classification_evaluation_df.loc[a,"model"].predict(test)
508
+ if select_target_trans == "Yes":
509
+ if selected_transformation == "Log Transformation":
510
+ if log_selected_transformation == "Natural Log base(e)":
511
+ test_prediction = np.exp(test_prediction)
512
+ st.write("Natural Log base(e) Inverse Transformation Completed ✅")
513
+ elif log_selected_transformation == "Log base 10":
514
+ test_prediction = np.power(10,test_prediction)
515
+ st.write("Log base 10 Inverse Transformation Completed ✅")
516
+ elif log_selected_transformation == "Log base (2)":
517
+ test_prediction = np.power(2,test_prediction)
518
+ st.write("Log base 2 Inverse Transformation Completed ✅")
519
+ elif selected_transformation == "Power Transformation":
520
+ if power_selected_transformation == "Square Root":
521
+ test_prediction = np.power(test_prediction,2)
522
+ st.write("Square root Inverse Transformation Completed ✅")
523
+ elif power_selected_transformation == "Other":
524
+ test_prediction = test_prediction**(power_value)
525
+ st.write(f"power root of {power_value} Inverse Transformation Completed ✅")
526
+
527
+ submission_file = pd.DataFrame(index = [submission_id],data = test_prediction,columns = [selected_column])
528
+ st.write("Sample of Prediction File",submission_file.head())
529
+ csv_prediction = submission_file.to_csv()
530
+ if st.radio("Download Prediction File as CSV File ? ",["Yes","No"],index = 1) == "Yes":
531
+ st.download_button(label="Download Prediction CSV File",data=csv_prediction,file_name='prediction.csv',mime='text/csv')
532
+
533
+
534
+
535
+
auto_optimizer.py ADDED
@@ -0,0 +1,317 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import streamlit as st
4
+ from sklearn.impute import KNNImputer,SimpleImputer,IterativeImputer
5
+ import best_tts, evaluationer,models
6
+ from sklearn.experimental import enable_iterative_imputer
7
+ from sklearn.model_selection import train_test_split as tts
8
+ from collections import Counter
9
+ #root_mean_squared_error
10
+ from sklearn.metrics import root_mean_squared_error
11
+ import seaborn as sns
12
+ import matplotlib.pyplot as plt
13
+ import outliers,best_tts
14
+ import feature_selections
15
+ def Auto_optimizer(X,y,eva,model,test= None):
16
+ pass
17
+ num_cols = X.select_dtypes(exclude = "O").columns
18
+ cat_cols = X.select_dtypes(include = "O").columns
19
+ st.write("Num_cols",tuple(num_cols))
20
+ st.write("cat_cols",tuple(cat_cols))
21
+
22
+ # check for Duplicate and drop duplicated in X
23
+
24
+ if len(X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40]) >0:
25
+ X = X.drop(columns = X.isnull().sum()[(X.isnull().sum()/len(X)*100) >40].index)
26
+ st.write("Columns with more than 40% null values removed")
27
+ # st.write("csx",X)
28
+
29
+ len_null = X.isnull().sum().sum()
30
+
31
+ st.write(f"There are {len_null} null values in Train")
32
+
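+ # Build several candidate copies of X, each treated with a different missing-value strategy
+ # (KNN, mean, median, most-frequent and iterative imputation, plus variants where rows with
+ # missing categorical values are dropped); each candidate is evaluated below and the
+ # best-scoring one is kept as X.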
33
+ knn_imputed_num_X = X.copy()
34
+ si_mean_imputed_num_X = X.copy()
35
+ # st.write("sf",si_mean_imputed_num_X)
36
+ si_median_imputed_num_X = X.copy()
37
+ si_most_frequent_imputed_num_X = X.copy()
38
+ iter_imputed_num_X = X.copy()
39
+ knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
40
+ si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
41
+ si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
42
+ si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
43
+ iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
44
+ if len_null >0:
45
+
46
+ if X[num_cols].isnull().sum().sum() >0:
47
+
48
+ knn_imputer = KNNImputer(n_neighbors = 5)
49
+ knn_imputed_num_X[num_cols] = knn_imputer.fit_transform(knn_imputed_num_X[num_cols])
50
+ si_imputer = SimpleImputer(strategy = "mean")
51
+ si_mean_imputed_num_X[num_cols] = si_imputer.fit_transform(si_mean_imputed_num_X[num_cols])
52
+ si_imputer = SimpleImputer(strategy = "median")
53
+ si_median_imputed_num_X[num_cols] = si_imputer.fit_transform(si_median_imputed_num_X[num_cols])
54
+ si_imputer = SimpleImputer(strategy = "most_frequent")
55
+ si_most_frequent_imputed_num_X[num_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[num_cols])
56
+ iter_imputer = IterativeImputer(max_iter = 200,random_state= 42)
57
+ iter_imputed_num_X[num_cols] = iter_imputer.fit_transform(iter_imputed_num_X[num_cols])
58
+ knn_imputed_X_cat_dropped = knn_imputed_num_X.copy()
59
+ si_mean_imputed_X_cat_dropped = si_mean_imputed_num_X.copy()
60
+ si_median_imputed_X_cat_dropped = si_median_imputed_num_X.copy()
61
+ si_most_frequent_imputed_X_cat_dropped = si_most_frequent_imputed_num_X.copy()
62
+ iter_imputed_X_cat_dropped = iter_imputed_num_X.copy()
63
+
64
+ if X[cat_cols].isnull().sum().sum() >0:
65
+ # treating missing values in categorical columns
66
+ # st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
67
+ si_imputer = SimpleImputer(strategy = "most_frequent")
68
+
69
+ knn_imputed_num_X[cat_cols] = si_imputer.fit_transform(knn_imputed_num_X[cat_cols])
70
+ si_imputer = SimpleImputer(strategy = "most_frequent")
71
+ si_mean_imputed_num_X.loc[:,cat_cols] = si_imputer.fit_transform(si_mean_imputed_num_X.loc[:,cat_cols])
72
+ # st.write("si_mean_imputed_num_X",si_mean_imputed_num_X)
73
+ si_median_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_median_imputed_num_X[cat_cols])
74
+ si_most_frequent_imputed_num_X[cat_cols] = si_imputer.fit_transform(si_most_frequent_imputed_num_X[cat_cols])
75
+ iter_imputed_num_X[cat_cols] = si_imputer.fit_transform(iter_imputed_num_X[cat_cols])
76
+
77
+ knn_imputed_X_cat_dropped = knn_imputed_X_cat_dropped.dropna()
78
+ si_mean_imputed_X_cat_dropped =si_mean_imputed_X_cat_dropped.dropna()
79
+ si_median_imputed_X_cat_dropped =si_median_imputed_X_cat_dropped.dropna()
80
+ si_most_frequent_imputed_X_cat_dropped =si_most_frequent_imputed_X_cat_dropped.dropna()
81
+ iter_imputed_X_cat_dropped =iter_imputed_X_cat_dropped.dropna()
82
+ st.write("sdds",knn_imputed_num_X)
83
+ st.write("sddssd",knn_imputed_X_cat_dropped)
84
+
85
+ miss_val_dropped_X = X.dropna()
86
+
87
+ # list of dataframes
88
+
89
+ list_X_after_missing_values= [knn_imputed_num_X,
90
+ si_mean_imputed_num_X,
91
+ si_median_imputed_num_X,
92
+ si_most_frequent_imputed_num_X,
93
+ iter_imputed_num_X,
94
+ knn_imputed_X_cat_dropped,
95
+ si_mean_imputed_X_cat_dropped,
96
+ si_median_imputed_X_cat_dropped,
97
+ si_most_frequent_imputed_X_cat_dropped,
98
+ iter_imputed_X_cat_dropped,
99
+ miss_val_dropped_X]
100
+ list_X_after_missing_values_names= ["knn_imputed_num_X",
101
+ "si_mean_imputed_num_X",
102
+ "si_median_imputed_num_X",
103
+ "si_most_frequent_imputed_num_X",
104
+ "iter_imputed_num_X",
105
+ "knn_imputed_X_cat_dropped",
106
+ "si_mean_imputed_X_cat_dropped",
107
+ "si_median_imputed_X_cat_dropped",
108
+ "si_most_frequent_imputed_X_cat_dropped",
109
+ "iter_imputed_X_cat_dropped",
110
+ "miss_val_dropped_X"]
111
+ # st.write("si_most_frequent_imputed_num_X",si_most_frequent_imputed_num_X,)
112
+ ord_enc_cols = []
113
+ ohe_enc_cols = []
114
+
115
+ if len(cat_cols) == 0:
116
+ st.write("No Categorical Columns in Train")
117
+ else:
118
+ st.write("Select Columns for Ordinal Encoding")
119
+ for column in cat_cols:
120
+ selected = st.checkbox(column)
121
+ if selected:
122
+ st.write(f"No. of Unique value in {column} column are", X[column].nunique())
123
+ ord_enc_cols.append(column)
124
+ ohe_enc_cols = set(cat_cols) -set(ord_enc_cols)
125
+ ohe_enc_cols = list(ohe_enc_cols)
126
+
127
+ if len(ord_enc_cols)>0:
128
+ st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
129
+ if len(ohe_enc_cols)>0:
130
+ st.write("one hot encoded columns" ,tuple(ohe_enc_cols))
131
+
132
+ if len(ord_enc_cols)>0:
133
+
134
+ ordinal_order_vals = []
135
+
136
+ for column in ord_enc_cols:
137
+ unique_vals = X.dropna()[column].unique()
138
+ # st.write(f"No. of Unique value in {column} column are", len(unique_vals))
139
+
140
+ ordered_unique_vals = st.multiselect("Select values in order for Ordinal Encoding",unique_vals,unique_vals)
141
+ ordinal_order_vals.append(ordered_unique_vals)
142
+
143
+ st.write("order of values for Ordinal Encoding",tuple(ordinal_order_vals))
144
+
145
+ if len_null > 0:
146
+
147
+ for df_name, df in enumerate(list_X_after_missing_values):
148
+ # st.write(f"{list_X_after_missing_values_names[df_name]}",df)
149
+ from sklearn.preprocessing import OrdinalEncoder
150
+ ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
151
+ df[ord_enc_cols] = ord.fit_transform(df[ord_enc_cols])
152
+ # st.write(f"{list_X_after_missing_values_names[df_name]}",df)
153
+ else :
154
+ from sklearn.preprocessing import OrdinalEncoder
155
+ ord = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
156
+ X[ord_enc_cols] = ord.fit_transform(X[ord_enc_cols])
157
+
158
+ st.write("Ordinal Encoding Completed ✅")
159
+
160
+ if len(ohe_enc_cols)>0:
161
+ if len_null > 0:
162
+ for df_name, df in enumerate(list_X_after_missing_values):
163
+ from sklearn.preprocessing import OneHotEncoder
164
+ ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
165
+ pd.options.mode.chained_assignment = None
166
+ df.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(df[ohe_enc_cols])
167
+ df.drop(columns = ohe_enc_cols,inplace = True)
168
+ pd.options.mode.chained_assignment = 'warn'
169
+ else:
170
+ from sklearn.preprocessing import OneHotEncoder
171
+ ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
172
+ pd.options.mode.chained_assignment = None
173
+ X.loc[:, ohe.get_feature_names_out()] = ohe.fit_transform(X[ohe_enc_cols])
174
+ X.drop(columns = ohe_enc_cols,inplace = True)
175
+ pd.options.mode.chained_assignment = 'warn'
176
+ st.write("OneHot Encoding Completed ✅")
177
+
178
+
179
+ if len(ohe_enc_cols)>0:
180
+ if len_null > 0:
181
+ for name,df in enumerate(list_X_after_missing_values):
182
+ X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
183
+ # best_tts.best_tts(df,y,model,eva)
184
+ evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
185
+ else:
186
+ X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =.2 ,random_state = 42)
187
+ # best_tts.best_tts(X,y,model,eva)
188
+
189
+ evaluationer.evaluation(f"baseline_model",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
190
+
191
+ if len_null >0:
192
+ for name,df in enumerate(list_X_after_missing_values):
193
+ X_train,X_test,y_train,y_test = tts(df,y[df.index],test_size =.2 ,random_state = 42)
194
+ st.write(f"this is test{list_X_after_missing_values_names[name]}",X_train.isnull().sum().sum())
195
+ evaluationer.evaluation(f"{list_X_after_missing_values_names[name]}",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
196
+
197
+ if eva == "class":
198
+ counter = Counter(y)
199
+ total = sum(counter.values())
200
+ balance_ratio = {cls: count / total for cls, count in counter.items()}
201
+ num_classes = len(balance_ratio)
202
+ ideal_ratio = 1 / num_classes
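+ # heuristic: treat the dataset as balanced when every class share is within 10% of the ideal
+ # 1/num_classes share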
203
+ a = all(abs(ratio - ideal_ratio) <= 0.1 * ideal_ratio for ratio in balance_ratio.values())
204
+ if a:
205
+ st.write("Balanced Dataset ✅")
206
+ st.write("Using accuracy for Evaluation")
207
+ value = "test_acc"
208
+ else:
209
+ st.write("Unbalanced Dataset ❌")
210
+ st.write("Using F1 score for Evaluation")
211
+ value = "test_f1"
212
+ st.write("SFdfs",evaluationer.classification_evaluation_df)
213
+ evaluationer.classification_evaluation_df.sort_values(by = value,inplace= True)
214
+ name = str(evaluationer.classification_evaluation_df.iloc[-1,0])
215
+ st.write("df name",evaluationer.classification_evaluation_df.iloc[-1,0])
216
+ if len_null >0:
217
+ b = list_X_after_missing_values_names.index(name)
218
+ st.write("Sdffsf",b)
219
+ st.write("df",list_X_after_missing_values[b])
220
+ X = list_X_after_missing_values[b]
221
+ if eva == "reg":
222
+ st.write("Using R2 score for Evaluation",evaluationer.reg_evaluation_df)
223
+ value = "test_r2"
224
+ evaluationer.reg_evaluation_df.sort_values(by = value,inplace= True)
225
+ st.write("adfsdf",evaluationer.reg_evaluation_df.iloc[-1,0])
226
+ name = str(evaluationer.reg_evaluation_df.iloc[-1,0])
227
+ st.write("Sdffsf",name)
228
+ if len_null >0:
229
+ b = list_X_after_missing_values_names.index(name)
230
+ st.write("Sdffsf",b)
231
+ st.write("df",list_X_after_missing_values[b])
232
+ X = list_X_after_missing_values[b]
233
+
234
+
235
+ # Create a figure and axes
236
+ num_plots = len(num_cols)
237
+ cols = 2 # Number of columns in the subplot grid
238
+ rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed
239
+
240
+ fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
241
+
242
+ # Flatten the axes array for easy iteration, and remove any excess subplots
243
+ axes = axes.flatten()
244
+ for ax in axes[num_plots:]:
245
+ fig.delaxes(ax)
246
+
247
+ for i, col in enumerate(num_cols):
248
+ sns.histplot(X[col], ax=axes[i],kde = True,color=sns.color_palette('Oranges', as_cmap=True)(0.7))
249
+ axes[i].set_title(col)
250
+
251
+ # Adjust layout
252
+ plt.tight_layout()
253
+
254
+ # Show the plot in Streamlit
255
+ st.pyplot(fig)
256
+
257
+ # Create a figure and axes
258
+ num_plots = len(num_cols)
259
+ cols = 3 # Number of columns in the subplot grid
260
+ rows = (num_plots + cols - 1) // cols # Calculate the number of rows needed
261
+
262
+ fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows))
263
+
264
+ # Flatten the axes array for easy iteration, and remove any excess subplots
265
+ axes = axes.flatten()
266
+ for ax in axes[num_plots:]:
267
+ fig.delaxes(ax)
268
+
269
+ for i, col in enumerate(num_cols):
270
+ sns.boxplot(y=X[col], ax=axes[i],palette="magma")
271
+ axes[i].set_title(col)
272
+
273
+ # Adjust layout
274
+ plt.tight_layout()
275
+
276
+ # Show the plot in Streamlit
277
+ st.pyplot(fig)
278
+
279
+ outlier_cols = st.multiselect("De-Select columns for Detecting Outliers", num_cols,default= list(num_cols))
280
+
281
+ st.write("Checking for Outliers")
282
+ outliers_df_X,outlier_indexes = outliers.detect_outliers(X,list(outlier_cols))
283
+ st.write("Outliers in Dataframe Summary",outliers_df_X)
284
+ st.write("Columns for Outliers handling",tuple(outliers_df_X["columns name"]))
285
+
286
+ select_outlier_cols = st.multiselect("Select columns for Outlier Handling",tuple(outliers_df_X["columns name"]),default =tuple(outliers_df_X["columns name"]))
287
+ resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva = "reg")
288
+ st.write("outlier handling with methods",resultant)
289
+ st.write("Best method with outlier handling",resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])
290
+ try :
291
+ st.write("Best X Data Index No.",outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0]))
292
+
293
+ st.write("Best X DataFrame after outlier handling ",outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
294
+ X = outlier_handled_df[outlier_handled_df_name.index(resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])]
295
+ except :
296
+ "evaluation of baseline model is better continuing with baseline model"
297
+
298
+ # result_df ,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
299
+ X_train,X_test,y_train,y_test = tts(X,y[X.index],random_state = 42,test_size = 0.2)
300
+ st.write("result_df",X)
301
+ st.write("fsdfs",X_train)
302
+ result_df_1 = feature_selections.feature_selection(X_train,X_test,y_train,y_test,model,alpha = 0.05)
303
+ st.write("sdchsvdgj",result_df_1)
304
+
305
+
306
+
307
+
308
+
309
+
310
+
311
+
312
+
313
+
314
+
315
+
316
+
317
+
best_tts.py ADDED
@@ -0,0 +1,110 @@
1
+ from sklearn.model_selection import train_test_split as tts
2
+ from sklearn.metrics import r2_score,f1_score,accuracy_score, root_mean_squared_error
3
+ import evaluationer
4
+ import pandas as pd
5
+ import numpy as np
6
+
7
+ def best_tts(X,y,model,eva):
8
+ # def best_tts(X,y,test_size_range = range(10,25),random_state_range =range(1,100), stratify=None,shuffle=True,model = LinearRegression(),method = root_mean_squared_error,eva = "reg"):
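+ # Searches test_size values 0.10-0.24 and random_state values 1-99 for the train/test split with
+ # the highest test R2 (eva == "reg") or test accuracy (eva == "class"), then re-runs
+ # evaluationer.evaluation() on that best split and returns the evaluation dataframe and the split.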
9
+
10
+ if eva == "reg":
11
+
12
+ test_r2_,test_r2_ts,test_r2_rs = 0,0,0
13
+ for k in range(10,25):
14
+ i = k/100
15
+ for j in range(1,100):
16
+ X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size = i, random_state = j,)
17
+
18
+ model = model
19
+ model.fit(X_train,y_train) # model fitting
20
+ y_pred_train = model.predict(X_train) # model prediction for train
21
+ y_pred_test = model.predict(X_test) # model prediction for test
22
+
23
+ train_r2 = r2_score(y_train, y_pred_train) # evaluating r2 score for train
24
+
25
+
26
+ test_r2 = r2_score(y_test, y_pred_test) # evaluating r2 score for test
27
+ if test_r2_ < test_r2:
28
+ test_r2_ = test_r2
29
+ test_r2_ts = i
30
+ test_r2_rs = j
31
+
32
+ n_r_train, n_c_train = X_train.shape # getting no of rows and columns of train data
33
+ n_r_test, n_c_test = X_test.shape # getting no of rows and columns of test data
34
+
35
+ adj_r2_train = 1 - ((1 - train_r2)*(n_r_train - 1)/ (n_r_train - n_c_train - 1)) # evaluating adjusted r2 score for train
36
+
37
+
38
+ adj_r2_test = 1 - ((1 - test_r2)*(n_r_test - 1)/ (n_r_test - n_c_test - 1)) # evaluating adjusted r2 score for test
39
+
40
+
41
+ train_evaluation = root_mean_squared_error(y_train, y_pred_train) # evaluating train error
42
+
43
+
44
+ test_evaluation = root_mean_squared_error(y_test, y_pred_test) # evaluating test error
45
+
46
+ X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size = test_r2_ts, random_state = test_r2_rs)
47
+ evaluationer.evaluation("best_tts",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
48
+ return evaluationer.reg_evaluation_df,X_train,X_test,y_train,y_test
49
+
50
+
51
+
52
+ elif eva == "class":
53
+ global test_accuracies_,test_accuracies_ts,test_accuracies_rs
54
+ test_accuracies_,test_accuracies_ts,test_accuracies_rs = 0,0,0
55
+
56
+ for k in range(10,25):
57
+ i = k/100
58
+ for j in range(1,100):
59
+ X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size = i, random_state = j)
60
+ model = model
61
+ model.fit(X_train,y_train) # model fitting
62
+ y_pred_train = model.predict(X_train) # model prediction for train
63
+ y_pred_test = model.predict(X_test) # model prediction for test
64
+ # y_pred_proba_train= model.predict_proba(X_train)
65
+ # y_pred_proba_test= model.predict_proba(X_test)
66
+
67
+
68
+ unique_classes = np.unique(y_train)
69
+
70
+ # Determine the average method
71
+ if len(unique_classes) == 2:
72
+ # Binary classification
73
+ # print("Using 'binary' average for binary classification.")
74
+ average_method = 'binary'
75
+ elif len(unique_classes)!=2:
76
+ # Determine the distribution of the target column
77
+ class_counts = np.bincount(y_train)
78
+
79
+ # Check if the dataset is imbalanced
80
+ imbalance_ratio = max(class_counts) / min(class_counts)
81
+
82
+ if imbalance_ratio > 1.5:
83
+ # Imbalanced dataset
84
+ # print("Using 'weighted' average due to imbalanced dataset.")
85
+ average_method = 'weighted'
86
+ else:
87
+ # Balanced dataset
88
+ # print("Using 'macro' average due to balanced dataset.")
89
+ average_method = 'macro'
90
+ # F1 scores
91
+ train_f1_scores = (f1_score(y_train, y_pred_train,average=average_method))
92
+
93
+
94
+ test_f1_scores = (f1_score(y_test, y_pred_test,average=average_method))
95
+
96
+ # Accuracies
97
+ train_accuracies = (accuracy_score(y_train, y_pred_train))
98
+
99
+ test_accuracies = (accuracy_score(y_test, y_pred_test))
100
+ if test_accuracies_ <test_accuracies:
101
+ test_accuracies_,test_accuracies_ts,test_accuracies_rs =test_accuracies, i,j
102
+ X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size = test_accuracies_ts, random_state = test_accuracies_rs)
103
+ print(f"test_size = {test_accuracies_ts}, random_state = {test_accuracies_rs}")
104
+
105
+ evaluationer.evaluation("best_tts",X_train,X_test,y_train,y_test,model,root_mean_squared_error,eva)
106
+
107
+
108
+ return evaluationer.classification_evaluation_df,X_train,X_test,y_train,y_test
109
+
110
+
evaluationer.py ADDED
@@ -0,0 +1,151 @@
1
+ # importing libraries
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.metrics import root_mean_squared_error,r2_score,mean_squared_error,root_mean_squared_log_error,mean_absolute_error,mean_squared_log_error
6
+ from sklearn.metrics import f1_score, accuracy_score, precision_score,recall_score, average_precision_score
7
+ # creating a class for evaluation
8
+
9
+ reg_evaluation_df = pd.DataFrame({"evaluation_df_method" :[],
10
+ "model": [],# model displays regression model
11
+ "method": [],# method display evaluation metrics used
12
+ "train_r2": [],# train r2 shows train R2 score
13
+ "test_r2": [],# test r2 shows test R2 Score
14
+ "adjusted_r2_train": [],# adjusted_r2_train shows adjusted r2 score for train
15
+ "adjusted_r2_test": [],# adjusted_r2_test shows adjusted r2 score for test
16
+ "train_evaluation": [],# train_evaluation shows train evaluation score by used method
17
+ "test_evaluation" : []# test_evaluation shows test evaluation score by used method
18
+ })
19
+
20
+ classification_evaluation_df = pd.DataFrame({"evaluation_df_method" :[],
21
+ 'model': [],
22
+ 'train_f1': [],
23
+ 'test_f1': [],
24
+ 'train_acc': [],
25
+ 'test_acc': [],
26
+ 'precision_train': [],
27
+ 'precision_test': [],
28
+ 'recall_train': [],
29
+ 'recall_test': []
30
+ })
31
+
32
+ # function for evaluating dataframe
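+ # evaluation(): fits `model` on the train split, predicts on both splits, and appends the scores
+ # to the module-level reg_evaluation_df (eva == "reg", with `method` as the error metric) or
+ # classification_evaluation_df (eva == "class", where the F1/precision/recall averaging method is
+ # chosen from the class distribution).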
33
+ def evaluation(evaluation_df_method,X_train,X_test,y_train,y_test,model,method,eva):# input parameters from train_test_split , model and method for evaluation.
34
+ global y_pred_train,y_pred_test,y_pred_proba_train,y_pred_proba_test
35
+ model = model
36
+ model.fit(X_train,y_train) # model fitting
37
+ y_pred_train = model.predict(X_train) # model prediction for train
38
+ y_pred_test = model.predict(X_test) # model prediction for test
39
+
40
+ if eva == "reg":
41
+
42
+ train_r2 = r2_score(y_train, y_pred_train) # evaluating r2 score for train
43
+ test_r2 = r2_score(y_test, y_pred_test) # evaluating r2 score for test
44
+
45
+ n_r_train, n_c_train = X_train.shape # getting no of rows and columns of train data
46
+ n_r_test, n_c_test = X_test.shape # getting no of rows and columns of test data
47
+
48
+ adj_r2_train = 1 - ((1 - train_r2)*(n_r_train - 1)/ (n_r_train - n_c_train - 1)) # evaluating adjusted r2 score for train
49
+ adj_r2_test = 1 - ((1 - test_r2)*(n_r_test - 1)/ (n_r_test - n_c_test - 1)) # evaluating adjusted r2 score for test
50
+
51
+ train_evaluation = method(y_train, y_pred_train) # evaluating train error
52
+ test_evaluation = method(y_test, y_pred_test) # evaluating test error
53
+
54
+ if method == root_mean_squared_error:
55
+ a = "root_mean_squared_error"
56
+ elif method ==root_mean_squared_log_error:
57
+ a = "root_mean_squared_log_error"
58
+ elif method == mean_absolute_error:
59
+ a = "mean_absolute_error"
60
+ elif method == mean_squared_error:
61
+ a = "mean_squared_error"
62
+ elif method == mean_squared_log_error:
63
+ a = "mean_squared_log_error"
64
+
65
+ # declaring global dataframes
66
+ global reg_evaluation_df,temp_df
67
+
68
+ # creating temporary dataframe for concating in later into main evaluation dataframe
69
+ temp_df = pd.DataFrame({"evaluation_df_method" :[evaluation_df_method],
70
+ "model": [model],
71
+ "method": [a],
72
+ "train_r2": [train_r2],
73
+ "test_r2": [test_r2],
74
+ "adjusted_r2_train": [adj_r2_train],
75
+ "adjusted_r2_test": [adj_r2_test],
76
+ "train_evaluation": [train_evaluation],
77
+ "test_evaluation" : [test_evaluation]
78
+ })
79
+ reg_evaluation_df = pd.concat([reg_evaluation_df,temp_df]).reset_index(drop = True)
80
+
81
+
82
+
83
+
84
+ # return reg_evaluation_df # returning evaluation_df
85
+
86
+ elif eva == "class":
87
+
88
+ # y_pred_proba_train= model.predict_proba(X_train)
89
+ # y_pred_proba_test= model.predict_proba(X_test)
90
+
91
+ unique_classes = np.unique(y_train)
92
+
93
+ # Determine the average method
94
+ if len(unique_classes) == 2:
95
+ # Binary classification
96
+ print("Using 'binary' average for binary classification.")
97
+ average_method = 'binary'
98
+ elif len(unique_classes)!=2:
99
+ # Determine the distribution of the target column
100
+ class_counts = np.bincount(y_train)
101
+
102
+ # Check if the dataset is imbalanced
103
+ imbalance_ratio = max(class_counts) / min(class_counts)
104
+
105
+ if imbalance_ratio > 1.5:
106
+ # Imbalanced dataset
107
+ print("Using 'weighted' average due to imbalanced dataset.")
108
+ average_method = 'weighted'
109
+ else:
110
+ # Balanced dataset
111
+ print("Using 'macro' average due to balanced dataset.")
112
+ average_method = 'macro'
113
+
114
+ # F1 scores
115
+ train_f1_scores = (f1_score(y_train, y_pred_train,average=average_method))
116
+ test_f1_scores = (f1_score(y_test, y_pred_test,average=average_method))
117
+
118
+ # Accuracies
119
+ train_accuracies = (accuracy_score(y_train, y_pred_train))
120
+ test_accuracies = (accuracy_score(y_test, y_pred_test))
121
+
122
+ # Precisions
123
+ train_precisions = (precision_score(y_train, y_pred_train,average=average_method))
124
+ test_precisions = (precision_score(y_test, y_pred_test,average=average_method))
125
+
126
+ # Recalls
127
+ train_recalls = (recall_score(y_train, y_pred_train,average=average_method))
128
+ test_recalls = (recall_score(y_test, y_pred_test,average=average_method))
129
+
130
+ # declaring global dataframes
131
+ global classification_evaluation_df,temp_df1
132
+
133
+ # creating temporary dataframe for concating in later into main evaluation dataframe
134
+ temp_df1 = pd.DataFrame({"evaluation_df_method" :[evaluation_df_method],
135
+ 'model': [model],
136
+ 'train_f1': [train_f1_scores],
137
+ 'test_f1': [test_f1_scores],
138
+ 'train_acc': [train_accuracies],
139
+ 'test_acc': [test_accuracies],
140
+ 'precision_train': [train_precisions],
141
+ 'precision_test': [test_precisions],
142
+ 'recall_train': [train_recalls],
143
+ 'recall_test': [test_recalls]
144
+ })
145
+ classification_evaluation_df = pd.concat([classification_evaluation_df, temp_df1]).reset_index(drop = True)
146
+
147
+ return classification_evaluation_df # returning evaluation_df
148
+
149
+ global method_df
150
+ method_df = pd.DataFrame(data = [root_mean_squared_error, root_mean_squared_log_error,mean_absolute_error,mean_squared_error,mean_squared_log_error],
151
+ index = ["root_mean_squared_error", "root_mean_squared_log_error","mean_absolute_error","mean_squared_error","mean_squared_log_error"])
feature_selections.py ADDED
@@ -0,0 +1,104 @@
1
+ from sklearn.feature_selection import mutual_info_regression
2
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
3
+ from sklearn.linear_model import Lasso
4
+ from sklearn.linear_model import LogisticRegression
5
+ from sklearn.metrics import roc_curve, auc
6
+ import statsmodels.api as sm
7
+ import pandas as pd
8
+ import numpy as np
9
+ import evaluationer
10
+ import streamlit as st
11
+ # import root_mean_squared_error
12
+ from sklearn.metrics import root_mean_squared_error
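+ # feature_selection(): builds several candidate drop-lists (OLS p-values > 0.05, near-zero OLS
+ # coefficients, zero mutual information, pairwise correlation > 0.5, VIF > 10, zero Lasso
+ # coefficients), evaluates the regression model with each list of columns dropped, and returns
+ # the accumulated evaluation dataframe.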
13
+ def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
14
+
15
+ st.write("dvsdv",y_train)
16
+ st.write("dvfssdv",X_train)
17
+
18
+ model = sm.OLS(y_train, sm.add_constant(X_train))
19
+ model_fit = model.fit()
20
+ pval_cols = model_fit.pvalues[model_fit.pvalues > 0.05].index.tolist()
21
+ coef_cols = model_fit.params[abs(model_fit.params) < 0.001].index.tolist()
22
+ pval_and_coef_cols = list(set(coef_cols) | set(pval_cols))
23
+
24
+ mi_scores = mutual_info_regression(X_train, y_train)
25
+ mi = pd.DataFrame()
26
+
27
+ mi["col_name"] = X_train.columns
28
+ mi["mi_score"] = mi_scores
29
+
30
+ mi_cols = mi[mi.mi_score ==0].col_name.values.tolist()
31
+
32
+ corr = X_train.corr()
33
+
34
+ corru= pd.DataFrame(np.triu(corr),columns = corr.columns , index = corr.index)
35
+ corr_u_cols = corru[corru[(corru > 0.5 )& (corru <1)].any()].index.tolist()
36
+
37
+ corrl= pd.DataFrame(np.tril(corr),columns = corr.columns , index = corr.index)
38
+ corr_l_cols = corrl[corrl[(corrl > 0.5 )& (corrl <1)].any()].index.tolist()
39
+
40
+ X_new_vif = sm.add_constant(X_train)
41
+ # Calculating VIF
42
+ vif = pd.DataFrame()
43
+ vif["variables"] = X_new_vif.columns
44
+ vif["VIF"] = [variance_inflation_factor(X_new_vif.values, i) for i in range(X_new_vif.shape[1])]
45
+ st.write("gdfgdsdsdfad",vif)
46
+ if len(vif[vif["variables"] == "const"]) == 1:
47
+ vif = vif.drop(index = (vif[vif["variables"] == "const"].index[0]))
48
+ st.write("gdfgdsad",vif)
49
+ # drop const in vif cols
50
+ # vif_cols = X_new_vif.drop(columns = "const")
51
+ vif_cols = vif[vif.VIF >10].variables.tolist()
52
+
53
+
54
+ # lasso
55
+ if alpha == "best":
56
+
57
+ lasso_len = []
58
+ alpha_i = []
59
+ for i in range(1,1000,5):
60
+ j = i/10000
61
+
62
+ model_lasso = Lasso(alpha=j)
63
+ model_lasso.fit(X_train, y_train)
64
+ col_df = pd.DataFrame({
65
+ "col_name": X_train.columns,
66
+ "lasso_coef": model_lasso.coef_
67
+ })
68
+ a = len(col_df[col_df.lasso_coef ==0])
69
+ lasso_len.append(a)
70
+ alpha_i.append(j)
71
+ for i in zip(lasso_len,alpha_i):
72
+ print(i)
73
+ input_alpha = float(input("enter alpha"))
74
+ model_lasso = Lasso(alpha=input_alpha)
75
+ model_lasso.fit(X_train, y_train)
76
+ col_df = pd.DataFrame({
77
+ "col_name": X_train.columns,
78
+ "lasso_coef": model_lasso.coef_
79
+ })
80
+
81
+ lasso_cols =col_df[col_df.lasso_coef ==0].col_name.tolist()
82
+ else:
83
+ model_lasso = Lasso(alpha=alpha)
84
+ model_lasso.fit(X_train, y_train)
85
+ col_df = pd.DataFrame({
86
+ "col_name": X_train.columns,
87
+ "lasso_coef": model_lasso.coef_
88
+ })
89
+
90
+ lasso_cols =col_df[col_df.lasso_coef ==0].col_name.tolist()
91
+
92
+ feature_cols = [pval_cols,coef_cols,pval_and_coef_cols,mi_cols,corr_u_cols,corr_l_cols,vif_cols,lasso_cols]
93
+
94
+ for col in feature_cols:
95
+
96
+ try:
97
+ st.write(f"{col}",X_train.drop(columns = col))
98
+ except:
99
+ st.write(f"error IN col")
100
+ feature_cols_name = ["pval_cols","coef_cols","pval_and_coef_cols","mi_cols","corr_u_cols","corr_l_cols","vif_cols","lasso_cols"]
101
+ st.write("feature_cols", vif_cols)
102
+ for i,j in enumerate(feature_cols):
103
+ evaluationer.evaluation(f"{feature_cols_name[i]} dropped" ,X_train.drop(columns = j),X_test.drop(columns = j),y_train,y_test,model_reg,method = root_mean_squared_error,eva = "reg")
104
+ return evaluationer.reg_evaluation_df
models.py ADDED
@@ -0,0 +1,70 @@
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ # import algorithms for classification
+ from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
+ from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
+ from sklearn.neighbors import KNeighborsClassifier
+ from sklearn.tree import DecisionTreeClassifier
+ from sklearn.svm import SVC
+ from xgboost import XGBClassifier, XGBRFClassifier
+ from sklearn.neural_network import MLPClassifier
+ from lightgbm import LGBMClassifier
+ from sklearn.naive_bayes import MultinomialNB, CategoricalNB
+ # import algorithms for regression
+ from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
+ from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
+ from sklearn.neighbors import KNeighborsRegressor
+ from sklearn.tree import DecisionTreeRegressor
+ from sklearn.svm import SVR
+ from xgboost import XGBRegressor, XGBRFRegressor
+ from sklearn.neural_network import MLPRegressor
+ from lightgbm import LGBMRegressor
+ from sklearn.naive_bayes import GaussianNB
+
+ # dictionary mapping algorithm names to classifier instances
+ algos_class = {
+     "Logistic Regression": LogisticRegression(),
+     "SGD Classifier": SGDClassifier(),
+     "Ridge Classifier": RidgeClassifier(),
+     "Random Forest Classifier": RandomForestClassifier(),
+     "AdaBoost Classifier": AdaBoostClassifier(),
+     "Gradient Boosting Classifier": GradientBoostingClassifier(),
+     "Hist Gradient Boosting Classifier": HistGradientBoostingClassifier(),
+     "K Neighbors Classifier": KNeighborsClassifier(),
+     "Decision Tree Classifier": DecisionTreeClassifier(),
+     "SVC": SVC(),
+     "XGB Classifier": XGBClassifier(),
+     "XGBRF Classifier": XGBRFClassifier(),
+     "MLP Classifier": MLPClassifier(),
+     "LGBM Classifier": LGBMClassifier(),
+     "Multinomial Naive Bayes": MultinomialNB(),
+     "Categorical Naive Bayes": CategoricalNB()}
+
+ # dictionary mapping algorithm names to regressor instances
+ algos_reg = {
+     "Linear Regression": LinearRegression(),
+     "SGD Regressor": SGDRegressor(),
+     "Ridge Regressor": Ridge(),
+     "Lasso Regressor": Lasso(),
+     "ElasticNet Regressor": ElasticNet(),
+     "Random Forest Regressor": RandomForestRegressor(),
+     "AdaBoost Regressor": AdaBoostRegressor(),
+     "Gradient Boosting Regressor": GradientBoostingRegressor(),
+     "Hist Gradient Boosting Regressor": HistGradientBoostingRegressor(),
+     "K Neighbors Regressor": KNeighborsRegressor(),
+     "Decision Tree Regressor": DecisionTreeRegressor(),
+     "SVR": SVR(),
+     "XGB Regressor": XGBRegressor(),
+     "XGBRF Regressor": XGBRFRegressor(),
+     "MLP Regressor": MLPRegressor(),
+     "LGBM Regressor": LGBMRegressor(),
+     "Gaussian Naive Bayes": GaussianNB()}
+
+ # lookup dataframes: the index is the algorithm name, the single column holds the estimator object
+ Classification_models = pd.DataFrame(data=algos_class.values(), index=algos_class.keys())
+
+ Regression_models = pd.DataFrame(data=algos_reg.values(), index=algos_reg.keys())
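
For orientation, a minimal sketch of how these lookup tables can be read elsewhere in the app; the chosen algorithm name is an illustrative assumption:

```python
# Hypothetical lookup: resolve a display name to its estimator instance.
import models

algo_name = "Random Forest Regressor"                    # assumed user selection
estimator = models.Regression_models.loc[algo_name].values[0]
print(type(estimator).__name__)                          # RandomForestRegressor
```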
null_value_handling.py ADDED
@@ -0,0 +1,49 @@
+ import pandas as pd
+ import streamlit as st
+ # import simple imputer, iterative imputer, knn imputer
+ from sklearn.model_selection import train_test_split as tts
+ from sklearn.experimental import enable_iterative_imputer
+ from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
+ import evaluationer
+ # import label, one-hot and ordinal encoders
+ from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
+
+ # helpers for null-value imputation, categorical-column encoding and evaluation
+
+ null_value_handling_method_num_cols = ["KNN Imputed", "SI Mean Imputed", "SI Median Imputed", "SI Most Frequent Imputed", "Iter Imputed"]
+ null_value_handling_method_cat_cols = ["SI Most Frequent Imputed (categorical)"]
+
+ # imputer objects for numerical columns, keyed by method name
+ dict1 = {"KNN Imputed": KNNImputer(n_neighbors=5), "SI Mean Imputed": SimpleImputer(strategy="mean"), "SI Median Imputed": SimpleImputer(strategy="median"),
+          "SI Most Frequent Imputed": SimpleImputer(strategy="most_frequent"), "Iter Imputed": IterativeImputer(max_iter=200, random_state=42)}
+
+ # imputer objects for categorical columns, keyed by method name
+ dict2 = {"SI Most Frequent Imputed (categorical)": SimpleImputer(strategy="most_frequent")}
+
+ # lookup dataframes built from dict1 and dict2
+ num_nvh_method_df = pd.DataFrame(data=dict1.values(), index=dict1.keys())
+ cat_nvh_method_df = pd.DataFrame(data=dict2.values(), index=dict2.keys())
+
+ num_imputed_dict = {"KNN Imputed": [], "SI Mean Imputed": [], "SI Median Imputed": [], "SI Most Frequent Imputed": [], "Iter Imputed": []}
+
+ cat_imputed_dict = {"SI Most Frequent Imputed (categorical)": [], "Iter Imputed": []}
+
+ num_imputed_df = pd.DataFrame(data=num_imputed_dict.values(), index=num_imputed_dict.keys())
+
+ cat_imputed_df = pd.DataFrame(data=cat_imputed_dict.values(), index=cat_imputed_dict.keys())
+
+ final_df = []
+
+ def null_handling(X, clean_num_nvh_df, clean_num_nvh_df_cat):
+     # the column names of both inputs are imputation method names, e.g. "KNN Imputed", "SI Mean Imputed", ...
+     num_nvh_method = clean_num_nvh_df.columns
+     cat_nvh_method = clean_num_nvh_df_cat.columns
+     for method in num_nvh_method:
+         X[clean_num_nvh_df[method].dropna().values] = num_nvh_method_df.loc[method].values[0].fit_transform(X[clean_num_nvh_df[method].dropna().values])
+
+     for method in cat_nvh_method:
+         X[clean_num_nvh_df_cat[method].dropna().values] = cat_nvh_method_df.loc[method].values[0].fit_transform(X[clean_num_nvh_df_cat[method].dropna().values])
+
+     final_df = X
+
+     return final_df
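
A minimal sketch of how `null_handling` might be called; the toy frame and the choice of imputation methods are assumptions for illustration (in app.py these mappings are built from the user's sidebar selections):

```python
# Hypothetical inputs: impute "age" with the median and "embarked" with the mode.
import numpy as np
import pandas as pd

import null_value_handling

X = pd.DataFrame({"age": [22.0, np.nan, 35.0],
                  "fare": [7.25, 71.3, 8.05],
                  "embarked": ["S", np.nan, "C"]})

# each column name is a method; its rows list the dataframe columns to impute with it
clean_num_nvh_df = pd.DataFrame({"SI Median Imputed": ["age"]})
clean_num_nvh_df_cat = pd.DataFrame({"SI Most Frequent Imputed (categorical)": ["embarked"]})

X_imputed = null_value_handling.null_handling(X, clean_num_nvh_df, clean_num_nvh_df_cat)
print(X_imputed.isna().sum())   # all zeros if imputation succeeded
```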
outliers.py ADDED
@@ -0,0 +1,233 @@
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from scipy.stats.mstats import winsorize
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
+ from sklearn.metrics import root_mean_squared_error
+ from scipy.stats import yeojohnson
+ import evaluationer
+ from sklearn.model_selection import train_test_split as tts
+
+ def detect_outliers(df, num_cols):
+     global outlier_df, zscore_cols, outlier_indexes, iqr_cols
+     outlier_df = pd.DataFrame({"method": [], "columns name": [], "upper limit": [],
+                                "lower limit": [], "no of Rows": [], "percentage outlier": []})
+     # fall back to all numerical columns when no usable column list is supplied
+     if type(num_cols) == list:
+         if len(num_cols) == 0:
+             num_cols = df.select_dtypes(exclude="object").columns.tolist()
+     else:
+         if num_cols.tolist() is None:
+             num_cols = df.select_dtypes(exclude="object").columns.tolist()
+     zscore_cols = []
+     iqr_cols = []
+     outlier_indexes = []
+     # roughly symmetric columns use the z-score rule, skewed columns use the IQR rule
+     for col in num_cols:
+         skewness = df[col].skew()
+         if -0.5 <= skewness <= 0.5:
+             method = "zscore"
+             zscore_cols.append(col)
+         else:
+             method = "iqr"
+             iqr_cols.append(col)
+
+     if len(zscore_cols) > 0:
+         for col in zscore_cols:
+             mean = df[col].mean()
+             std = df[col].std()
+             ul = mean + (3 * std)
+             ll = mean - (3 * std)
+             mask = (df[col] < ll) | (df[col] > ul)
+             temp = df[mask]
+
+             Zscore_index = temp.index.tolist()
+             outlier_indexes.extend(Zscore_index)
+
+             if len(temp) > 0:
+                 temp_df = pd.DataFrame({"method": ["ZScore"],
+                                         "columns name": [col],
+                                         "upper limit": [round(ul, 2)],
+                                         "lower limit": [round(ll, 2)],
+                                         "no of Rows": [len(temp)],
+                                         "percentage outlier": [round(len(temp) * 100 / len(df), 2)]})
+
+                 outlier_df = pd.concat([outlier_df, temp_df]).reset_index(drop=True)
+     else:
+         print("No columns for Zscore method")
+
+     if len(iqr_cols) > 0:
+         for col in iqr_cols:
+             q3 = df[col].quantile(.75)
+             q1 = df[col].quantile(.25)
+             IQR = q3 - q1
+             ul = q3 + 1.5 * IQR
+             ll = q1 - 1.5 * IQR
+             mask = (df[col] < ll) | (df[col] > ul)
+             temp = df[mask]
+
+             IQR_index = temp.index.tolist()
+             outlier_indexes.extend(IQR_index)
+
+             if len(temp) > 0:
+                 temp_df1 = pd.DataFrame({"method": ["IQR"],
+                                          "columns name": [col],
+                                          "upper limit": [round(ul, 2)],
+                                          "lower limit": [round(ll, 2)],
+                                          "no of Rows": [len(temp)],
+                                          "percentage outlier": [round((len(temp) * 100 / len(df)), 2)]
+                                          })
+
+                 outlier_df = pd.concat([outlier_df, temp_df1]).reset_index(drop=True)
+     else:
+         print("No columns for IQR method")
+
+     outlier_indexes = list(set(outlier_indexes))
+
+     return outlier_df, outlier_indexes
+
+
+ def outlier_handling(df, y, model, outlier_indexes=[], outlier_cols=None, method=root_mean_squared_error, test_size=0.2, random_state=42, eva="reg"):
+     num_col = df.select_dtypes(exclude="O").columns
+
+     global outliers_dropped_df, log_transformed_df, sqrt_transformed_df, yeo_johnson_transformed_df, rank_transformed_df
+     global std_scaler_df, winsorize_transformed_df, inverse_log_transformed_winsorize_df, inverse_sqrt_transformed_winsorize_df, minmaxscaler_df
+     if eva == "reg":
+         if len(outlier_indexes) == 0:
+             print("no outlier indexes passed")
+             outliers_dropped_df = df.copy()
+         else:
+             outliers_dropped_df = df.drop(index=outlier_indexes)
+
+         if outlier_cols != None:
+             # log / sqrt transforms (and their winsorized inverses) only apply to non-negative columns
+             if df[outlier_cols][df[outlier_cols] < 0].sum().sum() == 0:
+                 log_transformed_df = df.copy()
+                 log_transformed_df[outlier_cols] = np.log(log_transformed_df[outlier_cols] + 1e-5)
+                 sqrt_transformed_df = df.copy()
+                 sqrt_transformed_df[outlier_cols] = np.sqrt(sqrt_transformed_df[outlier_cols] + 1e-5)
+                 inverse_log_transformed_winsorize_df = log_transformed_df.copy()
+                 inverse_sqrt_transformed_winsorize_df = sqrt_transformed_df.copy()
+                 for column in outlier_cols:
+                     inverse_log_transformed_winsorize_df[column] = np.exp(winsorize(inverse_log_transformed_winsorize_df[column], limits=[0.05, 0.05]))
+                     inverse_sqrt_transformed_winsorize_df[column] = (winsorize(inverse_sqrt_transformed_winsorize_df[column], limits=[0.05, 0.05])) ** 2
+             else:
+                 print("df has values less than zero")
+
+             std_scaler_df = df.copy()
+             std_scaler_df[outlier_cols] = StandardScaler().fit_transform(std_scaler_df[outlier_cols])
+
+             minmaxscaler_df = df.copy()
+             minmaxscaler_df[outlier_cols] = MinMaxScaler().fit_transform(minmaxscaler_df[outlier_cols])
+
+             yeo_johnson_transformed_df = df.copy()
+             for column in outlier_cols:
+                 try:
+                     yeo_johnson_transformed_df[column], lambda_ = yeojohnson(yeo_johnson_transformed_df[column])
+                 except Exception:
+                     yeo_johnson_transformed_df[column] = yeo_johnson_transformed_df[column]
+                     print(f"Yeo-Johnson transformation failed for column '{column}'. Original data used.")
+
+             rank_transformed_df = df.copy()
+             rank_transformed_df[outlier_cols] = rank_transformed_df[outlier_cols].rank()
+             winsorize_transformed_df = df.copy()
+             for column in outlier_cols:
+                 winsorize_transformed_df[column] = winsorize(winsorize_transformed_df[column], limits=[0.05, 0.05])
+
+         else:
+             # no explicit outlier columns supplied: work on all numerical columns
+             if df[num_col][df[num_col] < 0].sum().sum() == 0:
+                 log_transformed_df = df.copy()
+                 log_transformed_df[num_col] = np.log(log_transformed_df[num_col] + 1e-5)
+                 sqrt_transformed_df = df.copy()
+                 sqrt_transformed_df[num_col] = np.sqrt(sqrt_transformed_df[num_col] + 1e-5)
+                 inverse_log_transformed_winsorize_df = log_transformed_df.copy()
+                 inverse_sqrt_transformed_winsorize_df = sqrt_transformed_df.copy()
+                 for column in num_col:
+                     inverse_log_transformed_winsorize_df[column] = np.exp(winsorize(inverse_log_transformed_winsorize_df[column], limits=[0.05, 0.05]))
+                     inverse_sqrt_transformed_winsorize_df[column] = (winsorize(inverse_sqrt_transformed_winsorize_df[column], limits=[0.05, 0.05])) ** 2
+             else:
+                 print("df has values less than zero")
+
+             std_scaler_df = df.copy()
+             std_scaler_df[num_col] = StandardScaler().fit_transform(std_scaler_df[num_col])
+
+             minmaxscaler_df = df.copy()
+             minmaxscaler_df[num_col] = MinMaxScaler().fit_transform(minmaxscaler_df[num_col])
+
+             yeo_johnson_transformed_df = df.copy()
+             for column in num_col:
+                 try:
+                     yeo_johnson_transformed_df[column], lambda_ = yeojohnson(yeo_johnson_transformed_df[column])
+                 except Exception:
+                     yeo_johnson_transformed_df[column] = yeo_johnson_transformed_df[column]
+                     print(f"Yeo-Johnson transformation failed for column '{column}'. Original data used.")
+
+             rank_transformed_df = df.copy()
+             rank_transformed_df[num_col] = rank_transformed_df[num_col].rank()
+             winsorize_transformed_df = df.copy()
+             for column in num_col:
+                 winsorize_transformed_df[column] = winsorize(winsorize_transformed_df[column], limits=[0.05, 0.05])
+
+         if (df[num_col][df[num_col] < 0].sum().sum() == 0):
+             outlier_handled_df = [std_scaler_df, minmaxscaler_df, outliers_dropped_df, log_transformed_df, sqrt_transformed_df, yeo_johnson_transformed_df,
+                                   rank_transformed_df, winsorize_transformed_df, inverse_log_transformed_winsorize_df, inverse_sqrt_transformed_winsorize_df]
+
+             outlier_handled_df_name = ["std_scaler_df", "minmaxscaler_df", "outliers_dropped_df", "log_transformed_df", "sqrt_transformed_df", "yeo_johnson_transformed_df", "rank_transformed_df", "winsorize_transformed_df",
+                                        "inverse_log_transformed_winsorize_df", "inverse_sqrt_transformed_winsorize_df"]
+         elif outlier_cols is not None and df[outlier_cols][df[outlier_cols] < 0].sum().sum() == 0:
+             outlier_handled_df = [std_scaler_df, minmaxscaler_df, outliers_dropped_df, log_transformed_df, sqrt_transformed_df, yeo_johnson_transformed_df,
+                                   rank_transformed_df, winsorize_transformed_df, inverse_log_transformed_winsorize_df, inverse_sqrt_transformed_winsorize_df]
+
+             outlier_handled_df_name = ["std_scaler_df", "minmaxscaler_df", "outliers_dropped_df", "log_transformed_df", "sqrt_transformed_df", "yeo_johnson_transformed_df", "rank_transformed_df",
+                                        "winsorize_transformed_df", "inverse_log_transformed_winsorize_df", "inverse_sqrt_transformed_winsorize_df"]
+
+         else:
+             # with negative values only the transformations that tolerate them are compared
+             outlier_handled_df = [std_scaler_df, minmaxscaler_df, outliers_dropped_df, yeo_johnson_transformed_df, rank_transformed_df, winsorize_transformed_df]
+
+             outlier_handled_df_name = ["std_scaler_df", "minmaxscaler_df", "outliers_dropped_df", "yeo_johnson_transformed_df", "rank_transformed_df", "winsorize_transformed_df"]
+
+         # evaluate the model on every candidate dataframe and collect the scores
+         for j, i in enumerate(outlier_handled_df):
+             X_train, X_test, y_train, y_test = tts(i, y[i.index], test_size=test_size, random_state=random_state)
+             evaluationer.evaluation(f"{outlier_handled_df_name[j]}", X_train, X_test, y_train, y_test, model, root_mean_squared_error, eva)
+
+         # returning evaluation dataframe alongside the candidate dataframes and their names
+         return evaluationer.reg_evaluation_df, outlier_handled_df, outlier_handled_df_name
+     elif eva == "class":
+         std_scaler_df = df.copy()
+         std_scaler_df.loc[:, :] = StandardScaler().fit_transform(std_scaler_df.loc[:, :])
+
+         minmaxscaler_df = df.copy()
+         minmaxscaler_df.loc[:, :] = MinMaxScaler().fit_transform(minmaxscaler_df.loc[:, :])
+
+         rank_transformed_df = df.copy()
+         rank_transformed_df = rank_transformed_df.rank()
+
+         outlier_handled_df = [std_scaler_df, minmaxscaler_df, rank_transformed_df]
+         outlier_handled_df_name = ["std_scaler_df", "minmaxscaler_df", "rank_transformed_df"]
+
+         for j, i in enumerate(outlier_handled_df):
+             X_train, X_test, y_train, y_test = tts(i, y[i.index], test_size=test_size, random_state=random_state)
+             evaluationer.evaluation(f"{outlier_handled_df_name[j]}", X_train, X_test, y_train, y_test, model, root_mean_squared_error, eva="class")
+         # returning evaluation dataframe
+         return evaluationer.classification_evaluation_df, outlier_handled_df, outlier_handled_df_name
+
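
A rough usage sketch tying the two helpers together; the CSV path, target name, and the `LinearRegression` baseline are assumptions for illustration:

```python
# Hypothetical driver for outliers.py; file name, target column and model are assumed.
import pandas as pd
from sklearn.linear_model import LinearRegression

import outliers

df = pd.read_csv("train.csv")                                   # assumed input file
X = df.drop(columns="price").select_dtypes("number").dropna()   # assumed numeric features
y = df.loc[X.index, "price"]                                    # assumed numeric target

# flag outlier rows per column (z-score for symmetric columns, IQR for skewed ones)
outlier_summary, outlier_indexes = outliers.detect_outliers(X, X.columns.tolist())
print(outlier_summary)

# compare scaling / transformation strategies with the baseline regressor
eval_df, handled_dfs, handled_names = outliers.outlier_handling(
    X, y, LinearRegression(),
    outlier_indexes=outlier_indexes,
    outlier_cols=outlier_summary["columns name"].unique().tolist(),
    eva="reg",
)
print(eval_df)
```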
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ streamlit==1.34.0
+ joblib==1.4.2
+ numpy==1.26.4
+ pandas==2.2.2
+ scikit-learn==1.4.2
+ seaborn==0.13.2
+ matplotlib==3.9.0
+ xgboost==2.0.3
+ lightgbm==4.3.0
+ statsmodels==0.14.2
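
Note: the `root_mean_squared_error` import used in outliers.py and feature_selections.py requires scikit-learn 1.4 or newer, which the `scikit-learn==1.4.2` pin satisfies. A typical local setup would be `pip install -r requirements.txt` followed by `streamlit run app.py`.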