Upload 12 files
- app.py +65 -20
- auto_optimizer.py +25 -12
- feature_selections.py +58 -5
- requirements.txt +11 -14
app.py
CHANGED
@@ -8,7 +8,7 @@ import evaluationer,models, null_value_handling
 import auto_optimizer
 from sklearn.experimental import enable_iterative_imputer
 from sklearn.impute import SimpleImputer, IterativeImputer
-import eda
+import eda,outliers
 # st.set_page_config(layout="wide")
 
 st.set_page_config(
@@ -86,6 +86,8 @@ html_code = """
 st.markdown(html_code, unsafe_allow_html=True)
 st.divider()
 
+
+
 st.markdown(
     """
     <style>
@@ -137,6 +139,37 @@ if (len(sep) ==0):
     sep = ","
 csv_upload2 = st.sidebar.file_uploader("Input CSV File of Test Data Prediction",type = ["csv"])
 
+if csv_upload is None:
+    st.title("LazyML")
+
+    st.header("Welcome to LazyML – your go-to app for effortless machine learning!")
+
+    st.subheader("Overview")
+    st.write("""
+    LazyML is designed to make machine learning accessible to everyone, regardless of their technical expertise. Whether you're a seasoned data scientist or a complete beginner, LazyML takes the complexity out of building and deploying machine learning models.
+    """)
+
+    st.subheader("Key Features")
+    st.write("""
+    - **Automated Model Building:** Automatically preprocess your data, select the best algorithms, and fine-tune models with minimal effort.
+    - **User-Friendly Interface:** Intuitive and easy-to-navigate interface that guides you through the entire machine learning workflow.
+    - **Data Visualization:** Comprehensive visualization tools to help you understand your data and model performance.
+    - **Customizable Pipelines:** Flexibility to customize data preprocessing, feature engineering, and model selection to suit your needs.
+    - **Performance Metrics:** Detailed performance metrics and comparison reports for informed decision-making.
+    - **Deployment Ready:** Easily deploy your models and start making predictions with just a few clicks.
+    """)
+
+    st.subheader("How It Works")
+    st.write("""
+    1. **Upload Your Data:** Start by uploading your dataset in CSV format.
+    2. **Data Preprocessing:** LazyML automatically cleans and preprocesses your data, handling missing values, and scaling features as needed.
+    3. **Model Selection:** The app evaluates multiple algorithms and selects the best performing ones for your specific data.
+    4. **Model Training:** Selected models are trained and fine-tuned using cross-validation to ensure robustness.
+    5. **Evaluation:** Get detailed reports on model performance with key metrics like accuracy, precision, recall, and F1 score.
+    6. **Deployment:** Once satisfied with the model, deploy it and start making real-time predictions.
+    """)
+
+
 test = pd.DataFrame()
 if csv_upload is not None:
     # read the uploaded file into dataframe
@@ -260,14 +293,15 @@ if csv_upload is not None:
     st.write("There are no duplicate values in Train")
     st.divider()
     # dropping not important columns
-
-
-
-
-
-
-
-
+    if len(X.columns) >1:
+        st.markdown('<div class="message-box success">Drop Unimportant Columns</div>', unsafe_allow_html=True)
+        if st.radio(" ",["Yes","No"],index = 1) == "Yes":
+            selected_drop_column = st.multiselect('Select columns to be dropped', X.columns)
+            X = X.drop(columns = selected_drop_column)
+            if len(test) >0:
+                test = test.drop(columns = selected_drop_column)
+            st.write("Un-Important column(s) Deleted ✅")
+            st.dataframe(X.head())
 
     st.divider()
     num_cols = X.select_dtypes(exclude = "O").columns
@@ -296,7 +330,7 @@ if csv_upload is not None:
         st.write("Select ML algorithm")
         class_model_name = st.selectbox("select model",models.Classification_models.index)
         class_model = models.Classification_models.loc[class_model_name].values[0]
-        auto_optimizer.Auto_optimizer(X,y,eva,class_model)
+        auto_optimizer.Auto_optimizer(X,y,eva,class_model,class_model_name)
 
 
     else:
@@ -349,7 +383,7 @@ if csv_upload is not None:
 
     dict_2= {}
     for nvh_method in null_value_handling.null_value_handling_method_cat_cols :
-
+
 
         selected_nvh_num_cols = st.multiselect(f'method:- \"{nvh_method}\" for Numerical columns', cat_cols_nvh,)
        dict_2[nvh_method] = selected_nvh_num_cols
@@ -368,17 +402,22 @@ if csv_upload is not None:
         test[cat_cols] = SimpleImputer(strategy = "most_frequent").fit_transform(test[cat_cols])
 
 
-
-
+    try:
+        null_value_handling.null_handling(X,clean_num_nvh_df,clean_num_nvh_df_cat)
+        st.write("X Data after Null value handling", X.head())
 
-
-
-
+        new_df = pd.concat([X,y[X.index]],axis = 1)
+
+        csv = new_df.to_csv(index = False)
+
+        st.markdown('<p class="success-message">Null Values Handled Successfully. ✅</p>', unsafe_allow_html=True)
+        if st.checkbox("Download Null Value Handled DataFrame as CSV File ? "):
+            st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
+        st.divider()
+    except:
+        st.markdown('<p class="unsuccess-message">⚠️⚠️⚠️ Categorical column null value not handled ⚠️⚠️⚠️</p>', unsafe_allow_html=True)
+
 
-    st.markdown('<p class="success-message">Null Values Handled Successfully. ✅</p>', unsafe_allow_html=True)
-    if st.checkbox("Download Null Value Handled DataFrame as CSV File ? "):
-        st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
-    st.divider()
     ord_enc_cols = []
 
     if len(cat_cols) == 0:
@@ -448,6 +487,12 @@ if csv_upload is not None:
         st.download_button(label="Download Ordinal Encoded CSV File",data=csv,file_name='Encoded_DataFrame.csv',mime='text/csv')
 
     st.divider()
+    st.markdown('<div class="message-box success">Outlier Detection</div>', unsafe_allow_html=True)
+    st.write("")
+    if st.button("Click to check outliers"):
+        outlier,out_index = outliers.detect_outliers(new_df,num_cols)
+        st.write("outlier",outlier)
+        st.divider()
     st.markdown('<div class="message-box success">Modelling</div>', unsafe_allow_html=True)
     st.write("")
     st.markdown('<p class="success-message">Select Train Validation Split Method</p>', unsafe_allow_html=True)
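The new Outlier Detection block calls outliers.detect_outliers(new_df, num_cols), but the outliers module itself is not among the files shown in this diff. A minimal sketch of the interface the call site implies, assuming a standard 1.5*IQR rule; this is a hypothetical stand-in, not the repository's actual implementation:

import pandas as pd

# Hypothetical stand-in for outliers.detect_outliers (module not shown in this
# commit). Flags any row outside 1.5*IQR on at least one numeric column and
# returns (outlier rows, their index), matching the call in app.py.
def detect_outliers(df: pd.DataFrame, num_cols):
    mask = pd.Series(False, index=df.index)
    for col in num_cols:
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        mask |= (df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)
    out_index = df.index[mask]
    return df.loc[out_index], out_index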
auto_optimizer.py
CHANGED
@@ -285,7 +285,7 @@ def Auto_optimizer(X,y,eva,model,model_name,test= None):
     st.write("Columns for Outliers handling",tuple(outliers_df_X["columns name"]))
 
     select_outlier_cols = st.multiselect("Select columns for Outlier Handling",tuple(outliers_df_X["columns name"]),default =tuple(outliers_df_X["columns name"]))
-    resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva =
+    resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva = eva)
     st.write("outlier handling with methods",resultant)
     st.write("Best method with outlier handling",resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])
     try :
@@ -302,17 +302,30 @@ def Auto_optimizer(X,y,eva,model,model_name,test= None):
 
 
 
-
-
-
-
-
-
-
-
-
-
-
+    if eva == "reg":
+        try:
+            result_df_1 , feature_col, feature_col_name = feature_selections.feature_selection(X_train,X_test,y_train,y_test,model,alpha = 0.05)
+            X = X.drop(columns = feature_col[feature_col_name.index(result_df_1.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
+        except:
+            "evaluation by feature selection is not better than previous"
+
+        try:
+            result,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
+            st.write("result_df",result)
+        except:
+            X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =0.2,random_state = 42)
+    elif eva =="class":
+        try:
+            result_df_1 , feature_col, feature_col_name = feature_selections.clas_feature_selection(X_train,X_test,y_train,y_test,model)
+            X = X.drop(columns = feature_col[feature_col_name.index(result_df_1.sort_values(by = "test_acc").tail(1).iloc[:,0].values[0])])
+        except:
+            "evaluation by feature selection is not better than previous"
+
+        try:
+            result,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
+            st.write("result_df",result)
+        except:
+            X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =0.2,random_state = 42)
 
 
 
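Both new branches share one idiom: the feature-selection helper returns an evaluation frame plus parallel lists of candidate column sets and their names, and the best-scoring entry is looked up by name. A condensed sketch of that lookup with hypothetical values (the frame and lists below are illustrative; only the test_r2/test_acc column naming is taken from the code above):

import pandas as pd

# Hypothetical evaluation frame: one row per selection method, name in column 0.
result_df_1 = pd.DataFrame({"method": ["p_value_cols", "vif_cols"],
                            "test_r2": [0.71, 0.78]})
feature_col_name = ["p_value_cols", "vif_cols"]  # parallel list of method names
feature_col = [["age", "fare"], ["fare"]]        # columns each method would drop

# Highest test_r2 wins; its name indexes the parallel lists.
best_name = result_df_1.sort_values(by="test_r2").tail(1).iloc[:, 0].values[0]
cols_to_drop = feature_col[feature_col_name.index(best_name)]  # ["fare"]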
feature_selections.py
CHANGED
@@ -8,8 +8,16 @@ import pandas as pd
 import numpy as np
 import evaluationer
 import streamlit as st
-
-
+from sklearn.feature_selection import RFE,RFECV
+from sklearn.linear_model import Lasso
+from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.linear_model import LogisticRegression
+from sklearn.feature_selection import RFE, RFECV, SelectKBest, chi2, mutual_info_classif
+from sklearn.model_selection import StratifiedKFold
+from sklearn.metrics import f1_score
 from sklearn.metrics import root_mean_squared_error
 def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
 
@@ -40,10 +48,10 @@ def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
     vif = pd.DataFrame()
     vif["variables"] = X_new_vif.columns
     vif["VIF"] = [variance_inflation_factor(X_new_vif.values, i) for i in range(X_new_vif.shape[1])]
-    st.write("gdfgdsdsdfad",vif)
+    # st.write("gdfgdsdsdfad",vif)
     if len(vif[vif["variables"] == "const"]) == 1:
         vif = vif.drop(index = (vif[vif["variables"] == "const"].index[0]))
-    st.write("gdfgdsad",vif)
+    # st.write("gdfgdsad",vif)
     # drop const in vif cols
     # vif_cols = X_new_vif.drop(columns = "const")
     vif_cols = vif[vif.VIF >10].variables.tolist()
@@ -101,4 +109,49 @@ def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
         evaluationer.evaluation(f"{feature_cols_name[i]}" ,X_train.drop(columns = j),X_test.drop(columns = j),y_train,y_test,model_reg,method = root_mean_squared_error,eva = "reg")
     return evaluationer.reg_evaluation_df,feature_cols,feature_cols_name
 
-
+def clas_feature_selection(X_train, X_test,y_train,y_test,model,n_features_to_select = None, step=1,importance_getter='auto',refcv_graph= False,C=0.05,k = 10):
+    global rfe_cols,rfecv_cols,lasso_cols,chi2_imp_col,mi_imp_col
+    rfe = RFE(estimator= model,n_features_to_select = n_features_to_select,importance_getter=importance_getter, step=1)
+    rfe.fit(X_train,y_train)
+    rfe_cols = X_train.columns[rfe.support_]
+    cv = StratifiedKFold(5)
+    rfecv = RFECV(estimator=model,
+                  step=1,
+                  cv=cv,
+                  scoring="f1",
+                  min_features_to_select=1,
+                  n_jobs=-1)
+    rfecv.fit(X_train,y_train)
+    rfecv_cols = X_train.columns[rfecv.support_]
+    if refcv_graph == True:
+        n_scores = len(rfecv.cv_results_["mean_test_score"])
+        plt.figure()
+        plt.xlabel("Number of features selected")
+        plt.ylabel("Mean test f1")
+        plt.errorbar(range(min_features_to_select, n_scores + min_features_to_select),
+                     rfecv.cv_results_["mean_test_score"],
+                     yerr=rfecv.cv_results_["std_test_score"],
+                     )
+        plt.grid(True)
+        plt.title("Recursive Feature Elimination \nwith correlated features")
+        plt.show()
+    clf = LogisticRegression(penalty = "l1", C = C,
+                             random_state = 42,
+                             solver = "liblinear")
+    clf.fit(X_train, y_train)
+    lasso_cols = clf.feature_names_in_[clf.coef_[0] != 0]
+
+    sk = SelectKBest(chi2, k=k)
+    X_chi2 = sk.fit_transform(X_train, y_train)
+    chi2_imp_col = X_train.columns[sk.get_support()]
+    sk = SelectKBest(mutual_info_classif, k=k)
+    X_mutual = sk.fit_transform(X_train, y_train)
+    mi_imp_col = X_train.columns[sk.get_support()]
+
+    feature_cols = [rfe_cols,rfecv_cols,lasso_cols,chi2_imp_col,mi_imp_col]
+    feature_cols_name = ["rfe_cols","rfecv_cols","lasso_cols","chi2_imp_col","mi_imp_col"]
+
+    for i,j in enumerate(feature_cols):
+        # evaluationer.evaluation(f"{feature_cols_name[i]} " ,X_train[j],X_test[j],y_train,y_test,model = model,eva = "class")
+        evaluationer.evaluation(f"{feature_cols_name[i]}" ,X_train[j],X_test[j],y_train,y_test,model,method = root_mean_squared_error,eva = "class")
+    return evaluationer.classification_evaluation_df , feature_cols, feature_cols_name
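A possible usage sketch for the new clas_feature_selection. It assumes a linear estimator (RFE/RFECV need coef_ or feature_importances_) and non-negative features (required by the chi2 score); the dataset and k here are illustrative, not taken from this repository:

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Illustrative data: all features are non-negative, so chi2 scoring is valid.
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42, stratify=y)

model = LogisticRegression(max_iter=5000)  # exposes coef_ for RFE/RFECV
eval_df, cols, names = clas_feature_selection(X_train, X_test, y_train, y_test, model, k=10)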
requirements.txt
CHANGED
@@ -1,14 +1,11 @@
-
-streamlit==1.34.0
-joblib==1.4.2
-numpy==1.26.4
-pandas==2.2.2
-scikit-learn==1.4.2
-datashader==0.16.2
-colorcet==3.1.0
-plotly==5.22.0
-matplotlib==3.9.0
-seaborn==0.13.2
-xgboost==2.0.3
-lightgbm==4.3.0
-statsmodels==0.14.2
+
+streamlit==1.34.0
+joblib==1.4.2
+numpy==1.26.4
+pandas==2.2.2
+scikit-learn==1.4.2
+datashader==0.16.2
+colorcet==3.1.0
+plotly==5.22.0
+matplotlib==3.9.0
+seaborn==0.13.2