Upload 12 files
- app.py +65 -20
- auto_optimizer.py +25 -12
- feature_selections.py +58 -5
- requirements.txt +11 -14
app.py
CHANGED
@@ -8,7 +8,7 @@ import evaluationer,models, null_value_handling
 import auto_optimizer
 from sklearn.experimental import enable_iterative_imputer
 from sklearn.impute import SimpleImputer, IterativeImputer
-import eda
+import eda,outliers
 # st.set_page_config(layout="wide")
 
 st.set_page_config(
@@ -86,6 +86,8 @@ html_code = """
 st.markdown(html_code, unsafe_allow_html=True)
 st.divider()
 
+
+
 st.markdown(
     """
     <style>
@@ -137,6 +139,37 @@ if (len(sep) ==0):
     sep = ","
 csv_upload2 = st.sidebar.file_uploader("Input CSV File of Test Data Prediction",type = ["csv"])
 
+if csv_upload is None:
+    st.title("LazyML")
+
+    st.header("Welcome to LazyML – your go-to app for effortless machine learning!")
+
+    st.subheader("Overview")
+    st.write("""
+    LazyML is designed to make machine learning accessible to everyone, regardless of their technical expertise. Whether you're a seasoned data scientist or a complete beginner, LazyML takes the complexity out of building and deploying machine learning models.
+    """)
+
+    st.subheader("Key Features")
+    st.write("""
+    - **Automated Model Building:** Automatically preprocess your data, select the best algorithms, and fine-tune models with minimal effort.
+    - **User-Friendly Interface:** Intuitive and easy-to-navigate interface that guides you through the entire machine learning workflow.
+    - **Data Visualization:** Comprehensive visualization tools to help you understand your data and model performance.
+    - **Customizable Pipelines:** Flexibility to customize data preprocessing, feature engineering, and model selection to suit your needs.
+    - **Performance Metrics:** Detailed performance metrics and comparison reports for informed decision-making.
+    - **Deployment Ready:** Easily deploy your models and start making predictions with just a few clicks.
+    """)
+
+    st.subheader("How It Works")
+    st.write("""
+    1. **Upload Your Data:** Start by uploading your dataset in CSV format.
+    2. **Data Preprocessing:** LazyML automatically cleans and preprocesses your data, handling missing values, and scaling features as needed.
+    3. **Model Selection:** The app evaluates multiple algorithms and selects the best performing ones for your specific data.
+    4. **Model Training:** Selected models are trained and fine-tuned using cross-validation to ensure robustness.
+    5. **Evaluation:** Get detailed reports on model performance with key metrics like accuracy, precision, recall, and F1 score.
+    6. **Deployment:** Once satisfied with the model, deploy it and start making real-time predictions.
+    """)
+
+
 test = pd.DataFrame()
 if csv_upload is not None:
     # read the uploaded file into dataframe
@@ -260,14 +293,15 @@ if csv_upload is not None:
     st.write("There are no duplicate values in Train")
     st.divider()
     # dropping not important columns
-
-
-
-
-
-
-
-
+    if len(X.columns) >1:
+        st.markdown('<div class="message-box success">Drop Unimportant Columns</div>', unsafe_allow_html=True)
+        if st.radio(" ",["Yes","No"],index = 1) == "Yes":
+            selected_drop_column = st.multiselect('Select columns to be dropped', X.columns)
+            X = X.drop(columns = selected_drop_column)
+            if len(test) >0:
+                test = test.drop(columns = selected_drop_column)
+            st.write("Un-Important column(s) Deleted ✅")
+            st.dataframe(X.head())
 
     st.divider()
     num_cols = X.select_dtypes(exclude = "O").columns
@@ -296,7 +330,7 @@ if csv_upload is not None:
         st.write("Select ML algorithm")
         class_model_name = st.selectbox("select model",models.Classification_models.index)
         class_model = models.Classification_models.loc[class_model_name].values[0]
-        auto_optimizer.Auto_optimizer(X,y,eva,class_model)
+        auto_optimizer.Auto_optimizer(X,y,eva,class_model,class_model_name)
 
 
     else:
@@ -349,7 +383,7 @@ if csv_upload is not None:
 
     dict_2= {}
     for nvh_method in null_value_handling.null_value_handling_method_cat_cols :
-
+
 
         selected_nvh_num_cols = st.multiselect(f'method:- \"{nvh_method}\" for Numerical columns', cat_cols_nvh,)
        dict_2[nvh_method] = selected_nvh_num_cols
@@ -368,17 +402,22 @@ if csv_upload is not None:
         test[cat_cols] = SimpleImputer(strategy = "most_frequent").fit_transform(test[cat_cols])
 
 
-
-
+    try:
+        null_value_handling.null_handling(X,clean_num_nvh_df,clean_num_nvh_df_cat)
+        st.write("X Data after Null value handling", X.head())
 
-
-
-
+        new_df = pd.concat([X,y[X.index]],axis = 1)
+
+        csv = new_df.to_csv(index = False)
+
+        st.markdown('<p class="success-message">Null Values Handled Successfully. ✅</p>', unsafe_allow_html=True)
+        if st.checkbox("Download Null Value Handled DataFrame as CSV File ? "):
+            st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
+        st.divider()
+    except:
+        st.markdown('<p class="unsuccess-message">⚠️⚠️⚠️ Categorical column null value not handled ⚠️⚠️⚠️</p>', unsafe_allow_html=True)
+
 
-    st.markdown('<p class="success-message">Null Values Handled Successfully. ✅</p>', unsafe_allow_html=True)
-    if st.checkbox("Download Null Value Handled DataFrame as CSV File ? "):
-        st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
-    st.divider()
     ord_enc_cols = []
 
     if len(cat_cols) == 0:
@@ -448,6 +487,12 @@ if csv_upload is not None:
         st.download_button(label="Download Ordinal Encoded CSV File",data=csv,file_name='Encoded_DataFrame.csv',mime='text/csv')
 
     st.divider()
+    st.markdown('<div class="message-box success">Outlier Detection</div>', unsafe_allow_html=True)
+    st.write("")
+    if st.button("Click to check outliers"):
+        outlier,out_index = outliers.detect_outliers(new_df,num_cols)
+        st.write("outlier",outlier)
+        st.divider()
     st.markdown('<div class="message-box success">Modelling</div>', unsafe_allow_html=True)
     st.write("")
     st.markdown('<p class="success-message">Select Train Validation Split Method</p>', unsafe_allow_html=True)
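The new Outlier Detection block calls outliers.detect_outliers(new_df, num_cols), but the outliers module itself is not among the files shown in this diff. A minimal sketch of the interface the call site implies, assuming a standard 1.5*IQR rule; this is a hypothetical stand-in, not the repository's actual implementation:

import pandas as pd

# Hypothetical stand-in for outliers.detect_outliers (module not shown in this
# commit). Flags any row outside 1.5*IQR on at least one numeric column and
# returns (outlier rows, their index), matching the call in app.py.
def detect_outliers(df: pd.DataFrame, num_cols):
    mask = pd.Series(False, index=df.index)
    for col in num_cols:
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        mask |= (df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)
    out_index = df.index[mask]
    return df.loc[out_index], out_index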
auto_optimizer.py
CHANGED
@@ -285,7 +285,7 @@ def Auto_optimizer(X,y,eva,model,model_name,test= None):
     st.write("Columns for Outliers handling",tuple(outliers_df_X["columns name"]))
 
     select_outlier_cols = st.multiselect("Select columns for Outlier Handling",tuple(outliers_df_X["columns name"]),default =tuple(outliers_df_X["columns name"]))
-    resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva =
+    resultant,outlier_handled_df,outlier_handled_df_name= outliers.outlier_handling(X,y,model,outlier_indexes = outlier_indexes,outlier_cols = select_outlier_cols ,method = root_mean_squared_error,test_size = 0.2, random_state = 42,eva = eva)
     st.write("outlier handling with methods",resultant)
     st.write("Best method with outlier handling",resultant.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])
     try :
@@ -302,17 +302,30 @@ def Auto_optimizer(X,y,eva,model,model_name,test= None):
 
 
 
-
-
-
-
-
-
-
-
-
-
-
+    if eva == "reg":
+        try:
+            result_df_1 , feature_col, feature_col_name = feature_selections.feature_selection(X_train,X_test,y_train,y_test,model,alpha = 0.05)
+            X = X.drop(columns = feature_col[feature_col_name.index(result_df_1.sort_values(by = "test_r2").tail(1).iloc[:,0].values[0])])
+        except:
+            "evaluation by feature selection is not better than previous"
+
+        try:
+            result,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
+            st.write("result_df",result)
+        except:
+            X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =0.2,random_state = 42)
+    elif eva =="class":
+        try:
+            result_df_1 , feature_col, feature_col_name = feature_selections.clas_feature_selection(X_train,X_test,y_train,y_test,model)
+            X = X.drop(columns = feature_col[feature_col_name.index(result_df_1.sort_values(by = "test_acc").tail(1).iloc[:,0].values[0])])
+        except:
+            "evaluation by feature selection is not better than previous"
+
+        try:
+            result,X_train_b,X_test_b,y_train_b,y_test_b = best_tts.best_tts(X,y,model,eva)
+            st.write("result_df",result)
+        except:
+            X_train,X_test,y_train,y_test = tts(X,y[X.index],test_size =0.2,random_state = 42)
 
 
 
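Both new branches share one idiom: the feature-selection helper returns an evaluation frame plus parallel lists of candidate column sets and their names, and the best-scoring entry is looked up by name. A condensed sketch of that lookup with hypothetical values (the frame and lists below are illustrative; only the test_r2/test_acc column naming is taken from the code above):

import pandas as pd

# Hypothetical evaluation frame: one row per selection method, name in column 0.
result_df_1 = pd.DataFrame({"method": ["p_value_cols", "vif_cols"],
                            "test_r2": [0.71, 0.78]})
feature_col_name = ["p_value_cols", "vif_cols"]  # parallel list of method names
feature_col = [["age", "fare"], ["fare"]]        # columns each method would drop

# Highest test_r2 wins; its name indexes the parallel lists.
best_name = result_df_1.sort_values(by="test_r2").tail(1).iloc[:, 0].values[0]
cols_to_drop = feature_col[feature_col_name.index(best_name)]  # ["fare"]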
feature_selections.py
CHANGED
@@ -8,8 +8,16 @@ import pandas as pd
 import numpy as np
 import evaluationer
 import streamlit as st
-
-
+from sklearn.feature_selection import RFE,RFECV
+from sklearn.linear_model import Lasso
+from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.linear_model import LogisticRegression
+from sklearn.feature_selection import RFE, RFECV, SelectKBest, chi2, mutual_info_classif
+from sklearn.model_selection import StratifiedKFold
+from sklearn.metrics import f1_score
 from sklearn.metrics import root_mean_squared_error
 def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
 
@@ -40,10 +48,10 @@ def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
     vif = pd.DataFrame()
     vif["variables"] = X_new_vif.columns
     vif["VIF"] = [variance_inflation_factor(X_new_vif.values, i) for i in range(X_new_vif.shape[1])]
-    st.write("gdfgdsdsdfad",vif)
+    # st.write("gdfgdsdsdfad",vif)
     if len(vif[vif["variables"] == "const"]) == 1:
         vif = vif.drop(index = (vif[vif["variables"] == "const"].index[0]))
-    st.write("gdfgdsad",vif)
+    # st.write("gdfgdsad",vif)
     # drop const in vif cols
     # vif_cols = X_new_vif.drop(columns = "const")
     vif_cols = vif[vif.VIF >10].variables.tolist()
@@ -101,4 +109,49 @@ def feature_selection(X_train, X_test,y_train,y_test,model_reg,alpha = 0.05):
         evaluationer.evaluation(f"{feature_cols_name[i]}" ,X_train.drop(columns = j),X_test.drop(columns = j),y_train,y_test,model_reg,method = root_mean_squared_error,eva = "reg")
     return evaluationer.reg_evaluation_df,feature_cols,feature_cols_name
 
-
+def clas_feature_selection(X_train, X_test,y_train,y_test,model,n_features_to_select = None, step=1,importance_getter='auto',refcv_graph= False,C=0.05,k = 10):
+    global rfe_cols,rfecv_cols,lasso_cols,chi2_imp_col,mi_imp_col
+    rfe = RFE(estimator= model,n_features_to_select = n_features_to_select,importance_getter=importance_getter, step=1)
+    rfe.fit(X_train,y_train)
+    rfe_cols = X_train.columns[rfe.support_]
+    cv = StratifiedKFold(5)
+    rfecv = RFECV(estimator=model,
+                  step=1,
+                  cv=cv,
+                  scoring="f1",
+                  min_features_to_select=1,
+                  n_jobs=-1)
+    rfecv.fit(X_train,y_train)
+    rfecv_cols = X_train.columns[rfecv.support_]
+    if refcv_graph == True:
+        n_scores = len(rfecv.cv_results_["mean_test_score"])
+        plt.figure()
+        plt.xlabel("Number of features selected")
+        plt.ylabel("Mean test f1")
+        plt.errorbar(range(min_features_to_select, n_scores + min_features_to_select),
+                     rfecv.cv_results_["mean_test_score"],
+                     yerr=rfecv.cv_results_["std_test_score"],
+                     )
+        plt.grid(True)
+        plt.title("Recursive Feature Elimination \nwith correlated features")
+        plt.show()
+    clf = LogisticRegression(penalty = "l1", C = C,
+                             random_state = 42,
+                             solver = "liblinear")
+    clf.fit(X_train, y_train)
+    lasso_cols = clf.feature_names_in_[clf.coef_[0] != 0]
+
+    sk = SelectKBest(chi2, k=k)
+    X_chi2 = sk.fit_transform(X_train, y_train)
+    chi2_imp_col = X_train.columns[sk.get_support()]
+    sk = SelectKBest(mutual_info_classif, k=k)
+    X_mutual = sk.fit_transform(X_train, y_train)
+    mi_imp_col = X_train.columns[sk.get_support()]
+
+    feature_cols = [rfe_cols,rfecv_cols,lasso_cols,chi2_imp_col,mi_imp_col]
+    feature_cols_name = ["rfe_cols","rfecv_cols","lasso_cols","chi2_imp_col","mi_imp_col"]
+
+    for i,j in enumerate(feature_cols):
+        # evaluationer.evaluation(f"{feature_cols_name[i]} " ,X_train[j],X_test[j],y_train,y_test,model = model,eva = "class")
+        evaluationer.evaluation(f"{feature_cols_name[i]}" ,X_train[j],X_test[j],y_train,y_test,model,method = root_mean_squared_error,eva = "class")
+    return evaluationer.classification_evaluation_df , feature_cols, feature_cols_name
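A possible usage sketch for the new clas_feature_selection. It assumes a linear estimator (RFE/RFECV need coef_ or feature_importances_) and non-negative features (required by the chi2 score); the dataset and k here are illustrative, not taken from this repository:

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Illustrative data: all features are non-negative, so chi2 scoring is valid.
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42, stratify=y)

model = LogisticRegression(max_iter=5000)  # exposes coef_ for RFE/RFECV
eval_df, cols, names = clas_feature_selection(X_train, X_test, y_train, y_test, model, k=10)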
requirements.txt
CHANGED
@@ -1,14 +1,11 @@
-
-streamlit==1.34.0
-joblib==1.4.2
-numpy==1.26.4
-pandas==2.2.2
-scikit-learn==1.4.2
-datashader==0.16.2
-colorcet==3.1.0
-plotly==5.22.0
-matplotlib==3.9.0
-seaborn==0.13.2
-xgboost==2.0.3
-lightgbm==4.3.0
-statsmodels==0.14.2
+
+streamlit==1.34.0
+joblib==1.4.2
+numpy==1.26.4
+pandas==2.2.2
+scikit-learn==1.4.2
+datashader==0.16.2
+colorcet==3.1.0
+plotly==5.22.0
+matplotlib==3.9.0
+seaborn==0.13.2