# LazyML / app.py
# Source listing metadata: uploaded by Gaurav069 ("Upload 9 files"),
# revision a8af817 (verified), file size 30.6 kB.
# import libraries
import streamlit as st
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts
import evaluationer,models, null_value_handling
import auto_optimizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
# Page-level configuration: browser-tab title/icon, sidebar state and the
# entries shown in the app menu.
# NOTE: the wide layout variant, st.set_page_config(layout="wide"), is left disabled.
_menu_entries = {
    'Get Help': 'https://www.extremelycoolapp.com/help',
    'Report a bug': "https://www.extremelycoolapp.com/bug",
    'About': "# This is a header. This is an *extremely* cool app!",
}
st.set_page_config(
    page_title="LazyML App",
    page_icon="🧊",
    initial_sidebar_state="expanded",
    menu_items=_menu_entries,
)
import streamlit as st
# Title banner: an <h1> whose colour cycles through the rainbow keyframes and
# which carries a layered text-shadow glow, followed by an underline on the
# surrounding container.
# The markup is kept flush-left on purpose: it is rendered through
# st.markdown, and the lines are left unindented so Markdown does not treat
# them as an indented code block.
# Fix: the underline width was written as "4 px" (with a space), which is not
# a valid CSS length, so the border declaration was discarded by the browser.
html_code = """
<div class="title-container">
<h1 class="neon-text">
LazyML
</h1>
</div>
<style>
@keyframes rainbow-text-animation {
0% { color: red; }
16.67% { color: orange; }
33.33% { color: yellow; }
50% { color: green; }
66.67% { color: blue; }
83.33% { color: indigo; }
100% { color: violet; }
}
.title-container {
text-align: center;
margin: 1em 0;
padding-bottom: 10px;
border-bottom: 4px solid #fcdee9; /* Magenta underline */
}
.neon-text {
font-family: Arial, sans-serif;
font-size: 4em;
margin: 0;
animation: rainbow-text-animation 5s infinite linear;
text-shadow: 0 0 5px rgba(255, 255, 255, 0.8),
0 0 10px rgba(255, 255, 255, 0.7),
0 0 20px rgba(255, 255, 255, 0.6),
0 0 40px rgba(255, 0, 255, 0.6),
0 0 80px rgba(255, 0, 255, 0.6),
0 0 90px rgba(255, 0, 255, 0.6),
0 0 100px rgba(255, 0, 255, 0.6),
0 0 150px rgba(255, 0, 255, 0.6);
}
</style>
"""
# unsafe_allow_html is required for the raw <div>/<style> markup to be rendered.
st.markdown(html_code, unsafe_allow_html=True)
# ---------------------------------------------------------------------------
# Data upload and preview
# ---------------------------------------------------------------------------
# Two sidebar uploaders: the first file is the training data, the second
# (optional) file is the data predictions will be generated for.
csv_upload = st.sidebar.file_uploader("Input CSV File for ML modelling", type=['csv'])
csv_upload2 = st.sidebar.file_uploader("Input CSV File of Test Data Prediction",type = ["csv"])

# `test` stays an empty frame when no second file is supplied; the rest of the
# script uses `len(test) > 0` to decide whether test-side processing is needed.
test = pd.DataFrame()

if csv_upload is None:
    # Every later step reads `df`, so end this script run here until a
    # training file has been uploaded.
    st.stop()

# Read the uploaded training file and keep a copy of it on disk.
df = pd.read_csv(csv_upload)
df.to_csv('csv_upload.csv', index=False)
st.write("Train File uploaded successfully. ✅")

if csv_upload2 is not None:
    test = pd.read_csv(csv_upload2)
    id_col = st.selectbox("select column for submission i.e, ID",test.columns)
    # Captured now, before any column can be dropped from `test`, so it can
    # label the rows of the prediction file later on.
    submission_id = test[id_col]

if len(test) > 0:
    # Keep a copy of the uploaded test file on disk as well.
    test.to_csv('csv_upload_test.csv', index=False)
    st.write("Test File uploaded successfully. ✅")

display_train_data = st.radio("Display Train Data",["Yes","No"],index = 1)
if display_train_data == "Yes":
    st.dataframe(df.head())

if len(test) > 0:
    display_test_data = st.radio("Display Test Data",["Yes","No"],index = 1)
    if display_test_data == "Yes":
        st.dataframe(test.head())
# ---------------------------------------------------------------------------
# Target selection, optional label encoding and optional transformation
# ---------------------------------------------------------------------------
if st.radio("Select Supervision Category",["Supervised","Un-Supervised"],index =0) != "Supervised":
    # This script contains no un-supervised workflow; nothing more to render.
    st.stop()

# The last column is pre-selected as the target.
selected_column = st.selectbox('Select Target column', df.columns, index=(len(df.columns)-1))
st.write('You selected:', selected_column)
y = df[selected_column]

if y.dtype == "O":
    st.write("⚠️⚠️⚠️ Target Column is Object Type ⚠️⚠️⚠️")
    if st.radio("Proceed for Label Encoding ",["Yes","No"],index = 1) == "Yes":
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        # Keep the original index and name: later steps select rows with
        # y[X.index] and concatenate y next to X.
        y = pd.Series(le.fit_transform(y), index=y.index, name=y.name)
        st.write("Label Encoding Completed ✅")

if st.radio("Display Target Column",["Yes","No"],index =1) == "Yes":
    st.dataframe(y.head())

select_target_trans = st.radio("Target column Transformation",["Yes","No"],index = 1)

# These names are read again when predictions are inverse-transformed, so they
# must exist on every path. None means "no transformation of that kind was
# applied", which makes the later string comparisons fall through.
selected_transformation = None
log_selected_transformation = None
power_selected_transformation = None
power_value = None

if select_target_trans == "Yes":
    if not pd.api.types.is_numeric_dtype(y):
        # log / sqrt / power are undefined for text targets.
        st.write("Target column is not numeric, please label encode it before applying a transformation")
    else:
        selected_transformation = st.selectbox("Select Transformation method",["Log Transformation","Power Transformation"])
        if selected_transformation == "Log Transformation":
            if y.min() <=0:
                # Logarithms need strictly positive values.
                st.write("Values in target columns are zeroes or negative, please select power transformation")
            else:
                log_selected_transformation = st.selectbox("Select Logarithmic method",["Natural Log base(e)","Log base 10","Log base (2)"])
                if log_selected_transformation == "Natural Log base(e)":
                    y = np.log(y)
                    st.write("Log base (e) Transformation Completed ✅")
                elif log_selected_transformation == "Log base 10":
                    y = np.log10(y)
                    st.write("Log base 10 Transformation Completed ✅")
                elif log_selected_transformation == "Log base (2)":
                    y = np.log2(y)
                    st.write("Log base 2 Transformation Completed ✅")
        elif selected_transformation == "Power Transformation":
            power_selected_transformation = st.selectbox("Select Power Transformation method",["Square Root","Other"])
            if power_selected_transformation == "Square Root":
                y = np.sqrt(y)
                st.write("Square root Transformation Completed ✅")
            elif power_selected_transformation == "Other":
                power_value = st.number_input("Enter Power Value",value=3)
                if power_value == 0:
                    # 1/0 is undefined; skip the transformation and make sure
                    # no inverse is applied to predictions later.
                    st.write("Power value must be non-zero")
                    power_selected_transformation = None
                else:
                    # Takes the power_value-th root of the target.
                    y = y**(1/power_value)
                    st.write(f"power root of {power_value} Transformation Completed ✅")
        if st.radio("Display Target Column after Transformation",["Yes","No"],index =1) == "Yes":
            st.dataframe(y.head())
# ---------------------------------------------------------------------------
# Feature frame: duplicates, column removal, column typing
# ---------------------------------------------------------------------------
# Everything except the target column is a feature.
X = df.drop(columns = selected_column)
if st.radio("Display X-Train Data",["Yes","No"],index =1) == "Yes":
    st.dataframe(X.head())

if st.radio("Check for duplicate Values",["Yes","No"],index = 1) == "Yes":
    len_duplicates = int(X.duplicated().sum())
    if len_duplicates >0:
        st.write(f"There are {len_duplicates} duplicate values in Train")
        if st.selectbox("Drop Duplicate values",["Yes","No"],index = 1) == "Yes":
            # Only X shrinks here; the target is re-aligned later through
            # y[X.index].
            X = X.drop_duplicates()
            st.write("Duplicate values removed ✅")
    else:
        st.write("There are no duplicate values in Train")

# Dropping columns the user considers unimportant.
if st.radio("Drop Un-Important Column(s)",["Yes","No"],index = 1) == "Yes":
    selected_drop_column = st.multiselect('Select columns to be dropped', X.columns)
    X = X.drop(columns = selected_drop_column)
    if len(test) >0:
        # The test file may not contain every training column; ignore the
        # missing ones instead of failing with a KeyError.
        test = test.drop(columns = selected_drop_column, errors = "ignore")
    st.write("Un-Important column(s) Deleted ✅")
    st.dataframe(X.head())

# Object-dtype columns are treated as categorical, everything else as numerical.
num_cols = X.select_dtypes(exclude = "O").columns
cat_cols = X.select_dtypes(include = "O").columns
st.write("Numerical Columns in Train Data: ", tuple(num_cols))
st.write("Categorical Columns in Train Data: ", tuple(cat_cols))
# ---------------------------------------------------------------------------
# Modelling mode: "Auto Optimized" hands everything to auto_optimizer,
# "Manual" continues with the step-by-step workflow below.
# ---------------------------------------------------------------------------
if st.radio("Select method for ML modelling", ["Manual","Auto Optimized"],index = 0) == "Auto Optimized":
    ml_cat_ao = st.radio("Select Machine Learning Category",["Regression","Classification"],index =0)
    # Both model tables are indexed by model name; the first value of a row is
    # used as the estimator (presumably an estimator instance — confirm in models.py).
    if ml_cat_ao == "Regression":
        eva, model_table = "reg", models.Regression_models
    else:
        eva, model_table = "class", models.Classification_models
    st.write("Select ML algorithm")
    chosen_model_name = st.selectbox("select model", model_table.index)
    chosen_model = model_table.loc[chosen_model_name].values[0]
    # X may have lost rows (duplicate removal), so pass the target rows that
    # match X, as the manual path does with y[X.index].
    auto_optimizer.Auto_optimizer(X, y.loc[X.index], eva, chosen_model)
    # The manual workflow below must not run in auto mode.
    st.stop()
# ---------------------------------------------------------------------------
# Missing-value handling (manual workflow)
# NOTE(review): the original nesting of this section could not be recovered
# from the source; the structure below was reconstructed from the data flow —
# confirm against the intended UI.
# ---------------------------------------------------------------------------
if X.isnull().sum().sum() >0 :
    st.write("⚠️⚠️⚠️ There are missing values in Train Data ⚠️⚠️⚠️")
    if st.selectbox("Drop null values or Impute",["Drop Null Values","Impute Null Values"],index = 1) == "Drop Null Values":
        X = X.dropna()
        if len(test) >0:
            st.write("⚠️⚠️⚠️ If choosing drop values, test dataset will also drop those values please choose missing value imputation method befittingly.⚠️⚠️⚠️ ")
            test = test.dropna()

# --- numerical columns -----------------------------------------------------
# Stays empty when no numerical column needs handling.
clean_num_nvh_df = pd.DataFrame()
num_null_counts = X[num_cols].isnull().sum()
num_null_counts = num_null_counts[num_null_counts > 0]
if len(num_null_counts) > 0:
    st.write("Numerical Columns with Percentage of Null Values: ")
    st.dataframe(round(num_null_counts/len(X)*100,2))
    # One multiselect per method; a column picked for one method is removed
    # from the options offered for the following methods.
    num_cols_nvh = list(num_null_counts.index)
    dict_1= {}
    for nvh_method in null_value_handling.null_value_handling_method_num_cols :
        selected_nvh_num_cols = st.multiselect(f'method:- "{nvh_method}" for Numerical columns', num_cols_nvh,)
        dict_1[nvh_method] = selected_nvh_num_cols
        # A list (not a set) keeps the remaining options in a stable order.
        num_cols_nvh = [col for col in num_cols_nvh if col not in selected_nvh_num_cols]
        if len(num_cols_nvh) ==0:
            break
    # Rows are methods, so the transpose has one column per method; methods
    # with no selected column are filtered out.
    num_nvh_df = pd.DataFrame(data=dict_1.values(), index=dict_1.keys())
    method_counts = num_nvh_df.T.count()
    clean_num_nvh_df = num_nvh_df.T[method_counts[method_counts>0].index]
    st.write("Methods for Numerical columns null value handling",clean_num_nvh_df )

if len(test) >0:
    if test[num_cols].isnull().sum().sum() >0:
        test_null_counts = test[num_cols].isnull().sum()
        test_num_cols_nvh = test_null_counts[test_null_counts>0].index
        st.write("Columns with Null Value in Test",test_num_cols_nvh)
        # Test data is always imputed with an imputer fitted on the test data itself.
        test[num_cols] = IterativeImputer(max_iter = 200,random_state= 42).fit_transform(test[num_cols])

# --- categorical columns ---------------------------------------------------
# Stays empty when no categorical column needs handling.
clean_num_nvh_df_cat = pd.DataFrame()
cat_null_counts = X[cat_cols].isnull().sum()
cat_null_counts = cat_null_counts[cat_null_counts > 0]
if len(cat_null_counts) > 0:
    st.write("Categorical Columns with Percentage of Null Values: ")
    st.dataframe(round(cat_null_counts/len(X)*100,2))
    cat_cols_nvh = list(cat_null_counts.index)
    dict_2= {}
    for nvh_method in null_value_handling.null_value_handling_method_cat_cols :
        # Fix: this picker lists categorical columns, but was labelled
        # "Numerical columns" (and preceded by a leftover debug print).
        selected_nvh_cat_cols = st.multiselect(f'method:- "{nvh_method}" for Categorical columns', cat_cols_nvh,)
        dict_2[nvh_method] = selected_nvh_cat_cols
        cat_cols_nvh = [col for col in cat_cols_nvh if col not in selected_nvh_cat_cols]
        if len(cat_cols_nvh) ==0:
            break
    num_nvh_df_cat = pd.DataFrame(data=dict_2.values(), index=dict_2.keys())
    clean_num_nvh_df_cat = num_nvh_df_cat.T
    # Shown as a table, like the numerical counterpart above.
    st.write("Methods for Categorical columns null value handling",clean_num_nvh_df_cat)

if len(test) >0:
    if test[cat_cols].isnull().sum().sum() >0:
        test_cat_null_counts = test[cat_cols].isnull().sum()
        test_num_cols_nvh_cat = test_cat_null_counts[test_cat_null_counts>0].index
        st.write("Categorical Columns with Null Value in Test",test_num_cols_nvh_cat)
        test[cat_cols] = SimpleImputer(strategy = "most_frequent").fit_transform(test[cat_cols])

# NOTE(review): the return value is ignored, so this relies on null_handling
# modifying X in place — confirm in null_value_handling.py.
null_value_handling.null_handling(X,clean_num_nvh_df,clean_num_nvh_df_cat)
st.write("X Data after Null value handling", X.head())

# X may have fewer rows than y by now, so only the matching target rows are exported.
new_df = pd.concat([X,y[X.index]],axis = 1)
csv = new_df.to_csv(index = False)
if st.radio("Download Null Value Handled DataFrame as CSV File ? ",["Yes","No"],index = 1) == "Yes":
    st.download_button(label="Download Null Value Handled CSV File",data=csv,file_name='NVH_DataFrame.csv',mime='text/csv')
# ---------------------------------------------------------------------------
# Categorical encoding: ticked columns are ordinal-encoded, every other
# categorical column is one-hot encoded.
# ---------------------------------------------------------------------------
ord_enc_cols = []
if len(cat_cols) == 0:
    st.write("No Categorical Columns in Train")
else:
    st.write("Select Columns for Ordinal Encoding")
    for column in cat_cols:
        selected = st.checkbox(column)
        if selected:
            st.write(f"No. of Unique value in {column} column are", X[column].nunique())
            ord_enc_cols.append(column)

# Built in column order (instead of through a set) so the one-hot output
# columns come out in the same order on every run.
ohe_enc_cols = [column for column in cat_cols if column not in ord_enc_cols]

if len(ord_enc_cols)>0:
    st.write("ordinal encoded columns" ,tuple(ord_enc_cols))
if len(ohe_enc_cols)>0:
    st.write("one hot encoded columns" ,tuple(ohe_enc_cols))

if len(ord_enc_cols)>0:
    if st.radio("proceed for ordinal encoding",["Yes","No"],index = 1) == "Yes":
        ordinal_order_vals = []
        for column in ord_enc_cols:
            unique_vals = X[column].unique()
            # The widgets share one label, so each needs its own key; two
            # columns with the same set of values would otherwise be
            # indistinguishable to Streamlit.
            ordered_unique_vals = st.multiselect(
                "Select values in order for Ordinal Encoding",
                unique_vals,
                unique_vals,
                key=f"ordinal_order_{column}",
            )
            ordinal_order_vals.append(ordered_unique_vals)
        st.write("order of values for Ordinal Encoding",tuple(ordinal_order_vals))
        from sklearn.preprocessing import OrdinalEncoder
        # Named ordinal_encoder so the builtin ord() is not shadowed.
        # Values not present in the chosen order are encoded as -1.
        ordinal_encoder = OrdinalEncoder(categories=ordinal_order_vals,handle_unknown= "use_encoded_value",unknown_value = -1 )
        X[ord_enc_cols] = ordinal_encoder.fit_transform(X[ord_enc_cols])
        if len(test) >0:
            test[ord_enc_cols] = ordinal_encoder.transform(test[ord_enc_cols])
        st.write("DataFrame after Ordinal Encoding",X.head())
        st.write("Ordinal Encoding Completed ✅")

if len(ohe_enc_cols)>0:
    if st.radio("proceed for OnehotEncoding ",["Yes","No"],index = 1) == "Yes":
        from sklearn.preprocessing import OneHotEncoder
        ohe = OneHotEncoder(sparse_output = False,handle_unknown = "ignore")
        # Fit first, then read the generated feature names, then swap the
        # source columns for the encoded ones. Building a new frame makes the
        # order of operations explicit and needs no change to the global
        # pandas chained-assignment option.
        encoded_train = ohe.fit_transform(X[ohe_enc_cols])
        ohe_feature_names = list(ohe.get_feature_names_out())
        X = pd.concat(
            [X.drop(columns = ohe_enc_cols),
             pd.DataFrame(encoded_train, columns = ohe_feature_names, index = X.index)],
            axis = 1,
        )
        if len(test) >0:
            encoded_test = ohe.transform(test[ohe_enc_cols])
            test = pd.concat(
                [test.drop(columns = ohe_enc_cols),
                 pd.DataFrame(encoded_test, columns = ohe_feature_names, index = test.index)],
                axis = 1,
            )
        st.write("DataFrame after One Hot Encoding",X.head())
        st.write("OneHot Encoding Completed ✅")

# X may have fewer rows than y (dropped duplicates / nulls); export only the
# matching target rows so the two line up.
new_df = pd.concat([X,y.loc[X.index]],axis = 1)
csv = new_df.to_csv(index = False)
if st.radio("Download Encoded DataFrame as CSV File ? ",["Yes","No"],index = 1) == "Yes":
    st.download_button(label="Download Ordinal Encoded CSV File",data=csv,file_name='Encoded_DataFrame.csv',mime='text/csv')
# ---------------------------------------------------------------------------
# Train / validation split
# ---------------------------------------------------------------------------
random_state = st.number_input("Enter Random_state",max_value=100,min_value=1,value=42)
test_size = st.number_input("Enter test_size",max_value=0.99, min_value = 0.01,value =0.2)

# The first radio option embeds the chosen values, so build it once and reuse
# it for the comparison.
split_option = f"Train_Test_split, Default (Random_state = {random_state},Test_size = {test_size})"
chosen_split = st.radio("select Train Validation Split Method",
                        [split_option, "KFoldCV, Default (CV = 5)"], index = 0)
ttsmethod = "Train_Test_split" if chosen_split == split_option else "KFoldCV"
st.write('You selected:', ttsmethod)

if ttsmethod == "Train_Test_split":
    # y is restricted to the rows still present in X.
    X_train,X_Val,y_train,y_val = tts(X,y[X.index],random_state = random_state,test_size = test_size)
    # DataFrame.info() prints its report and returns None, so capture the
    # report in a buffer and show it as preformatted text.
    import io
    info_buffer = io.StringIO()
    X_train.info(buf=info_buffer)
    st.text(info_buffer.getvalue())
    st.write('X-Training Data shape:', X_train.shape)
    st.write('X-Validation Data shape:', X_Val.shape)
else:
    # No code in this script builds folds for KFoldCV, and the training step
    # needs X_train / X_Val / y_train / y_val, so stop with a hint instead of
    # failing later with a NameError.
    st.write("⚠️⚠️⚠️ KFoldCV is not available yet, please select Train_Test_split ⚠️⚠️⚠️")
    st.stop()
# ---------------------------------------------------------------------------
# Model selection, hyper-parameter input, training and test prediction.
# Regression and classification share one flow; they differ only in the model
# table, the evaluation method and the results table they use.
# ---------------------------------------------------------------------------
def _is_numeric_like(value):
    """Return True when `value` should be edited through a numeric input.

    Booleans and strings are excluded (they get their own widgets). None
    counts as numeric-like, because pandas converts it to NaN.
    """
    if isinstance(value, (bool, str)):
        return False
    try:
        pd.to_numeric(pd.Series([value], dtype=object))
    except (ValueError, TypeError):
        return False
    return True


def _collect_hyperparameters(estimator, option):
    """Render one input widget per hyper-parameter of `estimator`.

    Widgets are grouped as numeric inputs, then text inputs, then boolean
    select boxes. Parameters whose default is neither a number, a string, a
    boolean nor None (for example nested estimators) get no widget and keep
    their default, because a text box would turn them into strings.

    Args:
        estimator: object exposing get_params(), as the entries of the model
            tables do.
        option: model name, appended to each widget label to keep the labels
            of different models distinct.

    Returns:
        dict mapping parameter name to the entered value; entries left empty
        (None / NaN) are omitted so the estimator keeps its default for them.
    """
    defaults = estimator.get_params()
    edited = {}
    for name, value in defaults.items():
        if _is_numeric_like(value):
            edited[name] = st.number_input(f'input "{name}" value \n{option}', value=value)
    for name, value in defaults.items():
        if isinstance(value, str):
            edited[name] = st.text_input(f'input "{name}" value \n{option}', value=value)
    for name, value in defaults.items():
        if isinstance(value, bool):
            st.write("default value to insert", value)
            # Options are [False, True], so int(default) is the default's position.
            edited[name] = st.selectbox(f'input "{name}" value \n{option}', [False, True], index=int(value))
    return {name: value for name, value in edited.items() if not pd.isna(value)}


ml_cat = st.radio("Select Machine Learning Category",["Regression","Classification"],index =0)
if ml_cat == "Regression":
    model_table = models.Regression_models
    results_attr = "reg_evaluation_df"
    method_name_selector = st.selectbox("Select Error Evaluation Method",evaluationer.method_df.index,index = 0)
    method = evaluationer.method_df.loc[method_name_selector].values[0]
else:
    model_table = models.Classification_models
    results_attr = "classification_evaluation_df"
    # Classification passes no evaluation method.
    method = None

# One checkbox per available model; ticking it reveals its hyper-parameters.
selected_algorithms = []
for option in model_table.index:
    if st.checkbox(option):
        estimator = model_table.loc[option].values[0]
        estimator.set_params(**_collect_hyperparameters(estimator, option))
        selected_algorithms.append(estimator)

if st.button(f"Train {ml_cat} Model"):
    for algorithm in selected_algorithms:
        # The two call forms are kept as they were for each category.
        if ml_cat == "Regression":
            evaluationer.evaluation(f"{algorithm} baseline",X_train,X_Val,y_train,y_val,algorithm,method,"reg")
        else:
            evaluationer.evaluation(f"{algorithm} baseline",X_train,X_Val,y_train,y_val,algorithm,method,eva ="class")
    # Looked up after training, in case evaluation() rebinds the results table.
    st.write(f"{ml_cat} Model Trained Successfully", getattr(evaluationer, results_attr))

if len(test) > 0 and st.radio("Predict",["Yes","No"],index = 1) == "Yes":
    results = getattr(evaluationer, results_attr)
    if len(results) > 0:
        # Defaults to the most recently added row of the results table.
        a = st.number_input("select index of best algorithm for test prediction",min_value = 0,max_value =len(results) -1, value = len(results) -1)
        test_prediction = results.loc[a,"model"].predict(test)

        # Undo whichever target transformation was applied before training.
        if select_target_trans == "Yes":
            if selected_transformation == "Log Transformation":
                if log_selected_transformation == "Natural Log base(e)":
                    test_prediction = np.exp(test_prediction)
                    st.write("Natural Log base(e) Inverse Transformation Completed ✅")
                elif log_selected_transformation == "Log base 10":
                    test_prediction = np.power(10,test_prediction)
                    st.write("Log base 10 Inverse Transformation Completed ✅")
                elif log_selected_transformation == "Log base (2)":
                    test_prediction = np.power(2,test_prediction)
                    st.write("Log base 2 Inverse Transformation Completed ✅")
            elif selected_transformation == "Power Transformation":
                if power_selected_transformation == "Square Root":
                    test_prediction = np.power(test_prediction,2)
                    st.write("Square root Inverse Transformation Completed ✅")
                elif power_selected_transformation == "Other":
                    test_prediction = test_prediction**(power_value)
                    st.write(f"power root of {power_value} Inverse Transformation Completed ✅")

        # Rows may have been dropped from `test` since the ID column was
        # captured, so keep only the IDs of the rows that were predicted.
        submission_index = pd.Index(submission_id.loc[test.index])
        submission_file = pd.DataFrame(index = submission_index,data = test_prediction,columns = [selected_column])
        st.write("Sample of Prediction File",submission_file.head())
        csv_prediction = submission_file.to_csv()
        if st.radio("Download Prediction File as CSV File ? ",["Yes","No"],index = 1) == "Yes":
            st.download_button(label="Download Prediction CSV File",data=csv_prediction,file_name='prediction.csv',mime='text/csv')