Spaces:

singhk28
/

nocodeml

Sleeping

App Files Files Community

nocodeml / streamlit_app.py

singhk28

first commit

81adc8e over 1 year ago

raw

history blame

No virus

10.4 kB

	# Module Imports
	import pandas as pd
	import numpy as np
	import streamlit as st
	from pycaret import regression as reg
	from pycaret import classification as clf
	from sklearn.metrics import mean_absolute_error, max_error, r2_score, mean_squared_error
	import matplotlib.pyplot as plt
	import streamlit.components.v1 as components
	import mpld3
	# ---------------------------------------------------------------------------------------------------------------------- #
	# Collecting User Input
	# The widgets below populate the module-level names consumed by the rest of
	# the script: target_col, mod_type, desired_value and uploaded_file.
	def _section_heading(title):
	    """Render a step title as a black, 20px <h3> element."""
	    st.markdown(f'<h3 style="color:#000000;font-size:20px;">{title}</h3>', unsafe_allow_html=True)

	# Page title followed by the three introductory paragraphs.
	st.markdown('<h1 style="color:#0096FF;font-size:54px;">No Code ML</h1>', unsafe_allow_html=True)
	for _intro_paragraph in (
	    "This tool prepares a machine learning model, using your tabular data, from scratch. The model is then used to make predictions for various combinations of the provided data to try to obtain a combination that achieves the desired target value (if possible).",
	    "To use this tool, fill out all the requested fields from top to bottom.",
	    "Note: If an error is obtained refresh the page and start over.",
	):
	    st.markdown(_intro_paragraph)

	# Step 1: name of the column the model should predict.
	_section_heading("1) Provide name of the column you want to predict with model.")
	target_col = st.text_input("Enter the exact name of the column with your target variable. This field is case sensitive. (i.e., capital letters must match.)")

	# Step 2: kind of model to train.
	_section_heading("2) Select type of model you would like to build")
	mod_type = st.selectbox(
	    "What type of model would you like to train? Pick regression model for continous values and classifier for categorical values.",
	    ('regression', 'classifier'),
	)

	# Step 3: desired target value; text for a classifier, a float for regression.
	_section_heading("3) What is the desired value?")
	if mod_type != 'regression':
	    desired_value = st.text_input("Enter the desired target parameter value. This field is case sensitive. (i.e., capital letters must match.)", key="DV for Classifier")
	else:
	    desired_value = float(st.number_input("Enter the desired value for the target variable."))

	# Step 4: the CSV dataset itself.
	_section_heading("4) Upload CSV file ")
	uploaded_file = st.file_uploader("Upload a CSV file", type="csv")

	# ---------------------------------------------------------------------------------------------------------------------- #
	if uploaded_file:
	    # Main pipeline, executed once a CSV has been uploaded:
	    #   1. load and validate the data, show a preview and summary statistics;
	    #   2. (regression only) set up PyCaret, pick, tune and finalize a model;
	    #   3. evaluate on a hold-out split when more than 20 rows are available;
	    #   4. sample random feature combinations and propose the ones whose
	    #      prediction lands closest to the desired target value.
	    data = pd.read_csv(uploaded_file)

	    if target_col not in data.columns:
	        st.error("ERROR: Provided name of the target column is not in the CSV file. Please make sure you provide the exact match (case sensitive).Please provide the correct label and try again.")
	        # st.stop() is Streamlit's supported way to end the current script
	        # run; the interactive-shell helper exit() is not meant for apps.
	        st.stop()

	    st.subheader("Data preview:")
	    st.write(data.head())
	    st.subheader("Statistical Summary of the Provided Data:")
	    st.write(data.describe())

	    # Prepare Train/Test Split (80/20, fixed seed so reruns are repeatable).
	    train_data = data.sample(frac=0.8, random_state=0)
	    test_data = data.drop(train_data.index)
	    # The hold-out split is only used when there are more than 20 rows;
	    # smaller datasets are trained on every row and skip the evaluation.
	    has_holdout = len(data) > 20

	    # ------------------------------------------------------------------ #
	    # Figure out Column Data Types
	    object_columns = data.select_dtypes(include="object").columns.tolist()

	    # Build Regression Model
	    if mod_type == "regression":
	        # Setup Regressor Problem
	        setup_kwargs = {
	            "target": target_col,
	            "log_experiment": True,
	            "normalize": True,
	            "silent": True,
	            "experiment_name": 'No_code_ML',
	        }
	        if object_columns:
	            # NOTE(review): fold=20 is only passed when string columns are
	            # present (as in the original code) - confirm this asymmetry
	            # with the numeric-only path is intended.
	            setup_kwargs.update(categorical_features=object_columns, fold=20)
	        reg.setup(train_data if has_holdout else data, **setup_kwargs)

	        # Find the best algorithm to build Model:
	        st.subheader("Algorithm Selection")
	        with st.spinner(text="Finding the best algorithm for your dataset..."):
	            best_mod = reg.compare_models()
	            regression_results = reg.pull()
	            # First row of the comparison table; .iloc avoids relying on
	            # positional fallback of [] on a label-indexed Series.
	            best_mod_name = regression_results["Model"].iloc[0]
	            st.write(regression_results)

	        # Tune the hyperparameters for the best algorithm:
	        st.subheader("Tuning the Model")
	        with st.spinner(text="Tuning the algorithm..."):
	            tuned_mod = reg.tune_model(best_mod, optimize='RMSE', n_iter=25)

	        # Finalize the model (Train on the entire train dataset):
	        with st.spinner("Finalizing the model..."):
	            final_mod = reg.finalize_model(tuned_mod)

	        st.success('Model successfully trained! Here are your results:')
	        st.write('Best algorithm: ', best_mod_name)
	        st.write('Best hyperparameters: ', final_mod.get_params())

	        # Print a SHAP Analysis Summary Plot:
	        # NOTE(review): interpret_model presumably supports only some
	        # estimator families - confirm behaviour when another one wins.
	        st.subheader("SHAP Analysis Summary Plot")
	        st.pyplot(reg.interpret_model(final_mod))

	        if has_holdout:
	            # Predict on the test set if it was created:
	            st.subheader("Evaluating model on the test/hold out data:")
	            predictions = reg.predict_model(final_mod, data=test_data)
	            st.success('Here are your results:')
	            st.write(predictions)
	            st.caption('"Label" is the value predicted by the model.')

	            # Accuracy of predictions:
	            y_true = predictions[target_col]
	            y_pred = predictions['Label']
	            err_dict = {
	                'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
	                # sqrt(MSE) instead of squared=False, which newer
	                # scikit-learn releases no longer accept.
	                'Root Mean Squared Error': np.sqrt(mean_squared_error(y_true, y_pred)),
	                'Maximum Error': max_error(y_true, y_pred),
	                # Previously computed but never displayed.
	                'R2 Score': r2_score(y_true, y_pred),
	            }
	            st.write(pd.DataFrame(err_dict, index=[0]))

	            # Create a true vs. predicted plot
	            fig = plt.figure(figsize=(8, 8))
	            # Passed positionally: the keyword was renamed from `b` to
	            # `visible` in newer Matplotlib releases.
	            plt.grid(None)
	            plt.scatter(x=y_true, y=y_pred)
	            plt.xlabel("True Value", fontsize=18)
	            plt.ylabel("Predicted Value", fontsize=18)
	            fig_html = mpld3.fig_to_html(fig)
	            # Release the figure so pyplot does not accumulate one per rerun.
	            plt.close(fig)
	            components.html(fig_html, height=1000)

	        # -------------------------------------------------------------- #
	        # Use Trained Model to Explore Parameter Space
	        st.subheader("Using the Trained Model to Optimize Target Variable:")
	        if object_columns:
	            st.write("Optimization with string data types not currently supported.")
	        else:
	            with st.spinner("Generating Parameter Combinations for Desired Value of the Target Variable"):
	                # Every column except the target is a feature, wherever the
	                # target happens to sit in the CSV.
	                feature_cols = [col for col in data.columns if col != target_col]

	                # Accept predictions within a fifth of the target's standard
	                # deviation on either side of the desired value.
	                data_spread = data[target_col].std() / 5
	                dv_min = desired_value - data_spread
	                dv_max = desired_value + data_spread

	                # Draw random candidates inside the observed [min, max] range
	                # of each feature: continuous draws for float columns,
	                # inclusive integer draws otherwise (high + 1 because
	                # randint's upper bound is exclusive, which also keeps
	                # constant columns from raising).
	                n_candidates = 10000
	                generated = {}
	                for col in feature_cols:
	                    low, high = data[col].min(), data[col].max()
	                    if pd.api.types.is_float_dtype(data[col]):
	                        generated[col] = np.random.uniform(low, high, size=n_candidates)
	                    else:
	                        generated[col] = np.random.randint(int(low), int(high) + 1, size=n_candidates)
	                generated_data_df = pd.DataFrame(generated, columns=feature_cols)

	                # Predict for the candidates and rank them by distance from
	                # the desired value (measured on the candidates' own
	                # predictions, not on the hold-out predictions).
	                generated_predictions = reg.predict_model(final_mod, data=generated_data_df)
	                generated_predictions['distance_to_dv'] = np.abs(generated_predictions['Label'] - desired_value)
	                within_tolerance = generated_predictions['Label'].between(dv_min, dv_max)
	                # Chained, non-inplace calls avoid mutating a filtered slice.
	                final_proposed_parameters = (
	                    generated_predictions[within_tolerance]
	                    .sort_values('distance_to_dv')
	                    .head(10)
	                    .reset_index(drop=True)
	                )

	            if final_proposed_parameters.empty:
	                st.write("No parameters could be found for the desired value based on current model. Try collecting additional data or provide a different target value.")
	            else:
	                st.write(final_proposed_parameters)
	                st.download_button(label="Download the Proposed Parameters to Try", data=final_proposed_parameters.to_csv(index=False), file_name='Final_proposed_parameters.csv')

	# ---------------------------------------------------------------------------------------------------------------------- #
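	# Build Classifier Model (disabled draft: only mod_type == "regression" is handled by the active code, so selecting 'classifier' currently trains nothing)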
	# if mod_type == "classifier":
	# # Setup Classifier Problem
	# s = clf.setup(train_data, target = target_col, log_experiment=True, normalize=True, silent= True, experiment_name = 'QD_ML')

	# # Compare Model Performance:
	# st.subheader("Algorithm Selection")
	# with st.spinner(text="Finding the best algorithm for your model..."):
	# best_mod = clf.compare_models()
	# regression_results = clf.pull()

	# st.balloons()
	# st.success('Model successfully trained! Here are your results:')
	# st.write(regression_results)

	# # Print a SHAP Analysis Summary Plot:
	# st.subheader("SHAP Analysis Summary Plot")
	# st.pyplot(clf.interpret_model(best_mod))