Spaces:

arpitr
/

end_to_end_ml_app

Sleeping

App Files Files Community

end_to_end_ml_app / app.py

arpitr

Added app.py

7c0398c over 2 years ago

raw

history blame contribute delete

7 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	from PIL import Image
	from pydataset import data
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import confusion_matrix
	from sklearn.metrics import classification_report
	from sklearn.metrics import accuracy_score
	from sklearn.metrics import precision_recall_fscore_support
	from sklearn.preprocessing import StandardScaler
	from sklearn.linear_model import LinearRegression
	from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
	from sklearn.linear_model import LogisticRegression
	from sklearn.svm import SVC
	from sklearn.gaussian_process import GaussianProcessClassifier
	from sklearn.gaussian_process.kernels import RBF
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
	from sklearn.naive_bayes import GaussianNB
	from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

	#import pandas as pd

	# Page heading shown above every view of the app.
	st.header('Try end to end predictive modeling on different datasets')

	# Overview of the intended workflow, rendered as a bulleted info box.
	st.info("""
	- Pick the dataset
	- Validate the dataset
	- Prepare the dataset (Impute/Scaling/Categorical Encoding/Imbalance)
	- Pick the Machine Learning Algorithm
	- Analyse the results (Accuracy, MAE, Recall, Precision, F1)
	""")

	class MlStreamlitApp:
	    """Streamlit app for exploring and modelling ``pydataset`` datasets.

	    The sidebar selects a dataset and a mode. 'EDA' shows summary tables
	    for the dataset; 'Predictive Modelling' one-hot encodes the features,
	    trains a single scikit-learn estimator on an 80/20 train/test split
	    and writes evaluation metrics to the page.
	    """

	    def __init__(self):
	        # ``data()`` called without arguments yields the listing that
	        # ``show_datasets_list`` displays; ``run`` reads its ``dataset_id``
	        # column to populate the dataset selector.
	        self.dataset_list = data()

	    @st.cache(suppress_st_warning=True)
	    def load_data(self, dataset_name):
	        """Load ``dataset_name`` and normalise its column names.

	        Dots in column names are replaced by underscores and the names are
	        lower-cased.

	        Args:
	            dataset_name: Identifier passed to ``pydataset.data``.

	        Returns:
	            The loaded frame with normalised column names.
	        """
	        df = data(str(dataset_name))
	        # regex=False: '.' must be a literal dot. The default for ``regex``
	        # differs between pandas releases, so state it explicitly.
	        df.columns = df.columns.str.replace('.', '_', regex=False)
	        df.columns = df.columns.str.lower()
	        return df

	    def show_datasets_list(self):
	        """Write the listing of available datasets to the page."""
	        st.info('Datasets details')
	        st.write(self.dataset_list)

	    def run(self):
	        """Render the sidebar controls and dispatch to the selected view."""
	        st.sidebar.title('Streamlit ML App')
	        # The leading '' entry stands for "nothing selected yet".
	        dataset_choices = [''] + list(self.dataset_list.dataset_id)
	        dataset_name = st.sidebar.selectbox('Select the Dataset', dataset_choices)
	        process_selection = st.sidebar.radio("What on your mind?", ('EDA', 'Predictive Modelling'))

	        if dataset_name == '':
	            st.sidebar.warning('Select the Dataset')
	            self.show_datasets_list()
	        elif dataset_name and process_selection == 'Predictive Modelling':
	            self._run_modelling(dataset_name)
	        else:
	            self._run_eda(dataset_name)

	    def _run_modelling(self, dataset_name):
	        """Train the selected estimator on ``dataset_name`` and report metrics."""
	        df = self.load_data(dataset_name)
	        st.write(df.head())
	        dataset_target = st.selectbox('Select the Target', list(df.columns))

	        # Split target and features directly instead of renaming the chosen
	        # column to 'target', which produced duplicate labels whenever the
	        # dataset already had a different column with that name.
	        y = df[dataset_target]
	        X = pd.get_dummies(df.drop(columns=[dataset_target]), drop_first=True)

	        # Drop incomplete rows up front so every estimator sees the same,
	        # NaN-free data instead of failing inside ``fit``.
	        complete = X.notna().all(axis=1) & y.notna()
	        if not complete.all():
	            dropped = int((~complete).sum())
	            st.warning(f'Dropped {dropped} row(s) with missing values before training')
	            X = X.loc[complete]
	            y = y.loc[complete]
	        if len(X) < 2:
	            st.error('Not enough complete rows to train a model')
	            return

	        # Any non-numeric target (object, string, categorical) and boolean
	        # targets are class labels; only real numbers go to regression.
	        is_regression = (
	            pd.api.types.is_numeric_dtype(y)
	            and not pd.api.types.is_bool_dtype(y)
	        )
	        if is_regression:
	            algo_type = 'Regression'
	            estimators = {'LinearRegression': LinearRegression}
	        else:
	            algo_type = 'Classification'
	            estimators = {
	                'LogisticRegression': LogisticRegression,
	                'DecisionTreeClassifier': DecisionTreeClassifier,
	                'RandomForestClassifier': RandomForestClassifier,
	                'AdaBoostClassifier': AdaBoostClassifier,
	            }

	        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

	        ml_algo = st.selectbox('Select the ML Algo', list(estimators))
	        estimator_cls = estimators.get(ml_algo)
	        if estimator_cls is None:
	            # Nothing was fitted, so there are no predictions to evaluate.
	            st.write('No ML Algo selected')
	            return

	        clf_fit = estimator_cls().fit(X_train, y_train)
	        predictions = clf_fit.predict(X_test)
	        # NOTE: slice kept as in the original; it previews the 2nd-5th
	        # predictions, not the first one.
	        st.write(predictions[1:5])

	        if algo_type == 'Classification':
	            self._report_classification(y_test, predictions)
	        else:
	            self._report_regression(y_test, predictions)

	    def _report_classification(self, y_test, predictions):
	        """Write confusion matrix, accuracy and weighted precision/recall/F1."""
	        st.write("\n\tConfusion Matrix\n\t")
	        st.write(confusion_matrix(y_test, predictions))
	        st.write("\n\t#### Accuracy Score:\n\t")
	        st.write(accuracy_score(y_test, predictions))
	        st.write("\n\t#### Other Scores - precision_recall_fscore:\n\t")
	        # With average='weighted' the support slot carries no per-class
	        # information that is displayed, so it is discarded.
	        precision, recall, f1_score, _ = precision_recall_fscore_support(
	            y_test, predictions, average='weighted'
	        )
	        st.write(round(precision, 2), round(recall, 2), round(f1_score, 2))

	    def _report_regression(self, y_test, predictions):
	        """Write R squared, MSE, RMSE and MAE for the test predictions."""
	        st.write(""" ### Model Evaluation """)
	        r2_metrics = r2_score(y_test, predictions)
	        mse = mean_squared_error(y_test, predictions)
	        rmse = np.sqrt(mse)
	        mae = mean_absolute_error(y_test, predictions)
	        st.write(""" #### Rsquared, MSE , RMSE, MAE """)
	        st.write(round(r2_metrics, 2), round(mse, 2), round(rmse, 2), round(mae, 2))

	    def _run_eda(self, dataset_name):
	        """Show the exploratory view chosen in the sidebar for ``dataset_name``."""
	        eda_selection = st.sidebar.radio("What you want to see?", ('Summary', 'Plots'))
	        df = self.load_data(dataset_name)
	        if eda_selection == 'Summary':
	            st.write("Glimpse of Data", df.head(10))
	            st.write("Total No. of Rows", df.shape[0])
	            st.write("Total No. of Columns", df.shape[1])
	            st.write("Types of Columns", df.dtypes)
	            st.write("Summary Stats", df.describe().T)
	            st.write("Total Nulls in the columns", df.isnull().sum())
	            st.write("Total Duplicate Rows", df[df.duplicated()].shape[0])
	            # Restrict to numeric/bool columns first: ``corr`` on a frame
	            # with text columns raises in pandas releases where
	            # ``numeric_only`` defaults to False.
	            numeric_df = df.select_dtypes(include=['number', 'bool'])
	            st.write("Correlation Matrix", numeric_df.corr())
	        else:
	            st.info('Plots')  # WIP


	if __name__ == "__main__":
	    # Script entry point: construct the app and render the page.
	    app = MlStreamlitApp()
	    app.run()