import streamlit as st
import pandas as pd
import numpy as np
from PIL import Image
from pydataset import data
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
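# Page header and a short bullet list describing the modelling workflow.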
st.header('Try end-to-end predictive modeling on different datasets')
st.info("""
- Pick the dataset
- Validate the dataset
- Prepare the dataset (Impute/Scaling/Categorical Encoding/Imbalance)
- Pick the Machine Learning Algorithm
- Analyse the results (Accuracy, MAE, Recall, Precision, F1)
""")
class MlStreamlitApp:

    def __init__(self):
        self.dataset_list = data()  # , show_doc=True
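    # Load the selected pydataset table and normalise its column names.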
    def load_data(self, dataset_name):
        df = data(str(dataset_name))
        df.columns = df.columns.str.replace('.', '_', regex=False)  # literal dot, not a regex
        df.columns = df.columns.str.lower()
        return df
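    # Disabled helper kept for reference: would render a dataset's documentation page.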
    '''
    @st.cache(suppress_st_warning=True)
    def show_dataset_doc(self, dataset_name):
        st.write(dataset_name)
        st.code(data('iris', show_doc=True))
    '''
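    # Show the catalogue of all datasets available through pydataset.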
    def show_datasets_list(self):
        st.info('Datasets details')
        st.write(self.dataset_list)
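    # Main entry point: build the sidebar and route to EDA or predictive modelling.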
    def run(self):
        st.sidebar.title('Streamlit ML App')
        dataset_list = [''] + list(self.dataset_list.dataset_id)
        dataset_name = st.sidebar.selectbox('Select the Dataset', dataset_list)
        process_selection = st.sidebar.radio("What's on your mind?", ('EDA', 'Predictive Modelling'))
        if dataset_name == '':
            st.sidebar.warning('Select the Dataset')
            self.show_datasets_list()
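        # Predictive modelling flow: pick the target, one-hot encode the features,
        # split the data, fit the chosen estimator, and report evaluation metrics.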
        elif dataset_name and process_selection == 'Predictive Modelling':
            df = self.load_data(dataset_name)
            st.write(df.head())
            # image = Image.open('./ml_process.jpeg')
            # st.sidebar.image(image)
            dataset_target = st.selectbox('Select the Target', list(df.columns))
            df = df.rename(columns={dataset_target: 'target'})
            df_dum = pd.get_dummies(df.loc[:, df.columns != 'target'], drop_first=True)
            df = pd.concat([df_dum, df.target], axis=1)
            # algo_type = st.selectbox('Classification or Regression', list(['Classification', 'Regression']))
            if df.target.dtypes == 'object':
                algo_type = 'Classification'
                ml_algos = ['LogisticRegression', 'DecisionTreeClassifier', 'RandomForestClassifier', 'AdaBoostClassifier']
            else:
                algo_type = 'Regression'
                ml_algos = ['LinearRegression']
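            # Separate features and target, then hold out 20% of the rows for testing.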
            X = df.loc[:, df.columns != 'target']
            y = df['target']
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
            # st.write(X_train.head())
            # st.write(y_test.head())
            ml_algo = st.selectbox('Select the ML Algo', list(ml_algos))
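            # Fit the selected estimator with default hyperparameters and show a small sample of its predictions.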
            if ml_algo == 'LogisticRegression':
                clf_fit = LogisticRegression().fit(X_train, y_train)
                predictions = clf_fit.predict(X_test)
                st.write(predictions[1:5])
            elif ml_algo == 'DecisionTreeClassifier':
                clf_fit = DecisionTreeClassifier().fit(X_train, y_train)
                predictions = clf_fit.predict(X_test)
                st.write(predictions[1:5])
            elif ml_algo == 'RandomForestClassifier':
                clf_fit = RandomForestClassifier().fit(X_train, y_train)
                predictions = clf_fit.predict(X_test)
                st.write(predictions[1:5])
            elif ml_algo == 'AdaBoostClassifier':
                clf_fit = AdaBoostClassifier().fit(X_train, y_train)
                predictions = clf_fit.predict(X_test)
                st.write(predictions[1:5])
            elif ml_algo == 'LinearRegression':
                clf_fit = LinearRegression().fit(X_train, y_train)
                predictions = clf_fit.predict(X_test)
                st.write(predictions[1:5])
            else:
                st.write('No ML Algo selected')
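            # Evaluation: classification shows a confusion matrix, accuracy and weighted
            # precision/recall/F1; regression shows R-squared, MSE, RMSE and MAE.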
            if algo_type == 'Classification':
                st.write("""
Confusion Matrix
""")
                st.write(confusion_matrix(y_test, predictions))
                st.write("""
#### Accuracy Score:
""")
                st.write(accuracy_score(y_test, predictions))
                st.write("""
#### Other Scores - precision_recall_fscore:
""")
                precision, recall, f1_score, support = precision_recall_fscore_support(y_test, predictions, average='weighted')
                st.write(round(precision, 2), round(recall, 2), round(f1_score, 2))
            else:
                st.write(""" ### Model Evaluation """)
                r2_metrics = r2_score(y_test, predictions)
                mse = mean_squared_error(y_test, predictions)
                rmse = np.sqrt(mse)
                mae = mean_absolute_error(y_test, predictions)
                st.write(""" #### R-squared, MSE, RMSE, MAE """)
                st.write(round(r2_metrics, 2), round(mse, 2), round(rmse, 2), round(mae, 2))
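        # EDA flow: either a tabular summary of the dataset or (work-in-progress) plots.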
        else:
            eda_selection = st.sidebar.radio("What do you want to see?", ('Summary', 'Plots'))
            df = self.load_data(dataset_name)
            if eda_selection == 'Summary':
                st.write("Glimpse of Data", df.head(10))
                st.write("Total No. of Rows", df.shape[0])
                st.write("Total No. of Columns", df.shape[1])
                st.write("Types of Columns", df.dtypes)
                st.write("Summary Stats", df.describe().T)
                st.write("Total Nulls in the columns", df.isnull().sum())
                st.write("Total Duplicate Rows", df[df.duplicated()].shape[0])
                st.write("Correlation Matrix", df.corr(numeric_only=True))  # skip non-numeric columns
            else:
                st.info('Plots')  # WIP
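# Instantiate and launch the app when the script is run by Streamlit.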
if __name__ == '__main__':
    mlApp = MlStreamlitApp()
    mlApp.run()