Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
from PIL import Image | |
from pydataset import data | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import confusion_matrix | |
from sklearn.metrics import classification_report | |
from sklearn.metrics import accuracy_score | |
from sklearn.metrics import precision_recall_fscore_support | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.linear_model import LinearRegression | |
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.svm import SVC | |
from sklearn.gaussian_process import GaussianProcessClassifier | |
from sklearn.gaussian_process.kernels import RBF | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier | |
from sklearn.naive_bayes import GaussianNB | |
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis | |
#import pandas as pd | |
# Page heading followed by an outline of the workflow steps shown to the user.
st.header('Try end to end predictive modeling on different datasets')
st.info("""
- Pick the dataset
- Validate the dataset
- Prepare the dataset (Impute/Scaling/Categorical Encoding/Imbalance)
- Pick the Machine Learning Algorithm
- Analyse the results (Accuracy, MAE, Recall, Precision, F1)
""")
class MlStreamlitApp:
    """Streamlit page for exploring a dataset and fitting a baseline model.

    The user picks a dataset in the sidebar and then either views summary
    statistics (EDA) or trains a scikit-learn estimator on a chosen target
    column and sees its evaluation metrics.
    """

    def __init__(self):
        # ``data()`` with no arguments is used as the dataset catalogue; its
        # ``dataset_id`` column feeds the sidebar selector in ``run``.
        self.dataset_list = data()

    @staticmethod
    def _normalise_columns(df):
        """Rename the columns of *df* in place: dots become underscores, lower-cased.

        Returns the same DataFrame for convenience.
        """
        # regex=False is essential: under regex matching '.' matches *every*
        # character, which would turn each column name into a run of '_'.
        df.columns = df.columns.str.replace('.', '_', regex=False).str.lower()
        return df

    @staticmethod
    def _infer_algo_type(target):
        """Choose the task type and the selectable algorithms for *target*.

        Returns a ``(algo_type, ml_algos)`` tuple. Any non-numeric target
        (object, string, categorical) is treated as a classification
        problem; numeric targets are treated as regression.
        """
        if not pd.api.types.is_numeric_dtype(target):
            return 'Classification', [
                'LogisticRegression',
                'DecisionTreeClassifier',
                'RandomForestClassifier',
                'AdaBoostClassifier',
            ]
        return 'Regression', ['LinearRegression']

    @staticmethod
    def _numeric_corr(df):
        """Return the correlation matrix of the numeric/bool columns of *df*.

        Non-numeric columns are excluded up front because ``DataFrame.corr``
        raises on them in recent pandas versions.
        """
        return df.select_dtypes(include=['number', 'bool']).corr()

    @staticmethod
    def _make_estimator(ml_algo):
        """Return a fresh, default-configured estimator for *ml_algo*, or None."""
        # Built at call time so the class body itself does not depend on
        # scikit-learn names being resolvable at definition time.
        estimators = {
            'LogisticRegression': LogisticRegression,
            'DecisionTreeClassifier': DecisionTreeClassifier,
            'RandomForestClassifier': RandomForestClassifier,
            'AdaBoostClassifier': AdaBoostClassifier,
            'LinearRegression': LinearRegression,
        }
        factory = estimators.get(ml_algo)
        return factory() if factory is not None else None

    def load_data(self, dataset_name):
        """Load the dataset called *dataset_name* and normalise its column names."""
        df = data(str(dataset_name))
        return self._normalise_columns(df)

    # NOTE: a cached ``show_dataset_doc`` helper (rendering
    # ``data(name, show_doc=True)``) was sketched here but never enabled.

    def show_datasets_list(self):
        """Render the catalogue of available datasets."""
        st.info('Datasets details')
        st.write(self.dataset_list)

    def run(self):
        """Draw the sidebar controls and dispatch to the selected workflow."""
        st.sidebar.title('Streamlit ML App')
        # The leading empty entry represents "nothing selected yet".
        dataset_choices = [''] + list(self.dataset_list.dataset_id)
        dataset_name = st.sidebar.selectbox('Select the Dataset', dataset_choices)
        process_selection = st.sidebar.radio(
            "What on your mind?", ('EDA', 'Predictive Modelling'))

        if dataset_name == '':
            st.sidebar.warning('Select the Dataset')
            self.show_datasets_list()
        elif dataset_name and process_selection == 'Predictive Modelling':
            self._run_modelling(dataset_name)
        else:
            self._run_eda(dataset_name)

    def _run_modelling(self, dataset_name):
        """Train the chosen model on *dataset_name* and display its metrics."""
        df = self.load_data(dataset_name)
        st.write(df.head())

        dataset_target = st.selectbox('Select the Target', list(df.columns))
        # Split the target off directly instead of renaming it to 'target',
        # which would clash with a dataset that already has such a column.
        y = df[dataset_target]
        X = pd.get_dummies(df.drop(columns=[dataset_target]), drop_first=True)
        if X.shape[1] == 0:
            st.warning('The dataset has no feature columns besides the target')
            return

        algo_type, ml_algos = self._infer_algo_type(y)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.20, random_state=42)

        ml_algo = st.selectbox('Select the ML Algo', list(ml_algos))
        estimator = self._make_estimator(ml_algo)
        if estimator is None:
            # Without a model there are no predictions to evaluate.
            st.write('No ML Algo selected')
            return

        predictions = estimator.fit(X_train, y_train).predict(X_test)
        # Preview a small sample of the predictions (elements 1 through 4).
        st.write(predictions[1:5])

        if algo_type == 'Classification':
            self._show_classification_metrics(y_test, predictions)
        else:
            self._show_regression_metrics(y_test, predictions)

    @staticmethod
    def _show_classification_metrics(y_test, predictions):
        """Display confusion matrix, accuracy and weighted precision/recall/F1."""
        st.write("""
Confusion Matrix
""")
        st.write(confusion_matrix(y_test, predictions))
        st.write("""
#### Accuracy Score:
""")
        st.write(accuracy_score(y_test, predictions))
        st.write("""
#### Other Scores - precision_recall_fscore:
""")
        precision, recall, f1_score, _ = precision_recall_fscore_support(
            y_test, predictions, average='weighted')
        st.write(round(precision, 2), round(recall, 2), round(f1_score, 2))

    @staticmethod
    def _show_regression_metrics(y_test, predictions):
        """Display R squared, MSE, RMSE and MAE for the regression predictions."""
        st.write(""" ### Model Evaluation """)
        r2_metrics = r2_score(y_test, predictions)
        mse = mean_squared_error(y_test, predictions)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, predictions)
        st.write(""" #### Rsquared, MSE , RMSE, MAE """)
        st.write(round(r2_metrics, 2), round(mse, 2), round(rmse, 2), round(mae, 2))

    def _run_eda(self, dataset_name):
        """Show summary statistics (or the plots placeholder) for *dataset_name*."""
        eda_selection = st.sidebar.radio("What you want to see?", ('Summary', 'Plots'))
        df = self.load_data(dataset_name)
        if eda_selection == 'Summary':
            st.write("Glimpse of Data", df.head(10))
            st.write("Total No. of Rows", df.shape[0])
            st.write("Total No. of Columns", df.shape[1])
            st.write("Types of Columns", df.dtypes)
            st.write("Summary Stats", df.describe().T)
            st.write("Total Nulls in the columns", df.isnull().sum())
            st.write("Total Duplicate Rows", df[df.duplicated()].shape[0])
            st.write("Correlation Matrix", self._numeric_corr(df))
        else:
            st.info('Plots')  # WIP
if __name__ == "__main__":
    # Script entry point: build the app object and render the page once.
    MlStreamlitApp().run()