import streamlit as st import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import RandomForestRegressor, VotingRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.linear_model import LinearRegression from sklearn.neighbors import KNeighborsRegressor from sklearn.svm import SVR from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score from sklearn.neural_network import MLPRegressor from lightgbm import LGBMRegressor from xgboost import XGBRegressor st.title('Kidney Disease Prediction Application') st.write(''' Please fill in the attributes below, then hit the Predict button to get your results. ''') st.header('Input Attributes') age = st.slider('Your Age (Years)', min_value=0.0, max_value=100.0, value=50.0, step=1.0) st.write(''' ''') bp = st.slider('Blood Pressure (mm/Hg)', min_value=0.0, max_value=200.0, value=150.0, step=1.0) st.write(''' ''') sg = st.slider('Specific Gravity (SG)', min_value=1.005, max_value=1.025, value=1.015, step=0.005) st.write(''' ''') al = st.slider('Albumin Level (g/L)', min_value=0.0, max_value=5.0, value=2.0, step=1.0) st.write(''' ''') sugar = st.slider('Sugar Level', min_value=0.0, max_value=5.0, value=2.0, step=1.0) st.write(''' ''') rbc = st.radio("Red Blood Cell Count", ('Normal', 'Abnormal')) st.write(''' ''') if rbc == "Normal": rbc = 0 else: rbc = 1 pc = st.radio("Pus Cell Count", ('Normal', 'Abnormal')) st.write(''' ''') if pc == "Normal": pc = 0 else: pc = 1 pcc = st.radio("Pus Cell Clumps", ('Present', 'Not Present')) st.write(''' ''') if pcc == "Present": pcc = 1 else: pcc = 0 ba = st.radio("Bacterial Infection", ('Present', 'Not Present')) st.write(''' ''') if ba == "Present": ba = 1 else: ba = 0 bgr = st.slider('Blood Glucose Random (mgs/dl)', min_value=0.0, max_value=600.0, value=300.0, step=1.0) st.write(''' ''') bu = st.slider('Blood Urea (mgs/dl)', min_value=0.0, max_value=500.0, value=250.0, step=0.1) st.write(''' ''') sc = st.slider('Serum Creatinine (mgs/dl)', min_value=0.0, max_value=100.0, value=50.0, step=0.1) st.write(''' ''') sod = st.slider('Sodium (mEq/L)', min_value=0.0, max_value=200.0, value=100.0, step=0.1) st.write(''' ''') pot = st.slider('Potassium (mEq/L)', min_value=0.0, max_value=100.0, value=50.0, step=0.1) st.write(''' ''') hemo = st.slider('Hemoglobin (gms)', min_value=0.0, max_value=20.0, value=10.0, step=0.1) st.write(''' ''') pcv = st.slider('Packed Cell Volume', min_value=0.0, max_value=100.0, value=50.0, step=0.1) st.write(''' ''') wbc = st.slider('White Blood Cell Count (cells/cumm)', min_value=0.0, max_value=50000.0, value=25000.0, step=1.0) st.write(''' ''') rbcc = st.slider('Red Blood Cell Count (millions/cmm)', min_value=0.0, max_value=200.0, value=100.0, step=1.0) st.write(''' ''') htn = st.radio("Hypertension", ('Yes', 'No')) st.write(''' ''') if htn == "Yes": htn = 1 else: htn = 0 dm = st.radio("Diabetes Mellitus", ('Yes', 'No')) st.write(''' ''') if dm == "Yes": dm = 1 else: dm = 0 cad = st.radio("Coronary Artery Disease", ('Yes', 'No')) st.write(''' ''') if cad == "Yes": cad = 1 else: cad = 0 appet = st.radio("Appetite", ('Good', 'Poor')) st.write(''' ''') if appet == "Good": appet = 1 else: appet = 0 pe = st.radio("Pedal Edema", ('Yes', 'No')) st.write(''' ''') if pe == "Yes": pe = 1 else: pe = 0 ane = st.radio("Anemia", ('Yes', 'No')) st.write(''' ''') if ane == "Yes": ane = 1 else: ane = 0 selected_models = st.multiselect("Choose Regression Models", ('Random Forest', 'Linear Regression', 'K-Nearest Neighbors', 'Decision Tree', 'Gradient Boosting Regression', 'XGBoost Regression', 'LightGBM Regression')) st.write(''' ''') # Initialize an empty list to store the selected models models_to_run = [] # Check which models were selected and add them to the models_to_run list if 'Random Forest' in selected_models: models_to_run.append(RandomForestRegressor()) if 'Linear Regression' in selected_models: models_to_run.append(LinearRegression()) if 'K-Nearest Neighbors' in selected_models: models_to_run.append(KNeighborsRegressor()) if 'Decision Tree' in selected_models: models_to_run.append(DecisionTreeRegressor()) if 'Support Vector Machine' in selected_models: models_to_run.append(SVR()) if 'Gradient Boosting Regression' in selected_models: models_to_run.append(GradientBoostingRegressor()) if 'XGBoost Regression' in selected_models: models_to_run.append(XGBRegressor()) if 'LightGBM Regression' in selected_models: models_to_run.append(LGBMRegressor()) if 'Neural Network (MLP) Regression' in selected_models: models_to_run.append(MLPRegressor()) user_input = np.array([age, bp, sg, al, sugar, rbc, pc, pcc, ba, bgr, bu, sc, sod, pot, hemo, pcv, wbc, rbcc, htn, dm, cad, appet, pe, ane]).reshape(1, -1) # import dataset def get_dataset(): data = pd.read_csv('kidney.csv') return data def generate_model_labels(model_names): model_labels = [] for name in model_names: words = name.split() if len(words) > 1: # Multiple words, use initials label = "".join(word[0] for word in words) else: # Single word, take the first 3 letters label = name[:3] model_labels.append(label) return model_labels if st.button('Submit'): df = get_dataset() # fix column names df.columns = (["id", "age", "bp", "sg", "al", "su", "rbc", "pc", "pcc", "ba", "bgr", "bu", "sc", "sod", "pot", "hemo", "pcv", "wc", "rc", "htn", "dm", "cad", "appet", "pe", "ane", "class"]) # Transforming classification into numerical format df['class'] = df['class'].apply(lambda x: 1 if x == 'ckd' else 0) # Transforming ane into numerical format df['ane'] = df['ane'].apply(lambda x: 1 if x == 'yes' else 0) # Transforming pe into numerical format df['pe'] = df['pe'].apply(lambda x: 1 if x == 'yes' else 0) # Transforming appet into numerical format df['appet'] = df['appet'].apply(lambda x: 1 if x == 'poor' else 0) # Transforming cad into numerical format df['cad'] = df['cad'].apply(lambda x: 1 if x == 'yes' else 0) # Transforming dm into numerical format df['dm'] = df['dm'].apply(lambda x: 1 if x == 'yes' else 0) # Transforming htn into numerical format df['htn'] = df['htn'].apply(lambda x: 1 if x == 'yes' else 0) # Transforming ba into numerical format df['ba'] = df['ba'].apply(lambda x: 1 if x == 'present' else 0) # Transforming pcc into numerical format df['pcc'] = df['pcc'].apply(lambda x: 1 if x == 'present' else 0) # Transforming pc into numerical format df['pc'] = df['pc'].apply(lambda x: 1 if x == 'abnormal' else 0) # Transforming rbc into numerical format df['rbc'] = df['rbc'].apply(lambda x: 1 if x == 'abnormal' else 0) # Replace NaN values with median for float columns float_columns = df.select_dtypes(include=['float']).columns df[float_columns] = df[float_columns].fillna(df[float_columns].median()) # Convert columns to numeric numeric_columns = ['pcv', 'wc', 'rc'] df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce') # Replace NaN values with median for numeric columns df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median()) # Split the dataset into train and test X = df.drop(['class','id'], axis=1) y = df['class'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Create two columns to divide the screen left_column, right_column = st.columns(2) # Left column content with left_column: # Create a VotingRegressor with the selected models ensemble = VotingRegressor( estimators=[('rf', LGBMRegressor()), ('gb', GradientBoostingRegressor()), ('xb', XGBRegressor())] ) # Fit the voting regressor to the training data ensemble.fit(X_train, y_train) # Make predictions on the test set model_predictions = ensemble.predict(user_input) # Evaluate the model's performance on the test set ensemble_r2 = r2_score(y_test, ensemble.predict(X_test)) ensemble_mse = mean_squared_error(y_test, ensemble.predict(X_test)) ensemble_mae = mean_absolute_error(y_test, ensemble.predict(X_test)) ensemble_rmse = np.sqrt(ensemble_mse) st.write(f'According to Ensemble Model, Your Diabetes Risk Score is: {model_predictions[0]:.2f}') st.write('Ensemble Model R-squared (R2) Score:', ensemble_r2) st.write('Ensemble Model Root Mean Squared Error (RMSE):', ensemble_rmse) st.write('Ensemble Model Mean Squared Error (MSE):', ensemble_mse) st.write('Ensemble Model Mean Absolute Error (MAE):', ensemble_mae) st.write('------------------------------------------------------------------------------------------------------') # Right column content with right_column: # Initialize lists to store model names and their respective performance metrics model_names = ['Ensemble'] r2_scores = [ensemble_r2] rmses = [ensemble_rmse] mses = [ensemble_mse] maes = [ensemble_mae] for model in models_to_run: # Train the selected model model.fit(X_train, y_train) # Make predictions on the test set model_predictions = model.predict(user_input) # Evaluate the model's performance on the test set model_mse = mean_squared_error(y_test, model.predict(X_test)) model_mae = mean_absolute_error(y_test, model.predict(X_test)) rmse = np.sqrt(model_mse) model_r2 = r2_score(y_test, model.predict(X_test)) st.write(f'According to {type(model).__name__} Model, Your Diabetes Risk Score is: {model_predictions[0]:.2f}') st.write(f'{type(model).__name__} R-squared (R2) Score:', model_r2) st.write(f'{type(model).__name__} Root Mean Squared Error (RMSE):', rmse) st.write(f'{type(model).__name__} Mean Squared Error (MSE):', model_mse) st.write(f'{type(model).__name__} Mean Absolute Error (MAE):', model_mae) st.write('------------------------------------------------------------------------------------------------------') # Append model performance metrics to the lists model_names.append(type(model).__name__) r2_scores.append(model_r2) rmses.append(rmse) mses.append(model_mse) maes.append(model_mae) # Create a DataFrame to store the performance metrics metrics_df = pd.DataFrame({ 'Model': model_names, 'R-squared (R2) Score': r2_scores, 'Root Mean Squared Error (RMSE)': rmses, 'Mean Squared Error (MSE)': mses, 'Mean Absolute Error (MAE)': maes }) # Get the model labels model_labels = generate_model_labels(metrics_df['Model']) # Plot the comparison graphs plt.figure(figsize=(12, 10)) # R2 Score comparison plt.subplot(2, 2, 3) plt.bar(model_labels, metrics_df['R-squared (R2) Score'], color='green') plt.title('R2 Score Comparison') # RMSE comparison plt.subplot(2, 2, 4) plt.bar(model_labels, metrics_df['Root Mean Squared Error (RMSE)'], color='blue') plt.title('RMSE Comparison') # MSE comparison plt.subplot(2, 2, 1) plt.bar(model_labels, metrics_df['Mean Squared Error (MSE)'], color='orange') plt.title('MSE Comparison') # MAE comparison plt.subplot(2, 2, 2) plt.bar(model_labels, metrics_df['Mean Absolute Error (MAE)'], color='purple') plt.title('MAE Comparison') # Adjust layout to prevent overlapping of titles plt.tight_layout() # Display the graphs in Streamlit st.pyplot()