# -*- coding: utf-8 -*-
"""Copy of finalProjectDaniel

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Xmu0qEBPBWsUKnRKtCsUn2mmP6R5tkZQ

# Importing libraries
"""

## Basic imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf

## Specific imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression

"""# Loading the Dataset and Reducing to the Features of Interest"""

# from google.colab import drive
# drive.mount('/content/drive/', force_remount=True)
# !ls /content/drive/MyDrive/FALL2022/Warfarin_Dose_Prediction_Dataset.xls

## for theresa to run it
# from google.colab import drive
# drive.mount('/content/drive/')
# !ls /content/drive/MyDrive/Machine Learning/data_final_project.csv

# !pip install --upgrade xlrd

import pandas as pd

# original_df = pd.read_excel('/content/drive/MyDrive/FALL2022/Warfarin_Dose_Prediction_Dataset.xls')

# for theresa to run it
original_df = pd.read_csv('data_final_project.csv', sep=',')

# original_df.info()

patients = original_df[['Gender', 'Race (Reported)', 'Age', 'Height (cm)', 'Weight (kg)', 'Diabetes',
                        'Simvastatin (Zocor)', 'Amiodarone (Cordarone)', 'Target INR',
                        'INR on Reported Therapeutic Dose of Warfarin', 'Cyp2C9 genotypes',
                        'VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T',
                        'Therapeutic Dose of Warfarin']].copy()

# patients.head(n=5)
# patients.describe()
# patients.info()
# patients.to_excel("patients_df_reduced.xlsx")

"""# Setting aside a validation set right away

separates dataset into patients_df (95%) and validation_set (5%)
"""

from sklearn.model_selection import train_test_split

patients_df, validation_set = train_test_split(patients, test_size=0.05, random_state=42)

"""# Visualizing Data Features and Correlations on whole dataset (minus validation set)

### Looking at Numerical Data (note that some of these are numerical categorical but are entered as 0 or 1)
"""

# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
patients_df.hist(bins=50, figsize=(20, 15))
plt.show()

corr_matrix = patients_df.corr()

corr_matrix["Therapeutic Dose of Warfarin"].sort_values(ascending=False)

# note that Target INR and INR on Reported Therapeutic Dose of Warfarin are linearly related.
# Target INR has so few values that I will remove it as part of pre-processing

corr_matrix["Target INR"].sort_values(ascending=False)
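"""##### Optional (added sketch, not in the original flow): quick visual check of the Target INR vs. reported INR relationship flagged by the correlation matrix above"""

# Scatter of the two INR columns; rows with a missing value in either column are simply not drawn.
patients_df.plot(kind="scatter",
                 x="Target INR",
                 y="INR on Reported Therapeutic Dose of Warfarin",
                 alpha=0.3)
plt.show()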
"""### Looking at Categorical Text Data (use these categories for the Gradio implementation later)"""

patients_df['Gender'].value_counts()

patients_df['Age'].value_counts()

patients_df['Race (Reported)'].value_counts()

patients_df['Target INR'].value_counts()

patients_df['Diabetes'].value_counts()

patients_df['Simvastatin (Zocor)'].value_counts()

patients_df['Amiodarone (Cordarone)'].value_counts()

patients_df['Cyp2C9 genotypes'].value_counts()

patients_df['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'].value_counts()

"""# Dropping any rows that have NaN in the target column ON WHOLE DATASET"""

# Dropping any rows that have NaN in the target column
patients_df.dropna(subset=['Therapeutic Dose of Warfarin'], inplace=True)

patients_df.info()

"""# Dividing Data into Stratified Train (80%) and Test Set (20%)

This includes minimal pre-processing of gender and weight on the full dataset that was necessary for the
stratified sampling based on weight

Test and train sets with features and labels are stored in 'strat_train_set' and 'strat_test_set'

patients_df -> strat_train_set, strat_test_set

### Perform Stratified Sampling based on Weight (Chapter 2, pages 54-55)

### Dropping rows with NaN in the Gender column (since there are only 4 of them) -- NEEDS TO BE DONE BEFORE STRATIFIED SAMPLING IN THIS CASE
"""

patients_df.dropna(subset=['Gender'], inplace=True)

"""#### Replacing NaN weight values with the median for the associated gender, as is needed to perform stratified sampling on the weight group"""

## looking at median female weight
median_female_weight = patients_df.loc[patients_df['Gender'] == 'female', 'Weight (kg)'].median()
median_female_weight

## looking at median male weight
median_male_weight = patients_df.loc[patients_df['Gender'] == 'male', 'Weight (kg)'].median()
median_male_weight

## filling in null weight values on full dataset
medians = patients_df.groupby(['Gender'])['Weight (kg)'].median()
patients_df = patients_df.set_index(['Gender'])
patients_df['Weight (kg)'] = patients_df['Weight (kg)'].fillna(medians)
patients_df = patients_df.reset_index()

patients_df['Weight (kg)'].isna().sum()

"""#### Creating weight categories from which the test set will sample"""

patients_df["weight_cat"] = pd.cut(patients_df["Weight (kg)"],
                                   bins=[0, 50, 75, 100, np.inf],
                                   labels=[1, 2, 3, 4])
patients_df["weight_cat"].hist()

"""#### Dividing patients_df into strat_train_set (80%) and strat_test_set (20%)"""

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(patients_df, patients_df["weight_cat"]):
    strat_train_set = patients_df.loc[train_index]
    strat_test_set = patients_df.loc[test_index]

"""#### Comparing the proportion of samples per weight category between the test set and the original dataset

##### (distributions are the same, showing that the stratified sampling worked)
"""

strat_test_set["weight_cat"].value_counts() / len(strat_test_set)

for set_ in (strat_train_set, strat_test_set):
    set_.drop("weight_cat", axis=1, inplace=True)

patients_df["weight_cat"].value_counts() / len(patients_df)
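"""##### Optional (added sketch, not in the original flow): side-by-side comparison of the weight-category proportions.
weight_cat has just been dropped from the stratified sets, so the hypothetical helper below recomputes it with the same bins."""

def weight_cat_props(df):
    # same bins and labels as used for the stratified sampling above
    cats = pd.cut(df["Weight (kg)"], bins=[0, 50, 75, 100, np.inf], labels=[1, 2, 3, 4])
    return cats.value_counts(normalize=True).sort_index()

pd.DataFrame({
    "Overall": weight_cat_props(patients_df),
    "Stratified test set": weight_cat_props(strat_test_set),
})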
strat_train_set["Therapeutic Dose of Warfarin"].copy() """## Custom Transformers for Pre-processing (Important Part) ##### reference: Chapter 2 Textbook associated google collab notebook ##### creating a custom transformer to handle catagorical attributes Nan Values: ##### includes Gender, Cyp2C9 genotypes, VKORC1 genotype, Diabetes, Amiodarone, Simvastatin, Race, Age """ from sklearn.base import BaseEstimator, TransformerMixin class CatTransformer(BaseEstimator, TransformerMixin): """ REPLACEMENT OF NAN FOR ALL CATAGORICAL FEATURES for Gender, fills with mode from training set for Cyp2C9 genotypes, fills with mode from training set as there is a most common class by far for VKORC1 genotype, many more are unknown, and there is not a most common class, so fills with "unknown", thus creating a new catagory for Diabetes phenotype, fills with mode--assumes no diabetes for Amiodarone (Cordarone) drug, fills with mode from training set as there is a most common class by far for Simvastatin (Zocor), fills with mode from training set as there is a most common class by far for Race, fills nan with "unknown" and converts all classes to upper so that the several groups labelled "other" are grouped together for Race, only a few were missing--replacement of nan with Mode for Race, due to there already being a catagory for "Black or African American", the catagories "Black" and "African American" were grouped together under "Black or African American" for Age, fills nan with mode from training set--not many Age values are missing. Even though there is not a most common class by a lot, I think this is best """ def __init__(self): # no *args or **kwargs pass def fit(self, X, y=None): self.mode_Gen = X['Gender'].mode()[0] self.mode_Cyp = X['Cyp2C9 genotypes'].mode()[0] self.mode_Amio = X['Amiodarone (Cordarone)'].mode()[0] self.mode_Simv = X['Simvastatin (Zocor)'].mode()[0] self.mode_Diab = X['Diabetes'].mode()[0] self.mode_Age = X['Age'].mode()[0] return self def transform(self, X): X['Cyp2C9 genotypes']=X['Cyp2C9 genotypes'].fillna(self.mode_Cyp) X['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T']=X['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'].fillna("Unknown") X['Amiodarone (Cordarone)']=X['Amiodarone (Cordarone)'].fillna(self.mode_Amio) X['Simvastatin (Zocor)']=X['Simvastatin (Zocor)'].fillna(self.mode_Simv) X['Diabetes']=X['Diabetes'].fillna(self.mode_Diab) X['Race (Reported)'] = X['Race (Reported)'].fillna("UNSPECIFIED") X['Race (Reported)'] = X['Race (Reported)'].str.upper() X=X.replace({'Race (Reported)': {'AFRICAN-AMERICAN': 'BLACK OR AFRICAN AMERICAN', 'BLACK': 'BLACK OR AFRICAN AMERICAN'}}) X['Age']=X['Age'].fillna(self.mode_Age) X['Gender']=X['Gender'].fillna(self.mode_Gen) return X """##### creating a custom transformer to handle the transformation of height nan variables based on gender-depenedent median""" from sklearn.base import BaseEstimator, TransformerMixin class GenderTransformer(BaseEstimator, TransformerMixin): """ replaces missing Height variables by median for the associated gender replaces missing Weight variables by median for the associated gender """ def __init__(self): # no *args or **kwargs pass def fit(self, X, y=None): self.medians_height = X.groupby(['Gender'])["Height (cm)"].median() self.medians_weight = X.groupby(['Gender'])["Weight (kg)"].median() return self def transform(self, X): X = X.set_index(['Gender']) X["Height (cm)"] = X["Height (cm)"].fillna(self.medians_height) X["Weight (kg)"] = X["Weight 
(kg)"].fillna(self.medians_weight) X = X.reset_index() return X """##### creating a custom transformer to add extra attributes (BMI, BSA):""" from sklearn.base import BaseEstimator, TransformerMixin # column index col_names = ["Height (cm)", "Weight (kg)"] weight_ix, height_ix = [0, 1] # get the column indices; they are 0 and 1 class CombinedAttributesAdder(BaseEstimator, TransformerMixin): """ adds the variables for BSA (body surface area) to the data def transform returns numpy array Body Surface Area (as calculated from the DuBois and DuBois formula) reference: https://www.uptodate.com/contents/image?imageKey=ONC%2F96451&topicKey=ONC%2F83810&search=Pharmacogenomics&rank=3~18&source=see_link """ def __init__(self): # no *args or **kwargs pass def fit(self, X, y=None): return self # nothing else to do def transform(self, X): # BMI = X[:, weight_ix] / ((X[:, height_ix]/100)**2) BSA = ((0.007184*(X[:, weight_ix])**0.425)) * ((X[:, height_ix])**0.725) return np.c_[X, BSA] """#### Working Transformer Pipelines ##### pipeline for dealing with missing height and weight values """ from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler gender_pipeline = Pipeline([ ('gender_transformer', GenderTransformer()), ]) """##### pipeline for dealing with catagorical data nan values""" from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler cat_pipeline = Pipeline([ ('catagorical_transformer', CatTransformer()), ]) """##### pipeline for dealing with numerical data: height, weight, INR ##### uses CombinedAttributeAdder class for the addition of BSA (or BMI) ##### uses SimpleImputer to replace any remaining Nan values with the median for that feature ##### uses StandardScaler for scaling """ from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler num_pipeline = Pipeline([ ('imputer', SimpleImputer(strategy="median")), ('attribs_adder', CombinedAttributesAdder()), ('std_scaler', StandardScaler()), ]) """##### full pipuline using ColumnTransformer ##### Adds Attributes (from num_pipeline), Scales and imputes numerical data (from num_pipeline), Uses ordinal encoder for Ordinal Catagorical Data (Age), Uses 1Hot Encoder for non-ordinal Catagorical Data """ from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OrdinalEncoder num_attribs = ['Height (cm)', 'Weight (kg)', 'INR on Reported Therapeutic Dose of Warfarin'] cat_attribs_ordinal = ['Age', 'Gender', 'Diabetes', 'Simvastatin (Zocor)', 'Amiodarone (Cordarone)'] cat_attribs_1hot = ["Race (Reported)", 'Cyp2C9 genotypes', 'VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'] """ Pipeline using column transformer Adds BSA attribute (from num_pipeline) imputes remaining nan numerical data using median (from num_pipeline) scales numerical data using StandardScaler (from num_pipeline) Uses ordinal encoder for Ordinal Catagorical Data (Age) and Binary Catagorical Data (gender, diabetes, simvastatin, amiodorone)--see cat_attrib_ordinal Uses 1Hot Encoder for non-ordinal Catagorical Data--see cat_attribs_1hot """ scale_encode_pipeline = ColumnTransformer([ ("num", num_pipeline, num_attribs), ('cat_ord', OrdinalEncoder(), cat_attribs_ordinal), ("cat_1hot", OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_attribs_1hot), ]) #input list of (name, transformer, columns) tuples specifying the transformer objects to be applied to subsets of the data. 
"""## Full PreProcess Function to incorporate all pipelines ##### contains "full_preprocess_function()" """ def series_to_df(data_series): """ function to help with processing new data (potentially useful for Gradio implementation) input: Series with dimensions (12,) output: pandas dataframe with features as column names; can now be sent through full_preprocess_function """ data_df = data_series.to_frame() data_df = data_df.transpose() return data_df def full_preprocess_function(data_df, train=False): """ INPUT: program expects the equivalent of an instance (or multiple instances) from the non pre-processed dataset (without the label) in the form of a pandas_df --input should have the following 12 features as column names: Gender, Race (Reported), Age, Height (cm), Weight (kg), Diabetes, Simvastatin (Zocor), Amiodarone (Cordarone), Target INR, INR on Reported Therapeutic Dose of Warfarin, Cyp2C9 genotypes, VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T --input should either contain a value for each feature of Nan program will remove the Target INR column from dataset as there were too few values and it was multicollinearly related to INR Reported if train==True, function will send training data to pre-processing be fit and transformed else, function will send new data to pre-processing to be transformed (not fit) OUTPUT: function returns pandas df of features, including feature names as column names Note for encoded variables: Gender: 0=female, 1=male; Diabetes: 0=no, 1=yes; Simvastatin: 0=no, 1=yes; Amiodorone: 0=no, 1=yes; Age: {0: '10 - 19', 1:'20 - 29', 2:'30 - 39', 3:'40 - 49', 4:'50 - 59', 5:'60 - 69', 6:'70 - 79', 7:'80 - 89', 8:'90+'} """ if isinstance(data_df, pd.Series) and data_df.shape == (12,): raise TypeError("Expects pd.DataFrame; Send your data through the series_to_df() function for conversion to proper format") if not isinstance(data_df, pd.DataFrame): raise TypeError("Expects pd.DataFrame; See full_preprocess function documentation for input expectations") # prepared_feature_names = ['Height (cm)', 'Weight (kg)', 'INR (Reported)', 'BSA (m**2)', 'Age', 'Gender', 'Diabetes', 'Simvastatin', 'Amiodorone', # 'ASIAN', 'BLACK OR AFRICAN AMERICAN', 'CAUCASIAN', 'CHINESE', 'HAN CHINESE', 'HISPANIC', 'INDIAN', 'INTERMEDIATE', 'JAPANESE', 'KOREAN', 'MALAY', 'OTHER','OTHER MIXED RACE', 'UNSPECIFIED', 'WHITE', # '*1/*1', '*1/*11', '*1/*13', '*1/*14', '*1/*2', '*1/*3', '*1/*5', '*1/*6', '*2/*2', '*2/*3', '*3/*3', # 'A/A', 'A/G', 'G/G', 'Unknown'] data_df.drop(['Target INR'], axis=1, inplace=True) # remove Target INR due to too few values and collinearity with INR Reported if train==True: data_cat_tr = cat_pipeline.fit_transform(data_df) data_height_tr = gender_pipeline.fit_transform(data_cat_tr) data_prepared = scale_encode_pipeline.fit_transform(data_height_tr) else: data_cat_tr = cat_pipeline.transform(data_df) data_height_tr = gender_pipeline.transform(data_cat_tr ) data_prepared = scale_encode_pipeline.transform(data_height_tr) data_prepared_df = pd.DataFrame(data_prepared) # data_prepared_df.drop(['Weight (kg)'], axis=1, inplace=True) # removing weight to address multicollinearity return data_prepared_df """ ## showing un-pre-processed dataset patients_info.head() X_train_prepared = full_preprocess_function(patients_info, train=True) # showing pre-processed training dataset X_train_prepared.head() X_train_prepared.info() """##### Send pre-processed train_data to excel (labels too)""" # X_train_prepared.to_excel("X_patients_train.xlsx") # 
"""## Making sure the pre-processed training set works with a basic model"""

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, patients_labels)

patients_labels

from sklearn.metrics import mean_squared_error

patients_predictions = lin_reg.predict(X_train_prepared)
lin_mse = mean_squared_error(patients_labels, patients_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

"""## Pre-processing on the Test Set (currently stored in strat_test_set)

##### note: strat_test_set contains features and labels
##### produces X_test_prepared and y_test

#### Separate strat_test_set features from labels

##### stored in X_test and y_test
"""

X_test = strat_test_set.drop("Therapeutic Dose of Warfarin", axis=1)
y_test = strat_test_set["Therapeutic Dose of Warfarin"].copy()

"""#### Send X_test to the pre-processing function/pipeline

##### stored in X_test_prepared
"""

X_test_prepared = full_preprocess_function(X_test)

"""##### Send pre-processed test data to excel (labels too)"""

# X_test_prepared.to_excel("X_patients_test.xlsx")
# y_test.to_excel("y_patients_test.xlsx")

"""## Making sure the pre-processed test set works with the simple regression model"""

test_predictions = lin_reg.predict(X_test_prepared)

"""#### Evaluate mse and rmse"""

test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)
test_rmse

"""## Pre-processing on the Validation Set

##### produces X_val_prepared and y_val

#### Dropping NaN labels and separating validation_set features from labels

##### stored in 'X_val' and 'y_val'
"""

validation_set.dropna(subset=['Therapeutic Dose of Warfarin'], inplace=True)
X_val = validation_set.drop("Therapeutic Dose of Warfarin", axis=1)
y_val = validation_set["Therapeutic Dose of Warfarin"].copy()

"""## Sending a single instance from X_val through the pre-processing pipeline and making sure it works with the simple regression model"""

trial = X_val.iloc[3]
trial

trial.shape

trial_df = series_to_df(trial)  # example of input for full_preprocess_function()
trial_df

X_val_trial = full_preprocess_function(trial_df)  # example of a pre-processed single test input
X_val_trial

trial_val_prediction = lin_reg.predict(X_val_trial)
trial_val_prediction

y_trial = y_val.iloc[3]
y_trial

"""#### Sending X_val through the pre-processing pipeline"""

X_val_prepared = full_preprocess_function(X_val)

"""## Making sure the pre-processed validation set works with the simple regression model"""

val_predictions = lin_reg.predict(X_val_prepared)
val_mse = mean_squared_error(y_val, val_predictions)
val_rmse = np.sqrt(val_mse)
val_rmse

"""##### Send pre-processed validation data to excel (labels too)"""

# X_val_prepared.to_excel("X_patients_val.xlsx")
# y_val.to_excel("y_patients_val.xlsx")

"""# **PART II ----> ML MODELS FOR BINARY CLASSIFICATION**

**First let's create a binary classification dataset by cutting the target values into two categories (<30 mg, >=30 mg)**
"""

import numpy as np

y_train = patients_labels

# Preparing training/testing/validation data for the binary classifier
train_label_binary = (y_train >= 30)
print("binary train labels:", train_label_binary)

# print("original test labels:", y_test)
test_label_binary = (y_test >= 30)
print("binary test labels:", test_label_binary)

validation_label_binary = (y_val >= 30)
print("binary validation labels:", validation_label_binary)

"""## 1. LOGISTIC REGRESSION MODEL

Logistic regression can be used for binary classification because it estimates the probability that an instance
belongs to a class. Using a probability threshold, e.g. 50%, it classifies an instance as the positive class (1)
if the probability is greater than 50%; otherwise the instance is classified as the negative class (0). The model
works in the same way as linear regression, but instead of outputting the raw result it outputs the logistic of
the result.
"""
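"""##### Optional (added sketch): tiny numeric illustration of the 50% threshold described above (generic logistic math, not tied to the fitted model)"""

z = np.array([-2.0, 0.0, 1.5])      # example linear scores w.x + b
p = 1 / (1 + np.exp(-z))            # logistic (sigmoid) of the scores
print(p, (p >= 0.5).astype(int))    # estimated probabilities and the resulting class labels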
from sklearn.linear_model import LogisticRegression

log_regression = LogisticRegression(penalty='l2', C=1, random_state=0)
log_regression.fit(X_train_prepared, train_label_binary.values.ravel())
log_prediction = log_regression.predict(X_train_prepared)
log_prediction

"""## 2. SUPPORT VECTOR MACHINE

The main goal of Support Vector Machines is to fit the widest possible "street" between the classes, i.e. to have
a large margin between the decision boundary that separates the classes and the training instances. The reason SVM
looks for this optimal classifier is that other linear classifiers might separate a linearly separable dataset
correctly, yet with a decision boundary so close to the training instances that they will probably not perform as
well on new instances. That's why SVM tries to find the widest possible "street" between the classes.
"""

from sklearn.svm import SVC

# # define linear kernel
# svm_model_linear = SVC(kernel="linear", C=1)
# svm_model_linear.fit(X_train_prepared, train_label_binary.values.ravel())
# svm_linear_prediction = svm_model_linear.predict(X_train_prepared)
# svm_linear_prediction

# define polynomial kernel, p. 158
svm_model_polynomial = SVC(kernel="poly", degree=7, C=7)
svm_model_polynomial.fit(X_train_prepared, train_label_binary.values.ravel())
svm_polynomial_prediction = svm_model_polynomial.predict(X_train_prepared)
svm_polynomial_prediction

"""## 3. DECISION TREE MODEL"""

from sklearn.tree import DecisionTreeClassifier

# define tree model
decision_tree_model = DecisionTreeClassifier(max_depth=5)
decision_tree_model.fit(X_train_prepared, train_label_binary.values.ravel())
decision_tree_prediction = decision_tree_model.predict(X_train_prepared)
decision_tree_prediction

"""## 4. RANDOM FOREST MODEL"""

from sklearn.ensemble import RandomForestClassifier

random_forest_model = RandomForestClassifier(n_estimators=500, max_depth=10, max_leaf_nodes=None)  # max_leaf_nodes must be None or an int > 1
random_forest_model.fit(X_train_prepared, train_label_binary.values.ravel())
random_forest_prediction = random_forest_model.predict(X_train_prepared)
random_forest_prediction

"""## 5. NEURAL NETWORK"""

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout

# Define decision threshold
NN_threshold = 0.5

def build_NN(n_layers=3, n_neurons=1000, dropout=0):
    model = Sequential()  # create Sequential model
    for i in range(n_layers - 1):
        model.add(Dense(n_neurons, activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(1, activation='sigmoid'))  # single sigmoid output neuron for binary classification
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])  # binary cross-entropy because it's binary classification!
    return model
# Build random NN
NN_model = build_NN(n_layers=3, n_neurons=10)
train_history = NN_model.fit(X_train_prepared, train_label_binary.values.ravel(),
                             validation_data=(X_val_prepared, validation_label_binary.values.ravel()),
                             batch_size=128, epochs=20)

NN_prediction = NN_model.predict(X_train_prepared)
# Prepare prediction to be comparable
NN_prediction = (NN_prediction >= NN_threshold)

"""## **Calculating the performance of each model on the train dataset**"""

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

methods = [decision_tree_prediction, random_forest_prediction, svm_polynomial_prediction, log_prediction, NN_prediction]
names = ["decision_tree_model", "random_forest_model", "svm_polynomial_model", "log_model", "neural_net"]

accuracy = []
precision = []
recall = []
ROC = []
F1 = []

for method in methods:
    accuracyy = accuracy_score(train_label_binary, method)
    accuracy.append(accuracyy)
    precision1 = precision_score(train_label_binary, method)
    precision.append(precision1)
    recall1 = recall_score(train_label_binary, method)
    recall.append(recall1)
    ROC1 = roc_auc_score(train_label_binary, method)
    ROC.append(ROC1)
    F11 = f1_score(train_label_binary, method)
    F1.append(F11)

data = {'Method': names,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'ROC': ROC,
        'F1 score': F1,
        }

evaluation = pd.DataFrame(data, columns=['Method', "Accuracy", "Precision", "Recall", "ROC", "F1 score"])
evaluation

"""## **Let's do a better Evaluation Using Cross-Validation**

**Logistic Regression cross validation**
"""

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression

log_regression = LogisticRegression(solver='liblinear')
penalty = ['l1', 'l2']
C = [1, 0.1, 0.01, 0.001]
hyperparameters = dict(C=C, penalty=penalty)

classifier = GridSearchCV(log_regression, hyperparameters, cv=10, verbose=0)
best_model = classifier.fit(X_train_prepared, train_label_binary)

# printing out the best parameters for the Logistic Regression model
print('Best penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

model = LogisticRegression(solver='liblinear', **best_model.best_params_)
model.fit(X_train_prepared, train_label_binary)
logistic_prediction = model.predict(X_train_prepared)
logistic_prediction

# calculating the accuracy of the model
scores = cross_val_score(model, X_train_prepared, train_label_binary)
scores

print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve

y_scores = cross_val_predict(model, X_train_prepared, train_label_binary, cv=10, method="decision_function")

fpr, tpr, thresholds = roc_curve(train_label_binary, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], "k--")

plot_roc_curve(fpr, tpr)
plt.title('ROC curve for Logistic Regression')
plt.xlabel('False Positive Rate (1 - specificity)')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(['Logistic Regression'], loc="lower right")
plt.grid()
plt.show()
"""**Support Vector Machine Cross validation**"""

from sklearn.svm import SVC

# hyperparameter_set = {'C': [0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': [0.001, 0.01, 0.1, 1]}
# svm = SVC()
# classifier2 = GridSearchCV(svm, hyperparameter_set, cv=10, verbose=0)
# best_SV = classifier2.fit(X_train_prepared, train_label_binary)

# # printing out the best parameters for the SVM model
# print('Best kernel:', best_SV.best_params_['kernel'])
# print('Best C:', best_SV.best_params_['C'])
# print('Best gamma:', best_SV.best_params_['gamma'])

SVM_final_model = SVC(C=1, kernel='rbf', gamma=0.1, probability=True)
SVM_final_model.fit(X_train_prepared, train_label_binary)
svm_prediction = SVM_final_model.predict(X_train_prepared)
svm_prediction

# calculating the accuracy of the model
scores = cross_val_score(SVM_final_model, X_train_prepared, train_label_binary)
scores

print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Drawing the ROC curve for SVM
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve

y_scores = cross_val_predict(SVM_final_model, X_train_prepared, train_label_binary, cv=10, method="decision_function")

fpr, tpr, thresholds = roc_curve(train_label_binary, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], "k--")

plot_roc_curve(fpr, tpr)
plt.title('ROC curve for Support Vector Machine')
plt.xlabel('False Positive Rate (1 - specificity)')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(['Support Vector Machine'], loc="lower right")
plt.grid()
plt.show()

"""**Random Forest Cross Validation**"""

# hyperparameter_set = {'n_estimators': [100, 200, 300, 400], 'max_features': ['auto', 'sqrt']}
# random_forest = RandomForestClassifier()
# classifier3 = GridSearchCV(random_forest, hyperparameter_set, cv=10, verbose=0)
# best_model3 = classifier3.fit(X_train_prepared, train_label_binary)
# print('Best n_estimators:', best_model3.best_params_['n_estimators'])
# print('Best max_features:', best_model3.best_params_['max_features'])

model3 = RandomForestClassifier(n_estimators=200, max_features='sqrt')
model3.fit(X_train_prepared, train_label_binary)
random_forest_prediction = model3.predict(X_train_prepared)
random_forest_prediction

# calculating the accuracy of the model
scores = cross_val_score(model3, X_train_prepared, train_label_binary)
scores

print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

# Drawing the ROC curve for Random Forest (uses predict_proba, since random forests have no decision_function)
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve

y_probas = cross_val_predict(model3, X_train_prepared, train_label_binary, cv=10, method="predict_proba")
y_scores = y_probas[:, 1]

fpr, tpr, thresholds = roc_curve(train_label_binary, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], "k--")

plot_roc_curve(fpr, tpr)
plt.title('ROC curve for Random Forest')
plt.xlabel('False Positive Rate (1 - specificity)')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(['Random Forest'], loc="lower right")
plt.grid()
plt.show()

"""**Showing the feature importance analysis in random forest.**"""

from pandas import DataFrame

random_forest = RandomForestClassifier(n_estimators=300, random_state=60)
random_forest.fit(X_train_prepared, train_label_binary)
random_forest_importance = random_forest.feature_importances_
print(random_forest_importance)

features = original_df.columns  # note: raw dataset columns; these do not line up with the prepared (encoded) feature matrix
importances = random_forest_importance
indices = np.argsort(importances)
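"""##### Optional (added sketch): bar plot of the importances computed above.
Features are labelled by their positional index in the prepared matrix, since original_df.columns does not align with the encoded columns."""

plt.figure(figsize=(8, 10))
plt.barh(range(len(indices)), importances[indices])
plt.yticks(range(len(indices)), indices)
plt.xlabel('Feature importance')
plt.ylabel('Prepared feature index')
plt.title('Random forest feature importances')
plt.show()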
"""**Calculating the evaluation metrics for each model and then adding the data to a pandas DataFrame**"""

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

predictions = [logistic_prediction, svm_prediction, random_forest_prediction]
names = ["Logistic_regression model", "Support Vector Machine model", "Random_forest_model"]

accuracy = []
precision = []
recall = []
ROC = []
F1 = []

for i in predictions:
    accuracyy = accuracy_score(train_label_binary, i)
    accuracy.append(accuracyy)
    precision1 = precision_score(train_label_binary, i)
    precision.append(precision1)
    recall1 = recall_score(train_label_binary, i)
    recall.append(recall1)
    ROC1 = roc_auc_score(train_label_binary, i)
    ROC.append(ROC1)
    F11 = f1_score(train_label_binary, i)
    F1.append(F11)

data2 = {'Method': names,
         'Accuracy': accuracy,
         'Precision': precision,
         'Recall': recall,
         'ROC': ROC,
         'F1 score': F1,
         }

evaluation = pd.DataFrame(data2, columns=['Method', "Accuracy", "Precision", "Recall", "ROC", "F1 score"])
evaluation

"""**Drawing the ROC curve of all models on the train dataset**"""

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve

roc_curve_rates = []
# the models are the RandomForestClassifier (model3), the SVC (SVM_final_model) and the Logistic Regression (model)
for model in [model3, SVM_final_model, model]:
    # finds the cross-validated predicted probabilities for this model
    predict_probability = cross_val_predict(model, X_train_prepared, train_label_binary, cv=10, method="predict_proba")
    # gets the probabilities for the positive class
    y_scores = predict_probability[:, 1]
    # calculates the fpr and tpr with the scores
    fpr, tpr, threshold = roc_curve(train_label_binary, y_scores)
    roc_curve_rates.append({'fpr': fpr, 'tpr': tpr})

# Takes the dicts array and plots each line on the same graph
line_names = ['RandomForestClassifier', 'SVC', 'Logistic Regression']
for i in range(len(roc_curve_rates)):
    plt.plot(roc_curve_rates[i]['fpr'], roc_curve_rates[i]['tpr'], linewidth=2, label=line_names[i])

plt.xlim([0, 1])
plt.ylim([0, 1])
plt.plot([0, 1], [0, 1], "k--")
plt.title('ROC curve')
plt.xlabel('False Positive Rate (1 - specificity)')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(loc="lower right")
plt.grid()
plt.show()

"""**Optimizing the Neural Network**"""

# Parameters to check
number_of_layers = [3, 4, 5, 6, 7]
number_of_neurons = [10, 100, 1000, 5000]

# Variables for saving data
best_epoch = [[]]
best_accuracy = [[]]
i = 0

# Add early stopping into model training
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

keras_callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, mode='min', min_delta=0.0001),
]

# Loop through all parameters
for layers in number_of_layers:
    for neurons in number_of_neurons:
        print("Testing NN - Layers: " + str(layers) + "; Neurons per layer: " + str(neurons))
        NN_model = build_NN(layers, neurons)
        train_history = NN_model.fit(X_train_prepared, train_label_binary.values.ravel(),
                                     validation_data=(X_val_prepared, validation_label_binary.values.ravel()),
                                     batch_size=128, epochs=30, callbacks=keras_callbacks)
        # Using validation accuracy as the performance metric
        accuracy = train_history.history['val_accuracy']
        best_accuracy[i].append(max(accuracy))
        best_epoch[i].append(accuracy.index(max(accuracy)))
    i = i + 1
    best_epoch.append([])
    best_accuracy.append([])

# Remove the last (empty) element
best_epoch.pop(i)
best_accuracy.pop(i)

# Build model with the best parameters (best validation accuracy over the whole grid)
best_per_layer = [max(acc_list) for acc_list in best_accuracy]
ideal_layers_index = best_per_layer.index(max(best_per_layer))
ideal_layers = number_of_layers[ideal_layers_index]
ideal_neurons = number_of_neurons[best_accuracy[ideal_layers_index].index(max(best_accuracy[ideal_layers_index]))]

# Print results
print("Best number of layers:", str(ideal_layers))
print("Best number of neurons:", str(ideal_neurons))
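"""##### Optional (added sketch): tabulate the grid-search results collected above; nn_results is a hypothetical helper name
(rows = number of layers, columns = neurons per layer, values = best validation accuracy)"""

nn_results = pd.DataFrame(best_accuracy, index=number_of_layers, columns=number_of_neurons)
nn_results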
"""## **Evaluate all the models on the Test Set**"""

# Logistic Regression
logistic_regression_final_model = LogisticRegression(solver='liblinear', **best_model.best_params_)
logistic_regression_final_model.fit(X_train_prepared, train_label_binary)
logistic_prediction_test = logistic_regression_final_model.predict(X_test_prepared)
logistic_prediction_test

# Support Vector Machine
SVM_final_model = SVC(C=0.1, kernel='linear', gamma='scale', probability=True)
SVM_final_model.fit(X_train_prepared, train_label_binary)
svm_prediction_test = SVM_final_model.predict(X_test_prepared)
svm_prediction_test

# Random Forest Classifier
random_forest_final_model = RandomForestClassifier(n_estimators=400, max_features='sqrt')
random_forest_final_model.fit(X_train_prepared, train_label_binary)
random_forest_prediction_test = random_forest_final_model.predict(X_test_prepared)
random_forest_prediction_test

# Neural Network
keras_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, mode='min', min_delta=0.0001),
    ModelCheckpoint('./checkmodel.h5', monitor='val_loss', save_best_only=True, mode='min')
]

NN_final_model = build_NN(ideal_layers, ideal_neurons, dropout=0.15)
NN_final_model.fit(X_train_prepared, train_label_binary,
                   validation_data=(X_val_prepared, validation_label_binary.values.ravel()),
                   batch_size=128, epochs=30, callbacks=keras_callbacks)

NN_prediction_test = NN_final_model.predict(X_test_prepared)
# Prepare prediction to be comparable
NN_prediction_test = (NN_prediction_test >= NN_threshold)

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

predictions = [logistic_prediction_test, svm_prediction_test, random_forest_prediction_test, NN_prediction_test]
names = ["Logistic_regression_test", "Support_vector_machine_test", "Random_forest_test", "Neural_net_test"]

accuracy = []
precision = []
recall = []
ROC = []
F1 = []

for i in predictions:
    accuracyy = accuracy_score(test_label_binary, i)
    accuracy.append(accuracyy)
    precision1 = precision_score(test_label_binary, i)
    precision.append(precision1)
    recall1 = recall_score(test_label_binary, i)
    recall.append(recall1)
    ROC1 = roc_auc_score(test_label_binary, i)
    ROC.append(ROC1)
    F11 = f1_score(test_label_binary, i)
    F1.append(F11)

data3 = {'Method': names,
         'Accuracy': accuracy,
         'Precision': precision,
         'Recall': recall,
         'ROC': ROC,
         'F1 score': F1,
         }

evaluation = pd.DataFrame(data3, columns=['Method', "Accuracy", "Precision", "Recall", "ROC", "F1 score"])
evaluation
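"""##### Optional (added sketch): confusion matrices for each final model on the test set, to complement the table above"""

from sklearn.metrics import confusion_matrix

for name, pred in zip(names, predictions):
    print(name)
    print(confusion_matrix(test_label_binary, np.ravel(pred)))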
Random Forest """ from sklearn.metrics import precision_recall_curve import matplotlib.pyplot as plt y_score = logistic_regression_final_model.predict_proba(X_test_prepared)[:, 1] #calculate precision and recall precision, recall, thresholds = precision_recall_curve(test_label_binary, y_score) #create precision recall curve fig, ax = plt.subplots() ax.plot(recall, precision, color='red') #add axis labels to plot ax.set_title('Precision-Recall Curve for Logistic Regression') ax.set_ylabel('Precision') ax.set_xlabel('Recall') #display plot plt.grid(True) plt.show() from sklearn.metrics import precision_recall_curve import matplotlib.pyplot as plt y_score = random_forest_final_model.predict_proba(X_test_prepared)[:, 1] #calculate precision and recall precision, recall, thresholds = precision_recall_curve(test_label_binary, y_score) #create precision recall curve fig, ax = plt.subplots() ax.plot(recall, precision, color='blue') #add axis labels to plot ax.set_title('Precision-Recall Curve for Support Vector Machine') ax.set_ylabel('Precision') ax.set_xlabel('Recall') #display plot plt.grid(True) plt.show() from sklearn.metrics import precision_recall_curve import matplotlib.pyplot as plt y_score = SVM_final_model.predict_proba(X_test_prepared)[:, 1] #calculate precision and recall precision, recall, thresholds = precision_recall_curve(test_label_binary, y_score) #create precision recall curve fig, ax = plt.subplots() ax.plot(recall, precision, color='purple') #add axis labels to plot ax.set_title('Precision-Recall Curve for Random Forest Model') ax.set_ylabel('Precision') ax.set_xlabel('Recall') #display plot plt.grid(True) plt.show() """**Drawing the ROC curve of all models on the test dataset**""" from sklearn.model_selection import cross_val_predict from sklearn.metrics import roc_curve roc_curve_rates = [] for model in [logistic_regression_final_model, random_forest_final_model,SVM_final_model]: #models are 'Logistic Regression', 'RandomForestClassifier', 'SVC' #finds the predicted probability for the sets and model predict_probability = cross_val_predict(logistic_regression_final_model, X_test_prepared, test_label_binary, cv= 10, method = "predict_proba") #gets the probs for pos class y_scorse = predict_probability[:,1] #calculates the fpr and tpr with te scores fpr, tpr, threshold = roc_curve(test_label_binary, y_scorse) roc_curve_rates.append({'fpr': fpr, 'tpr': tpr}) #Takes the dics array and plots each line on the same graph line_names = ['Logistic Regression', 'RandomForestClassifier', 'SVC'] plt.plot(fpr, tpr, linewidth=2) for i in range(len(roc_curve_rates)): plt.plot(roc_curve_rates[i]['fpr'], roc_curve_rates[i]['tpr'], linewidth=2, label=line_names[i]) plt.xlim([0,1]) plt.ylim([0,1]) plt.plot([0,1], [0,1], "k--") plt.title('ROC curve') plt.xlabel('False Positive Rate (1 - specifity)') plt.ylabel('True Positive Rate (Recall)') plt.legend(loc ="lower right") plt.grid() plt.show() """#**PART III ----> Gradio Implementation** """ # Install Gradio # !pip install --quiet gradio # Import Gradio Library import gradio as gr # Define callback function def warfarin_callback(age, height, weight, gender, race, diabetes, medication, Cyp2C9, VKORC1, INR, model): # Input validation if not gender: return "Please select the patient's gender" if not race: return "Please select the patient's race" # Extract medication simvastatin = 0.0 amiodarone = 0.0 if 'Simvastatin (Zocor)' in medication: simvastatin = 1.0 if 'Amiodarone (Cordarone)' in medication: amiodarone = 1.0 # Categorize age age_categories 
    age_categories = ['10 - 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '60 - 69', '70 - 79', '80 - 89', '90+']
    age_category = age_categories[min(int(np.floor(age / 10)) - 1, len(age_categories) - 1)]  # clamp so an age of 100 still maps to '90+'

    # Gender, Race (Reported), Age, Height (cm), Weight (kg), Diabetes, Simvastatin (Zocor), Amiodarone (Cordarone),
    # Target INR, INR on Reported Therapeutic Dose of Warfarin, Cyp2C9 genotypes,
    # VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T
    input_df = pd.DataFrame([[gender.lower(), race, age_category, height, weight, float(diabetes), simvastatin,
                              amiodarone, 0.0, INR, Cyp2C9, VKORC1]],
                            columns=["Gender", "Race (Reported)", "Age", "Height (cm)", "Weight (kg)", "Diabetes",
                                     "Simvastatin (Zocor)", "Amiodarone (Cordarone)", "Target INR",
                                     "INR on Reported Therapeutic Dose of Warfarin", "Cyp2C9 genotypes",
                                     "VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T"])

    preprocessed_input_df = full_preprocess_function(input_df)

    # Model selection
    if model == "Logistic Regression":
        prediction = logistic_regression_final_model.predict(preprocessed_input_df)
    elif model == "Support Vector Machine":
        prediction = SVM_final_model.predict(preprocessed_input_df)
    elif model == "Random Forest":
        prediction = random_forest_final_model.predict(preprocessed_input_df)
    elif model == "Neural Network":
        prediction = NN_final_model.predict(preprocessed_input_df)
        prediction = prediction > NN_threshold
    else:
        return "Please select a Machine Learning Model"

    if prediction:
        return "The recommended Warfarin Dose is >30mg"
    else:
        return "The recommended Warfarin Dose is <=30mg"

# Define output module as the Warfarin dose
output_dose = gr.Textbox(label="Warfarin Dose")

# Define all input modules
input_age = gr.Slider(10, 100, step=1, label="Age", default=30)
input_height = gr.Number(label="Height (cm)")
input_weight = gr.Number(label="Weight (kg)")
input_gender = gr.Radio(choices=["Male", "Female"], label="Gender")
input_race = gr.Dropdown(choices=['Asian', 'Black or African American', 'Caucasian', 'Chinese', 'Han Chinese',
                                  'Hispanic', 'Indian', 'Intermediate', 'Japanese', 'Korean', 'Malay', 'Other',
                                  'Other Mixed Race', 'Unspecified', 'White'], label="Race")
input_diabetes = gr.Checkbox(label="Is the patient Diabetic?")
input_medication = gr.CheckboxGroup(["Simvastatin (Zocor)", "Amiodarone (Cordarone)"],
                                    label="Is the patient taking any of the following medication?")
input_Cyp2C9 = gr.Dropdown(['*1/*1', '*1/*11', '*1/*13', '*1/*14', '*1/*2', '*1/*3', '*1/*5', '*1/*6',
                            '*2/*2', '*2/*3', '*3/*3'], label="Cyp2C9 genotype")
input_VKORC1 = gr.Dropdown(['A/A', 'A/G', 'G/G', 'Unknown'], label="VKORC1 genotype")
input_INR = gr.Slider(1, 5, step=0.01, label="INR on Reported Therapeutic Dose of Warfarin", default=2.45)
input_model = gr.Dropdown(choices=["Logistic Regression", "Support Vector Machine", "Random Forest", "Neural Network"],
                          label="Machine Learning Model")

gr.Interface(fn=warfarin_callback,
             inputs=[input_age, input_height, input_weight, input_gender, input_race, input_diabetes,
                     input_medication, input_Cyp2C9, input_VKORC1, input_INR, input_model],
             outputs=output_dose).launch(debug=False)