# -*- coding: utf-8 -*-
"""Copy of finalProjectDaniel

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Xmu0qEBPBWsUKnRKtCsUn2mmP6R5tkZQ

# Importing libraries
"""

## Basic imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf

## Specific imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression

"""#Loading the DataSet and Reducing the Features of Interest"""

# from google.colab import drive
# drive.mount('/content/drive/', force_remount=True)
# !ls /content/drive/MyDrive/FALL2022/Warfarin_Dose_Prediction_Dataset.xls

## for theresa to run it
# from google.colab import drive
# drive.mount('/content/drive/')
# !ls /content/drive/MyDrive/Machine Learning/data_final_project.csv

# !pip install --upgrade xlrd

import pandas as pd
# original_df = pd.read_excel('/content/drive/MyDrive/FALL2022/Warfarin_Dose_Prediction_Dataset.xls')

# for theresa to run it
original_df = pd.read_csv('data_final_project.csv', sep=',')

# original_df.info()

patients = original_df[['Gender','Race (Reported)', 'Age', 'Height (cm)', 'Weight (kg)', 'Diabetes', 'Simvastatin (Zocor)', 'Amiodarone (Cordarone)',
                        'Target INR', 'INR on Reported Therapeutic Dose of Warfarin', 'Cyp2C9 genotypes',
                        'VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T', 'Therapeutic Dose of Warfarin']].copy()

# patients.head(n=5)

# patients.describe()

# patients.info()

# patients.to_excel("patients_df_reduced.xlsx")

"""# Setting aside a validation set right away
separates dataset into patients_df (95%) and validation_set (5%)
"""

from sklearn.model_selection import train_test_split

patients_df, validation_set = train_test_split(patients, test_size=0.05, random_state=42)

"""# Visualizing Data Features and Correlations on whole dataset (minus validation set)

###Looking at Numerical Data (note that some of these are numerical catagorical but are entered as 0 or 1)
"""

# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
patients_df.hist(bins=50, figsize=(20,15))
plt.show()

corr_matrix = patients_df.corr()
corr_matrix["Therapeutic Dose of Warfarin"].sort_values(ascending=False)

# note that Target INR and INR on Reported Therapeutic Dose of Warfarin are linearly related. Target INR has so few values that I will remove it as part of pre-processing
corr_matrix["Target INR"].sort_values(ascending=False)

"""### Looking at Catagorical Text Data (Use these catagories for gradio implementation later)"""

patients_df['Gender'].value_counts()

patients_df['Age'].value_counts()

patients_df['Race (Reported)'].value_counts()

patients_df['Target INR'].value_counts()

patients_df['Diabetes'].value_counts()

patients_df['Simvastatin (Zocor)'].value_counts()

patients_df['Amiodarone (Cordarone)'].value_counts()

patients_df['Cyp2C9 genotypes'].value_counts()

patients_df['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'].value_counts()

"""# Dropping any rows that have Nan in the target column ON WHOLE DATASET"""

# Dropping any rows that have Nan in the target column
patients_df.dropna(subset=['Therapeutic Dose of Warfarin'], inplace=True)
patients_df.info()

"""# Dividing Data into Statified Train (80%) and Test Set (20%)
This includes minimal pre-processing of gender and weight on the full dataset that was necessary for the statified sampling based on weight
Test and Train Sets with features and labels are stored in ;'strat_train_set' and 'strat_test_set'

patients_df -> strat_train_set, strat_test_set

### Perform Statified Sampling based on Weight (Chapter 2 Pages 54-55)

### Dropping Rows with Nan Gender Columns (since there are only 4 of them) -- NEED TO DO BEFORE STAT SAMPLING IN THIS CASE
"""

patients_df.dropna(subset=['Gender'], inplace=True)

"""#### Replacing Nan values in weight group with median based on Gender as is needed to perform statified sampling for the weight group"""

## looking at median female weight
median_female_weight=patients_df.loc[patients_df['Gender'] == 'female', 'Weight (kg)'].median()
median_female_weight

## looking at median male weight
median_male_weight=patients_df.loc[patients_df['Gender'] == 'male', 'Weight (kg)'].median()
median_male_weight

## filling in null weight values on full dataset
medians = patients_df.groupby(['Gender'])['Weight (kg)'].median()
patients_df = patients_df.set_index(['Gender'])
patients_df['Weight (kg)'] = patients_df['Weight (kg)'].fillna(medians)
patients_df = patients_df.reset_index()

patients_df['Weight (kg)'].isna().sum()

"""#### Creating Weight Catagories from which the test set will sample from"""

patients_df["weight_cat"] = pd.cut(patients_df["Weight (kg)"], bins=[0, 50, 75, 100, np.inf],
labels=[1, 2, 3, 4])
patients_df["weight_cat"].hist()

"""#### Dividing patients_df into strat_train_set (80%) and strat_test_set (20%) distribution"""

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(patients_df, patients_df["weight_cat"]):
  strat_train_set = patients_df.loc[train_index]
  strat_test_set = patients_df.loc[test_index]

"""#### Comparing proportion of samples per weight catagory between test set and original dataset
#####(distrbutions are the same, showing that the stratified sampling worked)
"""

strat_test_set["weight_cat"].value_counts() / len(strat_test_set)

for set_ in (strat_train_set, strat_test_set):
 set_.drop("weight_cat", axis=1, inplace=True)

patients_df["weight_cat"].value_counts() / len(patients_df)

"""## Separate the data from the labels in training set:
##### strat_train_set -> patients_info and patients_labels
"""

patients_info = strat_train_set.drop("Therapeutic Dose of Warfarin", axis=1) # drop labels for training set
patients_labels = strat_train_set["Therapeutic Dose of Warfarin"].copy()

"""## Custom Transformers for Pre-processing (Important Part)
##### reference: Chapter 2 Textbook associated google collab notebook

##### creating a custom transformer to handle catagorical attributes Nan Values:
##### includes Gender, Cyp2C9 genotypes, VKORC1 genotype, Diabetes, Amiodarone, Simvastatin, Race, Age
"""

from sklearn.base import BaseEstimator, TransformerMixin

class CatTransformer(BaseEstimator, TransformerMixin):
    """
    REPLACEMENT OF NAN FOR ALL CATAGORICAL FEATURES
    for Gender, fills with mode from training set
    for Cyp2C9 genotypes, fills with mode from training set as there is a most common class by far
    for VKORC1 genotype, many more are unknown, and there is not a most common class, so fills with "unknown", thus creating a new catagory
    for Diabetes phenotype, fills with mode--assumes no diabetes
    for Amiodarone (Cordarone) drug, fills with mode from training set as there is a most common class by far
    for Simvastatin (Zocor), fills with mode from training set as there is a most common class by far
    for Race, fills nan with "unknown" and converts all classes to upper so that the several groups labelled "other" are grouped together
    for Race, only a few were missing--replacement of nan with Mode
    for Race, due to there already being a catagory for "Black or African American", the catagories "Black" and "African American" were grouped together under "Black or African American"
    for Age, fills nan with mode from training set--not many Age values are missing. Even though there is not a most common class by a lot, I think this is best
    """
    def __init__(self): # no *args or **kwargs
      pass
    def fit(self, X, y=None):
      self.mode_Gen = X['Gender'].mode()[0]
      self.mode_Cyp = X['Cyp2C9 genotypes'].mode()[0]
      self.mode_Amio = X['Amiodarone (Cordarone)'].mode()[0]
      self.mode_Simv = X['Simvastatin (Zocor)'].mode()[0]
      self.mode_Diab = X['Diabetes'].mode()[0]
      self.mode_Age = X['Age'].mode()[0]
      return self
    def transform(self, X):
      X['Cyp2C9 genotypes']=X['Cyp2C9 genotypes'].fillna(self.mode_Cyp)
      X['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T']=X['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'].fillna("Unknown")
      X['Amiodarone (Cordarone)']=X['Amiodarone (Cordarone)'].fillna(self.mode_Amio)
      X['Simvastatin (Zocor)']=X['Simvastatin (Zocor)'].fillna(self.mode_Simv)
      X['Diabetes']=X['Diabetes'].fillna(self.mode_Diab)
      X['Race (Reported)'] = X['Race (Reported)'].fillna("UNSPECIFIED")
      X['Race (Reported)'] = X['Race (Reported)'].str.upper()
      X=X.replace({'Race (Reported)': {'AFRICAN-AMERICAN': 'BLACK OR AFRICAN AMERICAN', 'BLACK': 'BLACK OR AFRICAN AMERICAN'}})
      X['Age']=X['Age'].fillna(self.mode_Age)
      X['Gender']=X['Gender'].fillna(self.mode_Gen)
      return X

"""##### creating a custom transformer to handle the transformation of height nan variables based on gender-depenedent median"""

from sklearn.base import BaseEstimator, TransformerMixin

class GenderTransformer(BaseEstimator, TransformerMixin):
    """
    replaces missing Height variables by median for the associated gender
    replaces missing Weight variables by median for the associated gender
    """
    def __init__(self): # no *args or **kwargs
      pass
    def fit(self, X, y=None):
      self.medians_height = X.groupby(['Gender'])["Height (cm)"].median()
      self.medians_weight = X.groupby(['Gender'])["Weight (kg)"].median()
      return self
    def transform(self, X):
      X = X.set_index(['Gender'])
      X["Height (cm)"] = X["Height (cm)"].fillna(self.medians_height)
      X["Weight (kg)"] = X["Weight (kg)"].fillna(self.medians_weight)
      X = X.reset_index()
      return X

"""##### creating a custom transformer to add extra attributes (BMI, BSA):"""

from sklearn.base import BaseEstimator, TransformerMixin

# column index
col_names = ["Height (cm)", "Weight (kg)"]
weight_ix, height_ix = [0, 1] # get the column indices; they are 0 and 1
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """
    adds the variables for BSA (body surface area) to the data
    def transform returns numpy array
    Body Surface Area (as calculated from the DuBois and DuBois formula)
    reference: https://www.uptodate.com/contents/image?imageKey=ONC%2F96451&topicKey=ONC%2F83810&search=Pharmacogenomics&rank=3~18&source=see_link
    """
    def __init__(self): # no *args or **kwargs
        pass
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
      # BMI = X[:, weight_ix] / ((X[:, height_ix]/100)**2)
      BSA = ((0.007184*(X[:, weight_ix])**0.425)) * ((X[:, height_ix])**0.725)
      return np.c_[X, BSA]

"""#### Working Transformer Pipelines

##### pipeline for dealing with missing height and weight values
"""

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

gender_pipeline = Pipeline([
        ('gender_transformer', GenderTransformer()),
    ])

"""##### pipeline for dealing with catagorical data nan values"""

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

cat_pipeline = Pipeline([
        ('catagorical_transformer',  CatTransformer()),
    ])

"""##### pipeline for dealing with numerical data: height, weight, INR
##### uses CombinedAttributeAdder class for the addition of BSA (or BMI)
##### uses SimpleImputer to replace any remaining Nan values with the median for that feature
##### uses StandardScaler for scaling
"""

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

"""##### full pipuline using ColumnTransformer
##### Adds Attributes (from num_pipeline), Scales and imputes numerical data (from num_pipeline), Uses ordinal encoder for Ordinal Catagorical Data (Age), Uses 1Hot Encoder for non-ordinal Catagorical Data
"""

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

num_attribs = ['Height (cm)', 'Weight (kg)', 'INR on Reported Therapeutic Dose of Warfarin']
cat_attribs_ordinal = ['Age', 'Gender', 'Diabetes', 'Simvastatin (Zocor)', 'Amiodarone (Cordarone)']
cat_attribs_1hot = ["Race (Reported)",
                    'Cyp2C9 genotypes', 'VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T']
"""
Pipeline using column transformer
Adds BSA attribute (from num_pipeline)
imputes remaining nan numerical data using median (from num_pipeline)
scales numerical data using StandardScaler (from num_pipeline)
Uses ordinal encoder for Ordinal Catagorical Data (Age) and Binary Catagorical Data (gender, diabetes, simvastatin, amiodorone)--see cat_attrib_ordinal
Uses 1Hot Encoder for non-ordinal Catagorical Data--see cat_attribs_1hot
"""
scale_encode_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ('cat_ord', OrdinalEncoder(), cat_attribs_ordinal),
        ("cat_1hot", OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_attribs_1hot),
    ]) #input list of (name, transformer, columns) tuples specifying the transformer objects to be applied to subsets of the data.

"""## Full PreProcess Function to incorporate all pipelines
##### contains "full_preprocess_function()"
"""

def series_to_df(data_series):
  """
  function to help with processing new data (potentially useful for Gradio implementation)
  input: Series with dimensions (12,)
  output: pandas dataframe with features as column names; can now be sent through full_preprocess_function
  """
  data_df = data_series.to_frame()
  data_df = data_df.transpose()
  return data_df

def full_preprocess_function(data_df, train=False):
  """
  INPUT: program expects the equivalent of an instance (or multiple instances) from the non pre-processed dataset (without the label) in the form of a pandas_df

        --input should have the following 12 features as column names: Gender, Race (Reported), Age, Height (cm), Weight (kg), Diabetes, Simvastatin (Zocor), Amiodarone (Cordarone),
        Target INR, INR on Reported Therapeutic Dose of Warfarin,
        Cyp2C9 genotypes, VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T
        --input should either contain a value for each feature of Nan

  program will remove the Target INR column from dataset as there were too few values and it was multicollinearly related to INR Reported
  if train==True, function will send training data to pre-processing be fit and transformed

  else, function will send new data to pre-processing to be transformed (not fit)
  OUTPUT: function returns pandas df of features, including feature names as column names
  Note for encoded variables:
  Gender: 0=female, 1=male;
  Diabetes: 0=no, 1=yes;
  Simvastatin: 0=no, 1=yes;
  Amiodorone: 0=no, 1=yes;
  Age: {0: '10 - 19', 1:'20 - 29', 2:'30 - 39', 3:'40 - 49', 4:'50 - 59', 5:'60 - 69', 6:'70 - 79', 7:'80 - 89', 8:'90+'}
  """
  if isinstance(data_df, pd.Series) and data_df.shape == (12,):
    raise TypeError("Expects pd.DataFrame; Send your data through the series_to_df() function for conversion to proper format")
  if not isinstance(data_df, pd.DataFrame):
    raise TypeError("Expects pd.DataFrame; See full_preprocess function documentation for input expectations")
  # prepared_feature_names = ['Height (cm)', 'Weight (kg)', 'INR (Reported)', 'BSA (m**2)', 'Age', 'Gender', 'Diabetes', 'Simvastatin', 'Amiodorone',
  #                  'ASIAN', 'BLACK OR AFRICAN AMERICAN', 'CAUCASIAN', 'CHINESE', 'HAN CHINESE', 'HISPANIC', 'INDIAN', 'INTERMEDIATE', 'JAPANESE', 'KOREAN', 'MALAY', 'OTHER','OTHER MIXED RACE', 'UNSPECIFIED', 'WHITE',
  #                  '*1/*1', '*1/*11', '*1/*13', '*1/*14', '*1/*2', '*1/*3', '*1/*5', '*1/*6', '*2/*2', '*2/*3', '*3/*3',
  #                  'A/A', 'A/G', 'G/G', 'Unknown']
  data_df.drop(['Target INR'], axis=1, inplace=True) # remove Target INR due to too few values and collinearity with INR Reported
  if train==True:
    data_cat_tr = cat_pipeline.fit_transform(data_df)
    data_height_tr = gender_pipeline.fit_transform(data_cat_tr)
    data_prepared = scale_encode_pipeline.fit_transform(data_height_tr)
  else:
    data_cat_tr = cat_pipeline.transform(data_df)
    data_height_tr = gender_pipeline.transform(data_cat_tr )
    data_prepared = scale_encode_pipeline.transform(data_height_tr)
  data_prepared_df =  pd.DataFrame(data_prepared)
  # data_prepared_df.drop(['Weight (kg)'], axis=1, inplace=True) # removing weight to address multicollinearity
  return data_prepared_df


"""

## showing un-pre-processed dataset
patients_info.head()


X_train_prepared = full_preprocess_function(patients_info, train=True)

# showing pre-processed training dataset
X_train_prepared.head()

X_train_prepared.info()

"""##### Send pre-processed train_data to excel (labels too)"""

# X_train_prepared.to_excel("X_patients_train.xlsx")
# patients_labels.to_excel('y_patients_train.xlsx')

"""## Making Sure Pre-processed training set works with basic model"""

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, patients_labels)

patients_labels

from sklearn.metrics import mean_squared_error

patients_predictions = lin_reg.predict(X_train_prepared)
lin_mse = mean_squared_error(patients_labels, patients_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

"""## Pre-processing on Test Set (currently stored in strat_test_set)
##### note: strat_test_set contains features and labels
##### produces X_test_prepared and y_test

#### Separate strat_test_set features from labels
##### stored in X_test and y_test
"""

X_test = strat_test_set.drop("Therapeutic Dose of Warfarin", axis=1)
y_test = strat_test_set["Therapeutic Dose of Warfarin"].copy()

"""#### Send X_test to pre-processing function/pipeline
##### stored in X_test_prepared
"""

X_test_prepared = full_preprocess_function(X_test)

"""##### Send pre-processed test_data to excel (labels too)"""

# X_test_prepared.to_excel("X_patients_test.xlsx")
# y_test.to_excel("y_patients_test.xlsx")

"""## Making sure Pre-processed testing set works with simple regression model"""

test_predictions = lin_reg.predict(X_test_prepared)

"""#### Evaluate mse and rmse"""

test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)

test_rmse

"""## Pre-processing on Validation Set
##### produces X_val_prepared and y_val

#### Dropping nan labels and Separating validation_set features from labels
##### oridinally stored in 'X_val' and 'y_val'
"""

validation_set.dropna(subset=['Therapeutic Dose of Warfarin'], inplace=True)
X_val = validation_set.drop("Therapeutic Dose of Warfarin", axis=1)
y_val = validation_set["Therapeutic Dose of Warfarin"].copy()

"""## Sending a single instance from X_val through pre-processing pipeline and making sure it works with simple regression model"""

trial = X_val.iloc[3]
trial

trial.shape

trial_df = series_to_df(trial)

# example of input for full_preprocessing_function()
trial_df

X_val_trial = full_preprocess_function(trial_df)

# example of pre-processed single test input
X_val_trial

trial_val_prediction = lin_reg.predict(X_val_trial)

trial_val_prediction

y_trial = y_val.iloc[3]
y_trial

"""#### Sending X_val through pre-processing pipeline"""

X_val_prepared = full_preprocess_function(X_val)

"""## Making sure pre-processed validation set works with simple regression model"""

val_predictions = lin_reg.predict(X_val_prepared)

val_mse = mean_squared_error(y_val, val_predictions)
val_rmse = np.sqrt(val_mse)

val_rmse

"""##### Send pre-processed validation_data to excel (labels too)"""

# X_val_prepared.to_excel("X_patients_val.xlsx")
# y_val.to_excel("y_patients_val.xlsx")

"""#**PART II ----> ML MODELS FOR BINARY CLASSIFICATION**

**First let's create a binary classification dataset by cutting the target values into two categories (<30 mg , >=30 mg)**
"""

import numpy as np

y_train = patients_labels

#Preparing training/testing/validation data for binary classifier
train_label_binary = (y_train >= 30)
print("binary train labels:", train_label_binary)

# print("original test labels:", y_test)
test_label_binary = (y_test >= 30)
print("binary test labels:", test_label_binary)

validation_label_binary = (y_val >= 30)
print("binary validation labels:", validation_label_binary)

"""## 1.LOGISTIC REGRESSION MODEL

Logistic regression can be used for binary classification because it estimates the probability that one instance belogns to a class or not. So by using a probability threshold e.g 50%, it classifies the instances in positive class (1) if the probability is greater than 50 %. otherwise the instances will be classified in negative class (0). So, this model works in the same way as the Linear Regression but instead of outputing the result, it outputs the logistic of the result.
"""

from sklearn.linear_model import LogisticRegression
log_regression = LogisticRegression(penalty = 'l2', C = 1, random_state = 0 )
log_regression.fit(X_train_prepared, train_label_binary.values.ravel())
log_prediction = log_regression.predict(X_train_prepared)
log_prediction

"""## 2.SUPPORT VECTOR MACHINE
The main goal of Support Vector Machines is to fit the widest possible “street” between the classes. So, we need to have a large margin between the decision boundary which separates the classes and the training instances. the objective of SVM to find the optimal classifier is bacause the other linear classifiers might separate linear dataset in the correct way but the decision boundary is so close the training instances so that these models will probably not perform as well on new instances. Tha's why SVM tries to find the widest possible "street" between the classes.

"""

from sklearn.svm import SVC
# # define linear kernel,
# svm_model_linear = SVC(kernel = "linear",C = 1 )
# svm_model_linear.fit(X_train_prepared, train_label_binary.values.ravel())
# svm_linear_prediction= svm_model_linear.predict(X_train_prepared)
# svm_linear_prediction

# define polynomial kernel, P158
svm_model_polynomial = SVC(kernel = "poly", degree = 7, C = 7 )
svm_model_polynomial.fit(X_train_prepared, train_label_binary.values.ravel())
svm_polynomial_prediction = svm_model_polynomial.predict(X_train_prepared)

svm_polynomial_prediction

"""## 3.DECISION TREE MODEL"""

from sklearn.tree import DecisionTreeClassifier
# define tree model
decision_tree_model = DecisionTreeClassifier(max_depth = 5)
decision_tree_model.fit(X_train_prepared, train_label_binary.values.ravel())
decision_tree_prediction = decision_tree_model.predict(X_train_prepared)
decision_tree_prediction

"""## 4.RANDOM FOREST MODEL"""

from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(n_estimators = 500, max_depth= 10, max_leaf_nodes = -1)
random_forest_model.fit(X_train_prepared, train_label_binary.values.ravel())
random_forest_prediction = random_forest_model.predict(X_train_prepared)
random_forest_prediction

"""## 5.NEURAL NETWORK"""

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout

# Define decision threshold
NN_threshold = 0.5;

def build_NN(n_layers = 3, n_neurons = 1000, dropout = 0):
   model = Sequential() # create Sequential model
   for i in range(n_layers-1):
       model.add(Dense(n_neurons, activation = 'relu'))
       model.add(Dropout(dropout))
   model.add(Dense(1, activation = 'sigmoid'))  # 2 output neurons for binary classification
   model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ['accuracy']) # binary cross-entropy because it's binary classification!
   return model

# Build random NN
NN_model = build_NN(n_layers = 3, n_neurons = 10)

train_history = NN_model.fit(X_train_prepared, train_label_binary.values.ravel(), validation_data=(X_val_prepared,validation_label_binary.values.ravel()), batch_size=128, epochs = 20)
NN_prediction = NN_model.predict(X_train_prepared)

# Prepare prediction to be comparable
NN_prediction = (NN_prediction >= NN_threshold)

"""## **Calculating the performance of each model in the train dataset**"""

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
methods = [decision_tree_prediction, random_forest_prediction,svm_polynomial_prediction,log_prediction, NN_prediction]
names = ["decision_tree_model", "random_forest_model","svm_polynomial_model","log_model", 'neural_net']
accuracy = []
precision =[]
recall = []
ROC = []
F1= []
for method in methods:
  accuracyy = accuracy_score(train_label_binary, method)
  accuracy.append(accuracyy)
  precision1 = precision_score(train_label_binary, method)
  precision.append(precision1)
  recall1 = recall_score(train_label_binary, method)
  recall.append(recall1)
  ROC1 = roc_auc_score(train_label_binary, method)
  ROC.append(ROC1)
  F11 = f1_score(train_label_binary, method)
  F1.append(F11)

data = {'Method': names,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'ROC': ROC,
        'F1 score': F1,
        }
evaluation = pd.DataFrame(data, columns=['Method', "Accuracy", "Precision","Recall", "ROC", "F1 score"])
evaluation

"""## **Let's do a better Evaluation Using Cross-Validation**

**Logistic Regression cross validation**
"""

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
log_regression= LogisticRegression(solver ='liblinear')
penalty = ['l1', 'l2']
C = [1,0.1,0.01,0.001]
hyperparameters = dict(C=C, penalty=penalty)
classifier = GridSearchCV(log_regression, hyperparameters, cv=10, verbose =0)
best_model = classifier.fit(X_train_prepared, train_label_binary )

#printing out the best parameters for Logistic Regression model
print('Best penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

model = LogisticRegression(solver ='liblinear', **best_model.best_params_)
model.fit(X_train_prepared, train_label_binary )
logistic_prediction= model.predict(X_train_prepared)
logistic_prediction

#calculating the accuracy of the model
scores = cross_val_score(model, X_train_prepared, train_label_binary )
scores
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve

y_scores = cross_val_predict(model, X_train_prepared, train_label_binary, cv= 10, method = "decision_function")  #decision_function
fpr, tpr, thresholds = roc_curve (train_label_binary, y_scores)

def plot_roc_curve(fpr, tpr, label =None):
  plt.plot(fpr, tpr, linewidth=2, label = label)
  plt.plot([0,1], [0,1], "k--")
plot_roc_curve(fpr, tpr)
plt.title('ROC curve for Logistic Regression')
plt.xlabel('False Positive Rate (1- specifity')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(['Logistic Regression'],loc ="lower right")
plt.grid()
plt.show()

"""**Support Vector Machine Cross validation**"""

from sklearn.svm import SVC

# hyperparameter_set = {'C': [0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': [0.001, 0.01, 0.1, 1]}
# svm = SVC()
# classifier2 = GridSearchCV(svm, hyperparameter_set, cv=10, verbose =0)
# best_SV = classifier2.fit(X_train_prepared, train_label_binary  )

# #printing out the best parameters for SVM model
# print('Best kernel:', best_SV.best_params_['kernel'])
# print('Best C:', best_SV.best_params_['C'])
# print('Best gamma:', best_SV.best_params_['gamma'])

SVM_final_model = SVC(C=1, kernel= 'rbf', gamma = 0.1, probability=True)
SVM_final_model.fit(X_train_prepared, train_label_binary)
svm_prediction= SVM_final_model.predict(X_train_prepared)
svm_prediction

#calculating the accuracy of the model
scores = cross_val_score(SVM_final_model, X_train_prepared, train_label_binary )
scores
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

#Drawing the ROC curve for SVM
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve

y_scores = cross_val_predict(model, X_train_prepared, train_label_binary, cv= 10, method = "decision_function")
fpr, tpr, thresholds = roc_curve (train_label_binary, y_scores)

def plot_roc_curve(fpr, tpr, label =None):
  plt.plot(fpr, tpr, linewidth=2, label = label)
  plt.plot([0,1], [0,1], "k--")
plot_roc_curve(fpr, tpr)
plt.title('ROC curve for Support Vector Machine')
plt.xlabel('False Positive Rate (1- specifity')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(['Support Vector Machine '],loc ="lower right")
plt.grid()
plt.show()

"""**Random Forest Cross Validation**"""

# hyperparameter_set = {'n_estimators': [100, 200, 300, 400], 'max_features': ['auto', 'sqrt']}
# random_forest = RandomForestClassifier()

# classifier3 = GridSearchCV(random_forest, hyperparameter_set, cv=10, verbose =0)
# best_model3 = classifier3.fit(X_train_prepared, train_label_binary )

# print('Best n_estimators:', best_model3.best_params_['n_estimators'])
# print('Best max_features:', best_model3.best_params_['max_features'])

model3 = RandomForestClassifier(n_estimators = 200, max_features= 'sqrt')
model3.fit(X_train_prepared, train_label_binary)
random_forest_prediction= model3.predict(X_train_prepared)
random_forest_prediction

#calculating the accuracy of the model
scores = cross_val_score(model3, X_train_prepared, train_label_binary )
scores
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

#Drawing the ROC curve for SVM
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve

y_scores = cross_val_predict(model, X_train_prepared, train_label_binary, cv= 10, method = "decision_function")  #decision_function
fpr, tpr, thresholds = roc_curve (train_label_binary, y_scores)

def plot_roc_curve(fpr, tpr, label =None):
  plt.plot(fpr, tpr, linewidth=2, label = label)
  plt.plot([0,1], [0,1], "k--")
plot_roc_curve(fpr, tpr)
plt.title('ROC curve for Random Forest')
plt.xlabel('False Positive Rate (1- specifity')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(['Random Forest '],loc ="lower right")
plt.grid()
plt.show()

"""**Showing the feature importance analysis in random forest.**"""

from pandas import DataFrame
random_forest = RandomForestClassifier(n_estimators = 300, random_state=60)
random_forest.fit(X_train_prepared,train_label_binary)
random_forest_importance = random_forest.feature_importances_
print(random_forest_importance)

features = original_df.columns

importances = random_forest_importance
indices = np.argsort(importances)


**Calculating the evaluation metrics for each model and then adding the data in pandas DataFrame**
"""

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
predictions = [logistic_prediction,svm_prediction, random_forest_prediction]
names = ["Logistic_regression model","Support Vector Machine model", "Random_forest_model"]
accuracy = []
precision =[]
recall = []
ROC = []
F1= []
for i in predictions:
  accuracyy = accuracy_score(train_label_binary, i)
  accuracy.append(accuracyy)
  precision1 = precision_score(train_label_binary, i)
  precision.append(precision1)
  recall1 = recall_score(train_label_binary, i)
  recall.append(recall1)
  ROC1 = roc_auc_score(train_label_binary, i)
  ROC.append(ROC1)
  F11 = f1_score(train_label_binary, i)
  F1.append(F11)

data2 = {'Method': names,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'ROC': ROC,
        'F1 score': F1,
        }
evaluation = pd.DataFrame(data2, columns=['Method', "Accuracy", "Precision","Recall", "ROC", "F1 score"])
evaluation

"""**Drawing the ROC curve of all models on the train dataset**"""

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve
roc_curve_rates = []
for model in [model3, SVM_final_model, model]:  #models are 'Logistic Regression', 'RandomForestClassifier', 'SVC'
  #finds the predicted probability for the sets and model
  predict_probability = cross_val_predict(model, X_train_prepared, train_label_binary, cv= 10, method = "predict_proba")
  #gets the probs for pos class
  y_scorse = predict_probability[:,1]
  #calculates the fpr and tpr with te scores
  fpr, tpr, threshold = roc_curve(train_label_binary, y_scorse)
  roc_curve_rates.append({'fpr': fpr, 'tpr': tpr})


#Takes the dics array and plots each line on the same graph
line_names = ['Logistic Regression', 'RandomForestClassifier', 'SVC']
plt.plot(fpr, tpr, linewidth=2)
for i in range(len(roc_curve_rates)):
  plt.plot(roc_curve_rates[i]['fpr'], roc_curve_rates[i]['tpr'], linewidth=2, label=line_names[i])
plt.xlim([0,1])
plt.ylim([0,1])
plt.plot([0,1], [0,1], "k--")
plt.title('ROC curve')
plt.xlabel('False Positive Rate (1 - specifity)')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(loc ="lower right")
plt.grid()
plt.show()

"""**Optimizing the Neural Network**"""

# Parameters to check
number_of_layers = [3, 4, 5, 6, 7]
number_of_neurons = [10, 100, 100, 5000]

# Variables for saving data
best_epoch = [[]];
best_accuracy = [[]];
i = 0;

# Add early stopping into model training
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
keras_callbacks = [
      EarlyStopping(monitor='val_loss', patience=5, mode='min', min_delta=0.0001),
]

# Loop through all parameters
for layers in number_of_layers:
  for neurons in number_of_neurons:
    print("Testing NN - Layers: "+ str(layers) + "; Neurons per layer:" + str(neurons))
    NN_model = build_NN(layers, neurons)
    train_history = NN_model.fit(X_train_prepared, train_label_binary.values.ravel(), validation_data=(X_val_prepared,validation_label_binary.values.ravel()), batch_size=128, epochs = 30, callbacks=keras_callbacks)
    # Using validation accuracy as performance metric
    accuracy = train_history.history['val_accuracy']
    best_accuracy[i].append(max(accuracy))
    best_epoch[i].append(accuracy.index(max(accuracy)))
  i = i + 1;
  best_epoch.append([])
  best_accuracy.append([])

# Remove last element
best_epoch.pop(i)
best_accuracy.pop(i)

# Build model with best parameters
ideal_layers_index = best_accuracy.index(max(best_accuracy))
ideal_layers = number_of_layers[ideal_layers_index]
ideal_neurons = number_of_neurons[best_accuracy[ideal_layers_index].index(max(best_accuracy[ideal_layers_index]))]

# Print Results
print("Best number of layers:", str(ideal_layers))
print("Best number of neurons:", str(ideal_neurons))

"""## **Evaluate all the models on the Test Set**


"""

#Logistic Regression
logistic_regression_final_model = LogisticRegression(solver ='liblinear', **best_model.best_params_)
logistic_regression_final_model.fit(X_train_prepared, train_label_binary )
logistic_prediction_test= logistic_regression_final_model.predict(X_test_prepared)
logistic_prediction_test

#Support Vector Machine
SVM_final_model = SVC(C=0.1, kernel= 'linear', gamma = 'scale', probability=True)
SVM_final_model.fit(X_train_prepared, train_label_binary)
svm_prediction_test= SVM_final_model.predict(X_test_prepared)
svm_prediction_test

# Random Forest Classifier
random_forest_final_model = RandomForestClassifier(n_estimators = 400, max_features= 'sqrt')
random_forest_final_model.fit(X_train_prepared, train_label_binary)
random_forest_prediction_test= random_forest_final_model.predict(X_test_prepared)
random_forest_prediction_test

# Neural Network
keras_callbacks = [
      EarlyStopping(monitor='val_loss', patience=10, mode='min', min_delta=0.0001),
      ModelCheckpoint('./checkmodel.h5', monitor='val_loss', save_best_only=True, mode='min')
]
NN_final_model = build_NN(ideal_layers, ideal_neurons, dropout=0.15)
NN_final_model.fit(X_train_prepared, train_label_binary, validation_data=(X_val_prepared,validation_label_binary.values.ravel()), batch_size=128, epochs = 30, callbacks=keras_callbacks)
NN_prediction_test= NN_final_model.predict(X_test_prepared)

# Prepare prediction to be comparable
NN_prediction_test = (NN_prediction_test >= NN_threshold)

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
predictions = [logistic_prediction_test,svm_prediction_test, random_forest_prediction_test, NN_prediction_test]
names = ["Logistic_regression_test","Support_vector_machine_test", "Random_forest_test", "Neural_net_test"]
accuracy = []
precision =[]
recall = []
ROC = []
F1= []
for i in predictions:
  accuracyy = accuracy_score(test_label_binary, i)
  accuracy.append(accuracyy)
  precision1 = precision_score(test_label_binary, i)
  precision.append(precision1)
  recall1 = recall_score(test_label_binary, i)
  recall.append(recall1)
  ROC1 = roc_auc_score(test_label_binary, i)
  ROC.append(ROC1)
  F11 = f1_score(test_label_binary, i)
  F1.append(F11)

data3 = {'Method': names,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'ROC': ROC,
        'F1 score': F1,
        }
evaluation = pd.DataFrame(data3, columns=['Method', "Accuracy", "Precision","Recall", "ROC", "F1 score"])
evaluation

"""**Trade-off between precision and recall** **for** :

1.   Logistic Regression
2.   Support Vector Machine
1.   Random Forest


"""

from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

y_score = logistic_regression_final_model.predict_proba(X_test_prepared)[:, 1]

#calculate precision and recall
precision, recall, thresholds = precision_recall_curve(test_label_binary, y_score)
#create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='red')
#add axis labels to plot
ax.set_title('Precision-Recall Curve for Logistic Regression')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')

#display plot
plt.grid(True)
plt.show()

from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

y_score = random_forest_final_model.predict_proba(X_test_prepared)[:, 1]

#calculate precision and recall
precision, recall, thresholds = precision_recall_curve(test_label_binary, y_score)
#create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='blue')
#add axis labels to plot
ax.set_title('Precision-Recall Curve for Support Vector Machine')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')

#display plot
plt.grid(True)
plt.show()

from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

y_score = SVM_final_model.predict_proba(X_test_prepared)[:, 1]

#calculate precision and recall
precision, recall, thresholds = precision_recall_curve(test_label_binary, y_score)
#create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='purple')
#add axis labels to plot
ax.set_title('Precision-Recall Curve for Random Forest Model')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')

#display plot
plt.grid(True)
plt.show()

"""**Drawing the ROC curve of all models on the test dataset**"""

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve
roc_curve_rates = []
for model in [logistic_regression_final_model, random_forest_final_model,SVM_final_model]:  #models are 'Logistic Regression', 'RandomForestClassifier', 'SVC'
  #finds the predicted probability for the sets and model
  predict_probability = cross_val_predict(logistic_regression_final_model, X_test_prepared, test_label_binary, cv= 10, method = "predict_proba")
  #gets the probs for pos class
  y_scorse = predict_probability[:,1]
  #calculates the fpr and tpr with te scores
  fpr, tpr, threshold = roc_curve(test_label_binary, y_scorse)
  roc_curve_rates.append({'fpr': fpr, 'tpr': tpr})

#Takes the dics array and plots each line on the same graph
line_names = ['Logistic Regression', 'RandomForestClassifier', 'SVC']
plt.plot(fpr, tpr, linewidth=2)
for i in range(len(roc_curve_rates)):
  plt.plot(roc_curve_rates[i]['fpr'], roc_curve_rates[i]['tpr'], linewidth=2, label=line_names[i])
plt.xlim([0,1])
plt.ylim([0,1])
plt.plot([0,1], [0,1], "k--")
plt.title('ROC curve')
plt.xlabel('False Positive Rate (1 - specifity)')
plt.ylabel('True Positive Rate (Recall)')
plt.legend(loc ="lower right")
plt.grid()
plt.show()

"""#**PART III ----> Gradio Implementation**


"""

# Install Gradio
# !pip install --quiet gradio

# Import Gradio Library
import gradio as gr

# Define callback function
def warfarin_callback(age, height, weight, gender, race, diabetes, medication, Cyp2C9, VKORC1, INR, model):
  # Input validation
  if not gender:
    return "Please select the patient's gender"
  if not race:
    return "Please select the patient's race"

  # Extract medication
  simvastatin = 0.0
  amiodarone = 0.0
  if 'Simvastatin (Zocor)' in medication: simvastatin = 1.0
  if 'Amiodarone (Cordarone)' in medication: amiodarone = 1.0
  # Categorize age
  age_categories = ['10 - 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '60 - 69', '70 - 79', '80 - 89', '90+']
  age_category = age_categories[int(np.floor(age/10)) - 1]

  # Gender, Race (Reported), Age, Height (cm), Weight (kg), Diabetes, Simvastatin (Zocor), Amiodarone (Cordarone), Target INR, INR on Reported Therapeutic Dose of Warfarin, Cyp2C9 genotypes, VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T
  input_df = pd.DataFrame([[gender.lower(), race, age_category, height, weight, float(diabetes), simvastatin, amiodarone, 0.0, INR, Cyp2C9, VKORC1]], columns=["Gender", "Race (Reported)", "Age", "Height (cm)", "Weight (kg)", "Diabetes", "Simvastatin (Zocor)", "Amiodarone (Cordarone)", "Target INR", "INR on Reported Therapeutic Dose of Warfarin", "Cyp2C9 genotypes", "VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T"])
  preprocessed_input_df = full_preprocess_function(input_df)

  # Model Selection
  if model == "Logistic Regression":
    prediction = (logistic_regression_final_model.predict(preprocessed_input_df))
  elif model == "Support Vector Machine":
    prediction = (SVM_final_model.predict(preprocessed_input_df))
  elif model == "Random Forest":
    prediction = (random_forest_final_model.predict(preprocessed_input_df))
  elif model == "Neural Network":
    prediction = (NN_final_model.predict(preprocessed_input_df))
    prediction = prediction > NN_threshold
  else:
    return "Please select a Machine Learning Model"

  if prediction:
    return "The recommended Warfarin Dose is >30mg"
  else:
    return "The recommended Warfarin Dose is <=30mg"

# Define output module as Warfarin dose
output_dose = gr.Textbox(label = "Warfarin Dose")

# Define all input modules
input_age = gr.Slider(10, 100, step=1, label = "Age", default=30)
input_height = gr.Number(label = "Height (cm)")
input_weight = gr.Number(label = "Weight (kg)")
input_gender = gr.Radio(choices=["Male", "Female"], label = "Gender")
input_race = gr.Dropdown(choices=['Asian', 'Black or African American', 'Caucasian', 'Chinese', 'Han Chinese', 'Hispanic', 'Indian', 'Intermediate', 'Japanese', 'Korean', 'Malay', 'Other','Other Mixed Race', 'Unspecified', 'White'], label = "Race")
input_diabetes = gr.Checkbox(label = "Is the patient Diabetic?")
input_medication = gr.CheckboxGroup(["Simvastatin (Zocor)", "Amiodarone (Cordarone)"], label = "Is the patient taking any of the following medication?")
input_Cyp269 = gr.Dropdown(['*1/*1', '*1/*11', '*1/*13', '*1/*14', '*1/*2', '*1/*3', '*1/*5', '*1/*6', '*2/*2', '*2/*3', '*3/*3'], label = "Cyp2C9 genotype")
input_VKORC1 = gr.Dropdown(['A/A', 'A/G', 'G/G', 'Unknown'], label = "VKORC1 genotype")
input_INR = gr.Slider(1, 5, step=0.01, label = "INR on Reported Therapeutic Dose of Warfarin", default=2.45)
input_model = gr.Dropdown(choices=["Logistic Regression", "Support Vector Machine", "Random Forest", "Neural Network" ], label = "Machine Learning Model")

gr.Interface(fn=warfarin_callback, inputs=[input_age, input_height, input_weight,input_gender, input_race, input_diabetes, input_medication, input_Cyp269, input_VKORC1, input_INR, input_model], outputs=output_dose).launch(debug=False)