theresaschneider committed on
Commit
4f6fa63
1 Parent(s): 7320efe

feat: add ML_final_project files to the repository

Browse files
Files changed (2) hide show
  1. app.py +1290 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,1290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Copy of finalProjectDaniel
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1Xmu0qEBPBWsUKnRKtCsUn2mmP6R5tkZQ
8
+
9
+ # Importing libraries
10
+ """
11
+
12
+ ## Basic imports
13
+ import matplotlib.pyplot as plt
14
+ import pandas as pd
15
+ import numpy as np
16
+ import tensorflow as tf
17
+
18
+ ## Specific imports
19
+ from sklearn.model_selection import train_test_split
20
+ from sklearn.model_selection import StratifiedShuffleSplit
21
+ from sklearn.impute import SimpleImputer
22
+ from sklearn.preprocessing import OneHotEncoder
23
+ from sklearn import preprocessing
24
+ from sklearn.base import BaseEstimator, TransformerMixin
25
+ from sklearn.pipeline import Pipeline
26
+ from sklearn.preprocessing import StandardScaler
27
+ from sklearn.compose import ColumnTransformer
28
+ from sklearn.preprocessing import OrdinalEncoder
29
+ from sklearn.linear_model import LinearRegression
30
+
31
+ """#Loading the DataSet and Reducing the Features of Interest"""
32
+
33
+ # from google.colab import drive
34
+ # drive.mount('/content/drive/', force_remount=True)
35
+ # !ls /content/drive/MyDrive/FALL2022/Warfarin_Dose_Prediction_Dataset.xls
36
+
37
+ ## for theresa to run it
38
+ from google.colab import drive
39
+ drive.mount('/content/drive/')
40
+ !ls /content/drive/MyDrive/Machine Learning/data_final_project.csv
41
+
42
+ !pip install --upgrade xlrd
43
+
44
+ import pandas as pd
45
+ # original_df = pd.read_excel('/content/drive/MyDrive/FALL2022/Warfarin_Dose_Prediction_Dataset.xls')
46
+
47
+ # for theresa to run it
48
+ original_df = pd.read_csv('/content/drive/MyDrive/Machine Learning/data_final_project.csv', sep=',')
49
+
50
+ original_df.info()
51
+
52
+ patients = original_df[['Gender','Race (Reported)', 'Age', 'Height (cm)', 'Weight (kg)', 'Diabetes', 'Simvastatin (Zocor)', 'Amiodarone (Cordarone)',
53
+ 'Target INR', 'INR on Reported Therapeutic Dose of Warfarin', 'Cyp2C9 genotypes',
54
+ 'VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T', 'Therapeutic Dose of Warfarin']].copy()
55
+
56
+ patients.head(n=5)
57
+
58
+ patients.describe()
59
+
60
+ patients.info()
61
+
62
+ # patients.to_excel("patients_df_reduced.xlsx")
63
+
64
+ """# Setting aside a validation set right away
65
+ separates dataset into patients_df (95%) and validation_set (5%)
66
+ """
67
+
68
+ from sklearn.model_selection import train_test_split
69
+
70
+ patients_df, validation_set = train_test_split(patients, test_size=0.05, random_state=42)
71
+
72
+ """# Visualizing Data Features and Correlations on whole dataset (minus validation set)
73
+
74
+ ###Looking at Numerical Data (note that some of these are numerical catagorical but are entered as 0 or 1)
75
+ """
76
+
77
+ # Commented out IPython magic to ensure Python compatibility.
78
+ # %matplotlib inline
79
+ patients_df.hist(bins=50, figsize=(20,15))
80
+ plt.show()
81
+
82
+ corr_matrix = patients_df.corr()
83
+ corr_matrix["Therapeutic Dose of Warfarin"].sort_values(ascending=False)
84
+
85
+ # note that Target INR and INR on Reported Therapeutic Dose of Warfarin are linearly related. Target INR has so few values that I will remove it as part of pre-processing
86
+ corr_matrix["Target INR"].sort_values(ascending=False)
87
+
88
+ """### Looking at Catagorical Text Data (Use these catagories for gradio implementation later)"""
89
+
90
+ patients_df['Gender'].value_counts()
91
+
92
+ patients_df['Age'].value_counts()
93
+
94
+ patients_df['Race (Reported)'].value_counts()
95
+
96
+ patients_df['Target INR'].value_counts()
97
+
98
+ patients_df['Diabetes'].value_counts()
99
+
100
+ patients_df['Simvastatin (Zocor)'].value_counts()
101
+
102
+ patients_df['Amiodarone (Cordarone)'].value_counts()
103
+
104
+ patients_df['Cyp2C9 genotypes'].value_counts()
105
+
106
+ patients_df['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'].value_counts()
107
+
108
+ """# Dropping any rows that have Nan in the target column ON WHOLE DATASET"""
109
+
110
+ # Dropping any rows that have Nan in the target column
111
+ patients_df.dropna(subset=['Therapeutic Dose of Warfarin'], inplace=True)
112
+ patients_df.info()
113
+
114
+ """# Dividing Data into Statified Train (80%) and Test Set (20%)
115
+ This includes minimal pre-processing of gender and weight on the full dataset that was necessary for the statified sampling based on weight
116
+ Test and Train Sets with features and labels are stored in ;'strat_train_set' and 'strat_test_set'
117
+
118
+ patients_df -> strat_train_set, strat_test_set
119
+
120
+ ### Perform Statified Sampling based on Weight (Chapter 2 Pages 54-55)
121
+
122
+ ### Dropping Rows with Nan Gender Columns (since there are only 4 of them) -- NEED TO DO BEFORE STAT SAMPLING IN THIS CASE
123
+ """
124
+
125
+ patients_df.dropna(subset=['Gender'], inplace=True)
126
+
127
+ """#### Replacing Nan values in weight group with median based on Gender as is needed to perform statified sampling for the weight group"""
128
+
129
+ ## looking at median female weight
130
+ median_female_weight=patients_df.loc[patients_df['Gender'] == 'female', 'Weight (kg)'].median()
131
+ median_female_weight
132
+
133
+ ## looking at median male weight
134
+ median_male_weight=patients_df.loc[patients_df['Gender'] == 'male', 'Weight (kg)'].median()
135
+ median_male_weight
136
+
137
+ ## filling in null weight values on full dataset
138
+ medians = patients_df.groupby(['Gender'])['Weight (kg)'].median()
139
+ patients_df = patients_df.set_index(['Gender'])
140
+ patients_df['Weight (kg)'] = patients_df['Weight (kg)'].fillna(medians)
141
+ patients_df = patients_df.reset_index()
142
+
143
+ patients_df['Weight (kg)'].isna().sum()
144
+
145
+ """#### Creating Weight Catagories from which the test set will sample from"""
146
+
147
+ patients_df["weight_cat"] = pd.cut(patients_df["Weight (kg)"], bins=[0, 50, 75, 100, np.inf],
148
+ labels=[1, 2, 3, 4])
149
+ patients_df["weight_cat"].hist()
150
+
151
+ """#### Dividing patients_df into strat_train_set (80%) and strat_test_set (20%) distribution"""
152
+
153
+ from sklearn.model_selection import StratifiedShuffleSplit
154
+ split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
155
+ for train_index, test_index in split.split(patients_df, patients_df["weight_cat"]):
156
+ strat_train_set = patients_df.loc[train_index]
157
+ strat_test_set = patients_df.loc[test_index]
158
+
159
+ """#### Comparing proportion of samples per weight catagory between test set and original dataset
160
+ #####(distrbutions are the same, showing that the stratified sampling worked)
161
+ """
162
+
163
+ strat_test_set["weight_cat"].value_counts() / len(strat_test_set)
164
+
165
+ for set_ in (strat_train_set, strat_test_set):
166
+ set_.drop("weight_cat", axis=1, inplace=True)
167
+
168
+ patients_df["weight_cat"].value_counts() / len(patients_df)
169
+
170
+ """## Visualizing Training Set Features and Visualizing Effects of Pre-processing Steps
171
+ ##### (height, weight, and some catagorical variables)--nothing permanent done here--all incorporated into transformer later
172
+
173
+ ### Visualizing Outliers in Weight Class
174
+ ##### (not getting rid of outliers as they represent natural variation)
175
+ """
176
+
177
+ # reference: https://statisticsbyjim.com/basics/remove-outliers/
178
+
179
+ strat_train_set.boxplot(column='Weight (kg)')
180
+
181
+ strat_train_set[['Weight (kg)']].describe()
182
+ # note that the high of 237.7 kg represents around 522 lbs which is plausible to see in a population
183
+
184
+ """### Visualizing method for replacing Nan values in height group with median based on Gender """
185
+
186
+ # souce 1: https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
187
+ # source 2: https://www.statology.org/conditional-mean-pandas/
188
+
189
+ median_female_height=strat_train_set.loc[strat_train_set['Gender'] == 'female', 'Height (cm)'].median()
190
+ median_female_height
191
+
192
+ median_male_height=strat_train_set.loc[strat_train_set['Gender'] == 'male', 'Height (cm)'].median()
193
+ median_male_height
194
+
195
+ """##### Copy of Strat_train_set created for testing purposes"""
196
+
197
+ strat_train_set_copy = strat_train_set.copy()
198
+
199
+ strat_train_set_copy.head(n=3)
200
+
201
+ # getting gender specific medians for training set
202
+ medians = strat_train_set_copy.groupby(['Gender'])['Height (cm)'].median()
203
+
204
+ # performing test transformation of null Height values based on gender
205
+ strat_train_set_copy = strat_train_set_copy.set_index(['Gender'])
206
+ strat_train_set_copy['Height (cm)'] = strat_train_set_copy['Height (cm)'].fillna(medians)
207
+ strat_train_set_copy = strat_train_set_copy.reset_index()
208
+
209
+ strat_train_set_copy.head(n=3)
210
+
211
+ """### Visualizing Race Distribution
212
+ #### Includes Visualization of pre-processing steps implemented later
213
+ """
214
+
215
+ # craete copy for testing purposes
216
+ strat_train_set_copy = strat_train_set.copy()
217
+
218
+ # visualizing original race feature
219
+ strat_train_set_copy['Race (Reported)'].value_counts().plot(kind='bar')
220
+
221
+ strat_train_set_copy['Race (Reported)'] = strat_train_set_copy['Race (Reported)'].fillna("UNSPECIFIED") # full null with UNSPECIFIED
222
+ strat_train_set_copy['Race (Reported)'] = strat_train_set_copy['Race (Reported)'].str.upper() # uppercase all catagories
223
+
224
+ # remove redundancy
225
+ strat_train_set_copy = strat_train_set_copy.replace({'Race (Reported)': {'AFRICAN-AMERICAN': 'BLACK OR AFRICAN AMERICAN', 'BLACK': 'BLACK OR AFRICAN AMERICAN'}})
226
+
227
+ # visualizing the race feature after pre-processing
228
+ strat_train_set_copy['Race (Reported)'].value_counts().plot(kind='bar')
229
+ strat_train_set['Race (Reported)'].isna().sum()
230
+
231
+ """### Visualizing Age Distribution
232
+ #### Replace Age Nan Values with mode from train set in pipeline
233
+ """
234
+
235
+ # visualizing age dataset before pre-processing
236
+ strat_train_set['Age'].value_counts().plot(kind='bar')
237
+ strat_train_set['Age'].isna().sum()
238
+
239
+ """### Visualizing Diabetes Distribution
240
+ #### Replace Diabetes Nan Values with mode from train set in pipeline
241
+ """
242
+
243
+ # visualizing diabetes training set before pre-processing
244
+ strat_train_set['Diabetes'].value_counts().plot(kind='bar')
245
+ strat_train_set['Diabetes'].isna().sum()
246
+
247
+ """### Visualizing Simvastatin Distribution
248
+ #### Replace Simvastatin Nan Values with mode from train set in pipeline
249
+ """
250
+
251
+ strat_train_set['Simvastatin (Zocor)'].value_counts().plot(kind='bar')
252
+ strat_train_set['Simvastatin (Zocor)'].isna().sum()
253
+
254
+ """### Visualizing Amiodarone Distribution
255
+ #### Replace Amiodarone Nan Values with mode from train set in pipeline
256
+ """
257
+
258
+ strat_train_set['Amiodarone (Cordarone)'].value_counts().plot(kind='bar')
259
+ strat_train_set['Amiodarone (Cordarone)'].isna().sum()
260
+
261
+ """### Visualizing Cyp2C9 Distribution
262
+ #### Includes Visualization of Pre-processing steps implemented later
263
+ #### Replace Cyp2C9 Nan Values with mode from train set in pipeline
264
+ """
265
+
266
+ strat_train_set_copy = strat_train_set.copy()
267
+
268
+ strat_train_set_copy['Cyp2C9 genotypes'].value_counts()
269
+
270
+ strat_train_set_copy['Cyp2C9 genotypes'].isna().sum()
271
+
272
+ strat_train_set_copy['Cyp2C9 genotypes'].value_counts().plot(kind='bar')
273
+
274
+ strat_train_set_copy['Cyp2C9 genotypes'] = strat_train_set_copy['Cyp2C9 genotypes'].fillna(strat_train_set_copy['Cyp2C9 genotypes'].mode()[0])
275
+
276
+ strat_train_set['Cyp2C9 genotypes'].mode()[0]
277
+
278
+ strat_train_set_copy['Cyp2C9 genotypes'].value_counts()
279
+
280
+ strat_train_set_copy['Cyp2C9 genotypes'].value_counts().plot(kind='bar')
281
+
282
+ """### Visualizing VKORC1 genotype
283
+ #### Includes Visualization of Pre-processing Steps Implemented Later
284
+ #### Replacing VKORC1 genotype Nan Values with 'Unknown' since there is no obvious mode (creates new catagory)
285
+ """
286
+
287
+ strat_train_set_copy = strat_train_set.copy()
288
+
289
+ strat_train_set_copy['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'].value_counts().plot(kind='bar')
290
+
291
+ strat_train_set_copy['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'].isna().sum()
292
+
293
+ # filling null values with 'Unknown'
294
+ strat_train_set_copy['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T']=strat_train_set_copy['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'].fillna("Unknown")
295
+
296
+ strat_train_set_copy['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'].value_counts().plot(kind='bar')
297
+
298
+ """## Separate the data from the labels in training set:
299
+ ##### strat_train_set -> patients_info and patients_labels
300
+ """
301
+
302
+ patients_info = strat_train_set.drop("Therapeutic Dose of Warfarin", axis=1) # drop labels for training set
303
+ patients_labels = strat_train_set["Therapeutic Dose of Warfarin"].copy()
304
+
305
+ """## Custom Transformers for Pre-processing (Important Part)
306
+ ##### reference: Chapter 2 Textbook associated google collab notebook
307
+
308
+ ##### creating a custom transformer to handle catagorical attributes Nan Values:
309
+ ##### includes Gender, Cyp2C9 genotypes, VKORC1 genotype, Diabetes, Amiodarone, Simvastatin, Race, Age
310
+ """
311
+
312
+ from sklearn.base import BaseEstimator, TransformerMixin
313
+
314
class CatTransformer(BaseEstimator, TransformerMixin):
    """Impute missing values in the categorical feature columns.

    Strategy (modes are learned from the training set in ``fit``):
      * Gender, Cyp2C9 genotypes, Amiodarone (Cordarone),
        Simvastatin (Zocor), Diabetes, Age -> filled with the column mode.
      * VKORC1 genotype -> filled with the literal "Unknown"; there is no
        dominant class, so a new category is created instead.
      * Race (Reported) -> NaN becomes "UNSPECIFIED", all labels are
        upper-cased so differently-cased spellings collapse together, and
        the redundant labels 'AFRICAN-AMERICAN' / 'BLACK' are merged into
        'BLACK OR AFRICAN AMERICAN'.
    """

    # The VKORC1 column name is long; keep it in one place so fit and
    # transform cannot drift apart.
    _VKORC1 = 'VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'

    def fit(self, X, y=None):
        """Learn the per-column modes from the training data; returns self."""
        self.mode_Gen = X['Gender'].mode()[0]
        self.mode_Cyp = X['Cyp2C9 genotypes'].mode()[0]
        self.mode_Amio = X['Amiodarone (Cordarone)'].mode()[0]
        self.mode_Simv = X['Simvastatin (Zocor)'].mode()[0]
        self.mode_Diab = X['Diabetes'].mode()[0]
        self.mode_Age = X['Age'].mode()[0]
        return self

    def transform(self, X):
        """Return a NaN-imputed copy of ``X`` (the input is not mutated)."""
        # Fix: the original filled the columns in place, silently mutating
        # the caller's DataFrame; sklearn transformers should not modify
        # their input, so work on a copy.
        X = X.copy()
        X['Gender'] = X['Gender'].fillna(self.mode_Gen)
        X['Cyp2C9 genotypes'] = X['Cyp2C9 genotypes'].fillna(self.mode_Cyp)
        X[self._VKORC1] = X[self._VKORC1].fillna("Unknown")
        X['Amiodarone (Cordarone)'] = X['Amiodarone (Cordarone)'].fillna(self.mode_Amio)
        X['Simvastatin (Zocor)'] = X['Simvastatin (Zocor)'].fillna(self.mode_Simv)
        X['Diabetes'] = X['Diabetes'].fillna(self.mode_Diab)
        X['Age'] = X['Age'].fillna(self.mode_Age)
        X['Race (Reported)'] = X['Race (Reported)'].fillna("UNSPECIFIED").str.upper()
        X = X.replace({'Race (Reported)': {'AFRICAN-AMERICAN': 'BLACK OR AFRICAN AMERICAN',
                                           'BLACK': 'BLACK OR AFRICAN AMERICAN'}})
        return X
350
+
351
+ """##### creating a custom transformer to handle the transformation of height nan variables based on gender-depenedent median"""
352
+
353
+ from sklearn.base import BaseEstimator, TransformerMixin
354
+
355
class GenderTransformer(BaseEstimator, TransformerMixin):
    """Impute missing Height (cm) and Weight (kg) with the median value
    of the patient's gender, as computed on the training set in ``fit``.
    """

    def fit(self, X, y=None):
        # Per-gender medians, learned from the training data only.
        self.medians_height = X.groupby('Gender')['Height (cm)'].median()
        self.medians_weight = X.groupby('Gender')['Weight (kg)'].median()
        return self

    def transform(self, X):
        # Fix: the original mutated the caller's DataFrame in place and,
        # through its set_index('Gender')/reset_index() round-trip, moved
        # 'Gender' to the first column. Mapping each row's gender onto its
        # learned median preserves both the input frame and the original
        # column order (downstream selection is by column name either way).
        X = X.copy()
        X['Height (cm)'] = X['Height (cm)'].fillna(X['Gender'].map(self.medians_height))
        X['Weight (kg)'] = X['Weight (kg)'].fillna(X['Gender'].map(self.medians_weight))
        return X
372
+
373
+ """##### creating a custom transformer to add extra attributes (BMI, BSA):"""
374
+
375
+ from sklearn.base import BaseEstimator, TransformerMixin
376
+
377
# Column order of the numeric matrix entering this transformer, as fixed by
# `num_attribs` further down the file: index 0 = Height (cm), 1 = Weight (kg).
col_names = ["Height (cm)", "Weight (kg)"]
# Fix: the original assigned `weight_ix, height_ix = [0, 1]`, which swapped
# the two indices (column 0 is Height, column 1 is Weight), so BSA was
# computed with height and weight exchanged in the DuBois formula.
height_ix, weight_ix = 0, 1


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append Body Surface Area (BSA) as an extra column.

    Expects a 2-D numpy array whose first two columns are Height (cm) and
    Weight (kg); ``transform`` returns the array with one extra column

        BSA = 0.007184 * weight_kg**0.425 * height_cm**0.725

    (DuBois and DuBois formula).
    reference: https://www.uptodate.com/contents/image?imageKey=ONC%2F96451&topicKey=ONC%2F83810&search=Pharmacogenomics&rank=3~18&source=see_link
    """

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn

    def transform(self, X):
        bsa = 0.007184 * X[:, weight_ix] ** 0.425 * X[:, height_ix] ** 0.725
        return np.c_[X, bsa]
395
+
396
+ """#### Working Transformer Pipelines
397
+
398
+ ##### pipeline for dealing with missing height and weight values
399
+ """
400
+
401
+ from sklearn.pipeline import Pipeline
402
+ from sklearn.preprocessing import StandardScaler
403
+
404
+ gender_pipeline = Pipeline([
405
+ ('gender_transformer', GenderTransformer()),
406
+ ])
407
+
408
+ """##### pipeline for dealing with catagorical data nan values"""
409
+
410
+ from sklearn.pipeline import Pipeline
411
+ from sklearn.preprocessing import StandardScaler
412
+
413
+ cat_pipeline = Pipeline([
414
+ ('catagorical_transformer', CatTransformer()),
415
+ ])
416
+
417
+ """##### pipeline for dealing with numerical data: height, weight, INR
418
+ ##### uses CombinedAttributeAdder class for the addition of BSA (or BMI)
419
+ ##### uses SimpleImputer to replace any remaining Nan values with the median for that feature
420
+ ##### uses StandardScaler for scaling
421
+ """
422
+
423
+ from sklearn.pipeline import Pipeline
424
+ from sklearn.preprocessing import StandardScaler
425
+
426
+ num_pipeline = Pipeline([
427
+ ('imputer', SimpleImputer(strategy="median")),
428
+ ('attribs_adder', CombinedAttributesAdder()),
429
+ ('std_scaler', StandardScaler()),
430
+ ])
431
+
432
+ """##### full pipuline using ColumnTransformer
433
+ ##### Adds Attributes (from num_pipeline), Scales and imputes numerical data (from num_pipeline), Uses ordinal encoder for Ordinal Catagorical Data (Age), Uses 1Hot Encoder for non-ordinal Catagorical Data
434
+ """
435
+
436
+ from sklearn.compose import ColumnTransformer
437
+ from sklearn.preprocessing import OrdinalEncoder
438
+
439
+ num_attribs = ['Height (cm)', 'Weight (kg)', 'INR on Reported Therapeutic Dose of Warfarin']
440
+ cat_attribs_ordinal = ['Age', 'Gender', 'Diabetes', 'Simvastatin (Zocor)', 'Amiodarone (Cordarone)']
441
+ cat_attribs_1hot = ["Race (Reported)",
442
+ 'Cyp2C9 genotypes', 'VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T']
443
+ """
444
+ Pipeline using column transformer
445
+ Adds BSA attribute (from num_pipeline)
446
+ imputes remaining nan numerical data using median (from num_pipeline)
447
+ scales numerical data using StandardScaler (from num_pipeline)
448
+ Uses ordinal encoder for Ordinal Catagorical Data (Age) and Binary Catagorical Data (gender, diabetes, simvastatin, amiodorone)--see cat_attrib_ordinal
449
+ Uses 1Hot Encoder for non-ordinal Catagorical Data--see cat_attribs_1hot
450
+ """
451
+ scale_encode_pipeline = ColumnTransformer([
452
+ ("num", num_pipeline, num_attribs),
453
+ ('cat_ord', OrdinalEncoder(), cat_attribs_ordinal),
454
+ ("cat_1hot", OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_attribs_1hot),
455
+ ]) #input list of (name, transformer, columns) tuples specifying the transformer objects to be applied to subsets of the data.
456
+
457
+ """## Full PreProcess Function to incorporate all pipelines
458
+ ##### contains "full_preprocess_function()"
459
+ """
460
+
461
def series_to_df(data_series):
    """Turn a single patient record into pipeline-ready tabular form.

    Helper for feeding new data (e.g. from the Gradio front end) into
    ``full_preprocess_function``: takes a Series of shape (12,) whose index
    holds the feature names and returns a one-row DataFrame with those
    feature names as column names.
    """
    return data_series.to_frame().T
470
+
471
def full_preprocess_function(data_df, train=False):
    """Run the full pre-processing pipeline over raw patient rows.

    Parameters
    ----------
    data_df : pd.DataFrame
        One or more raw (non pre-processed, unlabelled) instances carrying
        the 12 expected feature columns: Gender, Race (Reported), Age,
        Height (cm), Weight (kg), Diabetes, Simvastatin (Zocor),
        Amiodarone (Cordarone), Target INR,
        INR on Reported Therapeutic Dose of Warfarin, Cyp2C9 genotypes,
        VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T.
        Each entry is either a value or NaN.
    train : bool, default False
        When True, the pipelines are fit AND applied (training data).
        When False, the already-fitted pipelines are applied only.

    Returns
    -------
    pd.DataFrame
        The fully prepared feature matrix.

    Raises
    ------
    TypeError
        If ``data_df`` is not a DataFrame. A bare 12-element Series should
        first be converted via ``series_to_df()``.

    Notes
    -----
    'Target INR' is removed: it has too few values and is collinear with
    the reported INR. Ordinal encodings in the output:
    Gender 0=female/1=male; Diabetes, Simvastatin, Amiodarone 0=no/1=yes;
    Age {0:'10 - 19', 1:'20 - 29', 2:'30 - 39', 3:'40 - 49', 4:'50 - 59',
    5:'60 - 69', 6:'70 - 79', 7:'80 - 89', 8:'90+'}.
    """
    if isinstance(data_df, pd.Series) and data_df.shape == (12,):
        raise TypeError("Expects pd.DataFrame; Send your data through the series_to_df() function for conversion to proper format")
    if not isinstance(data_df, pd.DataFrame):
        raise TypeError("Expects pd.DataFrame; See full_preprocess function documentation for input expectations")

    # Fix: the original used drop(..., inplace=True), silently mutating the
    # caller's DataFrame and raising KeyError if the same frame was passed
    # twice. Dropping without inplace returns a new frame instead.
    data_df = data_df.drop(columns=['Target INR'])

    if train:  # fit on training data; transform-only for new/validation data
        data_cat_tr = cat_pipeline.fit_transform(data_df)
        data_height_tr = gender_pipeline.fit_transform(data_cat_tr)
        data_prepared = scale_encode_pipeline.fit_transform(data_height_tr)
    else:
        data_cat_tr = cat_pipeline.transform(data_df)
        data_height_tr = gender_pipeline.transform(data_cat_tr)
        data_prepared = scale_encode_pipeline.transform(data_height_tr)

    return pd.DataFrame(data_prepared)
512
+
513
+ """##### Example test input for full_preprocess_function()
514
+ ![example_input.PNG](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABwMAAACBCAYAAAAsaxXvAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAFiUAABYlAUlSJPAAAEysSURBVHhe7b2/axzX3sD9/ivqFlQYXBhSxJXFW0SkiEgjCEHgQhCIIBDEBSMSjHAjXBgRMOJCEOIGEQisIaDCIHjN3VsEpTBykUfmwciFeVQEtjB3C8F5z6/vzJkzZ36svLJ2Zz8f+Nwbr2Znds6c39+ZM//Pf//7X4WIiIiIiIiIiIiIiIiIs+nJyYn63//9X/V///d/ajgcqhCCgYiIiIiIiIiIiIiIiIgzbG0w0P8/AAAAAAAAAAAAAAAAAMwgBAMBAAAAAAAAAAAAAAAAOgrBQAAAAAAAAAAAAAAAAICOQjAQAAAAAAAAAAAAAAAAoKMQDAQAAAAAAAAAAAAAAADoKAQDAQAAAAAAAAAAAAAAADoKwUAAAAAAAAAAAAAAAACAjkIwEAAAAAAAAAAAAAAAAKCjEAwEAAAAAAAAAAAAAAAA6CgEAwEAAAAAAAAAAAAAAAA6CsFAAAAAAAAAAAAAAAAAgI5CMBAAAAAAAAAAAAAAAACgoxAMBAAAAAAAAAAAAAAAAOgoBAMBAAAAAAAAAAAAAAAAOgrBQAAAAAAAAAAAAAAAAICOQjAQAAAAAAAAAAAAAAAAoKMQDAQAAAAAAAAAAAAAAADoKAQDAQAAAAAAAAAAAAAAADoKwUAAAAAAAAAAAAAAAACAjkIwEAAAAAAAAAAAAAAAALrL2yO1eX9Hnbz3/542Lodq8GhVbQ+KgbpJQTAQAAAAAAAAAAAAAAAAusn7E7V9b0Et3F5X/bf+s2lD/8adz3pqYWFJbf8x8h9ODoKBANfEySNduSwE3l5Sqw8O1dm03nkA14RuaGweWFOH7/xHmotf12y+WPv1wn/SktGJ2jWNgmm4gv21Q37Ltv6vOkbq7PcdtfHlxhWOMQX8sW3TdvsP/+8AWy5XD9W5/3crSHOYUYav+mp7fUndsnnwllpa31WDRP66eLaut+mp5ScnOidOM0N1crCl1u/tNpSnekavj9TOdytq47ea+vddX63fXlC9z/SxpjtRusvrfbVs8672wfGV8+Zk8/eFOlwzv6nYpsNVkPaxwkcfUsqvj1b1x6zg+0tlJX/n16j3wyAvP/57tg/77lCtZd8z9tSdz9bV7mAa06ec527dW1Vbv55Ncdt3Df1Due5ty1hlPnG6/nbb/m7XaNcvmZ1+Voe4HKrTZ9v62txyeXXxjlp7MlAXl/7vE2L46lBtfXlH9fwxVlLzPe8Gak+3G3cWXZm59cWWOvwryAn677vry/7vug79Mvo7TC+Xuu6z163cLwzne2RuMJufuDxT+1+Yz+6q7YG/1ibP/rqlVj4xAQDd7n6i+xoHJ2ooebZte3t5oQZPN7L9LNxeKbRzo4szNfjVtCs+364d6t7tjPP3idp/sK6WdB3beS5H6vxl352vHqsW8lWIzk9nz/fUxlcyF6AN2v3ifLWpdzbUXpSXLl7sqvXPGuq3Fly8iH6Hcex8N1LHD+T7y2r/tf94bC5U/74uG4u6zMaBwKi/Y8vg00S78e5EHT5aV8tSxoLyX4oDBGZzv22PI0HLxQ119Lf/bEIQDAS4Jqoqgd63R3rYAPODDI6LHcTpDgbO+GSnb1zLnSLT8OvzGrfjQZrDLPJqVy3ZvBeZyP+zM0nVtjzV06r+JRh445z9vKyv07pa/8Zc8y11fMXrQDBwWpHyXOGUBgOv3H+bRiqDPJK/w2u0ovbf2G9l37NpUJqcFJfU7iu//dRQled6auP3aR2dXUOdI9edYOAEaHfeBAM/NiN18mipkE+L+XUyDJ9vuknyyMJ8z1tdR/ogYEHpj+v+ZvLv1zD5C9fDySMXECj2C6Tudm1nHAyUvsSSroddnVC
dZ5ce+23atLeX5+rQBDpK2/g25PJU7X7qP1vsdScYOG67NsMMdXvirqm+fr7uSM17HX2fyE+VwUAx7+td/LbWXL81Up2vC/nuzaHa+Ha/PtA4OlZb+nvL36zbm0WXfz7zfxiP0fGWPa+1XxKPB1T0d1YO8m1Hr/bUSqnOzvtotcFAuZGwxXGE0WBb3dV/u5vVFZOBYCDANRE3+KPX+77xnrdB0rwjg8TiIP5mJpPaDtRnfLJTD7pWTdkb+H9n+PP6qB3eOUlzmDpOf7pr65iNZxdq5O8yG70ZqN1H/Y+Y/yfNZCYbOzWZ31l8nagHnOe/b+jr1ZvoBN7Voa6+FmSCawYmpLpZf1TVrf5zP2F498mp+9hPYhSCgXLtLofq+KFrf6YvjaLzvBypswN3Pad3AvEa6pwPmDStzv+TaZ9nj3k97+lm9MJN9i4srqidF+euH2yfktlR+3+6bSaCCbz8Y1sdv/VTtG/i+Z5z//TXglr6/lCd+uDe6O2J2n96ZOvM819W3d8fDdwTYPp3njxZsZ+lJoZh+sjyWxgkGR6pDfPZ5/vKhCwKc4N/67+ZYMI9nU8k+PFq1074L9zbUse+rh++0vnJBh3WVd/knRbt7fmByzsL9zbV4Uv/a0bn6uRgTx35/Z4+XVNbByfq4s3s9L0amaNgoMk/m/d31fFr8045kz/KwcAsT+r81NfbNaLz0tEDF0Ref2ZrJnW4ava9pLZf+O8PT9Surc+Cm8OakJuTw7q4xEjnyRX7e3uf6fq0KjPqa9xbuKt2X/mA9pXyra+TF7eUPJBbx/DIp+M3Mn8idXpPrf10os7bROdGup9gfm/NDR7l44TIMfX3W1zKthAMBLgm4mDg8OWea2w/3VV+KJ0NqDJTS4leXtjlR1ZliQtrccAx+itYnkKWghu/ZoRrQQaJDcFAs5zDk/xRf/OoeHHZItmPmJgUMIOHg2BJiFDbMfL7WN1Xx8+21Io9Vk/dub+nTm2ei48ROkMTn76j7NLWdRZW7Z0/fkJFOomkOXQYGQyaTnXVIKDUBkUDKNuOrW6r3X8s6/bF3NF+rI4fm/9eULdMZ3U0UFtmkBp3qP2de7a9053+Vm3d8FQdPljNyqPTt3WVd8Ia8/aw6Ti1d+pJXVw6VrG9zeqRtX01MGXal32z5NPRtL5zYBYJ63F/g0cvy5+u3lx9uKs2zVPbi8tq9/jYv1fhlh/ENudvi813+dJdrh04zZdk0lwMwqVt5C7cvH5uk79d3tN56fJcHdnyZI61ofaDZcDsUjxZXy9e1rdlvmts16aUeIIrYCLp29hWG0bqLFieyxxnPVhSrlX9MbNIXySu7/zna7tq1zyhKxMZdcHAkb4GD/zNKOM8bRfl3Tj9s9/y8EidPJXtwv5UG6Lz1Pni9J8uf939SUZnmqZy5M9/++hE7clS3NEybM3nI3mqKt/Kb00Z9g/r863l/ZnqB8tZ9Rb9tqk6sQEpj+U8739vZX/X0TRmdfs352cmB1eztN0OZgfjstj7ZLlwzrKP/Rf97Fi9T9aipfQa0i1VJ4WTzaW+Qmhejkr1VyrNdV5pGuvDeAweunRc/mf6yRHpI8fBttMnpu6SJ4VTdc6thmXyojrG918WFjbVccV35Oa9lX+e5jfv/eEn0B+W7myFaUTGPcEqFubpI5sH/dNLUm+Zd38NbABvSe38mbUYfjWMct0qeXnL7LixvZUAzoLafJ7vu5KavlcjE+hXpctYud2o6x+X6thQ8zuqxqp/99W62ebebjBW/cB2Q/Cry5ix8nn8twmT5yv/gUWW0xzjRsrRmX+idEnt/WU+kCdIV9TeK59wlyN18pN7yq98032arH5rcWPD+e+brt67vV4YHwn2CdxF02eScnGFubKLvsvzbepWc8PYL+7pyKWnvi2RV1iMUWYkjy6F/cyQ1HEiZB+2Hoi5Yn4jGAhwTSQnDRaX1U7w8s+qxiuf8Bqqo28TDWw4QNAV0krp79p7OyxtNhXUDea
lw2ca7NR17uWPkpf2U278hkduqZI1aWz93Ym97+Uuterf4hrout96hcb2pvAdW5u20mDbO/XCYCBpDh0ne+m0z0+3zVr0x+os6OuV2qBokirZjmW6tfrdxElx0leWL1l/5j5r19aV/561dVeddPPKcerOJxt8l44VtLcWeUqjbOGdWvBBDO3TgObuT/MvPyDNbqaqqze1/i7spvxt7tTcMe9hCLfxyoDN/A5Tx5e3yevn5vwteW9NbcZL9nyxr0xLULUUT34XaZt816Zdm1JqJqQmkb7NbbWZiPN300fK8lyt6o+ZRcpUXN/5z/V1OfN3mdu+SyoYGDnepIR/f0tiP/lyVE39qTak99H7TI+ZsonHFuVIgkKJbbZemNzS5nwkT1Xl27p6Lq9/mvKtXRKuop4r1YktkPJYzvMtrk+LMavbf09tfB/VvcFEblVZlN9UVWeETzM0pluqTrqWYGCLsT6MSd5eVk6ESyDAtxEWCRr44EBdnq7q78nyc9lybpJnaiaOR3ob+0RYyjEmnOEmKQdf3NKh+XvNpN7a/skFeuPAQPb3KM8W6tzG9lbybMs5hJq+VxOT6FfVlTFpN5r6x9X1vdbXtxLkd+2zw90QkC8RPpF2w5Pva4wn6K5IOt9IHbiq9n7bU2s+eHnrXvn9ksX2dEltZTfemOU93Rg/Zbt+r4zvy+XAmsh3o7/2bWCr+FsMfizo+1BunDjmTWeGsP9aRdTHW/rhOA/4yt9+2Ff9ByvuhqW6dylmwejEU4F1x4mp+d1XzW8EAwGuiWLFagxeDlzB6OVe8U6DcLmAN6nvjtTgB1O564bsmSyBca72mzrA8BGpG8z7Cl13nuzg+ItddeIbieELPzAoNZLSuCcCU75RXJfJisrAVE+t/OReRj16vul+S2FwWn2M2cCdp0lb0ziuPd1TG/ZOvRO1oxvjnu6wkeYwF5gXjL/YV1v35S587e3N6s5oNEnl2jHXdrk7Vnu6XJ2rc9/ptG2MDCT1QNTl+eblN0ptnf5f+z5PU04eHqrB63xp0yJSntpNkpWP45BOc/1ApupYUlZ1p/17P/j+K30cuCp+8Ojv/jS4SRUJDvpr8+m2Grw/U/ufm2113fn23F+b6JpV5G+pv3v39QDO59XhH7uubVjc0fvw+9bH3TqSpW2a6+pUvsv6hPp32rtdLwdq2z5haNomufs2WIpndK7637nBuwtAtMh3Y7VrU8YYE1Ljp2+LtjoYrPcrl3tztKs/Zo2q+s5/btPa1+263J0cB5MSqclJvf04dyfn4x1dpn3eHb3tu2XUdFnr26SW39jUn6pD9hFo6xH/Z0ObciR1ihmf+fxyHi432up8mvOto6bOaZFv5emUhbW9bInCqjqxDdX5v+n6tBuz5pNafiJQb+POP59MjBn+XswHso/e/X13zpdD/ySO/91tynuqTkqmW1XZiahK88axPoxP3l5Wz4XIRLf0K4J2wt/Ill9bnRef+zxb0S5YzJKLJl+FSz/KdQ/zUYLz59vZk0/mif/+gS+3PBk4M0j+sU+ZX+q8Y/JCsCKY1Pdr9/0TQJMKBhbaW8mzLecTxuh7xUymX9XUbrTpH3vq2jU5z+xmnOAmQ592E2k3hA9+MlDSJbDiGqXzTV4Hli0GjLJ+iLi4nKe1bn+PH4VPRffV/g9uu3ZPBsrvWNVjNPdJ4XhV+e71nuuH6WNm8+e+rcz6HqVr2hKfT8p9mADJS5k9tazrYnuc0t9yUzeJyFPoyacC644T4883+bt5MhBguihUzNI5NAPosIZ4N1B738XLoml9xWjuhDH/rr7jta6ir+sAw8cj3SkrdOxqGpVyI1kzKTA8Vlt3eurOJ34ZBXOXynf76iSr2+W3BB0WOXah41RzjJnAnefqL6fq6FszyDMTuuZOvYH9nDSHueRvnf/90wGljmQyT0o75vKuq7Nc/pT6y7UxMsHnBxd6YGQ68Nm7pQwNbZ3lYqB2Zbk1a2oppkR5CmlzHE2h/q2
k6liJsvoBg2lIkC23VNYtt+Svjc2v/nrYtJdrE12zqvz92E0mbL3wH1jC657KA6nr35zvpE+4knrZvSxZ82XwhIKmmE9b5Lux2rUpo64MfWj6Gpraajl+0mJ+ald/zBpV9Z3/3Kd19tT3N+7/bRqE185Mnj1yT7mNkz7p8U6c5xO/saJsV1Pcx7lflil7isfQphyljht81u58WuRbS6LsCy3ybTK/jp1uOdX5v+n6yHmklTGr7L/34Lg0qSYMX/XV9vpytrxzpj+f1G8sfNamvKfqpGS6VZWdiIo0bx7rw/jkq03IEo1JCv3VoeqbpZCD4ED62qav9+jlrlox+dEEdcOiIcHeQoC/Galra38/TBfhOwL9dQ+Xn87nBuXJ8SX933mmqFom1N0Mp79ngi+N7a0EzyqWE4xJ1XNtmUi/qqHdaNU/9tS2azJW9e9e1OXVXJ888D+hdmNiSLoEVlyjPF/5DyzS3vbUymO/jKnOL7XvczY3ED/fcst0BkHsIr6erLk5J0bGWmFaZ+eXOKeqJwOlfJQdr26VfJJMgwSjNzqf2/kTf+OI5DPzTs5X7pxGryuCwu/1d20AWf/G1FODAaXjxPy5457Ebfm720AwEOCaiCtmWUot6+zLHUN3/B2ppgI+3i7e7ewLffXdgtLZDe9ah+lCGvPiIL7QuZCBgrlr96KpNaueFHB3nmyqo8p9NHS4MuQYy2ov1RhNPe4JwIVv1tW6f7LErFd+98muPX/SHLqPHmj+Y1PtvzhTWdY0d1LaDrwuA+GdlIaKAZRrx1zedXWWKwNSf2UDDz+oMpMWrq3zgy1Dm7YuRP/94vWx2vnSDR4K73HKypMuc8U+61jHkd+//DQ91HEkyq4lUR98yGAaSmRPsqQMl8+z+dVfD5v2cm2ia1aRv2UStvBk4IvtYCDs2xJ9reUu2osXO27CT65/y3yXHqwLcpzwzmd5d4csgdMi343Vrk0ZVWVoIunboq2WSbxPt9RRw9M57eqPWaOqvvOfS1qP9L/9RKPR9qfiayfL79U8HV4iG+/o6+zbjtFrvV9bLjZ8fZ/4jRVlu5poH2YJTXs+wZ3ybcpRfFzdvh7/4CZlbZvV6nya862jpn/YIt9KPXfX3G1uxonmPamyLGnrdMspjF8KNF2fdmPW6v17JL/d122B2cfoQp0duMCJnE9pH3+fqv2wPm1T3uU4Mon37lhty9LrhXST8070S0Kq8mrjWB+uggTTFhZX1O4ffrWJy6E6/XVT7f3ptjF50vaLzTV+6QKDxXwX1xe6/UlMlp8/M++56qnlfySeysjqGPdUv9zgNno7ULuP+sX2zmD6wC8P1aadFL7+JQZhkuT9tK0HJnBxV+2+9H/SFOr7t75+CZ9M13VE3G4MX+27dkPa0xbtrcw5mmBFX967ptuowZNt1Y/mMSr7Xi2YTL+qqd1o0z/2yPfMk4mp9sWPVVd+PtXttf5+9JDGRNoN4cbfGWg+N79d9xte+t96edH4PmcTiHP5MhEMfH9h609b/4XLKzcw8kvM2xUQXspx/XWP8t2FHv/Z/ZfeGSgrtaQsLv/aiASYa244KmBWoLHl2gfppMwE+Wz4ai+ZbrKs7Eqbmzri40S4/FnxDsi/j9Smzm/FJe+bIRgIcE2UKuZgUGEb65o737OK8VJXCmZJntI2eYM50oMIW2nWbAM3iXRyioGkYoej+t0iWYdEOjgJZRt5QW/BxTtq7dGx74g0dbgEWbIvtPj7pxvpjGtl6QDpYGtlYpU0h+4SlIHYxeZ3kEl+de2Y+29XZ7k8KfVX3iGVO6pX1Mqn0TIZbdq6rJyULd4tL4PCUF+2Wh3HIZOjoVKm5dxSuvPNB/tZ+fyAwTSUcQPYuP6TgaD53OcXW4f662HTXq6NyRPN+dsMntyyfbH5u8FO/Yvyy/rf1zLfNU36Vx4ne5dWm3zXol2bVqrK0MTSt6mtHqmTxxXXIJq8r6s/Zo7Kfo7kM1/WgrSWJY+M9rxL1y5
/z0zrp51q3mtXfreQL7+GZH+qjvI+pM7P260P6B8urrnJ1lbn05xvHXX9wxb5trKe07ZOt5zi+CWk+fq0GbNW798j7wJP6Y9T2Y5n9Wmb8p4Hcaq3MdT0S/S1a2yHWoz14QroMrgXvjc7sFDefICgt6i3DQIqDsnTsXkf4eLZeja+jJU8bN6tlq96EZjVmanj9NRy9B4ymH4KdU+w1L0hru8lUNC7L0HhqrbHvaLB0qa99YGB8n6k3WhRL7VgMv2q5najuX/skRsrQgt1tQT/e3a7uH8ymXbDke8r6LNPksq+W9B2vnLvpixtky1jXJUPgvyWOs7istoJnmhtxCz1XdGnCvPx6dMVe116n20Xn642+HwfvivcMPI3j8af15O/ziT1tF5VPjA3jrp8bW5saiinhoanApuPE5LfuJK6yc60Q/L9yr5TAoKBANdEeYAnj6fnjc/571tqJXih6/azPbVpCnIw6DZ3CfQfratlv52z2FBfDPbUxpfBO6ES28BNIZ2cYmdAGoCswvbXOV4GK/t7m0Zf7jJL6PJcc4cr4+2R2irkqWvqzFwLeecmS5tgQjErk6Q5dJl3J2r/QbC03u0ltfrdnsrfG948GHTtmPtvV2e5PCn1Vzihkt35F90Ja2hu68oTIW6709LdneZ9YfJeFWdetlq1qQY9MDl6sFJYYkzKdFXn3OjOV9ItKJ9VgQwYH3kSzCyz5D8S8qVm/j+XX2wd6q+HTXu5NiZPNOdvy5tjtZ0tT9tTdz5bV7svgquo80r4gvi1JwN18kteFgxt8l3jpP/lUJ0cbGT7seX1waE6zcZmLfNdU7s2rdSUoYmkb2Nbrbm8UIOnwTUQ47a6pv6YOSr7OZLPfN0cXhd5EkFrzzt17fzye1UTF0mGus36Lk/XW/dW1davYRswRn+qksQ+5D1A4VM4bfuHflLR1A0rpn2VsmloPJ8W+Vao6x+2yLfDP/fUmpShL7bU4V+63ou2aUtp/JLR7vo0jVmr9y+YSUO5NrrO/nJD7f2yo1aD48g+eibAYz7X9en6o35Qn2papJvpb7h0M8fR6fbc3/0fpVt1v6RlO9RirA9XILrGvU9WSmVQtzBZMDYODuR5WpdzW459fgv6CHV9xjAPu6Vtpa9xSy2t7wb1hRxHq+uS5fUttR/2Q2B2kLZPGwcoSvV9dtNIEESI2h7T5yn0Sdu2t8PTmv20rJeamEi/qkW70dg/FsK2wRvX1dlTauW+yaTaDct1PxlY2XeL6p0/9/P2Vvop2Z+jfJCqe4Lj9D5ZVusP9ov9nLb4dFvN2kldB36u+/O/n+mr5nlzqDa+1WO/msBZaelbecrPPJE3Rjqb912aNFlJPLFXrNPduHDrwC+1KhTepejK1/az4Fw0EsROHcPQ6jiekb4O5qaVwitYQt7rcuRvQOs9jvJiDQQD6/CVaLFj1qLD/kEkBvsAM8tIHT+QstN+bWm4CmdqzzYCuu7wL+g1yEv1x2kYoC0zkualDqPugH2lB7N5bzBndKJ2zZ20t9fLS4k0Msn2y3T8t9T6vd32gxIAAIAS9I9gwsSTlTBVyCRbdUARwDF63XdLciZvXEgEKgBAM4P9KhNU/Mk9fVYO/NNuzBXZ04rF93ZOJRLo021U3XsHz39Ztfm3agnYFAQDqzB3OgR3fIYSDARoiX8aa/mbdbukCy/Bvk5kOZt11X/rGzXd6XEvll5Kv4gWPpAZSfPKu8ei5QwMUxMMZAAOAACTgP4RTBiCgVMNk7rQRPxURumJEwtjEYA0s9SvknLs/WJfnSWevqLdmDPeHfkbQYoB7ani/Zl/b2VD0PLvgQ0YppcYrYZgYJJ8Hdil7w/VqX+B68eBYCB0CD1Y7tmXoPoOQ2L5J5ggdrmz5XzpKvu4/3b+8miYPLOQ5vGkle6sn/68VrlMxtUhGAgAAFMI/SOYJAQDpxomdaEJySN2+cTUSikWxiIAlcxMv8qXY/37Vh4cJpehNNBuzCFv+5VLk04
Flxfq+OFazTsa3dybWQJ742l6idE6CAamGB6pDVthNE2SjtTZr/l7LOy6wk+Ci+AHCttHJ2pP1gjXldDa09PCerJ27fyv8veVuDXJw8nUhuNIBWcGJG+P1KZ9WXJP3ZnmjA1zwckjnRf9y5MHD+N8rSm8qyc02O5ioHbD9/mYdzYweQPQnuSklQTuTLDe/Du6ay4V0JP3FGSa5UbD8ij73FL9Pw/V1hdmXXhdZu/vBuvTa+rKdOkYocFg3Kw9/yRfq929B6S4Vvvwlf4NWdvqZeIOAAAAAAAAAADmEIKBKV7t2hc0Ljw4dhOL0eSkLBN69vNKPsEYuPT4xH1PJmBL5i8ol5dXlrfJJ2IbjyOTuGub7lHXYJvUesgAHwf/NOC3R8pULSavmzyZr2Ps1xr/dFsNzEdmaYGHd/U2S2rvL7eFen/sX+ofubh2hSUMAeaUijvYiy8yv0ow0OsD/nkwUG5qCfxiX9nWqKlMtwoGmneR+ptjCvbU2m8+6ig39cQSDAQAAAAAAAAAgDmEYGAKmTh9OHD/TgUDRwM3obm4ka+T/Gbfb+cnLGU/iytq90+TuCN1/H2wD3Wm9j83/76rto7O1agwmeonYtscJ5jE7d33TwMOzPKM+jMJaAJ8bHxQPXvUXsqRDw5mwcLPd9RJRTDw/MAFwleenKihKR/ZNjzCD9CaVsFAIWqD6hidqj3bhsm2eTBw7edTV2aHA7Vt3yngtmlfpqVdC54GFHQbuGL+9sWuOvHLeA9fbLubeGQp4ou+q290+7v960CdXdASAgAAAAAAAADA/EIwMIVMIn66q05tgM4h6wjbidM2Ty8kJmDTT2KEk53RRGyrpyRkPytq/7X9AODGOft5Ocqv4paSd3Tbp15v38nWGjfr9m8/z59mlfKSkmAgQEuSwcBztf+FKUvLUbtREww0S3M+3VCr98zyn3lZLAcDw+8WP2tfpmuCgXI+KYP3kl4MdtV6+Ftvl5cSBQAAAAAAAAAAmAcIBiaRSdIFtfTDsTr3M4eFYKAsQfbpljp6UzG12CIYuGODIGvZsqEXL3bUiv/MTqa2OU7dpOkMM3q5q9a+7avzICALFbw5VOvfHE5RWslTryl7auuFzsv+qdflpyf+qdgypz/5J4b+eap4sCfCvB/0/o46uY73gpqnub7yy7dCzqymedwWjc7V4PGKe3pclu/MqA4G2neALtxR26b8akbmnZ/JJwPz7w5f7qs126ZtqCN9bu3LtLRrm/Z7BWQp77U9ddqmYhhdqLPn0rbKOxLh5hjpfLCmNn6b7WXMz39ZV+u/sBT7h9ON/DA1mKetH62qbRpwAAAAAAAAiCAYWMHoj221ZCciy7pA3kidPF5K/j2bcG0MBpqJ0Yp9ZJOpLY7TxWDg20M7gdz77Jom3jvG+W/r6pbOA737UxIQ9E+09oK8bxgdb9m8az+veKfXrS821P5LP8H/ru8DCbHlQMVc8V6XefO+xdvrqv/WfzZBRn/sqGWT7vd0nUL5c8xymlc9Sbe4ovZe+bJWtY3WPbFn3tOX/ns5GFg2e8dt6zItN8uESht3ofr3U+8MDJ4urDyf/J29cDOc/7qmegs9tZy993gGuTxX/W/MU6c9necIYn0IncgP04Ruq3Y+M/Xjkh5rkKIAAAAAAACQQzCwhtFffbW9vpwtYdj7ZFmtfreXvaNIlkxb+SSalBwjGGgnlB6s2EDOwuIdtfZkoE5+MU8gBhOjTcfpWjBwdKJ2zKR7aVJczjP31r3VKV/2bahODrbU+r3diV2bUh7yuAm1BbX006n/5OaQp2i3ZD1QobAEr7l7vSLQvbilBv6rrhwuuTKSOc/BQB8IWdRpcA1BKUFuiOhl73icZ6rTXPL6JJatvbY0jwJjpi1bf3SoTsIyVBk8C87t7ZHa+vKOe6Lw9pLeR1/t2ffgSnnU9d1P62o5WJrTLv37zL8/0NO2TI9e7hWX+QzbuPdnqv9oXS3dDvdRFwy8pZbWt1X/Fbn
5Jhn9uWPz+JLuvxRah7ol0YOlX6cK3X87tEHpJZ42vSKV+cEzfBXWFT115/6uGlx3ZpC6I+i7zxxy88rihjqSMcucI211Wd+uZG2GX73C4/rcvn0q1VM6T362rnavPVNCFynlycU7atn0U/5K1YYzxsVA7Rb6eeU+XgpX3lLzGW4OYPU6nsbX/cnDByvZfI8xHmNPgotn5sbdnlp+wo0vU8XfJ2r/gR5P6OsCMB9UrwI0FmblnReHauc7qT8r9jc8U8fRa0bGqmNL8xgrauPpQF2EcwsXZ2rw647akHmK1NhRt0t72W918wLH4Q3CLc6nqi+Zz+2P1PnLvqtT/BxF6Vwvh+r01608PcycypPi+egfo86ebWfzIPacD04K8ylTQ5vzeWf6BBJX0X3nL7fUYdTXGb46zOeZdH9o5cGhOgtjAS3ygRqeBu154jhxP97PZxWO0zZfT4I4ztPwSpvRiy2bPuvPzHxW9U341hbzJwQDYeqw75FbuKu2JRqUUQ4GOntq4/dpneCdfKC2KhhoJyYn0bB/JIZHm7Yyu/twkFd4o1O155cd7DO3ksQ8XWnSbe3al6cbqcFDs6SjLotz/nRBXZpPMhhImkOnuTxzS7B/uq0G8dOvNcHA7KnSacSsYmB+57QGLKeZuvygqVyh47qDdF0IBmpGg227nPLdikDrvNE+GKgNls+uDwaK3BAA41OdJ2f8qd7LU7VrbkYonFO7sWllMNDfTDqZvnbIUB19G91srSUYOEd0pM0HaM9kgoGnT9yrP8xcbK8qaPLuSG2W2oMPCwaKKwe+p2banE/954v6t5j/j8dlfx+pDfsbIxfXVN//5jbn0xQMHOp63n0m+4jPtXrVv/CBjqrVA6dvTNzifKpWhQpuWBw+d/PC8TaFG+Sb8sHwWG02HKeqH9/7IZ+PbpWvJ0F2U3Hxt1Qfz7/KLntohmAgdA3/Hrnye6wMUWDtcqTODnyFPLUduI8YDNTI3QJ3p+DpwCaGv2/Y81h+PMjuchm91hW0vv48jVZF3AhcM2/21YrJv3N9PerTfLLBQA1pDh1F2qes016HPNl0zzxF7j+bSkZq8IPpyPMuynGpzQ/SF9QDsRXdR5B3dw9fH6udg2vu73VmYlDef+7e1woemQiIB8n+uvcWfXl+6T5OBgPlu5dDdWxv4LmOIAV0nVL/cXSujn/wk2rJcfCMIOVkdV+djdl+n/+yqs9/Ww3i7/l9Tr6cyTh9S8WL2cCcQDAQ5o4JPRn4ck+tPdhXJ++qHkiQMdKCWvqhr84mtFLF8MiNHxa+6Wf9uNOna2pLjw8u3qT7eBLgWfrh2D1JFqxQdveJnzdtPJ+83a4MZv59pDbv76rj12b/Zh/RtpcD1+aY1aZe+0bnXd8FKmWu6e++Wvfb7PsVjWSOdGFhXfWnacWPFufj2nWd9o/8nK9O+5Mn5uGfMKCr0/wf2+r4rd/Hm313HWvm0FP54PzXTbX9/FyN7HH0OMgHy6qu1+jVnpt3u72bH6dFPpgE5wcuDRbubarDl36gpvuBJwd76ih1vJe79ibP9DzK1WIOBANhuvhzxxbq9DIgcTBwqE7/6SrkQvDLPG77JH802zxCHD5u6ypxXbm+6GePIvc+WSsu8xPto/y4s/8tpuP4Vlf69v0sPXXnWz3wMXe4V9x14AwL6Uid/boVPBpcfqy6uFxt1V0mnkv9u8zfP9e/w380tZiG4KC4ZIBZ9rX0uDfkyDKrDwf+gwTvz9WxzrvL4bLCvkNkJ7VWt9XuP5Z1vjd3px6r48fmv3XaBw1pjnQW53iQ3JDm8WTOSDfUK6YMhu8WfHOcWBbTmOpckObQTU4emzpptcXyxvKE7Iraf+0/8ly82A2WjjVLvOyqQVSGRrq85UuROAsTiIUlRKSPECxlG0w6mqVqV20/4JZaeeQHkDF/bNs6dPnnqW91p4ra/DDwk3Of79VOJheWlDF9MLMcTLAUsPT3Dt+N1OnTVVcH315R28dBfvB
LDkub2bPBIL1dNjEofU+xvDRkq+M05LtU39QuF/N72B9u7jOGSPtUWrJ9nokDeoKfEF57ousY/f9yU1ptMFAP2o8euAmm6V2hBKaVuP9okXFcEMRvqudkzFro98f9y4axcSsaxsZyPklbBlryulT/45WZ+PJthC97MvaVG2NFu/x9WBc2teMSAEoZ1A0ffBxNKV3itPC/ZfvoRO3JWMG8Oubp6XjXZ4Y4f54vf1cwS/umts63yw+P1MlTyZO6bNzfU6fBSgN1Zac5v0ogo7h0tN6r6n9jtvNPhKeuX2mptxZtt+0jrOblyzq5m8lhzqiZD9K5P5trOPpjT635bW59of8d9Mld/afz4KXu69i5I1MHbqj90lLWFcHF0bHaMsddNPvwn1Vy4d7FHs6fpDAPg/ziX430NDHuSvbx5MnBPJB28ce+n7/Vn5fmTauDpVJvVAYDA6T9KGw7PFIb5piLOq0lqDfS5/6d2da1d/LARBakNMusPvFjC22hT/+ur9Z1nWHm8c4b07hM3uZWj19sPfpVNI8l7ViL8zn9yfWTV/6p2zT/G0d/7LrVXyrnNGXsVVEHNuUDT/IaBAxf+Ff0fJ+6Cb86Hzha5tkk5+pw1f22zedtWnpZxaAqGNyQXhUQDISpor6CjSdknL3PdoJ3C47U8YOg0cvsqbXffPNX2flbUft23Wj/frDENvkTa/63rG2WHn230XppiJLmhdQtiVreJnsEXA+EkstkadNpJJXWeBUBzAgyWRVOHIQkl+bR+g5RPKgtulyaeDe471Q1gnNAQ5pLfWL/bpYMNJM4hXcLnqk9c03MUnim8jB3otlAx5La+8ttETP3aQ4dpH3b5Jbl1W32r8Wbgi5+c53+Yr2lDZf/qGgzs/Ir7yRObJMvZ+La7963m9GSMvGEkEfa+5aTnWCozw9ZP60mTeV9g/n1EfMlG91+emrj+41i3pG7b6vaTGN27HTfM6yjG4/TIt9V9k1lH5rGPmNMU59hHklOFGmytDrzE8BuTFBojyv69ledhIH5ptB/zChOPrWp52SirWjYh2weGzfTPDaurMOMLdtHtw/3289+XrbftYF2X/Zk7Fs1nsnSsqkd9+U9aVA3fPBxNKV0idOi8rfIvETH+GvP5mnzmhB7M8xwoLbNRP29vWxCvrmtq2qX86cmmspOq/zqn8QIl5CTFVyyOaHK65fngebzMRO95b8znwNXomE+yNTnrp0pG+Z16f9sfh8tA1l6cr3YbmVIn2l1T/Wfrvmb4fyNnHHTE/Svkk8+ReUse8IvRvZT6OPlgZIj8y7yr+IbEeJyVnE+mrjeuHVvXW0/C24oDZD2ozhnW532RrOtHGPt4MS+T2650LYU+wz577lae+G+XzN+kWBfbNaONZ+Pee2DqUdTfy/1xT1uLkC3E/o4Wd1raJsPDO+P3Sozn+rrG+4kyGvGO+v7hZtIcqrzgaUpz9YiebJi3zG+3ckCxCXyPD5Om0EwEKYKqdDSga5Exy9+14wssffFrjrxk4Mm4m8rIF/ZyDF693XBN9tkk/O+crV3Iurt7+l9+32M3vrHnXWBde+yy3+L2Y99GnDgng5YeHAcVFo1BVOWwVrcUP3kI9Fm8OaOsfbPvJFJNyw57u/jVQQwI/gGsDhxkCN3Etm8fVFoOi0ub7j3cbpBtptwP68pd668tGyoukhDmkt9svpwOw8EFjpj/m60z3fUSctg4NynOXQQ6VA3tE2+4967fxhNrstdnUtq+4XvrI7OVf87NznpJjRlMkXXaz+fqkQVGNSRh+rM/334x67rNyzqMmo+CDr3Msg4/8WV8/TTf76dbznZCYb6/CD1anWaBtf6lzO/HEy+7Ezvsfteth+db7bMXa7Zu5XdzS9msGn/vrbn+oMGGWhWHVsf5+h7s4+8zWw6Tpt8l+3j3qbqmzbk8kzJO5RtW9DYZ0zg83JV+zWXJCeKNGFbL4NunQeO/WSYvQbRBIJV74dAIFwFKfPF8hlOPrW
r59xT1rq9Wt9T/Zd+eayQFmPjRlqNjT1VZawNthyaczfpsKb2nuq604yr7cpBPbXzp98uYvj7pk2DrN5u3Y6PN4F29eNoqtoW+XxxRe3+afo3evwftTGdwuelZZ1/k8HAVm2dXLeeWvnJ7Wf0PLw27cqOpbbNl/KYP6nrgu/5UtLZ9+9tZUvcnYevsWl1PibYbv6tz+fhoRq8viiXY4CWNM0H5fla11vf+5uZ/tpTyyZPBvW2zDna5SrN04BmSUhb58erF4Xtlv/IkOoziYmAYu1TVlLOMntq2dxQ4P+ckWx/8nlb8dZXZinPo4r6v+J8NHl/v2gqGFQ5Zzs8Ufvf+ZVC7FPgfbVnnxJzT9KVjrG4rDYPTtSJr1cKfYaJPBlojpMev2QrZOn2afvXgTpLDq7rz8cQPg1unkDtH/jxV+rJQLPUq9mX7m/kD/x42uYDcx72BqbEO5gT+fLWN/n4LKc6Hzg+5MlAyZNV+y7ilrmtC/iO15cRCAbCVCEVUrrzW8zkprNdumOgVEEE+kYhNfgKPzNPH5j/LlbqcWUgv0UXysTTVDk1BbOugbTbpyug+mCgdCbHqwhgRvD5uzhxkCMTAlsv/AcRYaDY5XmXt+rKndtnu4aqkzSkuaTd8tpaNrESd8bsHaG372TLw9m7yJ5X30E092kOHUTas7q2yUze6LyvB53yMvcMGYx8WRw8FtvzE7XjB6ml9w150nVk1E5L27yoB7vJOwUj5LclJ5IgTX1+kImM6iXPK6619AH9tZD80SvcpJVTzD+eaB9mAvH0mR7EfiZLjeVKm9l0nDb5rvxboj5gY58xgV96v6r9mkuSE0WaQlsvy8Ctq3X7/9E1MN/V+ULeN0P6wlVI1j/Z+1LNZGu7es4sdRwuQWwmx+7cD568kO1TxuWggnZjY09VGWuD/a2r6vDlkdr4dFedvt5Xy2aJObt0dH6coXm6I1oO3Cpp0rodrxmnayZ3HE183YTE5/Vj/Rnn8kztf6nHQp9Im2qeFNpWxzLJKWmaVK5T4roV0rFl2TFUXRePvN94/dlQ/3Z/Y9o3/XzyuWmfrc5HczFQu4VXStxSKw8O3Q3nAGPQNB+UrLsT9bbUQyuNr0FoaAsWV9TOC7/Xv49d8D/edgzM6yC27JOPife1J84jrw90P/2z4P1spn0x264etnvSMeZypC6Oq2+saV2PS71i2319dN/emjpg9Ym8Nz1/SGS75m1B49I0fjFcDMJXdGhLyyBHROeTYvhs3e4rvmkme92OubmioQNRmQ/e6+N/acqAD3DWMLo4Vft+1YPycv8t88GVkDSKln1N4d8hWXeNTB6v68tUQTAQpgs/cVF7971k8qyiCaLkcueiucs7eSdMXullg6+/o0rA/4bC3Y/ZS1vlzrC2BU6228zuKMuQx64/3VJHb1K/VQJ7d7MnIcyazbIsabJhkXdNXGUABtOPTDpXTWz6zkP9k4Euz7py4Bo3KRPlPOUbwVZrvXeUpjTP6pMznb5uYnBJD/6ybf3EzvJT/VmrNCTNoZu4wam/0zCBWwa0V9EploFc+GTgmb/rTzrw+Z3VVU8G5nVk8ISWf1/Agpl4NB8kB5I1/OFWBSAgMB61+UFenq+vpbnrX67l8KXuA/3TtmD5tQ7u+pfgjLxHutTfi5D8kC9XdqoOZTkkP7GX7cNPgo8uztS+DRDlbWbb49Tlu/I+okFoY5+xjNtnr5sTylelqnz7yVtJ/+KyRv4axN+VfBos5QrQllKZ1+NRqX/cUm3t6rmQ0ZsTtS91mCxj2GJs3EirsbFn3DY0xB9n/Zt11bN1sBnr31W7T0z5jMqhHuvYm+9GF+rswE0sZgGZ1r+hZjw/0eNoUkEjQ+LzLgcDXXBtWe39WZEXW7V1ietWSMcxyo58z7wzymwXI/NNX+ixtV267q6+LsHviq/f6Fwd/xAcZ9y22wQYXh+rHTuZnS7nAHU0zQeV+peGRF3Wvh6qCJrIvOTnuu2Rn/HuSG3
Z+duo3TBltu1TVu91P9wer20wUN7/uaBWnvgnkoMxZPmdcxXnE6PL6vnRlmtfs1dJ5bRJv9HwXA0er9h2L1uiVZ7m/3RD9V+7hMvGDIvRnPLfR2rztv5u4bVZ7WkavxQwbeDzHResS6W9Jnk+IaZ+M2M5O59dfNLt/NmmPseeWv5Hy6ccU/ngTd/u2wR97SorTehzOvYrBI4fDDQrxeg8u7isduKnD1vgnvbT+zcrwsh7OHX7MdD9nfCm6PMD80S7Pkd5Gj1JTV+mBoKBMF3IHZG6w3VWqgTKmVwqsLyyMZ0/V7HHSiUn3yl5T1eiZiem0+cDbrHlteqbClx+J0qufGekTh77AVus71Cau+Pt4CthsmHxa9uPv24xzAbnav8Lff2r7kCVtbGjvCIdorGDgXInSqoxnxvq07zQicqWJNCDT3nfWcVa67e+2FD7LxOpSppDR5G7q1dSN/u86/tJxYS+/jr9qaK9lLZbM/LvGIi3ySdbdXlMHkeXWXl30jiTe5rmpTsgRW1+0Jw+XU73f5r6R8GTpYX6OUVlftD648h7q1JKm3n14+T5rryPeBDa3Gcs4p9uI1BlkfQt6/vkfkI3T3/f9ttt/DUo1Q36mjxyg3n63TAulXny3qY68nVYcz0n9UTZccbGjbQaG3vGbEMLyHe1MjF38kh+u588lqc5Ukpd2Po31IznJ3Kc6usT1z1hPd7lYGD25H/BW2rlu33/3qY2bV3iukXp2KaPYJFAd2hwLQzuKZae6pl2PF7eUI4bmx1nnPMpS9sCY9MwH5TXS0GQI1GXNdVD1f2q/DtVY7fCjdOGoO6P26Wq4+SvlGhRz77V+0+lSbAUZfP5VB1nSe3IzQ1V9YFWzit5nEIQ1PQtU+kWzC95LvwTdsbW7XmA/JbK71aeTz7ubT6fVP3Ws0tFSx4w55Gsr7W16abN8sFFX61XjekkX1edz1j5wGOOJ39r7Gsk8IHc+BiFchnERupbgraxiSIEA2HqcI1G6umARCbPllMJJuLen6n+o3W1FBWuuCLpLfrBxe0ltf6or07D/B+ufay3uXVvVW39Gr4ctn2BG73cKz5aHX7n8sK+GHblk2iQJp3Dy6E6CV64a5eLeO4qsXLDLA2Urnzl3TfQOWRwUzWBapYZ2ftutZj/fQM1XjBQJrma7kTpPnVpLmmXdaLe67rBTpjIGuX5naglS5O0pDl0GJlMTAXWawZPWQfbtIcHQXup2+5V3SYW2m7NxWBPbXwVLrUUDXLeHKvtbCmmnrrz2braleVrDK0nETWybbhkFLSjLj94Ll7oa/mlX0ps8Y7tA4XXO3wHhfn78vquGgSTfKX6OcHwzz215vOUeY/F4V86f5j9ST/s/anak/xifsN3e+rw8ardXtrMNsdpynflfSQma5r6jAHyZFv1y+bnC0nfsr5P7uug8BrKMkbVwUCN3MFN0BXGJM6TZqy58fRYnUWNSX09V56c7H2yovczsO+vy2gYG7eicWzsGacNjckmhPN6z9zk436vjJ9H6vSpnIuuS7/cUHu/7KhVs43Uha1/Q914fhLHaTFJLf2foB7vcjDQviPw/w3TITcLYDe2dYnrlkjHpj6CI7zO3mAflmy+qae2XkQVvRx3sZf3VXQ/oXCc1ueTa18p8SxRvgDaUDMflNdLQf8yUZdNIhgYj92S7ZOl+snA4nFc/3nrINxHi3pWM/rrUG2FY4rv9tVJ0N42n090HDMOjcu61AcJpb0Nj2Pb/cfRPLTB1BlP8nqp8hUz2byTTtvwXagtkd9S2RconY9b1rn/Kv/BzecT1G+2Ht5S++G4W1OX9ql0S+aDrP+QUPJ1dD6p31v3W4plIQja+ve/j83wtNA3M9c5HBvKjSildqdEXV+mGoKBMH1IpbaoG6ixX8bZTGOlN6Oc6/OqDRJBN6h7Ie4EMROJZjmC+vWp54QPSPPh0aYtl2YZuuybo1O197lpsNdUP6iGSHPoOlkez+7mnGGyemGl4d3
BUEWn8sM0kfWjqwOtAAAAHxezrJrpN91V28HdE+bGafsU5v1+FoiYJoZ/+PdYpZ7OkMnlOIAIAPAROP/F3aBYXuYSrp23h+4GocQysR+M3DRrXidxTWNkgoEwlYxe7fmXh25faf3jOroYDLw43mJCbZ54d+TW2r6mgPnor323lMI1lL+Z5YppLsvhLD/276TSyHtWekHHgTSH+cDcge3eJVBaImaWuLzw74QpL9kC49CR/DBNmHdofIQbhgAAAMZjqI6+1WOdhWW1M/AjoMuROvvFvTN62iaz5ckoZ8WNXwQDAeCm+Htgb/5jDvgGkPfQXtd8rH+dxvqz62sXCQbC1DL6Y0etPTxOPEb+YXQxGGgCCRvftnzZKnSDt319zffV2XUEji6O1fb9q72IuNNcJc398hirwVLBbimoaIkM0hzmBvP+lDW1fTzLbfBInf28oTZ+IxD44XQhP0wRJlD9cO1KL7MHAAC4Vuxys+HyhbfU0lcbam8wfX0AFww0S8SaJcQr2lSCgQDw0XFLllYvuwrXiY0npJaEnjEIBgIAAAAAAAAAAAAAAAB0FIKBAAAAAAAAAAAAAAAAAB2FYCAAAAAAAAAAAAAAAABARyEYCAAAAAAAAAAAAAAAANBRCAYCAAAAAAAAAAAAAAAAdBSCgQAAAAAAAAAAAAAAAAAdhWAgAAAAAAAAAAAAAAAAQEchGAgAAAAAAAAAAAAAAADQUQgGAgAAAAAAAAAAAAAAAHQUgoEAAAAAAAAAAAAAAAAAHYVgIAAAAAAAAAAAAAAAAEBHGSsY+D//8z+IiIiIiIiIiIiIiIiIOAMaxgoG/ve//0VERERERERERERERETEGdBAMBARERERERERERERERGxgxoIBiIiIiIiIiIiIiIiIiJ2UAPBQERERERERERERERERMQOaiAYiIiIiIiIiIiIiIiIiNhBDQQDERERERERERERERERETuogWAgIiIiIiIiIiIiIiIiYgc1EAxERERERERERERERERE7KAGgoGIiIiIiIiIiIiIiIiIHdRAMBARERERERERERERERGxgxoIBiIiIiIiIiIiIiIiIiJ2UAPBQES8Hv/9o1pYWMj8+l9vS9u8/dfXwTZfq4M3xb8jIiIiIiIiIiIiIuLVNRAMRMTJ6wOBP/7b//vNgfo6Cgi6QOCP6j8V/0ZERERERERERERExA/TQDAQESfsW3XwdflJQBvs+/pAvbX//o/6sfS0YPp7iIiIiIiIiIiIiIh4NQ0EAxFxstqnABuW/KzYphgwRERERERERERERETED9FAMBARJ6tdItQs9+me/ku+MzDbJvielqVCEREREREREREREREnp4FgICJOVBfQMwHAIKgXvzOQYCAiIiIiIiIiIiIi4rVrIBiIiBPVBfQqlgCVQB/BQERERERERERERETEa9dAMBARJ2plQM8GAH2QkGAgIiIiIiIiIiIiIuK1ayAYiIiTtSLQVwgG2mVDK54e/PpAvQ0+K/wteAdh1XaIiIiIiIiIiIiIiOg0XGsw8D8P/aT9w/8U/2aDAsGkfvz3/75VB18Hfxejyf9icKAcWOiuLn2y969hOVA0s0+XdeHa/kf9qK/Bj/8ufl586s9tUzxP8nV7pY6syOdh4DX195KT3l+XdXk3q2tSQemojWuXp+Ua1Oy3c5KPcTq8Un8yKudxm4f44V6lXZjHtiR08u2AG89WfMe/E5vyjzgPRmOAlnXJWH0M6pQp1Fz3uuvm2p30NTN/C79bt63XtFNz13YjInZQ36aXY15OO8YYq76fzDjHcH3BQDnp+MSjyZPMQuLEHS1vkEjl4I+xXYds9nUZgKCJ0eeVqABJ/qAjfTMWA39aX+7DPOu2ycts6TtYozQC2lTDMvZk16T311V9OmVp5P8d1j/xIN7/u76+rthv6lp0yknnu0nvD+fBuO1p1RbFeYm8hRP3Ku3CVb7TNSffDmQ3t6YG60zcI86Jfs4hqFfisWzK8fo
Yef1FnTINyjU/0NflR3VQce2kjUheM9NG2LbDX1v93wcPdZ75V0VbJHOoqfYGERFnS6nTK9qIqwcDtR8wzjFcUzAw+IHRj8wGVPKZ/bHms6BhbWwE82ChS9D8ePMRIHPnSzCwvvC4vNYwoYfXZ1a2w7Ja1A2IZJvxJmfm22IdW0rbsSe7Jr2/jppKB9te5Z+l6iSbz+sa+Rb77abkY7xpXX+y2J9q6mP5fBZ1wG3ZT3XKEa/iVdqFuW1LQiffDmRj11JdofVj1tJxELFb2rojnldI9wdyx+tjhONi6pQp0l57uWZv1dus/cjnJCuvmflumD98m+E+C/cVzYuMNTmMiIhTqdT51oqbScaq7yczzjFcSzCwOMGvrZsc8Y1rmADZ96u+J98JEzOxn2lQJofCgaT87sJn8blm5yiGFzTViSx2RuaiA9E0AP+3TvfCZ8WCYy2ku0vD4v5SnfworeNCXXvttIUKwRh+P3FtS9sXf6MEGv5TKHfzNPEzj0q+NHcq6v+Py7vNg8U8UKyD4u+Mv7+5NJUOtnzWp01TIy9luPj3RF3QOcnHeMNWlN90mRRTfQWtzV/lTj7iVbxKuzC/bUnoFdoB+5neNjPR7uj9HNj2JyrjTWMRROy0tn6omrMap48h2/6bOmW69H2+f5nrEtb/fj7IXMeadsDkj/xzaY8P9HeL+cLmCb+PpnEjIiLOiNI+2DakPB5L1ff2M9O+iIW/T2a+y3ANwUAJlPyofpSTSHaQ/EnYbRODLjlxMdyHDNrCE88GctM1ESPnkl10nxlSn2X/TlxAtx85t3hg79M8SKPi9h3VplPbc5RCE+SjON2TE3zR90rfkc6b/x1N104qg+AY9u9ZXo6ubWJ7d7z8GNJ5zM/N/+a4YsAOGeTLRJ6M82ExjxnjOmO8/c2vUX3QpqwlynCsvT7ZPsX4WF2UfIw3rM0T5X5EoV2PtXkrkY/IXzhBr9IuzG9bEjpmO5AotzYdg/Kftz3l8VabNh4Ru6qrEwp1TGjrPka53qJOmRLNNfRjD9MWJK915TUz+SO4znY792+bByra5bzNKf8NERFnyKB9iOfxjXF9X67/r2e+yzDxYKD98f5k5b/TDZ0/KW9+Av7kgr9l+v1kwY8wkXxCVE7e3JAuDRKdvcIFDi6ofCdOs8JFddtLmqUnrRo6px0wfd4V2vwRF4piukuaFTtyxW3sMWs6Z43XrrFwJq5tfLygQsm2idOhZSWAs2qizgivd+H6p/K1/06Wt8bZH7r08cblPTNo42rqDGOy3ijVT12UfIw3rM0T5X5Ebf8i2Z/Qkr9wgl6lXZjftiR0vHYgmWapbXw74+qGoC2K+uSIOD+6+qCm3W/bxwi3o06ZPauumfm8YQyYMmxzUn9HRMQZsdA++PnBoH4v1vcfb77LMNlgoD1w/MPyfyf1iZP/YJlAjU8o/0wGYoVGMttPxeTNDZlqzO1nhTSpHqhnaVhIE7e9Cxg1fLcu7Wfc2sm6OrO84s3SKFX4wvStTuuU6Wsn+bsqUBte2+Lfsnzvld9pP4/yWNtKAGfVOC9GDUvl9c/zn7WiUWm/vznT1x1h+XTlvL4eaqqr0nX1ePXNbEo+xhvW5oly2awts7YeSOQj8hdO0Ku0C/PbloRevR1I99v953E7I/WD7xeUJoERsdvauqRqPO9t1cdwdVRWh1CnzJ4V18xc59r8UWGxzUFExJk1bh+ivkN1fX+9812GCQYDZXAUDZ7MD6sdgMr36jpT+TYmEV0HSv87TDSfqJWTNzdk6uLazwppEl3Q7FyMPj0LF9Vt79IrT5uktWk/49o0aXu9o3Sy6RKle9wZD78XbN/c6ZfjpK6dsXzN8mNGx/CVR2G7qEKx5SGuQErHxG4Z512tz3s270TXP6uLjT6vFOum8fY3r6bqc6k3auuFhm3KbYIxcU06J/kYb1ibJ5om6iJtG5zIR+QvnKBXaRfmty0JHbMd8H9z+s+jbYrtjFb65uYY8SAfEbt
vWKek/i7a7er7GKV6mzpl9kxeM9MWXa1PWGpzEBFxNk20D7aO9+OMuL53f/Ne43yXYXLBQBkYVWo6PHl0M08MfzL6s/zHy/blbez3UtvIZ1PWcKYac/tZYVAeXtDExTUWLqrbxnVAK7afB5s6y+Hfk4UiTrt2wcDqtK74e/LYov9O8tqm80983gQD59F0XnONh64Xw+tfUU4aGxXZJt5f8Pd5s1x3GyvKfcH6YGCyDEd1QTclH+MNa/NVOU+ky6SY6itobf4K+qWIH+BV2oX5bUtCx2kH0tvGfehiO+O0aW22+Xe6bULEjmrrh5Z1amMfI58bS1o7tsCpMTlGMdf2an3CVJuDiIgzaGX74Nr4Qn3/Eee7DB85GOh/uPm3/Pjge+6k805R1snyna78pPJtJKHcyU/fYDfVmNvPChcvvKDu3JIZIDt/t72ca+oY8TZdNX3uwd98Jyw5QSJ5L7sWLu2LaZYXVPPvuuO1u3Ypw++F1y19Dd0ERH6c5Lm1rARwVg3rjPBzn1+t/vrbvBAPRvx2Wb4ZY3+Fv8+XzXVtRTpWNOyZqfJqv9P1NCcf403r8kaxnQ3LdLht8e9xPrP1QynvIV7Rq7QLc9uWhI7TDrjPmvrtdW2/219N+46I3dHWsXX9g9gr9DGaxgw4faaumckrV+wTptscREScOavadN+fsEp9bz/7OPNdhsm+MzDSDaaiHxqedGiwjQQ7YsNOU3qb6Rvsphpz+1nh4oUX1P93+J0gzVwmijuR/sKX0jDOSF00LhzOOGAmaZgXQp/Ohe+W076Uh31hLnTg7b5N3mtx7bJt/d+1xWtVvLbu+MF19Mc3+5Nt7Pej808dZzoNroM1kWeDcw7Pe7716VaoR7xZnvPXP5Fns3wd5btW+4v/Pk9WpmWQb316leqauIwWjLepuR6dknyMN69rg/N80ar/FJdz+2/yFk7Sq7QLV/lO1xynHYjTK9wmL9+2rUm14UH/NG/zEbGTJvqhbRy7j+GPQ50yQyaumWk3rnoNK9scREScLWva9GwuS+r7RD/juua7DB8/GGj0J+l+aOLvWtdRyrdJdbyK20znJEyqMbefFc45vqBhVNeoL3whY7jti2mS+E72t+6bF5Lq80/lKfe9cNtiOmbbFK5XnNZh3mu6dnqbYKKh/P342vq8UdjWH8P/JntecYdxJiYm/bkFv710PUoVYirvz6M+7RJ1Z55ngusf5zn9PVceZJsx9zfXtqhrE+kd/j1ZZrN09pb+3kXJxzgdFvsHcR5xZb7UiY/yIxN3OHkb2gXbR4rza8N3Ou+47UBzv932TSvSUcYflH/EbitlPWlWP6T7C/V9jEhf/1CnzJCla2bamquPN+raHEREnCHr2nT/t0J9/5HmuwzXGgxERKzUVnRxIKU4iEp2hm2lefUONiIiIiIiIiIiIiLivGggGIiIN6IN9CXvaBDdnQ3lpwArntJARERERERERERERMSCBoKBiHgD5oG+4tIp4ZOC9cFAlgpFRERERERERERERKzXQDAQEW/A/B0tYVDPvY8hDwhWLxNKMBARERERERERERERsUkDwUBEvAF9MLC0TGj01J9/gWoe+JOXohIMRERERERERERERERs0kAwEBFvwKqlPn2wLwwS+oCg82t18IZlQhERERERERERERER22ggGIiIN2DV+wATwcBYv0zoj/9O/A0RERERERERERERETMNBAMR8UZ8+6+vy+8DjJ4YTL0z0H4veK8gIiIiIiIiIiIiIiKmNRAMRMQbMl7uU94HGAT6/BKh2VOA/qlAlghFRERERERERERERGzWQDAQEW9QCQB6S08KagvvDGR5UERERERERERERETEthoIBiIiIiIiIiIiIiIiIiJ2UAPBQERERERERERERERERMQOaiAYiIiIiIiIiIiIiIiIiNhBDQQDERERERERERERERERETuogWAgIiIiIiIiIiIiIiIiYgc1EAxERERERERERERERERE7KA
GgoGIiIiIiIiIiIiIiIiIHdRAMBARERERERERERERERGxgxoIBiIiIiIiIiIiIiIiIiJ2UAPBQERERERERERERERERMQOaiAYiIiIiIiIiIiIiIiIiNhBDQQDERERERERERERERERETuogWAgIiIiIiIiIiIiIiIiYgc1jBUMBAAAAAAAAAAAAAAAAIDZgWAgAAAAAAAAAAAAAAAAQEchGAgAAAAAAAAAAAAAAADQUQgGAgAAAAAAAAAAAAAAAHQUgoEAAAAAAAAAAAAAAAAAHYVgIAAAAAAAAAAAAAAAAEBHIRgIAAAAAAAAAAAAAAAA0FGqg4FK/f/iav+e06yChQAAAABJRU5ErkJggg==)
515
+
516
+ #### Getting OneHotEncoder Catagory Info and Testing Ordinal Encoding for Age (can be ignored)
517
+ """
518
+
519
+ # data_cat_tr = cat_pipeline.fit_transform(patients_info)
520
+ # data_height_tr = gender_pipeline.fit_transform(data_cat_tr)
521
+
522
+ # data_height_tr
523
+
524
+ # data_height_tr_age = data_height_tr[['Age']]
525
+
526
+ # cat_encoder = OrdinalEncoder()
527
+
528
+ # patients_age_ord = cat_encoder.fit_transform(data_height_tr_age)
529
+
530
+ # patients_age_ord
531
+
532
+ # cat_encoder.categories_
533
+ # array([[7.],
534
+ # [6.],
535
+ # [6.],
536
+ # ...,
537
+ # [4.],
538
+ # [7.],
539
+ # [5.]])
540
+ # [array(['10 - 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '60 - 69',
541
+ # '70 - 79', '80 - 89', '90+'], dtype=object)]
542
+
543
+ # cat_encoder.categories_
544
+ # [array(['10 - 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '60 - 69',
545
+ # '70 - 79', '80 - 89', '90+'], dtype=object),
546
+ # array(['female', 'male'], dtype=object),
547
+ # array(['ASIAN', 'BLACK OR AFRICAN AMERICAN',
548
+ # 'CAUCASIAN', 'CHINESE', 'HAN CHINESE', 'HISPANIC', 'INDIAN',
549
+ # 'INTERMEDIATE', 'JAPANESE', 'KOREAN', 'MALAY', 'OTHER',
550
+ # 'OTHER MIXED RACE', 'UNSPECIFIED', 'WHITE'], dtype=object),
551
+ # array([0., 1.]),
552
+ # array([0., 1.]),
553
+ # array([0., 1.]),
554
+ # array(['*1/*1', '*1/*11', '*1/*13', '*1/*14', '*1/*2', '*1/*3', '*1/*5',
555
+ # '*1/*6', '*2/*2', '*2/*3', '*3/*3'], dtype=object),
556
+ # array(['A/A', 'A/G', 'G/G', 'Unknown'], dtype=object)]
557
+
558
+
559
"""## Sending training features data through pre-processing pipeline
##### patients_info -> 'X_train_prepared'
##### y_train stored in 'patients_labels'
"""

# NOTE(review): the opening triple-quote of the markdown cell above was
# missing in the original, so its lone closing `"""` started a string
# literal that swallowed all the code down to the next triple-quote —
# X_train_prepared was never assigned and lin_reg.fit() raised NameError.

## showing un-pre-processed dataset
patients_info.head()

# Run the raw training features through the shared pre-processing pipeline.
# train=True presumably fits the pipeline's encoders/imputers on this data —
# confirm against the full_preprocess_function definition.
X_train_prepared = full_preprocess_function(patients_info, train=True)

# showing pre-processed training dataset
X_train_prepared.head()

X_train_prepared.info()

"""##### Send pre-processed train_data to excel (labels too)"""

# X_train_prepared.to_excel("X_patients_train.xlsx")
# patients_labels.to_excel('y_patients_train.xlsx')

"""## Making Sure Pre-processed training set works with basic model"""

from sklearn.linear_model import LinearRegression

# Sanity check: a plain linear regression must fit the prepared matrix.
lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, patients_labels)

patients_labels

from sklearn.metrics import mean_squared_error

# Training-set RMSE — an optimistic estimate; real evaluation happens on
# the test and validation sets further down.
patients_predictions = lin_reg.predict(X_train_prepared)
lin_mse = mean_squared_error(patients_labels, patients_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
596
+
597
"""## Pre-processing on Test Set (currently stored in strat_test_set)
##### note: strat_test_set contains features and labels
##### produces X_test_prepared and y_test

#### Separate strat_test_set features from labels
##### stored in X_test and y_test
"""

# Split the stratified test set into features (X_test) and target (y_test).
target_column = "Therapeutic Dose of Warfarin"
X_test = strat_test_set.drop(target_column, axis=1)
y_test = strat_test_set[target_column].copy()

"""#### Send X_test to pre-processing function/pipeline
##### stored in X_test_prepared
"""

# Re-use the pipeline state established on the training data (no train flag).
X_test_prepared = full_preprocess_function(X_test)

"""##### Send pre-processed test_data to excel (labels too)"""

# X_test_prepared.to_excel("X_patients_test.xlsx")
# y_test.to_excel("y_patients_test.xlsx")

"""## Making sure Pre-processed testing set works with simple regression model"""

test_predictions = lin_reg.predict(X_test_prepared)

"""#### Evaluate mse and rmse"""

# NOTE(review): unlike the validation set below, NaN labels are not dropped
# here — presumably strat_test_set was already cleaned upstream; confirm.
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)

test_rmse
629
+
630
"""## Pre-processing on Validation Set
##### produces X_val_prepared and y_val

#### Dropping nan labels and Separating validation_set features from labels
##### originally stored in 'X_val' and 'y_val'
"""

# Rows with an unknown dose cannot be scored, so drop them before splitting.
# (Mutates the shared validation_set DataFrame in place.)
validation_set.dropna(subset=['Therapeutic Dose of Warfarin'], inplace=True)
X_val = validation_set.drop("Therapeutic Dose of Warfarin", axis=1)
y_val = validation_set["Therapeutic Dose of Warfarin"].copy()

"""## Sending a single instance from X_val through pre-processing pipeline and making sure it works with simple regression model"""

# One arbitrary row (position 3) to exercise the single-instance path.
trial = X_val.iloc[3]
trial

trial.shape

# Wrap the Series in a one-row DataFrame — the shape the pipeline expects.
trial_df = series_to_df(trial)

# example of input for full_preprocessing_function()
trial_df

X_val_trial = full_preprocess_function(trial_df)

# example of pre-processed single test input
X_val_trial

trial_val_prediction = lin_reg.predict(X_val_trial)

trial_val_prediction

# Ground-truth dose for the same row, for a manual comparison.
y_trial = y_val.iloc[3]
y_trial

"""#### Sending X_val through pre-processing pipeline"""

X_val_prepared = full_preprocess_function(X_val)

"""## Making sure pre-processed validation set works with simple regression model"""

val_predictions = lin_reg.predict(X_val_prepared)

val_mse = mean_squared_error(y_val, val_predictions)
val_rmse = np.sqrt(val_mse)

val_rmse

"""##### Send pre-processed validation_data to excel (labels too)"""

# X_val_prepared.to_excel("X_patients_val.xlsx")
# y_val.to_excel("y_patients_val.xlsx")
682
+
683
"""#**PART II ----> ML MODELS FOR BINARY CLASSIFICATION**

**First let's create a binary classification dataset by cutting the target values into two categories (<30 mg , >=30 mg)**
"""

import numpy as np

y_train = patients_labels

# Doses of 30 mg or more form the positive class for every split.
DOSE_CUTOFF_MG = 30

# Preparing training/testing/validation data for the binary classifier.
train_label_binary = y_train >= DOSE_CUTOFF_MG
print("binary train labels:", train_label_binary)

# print("original test labels:", y_test)
test_label_binary = y_test >= DOSE_CUTOFF_MG
print("binary test labels:", test_label_binary)

validation_label_binary = y_val >= DOSE_CUTOFF_MG
print("binary validation labels:", validation_label_binary)
702
+
703
"""## 1.LOGISTIC REGRESSION MODEL

Logistic regression works for binary classification because it estimates the
probability that an instance belongs to the positive class. Using a 50%
probability threshold, instances scoring above it go to the positive class (1)
and the rest to the negative class (0) — i.e. it behaves like linear
regression whose raw output is passed through the logistic function.
"""

from sklearn.linear_model import LogisticRegression

# Fit on the binary labels and predict back on the training set in one chain.
log_regression = LogisticRegression(penalty='l2', C=1, random_state=0)
log_prediction = log_regression.fit(
    X_train_prepared, train_label_binary.values.ravel()
).predict(X_train_prepared)
log_prediction
713
+
714
"""## 2.SUPPORT VECTOR MACHINE
Support Vector Machines fit the widest possible "street" between the classes:
they maximise the margin between the decision boundary and the training
instances. Other linear classifiers may separate a linearly separable dataset
correctly, yet leave the boundary so close to the training instances that the
model generalises poorly on new data — SVMs avoid this by seeking the widest
possible margin.
"""

from sklearn.svm import SVC

# # define linear kernel,
# svm_model_linear = SVC(kernel = "linear",C = 1 )
# svm_model_linear.fit(X_train_prepared, train_label_binary.values.ravel())
# svm_linear_prediction= svm_model_linear.predict(X_train_prepared)
# svm_linear_prediction

# Polynomial kernel (P158); fit and predict on the training set in one chain.
svm_model_polynomial = SVC(kernel="poly", degree=7, C=7)
svm_polynomial_prediction = svm_model_polynomial.fit(
    X_train_prepared, train_label_binary.values.ravel()
).predict(X_train_prepared)

svm_polynomial_prediction
732
+
733
"""## 3.DECISION TREE MODEL"""

from sklearn.tree import DecisionTreeClassifier

# Depth-capped tree (max_depth=5), fit and evaluated on the training set.
decision_tree_model = DecisionTreeClassifier(max_depth=5)
decision_tree_prediction = decision_tree_model.fit(
    X_train_prepared, train_label_binary.values.ravel()
).predict(X_train_prepared)
decision_tree_prediction
741
+
742
"""## 4.RANDOM FOREST MODEL"""

from sklearn.ensemble import RandomForestClassifier

# max_leaf_nodes must be None (unlimited) or an int >= 2; the original
# value of -1 makes scikit-learn raise ValueError at fit time.
random_forest_model = RandomForestClassifier(n_estimators = 500, max_depth= 10, max_leaf_nodes = None)
random_forest_model.fit(X_train_prepared, train_label_binary.values.ravel())
random_forest_prediction = random_forest_model.predict(X_train_prepared)
random_forest_prediction
749
+
750
"""## 5.NEURAL NETWork""" if False else """## 5.NEURAL NETWORK"""

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout

# Decision threshold applied to the sigmoid output: probabilities at or
# above this value are classified as positive. (Stray trailing semicolon
# removed — it is legal but un-Pythonic.)
NN_threshold = 0.5
760
+
761
def build_NN(n_layers = 3, n_neurons = 1000, dropout = 0):
    """Build and compile a Keras MLP for binary classification.

    Args:
        n_layers: total number of Dense layers counting the output layer,
            so n_layers - 1 hidden layers are created.
        n_neurons: width of each hidden layer.
        dropout: dropout rate applied after every hidden layer (0 disables
            dropout in effect, though the layer is still added).

    Returns:
        A compiled Sequential model (binary cross-entropy loss, Adam).
    """
    model = Sequential()
    for _ in range(n_layers - 1):  # hidden layers; output layer added below
        model.add(Dense(n_neurons, activation = 'relu'))
        model.add(Dropout(dropout))
    # A single sigmoid output neuron suffices for binary classification
    # (the original comment claiming "2 output neurons" was incorrect).
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ['accuracy'])
    return model
769
+
770
# Instantiate a small network: 3 Dense layers, 10 neurons per hidden layer.
NN_model = build_NN(n_layers=3, n_neurons=10)

# Train on the binary labels, monitoring the validation split every epoch.
train_history = NN_model.fit(
    X_train_prepared,
    train_label_binary.values.ravel(),
    validation_data=(X_val_prepared, validation_label_binary.values.ravel()),
    batch_size=128,
    epochs=20,
)
NN_prediction = NN_model.predict(X_train_prepared)

# Binarise the sigmoid probabilities so they are comparable with the labels.
NN_prediction = NN_prediction >= NN_threshold
779
"""## **Calculating the performance of each model in the train dataset**"""

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

# Training-set predictions of every model, with matching display names.
methods = [decision_tree_prediction, random_forest_prediction, svm_polynomial_prediction, log_prediction, NN_prediction]
names = ["decision_tree_model", "random_forest_model", "svm_polynomial_model", "log_model", "neural_net"]

# Metric label -> scoring function, each evaluated against the binary labels.
metric_funcs = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "ROC": roc_auc_score,
    "F1 score": f1_score,
}

data = {"Method": names}
for metric_label, score_fn in metric_funcs.items():
    data[metric_label] = [score_fn(train_label_binary, pred) for pred in methods]

# Keep the per-metric lists available under their original global names.
accuracy = data["Accuracy"]
precision = data["Precision"]
recall = data["Recall"]
ROC = data["ROC"]
F1 = data["F1 score"]

evaluation = pd.DataFrame(data, columns=['Method', "Accuracy", "Precision", "Recall", "ROC", "F1 score"])
evaluation
810
+
811
+ """## **Let's do a better Evaluation Using Cross-Validation**
812
+
813
+ **Logistic Regression cross validation**
814
+ """
815
+
816
+ from sklearn.model_selection import cross_val_score, GridSearchCV
817
+ from sklearn.linear_model import LogisticRegression
818
+ log_regression= LogisticRegression(solver ='liblinear')
819
+ penalty = ['l1', 'l2']
820
+ C = [1,0.1,0.01,0.001]
821
+ hyperparameters = dict(C=C, penalty=penalty)
822
+ classifier = GridSearchCV(log_regression, hyperparameters, cv=10, verbose =0)
823
+ best_model = classifier.fit(X_train_prepared, train_label_binary )
824
+
825
+ #printing out the best parameters for Logistic Regression model
826
+ print('Best penalty:', best_model.best_estimator_.get_params()['penalty'])
827
+ print('Best C:', best_model.best_estimator_.get_params()['C'])
828
+
829
+ model = LogisticRegression(solver ='liblinear', **best_model.best_params_)
830
+ model.fit(X_train_prepared, train_label_binary )
831
+ logistic_prediction= model.predict(X_train_prepared)
832
+ logistic_prediction
833
+
834
+ #calculating the accuracy of the model
835
+ scores = cross_val_score(model, X_train_prepared, train_label_binary )
836
+ scores
837
+ print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
838
+
839
+ from sklearn.model_selection import cross_val_predict
840
+ from sklearn.metrics import roc_curve
841
+
842
+ y_scores = cross_val_predict(model, X_train_prepared, train_label_binary, cv= 10, method = "decision_function") #decision_function
843
+ fpr, tpr, thresholds = roc_curve (train_label_binary, y_scores)
844
+
845
+ def plot_roc_curve(fpr, tpr, label =None):
846
+ plt.plot(fpr, tpr, linewidth=2, label = label)
847
+ plt.plot([0,1], [0,1], "k--")
848
+ plot_roc_curve(fpr, tpr)
849
+ plt.title('ROC curve for Logistic Regression')
850
+ plt.xlabel('False Positive Rate (1- specifity')
851
+ plt.ylabel('True Positive Rate (Recall)')
852
+ plt.legend(['Logistic Regression'],loc ="lower right")
853
+ plt.grid()
854
+ plt.show()
855
+
856
+ """**Support Vector Machine Cross validation**"""
857
+
858
+ from sklearn.svm import SVC
859
+
860
+ # hyperparameter_set = {'C': [0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': [0.001, 0.01, 0.1, 1]}
861
+ # svm = SVC()
862
+ # classifier2 = GridSearchCV(svm, hyperparameter_set, cv=10, verbose =0)
863
+ # best_SV = classifier2.fit(X_train_prepared, train_label_binary )
864
+
865
+ # #printing out the best parameters for SVM model
866
+ # print('Best kernel:', best_SV.best_params_['kernel'])
867
+ # print('Best C:', best_SV.best_params_['C'])
868
+ # print('Best gamma:', best_SV.best_params_['gamma'])
869
+
870
+ SVM_final_model = SVC(C=1, kernel= 'rbf', gamma = 0.1, probability=True)
871
+ SVM_final_model.fit(X_train_prepared, train_label_binary)
872
+ svm_prediction= SVM_final_model.predict(X_train_prepared)
873
+ svm_prediction
874
+
875
+ #calculating the accuracy of the model
876
+ scores = cross_val_score(SVM_final_model, X_train_prepared, train_label_binary )
877
+ scores
878
+ print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
879
+
880
+ #Drawing the ROC curve for SVM
881
+ from sklearn.model_selection import cross_val_predict
882
+ from sklearn.metrics import roc_curve
883
+
884
+ y_scores = cross_val_predict(model, X_train_prepared, train_label_binary, cv= 10, method = "decision_function")
885
+ fpr, tpr, thresholds = roc_curve (train_label_binary, y_scores)
886
+
887
+ def plot_roc_curve(fpr, tpr, label =None):
888
+ plt.plot(fpr, tpr, linewidth=2, label = label)
889
+ plt.plot([0,1], [0,1], "k--")
890
+ plot_roc_curve(fpr, tpr)
891
+ plt.title('ROC curve for Support Vector Machine')
892
+ plt.xlabel('False Positive Rate (1- specifity')
893
+ plt.ylabel('True Positive Rate (Recall)')
894
+ plt.legend(['Support Vector Machine '],loc ="lower right")
895
+ plt.grid()
896
+ plt.show()
897
+
898
+ """**Random Forest Cross Validation**"""
899
+
900
+ # hyperparameter_set = {'n_estimators': [100, 200, 300, 400], 'max_features': ['auto', 'sqrt']}
901
+ # random_forest = RandomForestClassifier()
902
+
903
+ # classifier3 = GridSearchCV(random_forest, hyperparameter_set, cv=10, verbose =0)
904
+ # best_model3 = classifier3.fit(X_train_prepared, train_label_binary )
905
+
906
+ # print('Best n_estimators:', best_model3.best_params_['n_estimators'])
907
+ # print('Best max_features:', best_model3.best_params_['max_features'])
908
+
909
+ model3 = RandomForestClassifier(n_estimators = 200, max_features= 'sqrt')
910
+ model3.fit(X_train_prepared, train_label_binary)
911
+ random_forest_prediction= model3.predict(X_train_prepared)
912
+ random_forest_prediction
913
+
914
+ #calculating the accuracy of the model
915
+ scores = cross_val_score(model3, X_train_prepared, train_label_binary )
916
+ scores
917
+ print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
918
+
919
+ #Drawing the ROC curve for SVM
920
+ from sklearn.model_selection import cross_val_predict
921
+ from sklearn.metrics import roc_curve
922
+
923
+ y_scores = cross_val_predict(model, X_train_prepared, train_label_binary, cv= 10, method = "decision_function") #decision_function
924
+ fpr, tpr, thresholds = roc_curve (train_label_binary, y_scores)
925
+
926
+ def plot_roc_curve(fpr, tpr, label =None):
927
+ plt.plot(fpr, tpr, linewidth=2, label = label)
928
+ plt.plot([0,1], [0,1], "k--")
929
+ plot_roc_curve(fpr, tpr)
930
+ plt.title('ROC curve for Random Forest')
931
+ plt.xlabel('False Positive Rate (1- specifity')
932
+ plt.ylabel('True Positive Rate (Recall)')
933
+ plt.legend(['Random Forest '],loc ="lower right")
934
+ plt.grid()
935
+ plt.show()
936
+
937
+ """**Showing the feature importance analysis in random forest.**"""
938
+
939
+ from pandas import DataFrame
940
+ random_forest = RandomForestClassifier(n_estimators = 300, random_state=60)
941
+ random_forest.fit(X_train_prepared,train_label_binary)
942
+ random_forest_importance = random_forest.feature_importances_
943
+ print(random_forest_importance)
944
+
945
+ features = original_df.columns
946
+
947
+ importances = random_forest_importance
948
+ indices = np.argsort(importances)
949
+
950
+
951
+
952
+ **Calculating the evaluation metrics for each model and then adding the data in pandas DataFrame**
953
+ """
954
+
955
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

# Cross-validated models' training-set predictions and display names.
predictions = [logistic_prediction,svm_prediction, random_forest_prediction]
names = ["Logistic_regression model","Support Vector Machine model", "Random_forest_model"]

# Score every model against the training labels with the same five metrics.
accuracy = [accuracy_score(train_label_binary, p) for p in predictions]
precision = [precision_score(train_label_binary, p) for p in predictions]
recall = [recall_score(train_label_binary, p) for p in predictions]
ROC = [roc_auc_score(train_label_binary, p) for p in predictions]
F1 = [f1_score(train_label_binary, p) for p in predictions]

# Assemble a per-model summary table for display.
data2 = {'Method': names,
         'Accuracy': accuracy,
         'Precision': precision,
         'Recall': recall,
         'ROC': ROC,
         'F1 score': F1,
         }
evaluation = pd.DataFrame(data2, columns=['Method', "Accuracy", "Precision","Recall", "ROC", "F1 score"])
evaluation
984
+
985
+ """**Drawing the ROC curve of all models on the train dataset**"""
986
+
987
+ from sklearn.model_selection import cross_val_predict
988
+ from sklearn.metrics import roc_curve
989
+ roc_curve_rates = []
990
+ for model in [model3, SVM_final_model, model]: #models are 'Logistic Regression', 'RandomForestClassifier', 'SVC'
991
+ #finds the predicted probability for the sets and model
992
+ predict_probability = cross_val_predict(model, X_train_prepared, train_label_binary, cv= 10, method = "predict_proba")
993
+ #gets the probs for pos class
994
+ y_scorse = predict_probability[:,1]
995
+ #calculates the fpr and tpr with te scores
996
+ fpr, tpr, threshold = roc_curve(train_label_binary, y_scorse)
997
+ roc_curve_rates.append({'fpr': fpr, 'tpr': tpr})
998
+
999
+
1000
+ #Takes the dics array and plots each line on the same graph
1001
+ line_names = ['Logistic Regression', 'RandomForestClassifier', 'SVC']
1002
+ plt.plot(fpr, tpr, linewidth=2)
1003
+ for i in range(len(roc_curve_rates)):
1004
+ plt.plot(roc_curve_rates[i]['fpr'], roc_curve_rates[i]['tpr'], linewidth=2, label=line_names[i])
1005
+ plt.xlim([0,1])
1006
+ plt.ylim([0,1])
1007
+ plt.plot([0,1], [0,1], "k--")
1008
+ plt.title('ROC curve')
1009
+ plt.xlabel('False Positive Rate (1 - specifity)')
1010
+ plt.ylabel('True Positive Rate (Recall)')
1011
+ plt.legend(loc ="lower right")
1012
+ plt.grid()
1013
+ plt.show()
1014
+
1015
+ """**Optimizing the Neural Network**"""
1016
+
1017
+ # Parameters to check
1018
+ number_of_layers = [3, 4, 5, 6, 7]
1019
+ number_of_neurons = [10, 100, 100, 5000]
1020
+
1021
+ # Variables for saving data
1022
+ best_epoch = [[]];
1023
+ best_accuracy = [[]];
1024
+ i = 0;
1025
+
1026
+ # Add early stopping into model training
1027
+ from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
1028
+ keras_callbacks = [
1029
+ EarlyStopping(monitor='val_loss', patience=5, mode='min', min_delta=0.0001),
1030
+ ]
1031
+
1032
+ # Loop through all parameters
1033
+ for layers in number_of_layers:
1034
+ for neurons in number_of_neurons:
1035
+ print("Testing NN - Layers: "+ str(layers) + "; Neurons per layer:" + str(neurons))
1036
+ NN_model = build_NN(layers, neurons)
1037
+ train_history = NN_model.fit(X_train_prepared, train_label_binary.values.ravel(), validation_data=(X_val_prepared,validation_label_binary.values.ravel()), batch_size=128, epochs = 30, callbacks=keras_callbacks)
1038
+ # Using validation accuracy as performance metric
1039
+ accuracy = train_history.history['val_accuracy']
1040
+ best_accuracy[i].append(max(accuracy))
1041
+ best_epoch[i].append(accuracy.index(max(accuracy)))
1042
+ i = i + 1;
1043
+ best_epoch.append([])
1044
+ best_accuracy.append([])
1045
+
1046
+ # Remove last element
1047
+ best_epoch.pop(i)
1048
+ best_accuracy.pop(i)
1049
+
1050
+ # Build model with best parameters
1051
+ ideal_layers_index = best_accuracy.index(max(best_accuracy))
1052
+ ideal_layers = number_of_layers[ideal_layers_index]
1053
+ ideal_neurons = number_of_neurons[best_accuracy[ideal_layers_index].index(max(best_accuracy[ideal_layers_index]))]
1054
+
1055
+ # Print Results
1056
+ print("Best number of layers:", str(ideal_layers))
1057
+ print("Best number of neurons:", str(ideal_neurons))
1058
+
1059
+ """## **Evaluate all the models on the Test Set**
1060
+
1061
+
1062
+ """
1063
+
1064
+ #Logistic Regression
1065
+ logistic_regression_final_model = LogisticRegression(solver ='liblinear', **best_model.best_params_)
1066
+ logistic_regression_final_model.fit(X_train_prepared, train_label_binary )
1067
+ logistic_prediction_test= logistic_regression_final_model.predict(X_test_prepared)
1068
+ logistic_prediction_test
1069
+
1070
+ #Support Vector Machine
1071
+ SVM_final_model = SVC(C=0.1, kernel= 'linear', gamma = 'scale', probability=True)
1072
+ SVM_final_model.fit(X_train_prepared, train_label_binary)
1073
+ svm_prediction_test= SVM_final_model.predict(X_test_prepared)
1074
+ svm_prediction_test
1075
+
1076
+ # Random Forest Classifier
1077
+ random_forest_final_model = RandomForestClassifier(n_estimators = 400, max_features= 'sqrt')
1078
+ random_forest_final_model.fit(X_train_prepared, train_label_binary)
1079
+ random_forest_prediction_test= random_forest_final_model.predict(X_test_prepared)
1080
+ random_forest_prediction_test
1081
+
1082
+ # Neural Network
1083
+ keras_callbacks = [
1084
+ EarlyStopping(monitor='val_loss', patience=10, mode='min', min_delta=0.0001),
1085
+ ModelCheckpoint('./checkmodel.h5', monitor='val_loss', save_best_only=True, mode='min')
1086
+ ]
1087
+ NN_final_model = build_NN(ideal_layers, ideal_neurons, dropout=0.15)
1088
+ NN_final_model.fit(X_train_prepared, train_label_binary, validation_data=(X_val_prepared,validation_label_binary.values.ravel()), batch_size=128, epochs = 30, callbacks=keras_callbacks)
1089
+ NN_prediction_test= NN_final_model.predict(X_test_prepared)
1090
+
1091
+ # Prepare prediction to be comparable
1092
+ NN_prediction_test = (NN_prediction_test >= NN_threshold)
1093
+
1094
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

# Test-set predictions from each final model, with matching display names.
predictions = [logistic_prediction_test,svm_prediction_test, random_forest_prediction_test, NN_prediction_test]
names = ["Logistic_regression_test","Support_vector_machine_test", "Random_forest_test", "Neural_net_test"]

# Score every model against the held-out test labels.
accuracy = [accuracy_score(test_label_binary, p) for p in predictions]
precision = [precision_score(test_label_binary, p) for p in predictions]
recall = [recall_score(test_label_binary, p) for p in predictions]
ROC = [roc_auc_score(test_label_binary, p) for p in predictions]
F1 = [f1_score(test_label_binary, p) for p in predictions]

# Assemble a per-model summary table for display.
data3 = {'Method': names,
         'Accuracy': accuracy,
         'Precision': precision,
         'Recall': recall,
         'ROC': ROC,
         'F1 score': F1,
         }
evaluation = pd.DataFrame(data3, columns=['Method', "Accuracy", "Precision","Recall", "ROC", "F1 score"])
evaluation
1123
+
1124
+ """**Trade-off between precision and recall** **for** :
1125
+
1126
+ 1. Logistic Regression
1127
+ 2. Support Vector Machine
1128
+ 1. Random Forest
1129
+
1130
+
1131
+
1132
+
1133
+
1134
+ """
1135
+
1136
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Probability of the positive class from logistic regression on the test set.
y_score = logistic_regression_final_model.predict_proba(X_test_prepared)[:, 1]

#calculate precision and recall across all decision thresholds
precision, recall, thresholds = precision_recall_curve(test_label_binary, y_score)
#create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='red')
#add axis labels to plot
ax.set_title('Precision-Recall Curve for Logistic Regression')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')

#display plot
plt.grid(True)
plt.show()
1154
+
1155
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Probability of the positive class from the random forest on the test set.
y_score = random_forest_final_model.predict_proba(X_test_prepared)[:, 1]

#calculate precision and recall across all decision thresholds
precision, recall, thresholds = precision_recall_curve(test_label_binary, y_score)
#create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='blue')
# BUG FIX: this plot uses random_forest_final_model, but the title previously
# said 'Support Vector Machine'.
ax.set_title('Precision-Recall Curve for Random Forest')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')

#display plot
plt.grid(True)
plt.show()
1173
+
1174
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Probability of the positive class from the SVM on the test set.
y_score = SVM_final_model.predict_proba(X_test_prepared)[:, 1]

#calculate precision and recall across all decision thresholds
precision, recall, thresholds = precision_recall_curve(test_label_binary, y_score)
#create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='purple')
# BUG FIX: this plot uses SVM_final_model, but the title previously said
# 'Random Forest Model'.
ax.set_title('Precision-Recall Curve for Support Vector Machine')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')

#display plot
plt.grid(True)
plt.show()
1192
+
1193
+ """**Drawing the ROC curve of all models on the test dataset**"""
1194
+
1195
+ from sklearn.model_selection import cross_val_predict
1196
+ from sklearn.metrics import roc_curve
1197
+ roc_curve_rates = []
1198
+ for model in [logistic_regression_final_model, random_forest_final_model,SVM_final_model]: #models are 'Logistic Regression', 'RandomForestClassifier', 'SVC'
1199
+ #finds the predicted probability for the sets and model
1200
+ predict_probability = cross_val_predict(logistic_regression_final_model, X_test_prepared, test_label_binary, cv= 10, method = "predict_proba")
1201
+ #gets the probs for pos class
1202
+ y_scorse = predict_probability[:,1]
1203
+ #calculates the fpr and tpr with te scores
1204
+ fpr, tpr, threshold = roc_curve(test_label_binary, y_scorse)
1205
+ roc_curve_rates.append({'fpr': fpr, 'tpr': tpr})
1206
+
1207
+ #Takes the dics array and plots each line on the same graph
1208
+ line_names = ['Logistic Regression', 'RandomForestClassifier', 'SVC']
1209
+ plt.plot(fpr, tpr, linewidth=2)
1210
+ for i in range(len(roc_curve_rates)):
1211
+ plt.plot(roc_curve_rates[i]['fpr'], roc_curve_rates[i]['tpr'], linewidth=2, label=line_names[i])
1212
+ plt.xlim([0,1])
1213
+ plt.ylim([0,1])
1214
+ plt.plot([0,1], [0,1], "k--")
1215
+ plt.title('ROC curve')
1216
+ plt.xlabel('False Positive Rate (1 - specifity)')
1217
+ plt.ylabel('True Positive Rate (Recall)')
1218
+ plt.legend(loc ="lower right")
1219
+ plt.grid()
1220
+ plt.show()
1221
+
1222
+ """#**PART III ----> Gradio Implementation**
1223
+
1224
+
1225
+
1226
+
1227
+ """
1228
+
1229
+ # Install Gradio
1230
+ !pip install --quiet gradio
1231
+
1232
+ # Import Gradio Library
1233
+ import gradio as gr
1234
+
1235
# Define callback function
def warfarin_callback(age, height, weight, gender, race, diabetes, medication, Cyp2C9, VKORC1, INR, model):
    """Gradio callback: predict the Warfarin dose class for one patient.

    Builds a one-row DataFrame matching the training schema, runs it through
    the shared preprocessing pipeline, and classifies it with the selected
    model. Returns a human-readable dose recommendation, or a validation
    message when a required input is missing.
    """
    # Input validation
    if not gender:
        return "Please select the patient's gender"
    if not race:
        return "Please select the patient's race"

    # Extract medication flags from the checkbox group.
    simvastatin = 0.0
    amiodarone = 0.0
    if 'Simvastatin (Zocor)' in medication: simvastatin = 1.0
    if 'Amiodarone (Cordarone)' in medication: amiodarone = 1.0

    # Categorize age into the dataset's decade buckets.
    # BUG FIX: clamp the index — age 100 (the slider maximum) previously
    # produced index 9 on a 9-element list and raised IndexError; such ages
    # now fall into the '90+' bucket.
    age_categories = ['10 - 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '60 - 69', '70 - 79', '80 - 89', '90+']
    age_index = min(int(np.floor(age / 10)) - 1, len(age_categories) - 1)
    age_category = age_categories[age_index]

    # Column order/names must match the training data exactly.
    input_df = pd.DataFrame([[gender.lower(), race, age_category, height, weight, float(diabetes), simvastatin, amiodarone, 0.0, INR, Cyp2C9, VKORC1]], columns=["Gender", "Race (Reported)", "Age", "Height (cm)", "Weight (kg)", "Diabetes", "Simvastatin (Zocor)", "Amiodarone (Cordarone)", "Target INR", "INR on Reported Therapeutic Dose of Warfarin", "Cyp2C9 genotypes", "VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T"])
    preprocessed_input_df = full_preprocess_function(input_df)

    # Model Selection
    if model == "Logistic Regression":
        prediction = logistic_regression_final_model.predict(preprocessed_input_df)
    elif model == "Support Vector Machine":
        prediction = SVM_final_model.predict(preprocessed_input_df)
    elif model == "Random Forest":
        prediction = random_forest_final_model.predict(preprocessed_input_df)
    elif model == "Neural Network":
        prediction = NN_final_model.predict(preprocessed_input_df)
        # CONSISTENCY FIX: binarize with >= to match the evaluation code
        # elsewhere in the file (the original used a strict > here).
        prediction = prediction >= NN_threshold
    else:
        return "Please select a Machine Learning Model"

    if prediction:
        return "The recommended Warfarin Dose is >30mg"
    else:
        return "The recommended Warfarin Dose is <=30mg"
1273
+
1274
# Define output module as Warfarin dose
output_dose = gr.Textbox(label = "Warfarin Dose")

# Define all input modules
# NOTE(review): `default=` is the legacy Gradio 2.x keyword; newer Gradio
# releases use `value=` — confirm against the pinned gradio version.
input_age = gr.Slider(10, 100, step=1, label = "Age", default=30)
input_height = gr.Number(label = "Height (cm)")
input_weight = gr.Number(label = "Weight (kg)")
input_gender = gr.Radio(choices=["Male", "Female"], label = "Gender")
input_race = gr.Dropdown(choices=['Asian', 'Black or African American', 'Caucasian', 'Chinese', 'Han Chinese', 'Hispanic', 'Indian', 'Intermediate', 'Japanese', 'Korean', 'Malay', 'Other','Other Mixed Race', 'Unspecified', 'White'], label = "Race")
input_diabetes = gr.Checkbox(label = "Is the patient Diabetic?")
input_medication = gr.CheckboxGroup(["Simvastatin (Zocor)", "Amiodarone (Cordarone)"], label = "Is the patient taking any of the following medication?")
input_Cyp269 = gr.Dropdown(['*1/*1', '*1/*11', '*1/*13', '*1/*14', '*1/*2', '*1/*3', '*1/*5', '*1/*6', '*2/*2', '*2/*3', '*3/*3'], label = "Cyp2C9 genotype")
input_VKORC1 = gr.Dropdown(['A/A', 'A/G', 'G/G', 'Unknown'], label = "VKORC1 genotype")
input_INR = gr.Slider(1, 5, step=0.01, label = "INR on Reported Therapeutic Dose of Warfarin", default=2.45)
input_model = gr.Dropdown(choices=["Logistic Regression", "Support Vector Machine", "Random Forest", "Neural Network" ], label = "Machine Learning Model")

# Wire the inputs/output into the Gradio interface and launch the app.
# Input order here must match warfarin_callback's parameter order.
gr.Interface(fn=warfarin_callback, inputs=[input_age, input_height, input_weight,input_gender, input_race, input_diabetes, input_medication, input_Cyp269, input_VKORC1, input_INR, input_model], outputs=output_dose).launch(debug=False)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
tensorflow
numpy
matplotlib
scikit-learn
pandas
joblib
gradio
opencv-python