theresaschneider committed on
Commit
4f6fa63
1 Parent(s): 7320efe

feat: add ML_final_project files to the repository

Browse files
Files changed (2) hide show
  1. app.py +1290 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,1290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Copy of finalProjectDaniel
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1Xmu0qEBPBWsUKnRKtCsUn2mmP6R5tkZQ
8
+
9
+ # Importing libraries
10
+ """
11
+
12
+ ## Basic imports
13
+ import matplotlib.pyplot as plt
14
+ import pandas as pd
15
+ import numpy as np
16
+ import tensorflow as tf
17
+
18
+ ## Specific imports
19
+ from sklearn.model_selection import train_test_split
20
+ from sklearn.model_selection import StratifiedShuffleSplit
21
+ from sklearn.impute import SimpleImputer
22
+ from sklearn.preprocessing import OneHotEncoder
23
+ from sklearn import preprocessing
24
+ from sklearn.base import BaseEstimator, TransformerMixin
25
+ from sklearn.pipeline import Pipeline
26
+ from sklearn.preprocessing import StandardScaler
27
+ from sklearn.compose import ColumnTransformer
28
+ from sklearn.preprocessing import OrdinalEncoder
29
+ from sklearn.linear_model import LinearRegression
30
+
31
+ """#Loading the DataSet and Reducing the Features of Interest"""
32
+
33
+ # from google.colab import drive
34
+ # drive.mount('/content/drive/', force_remount=True)
35
+ # !ls /content/drive/MyDrive/FALL2022/Warfarin_Dose_Prediction_Dataset.xls
36
+
37
+ ## for theresa to run it
38
+ from google.colab import drive
39
+ drive.mount('/content/drive/')
40
+ !ls /content/drive/MyDrive/Machine Learning/data_final_project.csv
41
+
42
+ !pip install --upgrade xlrd
43
+
44
+ import pandas as pd
45
+ # original_df = pd.read_excel('/content/drive/MyDrive/FALL2022/Warfarin_Dose_Prediction_Dataset.xls')
46
+
47
+ # for theresa to run it
48
+ original_df = pd.read_csv('/content/drive/MyDrive/Machine Learning/data_final_project.csv', sep=',')
49
+
50
+ original_df.info()
51
+
52
+ patients = original_df[['Gender','Race (Reported)', 'Age', 'Height (cm)', 'Weight (kg)', 'Diabetes', 'Simvastatin (Zocor)', 'Amiodarone (Cordarone)',
53
+ 'Target INR', 'INR on Reported Therapeutic Dose of Warfarin', 'Cyp2C9 genotypes',
54
+ 'VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T', 'Therapeutic Dose of Warfarin']].copy()
55
+
56
+ patients.head(n=5)
57
+
58
+ patients.describe()
59
+
60
+ patients.info()
61
+
62
+ # patients.to_excel("patients_df_reduced.xlsx")
63
+
64
+ """# Setting aside a validation set right away
65
+ separates dataset into patients_df (95%) and validation_set (5%)
66
+ """
67
+
68
+ from sklearn.model_selection import train_test_split
69
+
70
+ patients_df, validation_set = train_test_split(patients, test_size=0.05, random_state=42)
71
+
72
+ """# Visualizing Data Features and Correlations on whole dataset (minus validation set)
73
+
74
+ ###Looking at Numerical Data (note that some of these are numerical catagorical but are entered as 0 or 1)
75
+ """
76
+
77
+ # Commented out IPython magic to ensure Python compatibility.
78
+ # %matplotlib inline
79
+ patients_df.hist(bins=50, figsize=(20,15))
80
+ plt.show()
81
+
82
+ corr_matrix = patients_df.corr()
83
+ corr_matrix["Therapeutic Dose of Warfarin"].sort_values(ascending=False)
84
+
85
+ # note that Target INR and INR on Reported Therapeutic Dose of Warfarin are linearly related. Target INR has so few values that I will remove it as part of pre-processing
86
+ corr_matrix["Target INR"].sort_values(ascending=False)
87
+
88
+ """### Looking at Catagorical Text Data (Use these catagories for gradio implementation later)"""
89
+
90
+ patients_df['Gender'].value_counts()
91
+
92
+ patients_df['Age'].value_counts()
93
+
94
+ patients_df['Race (Reported)'].value_counts()
95
+
96
+ patients_df['Target INR'].value_counts()
97
+
98
+ patients_df['Diabetes'].value_counts()
99
+
100
+ patients_df['Simvastatin (Zocor)'].value_counts()
101
+
102
+ patients_df['Amiodarone (Cordarone)'].value_counts()
103
+
104
+ patients_df['Cyp2C9 genotypes'].value_counts()
105
+
106
+ patients_df['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'].value_counts()
107
+
108
+ """# Dropping any rows that have Nan in the target column ON WHOLE DATASET"""
109
+
110
+ # Dropping any rows that have Nan in the target column
111
+ patients_df.dropna(subset=['Therapeutic Dose of Warfarin'], inplace=True)
112
+ patients_df.info()
113
+
114
+ """# Dividing Data into Statified Train (80%) and Test Set (20%)
115
+ This includes minimal pre-processing of gender and weight on the full dataset that was necessary for the statified sampling based on weight
116
+ Test and Train Sets with features and labels are stored in ;'strat_train_set' and 'strat_test_set'
117
+
118
+ patients_df -> strat_train_set, strat_test_set
119
+
120
+ ### Perform Statified Sampling based on Weight (Chapter 2 Pages 54-55)
121
+
122
+ ### Dropping Rows with Nan Gender Columns (since there are only 4 of them) -- NEED TO DO BEFORE STAT SAMPLING IN THIS CASE
123
+ """
124
+
125
+ patients_df.dropna(subset=['Gender'], inplace=True)
126
+
127
+ """#### Replacing Nan values in weight group with median based on Gender as is needed to perform statified sampling for the weight group"""
128
+
129
+ ## looking at median female weight
130
+ median_female_weight=patients_df.loc[patients_df['Gender'] == 'female', 'Weight (kg)'].median()
131
+ median_female_weight
132
+
133
+ ## looking at median male weight
134
+ median_male_weight=patients_df.loc[patients_df['Gender'] == 'male', 'Weight (kg)'].median()
135
+ median_male_weight
136
+
137
+ ## filling in null weight values on full dataset
138
+ medians = patients_df.groupby(['Gender'])['Weight (kg)'].median()
139
+ patients_df = patients_df.set_index(['Gender'])
140
+ patients_df['Weight (kg)'] = patients_df['Weight (kg)'].fillna(medians)
141
+ patients_df = patients_df.reset_index()
142
+
143
+ patients_df['Weight (kg)'].isna().sum()
144
+
145
+ """#### Creating Weight Catagories from which the test set will sample from"""
146
+
147
+ patients_df["weight_cat"] = pd.cut(patients_df["Weight (kg)"], bins=[0, 50, 75, 100, np.inf],
148
+ labels=[1, 2, 3, 4])
149
+ patients_df["weight_cat"].hist()
150
+
151
+ """#### Dividing patients_df into strat_train_set (80%) and strat_test_set (20%) distribution"""
152
+
153
+ from sklearn.model_selection import StratifiedShuffleSplit
154
+ split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
155
+ for train_index, test_index in split.split(patients_df, patients_df["weight_cat"]):
156
+ strat_train_set = patients_df.loc[train_index]
157
+ strat_test_set = patients_df.loc[test_index]
158
+
159
+ """#### Comparing proportion of samples per weight catagory between test set and original dataset
160
+ #####(distrbutions are the same, showing that the stratified sampling worked)
161
+ """
162
+
163
+ strat_test_set["weight_cat"].value_counts() / len(strat_test_set)
164
+
165
+ for set_ in (strat_train_set, strat_test_set):
166
+ set_.drop("weight_cat", axis=1, inplace=True)
167
+
168
+ patients_df["weight_cat"].value_counts() / len(patients_df)
169
+
170
+ """## Visualizing Training Set Features and Visualizing Effects of Pre-processing Steps
171
+ ##### (height, weight, and some catagorical variables)--nothing permanent done here--all incorporated into transformer later
172
+
173
+ ### Visualizing Outliers in Weight Class
174
+ ##### (not getting rid of outliers as they represent natural variation)
175
+ """
176
+
177
+ # reference: https://statisticsbyjim.com/basics/remove-outliers/
178
+
179
+ strat_train_set.boxplot(column='Weight (kg)')
180
+
181
+ strat_train_set[['Weight (kg)']].describe()
182
+ # note that the high of 237.7 kg represents around 522 lbs which is plausible to see in a population
183
+
184
+ """### Visualizing method for replacing Nan values in height group with median based on Gender """
185
+
186
+ # souce 1: https://stackoverflow.com/questions/33457191/python-pandas-dataframe-fill-nans-with-a-conditional-mean
187
+ # source 2: https://www.statology.org/conditional-mean-pandas/
188
+
189
+ median_female_height=strat_train_set.loc[strat_train_set['Gender'] == 'female', 'Height (cm)'].median()
190
+ median_female_height
191
+
192
+ median_male_height=strat_train_set.loc[strat_train_set['Gender'] == 'male', 'Height (cm)'].median()
193
+ median_male_height
194
+
195
+ """##### Copy of Strat_train_set created for testing purposes"""
196
+
197
+ strat_train_set_copy = strat_train_set.copy()
198
+
199
+ strat_train_set_copy.head(n=3)
200
+
201
+ # getting gender specific medians for training set
202
+ medians = strat_train_set_copy.groupby(['Gender'])['Height (cm)'].median()
203
+
204
+ # performing test transformation of null Height values based on gender
205
+ strat_train_set_copy = strat_train_set_copy.set_index(['Gender'])
206
+ strat_train_set_copy['Height (cm)'] = strat_train_set_copy['Height (cm)'].fillna(medians)
207
+ strat_train_set_copy = strat_train_set_copy.reset_index()
208
+
209
+ strat_train_set_copy.head(n=3)
210
+
211
+ """### Visualizing Race Distribution
212
+ #### Includes Visualization of pre-processing steps implemented later
213
+ """
214
+
215
+ # craete copy for testing purposes
216
+ strat_train_set_copy = strat_train_set.copy()
217
+
218
+ # visualizing original race feature
219
+ strat_train_set_copy['Race (Reported)'].value_counts().plot(kind='bar')
220
+
221
+ strat_train_set_copy['Race (Reported)'] = strat_train_set_copy['Race (Reported)'].fillna("UNSPECIFIED") # full null with UNSPECIFIED
222
+ strat_train_set_copy['Race (Reported)'] = strat_train_set_copy['Race (Reported)'].str.upper() # uppercase all catagories
223
+
224
+ # remove redundancy
225
+ strat_train_set_copy = strat_train_set_copy.replace({'Race (Reported)': {'AFRICAN-AMERICAN': 'BLACK OR AFRICAN AMERICAN', 'BLACK': 'BLACK OR AFRICAN AMERICAN'}})
226
+
227
+ # visualizing the race feature after pre-processing
228
+ strat_train_set_copy['Race (Reported)'].value_counts().plot(kind='bar')
229
+ strat_train_set['Race (Reported)'].isna().sum()
230
+
231
+ """### Visualizing Age Distribution
232
+ #### Replace Age Nan Values with mode from train set in pipeline
233
+ """
234
+
235
+ # visualizing age dataset before pre-processing
236
+ strat_train_set['Age'].value_counts().plot(kind='bar')
237
+ strat_train_set['Age'].isna().sum()
238
+
239
+ """### Visualizing Diabetes Distribution
240
+ #### Replace Diabetes Nan Values with mode from train set in pipeline
241
+ """
242
+
243
+ # visualizing diabetes training set before pre-processing
244
+ strat_train_set['Diabetes'].value_counts().plot(kind='bar')
245
+ strat_train_set['Diabetes'].isna().sum()
246
+
247
+ """### Visualizing Simvastatin Distribution
248
+ #### Replace Simvastatin Nan Values with mode from train set in pipeline
249
+ """
250
+
251
+ strat_train_set['Simvastatin (Zocor)'].value_counts().plot(kind='bar')
252
+ strat_train_set['Simvastatin (Zocor)'].isna().sum()
253
+
254
+ """### Visualizing Amiodarone Distribution
255
+ #### Replace Amiodarone Nan Values with mode from train set in pipeline
256
+ """
257
+
258
+ strat_train_set['Amiodarone (Cordarone)'].value_counts().plot(kind='bar')
259
+ strat_train_set['Amiodarone (Cordarone)'].isna().sum()
260
+
261
+ """### Visualizing Cyp2C9 Distribution
262
+ #### Includes Visualization of Pre-processing steps implemented later
263
+ #### Replace Cyp2C9 Nan Values with mode from train set in pipeline
264
+ """
265
+
266
+ strat_train_set_copy = strat_train_set.copy()
267
+
268
+ strat_train_set_copy['Cyp2C9 genotypes'].value_counts()
269
+
270
+ strat_train_set_copy['Cyp2C9 genotypes'].isna().sum()
271
+
272
+ strat_train_set_copy['Cyp2C9 genotypes'].value_counts().plot(kind='bar')
273
+
274
+ strat_train_set_copy['Cyp2C9 genotypes'] = strat_train_set_copy['Cyp2C9 genotypes'].fillna(strat_train_set_copy['Cyp2C9 genotypes'].mode()[0])
275
+
276
+ strat_train_set['Cyp2C9 genotypes'].mode()[0]
277
+
278
+ strat_train_set_copy['Cyp2C9 genotypes'].value_counts()
279
+
280
+ strat_train_set_copy['Cyp2C9 genotypes'].value_counts().plot(kind='bar')
281
+
282
+ """### Visualizing VKORC1 genotype
283
+ #### Includes Visualization of Pre-processing Steps Implemented Later
284
+ #### Replacing VKORC1 genotype Nan Values with 'Unknown' since there is no obvious mode (creates new catagory)
285
+ """
286
+
287
+ strat_train_set_copy = strat_train_set.copy()
288
+
289
+ strat_train_set_copy['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'].value_counts().plot(kind='bar')
290
+
291
+ strat_train_set_copy['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'].isna().sum()
292
+
293
+ # filling null values with 'Unknown'
294
+ strat_train_set_copy['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T']=strat_train_set_copy['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'].fillna("Unknown")
295
+
296
+ strat_train_set_copy['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'].value_counts().plot(kind='bar')
297
+
298
+ """## Separate the data from the labels in training set:
299
+ ##### strat_train_set -> patients_info and patients_labels
300
+ """
301
+
302
+ patients_info = strat_train_set.drop("Therapeutic Dose of Warfarin", axis=1) # drop labels for training set
303
+ patients_labels = strat_train_set["Therapeutic Dose of Warfarin"].copy()
304
+
305
+ """## Custom Transformers for Pre-processing (Important Part)
306
+ ##### reference: Chapter 2 Textbook associated google collab notebook
307
+
308
+ ##### creating a custom transformer to handle catagorical attributes Nan Values:
309
+ ##### includes Gender, Cyp2C9 genotypes, VKORC1 genotype, Diabetes, Amiodarone, Simvastatin, Race, Age
310
+ """
311
+
312
+ from sklearn.base import BaseEstimator, TransformerMixin
313
+
314
class CatTransformer(BaseEstimator, TransformerMixin):
    """Impute missing values in the categorical feature columns.

    Strategy (modes are learned from the training set in ``fit``):
      * Gender, Cyp2C9 genotypes, Amiodarone (Cordarone),
        Simvastatin (Zocor), Diabetes, Age -> filled with the column mode.
      * VKORC1 genotype -> filled with the literal "Unknown"; there is no
        dominant class, so a new category is created instead.
      * Race (Reported) -> NaN becomes "UNSPECIFIED", all labels are
        upper-cased so differently-cased spellings collapse together, and
        the redundant labels 'AFRICAN-AMERICAN' / 'BLACK' are merged into
        'BLACK OR AFRICAN AMERICAN'.
    """

    # The VKORC1 column name is long; keep it in one place so fit and
    # transform cannot drift apart.
    _VKORC1 = 'VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'

    def fit(self, X, y=None):
        """Learn the per-column modes from the training data; returns self."""
        self.mode_Gen = X['Gender'].mode()[0]
        self.mode_Cyp = X['Cyp2C9 genotypes'].mode()[0]
        self.mode_Amio = X['Amiodarone (Cordarone)'].mode()[0]
        self.mode_Simv = X['Simvastatin (Zocor)'].mode()[0]
        self.mode_Diab = X['Diabetes'].mode()[0]
        self.mode_Age = X['Age'].mode()[0]
        return self

    def transform(self, X):
        """Return a NaN-imputed copy of ``X`` (the input is not mutated)."""
        # Fix: the original filled the columns in place, silently mutating
        # the caller's DataFrame; sklearn transformers should not modify
        # their input, so work on a copy.
        X = X.copy()
        X['Gender'] = X['Gender'].fillna(self.mode_Gen)
        X['Cyp2C9 genotypes'] = X['Cyp2C9 genotypes'].fillna(self.mode_Cyp)
        X[self._VKORC1] = X[self._VKORC1].fillna("Unknown")
        X['Amiodarone (Cordarone)'] = X['Amiodarone (Cordarone)'].fillna(self.mode_Amio)
        X['Simvastatin (Zocor)'] = X['Simvastatin (Zocor)'].fillna(self.mode_Simv)
        X['Diabetes'] = X['Diabetes'].fillna(self.mode_Diab)
        X['Age'] = X['Age'].fillna(self.mode_Age)
        X['Race (Reported)'] = X['Race (Reported)'].fillna("UNSPECIFIED").str.upper()
        X = X.replace({'Race (Reported)': {'AFRICAN-AMERICAN': 'BLACK OR AFRICAN AMERICAN',
                                           'BLACK': 'BLACK OR AFRICAN AMERICAN'}})
        return X
350
+
351
+ """##### creating a custom transformer to handle the transformation of height nan variables based on gender-depenedent median"""
352
+
353
+ from sklearn.base import BaseEstimator, TransformerMixin
354
+
355
class GenderTransformer(BaseEstimator, TransformerMixin):
    """Impute missing Height (cm) and Weight (kg) with the median value
    of the patient's gender, as computed on the training set in ``fit``.
    """

    def fit(self, X, y=None):
        # Per-gender medians, learned from the training data only.
        self.medians_height = X.groupby('Gender')['Height (cm)'].median()
        self.medians_weight = X.groupby('Gender')['Weight (kg)'].median()
        return self

    def transform(self, X):
        # Fix: the original mutated the caller's DataFrame in place and,
        # through its set_index('Gender')/reset_index() round-trip, moved
        # 'Gender' to the first column. Mapping each row's gender onto its
        # learned median preserves both the input frame and the original
        # column order (downstream selection is by column name either way).
        X = X.copy()
        X['Height (cm)'] = X['Height (cm)'].fillna(X['Gender'].map(self.medians_height))
        X['Weight (kg)'] = X['Weight (kg)'].fillna(X['Gender'].map(self.medians_weight))
        return X
372
+
373
+ """##### creating a custom transformer to add extra attributes (BMI, BSA):"""
374
+
375
+ from sklearn.base import BaseEstimator, TransformerMixin
376
+
377
# Column order of the numeric matrix entering this transformer, as fixed by
# `num_attribs` further down the file: index 0 = Height (cm), 1 = Weight (kg).
col_names = ["Height (cm)", "Weight (kg)"]
# Fix: the original assigned `weight_ix, height_ix = [0, 1]`, which swapped
# the two indices (column 0 is Height, column 1 is Weight), so BSA was
# computed with height and weight exchanged in the DuBois formula.
height_ix, weight_ix = 0, 1


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append Body Surface Area (BSA) as an extra column.

    Expects a 2-D numpy array whose first two columns are Height (cm) and
    Weight (kg); ``transform`` returns the array with one extra column

        BSA = 0.007184 * weight_kg**0.425 * height_cm**0.725

    (DuBois and DuBois formula).
    reference: https://www.uptodate.com/contents/image?imageKey=ONC%2F96451&topicKey=ONC%2F83810&search=Pharmacogenomics&rank=3~18&source=see_link
    """

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn

    def transform(self, X):
        bsa = 0.007184 * X[:, weight_ix] ** 0.425 * X[:, height_ix] ** 0.725
        return np.c_[X, bsa]
395
+
396
+ """#### Working Transformer Pipelines
397
+
398
+ ##### pipeline for dealing with missing height and weight values
399
+ """
400
+
401
+ from sklearn.pipeline import Pipeline
402
+ from sklearn.preprocessing import StandardScaler
403
+
404
+ gender_pipeline = Pipeline([
405
+ ('gender_transformer', GenderTransformer()),
406
+ ])
407
+
408
+ """##### pipeline for dealing with catagorical data nan values"""
409
+
410
+ from sklearn.pipeline import Pipeline
411
+ from sklearn.preprocessing import StandardScaler
412
+
413
+ cat_pipeline = Pipeline([
414
+ ('catagorical_transformer', CatTransformer()),
415
+ ])
416
+
417
+ """##### pipeline for dealing with numerical data: height, weight, INR
418
+ ##### uses CombinedAttributeAdder class for the addition of BSA (or BMI)
419
+ ##### uses SimpleImputer to replace any remaining Nan values with the median for that feature
420
+ ##### uses StandardScaler for scaling
421
+ """
422
+
423
+ from sklearn.pipeline import Pipeline
424
+ from sklearn.preprocessing import StandardScaler
425
+
426
+ num_pipeline = Pipeline([
427
+ ('imputer', SimpleImputer(strategy="median")),
428
+ ('attribs_adder', CombinedAttributesAdder()),
429
+ ('std_scaler', StandardScaler()),
430
+ ])
431
+
432
+ """##### full pipuline using ColumnTransformer
433
+ ##### Adds Attributes (from num_pipeline), Scales and imputes numerical data (from num_pipeline), Uses ordinal encoder for Ordinal Catagorical Data (Age), Uses 1Hot Encoder for non-ordinal Catagorical Data
434
+ """
435
+
436
+ from sklearn.compose import ColumnTransformer
437
+ from sklearn.preprocessing import OrdinalEncoder
438
+
439
+ num_attribs = ['Height (cm)', 'Weight (kg)', 'INR on Reported Therapeutic Dose of Warfarin']
440
+ cat_attribs_ordinal = ['Age', 'Gender', 'Diabetes', 'Simvastatin (Zocor)', 'Amiodarone (Cordarone)']
441
+ cat_attribs_1hot = ["Race (Reported)",
442
+ 'Cyp2C9 genotypes', 'VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T']
443
+ """
444
+ Pipeline using column transformer
445
+ Adds BSA attribute (from num_pipeline)
446
+ imputes remaining nan numerical data using median (from num_pipeline)
447
+ scales numerical data using StandardScaler (from num_pipeline)
448
+ Uses ordinal encoder for Ordinal Catagorical Data (Age) and Binary Catagorical Data (gender, diabetes, simvastatin, amiodorone)--see cat_attrib_ordinal
449
+ Uses 1Hot Encoder for non-ordinal Catagorical Data--see cat_attribs_1hot
450
+ """
451
+ scale_encode_pipeline = ColumnTransformer([
452
+ ("num", num_pipeline, num_attribs),
453
+ ('cat_ord', OrdinalEncoder(), cat_attribs_ordinal),
454
+ ("cat_1hot", OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_attribs_1hot),
455
+ ]) #input list of (name, transformer, columns) tuples specifying the transformer objects to be applied to subsets of the data.
456
+
457
+ """## Full PreProcess Function to incorporate all pipelines
458
+ ##### contains "full_preprocess_function()"
459
+ """
460
+
461
def series_to_df(data_series):
    """Turn a single patient record into pipeline-ready tabular form.

    Helper for feeding new data (e.g. from the Gradio front end) into
    ``full_preprocess_function``: takes a Series of shape (12,) whose index
    holds the feature names and returns a one-row DataFrame with those
    feature names as column names.
    """
    return data_series.to_frame().T
470
+
471
def full_preprocess_function(data_df, train=False):
    """Run the full pre-processing pipeline over raw patient rows.

    Parameters
    ----------
    data_df : pd.DataFrame
        One or more raw (non pre-processed, unlabelled) instances carrying
        the 12 expected feature columns: Gender, Race (Reported), Age,
        Height (cm), Weight (kg), Diabetes, Simvastatin (Zocor),
        Amiodarone (Cordarone), Target INR,
        INR on Reported Therapeutic Dose of Warfarin, Cyp2C9 genotypes,
        VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T.
        Each entry is either a value or NaN.
    train : bool, default False
        When True, the pipelines are fit AND applied (training data).
        When False, the already-fitted pipelines are applied only.

    Returns
    -------
    pd.DataFrame
        The fully prepared feature matrix.

    Raises
    ------
    TypeError
        If ``data_df`` is not a DataFrame. A bare 12-element Series should
        first be converted via ``series_to_df()``.

    Notes
    -----
    'Target INR' is removed: it has too few values and is collinear with
    the reported INR. Ordinal encodings in the output:
    Gender 0=female/1=male; Diabetes, Simvastatin, Amiodarone 0=no/1=yes;
    Age {0:'10 - 19', 1:'20 - 29', 2:'30 - 39', 3:'40 - 49', 4:'50 - 59',
    5:'60 - 69', 6:'70 - 79', 7:'80 - 89', 8:'90+'}.
    """
    if isinstance(data_df, pd.Series) and data_df.shape == (12,):
        raise TypeError("Expects pd.DataFrame; Send your data through the series_to_df() function for conversion to proper format")
    if not isinstance(data_df, pd.DataFrame):
        raise TypeError("Expects pd.DataFrame; See full_preprocess function documentation for input expectations")

    # Fix: the original used drop(..., inplace=True), silently mutating the
    # caller's DataFrame and raising KeyError if the same frame was passed
    # twice. Dropping without inplace returns a new frame instead.
    data_df = data_df.drop(columns=['Target INR'])

    if train:  # fit on training data; transform-only for new/validation data
        data_cat_tr = cat_pipeline.fit_transform(data_df)
        data_height_tr = gender_pipeline.fit_transform(data_cat_tr)
        data_prepared = scale_encode_pipeline.fit_transform(data_height_tr)
    else:
        data_cat_tr = cat_pipeline.transform(data_df)
        data_height_tr = gender_pipeline.transform(data_cat_tr)
        data_prepared = scale_encode_pipeline.transform(data_height_tr)

    return pd.DataFrame(data_prepared)
512
+
513
+ """##### Example test input for full_preprocess_function()
514
+ ![example_input.PNG](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABwMAAACBCAYAAAAsaxXvAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAFiUAABYlAUlSJPAAAEysSURBVHhe7b2/axzX3sD9/ivqFlQYXBhSxJXFW0SkiEgjCEHgQhCIIBDEBSMSjHAjXBgRMOJCEOIGEQisIaDCIHjN3VsEpTBykUfmwciFeVQEtjB3C8F5z6/vzJkzZ36svLJ2Zz8f+Nwbr2Znds6c39+ZM//Pf//7X4WIiIiIiIiIiIiIiIiIs+nJyYn63//9X/V///d/ajgcqhCCgYiIiIiIiIiIiIiIiIgzbG0w0P8/AAAAAAAAAAAAAAAAAMwgBAMBAAAAAAAAAAAAAAAAOgrBQAAAAAAAAAAAAAAAAICOQjAQAAAAAAAAAAAAAAAAoKMQDAQAAAAAAAAAAAAAAADoKAQDAQAAAAAAAAAAAAAAADoKwUAAAAAAAAAAAAAAAACAjkIwEAAAAAAAAAAAAAAAAKCjEAwEAAAAAAAAAAAAAAAA6CgEAwEAAAAAAAAAAAAAAAA6CsFAAAAAAAAAAAAAAAAAgI5CMBAAAAAAAAAAAAAAAACgoxAMBAAAAAAAAAAAAAAAAOgoBAMBAAAAAAAAAAAAAAAAOgrBQAAAAAAAAAAAAAAAAICOQjAQAAAAAAAAAAAAAAAAoKMQDAQAAAAAAAAAAAAAAADoKAQDAQAAAAAAAAAAAAAAADoKwUAAAAAAAAAAAAAAAACAjkIwEAAAAAAAAAAAAAAAALrL2yO1eX9Hnbz3/542Lodq8GhVbQ+KgbpJQTAQAAAAAAAAAAAAAAAAusn7E7V9b0Et3F5X/bf+s2lD/8adz3pqYWFJbf8x8h9ODoKBANfEySNduSwE3l5Sqw8O1dm03nkA14RuaGweWFOH7/xHmotf12y+WPv1wn/SktGJ2jWNgmm4gv21Q37Ltv6vOkbq7PcdtfHlxhWOMQX8sW3TdvsP/+8AWy5XD9W5/3crSHOYUYav+mp7fUndsnnwllpa31WDRP66eLaut+mp5ScnOidOM0N1crCl1u/tNpSnekavj9TOdytq47ea+vddX63fXlC9z/SxpjtRusvrfbVs8672wfGV8+Zk8/eFOlwzv6nYpsNVkPaxwkcfUsqvj1b1x6zg+0tlJX/n16j3wyAvP/57tg/77lCtZd8z9tSdz9bV7mAa06ec527dW1Vbv55Ncdt3Df1Due5ty1hlPnG6/nbb/m7XaNcvmZ1+Voe4HKrTZ9v62txyeXXxjlp7MlAXl/7vE2L46lBtfXlH9fwxVlLzPe8Gak+3G3cWXZm59cWWOvwryAn677vry/7vug79Mvo7TC+Xuu6z163cLwzne2RuMJufuDxT+1+Yz+6q7YG/1ibP/rqlVj4xAQDd7n6i+xoHJ2ooebZte3t5oQZPN7L9LNxeKbRzo4szNfjVtCs+364d6t7tjPP3idp/sK6WdB3beS5H6vxl352vHqsW8lWIzk9nz/fUxlcyF6AN2v3ifLWpdzbUXpSXLl7sqvXPGuq3Fly8iH6Hcex8N1LHD+T7y2r/tf94bC5U/74uG4u6zMaBwKi/Y8vg00S78e5EHT5aV8tSxoLyX4oDBGZzv22PI0HLxQ119Lf/bEIQDAS4Jqoqgd63R3rYAPODDI6LHcTpDgbO+GSnb1zLnSLT8OvzGrfjQZrDLPJqVy3ZvBeZyP+zM0nVtjzV06r+JRh445z9vKyv07pa/8Zc8y11fMXrQDBwWpHyXOGUBgOv3H+bRiqDPJK/w2u0ovbf2G9l37NpUJqcFJfU7iu//dRQled6auP3aR2dXUOdI9edYOAEaHfeBAM/NiN18mipkE+L+XUyDJ9vuknyyMJ8z1tdR/ogYEHpj+v+ZvLv1zD5C9fDySMXECj2C6Tudm1nHAyUvsSSroddnVC
dZ5ce+23atLeX5+rQBDpK2/g25PJU7X7qP1vsdScYOG67NsMMdXvirqm+fr7uSM17HX2fyE+VwUAx7+td/LbWXL81Up2vC/nuzaHa+Ha/PtA4OlZb+nvL36zbm0WXfz7zfxiP0fGWPa+1XxKPB1T0d1YO8m1Hr/bUSqnOzvtotcFAuZGwxXGE0WBb3dV/u5vVFZOBYCDANRE3+KPX+77xnrdB0rwjg8TiIP5mJpPaDtRnfLJTD7pWTdkb+H9n+PP6qB3eOUlzmDpOf7pr65iNZxdq5O8yG70ZqN1H/Y+Y/yfNZCYbOzWZ31l8nagHnOe/b+jr1ZvoBN7Voa6+FmSCawYmpLpZf1TVrf5zP2F498mp+9hPYhSCgXLtLofq+KFrf6YvjaLzvBypswN3Pad3AvEa6pwPmDStzv+TaZ9nj3k97+lm9MJN9i4srqidF+euH2yfktlR+3+6bSaCCbz8Y1sdv/VTtG/i+Z5z//TXglr6/lCd+uDe6O2J2n96ZOvM819W3d8fDdwTYPp3njxZsZ+lJoZh+sjyWxgkGR6pDfPZ5/vKhCwKc4N/67+ZYMI9nU8k+PFq1074L9zbUse+rh++0vnJBh3WVd/knRbt7fmByzsL9zbV4Uv/a0bn6uRgTx35/Z4+XVNbByfq4s3s9L0amaNgoMk/m/d31fFr8045kz/KwcAsT+r81NfbNaLz0tEDF0Ref2ZrJnW4ava9pLZf+O8PT9Surc+Cm8OakJuTw7q4xEjnyRX7e3uf6fq0KjPqa9xbuKt2X/mA9pXyra+TF7eUPJBbx/DIp+M3Mn8idXpPrf10os7bROdGup9gfm/NDR7l44TIMfX3W1zKthAMBLgm4mDg8OWea2w/3VV+KJ0NqDJTS4leXtjlR1ZliQtrccAx+itYnkKWghu/ZoRrQQaJDcFAs5zDk/xRf/OoeHHZItmPmJgUMIOHg2BJiFDbMfL7WN1Xx8+21Io9Vk/dub+nTm2ei48ROkMTn76j7NLWdRZW7Z0/fkJFOomkOXQYGQyaTnXVIKDUBkUDKNuOrW6r3X8s6/bF3NF+rI4fm/9eULdMZ3U0UFtmkBp3qP2de7a9053+Vm3d8FQdPljNyqPTt3WVd8Ia8/aw6Ti1d+pJXVw6VrG9zeqRtX01MGXal32z5NPRtL5zYBYJ63F/g0cvy5+u3lx9uKs2zVPbi8tq9/jYv1fhlh/ENudvi813+dJdrh04zZdk0lwMwqVt5C7cvH5uk79d3tN56fJcHdnyZI61ofaDZcDsUjxZXy9e1rdlvmts16aUeIIrYCLp29hWG0bqLFieyxxnPVhSrlX9MbNIXySu7/zna7tq1zyhKxMZdcHAkb4GD/zNKOM8bRfl3Tj9s9/y8EidPJXtwv5UG6Lz1Pni9J8uf939SUZnmqZy5M9/++hE7clS3NEybM3nI3mqKt/Kb00Z9g/r863l/ZnqB8tZ9Rb9tqk6sQEpj+U8739vZX/X0TRmdfs352cmB1eztN0OZgfjstj7ZLlwzrKP/Rf97Fi9T9aipfQa0i1VJ4WTzaW+Qmhejkr1VyrNdV5pGuvDeAweunRc/mf6yRHpI8fBttMnpu6SJ4VTdc6thmXyojrG918WFjbVccV35Oa9lX+e5jfv/eEn0B+W7myFaUTGPcEqFubpI5sH/dNLUm+Zd38NbABvSe38mbUYfjWMct0qeXnL7LixvZUAzoLafJ7vu5KavlcjE+hXpctYud2o6x+X6thQ8zuqxqp/99W62ebebjBW/cB2Q/Cry5ix8nn8twmT5yv/gUWW0xzjRsrRmX+idEnt/WU+kCdIV9TeK59wlyN18pN7yq98032arH5rcWPD+e+brt67vV4YHwn2CdxF02eScnGFubKLvsvzbepWc8PYL+7pyKWnvi2RV1iMUWYkjy6F/cyQ1HEiZB+2Hoi5Yn4jGAhwTSQnDRaX1U7w8s+qxiuf8Bqqo28TDWw4QNAV0krp79p7OyxtNhXUDea
lw2ca7NR17uWPkpf2U278hkduqZI1aWz93Ym97+Uuterf4hrout96hcb2pvAdW5u20mDbO/XCYCBpDh0ne+m0z0+3zVr0x+os6OuV2qBokirZjmW6tfrdxElx0leWL1l/5j5r19aV/561dVeddPPKcerOJxt8l44VtLcWeUqjbOGdWvBBDO3TgObuT/MvPyDNbqaqqze1/i7spvxt7tTcMe9hCLfxyoDN/A5Tx5e3yevn5vwteW9NbcZL9nyxr0xLULUUT34XaZt816Zdm1JqJqQmkb7NbbWZiPN300fK8lyt6o+ZRcpUXN/5z/V1OfN3mdu+SyoYGDnepIR/f0tiP/lyVE39qTak99H7TI+ZsonHFuVIgkKJbbZemNzS5nwkT1Xl27p6Lq9/mvKtXRKuop4r1YktkPJYzvMtrk+LMavbf09tfB/VvcFEblVZlN9UVWeETzM0pluqTrqWYGCLsT6MSd5eVk6ESyDAtxEWCRr44EBdnq7q78nyc9lybpJnaiaOR3ob+0RYyjEmnOEmKQdf3NKh+XvNpN7a/skFeuPAQPb3KM8W6tzG9lbybMs5hJq+VxOT6FfVlTFpN5r6x9X1vdbXtxLkd+2zw90QkC8RPpF2w5Pva4wn6K5IOt9IHbiq9n7bU2s+eHnrXvn9ksX2dEltZTfemOU93Rg/Zbt+r4zvy+XAmsh3o7/2bWCr+FsMfizo+1BunDjmTWeGsP9aRdTHW/rhOA/4yt9+2Ff9ByvuhqW6dylmwejEU4F1x4mp+d1XzW8EAwGuiWLFagxeDlzB6OVe8U6DcLmAN6nvjtTgB1O564bsmSyBca72mzrA8BGpG8z7Cl13nuzg+ItddeIbieELPzAoNZLSuCcCU75RXJfJisrAVE+t/OReRj16vul+S2FwWn2M2cCdp0lb0ziuPd1TG/ZOvRO1oxvjnu6wkeYwF5gXjL/YV1v35S587e3N6s5oNEnl2jHXdrk7Vnu6XJ2rc9/ptG2MDCT1QNTl+eblN0ptnf5f+z5PU04eHqrB63xp0yJSntpNkpWP45BOc/1ApupYUlZ1p/17P/j+K30cuCp+8Ojv/jS4SRUJDvpr8+m2Grw/U/ufm2113fn23F+b6JpV5G+pv3v39QDO59XhH7uubVjc0fvw+9bH3TqSpW2a6+pUvsv6hPp32rtdLwdq2z5haNomufs2WIpndK7637nBuwtAtMh3Y7VrU8YYE1Ljp2+LtjoYrPcrl3tztKs/Zo2q+s5/btPa1+263J0cB5MSqclJvf04dyfn4x1dpn3eHb3tu2XUdFnr26SW39jUn6pD9hFo6xH/Z0ObciR1ihmf+fxyHi432up8mvOto6bOaZFv5emUhbW9bInCqjqxDdX5v+n6tBuz5pNafiJQb+POP59MjBn+XswHso/e/X13zpdD/ySO/91tynuqTkqmW1XZiahK88axPoxP3l5Wz4XIRLf0K4J2wt/Ill9bnRef+zxb0S5YzJKLJl+FSz/KdQ/zUYLz59vZk0/mif/+gS+3PBk4M0j+sU+ZX+q8Y/JCsCKY1Pdr9/0TQJMKBhbaW8mzLecTxuh7xUymX9XUbrTpH3vq2jU5z+xmnOAmQ592E2k3hA9+MlDSJbDiGqXzTV4Hli0GjLJ+iLi4nKe1bn+PH4VPRffV/g9uu3ZPBsrvWNVjNPdJ4XhV+e71nuuH6WNm8+e+rcz6HqVr2hKfT8p9mADJS5k9tazrYnuc0t9yUzeJyFPoyacC644T4883+bt5MhBguihUzNI5NAPosIZ4N1B738XLoml9xWjuhDH/rr7jta6ir+sAw8cj3SkrdOxqGpVyI1kzKTA8Vlt3eurOJ34ZBXOXynf76iSr2+W3BB0WOXah41RzjJnAnefqL6fq6FszyDMTuuZOvYH9nDSHueRvnf/90wGljmQyT0o75vKuq7Nc/pT6y7UxMsHnBxd6YGQ68Nm7pQwNbZ3lYqB2Zbk1a2oppkR5CmlzHE2h/q2
k6liJsvoBg2lIkC23VNYtt+Svjc2v/nrYtJdrE12zqvz92E0mbL3wH1jC657KA6nr35zvpE+4knrZvSxZ82XwhIKmmE9b5Lux2rUpo64MfWj6Gpraajl+0mJ+ald/zBpV9Z3/3Kd19tT3N+7/bRqE185Mnj1yT7mNkz7p8U6c5xO/saJsV1Pcx7lflil7isfQphyljht81u58WuRbS6LsCy3ybTK/jp1uOdX5v+n6yHmklTGr7L/34Lg0qSYMX/XV9vpytrxzpj+f1G8sfNamvKfqpGS6VZWdiIo0bx7rw/jkq03IEo1JCv3VoeqbpZCD4ED62qav9+jlrlox+dEEdcOiIcHeQoC/Galra38/TBfhOwL9dQ+Xn87nBuXJ8SX933mmqFom1N0Mp79ngi+N7a0EzyqWE4xJ1XNtmUi/qqHdaNU/9tS2azJW9e9e1OXVXJ888D+hdmNiSLoEVlyjPF/5DyzS3vbUymO/jKnOL7XvczY3ED/fcst0BkHsIr6erLk5J0bGWmFaZ+eXOKeqJwOlfJQdr26VfJJMgwSjNzqf2/kTf+OI5DPzTs5X7pxGryuCwu/1d20AWf/G1FODAaXjxPy5457Ebfm720AwEOCaiCtmWUot6+zLHUN3/B2ppgI+3i7e7ewLffXdgtLZDe9ah+lCGvPiIL7QuZCBgrlr96KpNaueFHB3nmyqo8p9NHS4MuQYy2ov1RhNPe4JwIVv1tW6f7LErFd+98muPX/SHLqPHmj+Y1PtvzhTWdY0d1LaDrwuA+GdlIaKAZRrx1zedXWWKwNSf2UDDz+oMpMWrq3zgy1Dm7YuRP/94vWx2vnSDR4K73HKypMuc8U+61jHkd+//DQ91HEkyq4lUR98yGAaSmRPsqQMl8+z+dVfD5v2cm2ia1aRv2UStvBk4IvtYCDs2xJ9reUu2osXO27CT65/y3yXHqwLcpzwzmd5d4csgdMi343Vrk0ZVWVoIunboq2WSbxPt9RRw9M57eqPWaOqvvOfS1qP9L/9RKPR9qfiayfL79U8HV4iG+/o6+zbjtFrvV9bLjZ8fZ/4jRVlu5poH2YJTXs+wZ3ybcpRfFzdvh7/4CZlbZvV6nya862jpn/YIt9KPXfX3G1uxonmPamyLGnrdMspjF8KNF2fdmPW6v17JL/d122B2cfoQp0duMCJnE9pH3+fqv2wPm1T3uU4Mon37lhty9LrhXST8070S0Kq8mrjWB+uggTTFhZX1O4ffrWJy6E6/XVT7f3ptjF50vaLzTV+6QKDxXwX1xe6/UlMlp8/M++56qnlfySeysjqGPdUv9zgNno7ULuP+sX2zmD6wC8P1aadFL7+JQZhkuT9tK0HJnBxV+2+9H/SFOr7t75+CZ9M13VE3G4MX+27dkPa0xbtrcw5mmBFX967ptuowZNt1Y/mMSr7Xi2YTL+qqd1o0z/2yPfMk4mp9sWPVVd+PtXttf5+9JDGRNoN4cbfGWg+N79d9xte+t96edH4PmcTiHP5MhEMfH9h609b/4XLKzcw8kvM2xUQXspx/XWP8t2FHv/Z/ZfeGSgrtaQsLv/aiASYa244KmBWoLHl2gfppMwE+Wz4ai+ZbrKs7Eqbmzri40S4/FnxDsi/j9Smzm/FJe+bIRgIcE2UKuZgUGEb65o737OK8VJXCmZJntI2eYM50oMIW2nWbAM3iXRyioGkYoej+t0iWYdEOjgJZRt5QW/BxTtq7dGx74g0dbgEWbIvtPj7pxvpjGtl6QDpYGtlYpU0h+4SlIHYxeZ3kEl+de2Y+29XZ7k8KfVX3iGVO6pX1Mqn0TIZbdq6rJyULd4tL4PCUF+2Wh3HIZOjoVKm5dxSuvPNB/tZ+fyAwTSUcQPYuP6TgaD53OcXW4f662HTXq6NyRPN+dsMntyyfbH5u8FO/Yvyy/rf1zLfNU36Vx4ne5dWm3zXol2bVqrK0MTSt6mtHqmTxxXXIJq8r6s/Zo7Kfo7kM1/WgrSWJY+M9rxL1y5
/z0zrp51q3mtXfreQL7+GZH+qjvI+pM7P260P6B8urrnJ1lbn05xvHXX9wxb5trKe07ZOt5zi+CWk+fq0GbNW798j7wJP6Y9T2Y5n9Wmb8p4Hcaq3MdT0S/S1a2yHWoz14QroMrgXvjc7sFDefICgt6i3DQIqDsnTsXkf4eLZeja+jJU8bN6tlq96EZjVmanj9NRy9B4ymH4KdU+w1L0hru8lUNC7L0HhqrbHvaLB0qa99YGB8n6k3WhRL7VgMv2q5najuX/skRsrQgt1tQT/e3a7uH8ymXbDke8r6LNPksq+W9B2vnLvpixtky1jXJUPgvyWOs7istoJnmhtxCz1XdGnCvPx6dMVe116n20Xn642+HwfvivcMPI3j8af15O/ziT1tF5VPjA3jrp8bW5saiinhoanApuPE5LfuJK6yc60Q/L9yr5TAoKBANdEeYAnj6fnjc/571tqJXih6/azPbVpCnIw6DZ3CfQfratlv52z2FBfDPbUxpfBO6ES28BNIZ2cYmdAGoCswvbXOV4GK/t7m0Zf7jJL6PJcc4cr4+2R2irkqWvqzFwLeecmS5tgQjErk6Q5dJl3J2r/QbC03u0ltfrdnsrfG948GHTtmPtvV2e5PCn1Vzihkt35F90Ja2hu68oTIW6709LdneZ9YfJeFWdetlq1qQY9MDl6sFJYYkzKdFXn3OjOV9ItKJ9VgQwYH3kSzCyz5D8S8qVm/j+XX2wd6q+HTXu5NiZPNOdvy5tjtZ0tT9tTdz5bV7svgquo80r4gvi1JwN18kteFgxt8l3jpP/lUJ0cbGT7seX1waE6zcZmLfNdU7s2rdSUoYmkb2Nbrbm8UIOnwTUQ47a6pv6YOSr7OZLPfN0cXhd5EkFrzzt17fzye1UTF0mGus36Lk/XW/dW1davYRswRn+qksQ+5D1A4VM4bfuHflLR1A0rpn2VsmloPJ8W+Vao6x+2yLfDP/fUmpShL7bU4V+63ou2aUtp/JLR7vo0jVmr9y+YSUO5NrrO/nJD7f2yo1aD48g+eibAYz7X9en6o35Qn2papJvpb7h0M8fR6fbc3/0fpVt1v6RlO9RirA9XILrGvU9WSmVQtzBZMDYODuR5WpdzW459fgv6CHV9xjAPu6Vtpa9xSy2t7wb1hRxHq+uS5fUttR/2Q2B2kLZPGwcoSvV9dtNIEESI2h7T5yn0Sdu2t8PTmv20rJeamEi/qkW70dg/FsK2wRvX1dlTauW+yaTaDct1PxlY2XeL6p0/9/P2Vvop2Z+jfJCqe4Lj9D5ZVusP9ov9nLb4dFvN2kldB36u+/O/n+mr5nlzqDa+1WO/msBZaelbecrPPJE3Rjqb912aNFlJPLFXrNPduHDrwC+1KhTepejK1/az4Fw0EsROHcPQ6jiekb4O5qaVwitYQt7rcuRvQOs9jvJiDQQD6/CVaLFj1qLD/kEkBvsAM8tIHT+QstN+bWm4CmdqzzYCuu7wL+g1yEv1x2kYoC0zkualDqPugH2lB7N5bzBndKJ2zZ20t9fLS4k0Msn2y3T8t9T6vd32gxIAAIAS9I9gwsSTlTBVyCRbdUARwDF63XdLciZvXEgEKgBAM4P9KhNU/Mk9fVYO/NNuzBXZ04rF93ZOJRLo021U3XsHz39Ztfm3agnYFAQDqzB3OgR3fIYSDARoiX8aa/mbdbukCy/Bvk5kOZt11X/rGzXd6XEvll5Kv4gWPpAZSfPKu8ei5QwMUxMMZAAOAACTgP4RTBiCgVMNk7rQRPxURumJEwtjEYA0s9SvknLs/WJfnSWevqLdmDPeHfkbQYoB7ani/Zl/b2VD0PLvgQ0YppcYrYZgYJJ8Hdil7w/VqX+B68eBYCB0CD1Y7tmXoPoOQ2L5J5ggdrmz5XzpKvu4/3b+8miYPLOQ5vGkle6sn/68VrlMxtUhGAgAAFMI/SOYJAQDpxomdaEJySN2+cTUSikWxiIAlcxMv8qXY/37Vh4cJpehNNBuzCFv+5VLk04
Flxfq+OFazTsa3dybWQJ742l6idE6CAamGB6pDVthNE2SjtTZr/l7LOy6wk+Ci+AHCttHJ2pP1gjXldDa09PCerJ27fyv8veVuDXJw8nUhuNIBWcGJG+P1KZ9WXJP3ZnmjA1zwckjnRf9y5MHD+N8rSm8qyc02O5ioHbD9/mYdzYweQPQnuSklQTuTLDe/Du6ay4V0JP3FGSa5UbD8ij73FL9Pw/V1hdmXXhdZu/vBuvTa+rKdOkYocFg3Kw9/yRfq929B6S4Vvvwlf4NWdvqZeIOAAAAAAAAAADmEIKBKV7t2hc0Ljw4dhOL0eSkLBN69vNKPsEYuPT4xH1PJmBL5i8ol5dXlrfJJ2IbjyOTuGub7lHXYJvUesgAHwf/NOC3R8pULSavmzyZr2Ps1xr/dFsNzEdmaYGHd/U2S2rvL7eFen/sX+ofubh2hSUMAeaUijvYiy8yv0ow0OsD/nkwUG5qCfxiX9nWqKlMtwoGmneR+ptjCvbU2m8+6ig39cQSDAQAAAAAAAAAgDmEYGAKmTh9OHD/TgUDRwM3obm4ka+T/Gbfb+cnLGU/iytq90+TuCN1/H2wD3Wm9j83/76rto7O1agwmeonYtscJ5jE7d33TwMOzPKM+jMJaAJ8bHxQPXvUXsqRDw5mwcLPd9RJRTDw/MAFwleenKihKR/ZNjzCD9CaVsFAIWqD6hidqj3bhsm2eTBw7edTV2aHA7Vt3yngtmlfpqVdC54GFHQbuGL+9sWuOvHLeA9fbLubeGQp4ou+q290+7v960CdXdASAgAAAAAAAADA/EIwMIVMIn66q05tgM4h6wjbidM2Ty8kJmDTT2KEk53RRGyrpyRkPytq/7X9AODGOft5Ocqv4paSd3Tbp15v38nWGjfr9m8/z59mlfKSkmAgQEuSwcBztf+FKUvLUbtREww0S3M+3VCr98zyn3lZLAcDw+8WP2tfpmuCgXI+KYP3kl4MdtV6+Ftvl5cSBQAAAAAAAAAAmAcIBiaRSdIFtfTDsTr3M4eFYKAsQfbpljp6UzG12CIYuGODIGvZsqEXL3bUiv/MTqa2OU7dpOkMM3q5q9a+7avzICALFbw5VOvfHE5RWslTryl7auuFzsv+qdflpyf+qdgypz/5J4b+eap4sCfCvB/0/o46uY73gpqnub7yy7dCzqymedwWjc7V4PGKe3pclu/MqA4G2neALtxR26b8akbmnZ/JJwPz7w5f7qs126ZtqCN9bu3LtLRrm/Z7BWQp77U9ddqmYhhdqLPn0rbKOxLh5hjpfLCmNn6b7WXMz39ZV+u/sBT7h9ON/DA1mKetH62qbRpwAAAAAAAAiCAYWMHoj221ZCciy7pA3kidPF5K/j2bcG0MBpqJ0Yp9ZJOpLY7TxWDg20M7gdz77Jom3jvG+W/r6pbOA737UxIQ9E+09oK8bxgdb9m8az+veKfXrS821P5LP8H/ru8DCbHlQMVc8V6XefO+xdvrqv/WfzZBRn/sqGWT7vd0nUL5c8xymlc9Sbe4ovZe+bJWtY3WPbFn3tOX/ns5GFg2e8dt6zItN8uESht3ofr3U+8MDJ4urDyf/J29cDOc/7qmegs9tZy993gGuTxX/W/MU6c9necIYn0IncgP04Ruq3Y+M/Xjkh5rkKIAAAAAAACQQzCwhtFffbW9vpwtYdj7ZFmtfreXvaNIlkxb+SSalBwjGGgnlB6s2EDOwuIdtfZkoE5+MU8gBhOjTcfpWjBwdKJ2zKR7aVJczjP31r3VKV/2bahODrbU+r3diV2bUh7yuAm1BbX006n/5OaQp2i3ZD1QobAEr7l7vSLQvbilBv6rrhwuuTKSOc/BQB8IWdRpcA1BKUFuiOhl73icZ6rTXPL6JJatvbY0jwJjpi1bf3SoTsIyVBk8C87t7ZHa+vKOe6Lw9pLeR1/t2ffgSnnU9d1P62o5WJrTLv37zL8/0NO2TI9e7hWX+QzbuPdnqv9oXS3dDvdRFwy8pZbWt1X/Fbn
5Jhn9uWPz+JLuvxRah7ol0YOlX6cK3X87tEHpJZ42vSKV+cEzfBXWFT115/6uGlx3ZpC6I+i7zxxy88rihjqSMcucI211Wd+uZG2GX73C4/rcvn0q1VM6T362rnavPVNCFynlycU7atn0U/5K1YYzxsVA7Rb6eeU+XgpX3lLzGW4OYPU6nsbX/cnDByvZfI8xHmNPgotn5sbdnlp+wo0vU8XfJ2r/gR5P6OsCMB9UrwI0FmblnReHauc7qT8r9jc8U8fRa0bGqmNL8xgrauPpQF2EcwsXZ2rw647akHmK1NhRt0t72W918wLH4Q3CLc6nqi+Zz+2P1PnLvqtT/BxF6Vwvh+r01608PcycypPi+egfo86ebWfzIPacD04K8ylTQ5vzeWf6BBJX0X3nL7fUYdTXGb46zOeZdH9o5cGhOgtjAS3ygRqeBu154jhxP97PZxWO0zZfT4I4ztPwSpvRiy2bPuvPzHxW9U341hbzJwQDYeqw75FbuKu2JRqUUQ4GOntq4/dpneCdfKC2KhhoJyYn0bB/JIZHm7Yyu/twkFd4o1O155cd7DO3ksQ8XWnSbe3al6cbqcFDs6SjLotz/nRBXZpPMhhImkOnuTxzS7B/uq0G8dOvNcHA7KnSacSsYmB+57QGLKeZuvygqVyh47qDdF0IBmpGg227nPLdikDrvNE+GKgNls+uDwaK3BAA41OdJ2f8qd7LU7VrbkYonFO7sWllMNDfTDqZvnbIUB19G91srSUYOEd0pM0HaM9kgoGnT9yrP8xcbK8qaPLuSG2W2oMPCwaKKwe+p2banE/954v6t5j/j8dlfx+pDfsbIxfXVN//5jbn0xQMHOp63n0m+4jPtXrVv/CBjqrVA6dvTNzifKpWhQpuWBw+d/PC8TaFG+Sb8sHwWG02HKeqH9/7IZ+PbpWvJ0F2U3Hxt1Qfz7/KLntohmAgdA3/Hrnye6wMUWDtcqTODnyFPLUduI8YDNTI3QJ3p+DpwCaGv2/Y81h+PMjuchm91hW0vv48jVZF3AhcM2/21YrJv3N9PerTfLLBQA1pDh1F2qes016HPNl0zzxF7j+bSkZq8IPpyPMuynGpzQ/SF9QDsRXdR5B3dw9fH6udg2vu73VmYlDef+7e1woemQiIB8n+uvcWfXl+6T5OBgPlu5dDdWxv4LmOIAV0nVL/cXSujn/wk2rJcfCMIOVkdV+djdl+n/+yqs9/Ww3i7/l9Tr6cyTh9S8WL2cCcQDAQ5o4JPRn4ck+tPdhXJ++qHkiQMdKCWvqhr84mtFLF8MiNHxa+6Wf9uNOna2pLjw8u3qT7eBLgWfrh2D1JFqxQdveJnzdtPJ+83a4MZv59pDbv76rj12b/Zh/RtpcD1+aY1aZe+0bnXd8FKmWu6e++Wvfb7PsVjWSOdGFhXfWnacWPFufj2nWd9o/8nK9O+5Mn5uGfMKCr0/wf2+r4rd/Hm313HWvm0FP54PzXTbX9/FyN7HH0OMgHy6qu1+jVnpt3u72bH6dFPpgE5wcuDRbubarDl36gpvuBJwd76ih1vJe79ibP9DzK1WIOBANhuvhzxxbq9DIgcTBwqE7/6SrkQvDLPG77JH802zxCHD5u6ypxXbm+6GePIvc+WSsu8xPto/y4s/8tpuP4Vlf69v0sPXXnWz3wMXe4V9x14AwL6Uid/boVPBpcfqy6uFxt1V0mnkv9u8zfP9e/w380tZiG4KC4ZIBZ9rX0uDfkyDKrDwf+gwTvz9WxzrvL4bLCvkNkJ7VWt9XuP5Z1vjd3px6r48fmv3XaBw1pjnQW53iQ3JDm8WTOSDfUK6YMhu8WfHOcWBbTmOpckObQTU4emzpptcXyxvKE7Iraf+0/8ly82A2WjjVLvOyqQVSGRrq85UuROAsTiIUlRKSPECxlG0w6mqVqV20/4JZaeeQHkDF/bNs6dPnnqW91p4ra/DDwk3Of79VOJheWlDF9MLMcTLAUsPT3Dt+N1OnTVVcH315R28dBfvB
LDkub2bPBIL1dNjEofU+xvDRkq+M05LtU39QuF/N72B9u7jOGSPtUWrJ9nokDeoKfEF57ousY/f9yU1ptMFAP2o8euAmm6V2hBKaVuP9okXFcEMRvqudkzFro98f9y4axcSsaxsZyPklbBlryulT/45WZ+PJthC97MvaVG2NFu/x9WBc2teMSAEoZ1A0ffBxNKV3itPC/ZfvoRO3JWMG8Oubp6XjXZ4Y4f54vf1cwS/umts63yw+P1MlTyZO6bNzfU6fBSgN1Zac5v0ogo7h0tN6r6n9jtvNPhKeuX2mptxZtt+0jrOblyzq5m8lhzqiZD9K5P5trOPpjT635bW59of8d9Mld/afz4KXu69i5I1MHbqj90lLWFcHF0bHaMsddNPvwn1Vy4d7FHs6fpDAPg/ziX430NDHuSvbx5MnBPJB28ce+n7/Vn5fmTauDpVJvVAYDA6T9KGw7PFIb5piLOq0lqDfS5/6d2da1d/LARBakNMusPvFjC22hT/+ur9Z1nWHm8c4b07hM3uZWj19sPfpVNI8l7ViL8zn9yfWTV/6p2zT/G0d/7LrVXyrnNGXsVVEHNuUDT/IaBAxf+Ff0fJ+6Cb86Hzha5tkk5+pw1f22zedtWnpZxaAqGNyQXhUQDISpor6CjSdknL3PdoJ3C47U8YOg0cvsqbXffPNX2flbUft23Wj/frDENvkTa/63rG2WHn230XppiJLmhdQtiVreJnsEXA+EkstkadNpJJXWeBUBzAgyWRVOHIQkl+bR+g5RPKgtulyaeDe471Q1gnNAQ5pLfWL/bpYMNJM4hXcLnqk9c03MUnim8jB3otlAx5La+8ttETP3aQ4dpH3b5Jbl1W32r8Wbgi5+c53+Yr2lDZf/qGgzs/Ir7yRObJMvZ+La7963m9GSMvGEkEfa+5aTnWCozw9ZP60mTeV9g/n1EfMlG91+emrj+41i3pG7b6vaTGN27HTfM6yjG4/TIt9V9k1lH5rGPmNMU59hHklOFGmytDrzE8BuTFBojyv69ledhIH5ptB/zChOPrWp52SirWjYh2weGzfTPDaurMOMLdtHtw/3289+XrbftYF2X/Zk7Fs1nsnSsqkd9+U9aVA3fPBxNKV0idOi8rfIvETH+GvP5mnzmhB7M8xwoLbNRP29vWxCvrmtq2qX86cmmspOq/zqn8QIl5CTFVyyOaHK65fngebzMRO95b8znwNXomE+yNTnrp0pG+Z16f9sfh8tA1l6cr3YbmVIn2l1T/Wfrvmb4fyNnHHTE/Svkk8+ReUse8IvRvZT6OPlgZIj8y7yr+IbEeJyVnE+mrjeuHVvXW0/C24oDZD2ozhnW532RrOtHGPt4MS+T2650LYU+wz577lae+G+XzN+kWBfbNaONZ+Pee2DqUdTfy/1xT1uLkC3E/o4Wd1raJsPDO+P3Sozn+rrG+4kyGvGO+v7hZtIcqrzgaUpz9YiebJi3zG+3ckCxCXyPD5Om0EwEKYKqdDSga5Exy9+14wssffFrjrxk4Mm4m8rIF/ZyDF693XBN9tkk/O+crV3Iurt7+l9+32M3vrHnXWBde+yy3+L2Y99GnDgng5YeHAcVFo1BVOWwVrcUP3kI9Fm8OaOsfbPvJFJNyw57u/jVQQwI/gGsDhxkCN3Etm8fVFoOi0ub7j3cbpBtptwP68pd668tGyoukhDmkt9svpwOw8EFjpj/m60z3fUSctg4NynOXQQ6VA3tE2+4967fxhNrstdnUtq+4XvrI7OVf87NznpJjRlMkXXaz+fqkQVGNSRh+rM/334x67rNyzqMmo+CDr3Msg4/8WV8/TTf76dbznZCYb6/CD1anWaBtf6lzO/HEy+7Ezvsfteth+db7bMXa7Zu5XdzS9msGn/vrbn+oMGGWhWHVsf5+h7s4+8zWw6Tpt8l+3j3qbqmzbk8kzJO5RtW9DYZ0zg83JV+zWXJCeKNGFbL4NunQeO/WSYvQbRBIJV74dAIFwFKfPF8hlOPrW
r59xT1rq9Wt9T/Zd+eayQFmPjRlqNjT1VZawNthyaczfpsKb2nuq604yr7cpBPbXzp98uYvj7pk2DrN5u3Y6PN4F29eNoqtoW+XxxRe3+afo3evwftTGdwuelZZ1/k8HAVm2dXLeeWvnJ7Wf0PLw27cqOpbbNl/KYP6nrgu/5UtLZ9+9tZUvcnYevsWl1PibYbv6tz+fhoRq8viiXY4CWNM0H5fla11vf+5uZ/tpTyyZPBvW2zDna5SrN04BmSUhb58erF4Xtlv/IkOoziYmAYu1TVlLOMntq2dxQ4P+ckWx/8nlb8dZXZinPo4r6v+J8NHl/v2gqGFQ5Zzs8Ufvf+ZVC7FPgfbVnnxJzT9KVjrG4rDYPTtSJr1cKfYaJPBlojpMev2QrZOn2afvXgTpLDq7rz8cQPg1unkDtH/jxV+rJQLPUq9mX7m/kD/x42uYDcx72BqbEO5gT+fLWN/n4LKc6Hzg+5MlAyZNV+y7ilrmtC/iO15cRCAbCVCEVUrrzW8zkprNdumOgVEEE+kYhNfgKPzNPH5j/LlbqcWUgv0UXysTTVDk1BbOugbTbpyug+mCgdCbHqwhgRvD5uzhxkCMTAlsv/AcRYaDY5XmXt+rKndtnu4aqkzSkuaTd8tpaNrESd8bsHaG372TLw9m7yJ5X30E092kOHUTas7q2yUze6LyvB53yMvcMGYx8WRw8FtvzE7XjB6ml9w150nVk1E5L27yoB7vJOwUj5LclJ5IgTX1+kImM6iXPK6619AH9tZD80SvcpJVTzD+eaB9mAvH0mR7EfiZLjeVKm9l0nDb5rvxboj5gY58xgV96v6r9mkuSE0WaQlsvy8Ctq3X7/9E1MN/V+ULeN0P6wlVI1j/Z+1LNZGu7es4sdRwuQWwmx+7cD568kO1TxuWggnZjY09VGWuD/a2r6vDlkdr4dFedvt5Xy2aJObt0dH6coXm6I1oO3Cpp0rodrxmnayZ3HE183YTE5/Vj/Rnn8kztf6nHQp9Im2qeFNpWxzLJKWmaVK5T4roV0rFl2TFUXRePvN94/dlQ/3Z/Y9o3/XzyuWmfrc5HczFQu4VXStxSKw8O3Q3nAGPQNB+UrLsT9bbUQyuNr0FoaAsWV9TOC7/Xv49d8D/edgzM6yC27JOPife1J84jrw90P/2z4P1spn0x264etnvSMeZypC6Oq2+saV2PS71i2319dN/emjpg9Ym8Nz1/SGS75m1B49I0fjFcDMJXdGhLyyBHROeTYvhs3e4rvmkme92OubmioQNRmQ/e6+N/acqAD3DWMLo4Vft+1YPycv8t88GVkDSKln1N4d8hWXeNTB6v68tUQTAQpgs/cVF7971k8qyiCaLkcueiucs7eSdMXullg6+/o0rA/4bC3Y/ZS1vlzrC2BU6228zuKMuQx64/3VJHb1K/VQJ7d7MnIcyazbIsabJhkXdNXGUABtOPTDpXTWz6zkP9k4Euz7py4Bo3KRPlPOUbwVZrvXeUpjTP6pMznb5uYnBJD/6ybf3EzvJT/VmrNCTNoZu4wam/0zCBWwa0V9EploFc+GTgmb/rTzrw+Z3VVU8G5nVk8ISWf1/Agpl4NB8kB5I1/OFWBSAgMB61+UFenq+vpbnrX67l8KXuA/3TtmD5tQ7u+pfgjLxHutTfi5D8kC9XdqoOZTkkP7GX7cNPgo8uztS+DRDlbWbb49Tlu/I+okFoY5+xjNtnr5sTylelqnz7yVtJ/+KyRv4axN+VfBos5QrQllKZ1+NRqX/cUm3t6rmQ0ZsTtS91mCxj2GJs3EirsbFn3DY0xB9n/Zt11bN1sBnr31W7T0z5jMqhHuvYm+9GF+rswE0sZgGZ1r+hZjw/0eNoUkEjQ+LzLgcDXXBtWe39WZEXW7V1ietWSMcxyo58z7wzymwXI/NNX+ixtV267q6+LsHviq/f6Fwd/xAcZ9y22wQYXh+rHTuZnS7nAHU0zQeV+peGRF3Wvh6qCJrIvOTnuu2Rn/HuSG3
Z+duo3TBltu1TVu91P9wer20wUN7/uaBWnvgnkoMxZPmdcxXnE6PL6vnRlmtfs1dJ5bRJv9HwXA0er9h2L1uiVZ7m/3RD9V+7hMvGDIvRnPLfR2rztv5u4bVZ7WkavxQwbeDzHResS6W9Jnk+IaZ+M2M5O59dfNLt/NmmPseeWv5Hy6ccU/ngTd/u2wR97SorTehzOvYrBI4fDDQrxeg8u7isduKnD1vgnvbT+zcrwsh7OHX7MdD9nfCm6PMD80S7Pkd5Gj1JTV+mBoKBMF3IHZG6w3VWqgTKmVwqsLyyMZ0/V7HHSiUn3yl5T1eiZiem0+cDbrHlteqbClx+J0qufGekTh77AVus71Cau+Pt4CthsmHxa9uPv24xzAbnav8Lff2r7kCVtbGjvCIdorGDgXInSqoxnxvq07zQicqWJNCDT3nfWcVa67e+2FD7LxOpSppDR5G7q1dSN/u86/tJxYS+/jr9qaK9lLZbM/LvGIi3ySdbdXlMHkeXWXl30jiTe5rmpTsgRW1+0Jw+XU73f5r6R8GTpYX6OUVlftD648h7q1JKm3n14+T5rryPeBDa3Gcs4p9uI1BlkfQt6/vkfkI3T3/f9ttt/DUo1Q36mjxyg3n63TAulXny3qY68nVYcz0n9UTZccbGjbQaG3vGbEMLyHe1MjF38kh+u588lqc5Ukpd2Po31IznJ3Kc6usT1z1hPd7lYGD25H/BW2rlu33/3qY2bV3iukXp2KaPYJFAd2hwLQzuKZae6pl2PF7eUI4bmx1nnPMpS9sCY9MwH5TXS0GQI1GXNdVD1f2q/DtVY7fCjdOGoO6P26Wq4+SvlGhRz77V+0+lSbAUZfP5VB1nSe3IzQ1V9YFWzit5nEIQ1PQtU+kWzC95LvwTdsbW7XmA/JbK71aeTz7ubT6fVP3Ws0tFSx4w55Gsr7W16abN8sFFX61XjekkX1edz1j5wGOOJ39r7Gsk8IHc+BiFchnERupbgraxiSIEA2HqcI1G6umARCbPllMJJuLen6n+o3W1FBWuuCLpLfrBxe0ltf6or07D/B+ufay3uXVvVW39Gr4ctn2BG73cKz5aHX7n8sK+GHblk2iQJp3Dy6E6CV64a5eLeO4qsXLDLA2Urnzl3TfQOWRwUzWBapYZ2ftutZj/fQM1XjBQJrma7kTpPnVpLmmXdaLe67rBTpjIGuX5naglS5O0pDl0GJlMTAXWawZPWQfbtIcHQXup2+5V3SYW2m7NxWBPbXwVLrUUDXLeHKvtbCmmnrrz2braleVrDK0nETWybbhkFLSjLj94Ll7oa/mlX0ps8Y7tA4XXO3wHhfn78vquGgSTfKX6OcHwzz215vOUeY/F4V86f5j9ST/s/anak/xifsN3e+rw8ardXtrMNsdpynflfSQma5r6jAHyZFv1y+bnC0nfsr5P7uug8BrKMkbVwUCN3MFN0BXGJM6TZqy58fRYnUWNSX09V56c7H2yovczsO+vy2gYG7eicWzsGacNjckmhPN6z9zk436vjJ9H6vSpnIuuS7/cUHu/7KhVs43Uha1/Q914fhLHaTFJLf2foB7vcjDQviPw/w3TITcLYDe2dYnrlkjHpj6CI7zO3mAflmy+qae2XkQVvRx3sZf3VXQ/oXCc1ueTa18p8SxRvgDaUDMflNdLQf8yUZdNIhgYj92S7ZOl+snA4nFc/3nrINxHi3pWM/rrUG2FY4rv9tVJ0N42n090HDMOjcu61AcJpb0Nj2Pb/cfRPLTB1BlP8nqp8hUz2byTTtvwXagtkd9S2RconY9b1rn/Kv/BzecT1G+2Ht5S++G4W1OX9ql0S+aDrP+QUPJ1dD6p31v3W4plIQja+ve/j83wtNA3M9c5HBvKjSildqdEXV+mGoKBMH1IpbaoG6ixX8bZTGOlN6Oc6/OqDRJBN6h7Ie4EMROJZjmC+vWp54QPSPPh0aYtl2YZuuybo1O197lpsNdUP6iGSHPoOlkez+7mnGGyemGl4d3
BUEWn8sM0kfWjqwOtAAAAHxezrJrpN91V28HdE+bGafsU5v1+FoiYJoZ/+PdYpZ7OkMnlOIAIAPAROP/F3aBYXuYSrp23h+4GocQysR+M3DRrXidxTWNkgoEwlYxe7fmXh25faf3jOroYDLw43mJCbZ54d+TW2r6mgPnor323lMI1lL+Z5YppLsvhLD/276TSyHtWekHHgTSH+cDcge3eJVBaImaWuLzw74QpL9kC49CR/DBNmHdofIQbhgAAAMZjqI6+1WOdhWW1M/AjoMuROvvFvTN62iaz5ckoZ8WNXwQDAeCm+Htgb/5jDvgGkPfQXtd8rH+dxvqz62sXCQbC1DL6Y0etPTxOPEb+YXQxGGgCCRvftnzZKnSDt319zffV2XUEji6O1fb9q72IuNNcJc398hirwVLBbimoaIkM0hzmBvP+lDW1fTzLbfBInf28oTZ+IxD44XQhP0wRJlD9cO1KL7MHAAC4Vuxys+HyhbfU0lcbam8wfX0AFww0S8SaJcQr2lSCgQDw0XFLllYvuwrXiY0npJaEnjEIBgIAAAAAAAAAAAAAAAB0FIKBAAAAAAAAAAAAAAAAAB2FYCAAAAAAAAAAAAAAAABARyEYCAAAAAAAAAAAAAAAANBRCAYCAAAAAAAAAAAAAAAAdBSCgQAAAAAAAAAAAAAAAAAdhWAgAAAAAAAAAAAAAAAAQEchGAgAAAAAAAAAAAAAAADQUQgGAgAAAAAAAAAAAAAAAHQUgoEAAAAAAAAAAAAAAAAAHYVgIAAAAAAAAAAAAAAAAEBHGSsY+D//8z+IiIiIiIiIiIiIiIiIOAMaxgoG/ve//0VERERERERERERERETEGdBAMBARERERERERERERERGxgxoIBiIiIiIiIiIiIiIiIiJ2UAPBQERERERERERERERERMQOaiAYiIiIiIiIiIiIiIiIiNhBDQQDERERERERERERERERETuogWAgIiIiIiIiIiIiIiIiYgc1EAxERERERERERERERERE7KAGgoGIiIiIiIiIiIiIiIiIHdRAMBARERERERERERERERGxgxoIBiIiIiIiIiIiIiIiIiJ2UAPBQES8Hv/9o1pYWMj8+l9vS9u8/dfXwTZfq4M3xb8jIiIiIiIiIiIiIuLVNRAMRMTJ6wOBP/7b//vNgfo6Cgi6QOCP6j8V/0ZERERERERERERExA/TQDAQESfsW3XwdflJQBvs+/pAvbX//o/6sfS0YPp7iIiIiIiIiIiIiIh4NQ0EAxFxstqnABuW/KzYphgwRERERERERERERETED9FAMBARJ6tdItQs9+me/ku+MzDbJvielqVCEREREREREREREREnp4FgICJOVBfQMwHAIKgXvzOQYCAiIiIiIiIiIiIi4rVrIBiIiBPVBfQqlgCVQB/BQERERERERERERETEa9dAMBARJ2plQM8GAH2QkGAgIiIiIiIiIiIiIuK1ayAYiIiTtSLQVwgG2mVDK54e/PpAvQ0+K/wteAdh1XaIiIiIiIiIiIiIiOg0XGsw8D8P/aT9w/8U/2aDAsGkfvz3/75VB18Hfxejyf9icKAcWOiuLn2y969hOVA0s0+XdeHa/kf9qK/Bj/8ufl586s9tUzxP8nV7pY6syOdh4DX195KT3l+XdXk3q2tSQemojWuXp+Ua1Oy3c5KPcTq8Un8yKudxm4f44V6lXZjHtiR08u2AG89WfMe/E5vyjzgPRmOAlnXJWH0M6pQp1Fz3uuvm2p30NTN/C79bt63XtFNz13YjInZQ36aXY15OO8YYq76fzDjHcH3BQDnp+MSjyZPMQuLEHS1vkEjl4I+xXYds9nUZgKCJ0eeVqABJ/qAjfTMWA39aX+7DPOu2ycts6TtYozQC2lTDMvZk16T311V9OmVp5P8d1j/xIN7/u76+rthv6lp0yknnu0nvD+fBuO1p1RbFeYm8hRP3Ku3CVb7TNSffDmQ3t6YG60zcI86Jfs4hqFfisWzK8fo
Yef1FnTINyjU/0NflR3VQce2kjUheM9NG2LbDX1v93wcPdZ75V0VbJHOoqfYGERFnS6nTK9qIqwcDtR8wzjFcUzAw+IHRj8wGVPKZ/bHms6BhbWwE82ChS9D8ePMRIHPnSzCwvvC4vNYwoYfXZ1a2w7Ja1A2IZJvxJmfm22IdW0rbsSe7Jr2/jppKB9te5Z+l6iSbz+sa+Rb77abkY7xpXX+y2J9q6mP5fBZ1wG3ZT3XKEa/iVdqFuW1LQiffDmRj11JdofVj1tJxELFb2rojnldI9wdyx+tjhONi6pQp0l57uWZv1dus/cjnJCuvmflumD98m+E+C/cVzYuMNTmMiIhTqdT51oqbScaq7yczzjFcSzCwOMGvrZsc8Y1rmADZ96u+J98JEzOxn2lQJofCgaT87sJn8blm5yiGFzTViSx2RuaiA9E0AP+3TvfCZ8WCYy2ku0vD4v5SnfworeNCXXvttIUKwRh+P3FtS9sXf6MEGv5TKHfzNPEzj0q+NHcq6v+Py7vNg8U8UKyD4u+Mv7+5NJUOtnzWp01TIy9luPj3RF3QOcnHeMNWlN90mRRTfQWtzV/lTj7iVbxKuzC/bUnoFdoB+5neNjPR7uj9HNj2JyrjTWMRROy0tn6omrMap48h2/6bOmW69H2+f5nrEtb/fj7IXMeadsDkj/xzaY8P9HeL+cLmCb+PpnEjIiLOiNI+2DakPB5L1ff2M9O+iIW/T2a+y3ANwUAJlPyofpSTSHaQ/EnYbRODLjlxMdyHDNrCE88GctM1ESPnkl10nxlSn2X/TlxAtx85t3hg79M8SKPi9h3VplPbc5RCE+SjON2TE3zR90rfkc6b/x1N104qg+AY9u9ZXo6ubWJ7d7z8GNJ5zM/N/+a4YsAOGeTLRJ6M82ExjxnjOmO8/c2vUX3QpqwlynCsvT7ZPsX4WF2UfIw3rM0T5X5EoV2PtXkrkY/IXzhBr9IuzG9bEjpmO5AotzYdg/Kftz3l8VabNh4Ru6qrEwp1TGjrPka53qJOmRLNNfRjD9MWJK915TUz+SO4znY792+bByra5bzNKf8NERFnyKB9iOfxjXF9X67/r2e+yzDxYKD98f5k5b/TDZ0/KW9+Av7kgr9l+v1kwY8wkXxCVE7e3JAuDRKdvcIFDi6ofCdOs8JFddtLmqUnrRo6px0wfd4V2vwRF4piukuaFTtyxW3sMWs6Z43XrrFwJq5tfLygQsm2idOhZSWAs2qizgivd+H6p/K1/06Wt8bZH7r08cblPTNo42rqDGOy3ijVT12UfIw3rM0T5X5Ebf8i2Z/Qkr9wgl6lXZjftiR0vHYgmWapbXw74+qGoC2K+uSIOD+6+qCm3W/bxwi3o06ZPauumfm8YQyYMmxzUn9HRMQZsdA++PnBoH4v1vcfb77LMNlgoD1w/MPyfyf1iZP/YJlAjU8o/0wGYoVGMttPxeTNDZlqzO1nhTSpHqhnaVhIE7e9Cxg1fLcu7Wfc2sm6OrO84s3SKFX4wvStTuuU6Wsn+bsqUBte2+Lfsnzvld9pP4/yWNtKAGfVOC9GDUvl9c/zn7WiUWm/vznT1x1h+XTlvL4eaqqr0nX1ePXNbEo+xhvW5oly2awts7YeSOQj8hdO0Ku0C/PbloRevR1I99v953E7I/WD7xeUJoERsdvauqRqPO9t1cdwdVRWh1CnzJ4V18xc59r8UWGxzUFExJk1bh+ivkN1fX+9812GCQYDZXAUDZ7MD6sdgMr36jpT+TYmEV0HSv87TDSfqJWTNzdk6uLazwppEl3Q7FyMPj0LF9Vt79IrT5uktWk/49o0aXu9o3Sy6RKle9wZD78XbN/c6ZfjpK6dsXzN8mNGx/CVR2G7qEKx5SGuQErHxG4Z512tz3s270TXP6uLjT6vFOum8fY3r6bqc6k3auuFhm3KbYIxcU06J/kYb1ibJ5om6iJtG5zIR+QvnKBXaRfmty0JHbMd8H9z+s+jbYrtjFb65uYY8SAfEbt
vWKek/i7a7er7GKV6mzpl9kxeM9MWXa1PWGpzEBFxNk20D7aO9+OMuL53f/Ne43yXYXLBQBkYVWo6PHl0M08MfzL6s/zHy/blbez3UtvIZ1PWcKYac/tZYVAeXtDExTUWLqrbxnVAK7afB5s6y+Hfk4UiTrt2wcDqtK74e/LYov9O8tqm80983gQD59F0XnONh64Xw+tfUU4aGxXZJt5f8Pd5s1x3GyvKfcH6YGCyDEd1QTclH+MNa/NVOU+ky6SY6itobf4K+qWIH+BV2oX5bUtCx2kH0tvGfehiO+O0aW22+Xe6bULEjmrrh5Z1amMfI58bS1o7tsCpMTlGMdf2an3CVJuDiIgzaGX74Nr4Qn3/Eee7DB85GOh/uPm3/Pjge+6k805R1snyna78pPJtJKHcyU/fYDfVmNvPChcvvKDu3JIZIDt/t72ca+oY8TZdNX3uwd98Jyw5QSJ5L7sWLu2LaZYXVPPvuuO1u3Ypw++F1y19Dd0ERH6c5Lm1rARwVg3rjPBzn1+t/vrbvBAPRvx2Wb4ZY3+Fv8+XzXVtRTpWNOyZqfJqv9P1NCcf403r8kaxnQ3LdLht8e9xPrP1QynvIV7Rq7QLc9uWhI7TDrjPmvrtdW2/219N+46I3dHWsXX9g9gr9DGaxgw4faaumckrV+wTptscREScOavadN+fsEp9bz/7OPNdhsm+MzDSDaaiHxqedGiwjQQ7YsNOU3qb6Rvsphpz+1nh4oUX1P93+J0gzVwmijuR/sKX0jDOSF00LhzOOGAmaZgXQp/Ohe+W076Uh31hLnTg7b5N3mtx7bJt/d+1xWtVvLbu+MF19Mc3+5Nt7Pej808dZzoNroM1kWeDcw7Pe7716VaoR7xZnvPXP5Fns3wd5btW+4v/Pk9WpmWQb316leqauIwWjLepuR6dknyMN69rg/N80ar/FJdz+2/yFk7Sq7QLV/lO1xynHYjTK9wmL9+2rUm14UH/NG/zEbGTJvqhbRy7j+GPQ50yQyaumWk3rnoNK9scREScLWva9GwuS+r7RD/juua7DB8/GGj0J+l+aOLvWtdRyrdJdbyK20znJEyqMbefFc45vqBhVNeoL3whY7jti2mS+E72t+6bF5Lq80/lKfe9cNtiOmbbFK5XnNZh3mu6dnqbYKKh/P342vq8UdjWH8P/JntecYdxJiYm/bkFv710PUoVYirvz6M+7RJ1Z55ngusf5zn9PVceZJsx9zfXtqhrE+kd/j1ZZrN09pb+3kXJxzgdFvsHcR5xZb7UiY/yIxN3OHkb2gXbR4rza8N3Ou+47UBzv932TSvSUcYflH/EbitlPWlWP6T7C/V9jEhf/1CnzJCla2bamquPN+raHEREnCHr2nT/t0J9/5HmuwzXGgxERKzUVnRxIKU4iEp2hm2lefUONiIiIiIiIiIiIiLivGggGIiIN6IN9CXvaBDdnQ3lpwArntJARERERERERERERMSCBoKBiHgD5oG+4tIp4ZOC9cFAlgpFRERERERERERERKzXQDAQEW/A/B0tYVDPvY8hDwhWLxNKMBARERERERERERERsUkDwUBEvAF9MLC0TGj01J9/gWoe+JOXohIMRERERERERERERERs0kAwEBFvwKqlPn2wLwwS+oCg82t18IZlQhERERERERERERER22ggGIiIN2DV+wATwcBYv0zoj/9O/A0RERERERERERERETMNBAMR8UZ8+6+vy+8DjJ4YTL0z0H4veK8gIiIiIiIiIiIiIiKmNRAMRMQbMl7uU94HGAT6/BKh2VOA/qlAlghFRERERERERERERGzWQDAQEW9QCQB6S08KagvvDGR5UERERERERERERETEthoIBiIiIiIiIiIiIiIiIiJ2UAPBQERERERERERERERERMQOaiAYiIiIiIiIiIiIiIiIiNhBDQQDERERERERERERERERETuogWAgIiIiIiIiIiIiIiIiYgc1EAxERERERERERERERERE7KA
GgoGIiIiIiIiIiIiIiIiIHdRAMBARERERERERERERERGxgxoIBiIiIiIiIiIiIiIiIiJ2UAPBQERERERERERERERERMQOaiAYiIiIiIiIiIiIiIiIiNhBDQQDERERERERERERERERETuogWAgIiIiIiIiIiIiIiIiYgc1jBUMBAAAAAAAAAAAAAAAAIDZgWAgAAAAAAAAAAAAAAAAQEchGAgAAAAAAAAAAAAAAADQUQgGAgAAAAAAAAAAAAAAAHQUgoEAAAAAAAAAAAAAAAAAHYVgIAAAAAAAAAAAAAAAAEBHIRgIAAAAAAAAAAAAAAAA0FGqg4FK/f/iav+e06yChQAAAABJRU5ErkJggg==)
515
+
516
+ #### Getting OneHotEncoder Catagory Info and Testing Ordinal Encoding for Age (can be ignored)
517
+ """
518
+
519
+ # data_cat_tr = cat_pipeline.fit_transform(patients_info)
520
+ # data_height_tr = gender_pipeline.fit_transform(data_cat_tr)
521
+
522
+ # data_height_tr
523
+
524
+ # data_height_tr_age = data_height_tr[['Age']]
525
+
526
+ # cat_encoder = OrdinalEncoder()
527
+
528
+ # patients_age_ord = cat_encoder.fit_transform(data_height_tr_age)
529
+
530
+ # patients_age_ord
531
+
532
+ # cat_encoder.categories_
533
+ # array([[7.],
534
+ # [6.],
535
+ # [6.],
536
+ # ...,
537
+ # [4.],
538
+ # [7.],
539
+ # [5.]])
540
+ # [array(['10 - 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '60 - 69',
541
+ # '70 - 79', '80 - 89', '90+'], dtype=object)]
542
+
543
+ # cat_encoder.categories_
544
+ # [array(['10 - 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '60 - 69',
545
+ # '70 - 79', '80 - 89', '90+'], dtype=object),
546
+ # array(['female', 'male'], dtype=object),
547
+ # array(['ASIAN', 'BLACK OR AFRICAN AMERICAN',
548
+ # 'CAUCASIAN', 'CHINESE', 'HAN CHINESE', 'HISPANIC', 'INDIAN',
549
+ # 'INTERMEDIATE', 'JAPANESE', 'KOREAN', 'MALAY', 'OTHER',
550
+ # 'OTHER MIXED RACE', 'UNSPECIFIED', 'WHITE'], dtype=object),
551
+ # array([0., 1.]),
552
+ # array([0., 1.]),
553
+ # array([0., 1.]),
554
+ # array(['*1/*1', '*1/*11', '*1/*13', '*1/*14', '*1/*2', '*1/*3', '*1/*5',
555
+ # '*1/*6', '*2/*2', '*2/*3', '*3/*3'], dtype=object),
556
+ # array(['A/A', 'A/G', 'G/G', 'Unknown'], dtype=object)]
557
+
558
+
559
"""## Sending training features data through pre-processing pipeline
##### patients_info -> 'X_train_prepared'
##### y_train stored in 'patients_labels'
"""

# NOTE(review): the opening triple-quote of the markdown cell above was
# missing in the original, so its lone closing `"""` started a string
# literal that swallowed all the code down to the next triple-quote —
# X_train_prepared was never assigned and lin_reg.fit() raised NameError.

## showing un-pre-processed dataset
patients_info.head()

# Run the raw training features through the shared pre-processing pipeline.
# train=True presumably fits the pipeline's encoders/imputers on this data —
# confirm against the full_preprocess_function definition.
X_train_prepared = full_preprocess_function(patients_info, train=True)

# showing pre-processed training dataset
X_train_prepared.head()

X_train_prepared.info()

"""##### Send pre-processed train_data to excel (labels too)"""

# X_train_prepared.to_excel("X_patients_train.xlsx")
# patients_labels.to_excel('y_patients_train.xlsx')

"""## Making Sure Pre-processed training set works with basic model"""

from sklearn.linear_model import LinearRegression

# Sanity check: a plain linear regression must fit the prepared matrix.
lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, patients_labels)

patients_labels

from sklearn.metrics import mean_squared_error

# Training-set RMSE — an optimistic estimate; real evaluation happens on
# the test and validation sets further down.
patients_predictions = lin_reg.predict(X_train_prepared)
lin_mse = mean_squared_error(patients_labels, patients_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse
596
+
597
"""## Pre-processing on Test Set (currently stored in strat_test_set)
##### note: strat_test_set contains features and labels
##### produces X_test_prepared and y_test

#### Separate strat_test_set features from labels
##### stored in X_test and y_test
"""

# Split the stratified test set into features (X_test) and target (y_test).
target_column = "Therapeutic Dose of Warfarin"
X_test = strat_test_set.drop(target_column, axis=1)
y_test = strat_test_set[target_column].copy()

"""#### Send X_test to pre-processing function/pipeline
##### stored in X_test_prepared
"""

# Re-use the pipeline state established on the training data (no train flag).
X_test_prepared = full_preprocess_function(X_test)

"""##### Send pre-processed test_data to excel (labels too)"""

# X_test_prepared.to_excel("X_patients_test.xlsx")
# y_test.to_excel("y_patients_test.xlsx")

"""## Making sure Pre-processed testing set works with simple regression model"""

test_predictions = lin_reg.predict(X_test_prepared)

"""#### Evaluate mse and rmse"""

# NOTE(review): unlike the validation set below, NaN labels are not dropped
# here — presumably strat_test_set was already cleaned upstream; confirm.
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)

test_rmse
629
+
630
"""## Pre-processing on Validation Set
##### produces X_val_prepared and y_val

#### Dropping nan labels and Separating validation_set features from labels
##### originally stored in 'X_val' and 'y_val'
"""

# Rows with an unknown dose cannot be scored, so drop them before splitting.
# (Mutates the shared validation_set DataFrame in place.)
validation_set.dropna(subset=['Therapeutic Dose of Warfarin'], inplace=True)
X_val = validation_set.drop("Therapeutic Dose of Warfarin", axis=1)
y_val = validation_set["Therapeutic Dose of Warfarin"].copy()

"""## Sending a single instance from X_val through pre-processing pipeline and making sure it works with simple regression model"""

# One arbitrary row (position 3) to exercise the single-instance path.
trial = X_val.iloc[3]
trial

trial.shape

# Wrap the Series in a one-row DataFrame — the shape the pipeline expects.
trial_df = series_to_df(trial)

# example of input for full_preprocessing_function()
trial_df

X_val_trial = full_preprocess_function(trial_df)

# example of pre-processed single test input
X_val_trial

trial_val_prediction = lin_reg.predict(X_val_trial)

trial_val_prediction

# Ground-truth dose for the same row, for a manual comparison.
y_trial = y_val.iloc[3]
y_trial

"""#### Sending X_val through pre-processing pipeline"""

X_val_prepared = full_preprocess_function(X_val)

"""## Making sure pre-processed validation set works with simple regression model"""

val_predictions = lin_reg.predict(X_val_prepared)

val_mse = mean_squared_error(y_val, val_predictions)
val_rmse = np.sqrt(val_mse)

val_rmse

"""##### Send pre-processed validation_data to excel (labels too)"""

# X_val_prepared.to_excel("X_patients_val.xlsx")
# y_val.to_excel("y_patients_val.xlsx")
682
+
683
"""#**PART II ----> ML MODELS FOR BINARY CLASSIFICATION**

**First let's create a binary classification dataset by cutting the target values into two categories (<30 mg , >=30 mg)**
"""

import numpy as np

y_train = patients_labels

# Doses of 30 mg or more form the positive class for every split.
DOSE_CUTOFF_MG = 30

# Preparing training/testing/validation data for the binary classifier.
train_label_binary = y_train >= DOSE_CUTOFF_MG
print("binary train labels:", train_label_binary)

# print("original test labels:", y_test)
test_label_binary = y_test >= DOSE_CUTOFF_MG
print("binary test labels:", test_label_binary)

validation_label_binary = y_val >= DOSE_CUTOFF_MG
print("binary validation labels:", validation_label_binary)
702
+
703
"""## 1.LOGISTIC REGRESSION MODEL

Logistic regression works for binary classification because it estimates the
probability that an instance belongs to the positive class. Using a 50%
probability threshold, instances scoring above it go to the positive class (1)
and the rest to the negative class (0) — i.e. it behaves like linear
regression whose raw output is passed through the logistic function.
"""

from sklearn.linear_model import LogisticRegression

# Fit on the binary labels and predict back on the training set in one chain.
log_regression = LogisticRegression(penalty='l2', C=1, random_state=0)
log_prediction = log_regression.fit(
    X_train_prepared, train_label_binary.values.ravel()
).predict(X_train_prepared)
log_prediction
713
+
714
"""## 2.SUPPORT VECTOR MACHINE
Support Vector Machines fit the widest possible "street" between the classes:
they maximise the margin between the decision boundary and the training
instances. Other linear classifiers may separate a linearly separable dataset
correctly, yet leave the boundary so close to the training instances that the
model generalises poorly on new data — SVMs avoid this by seeking the widest
possible margin.
"""

from sklearn.svm import SVC

# # define linear kernel,
# svm_model_linear = SVC(kernel = "linear",C = 1 )
# svm_model_linear.fit(X_train_prepared, train_label_binary.values.ravel())
# svm_linear_prediction= svm_model_linear.predict(X_train_prepared)
# svm_linear_prediction

# Polynomial kernel (P158); fit and predict on the training set in one chain.
svm_model_polynomial = SVC(kernel="poly", degree=7, C=7)
svm_polynomial_prediction = svm_model_polynomial.fit(
    X_train_prepared, train_label_binary.values.ravel()
).predict(X_train_prepared)

svm_polynomial_prediction
732
+
733
"""## 3.DECISION TREE MODEL"""

from sklearn.tree import DecisionTreeClassifier

# Depth-capped tree (max_depth=5), fit and evaluated on the training set.
decision_tree_model = DecisionTreeClassifier(max_depth=5)
decision_tree_prediction = decision_tree_model.fit(
    X_train_prepared, train_label_binary.values.ravel()
).predict(X_train_prepared)
decision_tree_prediction
741
+
742
"""## 4.RANDOM FOREST MODEL"""

from sklearn.ensemble import RandomForestClassifier

# max_leaf_nodes must be None (unlimited) or an int >= 2; the original
# value of -1 makes scikit-learn raise ValueError at fit time.
random_forest_model = RandomForestClassifier(n_estimators = 500, max_depth= 10, max_leaf_nodes = None)
random_forest_model.fit(X_train_prepared, train_label_binary.values.ravel())
random_forest_prediction = random_forest_model.predict(X_train_prepared)
random_forest_prediction
749
+
750
"""## 5.NEURAL NETWork""" if False else """## 5.NEURAL NETWORK"""

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout

# Decision threshold applied to the sigmoid output: probabilities at or
# above this value are classified as positive. (Stray trailing semicolon
# removed — it is legal but un-Pythonic.)
NN_threshold = 0.5
760
+
761
def build_NN(n_layers = 3, n_neurons = 1000, dropout = 0):
    """Build and compile a Keras MLP for binary classification.

    Args:
        n_layers: total number of Dense layers counting the output layer,
            so n_layers - 1 hidden layers are created.
        n_neurons: width of each hidden layer.
        dropout: dropout rate applied after every hidden layer (0 disables
            dropout in effect, though the layer is still added).

    Returns:
        A compiled Sequential model (binary cross-entropy loss, Adam).
    """
    model = Sequential()
    for _ in range(n_layers - 1):  # hidden layers; output layer added below
        model.add(Dense(n_neurons, activation = 'relu'))
        model.add(Dropout(dropout))
    # A single sigmoid output neuron suffices for binary classification
    # (the original comment claiming "2 output neurons" was incorrect).
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ['accuracy'])
    return model
769
+
770
# Instantiate a small network: 3 Dense layers, 10 neurons per hidden layer.
NN_model = build_NN(n_layers=3, n_neurons=10)

# Train on the binary labels, monitoring the validation split every epoch.
train_history = NN_model.fit(
    X_train_prepared,
    train_label_binary.values.ravel(),
    validation_data=(X_val_prepared, validation_label_binary.values.ravel()),
    batch_size=128,
    epochs=20,
)
NN_prediction = NN_model.predict(X_train_prepared)

# Binarise the sigmoid probabilities so they are comparable with the labels.
NN_prediction = NN_prediction >= NN_threshold
779
"""## **Calculating the performance of each model in the train dataset**"""

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

# Training-set predictions of every model, with matching display names.
methods = [decision_tree_prediction, random_forest_prediction, svm_polynomial_prediction, log_prediction, NN_prediction]
names = ["decision_tree_model", "random_forest_model", "svm_polynomial_model", "log_model", "neural_net"]

# Metric label -> scoring function, each evaluated against the binary labels.
metric_funcs = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "ROC": roc_auc_score,
    "F1 score": f1_score,
}

data = {"Method": names}
for metric_label, score_fn in metric_funcs.items():
    data[metric_label] = [score_fn(train_label_binary, pred) for pred in methods]

# Keep the per-metric lists available under their original global names.
accuracy = data["Accuracy"]
precision = data["Precision"]
recall = data["Recall"]
ROC = data["ROC"]
F1 = data["F1 score"]

evaluation = pd.DataFrame(data, columns=['Method', "Accuracy", "Precision", "Recall", "ROC", "F1 score"])
evaluation
810
+
811
+ """## **Let's do a better Evaluation Using Cross-Validation**
812
+
813
+ **Logistic Regression cross validation**
814
+ """
815
+
816
+ from sklearn.model_selection import cross_val_score, GridSearchCV
817
+ from sklearn.linear_model import LogisticRegression
818
+ log_regression= LogisticRegression(solver ='liblinear')
819
+ penalty = ['l1', 'l2']
820
+ C = [1,0.1,0.01,0.001]
821
+ hyperparameters = dict(C=C, penalty=penalty)
822
+ classifier = GridSearchCV(log_regression, hyperparameters, cv=10, verbose =0)
823
+ best_model = classifier.fit(X_train_prepared, train_label_binary )
824
+
825
+ #printing out the best parameters for Logistic Regression model
826
+ print('Best penalty:', best_model.best_estimator_.get_params()['penalty'])
827
+ print('Best C:', best_model.best_estimator_.get_params()['C'])
828
+
829
+ model = LogisticRegression(solver ='liblinear', **best_model.best_params_)
830
+ model.fit(X_train_prepared, train_label_binary )
831
+ logistic_prediction= model.predict(X_train_prepared)
832
+ logistic_prediction
833
+
834
+ #calculating the accuracy of the model
835
+ scores = cross_val_score(model, X_train_prepared, train_label_binary )
836
+ scores
837
+ print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
838
+
839
+ from sklearn.model_selection import cross_val_predict
840
+ from sklearn.metrics import roc_curve
841
+
842
+ y_scores = cross_val_predict(model, X_train_prepared, train_label_binary, cv= 10, method = "decision_function") #decision_function
843
+ fpr, tpr, thresholds = roc_curve (train_label_binary, y_scores)
844
+
845
+ def plot_roc_curve(fpr, tpr, label =None):
846
+ plt.plot(fpr, tpr, linewidth=2, label = label)
847
+ plt.plot([0,1], [0,1], "k--")
848
+ plot_roc_curve(fpr, tpr)
849
+ plt.title('ROC curve for Logistic Regression')
850
+ plt.xlabel('False Positive Rate (1- specifity')
851
+ plt.ylabel('True Positive Rate (Recall)')
852
+ plt.legend(['Logistic Regression'],loc ="lower right")
853
+ plt.grid()
854
+ plt.show()
855
+
856
+ """**Support Vector Machine Cross validation**"""
857
+
858
+ from sklearn.svm import SVC
859
+
860
+ # hyperparameter_set = {'C': [0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': [0.001, 0.01, 0.1, 1]}
861
+ # svm = SVC()
862
+ # classifier2 = GridSearchCV(svm, hyperparameter_set, cv=10, verbose =0)
863
+ # best_SV = classifier2.fit(X_train_prepared, train_label_binary )
864
+
865
+ # #printing out the best parameters for SVM model
866
+ # print('Best kernel:', best_SV.best_params_['kernel'])
867
+ # print('Best C:', best_SV.best_params_['C'])
868
+ # print('Best gamma:', best_SV.best_params_['gamma'])
869
+
870
+ SVM_final_model = SVC(C=1, kernel= 'rbf', gamma = 0.1, probability=True)
871
+ SVM_final_model.fit(X_train_prepared, train_label_binary)
872
+ svm_prediction= SVM_final_model.predict(X_train_prepared)
873
+ svm_prediction
874
+
875
+ #calculating the accuracy of the model
876
+ scores = cross_val_score(SVM_final_model, X_train_prepared, train_label_binary )
877
+ scores
878
+ print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
879
+
880
+ #Drawing the ROC curve for SVM
881
+ from sklearn.model_selection import cross_val_predict
882
+ from sklearn.metrics import roc_curve
883
+
884
+ y_scores = cross_val_predict(model, X_train_prepared, train_label_binary, cv= 10, method = "decision_function")
885
+ fpr, tpr, thresholds = roc_curve (train_label_binary, y_scores)
886
+
887
+ def plot_roc_curve(fpr, tpr, label =None):
888
+ plt.plot(fpr, tpr, linewidth=2, label = label)
889
+ plt.plot([0,1], [0,1], "k--")
890
+ plot_roc_curve(fpr, tpr)
891
+ plt.title('ROC curve for Support Vector Machine')
892
+ plt.xlabel('False Positive Rate (1- specifity')
893
+ plt.ylabel('True Positive Rate (Recall)')
894
+ plt.legend(['Support Vector Machine '],loc ="lower right")
895
+ plt.grid()
896
+ plt.show()
897
+
898
+ """**Random Forest Cross Validation**"""
899
+
900
+ # hyperparameter_set = {'n_estimators': [100, 200, 300, 400], 'max_features': ['auto', 'sqrt']}
901
+ # random_forest = RandomForestClassifier()
902
+
903
+ # classifier3 = GridSearchCV(random_forest, hyperparameter_set, cv=10, verbose =0)
904
+ # best_model3 = classifier3.fit(X_train_prepared, train_label_binary )
905
+
906
+ # print('Best n_estimators:', best_model3.best_params_['n_estimators'])
907
+ # print('Best max_features:', best_model3.best_params_['max_features'])
908
+
909
+ model3 = RandomForestClassifier(n_estimators = 200, max_features= 'sqrt')
910
+ model3.fit(X_train_prepared, train_label_binary)
911
+ random_forest_prediction= model3.predict(X_train_prepared)
912
+ random_forest_prediction
913
+
914
+ #calculating the accuracy of the model
915
+ scores = cross_val_score(model3, X_train_prepared, train_label_binary )
916
+ scores
917
+ print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
918
+
919
+ #Drawing the ROC curve for SVM
920
+ from sklearn.model_selection import cross_val_predict
921
+ from sklearn.metrics import roc_curve
922
+
923
+ y_scores = cross_val_predict(model, X_train_prepared, train_label_binary, cv= 10, method = "decision_function") #decision_function
924
+ fpr, tpr, thresholds = roc_curve (train_label_binary, y_scores)
925
+
926
+ def plot_roc_curve(fpr, tpr, label =None):
927
+ plt.plot(fpr, tpr, linewidth=2, label = label)
928
+ plt.plot([0,1], [0,1], "k--")
929
+ plot_roc_curve(fpr, tpr)
930
+ plt.title('ROC curve for Random Forest')
931
+ plt.xlabel('False Positive Rate (1- specifity')
932
+ plt.ylabel('True Positive Rate (Recall)')
933
+ plt.legend(['Random Forest '],loc ="lower right")
934
+ plt.grid()
935
+ plt.show()
936
+
937
+ """**Showing the feature importance analysis in random forest.**"""
938
+
939
+ from pandas import DataFrame
940
+ random_forest = RandomForestClassifier(n_estimators = 300, random_state=60)
941
+ random_forest.fit(X_train_prepared,train_label_binary)
942
+ random_forest_importance = random_forest.feature_importances_
943
+ print(random_forest_importance)
944
+
945
+ features = original_df.columns
946
+
947
+ importances = random_forest_importance
948
+ indices = np.argsort(importances)
949
+
950
+
951
+
952
+ **Calculating the evaluation metrics for each model and then adding the data in pandas DataFrame**
953
+ """
954
+
955
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

# Cross-validated models' training-set predictions and display names.
predictions = [logistic_prediction,svm_prediction, random_forest_prediction]
names = ["Logistic_regression model","Support Vector Machine model", "Random_forest_model"]

# Score every model against the training labels with the same five metrics.
accuracy = [accuracy_score(train_label_binary, p) for p in predictions]
precision = [precision_score(train_label_binary, p) for p in predictions]
recall = [recall_score(train_label_binary, p) for p in predictions]
ROC = [roc_auc_score(train_label_binary, p) for p in predictions]
F1 = [f1_score(train_label_binary, p) for p in predictions]

# Assemble a per-model summary table for display.
data2 = {'Method': names,
         'Accuracy': accuracy,
         'Precision': precision,
         'Recall': recall,
         'ROC': ROC,
         'F1 score': F1,
         }
evaluation = pd.DataFrame(data2, columns=['Method', "Accuracy", "Precision","Recall", "ROC", "F1 score"])
evaluation
984
+
985
+ """**Drawing the ROC curve of all models on the train dataset**"""
986
+
987
+ from sklearn.model_selection import cross_val_predict
988
+ from sklearn.metrics import roc_curve
989
+ roc_curve_rates = []
990
+ for model in [model3, SVM_final_model, model]: #models are 'Logistic Regression', 'RandomForestClassifier', 'SVC'
991
+ #finds the predicted probability for the sets and model
992
+ predict_probability = cross_val_predict(model, X_train_prepared, train_label_binary, cv= 10, method = "predict_proba")
993
+ #gets the probs for pos class
994
+ y_scorse = predict_probability[:,1]
995
+ #calculates the fpr and tpr with te scores
996
+ fpr, tpr, threshold = roc_curve(train_label_binary, y_scorse)
997
+ roc_curve_rates.append({'fpr': fpr, 'tpr': tpr})
998
+
999
+
1000
+ #Takes the dics array and plots each line on the same graph
1001
+ line_names = ['Logistic Regression', 'RandomForestClassifier', 'SVC']
1002
+ plt.plot(fpr, tpr, linewidth=2)
1003
+ for i in range(len(roc_curve_rates)):
1004
+ plt.plot(roc_curve_rates[i]['fpr'], roc_curve_rates[i]['tpr'], linewidth=2, label=line_names[i])
1005
+ plt.xlim([0,1])
1006
+ plt.ylim([0,1])
1007
+ plt.plot([0,1], [0,1], "k--")
1008
+ plt.title('ROC curve')
1009
+ plt.xlabel('False Positive Rate (1 - specifity)')
1010
+ plt.ylabel('True Positive Rate (Recall)')
1011
+ plt.legend(loc ="lower right")
1012
+ plt.grid()
1013
+ plt.show()
1014
+
1015
+ """**Optimizing the Neural Network**"""
1016
+
1017
+ # Parameters to check
1018
+ number_of_layers = [3, 4, 5, 6, 7]
1019
+ number_of_neurons = [10, 100, 100, 5000]
1020
+
1021
+ # Variables for saving data
1022
+ best_epoch = [[]];
1023
+ best_accuracy = [[]];
1024
+ i = 0;
1025
+
1026
+ # Add early stopping into model training
1027
+ from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
1028
+ keras_callbacks = [
1029
+ EarlyStopping(monitor='val_loss', patience=5, mode='min', min_delta=0.0001),
1030
+ ]
1031
+
1032
+ # Loop through all parameters
1033
+ for layers in number_of_layers:
1034
+ for neurons in number_of_neurons:
1035
+ print("Testing NN - Layers: "+ str(layers) + "; Neurons per layer:" + str(neurons))
1036
+ NN_model = build_NN(layers, neurons)
1037
+ train_history = NN_model.fit(X_train_prepared, train_label_binary.values.ravel(), validation_data=(X_val_prepared,validation_label_binary.values.ravel()), batch_size=128, epochs = 30, callbacks=keras_callbacks)
1038
+ # Using validation accuracy as performance metric
1039
+ accuracy = train_history.history['val_accuracy']
1040
+ best_accuracy[i].append(max(accuracy))
1041
+ best_epoch[i].append(accuracy.index(max(accuracy)))
1042
+ i = i + 1;
1043
+ best_epoch.append([])
1044
+ best_accuracy.append([])
1045
+
1046
+ # Remove last element
1047
+ best_epoch.pop(i)
1048
+ best_accuracy.pop(i)
1049
+
1050
+ # Build model with best parameters
1051
+ ideal_layers_index = best_accuracy.index(max(best_accuracy))
1052
+ ideal_layers = number_of_layers[ideal_layers_index]
1053
+ ideal_neurons = number_of_neurons[best_accuracy[ideal_layers_index].index(max(best_accuracy[ideal_layers_index]))]
1054
+
1055
+ # Print Results
1056
+ print("Best number of layers:", str(ideal_layers))
1057
+ print("Best number of neurons:", str(ideal_neurons))
1058
+
1059
+ """## **Evaluate all the models on the Test Set**
1060
+
1061
+
1062
+ """
1063
+
1064
+ #Logistic Regression
1065
+ logistic_regression_final_model = LogisticRegression(solver ='liblinear', **best_model.best_params_)
1066
+ logistic_regression_final_model.fit(X_train_prepared, train_label_binary )
1067
+ logistic_prediction_test= logistic_regression_final_model.predict(X_test_prepared)
1068
+ logistic_prediction_test
1069
+
1070
+ #Support Vector Machine
1071
+ SVM_final_model = SVC(C=0.1, kernel= 'linear', gamma = 'scale', probability=True)
1072
+ SVM_final_model.fit(X_train_prepared, train_label_binary)
1073
+ svm_prediction_test= SVM_final_model.predict(X_test_prepared)
1074
+ svm_prediction_test
1075
+
1076
+ # Random Forest Classifier
1077
+ random_forest_final_model = RandomForestClassifier(n_estimators = 400, max_features= 'sqrt')
1078
+ random_forest_final_model.fit(X_train_prepared, train_label_binary)
1079
+ random_forest_prediction_test= random_forest_final_model.predict(X_test_prepared)
1080
+ random_forest_prediction_test
1081
+
1082
+ # Neural Network
1083
+ keras_callbacks = [
1084
+ EarlyStopping(monitor='val_loss', patience=10, mode='min', min_delta=0.0001),
1085
+ ModelCheckpoint('./checkmodel.h5', monitor='val_loss', save_best_only=True, mode='min')
1086
+ ]
1087
+ NN_final_model = build_NN(ideal_layers, ideal_neurons, dropout=0.15)
1088
+ NN_final_model.fit(X_train_prepared, train_label_binary, validation_data=(X_val_prepared,validation_label_binary.values.ravel()), batch_size=128, epochs = 30, callbacks=keras_callbacks)
1089
+ NN_prediction_test= NN_final_model.predict(X_test_prepared)
1090
+
1091
+ # Prepare prediction to be comparable
1092
+ NN_prediction_test = (NN_prediction_test >= NN_threshold)
1093
+
1094
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score

# Test-set predictions from each final model, with matching display names.
predictions = [logistic_prediction_test,svm_prediction_test, random_forest_prediction_test, NN_prediction_test]
names = ["Logistic_regression_test","Support_vector_machine_test", "Random_forest_test", "Neural_net_test"]

# Score every model against the held-out test labels.
accuracy = [accuracy_score(test_label_binary, p) for p in predictions]
precision = [precision_score(test_label_binary, p) for p in predictions]
recall = [recall_score(test_label_binary, p) for p in predictions]
ROC = [roc_auc_score(test_label_binary, p) for p in predictions]
F1 = [f1_score(test_label_binary, p) for p in predictions]

# Assemble a per-model summary table for display.
data3 = {'Method': names,
         'Accuracy': accuracy,
         'Precision': precision,
         'Recall': recall,
         'ROC': ROC,
         'F1 score': F1,
         }
evaluation = pd.DataFrame(data3, columns=['Method', "Accuracy", "Precision","Recall", "ROC", "F1 score"])
evaluation
1123
+
1124
+ """**Trade-off between precision and recall** **for** :
1125
+
1126
+ 1. Logistic Regression
1127
+ 2. Support Vector Machine
1128
+ 1. Random Forest
1129
+
1130
+
1131
+
1132
+
1133
+
1134
+ """
1135
+
1136
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Probability of the positive class from logistic regression on the test set.
y_score = logistic_regression_final_model.predict_proba(X_test_prepared)[:, 1]

#calculate precision and recall across all decision thresholds
precision, recall, thresholds = precision_recall_curve(test_label_binary, y_score)
#create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='red')
#add axis labels to plot
ax.set_title('Precision-Recall Curve for Logistic Regression')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')

#display plot
plt.grid(True)
plt.show()
1154
+
1155
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Probability of the positive class from the random forest on the test set.
y_score = random_forest_final_model.predict_proba(X_test_prepared)[:, 1]

#calculate precision and recall across all decision thresholds
precision, recall, thresholds = precision_recall_curve(test_label_binary, y_score)
#create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='blue')
# BUG FIX: this plot uses random_forest_final_model, but the title previously
# said 'Support Vector Machine'.
ax.set_title('Precision-Recall Curve for Random Forest')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')

#display plot
plt.grid(True)
plt.show()
1173
+
1174
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Probability of the positive class from the SVM on the test set.
y_score = SVM_final_model.predict_proba(X_test_prepared)[:, 1]

#calculate precision and recall across all decision thresholds
precision, recall, thresholds = precision_recall_curve(test_label_binary, y_score)
#create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='purple')
# BUG FIX: this plot uses SVM_final_model, but the title previously said
# 'Random Forest Model'.
ax.set_title('Precision-Recall Curve for Support Vector Machine')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')

#display plot
plt.grid(True)
plt.show()
1192
+
1193
+ """**Drawing the ROC curve of all models on the test dataset**"""
1194
+
1195
+ from sklearn.model_selection import cross_val_predict
1196
+ from sklearn.metrics import roc_curve
1197
+ roc_curve_rates = []
1198
+ for model in [logistic_regression_final_model, random_forest_final_model,SVM_final_model]: #models are 'Logistic Regression', 'RandomForestClassifier', 'SVC'
1199
+ #finds the predicted probability for the sets and model
1200
+ predict_probability = cross_val_predict(logistic_regression_final_model, X_test_prepared, test_label_binary, cv= 10, method = "predict_proba")
1201
+ #gets the probs for pos class
1202
+ y_scorse = predict_probability[:,1]
1203
+ #calculates the fpr and tpr with te scores
1204
+ fpr, tpr, threshold = roc_curve(test_label_binary, y_scorse)
1205
+ roc_curve_rates.append({'fpr': fpr, 'tpr': tpr})
1206
+
1207
+ #Takes the dics array and plots each line on the same graph
1208
+ line_names = ['Logistic Regression', 'RandomForestClassifier', 'SVC']
1209
+ plt.plot(fpr, tpr, linewidth=2)
1210
+ for i in range(len(roc_curve_rates)):
1211
+ plt.plot(roc_curve_rates[i]['fpr'], roc_curve_rates[i]['tpr'], linewidth=2, label=line_names[i])
1212
+ plt.xlim([0,1])
1213
+ plt.ylim([0,1])
1214
+ plt.plot([0,1], [0,1], "k--")
1215
+ plt.title('ROC curve')
1216
+ plt.xlabel('False Positive Rate (1 - specifity)')
1217
+ plt.ylabel('True Positive Rate (Recall)')
1218
+ plt.legend(loc ="lower right")
1219
+ plt.grid()
1220
+ plt.show()
1221
+
1222
+ """#**PART III ----> Gradio Implementation**
1223
+
1224
+
1225
+
1226
+
1227
+ """
1228
+
1229
+ # Install Gradio
1230
+ !pip install --quiet gradio
1231
+
1232
+ # Import Gradio Library
1233
+ import gradio as gr
1234
+
1235
# Define callback function
def warfarin_callback(age, height, weight, gender, race, diabetes, medication, Cyp2C9, VKORC1, INR, model):
    """Gradio callback: predict the Warfarin dose class for one patient.

    Builds a one-row DataFrame matching the training schema, runs it through
    the shared preprocessing pipeline, and classifies it with the selected
    model. Returns a human-readable dose recommendation, or a validation
    message when a required input is missing.
    """
    # Input validation
    if not gender:
        return "Please select the patient's gender"
    if not race:
        return "Please select the patient's race"

    # Extract medication flags from the checkbox group.
    simvastatin = 0.0
    amiodarone = 0.0
    if 'Simvastatin (Zocor)' in medication: simvastatin = 1.0
    if 'Amiodarone (Cordarone)' in medication: amiodarone = 1.0

    # Categorize age into the dataset's decade buckets.
    # BUG FIX: clamp the index — age 100 (the slider maximum) previously
    # produced index 9 on a 9-element list and raised IndexError; such ages
    # now fall into the '90+' bucket.
    age_categories = ['10 - 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '60 - 69', '70 - 79', '80 - 89', '90+']
    age_index = min(int(np.floor(age / 10)) - 1, len(age_categories) - 1)
    age_category = age_categories[age_index]

    # Column order/names must match the training data exactly.
    input_df = pd.DataFrame([[gender.lower(), race, age_category, height, weight, float(diabetes), simvastatin, amiodarone, 0.0, INR, Cyp2C9, VKORC1]], columns=["Gender", "Race (Reported)", "Age", "Height (cm)", "Weight (kg)", "Diabetes", "Simvastatin (Zocor)", "Amiodarone (Cordarone)", "Target INR", "INR on Reported Therapeutic Dose of Warfarin", "Cyp2C9 genotypes", "VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T"])
    preprocessed_input_df = full_preprocess_function(input_df)

    # Model Selection
    if model == "Logistic Regression":
        prediction = logistic_regression_final_model.predict(preprocessed_input_df)
    elif model == "Support Vector Machine":
        prediction = SVM_final_model.predict(preprocessed_input_df)
    elif model == "Random Forest":
        prediction = random_forest_final_model.predict(preprocessed_input_df)
    elif model == "Neural Network":
        prediction = NN_final_model.predict(preprocessed_input_df)
        # CONSISTENCY FIX: binarize with >= to match the evaluation code
        # elsewhere in the file (the original used a strict > here).
        prediction = prediction >= NN_threshold
    else:
        return "Please select a Machine Learning Model"

    if prediction:
        return "The recommended Warfarin Dose is >30mg"
    else:
        return "The recommended Warfarin Dose is <=30mg"
1273
+
1274
# Define output module as Warfarin dose
output_dose = gr.Textbox(label = "Warfarin Dose")

# Define all input modules
# NOTE(review): `default=` is the legacy Gradio 2.x keyword; newer Gradio
# releases use `value=` — confirm against the pinned gradio version.
input_age = gr.Slider(10, 100, step=1, label = "Age", default=30)
input_height = gr.Number(label = "Height (cm)")
input_weight = gr.Number(label = "Weight (kg)")
input_gender = gr.Radio(choices=["Male", "Female"], label = "Gender")
input_race = gr.Dropdown(choices=['Asian', 'Black or African American', 'Caucasian', 'Chinese', 'Han Chinese', 'Hispanic', 'Indian', 'Intermediate', 'Japanese', 'Korean', 'Malay', 'Other','Other Mixed Race', 'Unspecified', 'White'], label = "Race")
input_diabetes = gr.Checkbox(label = "Is the patient Diabetic?")
input_medication = gr.CheckboxGroup(["Simvastatin (Zocor)", "Amiodarone (Cordarone)"], label = "Is the patient taking any of the following medication?")
input_Cyp269 = gr.Dropdown(['*1/*1', '*1/*11', '*1/*13', '*1/*14', '*1/*2', '*1/*3', '*1/*5', '*1/*6', '*2/*2', '*2/*3', '*3/*3'], label = "Cyp2C9 genotype")
input_VKORC1 = gr.Dropdown(['A/A', 'A/G', 'G/G', 'Unknown'], label = "VKORC1 genotype")
input_INR = gr.Slider(1, 5, step=0.01, label = "INR on Reported Therapeutic Dose of Warfarin", default=2.45)
input_model = gr.Dropdown(choices=["Logistic Regression", "Support Vector Machine", "Random Forest", "Neural Network" ], label = "Machine Learning Model")

# Wire the inputs/output into the Gradio interface and launch the app.
# Input order here must match warfarin_callback's parameter order.
gr.Interface(fn=warfarin_callback, inputs=[input_age, input_height, input_weight,input_gender, input_race, input_diabetes, input_medication, input_Cyp269, input_VKORC1, input_INR, input_model], outputs=output_dose).launch(debug=False)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
tensorflow
numpy
matplotlib
scikit-learn
pandas
joblib
gradio
opencv-python