Priyanka-Kumavat-At-TE committed on
Commit
2fc2c1f
1 Parent(s): 3eb0b43

Upload 19 files

Files changed (19)
  1. supv/__init__.py +0 -0
  2. supv/bacl.py +493 -0
  3. supv/basic_nn.py +293 -0
  4. supv/fftn.py +240 -0
  5. supv/gbt.py +482 -0
  6. supv/gcn.py +444 -0
  7. supv/knn.py +106 -0
  8. supv/lrd.py +112 -0
  9. supv/lstm.py +414 -0
  10. supv/mcalib.py +384 -0
  11. supv/mcclf.py +207 -0
  12. supv/nlm.py +434 -0
  13. supv/optunar.py +127 -0
  14. supv/pasearch.py +243 -0
  15. supv/regress.py +253 -0
  16. supv/rf.py +134 -0
  17. supv/svm.py +141 -0
  18. supv/svml.py +428 -0
  19. supv/tnn.py +789 -0
supv/__init__.py ADDED
File without changes
supv/bacl.py ADDED
@@ -0,0 +1,493 @@
+ #!/usr/local/bin/python3
+
+ # avenir-python: Machine Learning
+ # Author: Pranab Ghosh
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
+ # may not use this file except in compliance with the License. You may
+ # obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ # implied. See the License for the specific language governing
+ # permissions and limitations under the License.
+
+ # Package imports
+ import os
+ import sys
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import sklearn as sk
+ import sklearn.metrics
+ import matplotlib
+ import random
+ import jprops
+ from io import StringIO
+ from sklearn.model_selection import cross_val_score
+ import joblib
+ from random import randint
+ sys.path.append(os.path.abspath("../lib"))
+ from util import *
+ from mlutil import *
+ from pasearch import *
+
+ #base classifier class
+ class BaseClassifier(object):
+
+     def __init__(self, configFile, defValues, mname):
+         self.config = Configuration(configFile, defValues)
+         self.subSampleRate = None
+         self.featData = None
+         self.clsData = None
+         self.classifier = None
+         self.trained = False
+         self.verbose = self.config.getBooleanConfig("common.verbose")[0]
+         logFilePath = self.config.getStringConfig("common.logging.file")[0]
+         logLevName = self.config.getStringConfig("common.logging.level")[0]
+         self.logger = createLogger(mname, logFilePath, logLevName)
+         self.logger.info("********* starting session")
+
+     def initConfig(self, configFile, defValues):
+         """
+         initialize config
+         """
+         self.config = Configuration(configFile, defValues)
+
+     def getConfig(self):
+         """
+         get config object
+         """
+         return self.config
+
+     def setConfigParam(self, name, value):
+         """
+         set config param
+         """
+         self.config.setParam(name, value)
+
+     def getMode(self):
+         """
+         get mode
+         """
+         return self.config.getStringConfig("common.mode")[0]
+
+     def getSearchParamStrategy(self):
+         """
+         get search parameter strategy
+         """
+         return self.config.getStringConfig("train.search.param.strategy")[0]
+
+     def train(self):
+         """
+         train model
+         """
+         #build model
+         self.buildModel()
+
+         # training data
+         if self.featData is None:
+             (featData, clsData) = self.prepTrainingData()
+             (self.featData, self.clsData) = (featData, clsData)
+         else:
+             (featData, clsData) = (self.featData, self.clsData)
+         if self.subSampleRate is not None:
+             (featData, clsData) = subSample(featData, clsData, self.subSampleRate, False)
+             self.logger.info("subsample size " + str(featData.shape[0]))
+
+         # parameters
+         modelSave = self.config.getBooleanConfig("train.model.save")[0]
+
+         #train
+         self.logger.info("...training model")
+         self.classifier.fit(featData, clsData)
+         score = self.classifier.score(featData, clsData)
+         successCriterion = self.config.getStringConfig("train.success.criterion")[0]
+         result = None
+         if successCriterion == "accuracy":
+             self.logger.info("accuracy with training data {:06.3f}".format(score))
+             result = score
+         elif successCriterion == "error":
+             error = 1.0 - score
+             self.logger.info("error with training data {:06.3f}".format(error))
+             result = error
+         else:
+             raise ValueError("invalid success criterion")
+
+         if modelSave:
+             self.logger.info("...saving model")
+             modelFilePath = self.getModelFilePath()
+             joblib.dump(self.classifier, modelFilePath)
+         self.trained = True
+         return result
+
+     def trainValidate(self):
+         """
+         train with k fold validation
+         """
+         #build model
+         self.buildModel()
+
+         # training data
+         (featData, clsData) = self.prepTrainingData()
+
+         #parameter
+         validation = self.config.getStringConfig("train.validation")[0]
+         numFolds = self.config.getIntConfig("train.num.folds")[0]
+         successCriterion = self.config.getStringConfig("train.success.criterion")[0]
+         scoreMethod = self.config.getStringConfig("train.score.method")[0]
+
+         #train with validation
+         self.logger.info("...training and kfold cross validating model")
+         scores = cross_val_score(self.classifier, featData, clsData, cv=numFolds, scoring=scoreMethod)
+         avScore = np.mean(scores)
+         result = self.reportResult(avScore, successCriterion, scoreMethod)
+         return result
+
+     def trainValidateSearch(self):
+         """
+         train with k fold validation and search parameter space for optimum
+         """
+         self.logger.info("...starting train validate with parameter search")
+         searchStrategyName = self.getSearchParamStrategy()
+         if searchStrategyName is not None:
+             if searchStrategyName == "grid":
+                 searchStrategy = GuidedParameterSearch(self.verbose)
+             elif searchStrategyName == "random":
+                 searchStrategy = RandomParameterSearch(self.verbose)
+                 maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
+                 searchStrategy.setMaxIter(maxIter)
+             elif searchStrategyName == "simuan":
+                 searchStrategy = SimulatedAnnealingParameterSearch(self.verbose)
+                 maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
+                 searchStrategy.setMaxIter(maxIter)
+                 temp = self.config.getFloatConfig("train.search.sa.temp")[0]
+                 searchStrategy.setTemp(temp)
+                 tempRedRate = self.config.getFloatConfig("train.search.sa.temp.red.rate")[0]
+                 searchStrategy.setTempReductionRate(tempRedRate)
+             else:
+                 raise ValueError("invalid parameter search strategy")
+         else:
+             raise ValueError("missing search strategy")
+
+         # add search params
+         searchParams = self.config.getStringConfig("train.search.params")[0].split(",")
+         searchParamNames = []
+         extSearchParamNames = []
+         if searchParams is not None:
+             for searchParam in searchParams:
+                 paramItems = searchParam.split(":")
+                 extSearchParamNames.append(paramItems[0])
+
+                 #get rid of the name component "search"
+                 paramNameItems = paramItems[0].split(".")
+                 del paramNameItems[1]
+                 paramItems[0] = ".".join(paramNameItems)
+
+                 searchStrategy.addParam(paramItems)
+                 searchParamNames.append(paramItems[0])
+         else:
+             raise ValueError("missing search parameter list")
+
+         # add search param data list for each param
+         for (searchParamName, extSearchParamName) in zip(searchParamNames, extSearchParamNames):
+             searchParamData = self.config.getStringConfig(extSearchParamName)[0].split(",")
+             searchStrategy.addParamVaues(searchParamName, searchParamData)
+
+         # train and validate for various param value combinations
+         searchStrategy.prepare()
+         paramValues = searchStrategy.nextParamValues()
+         searchResults = []
+         while paramValues is not None:
+             self.logger.info("...next parameter set")
+             paramStr = ""
+             for paramValue in paramValues:
+                 self.setConfigParam(paramValue[0], str(paramValue[1]))
+                 paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
+             result = self.trainValidate()
+             searchStrategy.setCost(result)
+             searchResults.append((paramStr, result))
+             paramValues = searchStrategy.nextParamValues()
+
+         # output
+         self.logger.info("all parameter search results")
+         for searchResult in searchResults:
+             self.logger.info("{}\t{:06.3f}".format(searchResult[0], searchResult[1]))
+
+         self.logger.info("best parameter search result")
+         bestSolution = searchStrategy.getBestSolution()
+         paramStr = ""
+         for paramValue in bestSolution[0]:
+             paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
+         self.logger.info("{}\t{:06.3f}".format(paramStr, bestSolution[1]))
+         return bestSolution
+
+     def validate(self):
+         """
+         validate with labeled data
+         """
+         # create model
+         useSavedModel = self.config.getBooleanConfig("validate.use.saved.model")[0]
+         if useSavedModel:
+             # load saved model
+             self.logger.info("...loading model")
+             modelFilePath = self.getModelFilePath()
+             self.classifier = joblib.load(modelFilePath)
+         else:
+             # train model
+             if not self.trained:
+                 self.train()
+
+         # prepare test data
+         (featData, clsDataActual) = self.prepValidationData()
+
+         #predict
+         self.logger.info("...predicting")
+         clsDataPred = self.classifier.predict(featData)
+
+         self.logger.info("...validating")
+         scoreMethod = self.config.getStringConfig("validate.score.method")[0]
+         if scoreMethod == "accuracy":
+             accuracy = sk.metrics.accuracy_score(clsDataActual, clsDataPred)
+             self.logger.info("accuracy:")
+             self.logger.info(accuracy)
+         elif scoreMethod == "confusionMatrix":
+             confMatrx = sk.metrics.confusion_matrix(clsDataActual, clsDataPred)
+             self.logger.info("confusion matrix:")
+             self.logger.info(confMatrx)
+
+     def predictx(self):
+         """
+         predict from file data
+         """
+         # create model
+         self.prepModel()
+
+         # prepare test data
+         featData = self.prepPredictData()
+
+         #predict
+         self.logger.info("...predicting")
+         clsData = self.classifier.predict(featData)
+         self.logger.info(clsData)
+
+     def predict(self, recs=None):
+         """
+         predict with in memory data
+         """
+         # create model
+         self.prepModel()
+
+         #input record
+         if recs:
+             #passed record
+             featData = self.prepStringPredictData(recs)
+             if (featData.ndim == 1):
+                 featData = featData.reshape(1, -1)
+         else:
+             #file
+             featData = self.prepPredictData()
+
+         #predict
+         self.logger.info("...predicting")
+         clsData = self.classifier.predict(featData)
+         return clsData
+
+     def predictProb(self, recs):
+         """
+         predict probability with in memory data
+         """
+         raise ValueError("cannot predict class probability")
+
+     def prepModel(self):
+         """
+         prepare model
+         """
+         useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
+         if (useSavedModel and not self.classifier):
+             # load saved model
+             self.logger.info("...loading saved model")
+             modelFilePath = self.getModelFilePath()
+             self.classifier = joblib.load(modelFilePath)
+         else:
+             # train model
+             if not self.trained:
+                 self.train()
+
+     def prepTrainingData(self):
+         """
+         loads and prepares training data
+         """
+         # parameters
+         dataFile = self.config.getStringConfig("train.data.file")[0]
+         fieldIndices = self.config.getStringConfig("train.data.fields")[0]
+         if fieldIndices is not None:
+             fieldIndices = strToIntArray(fieldIndices, ",")
+         featFieldIndices = self.config.getStringConfig("train.data.feature.fields")[0]
+         if featFieldIndices is not None:
+             featFieldIndices = strToIntArray(featFieldIndices, ",")
+         classFieldIndex = self.config.getIntConfig("train.data.class.field")[0]
+
+         #training data
+         (data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
+         if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
+             scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
+             featData = scaleData(featData, scalingMethod)
+
+         clsData = extrColumns(data, classFieldIndex)
+         clsData = np.array([int(a) for a in clsData])
+         return (featData, clsData)
+
+     def prepValidationData(self):
+         """
+         loads and prepares validation data
+         """
+         # parameters
+         dataFile = self.config.getStringConfig("validate.data.file")[0]
+         fieldIndices = self.config.getStringConfig("validate.data.fields")[0]
+         if fieldIndices is not None:
+             fieldIndices = strToIntArray(fieldIndices, ",")
+         featFieldIndices = self.config.getStringConfig("validate.data.feature.fields")[0]
+         if featFieldIndices is not None:
+             featFieldIndices = strToIntArray(featFieldIndices, ",")
+         classFieldIndex = self.config.getIntConfig("validate.data.class.field")[0]
+
+         #validation data
+         (data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
+         if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
+             scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
+             featData = scaleData(featData, scalingMethod)
+         clsData = extrColumns(data, classFieldIndex)
+         clsData = [int(a) for a in clsData]
+         return (featData, clsData)
+
+     def prepPredictData(self):
+         """
+         loads and prepares prediction data
+         """
+         # parameters
+         dataFile = self.config.getStringConfig("predict.data.file")[0]
+         if dataFile is None:
+             raise ValueError("missing prediction data file")
+         fieldIndices = self.config.getStringConfig("predict.data.fields")[0]
+         if fieldIndices is not None:
+             fieldIndices = strToIntArray(fieldIndices, ",")
+         featFieldIndices = self.config.getStringConfig("predict.data.feature.fields")[0]
+         if featFieldIndices is not None:
+             featFieldIndices = strToIntArray(featFieldIndices, ",")
+
+         #prediction data
+         (data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
+         if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
+             scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
+             featData = scaleData(featData, scalingMethod)
+
+         return featData
+
+     def prepStringPredictData(self, recs):
+         """
+         prepare string predict data
+         """
+         frecs = StringIO(recs)
+         featData = np.loadtxt(frecs, delimiter=',')
+         return featData
+
+     def getModelFilePath(self):
+         """
+         get model file path
+         """
+         modelDirectory = self.config.getStringConfig("common.model.directory")[0]
+         modelFile = self.config.getStringConfig("common.model.file")[0]
+         if modelFile is None:
+             raise ValueError("missing model file name")
+         modelFilePath = modelDirectory + "/" + modelFile
+         return modelFilePath
+
+     def reportResult(self, score, successCriterion, scoreMethod):
+         """
+         report result
+         """
+         if successCriterion == "accuracy":
+             self.logger.info("average " + scoreMethod + " with k fold cross validation {:06.3f}".format(score))
+             result = score
+         elif successCriterion == "error":
+             error = 1.0 - score
+             self.logger.info("average error with k fold cross validation {:06.3f}".format(error))
+             result = error
+         else:
+             raise ValueError("invalid success criterion")
+         return result
+
+     def autoTrain(self):
+         """
+         auto train
+         """
+         maxTestErr = self.config.getFloatConfig("train.auto.max.test.error")[0]
+         maxErr = self.config.getFloatConfig("train.auto.max.error")[0]
+         maxErrDiff = self.config.getFloatConfig("train.auto.max.error.diff")[0]
+
+         self.config.setParam("train.model.save", "False")
+
+         #train, validate and search for optimum parameters
+         result = self.trainValidateSearch()
+         testError = result[1]
+
+         #subsample training size to match train size for k fold validation
+         numFolds = self.config.getIntConfig("train.num.folds")[0]
+         self.subSampleRate = float(numFolds - 1) / numFolds
+
+         #train only with optimum parameter values
+         for paramValue in result[0]:
+             pName = paramValue[0]
+             pValue = paramValue[1]
+             self.logger.info(pName + " " + str(pValue))
+             self.setConfigParam(pName, pValue)
+         trainError = self.train()
+
+         if testError < maxTestErr:
+             # criteria based on test error only
+             self.logger.info("Successfully trained. Low test error level")
+             status = 1
+         else:
+             # criteria based on bias error and generalization error
+             avError = (trainError + testError) / 2
+             diffError = testError - trainError
+             self.logger.info("Auto training completed: training error {:06.3f} test error: {:06.3f}".format(trainError, testError))
+             self.logger.info("Average of test and training error: {:06.3f} test and training error diff: {:06.3f}".format(avError, diffError))
+             if diffError > maxErrDiff:
+                 # high generalization error
+                 if avError > maxErr:
+                     # high bias error
+                     self.logger.info("High generalization error and high error. Need larger training data set and increased model complexity")
+                     status = 4
+                 else:
+                     # low bias error
+                     self.logger.info("High generalization error. Need larger training data set")
+                     status = 3
+             else:
+                 # low generalization error
+                 if avError > maxErr:
+                     # high bias error
+                     self.logger.info("Converged, but with high error rate. Need to increase model complexity")
+                     status = 2
+                 else:
+                     # low bias error
+                     self.logger.info("Successfully trained. Low generalization error and low bias error level")
+                     status = 1
+
+         if status == 1:
+             #train final model, use all data and save model
+             self.logger.info("...training the final model")
+             self.config.setParam("train.model.save", "True")
+             self.subSampleRate = None
+             trainError = self.train()
+             self.logger.info("training error in final model {:06.3f}".format(trainError))
+
+         return status
+
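
A minimal sketch of how a concrete classifier plugs into BaseClassifier above; the subclass, the properties file name, and the choice of SVC are illustrative assumptions, not part of this commit. train() only requires that buildModel() leave an estimator with fit() and score() in self.classifier, and that the properties file also supplies the common.verbose and common.logging.* keys read in the constructor.

from sklearn.svm import SVC

class DemoClassifier(BaseClassifier):  # hypothetical subclass for illustration
    def __init__(self, configFile):
        defValues = {}
        # every property used above needs a default or a "missing" message
        defValues["common.verbose"] = (False, None)
        defValues["train.data.file"] = (None, "missing training data file")
        defValues["train.data.fields"] = (None, "missing field ordinals")
        defValues["train.data.feature.fields"] = (None, "missing feature field ordinals")
        defValues["train.data.class.field"] = (None, "missing class field ordinal")
        defValues["train.model.save"] = (False, None)
        defValues["train.success.criterion"] = ("error", None)
        super(DemoClassifier, self).__init__(configFile, defValues, __name__)

    def buildModel(self):
        # any scikit-learn style estimator works here
        self.classifier = SVC()

clf = DemoClassifier("demo.properties")  # demo.properties is hypothetical
print(clf.train())
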
supv/basic_nn.py ADDED
@@ -0,0 +1,293 @@
+ #!/Users/pranab/Tools/anaconda/bin/python
+
+ # avenir-python: Machine Learning
+ # Author: Pranab Ghosh
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
+ # may not use this file except in compliance with the License. You may
+ # obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ # implied. See the License for the specific language governing
+ # permissions and limitations under the License.
+
+ # Package imports
+ import os
+ import sys
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import sklearn
+ import sklearn.datasets
+ import sklearn.linear_model
+ import matplotlib
+
+
+ if len(sys.argv) != 7:
+     print("usage: <num_hidden_units> <data_set_size> <noise_in_data> <iteration_count> <learning_rate> <training_mode>")
+     sys.exit()
+
+ # number of hidden units
+ nn_hdim = int(sys.argv[1])
+
+ # data set size
+ dsize = int(sys.argv[2])
+
+ # noise in training data
+ noise_level = float(sys.argv[3])
+
+ # iteration count
+ it_count = int(sys.argv[4])
+
+ # learning rate
+ epsilon = float(sys.argv[5])
+
+ #training mode
+ training_mode = sys.argv[6]
+
+ # validation
+ use_validation_data = True
+
+ # Generate a dataset
+ #noise_level = 0.20
+ #noise_level = 0.01
+ vlo = 100
+ vup = vlo + dsize // 5
+ vsize = vup - vlo
+ print("validation data size %d" %(vsize))
+ np.random.seed(0)
+ XC, yc = sklearn.datasets.make_moons(dsize, noise=noise_level)
+
+ print("complete data set generated")
+ def print_array(X, y):
+     print(X)
+     print(y)
+
+
+ # Generate a validation dataset
+ #np.random.seed(0)
+ #XV, yv = sklearn.datasets.make_moons(40, noise=0.20)
+ #print("validation data set generated")
+
+ XV = XC[vlo:vup:1]
+ yv = yc[vlo:vup:1]
+ print("validation data generated")
+ #print_array(XV, yv)
+
+ X = np.delete(XC, np.s_[vlo:vup:1], 0)
+ y = np.delete(yc, np.s_[vlo:vup:1], 0)
+ print("training data generated")
+ #print_array(X, y)
+ print(X)
+ print(y)
+
+
+ # Parameters
+ num_examples = len(X) # training set size
+ nn_input_dim = 2 # input layer dimensionality
+ nn_output_dim = 2 # output layer dimensionality
+
+ #training data indices
+ tr_data_indices = np.arange(num_examples)
+ #print(tr_data_indices)
+
+ # Gradient descent parameters (I picked these by hand)
+ #epsilon = 0.01 # learning rate for gradient descent
+ reg_lambda = 0.01 # regularization strength
+
+
+ # Helper function to evaluate the total loss on the dataset
+ def calculate_loss(X, y, model):
+     W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
+     size = len(X)
+
+     # Forward propagation to calculate our predictions
+     z1 = X.dot(W1) + b1
+     a1 = np.tanh(z1)
+     z2 = a1.dot(W2) + b2
+     exp_scores = np.exp(z2)
+     probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+
+     # Calculating the loss
+     correct_logprobs = -np.log(probs[range(size), y])
+     data_loss = np.sum(correct_logprobs)
+
+     # Add regularization term to loss (optional)
+     data_loss += reg_lambda/2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
+     return 1./size * data_loss
+
+
+ # Helper function to predict an output (0 or 1)
+ def predict(model, x):
+     W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
+
+     # Forward propagation
+     z1 = x.dot(W1) + b1
+     a1 = np.tanh(z1)
+     z2 = a1.dot(W2) + b2
+     exp_scores = np.exp(z2)
+     probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+     return np.argmax(probs, axis=1)
+
+ # This function learns parameters for the neural network in batch mode and returns the model.
+ # - nn_hdim: Number of nodes in the hidden layer
+ # - num_passes: Number of passes through the training data for gradient descent
+ # - validation_interval: Evaluate and print the loss every validation_interval passes
+ def build_model_batch(nn_hdim, num_passes=10000, validation_interval=50):
+     # Initialize the parameters to random values. We need to learn these.
+     np.random.seed(0)
+     W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
+     b1 = np.zeros((1, nn_hdim))
+     W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
+     b2 = np.zeros((1, nn_output_dim))
+
+     # This is what we return at the end
+     model = {}
+
+     # Gradient descent. For each batch...
+     loss = -1.0
+     for i in range(0, num_passes):
+         #print("pass %d" %(i))
+
+         # Forward propagation
+         z1 = X.dot(W1) + b1
+         a1 = np.tanh(z1)
+         z2 = a1.dot(W2) + b2
+         exp_scores = np.exp(z2)
+         probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+
+         # Back propagation
+         delta3 = probs
+         delta3[range(num_examples), y] -= 1
+         dW2 = (a1.T).dot(delta3)
+         db2 = np.sum(delta3, axis=0, keepdims=True)
+         delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
+         dW1 = np.dot(X.T, delta2)
+         db1 = np.sum(delta2, axis=0)
+
+         # Add regularization terms (b1 and b2 don't have regularization terms)
+         dW2 += reg_lambda * W2
+         dW1 += reg_lambda * W1
+
+         # Gradient descent parameter update
+         W1 += -epsilon * dW1
+         b1 += -epsilon * db1
+         W2 += -epsilon * dW2
+         b2 += -epsilon * db2
+
+         # Assign new parameters to the model
+         model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
+
+         # This is expensive because it uses the whole dataset, so we don't want to do it too often.
+         if i % validation_interval == 0:
+             if use_validation_data:
+                 cur_loss = calculate_loss(XV, yv, model)
+             else:
+                 cur_loss = calculate_loss(X, y, model)
+
+             print("Loss after iteration %i: %.8f" %(i, cur_loss))
+             loss = cur_loss
+
+     return model
+
+
+ # This function learns parameters for the neural network in incremental (stochastic) mode and returns the model.
+ # - nn_hdim: Number of nodes in the hidden layer
+ # - num_passes: Number of passes through the training data for gradient descent
+ # - validation_interval: Evaluate and print the loss every validation_interval passes
+ def build_model_incr(nn_hdim, num_passes=10000, validation_interval=50):
+     # Initialize the parameters to random values. We need to learn these.
+     np.random.seed(0)
+     W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
+     b1 = np.zeros((1, nn_hdim))
+     W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
+     b2 = np.zeros((1, nn_output_dim))
+
+     # This is what we return at the end
+     model = {}
+
+     # gradient descent. For each pass...
+     loss = -1.0
+     for i in range(0, num_passes):
+         #print("pass %d" %(i))
+
+         #shuffle training data indices
+         np.random.shuffle(tr_data_indices)
+
+         # all training data, one sample at a time
+         for j in tr_data_indices:
+             Xi = X[j].reshape(1, 2)
+             yi = y[j].reshape(1)
+
+             # Forward propagation
+             z1 = Xi.dot(W1) + b1
+             a1 = np.tanh(z1)
+             z2 = a1.dot(W2) + b2
+             exp_scores = np.exp(z2)
+             probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
+
+             # Back propagation
+             delta3 = probs
+             delta3[0, yi] -= 1
+             dW2 = (a1.T).dot(delta3)
+             db2 = np.sum(delta3, axis=0, keepdims=True)
+             delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
+             dW1 = np.dot(Xi.T, delta2)
+             db1 = np.sum(delta2, axis=0)
+
+             # Add regularization terms (b1 and b2 don't have regularization terms)
+             dW2 += reg_lambda * W2
+             dW1 += reg_lambda * W1
+
+             # Gradient descent parameter update
+             W1 += -epsilon * dW1
+             b1 += -epsilon * db1
+             W2 += -epsilon * dW2
+             b2 += -epsilon * db2
+
+         # Assign new parameters to the model
+         model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
+
+         # This is expensive because it uses the whole dataset, so we don't want to do it too often.
+         if i % validation_interval == 0:
+             if use_validation_data:
+                 cur_loss = calculate_loss(XV, yv, model)
+             else:
+                 cur_loss = calculate_loss(X, y, model)
+
+             print("Loss after iteration %i: %.8f" %(i, cur_loss))
+             loss = cur_loss
+
+     return model
+
+
+ # Build a model with the specified number of hidden units
+ if (training_mode == "batch"):
+     model = build_model_batch(nn_hdim, num_passes=it_count, validation_interval=1)
+ elif (training_mode == "incr"):
+     model = build_model_incr(nn_hdim, num_passes=it_count, validation_interval=1)
+ else:
+     print("invalid training mode")
+     sys.exit()
+
+ print("hidden layer")
+ for row in model['W1']:
+     print(row)
+
+ print("hidden layer bias")
+ for row in model['b1']:
+     print(row)
+
+ print("output layer")
+ for row in model['W2']:
+     print(row)
+
+ print("output layer bias")
+ for row in model['b2']:
+     print(row)
+
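
basic_nn.py is driven entirely by the six positional arguments parsed at the top of the file; following the usage string above, a sample invocation (argument values illustrative only) is "python basic_nn.py 4 1000 0.2 2000 0.01 batch", i.e. a 4-unit hidden layer, 1000 two-moons samples with noise 0.2, 2000 gradient descent passes at learning rate 0.01, trained in batch mode.
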
supv/fftn.py ADDED
@@ -0,0 +1,240 @@
+ #!/usr/local/bin/python3
+
+ # avenir-python: Machine Learning
+ # Author: Pranab Ghosh
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
+ # may not use this file except in compliance with the License. You may
+ # obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ # implied. See the License for the specific language governing
+ # permissions and limitations under the License.
+
+ # Package imports
+ import os
+ import sys
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import torch
+ from torch.autograd import Variable
+ from torch.utils.data import Dataset, TensorDataset
+ from torch.utils.data import DataLoader
+ import sklearn as sk
+ import matplotlib
+ import random
+ import jprops
+ from random import randint
+ sys.path.append(os.path.abspath("../lib"))
+ from util import *
+ from mlutil import *
+ from tnn import *
+
+
+ class FeedForwardTwinNetwork(FeedForwardNetwork):
+     """
+     siamese twin feed forward network
+     """
+     def __init__(self, configFile):
+         defValues = dict()
+         defValues["train.twin.crossenc"] = (False, None)
+         super(FeedForwardTwinNetwork, self).__init__(configFile, defValues)
+
+     def buildModel(self):
+         """
+         Loads configuration and builds the various pieces necessary for the model
+         """
+         super().buildModel()
+
+         #split validation features into anchor, positive and negative parts
+         feCount = self.config.getIntConfig("train.input.size")[0]
+         self.vaFe1 = self.validFeatData[:, :feCount]
+         self.vaFe2 = self.validFeatData[:, feCount:2*feCount]
+         self.vaFe3 = self.validFeatData[:, 2*feCount:]
+
+     def forward(self, x1, x2, x3):
+         """
+         Go through the shared layers once for each of the three inputs
+         """
+         y1 = self.layers(x1)
+         y2 = self.layers(x2)
+         y3 = self.layers(x3)
+         y = (y1, y2, y3)
+         return y
+
+     @staticmethod
+     def batchTrain(model):
+         """
+         train with batch data
+         """
+         feCount = model.config.getIntConfig("train.input.size")[0]
+         fe1 = model.featData[:, :feCount]
+         fe2 = model.featData[:, feCount:2*feCount]
+         fe3 = model.featData[:, 2*feCount:]
+
+         print(fe1.shape)
+         print(fe2.shape)
+         print(fe3.shape)
+         trainData = TensorDataset(fe1, fe2, fe3)
+         trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)
+         epochIntv = model.config.getIntConfig("train.epoch.intv")[0]
+
+         # train mode
+         model.train()
+
+         if model.trackErr:
+             trErr = list()
+             vaErr = list()
+         #epoch
+         for t in range(model.numIter):
+             #batch
+             b = 0
+             epochLoss = 0.0
+             for x1Batch, x2Batch, x3Batch in trainDataLoader:
+
+                 # Forward pass: Compute predicted y by passing x to the model
+                 yPred = model(x1Batch, x2Batch, x3Batch)
+
+                 # Compute and print loss
+                 loss = model.lossFn(yPred[0], yPred[1], yPred[2])
+                 if model.verbose and t % epochIntv == 0 and model.batchIntv > 0 and b % model.batchIntv == 0:
+                     print("epoch {} batch {} loss {:.6f}".format(t, b, loss.item()))
+
+                 if model.trackErr and model.batchIntv == 0:
+                     epochLoss += loss.item()
+
+                 #error tracking at batch level
+                 if model.trackErr and model.batchIntv > 0 and b % model.batchIntv == 0:
+                     trErr.append(loss.item())
+                     vloss = FeedForwardTwinNetwork.evaluateModel(model)
+                     vaErr.append(vloss)
+
+                 # Zero gradients, perform a backward pass, and update the weights.
+                 model.optimizer.zero_grad()
+                 loss.backward()
+                 model.optimizer.step()
+                 b += 1
+
+             #error tracking at epoch level
+             if model.trackErr and model.batchIntv == 0:
+                 epochLoss /= b
+                 if model.verbose:
+                     print("epoch {} loss {:.6f}".format(t, epochLoss))
+                 trErr.append(epochLoss)
+                 vloss = FeedForwardTwinNetwork.evaluateModel(model)
+                 vaErr.append(vloss)
+
+         #validate
+         """
+         model.eval()
+         yPred = model(model.vaFeOne, model.vaFeTwo)
+         yPred = yPred.data.cpu().numpy()
+         yActual = model.validOutData.data.cpu().numpy()
+         if model.verbose:
+             vsize = yPred.shape[0]
+             print("\npredicted \t\t actual")
+             for i in range(vsize):
+                 print(str(yPred[i]) + "\t" + str(yActual[i]))
+
+         score = perfMetric(model.accMetric, yActual, yPred)
+         print(yActual)
+         print(yPred)
+         print(formatFloat(3, score, "perf score"))
+         """
+
+         #save
+         modelSave = model.config.getBooleanConfig("train.model.save")[0]
+         if modelSave:
+             FeedForwardNetwork.saveCheckpt(model)
+
+         if model.trackErr:
+             FeedForwardNetwork.errorPlot(model, trErr, vaErr)
+
+         return 1.0
+
+     @staticmethod
+     def evaluateModel(model):
+         """
+         evaluate model
+
+         Parameters
+         model : torch model
+         """
+         model.eval()
+         with torch.no_grad():
+             yPred = model(model.vaFe1, model.vaFe2, model.vaFe3)
+             score = model.lossFn(yPred[0], yPred[1], yPred[2]).item()
+         model.train()
+         return score
+
+     @staticmethod
+     def testModel(model):
+         """
+         test model
+
+         Parameters
+         model : torch model
+         """
+         useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
+         if useSavedModel:
+             FeedForwardNetwork.restoreCheckpt(model)
+         else:
+             FeedForwardTwinNetwork.batchTrain(model)
+
+         dataSource = model.config.getStringConfig("predict.data.file")[0]
+         featData = FeedForwardNetwork.prepData(model, dataSource, False)
+         featData = torch.from_numpy(featData)
+         feCount = model.config.getIntConfig("train.input.size")[0]
+         fe1 = featData[:, :feCount]
+         fe2 = featData[:, feCount:2*feCount]
+         fe3 = featData[:, 2*feCount:]
+
+         model.eval()
+         with torch.no_grad():
+             yp = model(fe1, fe2, fe3)
+             cos = torch.nn.CosineSimilarity()
+             s1 = cos(yp[0], yp[1]).data.cpu().numpy()
+             s2 = cos(yp[0], yp[2]).data.cpu().numpy()
+             #print(s1.shape)
+
+         n = yp[0].shape[0]
+         if model.verbose:
+             print(n)
+             for i in range(15):
+                 if i % 3 == 0:
+                     print("next")
+                 print(yp[0][i])
+                 print(yp[1][i])
+                 print(yp[2][i])
+                 print("similarity {:.3f} {:.3f}".format(s1[i], s2[i]))
+
+         tc = 0
+         cc = 0
+         outputSize = model.config.getIntConfig("train.output.size")[0]
+         for i in range(0, n, outputSize):
+             #for each sample, outputSize no of rows
+             msi = None
+             imsi = None
+             for j in range(outputSize):
+                 #first one positive, followed by all negative
+                 si = (s1[i+j] + s2[i+j]) / 2
+                 if msi is None or si > msi:
+                     msi = si
+                     imsi = j
+             tc += 1
+             if imsi == 0:
+                 cc += 1
+         score = cc / tc
+         print("score: {:.3f}".format(score))
+         model.train()
+         return score
+
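
batchTrain() and evaluateModel() above call model.lossFn with three embeddings (anchor, positive, negative), so the configured loss must be a triplet-style criterion. Whether the base FeedForwardNetwork config wires in exactly torch.nn.TripletMarginLoss is an assumption here, but it has the expected three-argument signature; a standalone sketch with illustrative shapes:

import torch

lossFn = torch.nn.TripletMarginLoss(margin=1.0)
anchor = torch.randn(8, 16, requires_grad=True)  # 8 samples, 16-dim embeddings
positive = torch.randn(8, 16)
negative = torch.randn(8, 16)
# scalar loss: pulls anchor toward positive, pushes it from negative by the margin
loss = lossFn(anchor, positive, negative)
loss.backward()  # standalone gradient check; real training goes through batchTrain()
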
supv/gbt.py ADDED
@@ -0,0 +1,482 @@
1
+ #!/usr/local/bin/python3
2
+
3
+ # avenir-python: Machine Learning
4
+ # Author: Pranab Ghosh
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License"); you
7
+ # may not use this file except in compliance with the License. You may
8
+ # obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15
+ # implied. See the License for the specific language governing
16
+ # permissions and limitations under the License.
17
+
18
+ # Package imports
19
+ import os
20
+ import sys
21
+ import matplotlib.pyplot as plt
22
+ import numpy as np
23
+ import sklearn as sk
24
+ import matplotlib
25
+ import random
26
+ import jprops
27
+ from sklearn.ensemble import GradientBoostingClassifier
28
+ import joblib
29
+ from sklearn.metrics import accuracy_score
30
+ from sklearn.metrics import confusion_matrix
31
+ from sklearn.model_selection import cross_val_score
32
+ from random import randint
33
+ from io import StringIO
34
+ sys.path.append(os.path.abspath("../lib"))
35
+ from util import *
36
+ from mlutil import *
37
+ from pasearch import *
38
+ from bacl import *
39
+
40
+ # gradient boosting classification
41
+ class GradientBoostedTrees(object):
42
+ def __init__(self, configFile):
43
+ defValues = {}
44
+ defValues["common.mode"] = ("training", None)
45
+ defValues["common.model.directory"] = ("model", None)
46
+ defValues["common.model.file"] = (None, None)
47
+ defValues["common.preprocessing"] = (None, None)
48
+ defValues["common.verbose"] = (False, None)
49
+ defValues["train.data.file"] = (None, "missing training data file")
50
+ defValues["train.data.fields"] = (None, "missing training data field ordinals")
51
+ defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
52
+ defValues["train.data.class.field"] = (None, "missing class field ordinal")
53
+ defValues["train.validation"] = ("kfold", None)
54
+ defValues["train.num.folds"] = (5, None)
55
+ defValues["train.min.samples.split"] = ("4", None)
56
+ defValues["train.min.samples.leaf.gb"] = ("2", None)
57
+ defValues["train.max.depth.gb"] = (3, None)
58
+ defValues["train.max.leaf.nodes.gb"] = (None, None)
59
+ defValues["train.max.features.gb"] = (None, None)
60
+ defValues["train.learning.rate"] = (0.1, None)
61
+ defValues["train.num.estimators.gb"] = (100, None)
62
+ defValues["train.subsample"] = (1.0, None)
63
+ defValues["train.loss"] = ("deviance", None)
64
+ defValues["train.random.state"] = (None, None)
65
+ defValues["train.verbose"] = (0, None)
66
+ defValues["train.warm.start"] = (False, None)
67
+ defValues["train.presort"] = ("auto", None)
68
+ defValues["train.criterion"] = ("friedman_mse", None)
69
+ defValues["train.success.criterion"] = ("error", None)
70
+ defValues["train.model.save"] = (False, None)
71
+ defValues["train.score.method"] = ("accuracy", None)
72
+ defValues["train.search.param.strategy"] = (None, None)
73
+ defValues["train.search.params"] = (None, None)
74
+ defValues["predict.data.file"] = (None, None)
75
+ defValues["predict.data.fields"] = (None, "missing data field ordinals")
76
+ defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
77
+ defValues["predict.use.saved.model"] = (False, None)
78
+ defValues["validate.data.file"] = (None, "missing validation data file")
79
+ defValues["validate.data.fields"] = (None, "missing validation data field ordinals")
80
+ defValues["validate.data.feature.fields"] = (None, "missing validation data feature field ordinals")
81
+ defValues["validate.data.class.field"] = (None, "missing class field ordinal")
82
+ defValues["validate.use.saved.model"] = (False, None)
83
+ defValues["validate.score.method"] = ("accuracy", None)
84
+
85
+ self.config = Configuration(configFile, defValues)
86
+ self.subSampleRate = None
87
+ self.featData = None
88
+ self.clsData = None
89
+ self.gbcClassifier = None
90
+ self.verbose = self.config.getBooleanConfig("common.verbose")[0]
91
+ logFilePath = self.config.getStringConfig("common.logging.file")[0]
92
+ logLevName = self.config.getStringConfig("common.logging.level")[0]
93
+ self.logger = createLogger(__name__, logFilePath, logLevName)
94
+ self.logger.info("********* starting session")
95
+
96
+ # initialize config
97
+ def initConfig(self, configFile, defValues):
98
+ self.config = Configuration(configFile, defValues)
99
+
100
+ # get config object
101
+ def getConfig(self):
102
+ return self.config
103
+
104
+ #set config param
105
+ def setConfigParam(self, name, value):
106
+ self.config.setParam(name, value)
107
+
108
+ #get mode
109
+ def getMode(self):
110
+ return self.config.getStringConfig("common.mode")[0]
111
+
112
+ #get search parameter
113
+ def getSearchParamStrategy(self):
114
+ return self.config.getStringConfig("train.search.param.strategy")[0]
115
+
116
+ def setModel(self, model):
117
+ self.gbcClassifier = model
118
+
119
+ # train model
120
+ def train(self):
121
+ #build model
122
+ self.buildModel()
123
+
124
+ # training data
125
+ if self.featData is None:
126
+ (featData, clsData) = self.prepTrainingData()
127
+ (self.featData, self.clsData) = (featData, clsData)
128
+ else:
129
+ (featData, clsData) = (self.featData, self.clsData)
130
+ if self.subSampleRate is not None:
131
+ (featData, clsData) = subSample(featData, clsData, self.subSampleRate, False)
132
+ self.logger.info("subsample size " + str(featData.shape[0]))
133
+
134
+ # parameters
135
+ modelSave = self.config.getBooleanConfig("train.model.save")[0]
136
+
137
+ #train
138
+ self.logger.info("...training model")
139
+ self.gbcClassifier.fit(featData, clsData)
140
+ score = self.gbcClassifier.score(featData, clsData)
141
+ successCriterion = self.config.getStringConfig("train.success.criterion")[0]
142
+ result = None
143
+ if successCriterion == "accuracy":
144
+ self.logger.info("accuracy with training data {:06.3f}".format(score))
145
+ result = score
146
+ elif successCriterion == "error":
147
+ error = 1.0 - score
148
+ self.logger.info("error with training data {:06.3f}".format(error))
149
+ result = error
150
+ else:
151
+ raise ValueError("invalid success criterion")
152
+
153
+ if modelSave:
154
+ self.logger.info("...saving model")
155
+ modelFilePath = self.getModelFilePath()
156
+ joblib.dump(self.gbcClassifier, modelFilePath)
157
+ return result
158
+
159
+ #train with k fold validation
160
+ def trainValidate(self):
161
+ #build model
162
+ self.buildModel()
163
+
164
+ # training data
165
+ (featData, clsData) = self.prepTrainingData()
166
+
167
+ #parameter
168
+ validation = self.config.getStringConfig("train.validation")[0]
169
+ numFolds = self.config.getIntConfig("train.num.folds")[0]
170
+ successCriterion = self.config.getStringConfig("train.success.criterion")[0]
171
+ scoreMethod = self.config.getStringConfig("train.score.method")[0]
172
+
173
+ #train with validation
174
+ self.logger.info("...training and kfold cross validating model")
175
+ scores = cross_val_score(self.gbcClassifier, featData, clsData, cv=numFolds,scoring=scoreMethod)
176
+ avScore = np.mean(scores)
177
+ result = self.reportResult(avScore, successCriterion, scoreMethod)
178
+ return result
179
+
180
+ #train with k fold validation and search parameter space for optimum
181
+ def trainValidateSearch(self):
182
+ self.logger.info("...starting train validate with parameter search")
183
+ searchStrategyName = self.getSearchParamStrategy()
184
+ if searchStrategyName is not None:
185
+ if searchStrategyName == "grid":
186
+ searchStrategy = GuidedParameterSearch(self.verbose)
187
+ elif searchStrategyName == "random":
188
+ searchStrategy = RandomParameterSearch(self.verbose)
189
+ maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
190
+ searchStrategy.setMaxIter(maxIter)
191
+ elif searchStrategyName == "simuan":
192
+ searchStrategy = SimulatedAnnealingParameterSearch(self.verbose)
193
+ maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
194
+ searchStrategy.setMaxIter(maxIter)
195
+ temp = self.config.getFloatConfig("train.search.sa.temp")[0]
196
+ searchStrategy.setTemp(temp)
197
+ tempRedRate = self.config.getFloatConfig("train.search.sa.temp.red.rate")[0]
198
+ searchStrategy.setTempReductionRate(tempRedRate)
199
+ else:
200
+ raise ValueError("invalid paramtere search strategy")
201
+ else:
202
+ raise ValueError("missing search strategy")
203
+
204
+ # add search params
205
+ searchParams = self.config.getStringConfig("train.search.params")[0].split(",")
206
+ searchParamNames = []
207
+ extSearchParamNames = []
208
+ if searchParams is not None:
209
+ for searchParam in searchParams:
210
+ paramItems = searchParam.split(":")
211
+ extSearchParamNames.append(paramItems[0])
212
+
213
+ #get rid name component search
214
+ paramNameItems = paramItems[0].split(".")
215
+ del paramNameItems[1]
216
+ paramItems[0] = ".".join(paramNameItems)
217
+
218
+ searchStrategy.addParam(paramItems)
219
+ searchParamNames.append(paramItems[0])
220
+ else:
221
+ raise ValueError("missing search parameter list")
222
+
223
+ # add search param data list for each param
224
+ for (searchParamName,extSearchParamName) in zip(searchParamNames,extSearchParamNames):
225
+ searchParamData = self.config.getStringConfig(extSearchParamName)[0].split(",")
226
+ searchStrategy.addParamVaues(searchParamName, searchParamData)
227
+
228
+ # train and validate for various param value combination
229
+ searchStrategy.prepare()
230
+ paramValues = searchStrategy.nextParamValues()
231
+ searchResults = []
232
+ while paramValues is not None:
233
+ self.logger.info("...next parameter set")
234
+ paramStr = ""
235
+ for paramValue in paramValues:
236
+ self.setConfigParam(paramValue[0], str(paramValue[1]))
237
+ paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
238
+ result = self.trainValidate()
239
+ searchStrategy.setCost(result)
240
+ searchResults.append((paramStr, result))
241
+ paramValues = searchStrategy.nextParamValues()
242
+
243
+ # output
244
+ self.logger.info("all parameter search results")
245
+ for searchResult in searchResults:
246
+ self.logger.info("{}\t{:06.3f}".format(searchResult[0], searchResult[1]))
247
+
248
+ self.logger.info("best parameter search result")
249
+ bestSolution = searchStrategy.getBestSolution()
250
+ paramStr = ""
251
+ for paramValue in bestSolution[0]:
252
+ paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
253
+ self.logger.info("{}\t{:06.3f}".format(paramStr, bestSolution[1]))
254
+ return bestSolution
255
+
256
+ #predict
257
+ def validate(self):
258
+ # create model
259
+ useSavedModel = self.config.getBooleanConfig("validate.use.saved.model")[0]
260
+ if useSavedModel:
261
+ # load saved model
262
+ self.logger.info("...loading model")
263
+ modelFilePath = self.getModelFilePath()
264
+ self.gbcClassifier = joblib.load(modelFilePath)
265
+ else:
266
+ # train model
267
+ self.train()
268
+
269
+ # prepare test data
270
+ (featData, clsDataActual) = self.prepValidationData()
271
+
272
+ #predict
273
+ self.logger.info("...predicting")
274
+ clsDataPred = self.gbcClassifier.predict(featData)
275
+
276
+ self.logger.info("...validating")
277
+ #self.logger.info(clsData)
278
+ scoreMethod = self.config.getStringConfig("validate.score.method")[0]
279
+ if scoreMethod == "accuracy":
280
+ accuracy = accuracy_score(clsDataActual, clsDataPred)
281
+ self.logger.info("accuracy:")
282
+ self.logger.info(accuracy)
283
+ elif scoreMethod == "confusionMatrix":
284
+ confMatrx = confusion_matrix(clsDataActual, clsDataPred)
285
+ self.logger.info("confusion matrix:")
286
+ self.logger.info(confMatrx)
287
+
288
+
289
+ #predict
290
+ def predictx(self):
291
+ # create model
292
+ useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
293
+ if useSavedModel:
294
+ # load saved model
295
+ self.logger.info("...loading model")
296
+ modelFilePath = self.getModelFilePath()
297
+ self.gbcClassifier = joblib.load(modelFilePath)
298
+ else:
299
+ # train model
300
+ self.train()
301
+
302
+ # prepare test data
303
+ featData = self.prepPredictData()
304
+
305
+ #predict
306
+ self.logger.info("...predicting")
307
+ clsData = self.gbcClassifier.predict(featData)
308
+ self.logger.info(clsData)
309
+
310
+ #predict with in memory data
311
+ def predict(self, recs=None):
312
+ # create model
313
+ self.prepModel()
314
+
315
+ #input record
316
+ #input record
317
+ if recs:
318
+ #passed record
319
+ featData = self.prepStringPredictData(recs)
320
+ if (featData.ndim == 1):
321
+ featData = featData.reshape(1, -1)
322
+ else:
323
+ #file
324
+ featData = self.prepPredictData()
325
+
326
+ #predict
327
+ self.logger.info("...predicting")
328
+ clsData = self.gbcClassifier.predict(featData)
329
+ return clsData
330
+
331
+ #predict probability with in memory data
332
+ def predictProb(self, recs):
333
+ # create model
334
+ self.prepModel()
335
+
336
+ #input record
337
+ if type(recs) is str:
338
+ featData = self.prepStringPredictData(recs)
339
+ else:
340
+ featData = recs
341
+ #self.logger.info(featData.shape)
342
+ if (featData.ndim == 1):
343
+ featData = featData.reshape(1, -1)
344
+
345
+ #predict
346
+ self.logger.info("...predicting class probability")
347
+ clsData = self.gbcClassifier.predict_proba(featData)
348
+ return clsData
349
+
350
+ #preparing model
351
+ def prepModel(self):
352
+ useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
353
+ if (useSavedModel and not self.gbcClassifier):
354
+ # load saved model
355
+ self.logger.info("...loading saved model")
356
+ modelFilePath = self.getModelFilePath()
357
+ self.gbcClassifier = joblib.load(modelFilePath)
358
+ else:
359
+ # train model
360
+ self.train()
361
+ return self.gbcClassifier
362
+
363
+ #prepare string predict data
364
+ def prepStringPredictData(self, recs):
365
+ frecs = StringIO(recs)
366
+ featData = np.loadtxt(frecs, delimiter=',')
367
+ #self.logger.info(featData)
368
+ return featData
369
+
370
+ #loads and prepares training data
371
+ def prepTrainingData(self):
372
+ # parameters
373
+ dataFile = self.config.getStringConfig("train.data.file")[0]
374
+ fieldIndices = self.config.getStringConfig("train.data.fields")[0]
375
+ if not fieldIndices is None:
376
+ fieldIndices = strToIntArray(fieldIndices, ",")
377
+ featFieldIndices = self.config.getStringConfig("train.data.feature.fields")[0]
378
+ if not featFieldIndices is None:
379
+ featFieldIndices = strToIntArray(featFieldIndices, ",")
380
+ classFieldIndex = self.config.getIntConfig("train.data.class.field")[0]
381
+
382
+ #training data
383
+ (data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
384
+ clsData = extrColumns(data, classFieldIndex)
385
+ clsData = np.array([int(a) for a in clsData])
386
+ return (featData, clsData)
387
+
388
+ #loads and prepares training data
389
+ def prepValidationData(self):
390
+ # parameters
391
+ dataFile = self.config.getStringConfig("validate.data.file")[0]
		fieldIndices = self.config.getStringConfig("validate.data.fields")[0]
		if fieldIndices is not None:
			fieldIndices = strToIntArray(fieldIndices, ",")
		featFieldIndices = self.config.getStringConfig("validate.data.feature.fields")[0]
		if featFieldIndices is not None:
			featFieldIndices = strToIntArray(featFieldIndices, ",")
		classFieldIndex = self.config.getIntConfig("validate.data.class.field")[0]

		#validation data
		(data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
		clsData = extrColumns(data, classFieldIndex)
		clsData = [int(a) for a in clsData]
		return (featData, clsData)

	#loads and prepares prediction data
	def prepPredictData(self):
		# parameters
		dataFile = self.config.getStringConfig("predict.data.file")[0]
		if dataFile is None:
			raise ValueError("missing prediction data file")
		fieldIndices = self.config.getStringConfig("predict.data.fields")[0]
		if fieldIndices is not None:
			fieldIndices = strToIntArray(fieldIndices, ",")
		featFieldIndices = self.config.getStringConfig("predict.data.feature.fields")[0]
		if featFieldIndices is not None:
			featFieldIndices = strToIntArray(featFieldIndices, ",")

		#prediction data
		(data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)

		return featData

	# get model file path
	def getModelFilePath(self):
		modelDirectory = self.config.getStringConfig("common.model.directory")[0]
		modelFile = self.config.getStringConfig("common.model.file")[0]
		if modelFile is None:
			raise ValueError("missing model file name")
		modelFilePath = modelDirectory + "/" + modelFile
		return modelFilePath

	# report result
	def reportResult(self, score, successCriterion, scoreMethod):
		if successCriterion == "accuracy":
			self.logger.info("average " + scoreMethod + " with k fold cross validation {:06.3f}".format(score))
			result = score
		elif successCriterion == "error":
			error = 1.0 - score
			self.logger.info("average error with k fold cross validation {:06.3f}".format(error))
			result = error
		else:
			raise ValueError("invalid success criterion")
		return result

	# builds model object
	def buildModel(self):
		self.logger.info("...building gradient boosted tree model")
		# parameters
		minSamplesSplit = self.config.getStringConfig("train.min.samples.split")[0]
		minSamplesSplit = typedValue(minSamplesSplit)
		minSamplesLeaf = self.config.getStringConfig("train.min.samples.leaf.gb")[0]
		minSamplesLeaf = typedValue(minSamplesLeaf)
		#minWeightFractionLeaf = self.config.getFloatConfig("train.min.weight.fraction.leaf.gb")[0]
		(maxDepth, maxLeafNodes) = self.config.eitherOrIntConfig("train.max.depth.gb", "train.max.leaf.nodes.gb")
		maxFeatures = self.config.getStringConfig("train.max.features.gb")[0]
		maxFeatures = typedValue(maxFeatures)
		learningRate = self.config.getFloatConfig("train.learning.rate")[0]
		numEstimators = self.config.getIntConfig("train.num.estimators.gb")[0]
		subsampleFraction = self.config.getFloatConfig("train.subsample")[0]
		lossFun = self.config.getStringConfig("train.loss")[0]
		randomState = self.config.getIntConfig("train.random.state")[0]
		verboseOutput = self.config.getIntConfig("train.verbose")[0]
		warmStart = self.config.getBooleanConfig("train.warm.start")[0]
		presort = self.config.getStringConfig("train.presort")
		if presort[1]:
			presortChoice = presort[0]
		else:
			presortChoice = presort[0].lower() == "true"
		splitCriterion = self.config.getStringConfig("train.criterion")[0]

		#classifier
		self.gbcClassifier = GradientBoostingClassifier(loss=lossFun, learning_rate=learningRate, n_estimators=numEstimators,
			subsample=subsampleFraction, min_samples_split=minSamplesSplit,
			min_samples_leaf=minSamplesLeaf, min_weight_fraction_leaf=0.0, max_depth=maxDepth,
			init=None, random_state=randomState, max_features=maxFeatures, verbose=verboseOutput,
			max_leaf_nodes=maxLeafNodes, warm_start=warmStart, presort=presortChoice)
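The methods above are entirely configuration driven: every GradientBoostingClassifier hyperparameter comes from a properties file. A minimal driver sketch follows; the class name GradientBoostedTrees and the config path are illustrative assumptions, not taken from this diff.

# illustrative driver, assuming gbt.py wraps the buildModel() above in a
# BaseClassifier subclass (called GradientBoostedTrees here, hypothetical name)
#
# gbt.properties fragment (made-up values):
#   train.data.file=leads_train.csv
#   train.learning.rate=0.1
#   train.num.estimators.gb=200
#   train.success.criterion=error
gbt = GradientBoostedTrees("gbt.properties")
gbt.buildModel()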
supv/gcn.py ADDED
@@ -0,0 +1,444 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import matplotlib
import random
from random import randint
from itertools import compress
import numpy as np
import torch
from torch import nn
from torch.nn import Linear
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
from torch_geometric.nn import GCNConv
from torch_geometric.nn import MessagePassing
from torch_geometric.data import Data
import sklearn as sk
import jprops
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from tnn import FeedForwardNetwork

"""
Graph convolution network
"""

class GraphConvoNetwork(nn.Module):
	def __init__(self, configFile):
		"""
		initializer

		Parameters
			configFile : config file path
		"""
		defValues = dict()
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.preprocessing"] = (None, None)
		defValues["common.scaling.method"] = ("zscale", None)
		defValues["common.scaling.minrows"] = (50, None)
		defValues["common.scaling.param.file"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["common.device"] = ("cpu", None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.num.nodes.total"] = (None, None)
		defValues["train.data.num.nodes.training"] = (None, None)
		defValues["train.data.splits"] = ([.75,.15,.10], None)
		defValues["train.layer.data"] = (None, "missing layer data")
		defValues["train.input.size"] = (None, "missing input size")
		defValues["train.output.size"] = (None, "missing output size")
		defValues["train.loss.reduction"] = ("mean", None)
		defValues["train.num.iterations"] = (500, None)
		defValues["train.lossFn"] = ("mse", None)
		defValues["train.optimizer"] = ("sgd", None)
		defValues["train.opt.learning.rate"] = (.0001, None)
		defValues["train.opt.weight.decay"] = (0, None)
		defValues["train.opt.momentum"] = (0, None)
		defValues["train.opt.eps"] = (1e-08, None)
		defValues["train.opt.dampening"] = (0, None)
		defValues["train.opt.momentum.nesterov"] = (False, None)
		defValues["train.opt.betas"] = ([0.9, 0.999], None)
		defValues["train.opt.alpha"] = (0.99, None)
		defValues["train.save.model"] = (False, None)
		defValues["train.track.error"] = (False, None)
		defValues["train.epoch.intv"] = (5, None)
		defValues["train.print.weights"] = (False, None)
		defValues["valid.accuracy.metric"] = (None, None)
		defValues["predict.create.mask"] = (False, None)
		defValues["predict.use.saved.model"] = (True, None)

		self.config = Configuration(configFile, defValues)
		super(GraphConvoNetwork, self).__init__()


	def getConfig(self):
		"""
		returns config
		"""
		return self.config

	def buildModel(self):
		"""
		loads configuration and builds the various pieces necessary for the model
		"""
		torch.manual_seed(9999)

		self.verbose = self.config.getBooleanConfig("common.verbose")[0]
		numinp = self.config.getIntConfig("train.input.size")[0]
		self.outputSize = self.config.getIntConfig("train.output.size")[0]
		self.numIter = self.config.getIntConfig("train.num.iterations")[0]
		optimizer = self.config.getStringConfig("train.optimizer")[0]
		self.lossFnStr = self.config.getStringConfig("train.lossFn")[0]
		self.accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]
		self.trackErr = self.config.getBooleanConfig("train.track.error")[0]
		self.restored = False
		self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None

		#build network
		layers = list()
		ninp = numinp
		trData = self.config.getStringConfig("train.layer.data")[0].split(",")
		for ld in trData:
			lde = ld.split(":")
			ne = len(lde)
			assert ne == 5 or ne == 6, "expecting 5 or 6 items for layer data"

			gconv = False
			if ne == 6:
				if lde[0] == "gconv":
					gconv = True
				lde = lde[1:]

			#num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction
			nunit = int(lde[0])
			actStr = lde[1]
			act = FeedForwardNetwork.createActivation(actStr) if actStr != "none" else None
			bnorm = lde[2] == "true"
			afterAct = lde[3] == "true"
			dpr = float(lde[4])

			if gconv:
				layers.append(GCNConv(ninp, nunit))
			else:
				layers.append(Linear(ninp, nunit))
			if bnorm:
				#with batch norm
				if afterAct:
					safeAppend(layers, act)
					layers.append(torch.nn.BatchNorm1d(nunit))
				else:
					layers.append(torch.nn.BatchNorm1d(nunit))
					safeAppend(layers, act)
			else:
				#without batch norm
				safeAppend(layers, act)

			if dpr > 0:
				layers.append(torch.nn.Dropout(dpr))
			ninp = nunit

		self.layers = torch.nn.ModuleList(layers)
		self.device = FeedForwardNetwork.getDevice(self)
		self.to(self.device)
		self.loadData()

		self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)
		self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer)
		self.trained = False

	def loadData(self):
		"""
		load node and edge data
		"""
		dataFilePath = self.config.getStringConfig("train.data.file")[0]
		numNodes = self.config.getIntConfig("train.data.num.nodes.total")[0]
		numLabeled = self.config.getIntConfig("train.data.num.nodes.training")[0]
		splits = self.config.getFloatListConfig("train.data.splits")[0]
		crPredMask = self.config.getBooleanConfig("predict.create.mask")[0]

		dx = list()
		dy = list()
		edges = list()
		mask = None
		for rec in fileRecGen(dataFilePath, ","):
			if len(rec) > 2:
				#node record with features and class label
				x = rec[1 :-1]
				x = toFloatList(x)
				y = int(rec[-1])
				dx.append(x)
				dy.append(y)
			elif len(rec) == 2:
				#edge record
				e = toIntList(rec)
				edges.append(e)
			elif len(rec) == 1:
				#mask record with index ranges of labeled nodes
				items = rec[0].split()
				assertEqual(items[0], "mask", "invalid mask data")
				numNodes = int(items[1])
				print(numNodes)
				mask = list()
				for r in range(2, len(items), 1):
					ri = items[r].split(":")
					#print(ri)
					ms = list(range(int(ri[0]), int(ri[1]), 1))
					mask.extend(ms)

		#scale node features
		if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
			scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
			dx = scaleData(dx, scalingMethod)

		dx = torch.tensor(dx, dtype=torch.float)
		dy = torch.tensor(dy, dtype=torch.long)
		edges = torch.tensor(edges, dtype=torch.long)
		edges = edges.t().contiguous()
		dx = dx.to(self.device)
		dy = dy.to(self.device)
		edges = edges.to(self.device)
		self.data = Data(x=dx, edge_index=edges, y=dy)

		#mask
		if mask is None:
			#training data in the beginning
			trStart = 0
			vaStart = int(splits[0] * numLabeled)
			teStart = vaStart + int(splits[1] * numLabeled)

			trMask = [False] * numNodes
			trMask[0:vaStart] = [True] * vaStart
			vaMask = [False] * numNodes
			vaMask[vaStart:teStart] = [True] * (teStart - vaStart)
			teMask = [False] * numNodes
			teMask[teStart:] = [True] * (numNodes - teStart)
		else:
			#training data anywhere
			if crPredMask:
				prMask = [True] * numNodes
				for i in mask:
					prMask[i] = False
				self.prMask = torch.tensor(prMask, dtype=torch.bool)

			nshuffle = int(len(mask) / 2)
			shuffle(mask, nshuffle)
			#print(mask)
			lmask = len(mask)
			trme = int(splits[0] * lmask)
			vame = int((splits[0] + splits[1]) * lmask)
			teme = lmask
			trMask = [False] * numNodes
			for i in mask[:trme]:
				trMask[i] = True
			vaMask = [False] * numNodes
			for i in mask[trme:vame]:
				vaMask[i] = True
			teMask = [False] * numNodes
			for i in mask[vame:]:
				teMask[i] = True
			#print(vaMask)

		trMask = torch.tensor(trMask, dtype=torch.bool)
		trMask = trMask.to(self.device)
		self.data.train_mask = trMask
		vaMask = torch.tensor(vaMask, dtype=torch.bool)
		vaMask = vaMask.to(self.device)
		self.data.val_mask = vaMask
		teMask = torch.tensor(teMask, dtype=torch.bool)
		teMask = teMask.to(self.device)
		self.data.test_mask = teMask
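From the parsing logic in loadData above, a single CSV file mixes three record types: node records (id, features, class label), edge records (two node ids), and an optional space separated mask record listing index ranges of labeled nodes. An illustrative fragment (values made up):

# node records: id, features..., class label
# 0,0.52,1.37,0.08,1
# 1,0.11,0.92,0.66,0
# edge records: from-node, to-node
# 0,1
# 1,0
# mask record: "mask <numNodes> <start:end> ..."
# mask 2708 0:140 1708:2708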
	def descData(self):
		"""
		describe data
		"""
		print(f'Number of nodes: {self.data.num_nodes}')
		print(f'Number of edges: {self.data.num_edges}')
		print(f'Number of node features: {self.data.num_node_features}')
		print(f'Number of training nodes: {self.data.train_mask.sum()}')
		print(f'Training node label rate: {int(self.data.train_mask.sum()) / self.data.num_nodes:.2f}')
		print(f'Number of validation nodes: {self.data.val_mask.sum()}')
		print(f'Number of test nodes: {self.data.test_mask.sum()}')
		print(f'Is undirected: {self.data.is_undirected()}')

		print("Data attributes")
		print(self.data.keys)

		print("Data types")
		print(type(self.data.x))
		print(type(self.data.y))
		print(type(self.data.edge_index))
		print(type(self.data.train_mask))

		print("Sample data")
		print("x", self.data.x[:4])
		print("y", self.data.y[:4])
		print("edge", self.data.edge_index[:4])
		print("train mask", self.data.train_mask[:4])
		print("test mask", self.data.test_mask[:4])

		print("Any isolated node? " , self.data.has_isolated_nodes())
		print("Any self loop? ", self.data.has_self_loops())
		print("Is graph directed? ", self.data.is_directed())

	def forward(self):
		"""
		forward prop
		"""
		x, edges = self.data.x, self.data.edge_index
		for l in self.layers:
			if isinstance(l, MessagePassing):
				#graph convolution layer needs the edge list
				x = l(x, edges)
			else:
				x = l(x)
		return x

	@staticmethod
	def trainModel(model):
		"""
		train with batch data

		Parameters
			model : torch model
		"""
		epochIntv = model.config.getIntConfig("train.epoch.intv")[0]

		model.train()
		if model.trackErr:
			trErr = list()
			vaErr = list()

		for epoch in range(model.numIter):
			out = model()
			loss = model.lossFn(out[model.data.train_mask], model.data.y[model.data.train_mask])

			#error tracking at batch level
			if model.trackErr:
				trErr.append(loss.item())
				vErr = GraphConvoNetwork.evaluateModel(model)
				vaErr.append(vErr)
				if model.verbose and epoch % epochIntv == 0:
					print("epoch {} loss {:.6f} val error {:.6f}".format(epoch, loss.item(), vErr))

			model.optimizer.zero_grad()
			loss.backward()
			model.optimizer.step()

		#acc = GraphConvoNetwork.evaluateModel(model, True)
		#print(acc)
		modelSave = model.config.getBooleanConfig("train.save.model")[0]
		if modelSave:
			FeedForwardNetwork.saveCheckpt(model)

		if model.trackErr:
			FeedForwardNetwork.errorPlot(model, trErr, vaErr)

		model.trained = True

	@staticmethod
	def evaluateModel(model, verbose=False):
		"""
		evaluate model

		Parameters
			model : torch model
			verbose : if True additional output
		"""
		model.eval()
		with torch.no_grad():
			out = model()
			if verbose:
				print(out)
			yPred = out[model.data.val_mask].data.cpu().numpy()
			yActual = model.data.y[model.data.val_mask].data.cpu().numpy()
			if verbose:
				for pa in zip(yPred, yActual):
					print(pa)
			#correct = yPred == yActual
			#score = int(correct.sum()) / int(model.data.val_mask.sum())

			score = perfMetric(model.lossFnStr, yActual, yPred, model.clabels)

		model.train()
		return score

	@staticmethod
	def validateModel(model, retPred=False):
		"""
		model validation

		Parameters
			model : torch model
			retPred : if True return prediction
		"""
		model.eval()
		with torch.no_grad():
			out = model()
			yPred = out.argmax(dim=1)
			yPred = yPred[model.data.test_mask].data.cpu().numpy()
			yActual = model.data.y[model.data.test_mask].data.cpu().numpy()
			#correct = yPred == yActual
			#score = int(correct.sum()) / int(model.data.val_mask.sum())
			score = perfMetric(model.accMetric, yActual, yPred)
			print(formatFloat(3, score, "test perf score"))
		return score

	@staticmethod
	def modelPrediction(model, inclData=True):
		"""
		make prediction

		Parameters
			model : torch model
			inclData : True to include input data
		"""
		cmask = model.config.getBooleanConfig("predict.create.mask")[0]
		if not cmask:
			print("create prediction mask property needs to be set to True")
			return None

		useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
		if useSavedModel:
			FeedForwardNetwork.restoreCheckpt(model)
		else:
			if not model.trained:
				GraphConvoNetwork.trainModel(model)

		model.eval()
		with torch.no_grad():
			out = model()
			yPred = out.argmax(dim=1)
			yPred = yPred[model.prMask].data.cpu().numpy()

		if inclData:
			dataFilePath = model.config.getStringConfig("train.data.file")[0]
			filt = lambda r : len(r) > 2
			ndata = list(fileFiltRecGen(dataFilePath, filt))
			prMask = model.prMask.data.cpu().numpy()
			assertEqual(len(ndata), prMask.shape[0], "data and mask lengths are not equal")
			precs = list(compress(ndata, prMask))
			precs = list(map(lambda r : r[:-1], precs))
			assertEqual(len(precs), yPred.shape[0], "data and prediction lengths are not equal")
			res = zip(precs, yPred)
		else:
			res = yPred
		return res
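A rough end to end usage sketch for this class; the config path and the layer specification string are illustrative assumptions following the format parsed in buildModel (optional gconv prefix plus units:activation:bnorm:afterAct:dropout).

# illustrative driver (made-up config path and property values)
# gcn.properties could contain, e.g.:
#   train.layer.data=gconv:16:relu:false:false:0.5,gconv:7:none:false:false:0
model = GraphConvoNetwork("gcn.properties")
model.buildModel()
model.descData()
GraphConvoNetwork.trainModel(model)
score = GraphConvoNetwork.validateModel(model)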
supv/knn.py ADDED
@@ -0,0 +1,106 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import matplotlib
import random
import jprops
from sklearn.neighbors import KNeighborsClassifier
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from bacl import *


# nearest neighbor classification
class NearestNeighbor(BaseClassifier):
	def __init__(self, configFile):
		defValues = {}
		defValues["common.mode"] = ("training", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.preprocessing"] = (None, None)
		defValues["common.scaling.method"] = ("zscale", None)
		defValues["common.verbose"] = (False, None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.class.field"] = (None, "missing class field ordinal")
		defValues["train.num.neighbors"] = (5, None)
		defValues["train.neighbor.weight"] = ("uniform", None)
		defValues["train.neighbor.search.algo"] = ("auto", None)
		defValues["train.neighbor.search.leaf.size"] = (10, None)
		defValues["train.neighbor.dist.metric"] = ("minkowski", None)
		defValues["train.neighbor.dist.metric.pow"] = (2.0, None)
		defValues["train.success.criterion"] = ("error", None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.data.fields"] = (None, "missing data field ordinals")
		defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
		defValues["predict.use.saved.model"] = (False, None)

		super(NearestNeighbor, self).__init__(configFile, defValues, __name__)

	def buildModel(self):
		"""
		builds model object
		"""
		self.logger.info("...building knn classifier model")
		numNeighbors = self.config.getIntConfig("train.num.neighbors")[0]
		neighborWeight = self.config.getStringConfig("train.neighbor.weight")[0]
		searchAlgo = self.config.getStringConfig("train.neighbor.search.algo")[0]
		leafSize = self.config.getIntConfig("train.neighbor.search.leaf.size")[0]
		distMetric = self.config.getStringConfig("train.neighbor.dist.metric")[0]
		metricPow = self.config.getFloatConfig("train.neighbor.dist.metric.pow")[0]

		model = KNeighborsClassifier(n_neighbors=numNeighbors, weights=neighborWeight, algorithm=searchAlgo,
		leaf_size=leafSize, p=metricPow, metric=distMetric)
		self.classifier = model
		return self.classifier

	def predictProb(self, recs=None):
		"""
		predict probability
		"""
		# create model
		self.prepModel()

		#input record
		if recs is None:
			featData = self.prepPredictData()
		else:
			if type(recs) is str:
				featData = self.prepStringPredictData(recs)
			else:
				featData = recs
			if (featData.ndim == 1):
				featData = featData.reshape(1, -1)

		#predict
		self.logger.info("...predicting class probability")
		clsData = self.classifier.predict_proba(featData)
		return clsData
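A short usage sketch; the config path and record are made up, and the training entry point is assumed to come from the BaseClassifier base class in bacl.py rather than from this file.

# illustrative use (made-up config path and record)
knn = NearestNeighbor("knn.properties")
knn.train()	#assumed BaseClassifier training entry point
probs = knn.predictProb("5.1,3.5,1.4,0.2")
print(probs)	#per class probabilities from predict_proba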
supv/lrd.py ADDED
@@ -0,0 +1,112 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import sklearn.linear_model
import matplotlib
import random
import jprops
from sklearn.linear_model import LogisticRegression
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *
from bacl import *

# logistic regression classification
class LogisticRegressionDiscriminant(BaseClassifier):

	def __init__(self, configFile):
		defValues = {}
		defValues["common.mode"] = ("train", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.scale.file.path"] = (None, "missing scale file path")
		defValues["common.preprocessing"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.class.field"] = (None, "missing class field ordinal")
		defValues["train.validation"] = ("kfold", None)
		defValues["train.num.folds"] = (5, None)
		defValues["train.penalty"] = ("l2", None)
		defValues["train.dual"] = (False, None)
		defValues["train.tolerance"] = (0.0001, None)
		defValues["train.regularization"] = (1.0, None)
		defValues["train.fit.intercept"] = (True, None)
		defValues["train.intercept.scaling"] = (1.0, None)
		defValues["train.class.weight"] = (None, None)
		defValues["train.random.state"] = (None, None)
		defValues["train.solver"] = ("liblinear", None)
		defValues["train.max.iter"] = (100, None)
		defValues["train.multi.class"] = ("ovr", None)
		defValues["train.verbose"] = (0, None)
		defValues["train.warm.start"] = (False, None)
		defValues["train.num.jobs"] = (None, None)
		defValues["train.l1.ratio"] = (None, None)
		defValues["train.success.criterion"] = ("error", None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["train.search.param.strategy"] = (None, None)
		defValues["train.search.params"] = (None, None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.data.fields"] = (None, "missing data field ordinals")
		defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
		defValues["predict.use.saved.model"] = (False, None)
		defValues["validate.data.file"] = (None, "missing validation data file")
		defValues["validate.data.fields"] = (None, "missing validation data field ordinals")
		defValues["validate.data.feature.fields"] = (None, "missing validation data feature field ordinals")
		defValues["validate.data.class.field"] = (None, "missing class field ordinal")
		defValues["validate.use.saved.model"] = (False, None)
		defValues["validate.score.method"] = ("accuracy", None)

		super(LogisticRegressionDiscriminant, self).__init__(configFile, defValues, __name__)

	# builds model object
	def buildModel(self):
		self.logger.info("...building logistic regression model")
		penalty = self.config.getStringConfig("train.penalty")[0]
		dual = self.config.getBooleanConfig("train.dual")[0]
		tol = self.config.getFloatConfig("train.tolerance")[0]
		c = self.config.getFloatConfig("train.regularization")[0]
		fitIntercept = self.config.getBooleanConfig("train.fit.intercept")[0]
		interceptScaling = self.config.getFloatConfig("train.intercept.scaling")[0]
		classWeight = self.config.getStringConfig("train.class.weight")[0]
		randomState = self.config.getIntConfig("train.random.state")[0]
		solver = self.config.getStringConfig("train.solver")[0]
		maxIter = self.config.getIntConfig("train.max.iter")[0]
		multiClass = self.config.getStringConfig("train.multi.class")[0]
		verbos = self.config.getIntConfig("train.verbose")[0]
		warmStart = self.config.getBooleanConfig("train.warm.start")[0]
		nJobs = self.config.getIntConfig("train.num.jobs")[0]
		l1Ratio = self.config.getFloatConfig("train.l1.ratio")[0]

		self.classifier = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=c, fit_intercept=fitIntercept,\
		intercept_scaling=interceptScaling, class_weight=classWeight, random_state=randomState, solver=solver,\
		max_iter=maxIter, multi_class=multiClass, verbose=verbos, warm_start=warmStart, n_jobs=nJobs, l1_ratio=l1Ratio)

		return self.classifier
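One constraint worth noting when setting these properties: in scikit-learn the penalty must match the solver, e.g. the default liblinear solver handles l1 and l2 but not elasticnet, and train.l1.ratio only takes effect with an elasticnet penalty and the saga solver. An illustrative properties fragment (values made up):

# lrd.properties fragment (illustrative)
# train.penalty=elasticnet
# train.solver=saga
# train.l1.ratio=0.5
# train.max.iter=500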
supv/lstm.py ADDED
@@ -0,0 +1,414 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
import sklearn as sk
import matplotlib
import random
import jprops
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from tnn import FeedForwardNetwork

"""
LSTM with one or more hidden layers with multi dimensional data
"""

class LstmNetwork(nn.Module):
	def __init__(self, configFile):
		"""
		In the constructor we load the configuration; the LSTM, linear and output
		activation modules are instantiated later in buildModel()

		Parameters
			configFile : config file path
		"""
		defValues = dict()
		defValues["common.mode"] = ("training", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.preprocessing"] = (None, None)
		defValues["common.scaling.method"] = ("zscale", None)
		defValues["common.scaling.minrows"] = (50, None)
		defValues["common.verbose"] = (False, None)
		defValues["common.device"] = ("cpu", None)
		defValues["train.data.file"] = (None, "missing training data file path")
		defValues["train.data.type"] = ("numeric", None)
		defValues["train.data.feat.cols"] = (None, "missing feature columns")
		defValues["train.data.target.col"] = (None, "missing target column")
		defValues["train.data.delim"] = (",", None)
		defValues["train.input.size"] = (None, "missing input size")
		defValues["train.hidden.size"] = (None, "missing hidden size")
		defValues["train.output.size"] = (None, "missing output size")
		defValues["train.num.layers"] = (1, None)
		defValues["train.seq.len"] = (1, None)
		defValues["train.batch.size"] = (32, None)
		defValues["train.batch.first"] = (False, None)
		defValues["train.drop.prob"] = (0, None)
		defValues["train.optimizer"] = ("adam", None)
		defValues["train.opt.learning.rate"] = (.0001, None)
		defValues["train.opt.weight.decay"] = (0, None)
		defValues["train.opt.momentum"] = (0, None)
		defValues["train.opt.eps"] = (1e-08, None)
		defValues["train.opt.dampening"] = (0, None)
		defValues["train.opt.momentum.nesterov"] = (False, None)
		defValues["train.opt.betas"] = ([0.9, 0.999], None)
		defValues["train.opt.alpha"] = (0.99, None)
		defValues["train.out.sequence"] = (True, None)
		defValues["train.out.activation"] = ("sigmoid", None)
		defValues["train.loss.fn"] = ("mse", None)
		defValues["train.loss.reduction"] = ("mean", None)
		defValues["train.grad.clip"] = (5, None)
		defValues["train.num.iterations"] = (500, None)
		defValues["train.save.model"] = (False, None)
		defValues["valid.data.file"] = (None, "missing validation data file path")
		defValues["valid.accuracy.metric"] = (None, None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.use.saved.model"] = (True, None)
		defValues["predict.output"] = ("binary", None)
		defValues["predict.feat.pad.size"] = (60, None)

		self.config = Configuration(configFile, defValues)

		super(LstmNetwork, self).__init__()

	def getConfig(self):
		return self.config

	def buildModel(self):
		"""
		loads configuration and builds the various pieces necessary for the model
		"""
		torch.manual_seed(9999)
		self.verbose = self.config.getBooleanConfig("common.verbose")[0]
		self.inputSize = self.config.getIntConfig("train.input.size")[0]
		self.outputSize = self.config.getIntConfig("train.output.size")[0]
		self.nLayers = self.config.getIntConfig("train.num.layers")[0]
		self.hiddenSize = self.config.getIntConfig("train.hidden.size")[0]
		self.seqLen = self.config.getIntConfig("train.seq.len")[0]
		self.batchSize = self.config.getIntConfig("train.batch.size")[0]
		self.batchFirst = self.config.getBooleanConfig("train.batch.first")[0]
		dropProb = self.config.getFloatConfig("train.drop.prob")[0]
		self.outSeq = self.config.getBooleanConfig("train.out.sequence")[0]
		self.device = FeedForwardNetwork.getDevice(self)

		#model
		self.lstm = nn.LSTM(self.inputSize, self.hiddenSize, self.nLayers, dropout=dropProb, batch_first=self.batchFirst)
		self.linear = nn.Linear(self.hiddenSize, self.outputSize)
		outAct = self.config.getStringConfig("train.out.activation")[0]
		self.outAct = FeedForwardNetwork.createActivation(outAct)

		#load training data
		dataFilePath = self.config.getStringConfig("train.data.file")[0]
		self.fCols = self.config.getIntListConfig("train.data.feat.cols")[0]
		assert len(self.fCols) == 2, "specify only start and end columns of features"
		self.tCol = self.config.getIntConfig("train.data.target.col")[0]
		self.delim = self.config.getStringConfig("train.data.delim")[0]

		self.fData, self.tData = self.loadData(dataFilePath, self.delim, self.fCols[0], self.fCols[1], self.tCol)
		self.fData = torch.from_numpy(self.fData)
		self.fData = self.fData.to(self.device)
		self.tData = torch.from_numpy(self.tData)
		self.tData = self.tData.to(self.device)

		#load validation data
		vaDataFilePath = self.config.getStringConfig("valid.data.file")[0]
		self.vfData, self.vtData = self.loadData(vaDataFilePath, self.delim, self.fCols[0], self.fCols[1], self.tCol)
		self.vfData = torch.from_numpy(self.vfData)
		self.vfData = self.vfData.to(self.device)
		self.vtData = torch.from_numpy(self.vtData)
		self.vtData = self.vtData.to(self.device)

		self.batchSize = self.config.getIntConfig("train.batch.size")[0]
		self.dataSize = self.fData.shape[0]
		self.numBatch = int(self.dataSize / self.batchSize)
		self.restored = False

		self.to(self.device)

	def loadData(self, filePath, delim, scolStart, scolEnd, targetCol):
		"""
		loads data from a file with one sequence per line, where each sequence element can be a vector

		Parameters
			filePath : file path
			delim : field delimiter
			scolStart : seq column start index
			scolEnd : seq column end index
			targetCol : target field col index
		"""
		if targetCol >= 0:
			#include target column
			cols = list(range(scolStart, scolEnd + 1, 1))
			cols.append(targetCol)
			data = np.loadtxt(filePath, delimiter=delim, usecols=cols)
			#one output for whole sequence
			sData = data[:, :-1]
			if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
				sData = self.scaleSeqData(sData)
			tData = data[:, -1]

			#target int (index into class labels) for classification
			sData = sData.astype(np.float32)
			tData = tData.astype(np.float32) if self.outputSize == 1 else tData.astype(np.int64)
			exData = (sData, tData)
		else:
			#exclude target column
			cols = list(range(scolStart, scolEnd + 1, 1))
			data = np.loadtxt(filePath, delimiter=delim, usecols=cols)

			#one output for whole sequence
			sData = data
			if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
				sData = self.scaleSeqData(sData)

			sData = sData.astype(np.float32)
			exData = sData

		return exData

	def scaleSeqData(self, sData):
		"""
		scales data after transforming to non sequence tabular format

		Parameters
			sData : sequence data
		"""
		scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
		sData = fromMultDimSeqToTabular(sData, self.inputSize, self.seqLen)
		sData = scaleData(sData, scalingMethod)
		sData = fromTabularToMultDimSeq(sData, self.inputSize, self.seqLen)
		return sData

	def formattedBatchGenerator(self):
		"""
		transforms training data from (dataSize, seqLength x inputSize) to (batch, seqLength, inputSize) tensor
		or (seqLength, batch, inputSize) tensor
		"""

		for _ in range(self.numBatch):
			bfData = torch.zeros([self.batchSize, self.seqLen, self.inputSize], dtype=torch.float32) if self.batchFirst\
			else torch.zeros([self.seqLen, self.batchSize, self.inputSize], dtype=torch.float32)
			tdType = torch.float32 if self.outputSize == 1 else torch.long
			btData = torch.zeros([self.batchSize], dtype=tdType)

			i = 0
			for bdi in range(self.batchSize):
				di = sampleUniform(0, self.dataSize-1)
				row = self.fData[di]
				for ci, cv in enumerate(row):
					si = int(ci / self.inputSize)
					ii = ci % self.inputSize
					if self.batchFirst:
						bfData[bdi][si][ii] = cv
					else:
						#print(si, bdi, ii)
						bfData[si][bdi][ii] = cv
				btData[i] = self.tData[di]
				i += 1

			#for seq output correct first 2 dimensions
			if self.outSeq and not self.batchFirst:
				btData = torch.transpose(btData,0,1)

			yield (bfData, btData)

	def formatData(self, fData, tData=None):
		"""
		transforms validation or prediction data from (dataSize, seqLength x inputSize) to
		(batch, seqLength, inputSize) tensor or (seqLength, batch, inputSize) tensor

		Parameters
			fData : feature data
			tData : target data
		"""
		dSize = fData.shape[0]
		bfData = torch.zeros([dSize, self.seqLen, self.inputSize], dtype=torch.float32) if self.batchFirst\
		else torch.zeros([self.seqLen, dSize, self.inputSize], dtype=torch.float32)

		for ri in range(dSize):
			row = fData[ri]
			for ci, cv in enumerate(row):
				si = int(ci / self.inputSize)
				ii = ci % self.inputSize
				if self.batchFirst:
					bfData[ri][si][ii] = cv
				else:
					bfData[si][ri][ii] = cv
		if tData is not None:
			btData = torch.transpose(tData,0,1) if self.outSeq and not self.batchFirst else tData
			formData = (bfData, btData)
		else:
			formData = bfData
		return formData
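The two methods above assume each input row flattens the sequence in time major order: element ci maps to sequence step si = ci // inputSize and feature ii = ci % inputSize. A tiny standalone layout check (values made up):

# illustrative: inputSize=2, seqLen=3, one flattened row
row = [11, 12, 21, 22, 31, 32]
seq = [[row[s * 2 + i] for i in range(2)] for s in range(3)]
# seq == [[11, 12], [21, 22], [31, 32]], i.e. (seqLen, inputSize);
# training files carry one extra trailing target column per row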
	def forward(self, x, h):
		"""
		Forward pass

		Parameters
			x : input data
			h : hidden state
		"""
		out, hout = self.lstm(x,h)
		if self.outSeq:
			# seq to seq prediction
			out = out.view(-1, self.hiddenSize)
			out = self.linear(out)
			if self.outAct is not None:
				out = self.outAct(out)
			out = out.view(self.batchSize * self.seqLen, -1)
		else:
			#seq to one prediction
			out = out[self.seqLen - 1].view(-1, self.hiddenSize)
			out = self.linear(out)
			if self.outAct is not None:
				out = self.outAct(out)
			#out = out.view(self.batchSize, -1)

		return out, hout

	def initHidden(self, batch):
		"""
		Initialize hidden weights

		Parameters
			batch : batch size
		"""
		hidden = (torch.zeros(self.nLayers,batch,self.hiddenSize),
		torch.zeros(self.nLayers,batch,self.hiddenSize))
		return hidden

	def trainLstm(self):
		"""
		train lstm
		"""
		print("..starting training")
		self.train()

		#device = self.config.getStringConfig("common.device")[0]
		#self.to(device)
		optimizerName = self.config.getStringConfig("train.optimizer")[0]
		self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizerName)
		lossFn = self.config.getStringConfig("train.loss.fn")[0]
		criterion = FeedForwardNetwork.createLossFunction(self, lossFn)
		clip = self.config.getFloatConfig("train.grad.clip")[0]
		numIter = self.config.getIntConfig("train.num.iterations")[0]
		accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]


		for it in range(numIter):
			b = 0
			for inputs, labels in self.formattedBatchGenerator():
				#forward pass
				hid = self.initHidden(self.batchSize)
				hid = (hid[0].to(self.device), hid[1].to(self.device))
				inputs, labels = inputs.to(self.device), labels.to(self.device)
				output, hid = self(inputs, hid)

				#loss
				if self.outSeq:
					labels = labels.view(self.batchSize * self.seqLen, -1)
				loss = criterion(output, labels)

				if self.verbose and it % 50 == 0 and b % 10 == 0:
					print("epoch {} batch {} loss {:.6f}".format(it, b, loss.item()))

				# zero gradients, perform a backward pass, and update the weights.
				self.optimizer.zero_grad()
				loss.backward()
				nn.utils.clip_grad_norm_(self.parameters(), clip)
				self.optimizer.step()
				b += 1

		#validate
		print("..validating model")
		self.eval()
		with torch.no_grad():
			fData, tData = self.formatData(self.vfData, self.vtData)
			fData = fData.to(self.device)
			vsize = tData.shape[0]
			hid = self.initHidden(vsize)
			hid = (hid[0].to(self.device), hid[1].to(self.device))
			yPred, _ = self(fData, hid)
			yPred = yPred.data.cpu().numpy()
			yActual = tData.data.cpu().numpy()

			if self.verbose:
				print("\npredicted \t\t actual")
				for i in range(vsize):
					print(str(yPred[i]) + "\t" + str(yActual[i]))

			score = perfMetric(accMetric, yActual, yPred)
			print(formatFloat(3, score, "perf score"))

		#save
		modelSave = self.config.getBooleanConfig("train.save.model")[0]
		if modelSave:
			FeedForwardNetwork.saveCheckpt(self)

	def predictLstm(self):
		"""
		predict
		"""
		print("..predicting using model")
		useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
		if useSavedModel:
			FeedForwardNetwork.restoreCheckpt(self)
		else:
			self.trainLstm()

		prDataFilePath = self.config.getStringConfig("predict.data.file")[0]
		pfData = self.loadData(prDataFilePath, self.delim, self.fCols[0], self.fCols[1], -1)
		pfData = torch.from_numpy(pfData)
		dsize = pfData.shape[0]

		#predict
		#device = self.config.getStringConfig("common.device")[0]
		self.eval()
		with torch.no_grad():
			fData = self.formatData(pfData)
			fData = fData.to(self.device)
			hid = self.initHidden(dsize)
			hid = (hid[0].to(self.device), hid[1].to(self.device))
			yPred, _ = self(fData, hid)
			yPred = yPred.data.cpu().numpy()

		if self.outputSize == 2:
			#classification
			yPred = FeedForwardNetwork.processClassifOutput(yPred, self.config)

		# print prediction
		FeedForwardNetwork.printPrediction(yPred, self.config, prDataFilePath)
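An end to end usage sketch; the config path is illustrative, and mode dispatch is assumed to live in the caller since the class itself only exposes buildModel, trainLstm and predictLstm.

# illustrative driver (made-up config path)
lstm = LstmNetwork("lstm.properties")
lstm.buildModel()
lstm.trainLstm()	#trains, validates and optionally checkpoints
lstm.predictLstm()	#restores saved model when predict.use.saved.model is True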
supv/mcalib.py ADDED
@@ -0,0 +1,384 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.neighbors import KDTree
import matplotlib
import random
import jprops
from random import randint
import statistics
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from tnn import *
from stats import *

"""
neural model calibration
"""
class ModelCalibration(object):
	def __init__(self):
		pass

	@staticmethod
	def findModelCalibration(model):
		"""
		model calibration
		"""
		FeedForwardNetwork.prepValidate(model)
		FeedForwardNetwork.validateModel(model)

		yPred = model.yPred.flatten()
		yActual = model.validOutData.flatten()
		nsamp = len(yActual)

		#print(yPred.shape)
		#print(yActual.shape)

		nBins = model.config.getIntConfig("calibrate.num.bins")[0]
		prThreshhold = model.config.getFloatConfig("calibrate.pred.prob.thresh")[0]

		minConf = yPred.min()
		maxConf = yPred.max()
		bsize = (maxConf - minConf) / nBins
		#print("minConf {:.3f} maxConf {:.3f} bsize {:.3f}".format(minConf, maxConf, bsize))
		blist = list(map(lambda i : None, range(nBins)))

		#binning
		for yp, ya in zip(yPred, yActual):
			indx = int((yp - minConf) / bsize)
			if indx == nBins:
				indx = nBins - 1
			#print("yp {:.3f} indx {}".format(yp, indx))
			pair = (yp, ya)
			plist = blist[indx]
			if plist is None:
				plist = list()
				blist[indx] = plist
			plist.append(pair)

		x = list()
		y = list()
		yideal = list()
		ece = 0
		mce = 0

		# per bin confidence and accuracy
		b = 0
		for plist in blist:
			if plist is not None:
				#confidence
				ypl = list(map(lambda p : p[0], plist))
				ypm = statistics.mean(ypl)
				x.append(ypm)

				#accuracy
				ypcount = 0
				for p in plist:
					yp = 1 if p[0] > prThreshhold else 0
					if (yp == 1 and p[1] == 1):
						ypcount += 1

				acc = ypcount / len(plist)
				y.append(acc)
				yideal.append(ypm)

				ce = abs(ypm - acc)
				ece += len(plist) * ce
				if ce > mce:
					mce = ce
			else:
				#empty bin
				ypm = minConf + (b + 0.5) * bsize
				x.append(ypm)
				yideal.append(ypm)
				y.append(0)
			b += 1

		#calibration plot
		drawPairPlot(x, y, yideal, "confidence", "accuracy", "actual", "ideal")

		print("confidence\taccuracy")
		for z in zip(x,y):
			print("{:.3f}\t{:.3f}".format(z[0], z[1]))

		#expected calibration error
		ece /= nsamp
		print("expected calibration error\t{:.3f}".format(ece))
		print("maximum calibration error\t{:.3f}".format(mce))
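In the loop above each bin contributes its sample count times the confidence accuracy gap, which is the usual expected calibration error ECE = sum over bins of (n_b / N) * |conf_b - acc_b|, while MCE takes the maximum gap. A tiny standalone check with made-up bin statistics:

# made-up bins: (count, mean confidence, accuracy)
bins = [(50, 0.62, 0.55), (30, 0.81, 0.80), (20, 0.95, 0.85)]
n = sum(c for c, _, _ in bins)
ece = sum(c * abs(conf - acc) for c, conf, acc in bins) / n
mce = max(abs(conf - acc) for _, conf, acc in bins)
# ece == (50*0.07 + 30*0.01 + 20*0.10) / 100 == 0.058, mce == 0.10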
	@staticmethod
	def findModelCalibrationLocal(model):
		"""
		model calibration based on k nearest neighbors
		"""
		FeedForwardNetwork.prepValidate(model)
		FeedForwardNetwork.validateModel(model)

		yPred = model.yPred.flatten()
		yActual = model.validOutData.flatten()
		nsamp = len(yActual)

		neighborCnt = model.config.getIntConfig("calibrate.num.nearest.neighbors")[0]
		prThreshhold = model.config.getFloatConfig("calibrate.pred.prob.thresh")[0]
		fData = model.validFeatData.numpy()
		tree = KDTree(fData, leaf_size=4)

		dist, ind = tree.query(fData, k=neighborCnt)
		calibs = list()
		#all data
		for si, ni in enumerate(ind):
			conf = 0
			ypcount = 0
			#all neighbors
			for i in ni:
				conf += yPred[i]
				yp = 1 if yPred[i] > prThreshhold else 0
				if (yp == 1 and yActual[i] == 1):
					ypcount += 1
			conf /= neighborCnt
			acc = ypcount / neighborCnt
			calib = (si, conf, acc)
			calibs.append(calib)

		#descending sort by difference between confidence and accuracy
		calibs = sorted(calibs, key=lambda c : abs(c[1] - c[2]), reverse=True)
		print("local calibration")
		print("conf\taccu\trecord")
		for i in range(19):
			si, conf, acc = calibs[i]
			rec = toStrFromList(fData[si], 3)
			print("{:.3f}\t{:.3f}\t{}".format(conf, acc, rec))

	@staticmethod
	def findModelSharpness(model):
		"""
		model sharpness
		"""
		FeedForwardNetwork.prepValidate(model)
		FeedForwardNetwork.validateModel(model)

		yPred = model.yPred.flatten()
		yActual = model.validOutData.flatten()
		nsamp = len(yActual)

		#print(yPred.shape)
		#print(yActual.shape)

		nBins = model.config.getIntConfig("calibrate.num.bins")[0]
		prThreshhold = model.config.getFloatConfig("calibrate.pred.prob.thresh")[0]

		minConf = yPred.min()
		maxConf = yPred.max()
		bsize = (maxConf - minConf) / nBins
		#print("minConf {:.3f} maxConf {:.3f} bsize {:.3f}".format(minConf, maxConf, bsize))
		blist = list(map(lambda i : None, range(nBins)))

		#binning
		for yp, ya in zip(yPred, yActual):
			indx = int((yp - minConf) / bsize)
			if indx == nBins:
				indx = nBins - 1
			#print("yp {:.3f} indx {}".format(yp, indx))
			pair = (yp, ya)
			plist = blist[indx]
			if plist is None:
				plist = list()
				blist[indx] = plist
			plist.append(pair)

		y = list()
		ypgcount = 0
		# per bin confidence and accuracy
		for plist in blist:
			if plist is None:
				#empty bin
				y.append(0)
				continue
			#ypl = list(map(lambda p : p[0], plist))
			#ypm = statistics.mean(ypl)
			#x.append(ypm)

			ypcount = 0
			for p in plist:
				yp = 1 if p[0] > prThreshhold else 0
				if (yp == 1 and p[1] == 1):
					ypcount += 1
					ypgcount += 1

			acc = ypcount / len(plist)
			y.append(acc)

		print("{} {}".format(ypgcount, nsamp))
		accg = ypgcount / nsamp
		accgl = [accg] * nBins
		x = list(range(nBins))
		drawPairPlot(x, y, accgl, "discretized confidence", "accuracy", "local", "global")

		contrast = list(map(lambda acc : abs(acc - accg), y))
		contrast = statistics.mean(contrast)
		print("contrast {:.3f}".format(contrast))

"""
neural model robustness
"""
class ModelRobustness(object):
	def __init__(self):
		pass

	def localPerformance(self, model, fpath, nsamp, neighborCnt):
		"""
		local performance sampling
		"""

		#load data
		fData, oData = FeedForwardNetwork.prepData(model, fpath)
		#print(type(fData))
		#print(type(oData))
		#print(fData.shape)
		dsize = fData.shape[0]
		ncol = fData.shape[1]

		#kd tree for neighborhood query
		tree = KDTree(fData, leaf_size=4)

		scores = list()
		indices = list()
		for _ in range(nsamp):
			indx = randomInt(0, dsize - 1)
			indices.append(indx)
			frow = fData[indx]
			frow = np.reshape(frow, (1, ncol))
			dist, ind = tree.query(frow, k=neighborCnt)

			ind = ind[0]
			vfData = fData[ind]
			voData = oData[ind]

			#print(type(vfData))
			#print(vfData.shape)
			#print(type(voData))
			#print(voData.shape)

			model.setValidationData((vfData, voData), False)
			score = FeedForwardNetwork.validateModel(model)
			scores.append(score)

		#performance distribution
		m, s = basicStat(scores)
		print("model performance: mean {:.3f}\tstd dev {:.3f}".format(m,s))
		drawHist(scores, "model accuracy", "accuracy", "frequency")

		#worst performance
		lscores = sorted(zip(indices, scores), key=lambda s : s[1])
		print(lscores[:5])

		lines = getFileLines(fpath, None)
		print("worst performing feature regions")
		for i,s in lscores[:5]:
			print("score {:.3f}\t{}".format(s, lines[i]))


"""
conformal prediction for regression
"""
class ConformalRegressionPrediction(object):
	def __init__(self):
		self.calibration = dict()

	def calibrate(self, ypair, confBound):
		"""
		calibration for conformal prediction
		"""
		cscores = list()
		ymax = None
		ymin = None
		for yp, ya in ypair:
			cscore = abs(yp - ya)
			cscores.append(cscore)
			if ymax is None:
				ymax = ya
				ymin = ya
			else:
				ymax = ya if ya > ymax else ymax
				ymin = ya if ya < ymin else ymin

		cscores.sort()
		drawHist(cscores, "conformal score distribution", "conformal score", "frequency", 20)
		cbi = int(confBound * len(cscores))
		scoreConfBound = cscores[cbi]
		self.calibration["scoreConfBound"] = scoreConfBound
		self.calibration["ymin"] = ymin
		self.calibration["ymax"] = ymax
		print(self.calibration)

	def saveCalib(self, fPath):
		"""
		saves conformal score calibration
		"""
		saveObject(self.calibration, fPath)

	def restoreCalib(self, fPath):
		"""
		restores conformal score calibration
		"""
		self.calibration = restoreObject(fPath)
		print(self.calibration)

	def getPredRange(self, yp, nstep=100):
		"""
		get prediction range and related data
		"""
		ymin = self.calibration["ymin"]
		ymax = self.calibration["ymax"]
		step = (ymax - ymin) / nstep
		scoreConfBound = self.calibration["scoreConfBound"]

		rmin = None
		rmax = None
		rcount = 0
		#print(ymin, ymax, step)
		for ya in np.arange(ymin, ymax, step):
			cscore = abs(yp - ya)
			if cscore < scoreConfBound:
				if rmin is None:
					#lower bound
					rmin = ya
					rmax = ya
				else:
					#keep updating upper bound
					rmax = ya if ya > rmax else rmax
				rcount += 1
			else:
				if rmax is not None and rcount > 0:
					#past upper bound
					break

		res = dict()
		res["predRangeMin"] = rmin
		res["predRangeMax"] = rmax
		accepted = yp >= rmin and yp <= rmax
		res["status"] = "accepted" if accepted else "rejected"
		conf = 1.0 - (rmax - rmin) / (ymax - ymin)
		res["confidence"] = conf

		return res
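A self contained usage sketch for the conformal predictor above; the (predicted, actual) calibration pairs and the query value are made up, and calibrate() also pops a histogram via drawHist.

# illustrative use with synthetic calibration pairs
cp = ConformalRegressionPrediction()
ypair = [(10.2, 10.0), (11.8, 12.1), (9.5, 9.9), (14.2, 13.6), (8.1, 8.4)]
cp.calibrate(ypair, 0.9)	#90th percentile conformal score bound
res = cp.getPredRange(11.0)
print(res["predRangeMin"], res["predRangeMax"], res["status"], res["confidence"])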
supv/mcclf.py ADDED
@@ -0,0 +1,207 @@
#!/usr/local/bin/python3

# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import math
import matplotlib.pyplot as plt
import numpy as np
import random
import jprops
from random import randint
from matumizi.util import *
from matumizi.mlutil import *

"""
Markov chain classifier
"""
class MarkovChainClassifier():
	def __init__(self, configFile):
		"""
		constructor

		Parameters
			configFile: config file path
		"""
		defValues = {}
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["common.states"] = (None, "missing state list")
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.class.labels"] = (["F", "T"], None)
		defValues["train.data.key.len"] = (1, None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.use.saved.model"] = (True, None)
		defValues["predict.log.odds.threshold"] = (0, None)
		defValues["validate.data.file"] = (None, "missing validation data file")
		defValues["validate.use.saved.model"] = (False, None)
		defValues["valid.accuracy.metric"] = ("acc", None)
		self.config = Configuration(configFile, defValues)

		self.stTranPr = dict()
		self.clabels = self.config.getStringListConfig("train.data.class.labels")[0]
		self.states = self.config.getStringListConfig("common.states")[0]
		self.nstates = len(self.states)
		for cl in self.clabels:
			#initialize counts to 1 for Laplace smoothing of transitions
			stp = np.ones((self.nstates,self.nstates))
			self.stTranPr[cl] = stp

	def train(self):
		"""
		trains model
		"""
		#state transition matrix
		tdfPath = self.config.getStringConfig("train.data.file")[0]
		klen = self.config.getIntConfig("train.data.key.len")[0]
		for rec in fileRecGen(tdfPath):
			cl = rec[klen]
			rlen = len(rec)
			for i in range(klen+1, rlen-1, 1):
				fst = self.states.index(rec[i])
				tst = self.states.index(rec[i+1])
				self.stTranPr[cl][fst][tst] += 1

		#normalize to probability
		for cl in self.clabels:
			stp = self.stTranPr[cl]
			for i in range(self.nstates):
				s = stp[i].sum()
				r = stp[i] / s
				stp[i] = r

		#save
		if self.config.getBooleanConfig("train.model.save")[0]:
			mdPath = self.config.getStringConfig("common.model.directory")[0]
			assert os.path.exists(mdPath), "model save directory does not exist"
			mfPath = self.config.getStringConfig("common.model.file")[0]
			mfPath = os.path.join(mdPath, mfPath)

			with open(mfPath, "w") as fh:
				for cl in self.clabels:
					fh.write("label:" + cl + "\n")
					stp = self.stTranPr[cl]
					for r in stp:
						rs = ",".join(toStrList(r, 6)) + "\n"
						fh.write(rs)

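	#To make the log odds decision rule used below concrete, a small hedged
	#sketch (the two 2x2 transition matrices and the state sequence are made
	#up; the accumulation mirrors __getPrediction). Uncommented, it runs
	#standalone:
	#
	#	import math
	#	import numpy as np
	#	stTranPr = {"F": np.array([[0.8, 0.2], [0.6, 0.4]]),
	#		"T": np.array([[0.3, 0.7], [0.2, 0.8]])}
	#	states = ["L", "H"]
	#	seq = ["L", "H", "H"]
	#	lodds = 0.0
	#	for s1, s2 in zip(seq[:-1], seq[1:]):
	#		fst, tst = states.index(s1), states.index(s2)
	#		lodds += math.log(stTranPr["T"][fst][tst] / stTranPr["F"][fst][tst])
	#	#log(0.7/0.2) + log(0.8/0.4) > 0, so the sequence is classified "T"
	#	print("T" if lodds > 0 else "F")
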
	def validate(self):
		"""
		validates using model
		"""
		useSavedModel = self.config.getBooleanConfig("validate.use.saved.model")[0]
		if useSavedModel:
			self.__restoreModel()
		else:
			self.train()

		vdfPath = self.config.getStringConfig("validate.data.file")[0]
		accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]

		yac, ypr = self.__getPrediction(vdfPath, True)
		if type(self.clabels[0]) == str:
			yac = self.__toIntClabel(yac)
			ypr = self.__toIntClabel(ypr)
		score = perfMetric(accMetric, yac, ypr)
		print(formatFloat(3, score, "perf score"))


	def predict(self):
		"""
		predicts using model
		"""
		useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
		if useSavedModel:
			self.__restoreModel()
		else:
			self.train()

		#predict
		pdfPath = self.config.getStringConfig("predict.data.file")[0]
		_ , ypr = self.__getPrediction(pdfPath)
		return ypr

	def __restoreModel(self):
		"""
		restores model
		"""
		mdPath = self.config.getStringConfig("common.model.directory")[0]
		assert os.path.exists(mdPath), "model save directory does not exist"
		mfPath = self.config.getStringConfig("common.model.file")[0]
		mfPath = os.path.join(mdPath, mfPath)
		stp = None
		cl = None
		for rec in fileRecGen(mfPath):
			if len(rec) == 1:
				if stp is not None:
					stp = np.array(stp)
					self.stTranPr[cl] = stp
				cl = rec[0].split(":")[1]
				stp = list()
			else:
				frec = asFloatList(rec)
				stp.append(frec)

		stp = np.array(stp)
		self.stTranPr[cl] = stp

	def __getPrediction(self, fpath, validate=False):
		"""
		gets predictions

		Parameters
			fpath : data file path
			validate: True if validation
		"""

		nc = self.clabels[0]
		pc = self.clabels[1]
		thold = self.config.getFloatConfig("predict.log.odds.threshold")[0]
		klen = self.config.getIntConfig("train.data.key.len")[0]
		offset = klen+1 if validate else klen
		ypr = list()
		yac = list()
		for rec in fileRecGen(fpath):
			lodds = 0
			rlen = len(rec)
			for i in range(offset, rlen-1, 1):
				fst = self.states.index(rec[i])
				tst = self.states.index(rec[i+1])
				odds = self.stTranPr[pc][fst][tst] / self.stTranPr[nc][fst][tst]
				lodds += math.log(odds)
			prc = pc if lodds > thold else nc
			ypr.append(prc)
			if validate:
				yac.append(rec[klen])
			else:
				recp = prc + "\t" + ",".join(rec)
				print(recp)

		res = (yac, ypr)
		return res

	def __toIntClabel(self, labels):
		"""
		converts string class label to int

		Parameters
			labels : class label values
		"""
		return list(map(lambda l : self.clabels.index(l), labels))

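A short, hedged driver sketch for the class above. The properties file name and its contents are hypothetical; the keys it must set (common.states, train.data.file, validate.data.file and so on) are the ones declared in defValues in the constructor.

#illustrative driver, not part of the original file
clf = MarkovChainClassifier("mcclf.properties")
clf.train()		#builds per class transition matrices
clf.validate()	#prints perf score using valid.accuracy.metric
preds = clf.predict()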
supv/nlm.py ADDED
@@ -0,0 +1,434 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

import os
import sys
import math
from random import randint
import random
import time
from datetime import datetime
import re, string, unicodedata
import spacy
import torch
from collections import defaultdict
import pickle
import numpy as np
from sentence_transformers import CrossEncoder

sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *

"""
neural language model
"""

class NeuralLangModel(object):
	def __init__(self):
		"""
		initialize
		"""
		self.dexts = None

	def loadDocs(self, fpaths):
		"""
		loads documents from one file, all files under a directory or a list of files
		"""
		fPaths = fpaths.split(",")
		if len(fPaths) == 1:
			if os.path.isfile(fPaths[0]):
				#one file
				print("got one file from path")
				dnames = fPaths
				docStr = getOneFileContent(fPaths[0])
				dtexts = [docStr]
			else:
				#all files under directory
				print("got all files under directory from path")
				dtexts, dnames = getFileContent(fPaths[0])
				print("found following files")
				for dt, dn in zip(dtexts, dnames):
					print(dn + "\t" + dt[:40])
		else:
			#list of files
			print("got list of files from path")
			dnames = fPaths
			dtexts = list(map(getOneFileContent, fPaths))

		ndocs = (dtexts, dnames)
		return ndocs

#encoded doc
class EncodedDoc:
	def __init__(self, dtext, dname, drank=None):
		"""
		initialize
		"""
		self.dtext = dtext
		self.dname = dname
		self.drank = drank
		self.denc = None
		self.score = None

	def encode(self, nlp):
		"""
		encode
		"""
		self.denc = nlp(self.dtext)

#similarity at token and sentence level for BERT encoding
class SemanticSearch:
	def __init__(self, docs=None):
		"""
		initialize
		"""
		print("loading BERT transformer model")
		self.nlp = spacy.load("en_trf_bertbaseuncased_lg")
		self.docs = docs if docs is not None else list()

	def docAv(self, qu, doc):
		"""
		whole doc similarity (ds)
		"""
		return qu.similarity(doc)

	def tokSimAv(self, qu, doc):
		"""
		token pair wise average (tsa)
		"""
		qts = self.simAll(self.__getTensor(qu), self.__getTensor(doc))
		asi = np.mean(qts)
		return asi

	def tokSimMed(self, qu, doc):
		"""
		token pair wise median (tsme)
		"""
		qts = self.simAll(self.__getTensor(qu), self.__getTensor(doc))
		asi = np.median(qts)
		return asi

	def tokSimMax(self, qu, doc):
		"""
		token pair wise max (tsma)
		"""
		qte = self.__getTensor(qu)
		dte = self.__getTensor(doc)
		return self.simMax(qte, dte)

	def tokSimAvMax(self, qu, doc):
		"""
		token max then average (tsavm)
		"""
		qte = self.__getTensor(qu)
		dte = self.__getTensor(doc)
		return self.simAvMax(qte, dte)

	def tokSimMaxAv(self, qu, doc):
		"""
		token average and then max (tsmav)
		"""
		qte = self.__getTensor(qu)
		dte = self.__getTensor(doc)
		return self.simMaxAv(qte, dte)

	def sentSimAv(self, qu, doc):
		"""
		sentence wise average (ssa)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		sims = self.simAll(qse, dse)
		return np.mean(sims)

	def sentSimMed(self, qu, doc):
		"""
		sentence wise median (ssme)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		sims = self.simAll(qse, dse)
		return np.median(sims)

	def sentSimMax(self, qu, doc):
		"""
		sentence wise max (ssma)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		sims = self.simAll(qse, dse)
		return np.max(sims)


	def sentSimAvMax(self, qu, doc):
		"""
		sentence max then average (ssavm)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		return self.simAvMax(qse, dse)

	def sentSimMaxAv(self, qu, doc):
		"""
		sentence average and then max (ssmav)
		"""
		qse, dse = self.__sentEnc(qu, doc)
		return self.simMaxAv(qse, dse)

	def simMax(self, qte, dte):
		"""
		max similarity between 2 sets of elements
		"""
		msi = 0
		for qt in qte:
			for dt in dte:
				si = cosineSimilarity(qt, dt)
				if not math.isnan(si) and si > msi:
					msi = si
		return msi

	def simAvMax(self, qte, dte):
		"""
		max then average
		"""
		qts = list()
		for qt in qte:
			msi = 0
			for dt in dte:
				si = cosineSimilarity(qt, dt)
				if not math.isnan(si) and si > msi:
					msi = si
			qts.append(msi)

		amsi = np.mean(np.array(qts))
		return amsi

	def simMaxAv(self, lqe, lde):
		"""
		average and then max
		"""
		masi = 0
		for qe in lqe:
			qes = list()
			for de in lde:
				si = cosineSimilarity(qe, de)
				if not math.isnan(si):
					qes.append(si)
			av = np.mean(np.array(qes))
			if av > masi:
				masi = av
		return masi

	def simAll(self, lqe, lde):
		"""
		all pair wise similarities
		"""
		qes = list()
		for qe in lqe:
			for de in lde:
				si = cosineSimilarity(qe, de)
				if not math.isnan(si):
					qes.append(si)
		return np.array(qes)

	def __sentEnc(self, qu, doc):
		"""
		sentence encoding for query and doc
		"""
		qstr = qu._.trf_word_pieces_
		qte = zip(qstr, qu._.trf_last_hidden_state)
		qse = list()
		for t, v in qte:
			if t == "[CLS]":
				qse.append(v)


		dstr = doc._.trf_word_pieces_
		dte = zip(dstr, doc._.trf_last_hidden_state)
		dse = list()
		for t, v in dte:
			if t == "[CLS]":
				dse.append(v)

		enp = (np.array(qse), np.array(dse))
		return enp

	def __getTensor(self, toks):
		"""
		tensors from tokens
		"""
		return list(map(lambda t: t.tensor, toks))

	def addDocs(self, docs):
		"""
		add named doc content
		"""
		self.docs.extend(docs)

	def loadDocs(self, fpaths):
		"""
		loads documents from one file, all files under a directory or a list of files
		"""
		fPaths = fpaths.split(",")
		if len(fPaths) == 1:
			if os.path.isfile(fPaths[0]):
				#one file
				print("one file")
				dnames = fPaths
				docStr = getOneFileContent(fPaths[0])
				dtexts = [docStr]
			else:
				#all files under directory
				print("all files under directory")
				dtexts, dnames = getFileContent(fPaths[0])
				print("found following files")
				for dt, dn in zip(dtexts, dnames):
					print(dn + "\t" + dt[:40])
		else:
			#list of files
			print("list of files")
			dnames = fPaths
			dtexts = list(map(getOneFileContent, fPaths))

		docs = list(map(lambda dt : EncodedDoc(dt[0], dt[1]), zip(dtexts, dnames)))
		self.docs.extend(docs)

	def search(self, qstr, algo, gdranks=None):
		"""
		searches all documents, scoring each against the query
		"""
		qv = self.nlp(qstr)
		res = list()
		for d in self.docs:
			dn = d.dname
			if d.denc is None:
				d.encode(self.nlp)
			dv = d.denc
			if algo == "ds":
				si = self.docAv(qv, dv)
			elif algo == "tsa":
				si = self.tokSimAv(qv, dv)
			elif algo == "tsme":
				si = self.tokSimMed(qv, dv)
			elif algo == "tsma":
				si = self.tokSimMax(qv, dv)
			elif algo == "tsavm":
				si = self.tokSimAvMax(qv, dv)
			elif algo == "tsmav":
				si = self.tokSimMaxAv(qv, dv)
			elif algo == "ssa":
				si = self.sentSimAv(qv, dv)
			elif algo == "ssme":
				si = self.sentSimMed(qv, dv)
			elif algo == "ssma":
				si = self.sentSimMax(qv, dv)
			elif algo == "ssavm":
				si = self.sentSimAvMax(qv, dv)
			elif algo == "ssmav":
				si = self.sentSimMaxAv(qv, dv)
			else:
				si = -1.0
				print("invalid similarity algo")

			#print("{} score {:.6f}".format(dn, si))
			d.score = si
			r = (dn, si)
			res.append(r)

		#search score for each document
		res.sort(key=lambda r : r[1], reverse=True)
		print("\nsorted search result")
		print("query: {} matching algo: {}".format(qstr, algo))
		for r in res:
			print("{} score {:.3f}".format(r[0], r[1]))

		#rank order if gold truth rank provided
		if gdranks is not None:
			i = 0
			count = 0
			for d in gdranks:
				while i < len(gdranks):
					if d == res[i][0]:
						count += 1
						i += 1
						break
					i += 1
			ro = count / len(gdranks)
			print("rank order {:.3f}".format(ro))

+
371
+ #similarity at passage or paragraph level using sbertcross encoder
372
+ class SemanticSimilaityCrossEnc(NeuralLangModel):
373
+
374
+ def __init__(self, docs=None):
375
+ self.dparas = None
376
+ self.scores = None
377
+ print("loading cross encoder")
378
+ self.model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2")
379
+ print("done loading cross encoder")
380
+ super(NeuralLangModel, self).__init__()
381
+
382
+ def paraSimilarity(self, dtext, fpaths, minParNl=1):
383
+ """
384
+ returns paragarph pair similarity across 2 documents
385
+ """
386
+ dtexts, dnames = self.loadDocs(fpaths)
387
+ if dtext is None:
388
+ assertEqual(len(dtexts), 2, "exactly 2 files needed")
389
+ self.dtexts = dtexts
390
+ else:
391
+ assertEqual(len(dtexts), 1, "exactly 1 file needed")
392
+ self.dtexts = list()
393
+ self.dtexts.append(dtext)
394
+ self.dtexts.append(dtexts[0])
395
+
396
+
397
+ self.dparas = list()
398
+ for text in self.dtexts:
399
+ regx = "\n+" if minParNl == 1 else "\n{2,}"
400
+ paras = re.split(regx, text.replace("\r\n", "\n"))
401
+ print("no of paras {}".format(len(paras)))
402
+ self.dparas.append(paras)
403
+
404
+ tinp = list()
405
+ for para1 in self.dparas[0]:
406
+ inp = list(map(lambda para2: [para1, para2], self.dparas[1]))
407
+ tinp.extend(inp)
408
+
409
+ print("input shape " + str(np.array(tinp).shape))
410
+ scores = self.model.predict(tinp)
411
+ print("score shape " + str(np.array(scores).shape))
412
+ #assertEqual(len(scores), len(self.dparas[0]) * len(self.dparas[1]), "no of scores don't match no of paragraph pairs")
413
+ print(scores)
414
+
415
+ i = 0
416
+ print("text paragraph pair wise similarity")
417
+ for para1 in self.dparas[0]:
418
+ for para2 in self.dparas[1]:
419
+ print("first: {}\t second: {}\t score: {:.6f}".format(para1[:20], para2[:20], scores[i]))
420
+ i += 1
421
+
422
+ self.scores = scores
423
+
424
+ def avMaxScore(self):
425
+ """
426
+ """
427
+ pass
428
+
429
+ def ner(text, nlp):
430
+ #nlp = spacy.load("en_core_web_md")
431
+ doc = nlp(text)
432
+ for ent in doc.ents:
433
+ print(ent.text, ent.start_char, ent.end_char, ent.label_)
434
+
supv/optunar.py ADDED
@@ -0,0 +1,127 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import torch
from torch.utils.data import DataLoader
import random
import jprops
from random import randint
import optuna
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from tnn import FeedForwardNetwork

"""
neural network hyperparameter tuning with optuna
"""

def createTunerConfig(configFile):
	"""
	create tuner config object
	"""
	defValues = dict()
	defValues["train.num.layers"] = ([2,4], None)
	defValues["train.num.units"] = (None, "missing range of number of units")
	defValues["train.activation"] = ("relu", None)
	defValues["train.batch.normalize"] = (["true", "false"], None)
	defValues["train.dropout.prob"] = ([-0.1, 0.5], None)
	defValues["train.out.num.units"] = (None, "missing number of output units")
	defValues["train.out.activation"] = (None, "missing output activation")
	defValues["train.batch.size"] = ([16, 128], None)
	defValues["train.opt.learning.rate"] = ([.0001, .005], None)

	config = Configuration(configFile, defValues)
	return config

def showStudyResults(study):
	"""
	shows study results
	"""
	print("Number of finished trials: ", len(study.trials))
	print("Best trial:")
	trial = study.best_trial
	print("Value: ", trial.value)
	print("Params: ")
	for key, value in trial.params.items():
		print("    {}: {}".format(key, value))


def objective(trial, networkType, modelConfigFile, tunerConfigFile):
	"""
	optuna based hyperparameter tuning for neural network
	"""
	tConfig = createTunerConfig(tunerConfigFile)

	#tuning parameters
	nlayers = tConfig.getIntListConfig("train.num.layers")[0]
	nunits = tConfig.getIntListConfig("train.num.units")[0]
	act = tConfig.getStringConfig("train.activation")[0]
	dropOutRange = tConfig.getFloatListConfig("train.dropout.prob")[0]
	outNunits = tConfig.getIntConfig("train.out.num.units")[0]
	outAct = tConfig.getStringConfig("train.out.activation")[0]
	batchSizes = tConfig.getIntListConfig("train.batch.size")[0]
	learningRates = tConfig.getFloatListConfig("train.opt.learning.rate")[0]

	numLayers = trial.suggest_int("numLayers", nlayers[0], nlayers[1])

	#batch normalize on for all layers or none
	batchNormOptions = ["true", "false"]
	batchNorm = trial.suggest_categorical("batchNorm", batchNormOptions)

	layerConfig = ""
	maxUnits = nunits[1]
	sep = ":"
	for i in range(numLayers):
		if i < numLayers - 1:
			nunit = trial.suggest_int("numUnits_l{}".format(i), nunits[0], maxUnits)
			dropOut = trial.suggest_float("dropOut_l{}".format(i), dropOutRange[0], dropOutRange[1])
			lconfig = [str(nunit), act, batchNorm, "true", "{:.3f}".format(dropOut)]
			lconfig = sep.join(lconfig) + ","
			maxUnits = nunit
		else:
			lconfig = [str(outNunits), outAct, "false", "false", "{:.3f}".format(-0.1)]
			lconfig = sep.join(lconfig)
		layerConfig = layerConfig + lconfig

	batchSize = trial.suggest_int("batchSize", batchSizes[0], batchSizes[1])
	learningRate = trial.suggest_float("learningRate", learningRates[0], learningRates[1])

	#train model
	nnModel = FeedForwardNetwork(modelConfigFile)
	nnModel.setConfigParam("train.layer.data", layerConfig)
	nnModel.setConfigParam("train.batch.size", batchSize)
	nnModel.setConfigParam("train.opt.learning.rate", learningRate)
	nnModel.buildModel()
	score = FeedForwardNetwork.batchTrain(nnModel)
	return score

if __name__ == "__main__":
	assert len(sys.argv) == 5, "requires 4 command line args"

	networkType = sys.argv[1]
	modelConfigFile = sys.argv[2]
	tunerConfigFile = sys.argv[3]
	numTrial = int(sys.argv[4])

	study = optuna.create_study()
	study.optimize(lambda trial: objective(trial, networkType, modelConfigFile, tunerConfigFile), n_trials=numTrial)

	showStudyResults(study)

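For clarity, a hedged sketch of what one sampled trial feeds into train.layer.data given the defaults above. The exact field semantics belong to FeedForwardNetwork's layer spec; the values below are illustrative only, with numLayers=3 and batchNorm="true", hidden sizes shrinking because maxUnits is lowered to the previous layer's width, and the final layer carrying the -0.100 no-dropout sentinel.

#hypothetical value produced by one trial
#"96:relu:true:true:0.350,48:relu:true:true:0.120,1:sigmoid:false:false:-0.100"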
supv/pasearch.py ADDED
@@ -0,0 +1,243 @@
#!/Users/pranab/Tools/anaconda/bin/python

# Package imports
import os
import sys
import numpy as np
import sklearn as sk
import random
import jprops
import abc
import math
sys.path.append(os.path.abspath("../lib"))
from util import *

#base parameter search
class BaseParameterSearch(object):
	__metaclass__ = abc.ABCMeta

	def __init__(self, verbose):
		self.verbose = verbose
		self.parameters = []
		self.paramData = {}
		self.currentParams = []
		self.curIter = 0
		self.bestSolution = None

	# add param name and type
	def addParam(self, param):
		self.parameters.append(param)

	# add param data
	def addParamVaues(self, paramName, paramData):
		self.paramData[paramName] = paramData

	# max iterations
	def setMaxIter(self, maxIter):
		self.maxIter = maxIter

	@abc.abstractmethod
	def prepare(self):
		pass

	@abc.abstractmethod
	def nextParamValues(self):
		pass

	@abc.abstractmethod
	def setCost(self, cost):
		pass

	# get best solution
	def getBestSolution(self):
		return self.bestSolution

#enumerate through provided list of param values
class GuidedParameterSearch:
	def __init__(self, verbose=False):
		self.verbose = verbose
		self.parameters = []
		self.paramData = {}
		self.paramIndexes = []
		self.numParamValues = []
		self.currentParams = []
		self.bestSolution = None

	# max iterations
	def setMaxIter(self,maxIter):
		self.maxIter = maxIter

	# add param name and type
	def addParam(self, param):
		self.parameters.append(param)

	# add param data
	def addParamVaues(self, paramName, paramData):
		self.paramData[paramName] = paramData

	# prepare
	def prepare(self):
		self.numParams = len(self.parameters)
		for i in range(self.numParams):
			self.paramIndexes.append(0)

			#number of values for each parameter
			paramName = self.parameters[i][0]
			self.numParamValues.append(len(self.paramData[paramName]))
		self.curParamIndex = 0

		paramValueCombList = []
		paramValueComb = []
		paramValueCombList.append(paramValueComb)

		# all params
		for i in range(self.numParams):
			paramValueCombListTemp = []
			for paramValueComb in paramValueCombList:
				# all param values
				for j in range(self.numParamValues[i]):
					paramValueCombTemp = paramValueComb[:]
					paramValueCombTemp.append(j)
					paramValueCombListTemp.append(paramValueCombTemp)
			paramValueCombList = paramValueCombListTemp
		self.paramValueCombList = paramValueCombList
		self.numParamValueComb = len(self.paramValueCombList)
		self.curParamValueCombIndx = 0

	# next param combination
	def nextParamValues(self):
		retParamNameValue = None
		if self.curParamValueCombIndx < len(self.paramValueCombList):
			retParamNameValue = []
			curParams = self.paramValueCombList[self.curParamValueCombIndx]
			print (curParams)
			for i in range(len(curParams)):
				paramName = self.parameters[i][0]
				paramValue = self.paramData[paramName][curParams[i]]
				retParamNameValue.append((paramName, paramValue))
			self.curParamValueCombIndx = self.curParamValueCombIndx + 1
			self.currentParams = retParamNameValue
		return retParamNameValue

	# set cost of current parameter set
	def setCost(self, cost):
		if self.bestSolution is not None:
			if cost < self.bestSolution[1]:
				self.bestSolution = (self.currentParams, cost)
		else:
			self.bestSolution = (self.currentParams, cost)

	# get best solution
	def getBestSolution(self):
		return self.bestSolution

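#A small hedged driver for the grid enumeration above (parameter names, values
#and the cost function are made up; the nextParamValues/setCost loop protocol
#is as defined in the class):
#
#	gs = GuidedParameterSearch()
#	gs.addParam(("maxDepth", "int"))
#	gs.addParamVaues("maxDepth", [3, 5, 7])
#	gs.addParam(("minSamplesLeaf", "int"))
#	gs.addParamVaues("minSamplesLeaf", [2, 4])
#	gs.prepare()	#builds the 3 x 2 cartesian product of value indexes
#	pv = gs.nextParamValues()
#	while pv is not None:
#		cost = trainAndScore(pv)	#hypothetical model training returning error
#		gs.setCost(cost)
#		pv = gs.nextParamValues()
#	print(gs.getBestSolution())
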
#random search through provided list of parameter values
class RandomParameterSearch(BaseParameterSearch):
	def __init__(self, verbose=False):
		super(RandomParameterSearch, self).__init__(verbose)


	# prepare
	def prepare(self):
		pass

	# next param combination
	def nextParamValues(self):
		retParamNameValue = None
		if (self.curIter < self.maxIter):
			retParamNameValue = []
			for pName, pValues in self.paramData.items():
				pValue = selectRandomFromList(pValues)
				retParamNameValue.append((pName, pValue))
			self.curIter = self.curIter + 1
			self.currentParams = retParamNameValue
		return retParamNameValue

	# set cost of current parameter set
	def setCost(self, cost):
		if self.bestSolution is not None:
			if cost < self.bestSolution[1]:
				self.bestSolution = (self.currentParams, cost)
		else:
			self.bestSolution = (self.currentParams, cost)

#simulated annealing search through provided list of parameter values
class SimulatedAnnealingParameterSearch(BaseParameterSearch):
	def __init__(self, verbose=False):
		self.curSolution = None
		self.nextSolution = None
		super(SimulatedAnnealingParameterSearch, self).__init__(verbose)

	# prepare
	def prepare(self):
		pass

	def setTemp(self, temp):
		self.temp = temp

	def setTempReductionRate(self, tempRedRate):
		self.tempRedRate = tempRedRate

	# next param combination
	def nextParamValues(self):
		retParamNameValue = None
		if (self.curIter == 0):
			#initial random solution
			retParamNameValue = []
			for pName, pValues in self.paramData.items():
				pValue = selectRandomFromList(pValues)
				retParamNameValue.append((pName, pValue))
			self.curIter = self.curIter + 1
			self.currentParams = retParamNameValue
		elif (self.curIter < self.maxIter):
			#perturb current solution
			retParamNameValue = []

			#randomly mutate one parameter value
			(pNameSel, pValue) = selectRandomFromList(self.currentParams)
			pValueNext = selectRandomFromList(self.paramData[pNameSel])
			while (pValueNext == pValue):
				pValueNext = selectRandomFromList(self.paramData[pNameSel])

			#copy
			for (pName, pValue) in self.currentParams:
				if (pName == pNameSel):
					pValueNew = pValueNext
				else:
					pValueNew = pValue
				retParamNameValue.append((pName, pValueNew))
			self.curIter = self.curIter + 1
			self.currentParams = retParamNameValue
		return retParamNameValue

	# set cost of current parameter set
	def setCost(self, cost):
		if self.curSolution is None:
			self.curSolution = (self.currentParams, cost)
			self.bestSolution = (self.currentParams, cost)
		else:
			self.nextSolution = (self.currentParams, cost)
			if (self.nextSolution[1] < self.curSolution[1]):
				if (self.verbose):
					print ("next soln better")
				self.curSolution = self.nextSolution
				if (self.nextSolution[1] < self.bestSolution[1]):
					if (self.verbose):
						print ("next soln better than best")
					self.bestSolution = self.nextSolution
			else:
				if (self.verbose):
					print ("next soln worse")
				pr = math.exp((self.curSolution[1] - self.nextSolution[1]) / self.temp)
				if (pr > random.random()):
					self.curSolution = self.nextSolution
					if (self.verbose):
						print ("next soln worse but accepted")
				else:
					if (self.verbose):
						print ("next soln worse and rejected")

		self.temp = self.temp * self.tempRedRate

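The Metropolis style acceptance rule used in setCost above, isolated as a tiny hedged sketch (the numbers are made up): a worse solution is accepted with probability exp(-deltaCost/temp), which shrinks as the temperature decays.

#standalone illustration, not part of the original file
import math, random

curCost, nextCost, temp = 0.20, 0.26, 0.5
#worse by 0.06: acceptance probability exp(-0.06/0.5), about 0.887
pr = math.exp((curCost - nextCost) / temp)
accept = pr > random.random()
print("acceptance prob {:.3f} accepted {}".format(pr, accept))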
supv/regress.py ADDED
@@ -0,0 +1,253 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import matplotlib
import random
import jprops
from io import StringIO
from sklearn.model_selection import cross_val_score
import joblib
from random import randint
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *

class BaseRegressor(object):
	"""
	base regression class
	"""

	def __init__(self, configFile, defValues):
		"""
		initializer
		"""
		defValues["common.mode"] = ("train", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.scale.file.path"] = (None, "missing scale file path")
		defValues["common.preprocessing"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.out.field"] = (None, "missing out field ordinal")

		self.config = Configuration(configFile, defValues)
		self.featData = None
		self.outData = None
		self.regressor = None
		self.verbose = self.config.getBooleanConfig("common.verbose")[0]
		self.mode = self.config.getStringConfig("common.mode")[0]
		logFilePath = self.config.getStringConfig("common.logging.file")[0]
		logLevName = self.config.getStringConfig("common.logging.level")[0]
		self.logger = createLogger(__name__, logFilePath, logLevName)
		self.logger.info("********* starting session")

	def initConfig(self, configFile, defValues):
		"""
		initialize config
		"""
		self.config = Configuration(configFile, defValues)

	def getConfig(self):
		"""
		get config object
		"""
		return self.config

	def setConfigParam(self, name, value):
		"""
		set config param
		"""
		self.config.setParam(name, value)

	def getMode(self):
		"""
		get mode
		"""
		return self.mode

	def train(self):
		"""
		train model
		"""
		#build model
		self.buildModel()

		# training data
		if self.featData is None:
			(featData, outData) = self.prepData("train")
			(self.featData, self.outData) = (featData, outData)
		else:
			(featData, outData) = (self.featData, self.outData)

		# parameters
		modelSave = self.config.getBooleanConfig("train.model.save")[0]

		#train
		self.logger.info("...training model")
		self.regressor.fit(featData, outData)
		rsqScore = self.regressor.score(featData, outData)
		coef = self.regressor.coef_
		intc = self.regressor.intercept_
		result = (rsqScore, intc, coef)

		if modelSave:
			self.logger.info("...saving model")
			modelFilePath = self.getModelFilePath()
			joblib.dump(self.regressor, modelFilePath)
		return result

	def validate(self):
		"""
		validate using model
		"""
		# create model
		self.prepModel()

		# prepare test data
		(featData, outDataActual) = self.prepData("validate")

		#predict
		self.logger.info("...predicting")
		outDataPred = self.regressor.predict(featData)

		#error
		rsqScore = self.regressor.score(featData, outDataActual)
		result = (outDataPred, rsqScore)
		return result

	def predict(self):
		"""
		predict using trained model
		"""
		# create model
		self.prepModel()

		# prepare test data
		featData = self.prepData("predict")[0]

		#predict
		self.logger.info("...predicting")
		outData = self.regressor.predict(featData)
		return outData

	def prepData(self, mode):
		"""
		loads and prepares data for training and validation
		"""
		# parameters
		key = mode + ".data.file"
		dataFile = self.config.getStringConfig(key)[0]

		key = mode + ".data.fields"
		fieldIndices = self.config.getStringConfig(key)[0]
		if not fieldIndices is None:
			fieldIndices = strToIntArray(fieldIndices, ",")


		key = mode + ".data.feature.fields"
		featFieldIndices = self.config.getStringConfig(key)[0]
		if not featFieldIndices is None:
			featFieldIndices = strToIntArray(featFieldIndices, ",")

		if not mode == "predict":
			key = mode + ".data.out.field"
			outFieldIndex = self.config.getIntConfig(key)[0]

		#load data
		(data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
		if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
			featData = sk.preprocessing.scale(featData)
		outData = None
		if not mode == "predict":
			outData = extrColumns(data, outFieldIndex)
		return (featData, outData)

	def prepModel(self):
		"""
		load saved model or train model
		"""
		useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
		if (useSavedModel and not self.regressor):
			# load saved model
			self.logger.info("...loading saved model")
			modelFilePath = self.getModelFilePath()
			self.regressor = joblib.load(modelFilePath)
		else:
			# train model
			self.train()

class LinearRegressor(BaseRegressor):
	"""
	linear regression
	"""
	def __init__(self, configFile):
		defValues = {}
		defValues["train.normalize"] = (False, None)

		super(LinearRegressor, self).__init__(configFile, defValues)

	def buildModel(self):
		"""
		builds model object
		"""
		self.logger.info("...building linear regression model")
		normalize = self.config.getBooleanConfig("train.normalize")[0]
		self.regressor = LinearRegression(normalize=normalize)

class ElasticNetRegressor(BaseRegressor):
	"""
	elastic net regression
	"""
	def __init__(self, configFile):
		defValues = {}
		defValues["train.alpha"] = (1.0, None)
		defValues["train.loneratio"] = (0.5, None)
		defValues["train.normalize"] = (False, None)
		defValues["train.precompute"] = (False, None)
		defValues["train.max.iter"] = (1000, None)
		defValues["train.tol"] = (0.0001, None)
		defValues["train.random.state"] = (None, None)
		defValues["train.selection"] = ("cyclic", None)

		super(ElasticNetRegressor, self).__init__(configFile, defValues)

	def buildModel(self):
		"""
		builds model object
		"""
		self.logger.info("...building elastic net regression model")
		alpha = self.config.getFloatConfig("train.alpha")[0]
		loneratio = self.config.getFloatConfig("train.loneratio")[0]
		normalize = self.config.getBooleanConfig("train.normalize")[0]
		precompute = self.config.getBooleanConfig("train.precompute")[0]
		maxIter = self.config.getIntConfig("train.max.iter")[0]
		tol = self.config.getFloatConfig("train.tol")[0]
		randState = self.config.getIntConfig("train.random.state")[0]
		selection = self.config.getStringConfig("train.selection")[0]

		self.regressor = ElasticNet(alpha=alpha, l1_ratio=loneratio, normalize=normalize, precompute=precompute,
			max_iter=maxIter, tol=tol, random_state=randState, selection=selection)

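A hedged driver sketch for the regressors above. The properties file name is hypothetical, and getModelFilePath is assumed to be supplied by shared base or utility code, since it is called here but not defined in this file.

#illustrative driver, not part of the original file
regr = LinearRegressor("lreg.properties")
rsq, intercept, coefs = regr.train()
preds, vrsq = regr.validate()
print("train R2 {:.3f} validation R2 {:.3f}".format(rsq, vrsq))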
supv/rf.py ADDED
@@ -0,0 +1,134 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import matplotlib
import random
import jprops
from sklearn.ensemble import RandomForestClassifier
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *
from bacl import *


# random forest classification
class RandomForest(BaseClassifier):
	def __init__(self, configFile):
		defValues = {}
		defValues["common.mode"] = ("training", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.preprocessing"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.class.field"] = (None, "missing class field ordinal")
		defValues["train.validation"] = ("kfold", None)
		defValues["train.num.folds"] = (5, None)
		defValues["train.num.trees"] = (100, None)
		defValues["train.split.criterion"] = ("gini", None)
		defValues["train.max.depth"] = (None, None)
		defValues["train.min.samples.split"] = (4, None)
		defValues["train.min.samples.leaf"] = (2, None)
		defValues["train.min.weight.fraction.leaf"] = (0, None)
		defValues["train.max.features"] = ("auto", None)
		defValues["train.max.leaf.nodes"] = (None, None)
		defValues["train.min.impurity.decrease"] = (0, None)
		defValues["train.min.impurity.split"] = (1.0e-07, None)
		defValues["train.bootstrap"] = (True, None)
		defValues["train.oob.score"] = (False, None)
		defValues["train.num.jobs"] = (1, None)
		defValues["train.random.state"] = (None, None)
		defValues["train.verbose"] = (0, None)
		defValues["train.warm.start"] = (False, None)
		defValues["train.success.criterion"] = ("error", None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["train.search.param.strategy"] = (None, None)
		defValues["train.search.params"] = (None, None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.data.fields"] = (None, "missing data field ordinals")
		defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
		defValues["predict.use.saved.model"] = (False, None)
		defValues["validate.data.file"] = (None, "missing validation data file")
		defValues["validate.data.fields"] = (None, "missing validation data field ordinals")
		defValues["validate.data.feature.fields"] = (None, "missing validation data feature field ordinals")
		defValues["validate.data.class.field"] = (None, "missing class field ordinal")
		defValues["validate.use.saved.model"] = (False, None)
		defValues["validate.score.method"] = ("accuracy", None)

		super(RandomForest, self).__init__(configFile, defValues, __name__)

	# builds model object
	def buildModel(self):
		self.logger.info("...building random forest model")
		numTrees = self.config.getIntConfig("train.num.trees")[0]
		splitCriterion = self.config.getStringConfig("train.split.criterion")[0]
		maxDepth = self.config.getStringConfig("train.max.depth")[0]
		maxDepth = typedValue(maxDepth)
		minSamplesSplit = self.config.getStringConfig("train.min.samples.split")[0]
		minSamplesSplit = typedValue(minSamplesSplit)
		minSamplesLeaf = self.config.getStringConfig("train.min.samples.leaf")[0]
		minSamplesLeaf = typedValue(minSamplesLeaf)
		minWeightFractionLeaf = self.config.getFloatConfig("train.min.weight.fraction.leaf")[0]
		maxFeatures = self.config.getStringConfig("train.max.features")[0]
		maxFeatures = typedValue(maxFeatures)
		maxLeafNodes = self.config.getIntConfig("train.max.leaf.nodes")[0]
		minImpurityDecrease = self.config.getFloatConfig("train.min.impurity.decrease")[0]
		bootstrap = self.config.getBooleanConfig("train.bootstrap")[0]
		oobScore = self.config.getBooleanConfig("train.oob.score")[0]
		numJobs = self.config.getIntConfig("train.num.jobs")[0]
		randomState = self.config.getIntConfig("train.random.state")[0]
		verbose = self.config.getIntConfig("train.verbose")[0]
		warmStart = self.config.getBooleanConfig("train.warm.start")[0]

		model = RandomForestClassifier(n_estimators=numTrees, criterion=splitCriterion, max_depth=maxDepth, \
			min_samples_split=minSamplesSplit, min_samples_leaf=minSamplesLeaf, min_weight_fraction_leaf=minWeightFractionLeaf, \
			max_features=maxFeatures, max_leaf_nodes=maxLeafNodes, min_impurity_decrease=minImpurityDecrease, \
			min_impurity_split=None, bootstrap=bootstrap, oob_score=oobScore, n_jobs=numJobs, random_state=randomState, \
			verbose=verbose, warm_start=warmStart, class_weight=None)
		self.classifier = model
		return self.classifier

	#predict probability with in memory data
	def predictProb(self, recs):
		# create model
		self.prepModel()

		#input record
		if type(recs) is str:
			featData = self.prepStringPredictData(recs)
		else:
			featData = recs
		if (featData.ndim == 1):
			featData = featData.reshape(1, -1)

		#predict
		self.logger.info("...predicting class probability")
		clsData = self.classifier.predict_proba(featData)
		return clsData

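A hedged usage sketch. The properties file is hypothetical, and prepModel / prepStringPredictData are assumed to come from BaseClassifier in bacl.py, since they are called here but defined there.

#illustrative driver, not part of the original file
rf = RandomForest("rf.properties")
#single comma separated feature record; returns probability per class
probs = rf.predictProb("5.1,3.5,1.4,0.2")
print(probs)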
supv/svm.py ADDED
@@ -0,0 +1,141 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import sklearn.linear_model
import sklearn.svm
import matplotlib
import random
import jprops
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *
from bacl import *

# support vector machine classification
class SupportVectorMachine(BaseClassifier):

	def __init__(self, configFile):
		defValues = {}
		defValues["common.mode"] = ("train", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.scale.file.path"] = (None, "missing scale file path")
		defValues["common.preprocessing"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.class.field"] = (None, "missing class field ordinal")
		defValues["train.validation"] = ("kfold", None)
		defValues["train.num.folds"] = (5, None)
		defValues["train.algorithm"] = ("svc", None)
		defValues["train.kernel.function"] = ("rbf", None)
		defValues["train.poly.degree"] = (3, None)
		defValues["train.penalty"] = (1.0, None)
		defValues["train.gamma"] = ("scale", None)
		defValues["train.penalty.norm"] = ("l2", None)
		defValues["train.loss"] = ("squared_hinge", None)
		defValues["train.dual"] = (True, None)
		defValues["train.shrinking"] = (True, None)
		defValues["train.nu"] = (0.5, None)
		defValues["train.predict.probability"] = (False, None)
		defValues["train.print.sup.vectors"] = (False, None)
		defValues["train.success.criterion"] = ("error", None)
		defValues["train.model.save"] = (False, None)
		defValues["train.score.method"] = ("accuracy", None)
		defValues["train.search.param.strategy"] = (None, None)
		defValues["train.search.params"] = (None, None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.data.fields"] = (None, "missing data field ordinals")
		defValues["predict.data.feature.fields"] = (None, "missing data feature field ordinals")
		defValues["predict.use.saved.model"] = (False, None)
		defValues["validate.data.file"] = (None, "missing validation data file")
		defValues["validate.data.fields"] = (None, "missing validation data field ordinals")
		defValues["validate.data.feature.fields"] = (None, "missing validation data feature field ordinals")
		defValues["validate.data.class.field"] = (None, "missing class field ordinal")
		defValues["validate.use.saved.model"] = (False, None)
		defValues["validate.score.method"] = ("accuracy", None)

		super(SupportVectorMachine, self).__init__(configFile, defValues, __name__)

	# builds model object
	def buildModel(self):
		self.logger.info("...building svm model")
		algo = self.config.getStringConfig("train.algorithm")[0]
		kernelFun = self.config.getStringConfig("train.kernel.function")[0]
		penalty = self.config.getFloatConfig("train.penalty")[0]
		polyDegree = self.config.getIntConfig("train.poly.degree")[0]
		kernelCoeff = self.config.getStringConfig("train.gamma")[0]
		kernelCoeff = typedValue(kernelCoeff)
		penaltyNorm = self.config.getStringConfig("train.penalty.norm")[0]
		trainLoss = self.config.getStringConfig("train.loss")[0]
		dualOpt = self.config.getBooleanConfig("train.dual")[0]
		shrinkHeuristic = self.config.getBooleanConfig("train.shrinking")[0]
		predictProb = self.config.getBooleanConfig("train.predict.probability")[0]
		supVecBound = self.config.getFloatConfig("train.nu")[0]

		if (algo == "svc"):
			if kernelFun == "poly":
				model = sk.svm.SVC(C=penalty, kernel=kernelFun, degree=polyDegree, gamma=kernelCoeff, shrinking=shrinkHeuristic, \
					probability=predictProb)
			elif kernelFun == "rbf" or kernelFun == "sigmoid":
				model = sk.svm.SVC(C=penalty, kernel=kernelFun, gamma=kernelCoeff, shrinking=shrinkHeuristic, probability=predictProb)
			else:
				model = sk.svm.SVC(C=penalty, kernel=kernelFun, shrinking=shrinkHeuristic, probability=predictProb)
		elif (algo == "nusvc"):
			if kernelFun == "poly":
				model = sk.svm.NuSVC(nu=supVecBound, kernel=kernelFun, degree=polyDegree, gamma=kernelCoeff, shrinking=shrinkHeuristic, \
					probability=predictProb)
			elif kernelFun == "rbf" or kernelFun == "sigmoid":
				model = sk.svm.NuSVC(nu=supVecBound, kernel=kernelFun, gamma=kernelCoeff, shrinking=shrinkHeuristic, probability=predictProb)
			else:
				model = sk.svm.NuSVC(nu=supVecBound, kernel=kernelFun, shrinking=shrinkHeuristic, probability=predictProb)
		elif (algo == "linearsvc"):
			model = sk.svm.LinearSVC(penalty=penaltyNorm, loss=trainLoss, dual=dualOpt)
		else:
			self.logger.info("invalid svm algorithm")
			sys.exit()
		self.classifier = model
		return self.classifier

	#predict probability with in memory data
	def predictProb(self, recs):
		# create model
		self.prepModel()

		#input record
		if type(recs) is str:
			featData = self.prepStringPredictData(recs)
		else:
			featData = recs
		if (featData.ndim == 1):
			featData = featData.reshape(1, -1)

		#predict
		self.logger.info("...predicting class probability")
		clsData = self.classifier.predict_proba(featData)
		return clsData


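A hedged usage sketch. The properties file is hypothetical; note that predict_proba is only available for the svc and nusvc algorithms, and only when the config sets train.predict.probability to True so the model is fit with probability estimates.

#illustrative driver, not part of the original file
svm = SupportVectorMachine("svm.properties")
svm.buildModel()
probs = svm.predictProb("5.1,3.5,1.4,0.2")	#per class probabilities
print(probs)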
supv/svml.py ADDED
@@ -0,0 +1,428 @@
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import sklearn.svm
import sklearn.preprocessing
import matplotlib
import random
import jprops
import joblib
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from random import randint

if len(sys.argv) < 2:
	print("usage: ./svml.py <config_properties_file>")
	sys.exit()

#train by bagging
def train_bagging():
	model = build_model()
	bagging_model = BaggingClassifier(base_estimator=model, n_estimators=bagging_num_estimator,
		max_samples=bagging_sample_fraction, oob_score=bagging_use_oob)

	#train model
	bagging_model.fit(XC, yc)

	#persist models
	if persist_model:
		models = bagging_model.estimators_
		for m in zip(range(0, len(models)), models):
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(m[0] + 1) + ".mod"
			joblib.dump(m[1], model_file)

	score = bagging_model.score(XC, yc)
	print("average error %.3f" % (1.0 - score))

#linear k fold validation
def train_kfold_validation(nfold):
	if native_kfold_validation:
		print("native linear kfold validation")
		model = build_model()
		scores = cross_val_score(model, XC, yc, cv=nfold)
		av_score = np.mean(scores)
		print("average error %.3f" % (1.0 - av_score))
	else:
		print("extended linear kfold validation")
		train_kfold_validation_ext(nfold)

#linear k fold validation
def train_kfold_validation_ext(nfold):
	model = build_model()
	#scores = cross_val_score(model, XC, yc, cv=nfold)
	#print(scores)

	offset = 0
	length = dsize // nfold
	errors = []
	fp_errors = []
	fn_errors = []
	for i in range(0, nfold):
		print("....Next fold %d" % (i))

		#split data
		(XV, yv, X, y) = split_data(offset, length)
		dvsize = len(XV)

		#train model
		model.fit(X, y)

		#persist model
		if persist_model:
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
			joblib.dump(model, model_file)

		#print support vectors
		print_support_vectors(model)

		#predict
		print("making predictions...")
		yp = model.predict(XV)

		#show prediction output
		(er, fp_er, fn_er) = validate(dvsize, yv, yp)
		errors.append(er)
		fp_errors.append(fp_er)
		fn_errors.append(fn_er)

		offset += length

	#average error
	av_error = np.mean(errors)
	av_fp_error = np.mean(fp_errors)
	av_fn_error = np.mean(fn_errors)
	print("average error %.3f false positive error %.3f false negative error %.3f" % (av_error, av_fp_error, av_fn_error))

# random k fold validation
def train_rfold_validation(nfold, niter):
	if native_rfold_validation:
		print("native random kfold validation")
		test_fraction = 1.0 / nfold
		scores = []
		for i in range(0, niter):
			state = randint(1, 100)
			X, XV, y, yv = train_test_split(XC, yc, test_size=test_fraction, random_state=state)
			model = build_model()
			model.fit(X, y)
			scores.append(model.score(XV, yv))

		print(scores)
		av_score = np.mean(scores)
		print("average error %.3f" % (1.0 - av_score))

	else:
		print("extended random kfold validation")
		train_rfold_validation_ext(nfold, niter)

# random k fold validation
def train_rfold_validation_ext(nfold, niter):
	max_offset_frac = 1.0 - 1.0 / nfold
	max_offset_frac -= .01
	length = dsize // nfold

	errors = []
	fp_errors = []
	fn_errors = []
	for i in range(0, niter):
		print("...Next iteration %d" % (i))
		offset = int(dsize * random.random() * max_offset_frac)
		print("offset: %d length: %d" % (offset, length))
		(XV, yv, X, y) = split_data(offset, length)
		dvsize = len(XV)

		#build model
		model = build_model()

		#train model
		model.fit(X, y)

		#persist model
		if persist_model:
			model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
			print("saving model file " + model_file)
			joblib.dump(model, model_file)

		#print support vectors
		print_support_vectors(model)

		#predict
		print("making predictions...")
		yp = model.predict(XV)

		#show prediction output
		(er, fp_er, fn_er) = validate(dvsize, yv, yp)
		errors.append(er)
		fp_errors.append(fp_er)
		fn_errors.append(fn_er)

	av_error = np.mean(errors)
	av_fp_error = np.mean(fp_errors)
	av_fn_error = np.mean(fn_errors)
	print("average error %.3f false positive error %.3f false negative error %.3f" % (av_error, av_fp_error, av_fn_error))

# make predictions with all persisted models and take a majority vote
def predict():
	psize = len(X)
	class_counts = []

	#all models
	for i in range(0, num_models):
		model_file = model_file_directory + "/" + model_file_prefix + "_" + str(i + 1) + ".mod"
		print("loading model file " + model_file)
		model = joblib.load(model_file)

		yp = model.predict(X)
		if i == 0:
			#initialize class counts
			for y in yp:
				class_count = {}
				if y == 0:
					class_count[0] = 1
					class_count[1] = 0
				else:
					class_count[1] = 1
					class_count[0] = 0
				class_counts.append(class_count)

		else:
			#increment class count
			for j in range(0, psize):
				class_count = class_counts[j]
				y = yp[j]
				class_count[y] += 1

	# predict based on majority vote
	print("here are the predictions")
	for k in range(0, psize):
		class_count = class_counts[k]
		if class_count[0] > class_count[1]:
			y = 0
			majority = class_count[0]
		else:
			y = 1
			majority = class_count[1]

		print(X[k])
		print("prediction %d majority count %d" % (y, majority))

#builds model
def build_model():
	#build model
	print("building model...")
	if algo == "svc":
		if kernel_fun == "poly":
			model = sk.svm.SVC(C=penalty, kernel=kernel_fun, degree=poly_degree, gamma=kernel_coeff)
		elif kernel_fun == "rbf" or kernel_fun == "sigmoid":
			model = sk.svm.SVC(C=penalty, kernel=kernel_fun, gamma=kernel_coeff)
		else:
			model = sk.svm.SVC(C=penalty, kernel=kernel_fun)
	elif algo == "nusvc":
		if kernel_fun == "poly":
			model = sk.svm.NuSVC(kernel=kernel_fun, degree=poly_degree, gamma=kernel_coeff)
		elif kernel_fun == "rbf" or kernel_fun == "sigmoid":
			model = sk.svm.NuSVC(kernel=kernel_fun, gamma=kernel_coeff)
		else:
			model = sk.svm.NuSVC(kernel=kernel_fun)
	elif algo == "linearsvc":
		model = sk.svm.LinearSVC()
	else:
		print("invalid svm algorithm")
		sys.exit()
	return model

#splits data into training and validation sets
def split_data(offset, length):
	print("splitting data...")
	#copy data
	XC_c = np.copy(XC)
	yc_c = list(yc)

	# validation set
	vlo = offset
	vup = vlo + length
	if vup > len(yc):
		vup = len(yc)
	XV = XC_c[vlo:vup:1]
	yv = yc_c[vlo:vup:1]
	dvsize = len(XV)
	print("data size %d validation data size %d" % (dsize, dvsize))
	#print("validation set")
	#print(XV)
	#print(yv)

	#training set
	X = np.delete(XC_c, np.s_[vlo:vup:1], 0)
	y = np.delete(yc_c, np.s_[vlo:vup:1], 0)
	#print("training set")
	#print(X)
	#print(y)
	return (XV, yv, X, y)

#print support vectors
def print_support_vectors(model):
	if algo != "linearsvc":
		if print_sup_vectors:
			print("showing support vectors...")
			print(model.support_vectors_)
			print("num of support vectors")
			print(model.n_support_)

#prints prediction output
def validate(dvsize, yv, yp):
	print("showing predictions...")
	err_count = 0
	tp = 0
	tn = 0
	fp = 0
	fn = 0
	for r in range(0, dvsize):
		#print("actual: %d predicted: %d" % (yv[r], yp[r]))
		if yv[r] != yp[r]:
			err_count += 1

		if yp[r] == 1 and yv[r] == 1:
			tp += 1
		elif yp[r] == 1 and yv[r] == 0:
			fp += 1
		elif yp[r] == 0 and yv[r] == 0:
			tn += 1
		else:
			fn += 1

	er = float(err_count) / dvsize
	fp_er = float(fp) / dvsize
	fn_er = float(fn) / dvsize
	print("error %.3f" % (er))
	print("true positive : %.3f" % (float(tp) / dvsize))
	print("false positive: %.3f" % (fp_er))
	print("true negative : %.3f" % (float(tn) / dvsize))
	print("false negative: %.3f" % (fn_er))

	return (er, fp_er, fn_er)

# load configuration
def getConfigs(configFile):
	configs = {}
	print("using following configurations")
	with open(configFile) as fp:
		for key, value in jprops.iter_properties(fp):
			print(key, value)
			configs[key] = value

	return configs


# load configuration
configs = getConfigs(sys.argv[1])
mode = configs["common.mode"]

if mode == "train":
	#train
	print("running in train mode")
	data_file = configs["train.data.file"]
	feat_field_indices = configs["train.data.feature.fields"].split(",")
	feat_field_indices = [int(a) for a in feat_field_indices]
	class_field_index = int(configs["train.data.class.field"])
	preprocess = configs["common.preprocessing"]
	validation = configs["train.validation"]
	num_folds = int(configs["train.num.folds"])
	num_iter = int(configs["train.num.iter"])
	algo = configs["train.algorithm"]
	kernel_fun = configs["train.kernel.function"]
	poly_degree = int(configs["train.poly.degree"])
	penalty = float(configs["train.penalty"])
	if penalty < 0:
		penalty = 1.0
		print("using default for penalty")
	kernel_coeff = float(configs["train.gamma"])
	if kernel_coeff < 0:
		kernel_coeff = 'auto'
		print("using default for gamma")
	print_sup_vectors = configs["train.print.sup.vectors"].lower() == "true"
	persist_model = configs["train.persist.model"].lower() == "true"
	model_file_directory = configs["common.model.directory"]
	model_file_prefix = configs["common.model.file.prefix"]

	print(feat_field_indices)

	#extract feature fields
	d = np.loadtxt(data_file, delimiter=',')
	dsize = len(d)
	XC = d[:,feat_field_indices]

	#preprocess features
	if preprocess == "scale":
		XC = sk.preprocessing.scale(XC)
	elif preprocess == "normalize":
		XC = sk.preprocessing.normalize(XC, norm='l2')
	else:
		print("no preprocessing done")

	#extract output field
	yc = d[:,[class_field_index]]
	yc = yc.reshape(dsize)
	yc = [int(a) for a in yc]

	#print(XC)
	#print(yc)


	# train model
	if validation == "kfold":
		native_kfold_validation = configs["train.native.kfold.validation"].lower() == "true"
		train_kfold_validation(num_folds)
	elif validation == "rfold":
		native_rfold_validation = configs["train.native.rfold.validation"].lower() == "true"
		train_rfold_validation(num_folds, num_iter)
	elif validation == "bagging":
		bagging_num_estimator = int(configs["train.bagging.num.estimators"])
		bagging_sample_fraction = float(configs["train.bagging.sample.fraction"])
		#read the OOB flag from its own property instead of the sample fraction (bug fix; property name assumed)
		bagging_use_oob = configs["train.bagging.use.oob"].lower() == "true"
		train_bagging()
	else:
		print("invalid training validation method")
		sys.exit()

else:
	#predict
	print("running in prediction mode")
	pred_data_file = configs["pred.data.file"]
	pred_feat_field_indices = configs["pred.data.feature.fields"].split(",")
	pred_feat_field_indices = [int(a) for a in pred_feat_field_indices]
	preprocess = configs["common.preprocessing"]
	num_models = int(configs["pred.num.models"])
	model_file_directory = configs["common.model.directory"]
	model_file_prefix = configs["common.model.file.prefix"]

	#extract feature fields
	pd = np.loadtxt(pred_data_file, delimiter=',')
	pdsize = len(pd)
	X = pd[:,pred_feat_field_indices]

	#preprocess features
	if preprocess == "scale":
		X = sk.preprocessing.scale(X)
	elif preprocess == "normalize":
		X = sk.preprocessing.normalize(X, norm='l2')
	else:
		print("no preprocessing done")

	predict()
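
For reference, the script above is driven by a jprops style properties file. Below is a minimal sketch for native k-fold training; every key is read in the code above, but the values are purely illustrative:

common.mode=train
common.preprocessing=scale
common.model.directory=model
common.model.file.prefix=svm
train.data.file=train.csv
train.data.feature.fields=0,1,2,3
train.data.class.field=4
train.validation=kfold
train.native.kfold.validation=true
train.num.folds=5
train.num.iter=1
train.algorithm=svc
train.kernel.function=rbf
train.poly.degree=3
train.penalty=1.0
train.gamma=-1.0
train.print.sup.vectors=false
train.persist.model=false

With train.penalty or train.gamma negative the script falls back to the library defaults. Setting common.mode to anything other than train switches to prediction mode, which additionally needs the pred.* keys read in the else branch.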
supv/tnn.py ADDED
#!/usr/local/bin/python3

# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.autograd import Variable
from torch.utils.data import Dataset, TensorDataset
from torch.utils.data import DataLoader
import sklearn as sk
from sklearn.neighbors import KDTree
import matplotlib
import random
import jprops
from random import randint
import statistics
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *

"""
forward hook function
"""
intermedOut = {}
lvalues = list()

def hookFn(m, i, o):
	"""
	call back that collects latent values from a layer's output
	"""
	#intermedOut[m] = o
	lv = o.data.cpu().numpy()
	lv = lv[0].tolist()
	lvalues.append(lv)
	#print(lv)

def getLatValues():
	"""
	returns collected latent values
	"""
	return lvalues

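#typical use of the hook machinery (illustrative sketch, not from the source): after a model
#is built, FeedForwardNetwork.addForwardHook(model, l) registers hookFn on the layer with
#index l, and getLatValues() then returns the latent values collected during forward passes
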
class FeedForwardNetwork(torch.nn.Module):
	def __init__(self, configFile, addDefValues=None):
		"""
		In the constructor we load the configuration, supplying defaults for any
		missing parameters.

		Parameters
			configFile : config file path
			addDefValues : dictionary of additional default values
		"""
		defValues = dict() if addDefValues is None else addDefValues.copy()
		defValues["common.mode"] = ("training", None)
		defValues["common.model.directory"] = ("model", None)
		defValues["common.model.file"] = (None, None)
		defValues["common.preprocessing"] = (None, None)
		defValues["common.scaling.method"] = ("zscale", None)
		defValues["common.scaling.minrows"] = (50, None)
		defValues["common.scaling.param.file"] = (None, None)
		defValues["common.verbose"] = (False, None)
		defValues["common.device"] = ("cpu", None)
		defValues["train.data.file"] = (None, "missing training data file")
		defValues["train.data.fields"] = (None, "missing training data field ordinals")
		defValues["train.data.feature.fields"] = (None, "missing training data feature field ordinals")
		defValues["train.data.out.fields"] = (None, "missing training data output field ordinals")
		defValues["train.layer.data"] = (None, "missing layer data")
		defValues["train.input.size"] = (None, None)
		defValues["train.output.size"] = (None, "missing output size")
		defValues["train.batch.size"] = (10, None)
		defValues["train.loss.reduction"] = ("mean", None)
		defValues["train.num.iterations"] = (500, None)
		defValues["train.lossFn"] = ("mse", None)
		defValues["train.optimizer"] = ("sgd", None)
		defValues["train.opt.learning.rate"] = (.0001, None)
		defValues["train.opt.weight.decay"] = (0, None)
		defValues["train.opt.momentum"] = (0, None)
		defValues["train.opt.eps"] = (1e-08, None)
		defValues["train.opt.dampening"] = (0, None)
		defValues["train.opt.momentum.nesterov"] = (False, None)
		defValues["train.opt.betas"] = ([0.9, 0.999], None)
		defValues["train.opt.alpha"] = (0.99, None)
		defValues["train.save.model"] = (False, None)
		defValues["train.track.error"] = (False, None)
		defValues["train.epoch.intv"] = (5, None)
		defValues["train.batch.intv"] = (5, None)
		defValues["train.print.weights"] = (False, None)
		defValues["valid.data.file"] = (None, None)
		defValues["valid.accuracy.metric"] = (None, None)
		defValues["predict.data.file"] = (None, None)
		defValues["predict.use.saved.model"] = (True, None)
		defValues["predict.output"] = ("binary", None)
		defValues["predict.feat.pad.size"] = (60, None)
		defValues["predict.print.output"] = (True, None)
		defValues["calibrate.num.bins"] = (10, None)
		defValues["calibrate.pred.prob.thresh"] = (0.5, None)
		defValues["calibrate.num.nearest.neighbors"] = (10, None)
		self.config = Configuration(configFile, defValues)

		super(FeedForwardNetwork, self).__init__()

	def setConfigParam(self, name, value):
		"""
		set config param

		Parameters
			name : config name
			value : config value
		"""
		self.config.setParam(name, value)

	def getConfig(self):
		"""
		get config object
		"""
		return self.config

	def setVerbose(self, verbose):
		"""
		sets verbose flag
		"""
		self.verbose = verbose

	def buildModel(self):
		"""
		Loads configuration and builds the various pieces necessary for the model
		"""
		torch.manual_seed(9999)

		self.verbose = self.config.getBooleanConfig("common.verbose")[0]
		numinp = self.config.getIntConfig("train.input.size")[0]
		if numinp is None:
			numinp = len(self.config.getIntListConfig("train.data.feature.fields")[0])
		#numOut = len(self.config.getStringConfig("train.data.out.fields")[0].split(","))
		self.outputSize = self.config.getIntConfig("train.output.size")[0]
		self.batchSize = self.config.getIntConfig("train.batch.size")[0]
		#lossRed = self.config.getStringConfig("train.loss.reduction")[0]
		#learnRate = self.config.getFloatConfig("train.opt.learning.rate")[0]
		self.numIter = self.config.getIntConfig("train.num.iterations")[0]
		optimizer = self.config.getStringConfig("train.optimizer")[0]
		self.lossFnStr = self.config.getStringConfig("train.lossFn")[0]
		self.accMetric = self.config.getStringConfig("valid.accuracy.metric")[0]
		self.trackErr = self.config.getBooleanConfig("train.track.error")[0]
		self.batchIntv = self.config.getIntConfig("train.batch.intv")[0]
		self.restored = False
		self.clabels = list(range(self.outputSize)) if self.outputSize > 1 else None

		#build network
		layers = list()
		ninp = numinp
		trData = self.config.getStringConfig("train.layer.data")[0].split(",")
		for ld in trData:
			lde = ld.split(":")
			assert len(lde) == 5, "expecting 5 items for layer data"

			#num of units, activation, whether batch normalize, whether batch normalize after activation, dropout fraction
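			#e.g. train.layer.data=32:relu:true:false:0.2,16:relu:true:false:0.2,2:softmax:false:false:0 (illustrative values)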
			nunit = int(lde[0])
			actStr = lde[1]
			act = FeedForwardNetwork.createActivation(actStr) if actStr != "none" else None
			bnorm = lde[2] == "true"
			afterAct = lde[3] == "true"
			dpr = float(lde[4])

			layers.append(torch.nn.Linear(ninp, nunit))
			if bnorm:
				#with batch norm
				if afterAct:
					safeAppend(layers, act)
					layers.append(torch.nn.BatchNorm1d(nunit))
				else:
					layers.append(torch.nn.BatchNorm1d(nunit))
					safeAppend(layers, act)
			else:
				#without batch norm
				safeAppend(layers, act)

			if dpr > 0:
				layers.append(torch.nn.Dropout(dpr))
			ninp = nunit

		self.layers = torch.nn.Sequential(*layers)

		self.device = FeedForwardNetwork.getDevice(self)

		#training data
		dataFile = self.config.getStringConfig("train.data.file")[0]
		(featData, outData) = FeedForwardNetwork.prepData(self, dataFile)
		self.featData = torch.from_numpy(featData)
		self.outData = torch.from_numpy(outData)

		#validation data
		dataFile = self.config.getStringConfig("valid.data.file")[0]
		(featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataFile)
		self.validFeatData = torch.from_numpy(featDataV)
		self.validOutData = torch.from_numpy(outDataV)

		# loss function and optimizer
		self.lossFn = FeedForwardNetwork.createLossFunction(self, self.lossFnStr)
		self.optimizer = FeedForwardNetwork.createOptimizer(self, optimizer)

		self.yPred = None
		self.restored = False

		#move data and model to device
		self.featData = self.featData.to(self.device)
		self.outData = self.outData.to(self.device)
		self.validFeatData = self.validFeatData.to(self.device)
		self.to(self.device)

	@staticmethod
	def getDevice(model):
		"""
		gets device

		Parameters
			model : torch model
		"""
		devType = model.config.getStringConfig("common.device")[0]
		if devType == "cuda":
			if torch.cuda.is_available():
				device = torch.device("cuda")
			else:
				exitWithMsg("cuda not available")
		else:
			device = torch.device("cpu")
		return device

	def setValidationData(self, dataSource, prep=True):
		"""
		sets validation data

		Parameters
			dataSource : data source str if file path or 2D array
			prep : if True load and prepare
		"""
		if prep:
			(featDataV, outDataV) = FeedForwardNetwork.prepData(self, dataSource)
			self.validFeatData = torch.from_numpy(featDataV)
			self.validOutData = outDataV
		else:
			self.validFeatData = torch.from_numpy(dataSource[0])
			self.validOutData = dataSource[1]

		self.validFeatData = self.validFeatData.to(self.device)

	@staticmethod
	def createActivation(actName):
		"""
		create activation

		Parameters
			actName : activation name
		"""
		if actName is None:
			activation = None
		elif actName == "relu":
			activation = torch.nn.ReLU()
		elif actName == "tanh":
			activation = torch.nn.Tanh()
		elif actName == "sigmoid":
			activation = torch.nn.Sigmoid()
		elif actName == "softmax":
			activation = torch.nn.Softmax(dim=1)
		else:
			exitWithMsg("invalid activation function name " + actName)
		return activation

	@staticmethod
	def createLossFunction(model, lossFnName):
		"""
		create loss function

		Parameters
			lossFnName : loss function name
		"""
		config = model.config
		lossRed = config.getStringConfig("train.loss.reduction")[0]
		if lossFnName == "ltwo" or lossFnName == "mse":
			lossFunc = torch.nn.MSELoss(reduction=lossRed)
		elif lossFnName == "ce":
			lossFunc = torch.nn.CrossEntropyLoss(reduction=lossRed)
		elif lossFnName == "lone" or lossFnName == "mae":
			lossFunc = torch.nn.L1Loss(reduction=lossRed)
		elif lossFnName == "bce":
			lossFunc = torch.nn.BCELoss(reduction=lossRed)
		elif lossFnName == "bcel":
			lossFunc = torch.nn.BCEWithLogitsLoss(reduction=lossRed)
		elif lossFnName == "sm":
			lossFunc = torch.nn.SoftMarginLoss(reduction=lossRed)
		elif lossFnName == "mlsm":
			lossFunc = torch.nn.MultiLabelSoftMarginLoss(reduction=lossRed)
		else:
			exitWithMsg("invalid loss function name " + lossFnName)
		return lossFunc

	@staticmethod
	def createOptimizer(model, optName):
		"""
		create optimizer

		Parameters
			optName : optimizer name
		"""
		config = model.config
		learnRate = config.getFloatConfig("train.opt.learning.rate")[0]
		weightDecay = config.getFloatConfig("train.opt.weight.decay")[0]
		momentum = config.getFloatConfig("train.opt.momentum")[0]
		eps = config.getFloatConfig("train.opt.eps")[0]
		if optName == "sgd":
			dampening = config.getFloatConfig("train.opt.dampening")[0]
			momentumNesterov = config.getBooleanConfig("train.opt.momentum.nesterov")[0]
			optimizer = torch.optim.SGD(model.parameters(), lr=learnRate, momentum=momentum,
				dampening=dampening, weight_decay=weightDecay, nesterov=momentumNesterov)
		elif optName == "adam":
			betas = config.getFloatListConfig("train.opt.betas")[0]
			betas = (betas[0], betas[1])
			optimizer = torch.optim.Adam(model.parameters(), lr=learnRate, betas=betas, eps=eps,
				weight_decay=weightDecay)
		elif optName == "rmsprop":
			alpha = config.getFloatConfig("train.opt.alpha")[0]
			optimizer = torch.optim.RMSprop(model.parameters(), lr=learnRate, alpha=alpha,
				eps=eps, weight_decay=weightDecay, momentum=momentum)
		else:
			exitWithMsg("invalid optimizer name " + optName)
		return optimizer


	def forward(self, x):
		"""
		In the forward function we accept a Tensor of input data and we must return
		a Tensor of output data. We can use Modules defined in the constructor as
		well as arbitrary (differentiable) operations on Tensors.

		Parameters
			x : data batch
		"""
		y = self.layers(x)
		return y

	@staticmethod
	def addForwardHook(model, l, cl=0):
		"""
		register forward hooks

		Parameters
			l : layer index at which the hook is to be registered
			cl : current layer index, used for recursion
		"""
		for name, layer in model._modules.items():
			#If it is a sequential, don't register a hook on it
			# but recursively register hook on all its module children
			print(str(cl) + " : " + name)
			if isinstance(layer, torch.nn.Sequential):
				FeedForwardNetwork.addForwardHook(layer, l, cl)
			else:
				# it is not a sequential, register a hook
				if cl == l:
					print("setting hook at layer " + str(l))
					layer.register_forward_hook(hookFn)
				cl += 1

	@staticmethod
	def prepData(model, dataSource, includeOutFld=True):
		"""
		loads and prepares data

		Parameters
			dataSource : data source str if file path or 2D array
			includeOutFld : True if target field to be included
		"""
		# parameters
		fieldIndices = model.config.getIntListConfig("train.data.fields")[0]
		featFieldIndices = model.config.getIntListConfig("train.data.feature.fields")[0]

		#all data and feature data
		isDataFile = isinstance(dataSource, str)
		selFieldIndices = fieldIndices if includeOutFld else fieldIndices[:-1]
		if isDataFile:
			#source file path
			(data, featData) = loadDataFile(dataSource, ",", selFieldIndices, featFieldIndices)
		else:
			# tabular data
			data = tableSelFieldsFilter(dataSource, selFieldIndices)
			featData = tableSelFieldsFilter(data, featFieldIndices)
			#print(featData)
			featData = np.array(featData)

		if model.config.getStringConfig("common.preprocessing")[0] == "scale":
			scalingMethod = model.config.getStringConfig("common.scaling.method")[0]

			#scale only if there are enough rows
			nrow = featData.shape[0]
			minrows = model.config.getIntConfig("common.scaling.minrows")[0]
			if nrow > minrows:
				#in place scaling
				featData = scaleData(featData, scalingMethod)
			else:
				#use pre computed scaling parameters
				spFile = model.config.getStringConfig("common.scaling.param.file")[0]
				if spFile is None:
					exitWithMsg("for small data sets pre computed scaling parameters need to be provided")
				scParams = restoreObject(spFile)
				featData = scaleDataWithParams(featData, scalingMethod, scParams)
				featData = np.array(featData)

		# target data
		if includeOutFld:
			outFieldIndices = model.config.getStringConfig("train.data.out.fields")[0]
			outFieldIndices = strToIntArray(outFieldIndices, ",")
			if isDataFile:
				outData = data[:,outFieldIndices]
			else:
				outData = tableSelFieldsFilter(data, outFieldIndices)
				outData = np.array(outData)
			foData = (featData.astype(np.float32), outData.astype(np.float32))
		else:
			foData = featData.astype(np.float32)
		return foData

	@staticmethod
	def saveCheckpt(model):
		"""
		checkpoints model

		Parameters
			model : torch model
		"""
		print("..saving model checkpoint")
		modelDirectory = model.config.getStringConfig("common.model.directory")[0]
		assert os.path.exists(modelDirectory), "model save directory does not exist"
		modelFile = model.config.getStringConfig("common.model.file")[0]
		filepath = os.path.join(modelDirectory, modelFile)
		state = {"state_dict": model.state_dict(), "optim_dict": model.optimizer.state_dict()}
		torch.save(state, filepath)
		if model.verbose:
			print("model saved")

	@staticmethod
	def restoreCheckpt(model, loadOpt=False):
		"""
		restores checkpointed model

		Parameters
			model : torch model
			loadOpt : True if optimizer to be loaded
		"""
		if not model.restored:
			print("..restoring model checkpoint")
			modelDirectory = model.config.getStringConfig("common.model.directory")[0]
			modelFile = model.config.getStringConfig("common.model.file")[0]
			filepath = os.path.join(modelDirectory, modelFile)
			assert os.path.exists(filepath), "model save file does not exist"
			checkpoint = torch.load(filepath)
			model.load_state_dict(checkpoint["state_dict"])
			model.to(model.device)
			if loadOpt:
				model.optimizer.load_state_dict(checkpoint["optim_dict"])
			model.restored = True

	@staticmethod
	def processClassifOutput(yPred, config):
		"""
		extracts probability of label 1 or the label with highest probability

		Parameters
			yPred : predicted output
			config : config object
		"""
		outType = config.getStringConfig("predict.output")[0]
		if outType == "prob":
			outputSize = config.getIntConfig("train.output.size")[0]
			if outputSize == 2:
				#return prob of pos class for binary classifier
				yPred = yPred[:, 1]
			else:
				#return class value and probability for multi classifier
				yCl = np.argmax(yPred, axis=1)
				yPred = list(map(lambda y: y[0][y[1]], zip(yPred, yCl)))
				yPred = zip(yCl, yPred)
		else:
			yPred = np.argmax(yPred, axis=1)
		return yPred

	@staticmethod
	def printPrediction(yPred, config, dataSource):
		"""
		prints input feature data and prediction

		Parameters
			yPred : predicted output
			config : config object
			dataSource : data source str if file path or 2D array
		"""
		#prDataFilePath = config.getStringConfig("predict.data.file")[0]
		padWidth = config.getIntConfig("predict.feat.pad.size")[0]
		i = 0
		if type(dataSource) == str:
			for rec in fileRecGen(dataSource, ","):
				feat = (",".join(rec)).ljust(padWidth, " ")
				rec = feat + "\t" + str(yPred[i])
				print(rec)
				i += 1
		else:
			for rec in dataSource:
				srec = toStrList(rec, 6)
				feat = (",".join(srec)).ljust(padWidth, " ")
				srec = feat + "\t" + str(yPred[i])
				print(srec)
				i += 1


	@staticmethod
	def allTrain(model):
		"""
		train with all data

		Parameters
			model : torch model
		"""
		# train mode
		model.train()
		for t in range(model.numIter):

			# Forward pass: Compute predicted y by passing x to the model
			yPred = model(model.featData)

			# Compute and print loss
			loss = model.lossFn(yPred, model.outData)
			if model.verbose and t % 50 == 0:
				print("epoch {} loss {:.6f}".format(t, loss.item()))

			# Zero gradients, perform a backward pass, and update the weights.
			model.optimizer.zero_grad()
			loss.backward()
			model.optimizer.step()

		#validate
		model.eval()
		yPred = model(model.validFeatData)
		yPred = yPred.data.cpu().numpy()
		yActual = model.validOutData
		if model.verbose:
			result = np.concatenate((yPred, yActual), axis=1)
			print("predicted actual")
			print(result)

		score = perfMetric(model.accMetric, yActual, yPred)
		print(formatFloat(3, score, "perf score"))
		return score

	@staticmethod
	def batchTrain(model):
		"""
		train with batch data

		Parameters
			model : torch model
		"""
		model.restored = False
		trainData = TensorDataset(model.featData, model.outData)
		trainDataLoader = DataLoader(dataset=trainData, batch_size=model.batchSize, shuffle=True)
		epochIntv = model.config.getIntConfig("train.epoch.intv")[0]

		# train mode
		model.train()

		if model.trackErr:
			trErr = list()
			vaErr = list()
		#epoch
		for t in range(model.numIter):
			#batch
			b = 0
			epochLoss = 0.0
			for xBatch, yBatch in trainDataLoader:

				# Forward pass: Compute predicted y by passing x to the model
				xBatch, yBatch = xBatch.to(model.device), yBatch.to(model.device)
				yPred = model(xBatch)

				# Compute and print loss
				loss = model.lossFn(yPred, yBatch)
				if model.verbose and t % epochIntv == 0 and b % model.batchIntv == 0:
					print("epoch {} batch {} loss {:.6f}".format(t, b, loss.item()))

				if model.trackErr and model.batchIntv == 0:
					epochLoss += loss.item()

				#error tracking at batch level
				if model.trackErr and model.batchIntv > 0 and b % model.batchIntv == 0:
					trErr.append(loss.item())
					vloss = FeedForwardNetwork.evaluateModel(model)
					vaErr.append(vloss)

				# Zero gradients, perform a backward pass, and update the weights.
				model.optimizer.zero_grad()
				loss.backward()
				model.optimizer.step()
				b += 1

			#error tracking at epoch level
			if model.trackErr and model.batchIntv == 0:
				epochLoss /= len(trainDataLoader)
				trErr.append(epochLoss)
				vloss = FeedForwardNetwork.evaluateModel(model)
				vaErr.append(vloss)

		#validate
		model.eval()
		yPred = model(model.validFeatData)
		yPred = yPred.data.cpu().numpy()
		yActual = model.validOutData
		if model.verbose:
			vsize = yPred.shape[0]
			print("\npredicted \t\t actual")
			for i in range(vsize):
				print(str(yPred[i]) + "\t" + str(yActual[i]))

		score = perfMetric(model.accMetric, yActual, yPred)
		print(yActual)
		print(yPred)
		print(formatFloat(3, score, "perf score"))

		#save
		modelSave = model.config.getBooleanConfig("train.save.model")[0]
		if modelSave:
			FeedForwardNetwork.saveCheckpt(model)

		if model.trackErr:
			FeedForwardNetwork.errorPlot(model, trErr, vaErr)

		if model.config.getBooleanConfig("train.print.weights")[0]:
			print("model weights")
			for param in model.parameters():
				print(param.data)
		return score

	@staticmethod
	def errorPlot(model, trErr, vaErr):
		"""
		plot errors

		Parameters
			trErr : training error list
			vaErr : validation error list
		"""
		x = np.arange(len(trErr))
		plt.plot(x, trErr, label="training error")
		plt.plot(x, vaErr, label="validation error")
		plt.xlabel("iteration")
		plt.ylabel("error")
		plt.legend(["training error", "validation error"], loc='upper left')
		plt.show()

	@staticmethod
	def modelPredict(model, dataSource=None):
		"""
		predict

		Parameters
			model : torch model
			dataSource : data source
		"""
		#train or restore model
		useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
		if useSavedModel:
			FeedForwardNetwork.restoreCheckpt(model)
		else:
			FeedForwardNetwork.batchTrain(model)

		#predict
		if dataSource is None:
			dataSource = model.config.getStringConfig("predict.data.file")[0]
		featData = FeedForwardNetwork.prepData(model, dataSource, False)
		#print(featData)
		featData = torch.from_numpy(featData)
		featData = featData.to(model.device)

		model.eval()
		yPred = model(featData)
		yPred = yPred.data.cpu().numpy()
		#print(yPred)

		if model.outputSize >= 2:
			#classification
			yPred = FeedForwardNetwork.processClassifOutput(yPred, model.config)

		# print prediction
		if model.config.getBooleanConfig("predict.print.output")[0]:
			FeedForwardNetwork.printPrediction(yPred, model.config, dataSource)

		return yPred

	def predict(self, dataSource=None):
		"""
		predict

		Parameters
			dataSource : data source
		"""
		return FeedForwardNetwork.modelPredict(self, dataSource)

	@staticmethod
	def evaluateModel(model):
		"""
		evaluate model

		Parameters
			model : torch model
		"""
		model.eval()
		with torch.no_grad():
			yPred = model(model.validFeatData)
			#yPred = yPred.data.cpu().numpy()
			yActual = model.validOutData
			score = model.lossFn(yPred, yActual).item()
		model.train()
		return score

	@staticmethod
	def prepValidate(model, dataSource=None):
		"""
		prepare for validation

		Parameters
			model : torch model
			dataSource : data source
		"""
		#train or restore model
		if not model.restored:
			useSavedModel = model.config.getBooleanConfig("predict.use.saved.model")[0]
			if useSavedModel:
				FeedForwardNetwork.restoreCheckpt(model)
			else:
				FeedForwardNetwork.batchTrain(model)
			model.restored = True

		if dataSource is not None:
			model.setValidationData(dataSource)

	@staticmethod
	def validateModel(model, retPred=False):
		"""
		model validation

		Parameters
			model : torch model
			retPred : if True return prediction
		"""
		model.eval()
		yPred = model(model.validFeatData)
		yPred = yPred.data.cpu().numpy()
		model.yPred = yPred
		yActual = model.validOutData
		vsize = yPred.shape[0]
		if model.verbose:
			print("\npredicted \t actual")
			for i in range(vsize):
				print("{:.3f}\t\t{:.3f}".format(yPred[i][0], yActual[i][0]))

		score = perfMetric(model.accMetric, yActual, yPred)
		print(formatFloat(3, score, "perf score"))

		if retPred:
			y = list(map(lambda i: (yPred[i][0], yActual[i][0]), range(vsize)))
			res = (y, score)
			return res
		else:
			return score
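
A minimal usage sketch for FeedForwardNetwork (the properties file name, data file name and flow are illustrative, assuming a config with the train.*, valid.* and predict.* keys listed above):

model = FeedForwardNetwork("tnn.properties")
model.buildModel()
score = FeedForwardNetwork.batchTrain(model)	#train with mini batches, validate and optionally checkpoint
yPred = model.predict("pred.csv")	#restore or train, then predict per the predict.* configuration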