#!/usr/local/bin/python3
# avenir-python: Machine Learning
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.
# Package imports
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import sklearn.metrics
import matplotlib
import random
import jprops
from io import StringIO
from sklearn.model_selection import cross_val_score
import joblib
from random import randint
sys.path.append(os.path.abspath("../lib"))
from util import *
from mlutil import *
from pasearch import *
#base classifier class
class BaseClassifier(object):
def __init__(self, configFile, defValues, mname):
self.config = Configuration(configFile, defValues)
self.subSampleRate = None
self.featData = None
self.clsData = None
self.classifier = None
self.trained = False
self.verbose = self.config.getBooleanConfig("common.verbose")[0]
logFilePath = self.config.getStringConfig("common.logging.file")[0]
logLevName = self.config.getStringConfig("common.logging.level")[0]
self.logger = createLogger(mname, logFilePath, logLevName)
self.logger.info("********* starting session")
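# Illustrative snippet of the properties file consumed above (the key names
# are taken from this class; the values are hypothetical):
# common.mode=train
# common.verbose=True
# common.logging.file=classifier.log
# common.logging.level=info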
def initConfig(self, configFile, defValues):
"""
initialize config
"""
self.config = Configuration(configFile, defValues)
def getConfig(self):
"""
get config object
"""
return self.config
def setConfigParam(self, name, value):
"""
set config param
"""
self.config.setParam(name, value)
def getMode(self):
"""
get mode
"""
return self.config.getStringConfig("common.mode")[0]
def getSearchParamStrategy(self):
"""
get search parameter strategy
"""
return self.config.getStringConfig("train.search.param.strategy")[0]
def train(self):
"""
train model
"""
#build model
self.buildModel()
# training data
if self.featData is None:
(featData, clsData) = self.prepTrainingData()
(self.featData, self.clsData) = (featData, clsData)
else:
(featData, clsData) = (self.featData, self.clsData)
if self.subSampleRate is not None:
(featData, clsData) = subSample(featData, clsData, self.subSampleRate, False)
self.logger.info("subsample size " + str(featData.shape[0]))
# parameters
modelSave = self.config.getBooleanConfig("train.model.save")[0]
#train
self.logger.info("...training model")
self.classifier.fit(featData, clsData)
score = self.classifier.score(featData, clsData)
successCriterion = self.config.getStringConfig("train.success.criterion")[0]
result = None
if successCriterion == "accuracy":
self.logger.info("accuracy with training data {:06.3f}".format(score))
result = score
elif successCriterion == "error":
error = 1.0 - score
self.logger.info("error with training data {:06.3f}".format(error))
result = error
else:
raise ValueError("invalid success criterion")
if modelSave:
self.logger.info("...saving model")
modelFilePath = self.getModelFilePath()
joblib.dump(self.classifier, modelFilePath)
self.trained = True
return result
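# subSample is imported from mlutil (../lib). A minimal sketch of what such a
# helper might do is shown below for orientation only; the actual mlutil
# implementation may differ:
#
# def subSample(featData, clsData, rate, withReplacement):
#     # draw a random fraction 'rate' of rows, keeping features and labels aligned
#     size = int(featData.shape[0] * rate)
#     indices = np.random.choice(featData.shape[0], size, replace=withReplacement)
#     return (featData[indices], clsData[indices])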
def trainValidate(self):
"""
train with k fold validation
"""
#build model
self.buildModel()
# training data
(featData, clsData) = self.prepTrainingData()
#parameter
validation = self.config.getStringConfig("train.validation")[0]
numFolds = self.config.getIntConfig("train.num.folds")[0]
successCriterion = self.config.getStringConfig("train.success.criterion")[0]
scoreMethod = self.config.getStringConfig("train.score.method")[0]
#train with validation
self.logger.info("...training and kfold cross validating model")
scores = cross_val_score(self.classifier, featData, clsData, cv=numFolds, scoring=scoreMethod)
avScore = np.mean(scores)
result = self.reportResult(avScore, successCriterion, scoreMethod)
return result
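# Illustrative config for k fold validation (values hypothetical; the score
# method must be a valid sklearn scoring name accepted by cross_val_score):
# train.validation=kfold
# train.num.folds=5
# train.score.method=accuracy
# train.success.criterion=error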
def trainValidateSearch(self):
"""
train with k fold validation and search parameter space for optimum
"""
self.logger.info("...starting train validate with parameter search")
searchStrategyName = self.getSearchParamStrategy()
if searchStrategyName is not None:
if searchStrategyName == "grid":
searchStrategy = GuidedParameterSearch(self.verbose)
elif searchStrategyName == "random":
searchStrategy = RandomParameterSearch(self.verbose)
maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
searchStrategy.setMaxIter(maxIter)
elif searchStrategyName == "simuan":
searchStrategy = SimulatedAnnealingParameterSearch(self.verbose)
maxIter = self.config.getIntConfig("train.search.max.iterations")[0]
searchStrategy.setMaxIter(maxIter)
temp = self.config.getFloatConfig("train.search.sa.temp")[0]
searchStrategy.setTemp(temp)
tempRedRate = self.config.getFloatConfig("train.search.sa.temp.red.rate")[0]
searchStrategy.setTempReductionRate(tempRedRate)
else:
raise ValueError("invalid paramtere search strategy")
else:
raise ValueError("missing search strategy")
# add search params
searchParams = self.config.getStringConfig("train.search.params")[0]
searchParamNames = []
extSearchParamNames = []
if searchParams is not None:
for searchParam in searchParams.split(","):
paramItems = searchParam.split(":")
extSearchParamNames.append(paramItems[0])
#drop the 'search' name component to get the actual config param name
paramNameItems = paramItems[0].split(".")
del paramNameItems[1]
paramItems[0] = ".".join(paramNameItems)
searchStrategy.addParam(paramItems)
searchParamNames.append(paramItems[0])
else:
raise ValueError("missing search parameter list")
# add search param data list for each param
for (searchParamName,extSearchParamName) in zip(searchParamNames,extSearchParamNames):
searchParamData = self.config.getStringConfig(extSearchParamName)[0].split(",")
searchStrategy.addParamVaues(searchParamName, searchParamData)
# train and validate for various param value combination
searchStrategy.prepare()
paramValues = searchStrategy.nextParamValues()
searchResults = []
while paramValues is not None:
self.logger.info("...next parameter set")
paramStr = ""
for paramValue in paramValues:
self.setConfigParam(paramValue[0], str(paramValue[1]))
paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
result = self.trainValidate()
searchStrategy.setCost(result)
searchResults.append((paramStr, result))
paramValues = searchStrategy.nextParamValues()
# output
self.logger.info("all parameter search results")
for searchResult in searchResults:
self.logger.info("{}\t{06.3f}".format(searchResult[0], searchResult[1]))
self.logger.info("best parameter search result")
bestSolution = searchStrategy.getBestSolution()
paramStr = ""
for paramValue in bestSolution[0]:
paramStr = paramStr + paramValue[0] + "=" + str(paramValue[1]) + " "
self.logger.info("{}\t{:06.3f}".format(paramStr, bestSolution[1]))
return bestSolution
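# Illustrative search config matching the parsing above (all key names and
# values hypothetical). Each entry in train.search.params is 'name:type';
# dropping the second name component ('search') yields the config param that
# is overwritten on each iteration, while the full name is itself a config
# key listing the candidate values:
# train.search.param.strategy=random
# train.search.max.iterations=20
# train.search.params=train.search.kernel.function:string,train.search.penalty:float
# train.search.kernel.function=rbf,linear,poly
# train.search.penalty=0.1,1.0,10.0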
def validate(self):
"""
validate model with holdout data
"""
# create model
useSavedModel = self.config.getBooleanConfig("validate.use.saved.model")[0]
if useSavedModel:
# load saved model
self.logger.info("...loading model")
modelFilePath = self.getModelFilePath()
self.classifier = joblib.load(modelFilePath)
else:
# train model
if not self.trained:
self.train()
# prepare test data
(featData, clsDataActual) = self.prepValidationData()
#predict
self.logger.info("...predicting")
clsDataPred = self.classifier.predict(featData)
self.logger.info("...validating")
#print clsData
scoreMethod = self.config.getStringConfig("validate.score.method")[0]
if scoreMethod == "accuracy":
accuracy = sk.metrics.accuracy_score(clsDataActual, clsDataPred)
self.logger.info("accuracy:")
self.logger.info(accuracy)
elif scoreMethod == "confusionMatrix":
confMatrx = sk.metrics.confusion_matrix(clsDataActual, clsDataPred)
self.logger.info("confusion matrix:")
self.logger.info(confMatrx)
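# Illustrative validation config (values hypothetical):
# validate.use.saved.model=False
# validate.data.file=iris_validate.csv
# validate.data.feature.fields=0,1,2,3
# validate.data.class.field=4
# validate.score.method=confusionMatrix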
def predictx(self):
"""
predict using data loaded from file
"""
# create model
self.prepModel()
# prepare test data
featData = self.prepPredictData()
#predict
self.logger.info("...predicting")
clsData = self.classifier.predict(featData)
self.logger.info(clsData)
def predict(self, recs=None):
"""
predict with in-memory records, falling back to file data
"""
# create model
self.prepModel()
#input record
if recs:
#passed record
featData = self.prepStringPredictData(recs)
if (featData.ndim == 1):
featData = featData.reshape(1, -1)
else:
#file
featData = self.prepPredictData()
#predict
self.logger.info("...predicting")
clsData = self.classifier.predict(featData)
return clsData
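# Usage sketch for in-memory prediction (the record and class label are
# illustrative):
# rec = "5.1,3.5,1.4,0.2"         # one comma-separated feature record
# cls = classifier.predict(rec)   # -> array of predicted class labels, e.g. [0]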
def predictProb(self, recs):
"""
predict class probability with in-memory data; the base implementation raises, subclasses that support it override
"""
raise ValueError("cannot predict class probability")
def prepModel(self):
"""
prepare model by loading a saved model or training a new one
"""
useSavedModel = self.config.getBooleanConfig("predict.use.saved.model")[0]
if (useSavedModel and not self.classifier):
# load saved model
self.logger.info("...loading saved model")
modelFilePath = self.getModelFilePath()
self.classifier = joblib.load(modelFilePath)
else:
# train model
if not self.trained:
self.train()
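# The saved-model location is resolved by getModelFilePath() from config
# (illustrative values):
# predict.use.saved.model=True
# common.model.directory=./model
# common.model.file=classifier_model.sav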
def prepTrainingData(self):
"""
loads and prepares training data
"""
# parameters
dataFile = self.config.getStringConfig("train.data.file")[0]
fieldIndices = self.config.getStringConfig("train.data.fields")[0]
if fieldIndices is not None:
fieldIndices = strToIntArray(fieldIndices, ",")
featFieldIndices = self.config.getStringConfig("train.data.feature.fields")[0]
if featFieldIndices is not None:
featFieldIndices = strToIntArray(featFieldIndices, ",")
classFieldIndex = self.config.getIntConfig("train.data.class.field")[0]
#training data
(data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
featData = scaleData(featData, scalingMethod)
clsData = extrColumns(data, classFieldIndex)
clsData = np.array([int(a) for a in clsData])
return (featData, clsData)
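# Illustrative training data config and matching CSV layout (all values
# hypothetical):
# train.data.file=iris_train.csv       -> rows like  5.1,3.5,1.4,0.2,0
# train.data.feature.fields=0,1,2,3    -> columns used as features
# train.data.class.field=4             -> column holding the integer class label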
def prepValidationData(self):
"""
loads and prepares validation data
"""
# parameters
dataFile = self.config.getStringConfig("validate.data.file")[0]
fieldIndices = self.config.getStringConfig("validate.data.fields")[0]
if fieldIndices is not None:
fieldIndices = strToIntArray(fieldIndices, ",")
featFieldIndices = self.config.getStringConfig("validate.data.feature.fields")[0]
if featFieldIndices is not None:
featFieldIndices = strToIntArray(featFieldIndices, ",")
classFieldIndex = self.config.getIntConfig("validate.data.class.field")[0]
#validation data
(data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
featData = scaleData(featData, scalingMethod)
clsData = extrColumns(data, classFieldIndex)
clsData = [int(a) for a in clsData]
return (featData, clsData)
def prepPredictData(self):
"""
loads and prepares prediction data
"""
# parameters
dataFile = self.config.getStringConfig("predict.data.file")[0]
if dataFile is None:
raise ValueError("missing prediction data file")
fieldIndices = self.config.getStringConfig("predict.data.fields")[0]
if fieldIndices is not None:
fieldIndices = strToIntArray(fieldIndices, ",")
featFieldIndices = self.config.getStringConfig("predict.data.feature.fields")[0]
if featFieldIndices is not None:
featFieldIndices = strToIntArray(featFieldIndices, ",")
#prediction data
(data, featData) = loadDataFile(dataFile, ",", fieldIndices, featFieldIndices)
if (self.config.getStringConfig("common.preprocessing")[0] == "scale"):
scalingMethod = self.config.getStringConfig("common.scaling.method")[0]
featData = scaleData(featData, scalingMethod)
return featData
def prepStringPredictData(self, recs):
"""
prepare string predict data
"""
frecs = StringIO(recs)
featData = np.loadtxt(frecs, delimiter=',')
return featData
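# Note: for a single record np.loadtxt returns a 1-D array, which is why
# predict() reshapes with reshape(1, -1) before scoring. Sketch (values
# illustrative):
# np.loadtxt(StringIO("5.1,3.5,1.4,0.2"), delimiter=',').shape  -> (4,)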
def getModelFilePath(self):
"""
get model file path
"""
modelDirectory = self.config.getStringConfig("common.model.directory")[0]
modelFile = self.config.getStringConfig("common.model.file")[0]
if modelFile is None:
raise ValueError("missing model file name")
modelFilePath = os.path.join(modelDirectory, modelFile)
return modelFilePath
def reportResult(self, score, successCriterion, scoreMethod):
"""
report result
"""
if successCriterion == "accuracy":
self.logger.info("average " + scoreMethod + " with k fold cross validation {:06.3f}".format(score))
result = score
elif successCriterion == "error":
error = 1.0 - score
self.logger.info("average error with k fold cross validation {:06.3f}".format(error))
result = error
else:
raise ValueError("invalid success criterion")
return result
def autoTrain(self):
"""
auto train
"""
maxTestErr = self.config.getFloatConfig("train.auto.max.test.error")[0]
maxErr = self.config.getFloatConfig("train.auto.max.error")[0]
maxErrDiff = self.config.getFloatConfig("train.auto.max.error.diff")[0]
self.config.setParam("train.model.save", "False")
#train, validate and search for optimum parameters
result = self.trainValidateSearch()
testError = result[1]
#subsample training size to match train size for k fold validation
numFolds = self.config.getIntConfig("train.num.folds")[0]
self.subSampleRate = float(numFolds - 1) / numFolds
#train only with optimum parameter values
for paramValue in result[0]:
pName = paramValue[0]
pValue = paramValue[1]
self.logger.info(pName + " " + pValue)
self.setConfigParam(pName, pValue)
trainError = self.train()
if testError < maxTestErr:
# criteria based on test error only
self.logger.info("Successfullt trained. Low test error level")
status = 1
else:
# criteria based on bias error and generalization error
avError = (trainError + testError) / 2
diffError = testError - trainError
self.logger.info("Auto training completed: training error {:06.3f} test error: {:06.3f}".format(trainError, testError))
self.logger.info("Average of test and training error: {:06.3f} test and training error diff: {:06.3f}".format(avError, diffError))
if diffError > maxErrDiff:
# high generalization error
if avError > maxErr:
# high bias error
self.logger.info("High generalization error and high error. Need larger training data set and increased model complexity")
status = 4
else:
# low bias error
self.logger.info("High generalization error. Need larger training data set")
status = 3
else:
# low generalization error
if avError > maxErr:
# high bias error
self.logger.info("Converged, but with high error rate. Need to increase model complexity")
status = 2
else:
# low bias error
self.logger.info("Successfullt trained. Low generalization error and low bias error level")
status = 1
if status == 1:
#train final model, use all data and save model
self.logger.info("...training the final model")
self.config.setParam("train.model.save", "True")
self.subSampleRate = None
trainError = self.train()
self.logger.info("training error in final model {:06.3f}".format(trainError))
return status
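# End-to-end usage sketch. BaseClassifier defers buildModel() and the model
# object itself to concrete subclasses; 'SupportVectorMachine' and the
# properties file name below are hypothetical placeholders, not names
# verified from this file:
#
# classifier = SupportVectorMachine("svm.properties")
# mode = classifier.getMode()
# if mode == "train":
#     classifier.train()
# elif mode == "trainValidate":
#     classifier.trainValidate()
# elif mode == "autoTrain":
#     classifier.autoTrain()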